Javascript convert unicode string to "Title Case"
Asked Answered
E

4

5

I have a javascript case conversion problem which I cannot solve due to non-English letters. My main concern is the Turkish alphabet.

What I need to do is this:

  • hello world => Hello World
  • HELLO WORLD => Hello World
  • hELLO wOrLd => Hello World

Here is what I've accomplished so far:

// Upper-cases a string using Turkish casing rules: the letters whose
// upper-case form differs from (or must be protected from) the default
// mapping are substituted first, then the built-in toUpperCase() handles
// everything else.
String.prototype.turkishToUpper = function(){
    var upperByLower = { 'i': 'İ', 'ş': 'Ş', 'ğ': 'Ğ', 'ü': 'Ü', 'ö': 'Ö', 'ç': 'Ç', 'ı': 'I' };
    var substituted = this.replace(/(([iışğüçö]))/g, function(ch){
        return upperByLower[ch];
    });
    return substituted.toUpperCase();
};

// Lower-cases a string using Turkish casing rules: dotted/dotless I and the
// other Turkish capitals are substituted first, then the built-in
// toLowerCase() handles the remaining characters.
String.prototype.turkishToLower = function(){
    var lowerByUpper = { 'İ': 'i', 'I': 'ı', 'Ş': 'ş', 'Ğ': 'ğ', 'Ü': 'ü', 'Ö': 'ö', 'Ç': 'ç' };
    var substituted = this.replace(/(([İIŞĞÜÇÖ]))/g, function(ch){
        return lowerByUpper[ch];
    });
    return substituted.toLowerCase();
};

// Capitalises the first character of every regex match and lower-cases the
// rest of it, using the Turkish-aware helpers on String.prototype.
// NOTE: \w matches only ASCII [A-Za-z0-9_], so each match starts at the first
// ASCII word character; a leading non-ASCII letter is left outside the match.
String.prototype.toProperCase = function () {
    return this.replace(/\w\S*/g, function (word) {
        var head = word.charAt(0);
        var tail = word.slice(1);
        return head.turkishToUpper() + tail.turkishToLower();
    });
};

But this does not give me the correct results, and I suspect the cause is that the regex replace only handles ASCII characters, not Unicode.

When I test with Turkish characters, I get wrong results.

  • şeker becomes şEker instead of Şeker
  • çoban ırmak becomes çOban ıRmak instead of Çoban Irmak

Also, if this can ever get resolved, I need an icing on the cake to separate words not only by spaces, but also by some other stop characters such as : - = / etc so that

  • hello-world becomes Hello-World
  • hello:world becomes Hello:World

I've read through many similar questions here on SO, but no luck so far.

Thanks

Note: I think this is called Title Case but some have argued that it is Pascal Case. To be frank, I am interested in resolving the unicode issue (which I believe is the root cause) rather than semantics, so please forgive me if I've used wrong terminology :)

Expositor answered 13/8, 2012 at 11:40 Comment(6)
Why don't you use CSS for this? jsfiddle.net/3jg3b - text-transform: capitalizeAlternant
Your problem with regexp selector. \S means not whitespace char but javascript identifies ş,ç as a whitespace. Take a look blog.stevenlevithan.com/archives/javascript-regex-and-unicodeKeelby
I'm not using CSS @Alternant because this is for correcting formatting errors on user input.Expositor
yes @Keelby I am aware of the fact, hence the unicode hint on the topic ;)Expositor
Does it have to work only with Turkish, or with other languages too?Tranquilize
Hi @Tranquilize in my case Turkish is fine, but actually the solution below seems to do the job with most (if not all) accented characters. The only major problem is the Turkish "i" which is a major non-standard issueExpositor
M
8

Standalone function:

// Title-cases a string: every run of characters that are not whitespace,
// ":" or "-" is treated as a word whose first character is upper-cased and
// whose remainder is lower-cased. Delimiters are left untouched.
function toProperCase(s){
    var wordPattern=/([^\s:\-])([^\s:\-]*)/g;
    function capitalise(match,first,rest){
        return first.toUpperCase()+rest.toLowerCase();
    }
    return s.replace(wordPattern,capitalise);
}

Or, to extend String.prototype:

// Title-cases the receiver: each run of characters that are not whitespace,
// ":" or "-" gets its first character upper-cased and the rest lower-cased.
String.prototype.toProperCase=function() {
    return this.replace(/([^\s:\-])([^\s:\-]*)/g,function(word){
        var head=word.charAt(0);
        var tail=word.slice(1);
        return head.toUpperCase()+tail.toLowerCase();
    });
};

"çoban ırmak becomes çOban ıRmak intead of Çoban Irmak Hello-wOrld".toProperCase();
// "Çoban Irmak Becomes Çoban Irmak Intead Of Çoban Irmak Hello-World"

Update:

The following code uses custom functionality for converting locale-specific characters (only partially tested). It adds three functions to String.prototype: toLocaleProperCase2, toLocaleLowerCase2 and toLocaleUpperCase2.

// Adds three locale-aware case helpers to String.prototype:
//   toLocaleProperCase2(locale), toLocaleLowerCase2(locale), toLocaleUpperCase2(locale)
// Each takes an optional locale name (e.g. "tr-TR"). When the locale is
// missing, null/undefined, or not present in the table below, the helpers
// fall back to the built-in toUpperCase()/toLowerCase() behaviour.
(function(){
    // locale specific chars
    // IMPORTANT: name of locale must be always in lower case (for "tr-TR" locale - "tr-tr") !!!
    //   lower: maps a lower-case letter to its locale-specific upper-case form
    //   upper: maps an upper-case letter to its locale-specific lower-case form
    var localeInfos={
            "tr-tr": { lower: { i:"İ", ı:"I", ş:"Ş", ğ:"Ğ", ü:"Ü", ç:"Ç", ö:"Ö" },
                       upper: { İ:"i", I:"ı", Ş:"ş", Ğ:"ğ", Ü:"ü", Ç:"ç", Ö:"ö" } }
        };
    // helper vars
    var mask="\\s:\\-", // add additional delimiter chars to the mask if needed
        // $1 = first character of a word, $2 = the rest of the word
        rg=new RegExp("([^"+mask+"])([^"+mask+"]*)","g");
    var hasOwn=Object.prototype.hasOwnProperty;

    // helper calculations: pre-build one character-class regex per direction
    Object.keys(localeInfos).forEach(function(name){
        var info=localeInfos[name];
        info.lowerSearchRegExp=new RegExp("["+Object.keys(info.lower).join("")+"]","g");
        info.upperSearchRegExp=new RegExp("["+Object.keys(info.upper).join("")+"]","g");
    });

    // Returns the table entry for `locale`, or null when there is none.
    // Only own properties are consulted so that names such as "constructor"
    // do not resolve to members inherited from Object.prototype, and a
    // null/undefined locale is treated the same as "no locale given".
    function findLocaleInfo(locale){
        if(locale===undefined||locale===null)return null;
        var key=String(locale).toLowerCase();
        return hasOwn.call(localeInfos,key)?localeInfos[key]:null;
    }
    // Replaces locale-specific upper-case letters with their lower-case form.
    function applyLocaleLower(s,info){
        return info?s.replace(info.upperSearchRegExp,function(ch){ return info.upper[ch]; }):s;
    }
    // Replaces locale-specific lower-case letters with their upper-case form.
    function applyLocaleUpper(s,info){
        return info?s.replace(info.lowerSearchRegExp,function(ch){ return info.lower[ch]; }):s;
    }

    // extending String.prototype
    String.prototype.toLocaleProperCase2=function toLocaleProperCase2(locale){
        var info=findLocaleInfo(locale);
        return String(this).replace(rg,function($0,$1,$2){
            // locale substitutions run first so the built-in conversions
            // afterwards leave the already-converted letters alone
            return applyLocaleUpper($1,info).toUpperCase()+applyLocaleLower($2,info).toLowerCase();
        });
    };
    String.prototype.toLocaleLowerCase2=function toLocaleLowerCase2(locale){
        return applyLocaleLower(String(this),findLocaleInfo(locale)).toLowerCase();
    };
    String.prototype.toLocaleUpperCase2=function toLocaleUpperCase2(locale){
        return applyLocaleUpper(String(this),findLocaleInfo(locale)).toUpperCase();
    };
})();

// testing
// Prints the sample string followed by each conversion, with and without the
// Turkish locale. The output reported by the author is listed at the bottom.
var sss="çoban ırmak ibecıoimes çOban ıRmak intead of Çoban IrImaİk Hello-wOrld";
[
    ["Origin:    ", sss],
    ["Proper TR: ", sss.toLocaleProperCase2("tr-TR")],
    ["Proper:    ", sss.toLocaleProperCase2()],
    ["Lower TR:  ", sss.toLocaleLowerCase2("tr-TR")],
    ["Lower:     ", sss.toLocaleLowerCase2()],
    ["Upper TR:  ", sss.toLocaleUpperCase2("tr-TR")],
    ["Upper:     ", sss.toLocaleUpperCase2()]
].forEach(function(row){
    console.log(row[0], row[1]);
});

// Origin:    çoban ırmak ibecıoimes çOban ıRmak intead of Çoban IrImaİk Hello-wOrld
// Proper TR: Çoban Irmak İbecıoimes Çoban Irmak İntead Of Çoban Irımaik Hello-World
// Proper:    Çoban Irmak Ibecıoimes Çoban Irmak Intead Of Çoban Irimaik Hello-World
// Lower TR:  çoban ırmak ibecıoimes çoban ırmak intead of çoban ırımaik hello-world
// Lower:     çoban ırmak ibecıoimes çoban ırmak intead of çoban irimaik hello-world
// Upper TR:  ÇOBAN IRMAK İBECIOİMES ÇOBAN IRMAK İNTEAD OF ÇOBAN IRIMAİK HELLO-WORLD
// Upper:     ÇOBAN IRMAK IBECIOIMES ÇOBAN IRMAK INTEAD OF ÇOBAN IRIMAİK HELLO-WORLD
Methylal answered 13/8, 2012 at 13:4 Comment(4)
Thanks @andrew-d this is almost perfect with the exception that Turkish letter "i" is off-the-charts-non-standard. Caps for "i" is "İ" and small for "I" is "ı". How do I introduce this in the equation?Expositor
@andrew: Are you saying this code does convert ırmak to Irmak? @Serkan: Are you saying it doesn't but accepting it anyway? (I'm confused.)Tranquilize
This much improves on the first answer which I've accepted anyway since it was 99% perfect :) And this last update seems to do the whole trick, thanks a lot!Expositor
Thanks for this useful tips. Adamsın adamJennifferjennilee
U
0

Here is the JS function that will do the job.

/**
 * Capitalises the first character of every space-separated word and
 * lower-cases the remainder of the word.
 *
 * Only the space character is treated as a word separator; consecutive
 * spaces are preserved because empty "words" are joined back unchanged.
 *
 * @param {string} string - text to convert
 * @returns {string} the converted text
 */
function toProperCase(string) {
    // declared locally: the undeclared assignments used before leaked
    // globals in sloppy mode and throw a ReferenceError in strict mode
    var words = string.split(' '); //explode string by space
    for(var i=0;i<words.length;i++){
        words[i] = words[i].charAt(0).toUpperCase() + words[i].slice(1).toLowerCase();
    }
    return words.join(' ');
}
Uzbek answered 13/8, 2012 at 11:48 Comment(1)
Don't use for (... in ...) for arrays. JavaScript “For …in” with ArraysAlternant
L
0
// Lower-cases the whole receiver once, then upper-cases the first character
// of every space-separated word.
String.prototype.toProperCase = function (){
   var words = this.toLowerCase().split(' ');
   var capitalised = words.map(function(word){
      return word.charAt(0).toUpperCase() + word.slice(1);
   });
   return capitalised.join(' ');
};

'çoban ırmak'.toProperCase() // "Çoban Irmak"
Liquidity answered 13/8, 2012 at 12:1 Comment(0)
S
0

Just another option but with the icing on the cake:

// Title-cases a string character by character: the input is lower-cased,
// then the very first character and every non-delimiter character that
// directly follows a delimiter (" ", "-", ":", "=", "/") is upper-cased.
function toProperCase( str ) {

    var delimiters = {
        " " : true,
        "-" : true,
        ":" : true,
        "=" : true,
        "/" : true
    };

    // true when the character handled on the previous step was a delimiter
    var afterDelimiter = false;

    var converted = str.toLowerCase( ).split("").map( function ( ch, index ) {
        var isDelimiter = !!delimiters[ ch ];
        var capitalise = index === 0 || ( afterDelimiter && !isDelimiter );

        afterDelimiter = isDelimiter;
        return capitalise ? ch.toUpperCase( ) : ch;
    } );

    return converted.join("");
}

Hopefully that helps :)

Superstructure answered 13/8, 2012 at 13:49 Comment(0)

© 2022 - 2025 — McMap. All rights reserved.