How to convert characters to HTML entities using plain JavaScript
Asked Answered
S

12

61

I have the following:

var text = "Übergroße Äpfel mit Würmern";

I'm searching for a Javascript function to transform the text so that every special letter is represented by its HTML entity sequence like this:

var newText = magicFunction(text);
...
newText = "Übergroße Äpfel mit Würmern";

The function should not only escape the letters of this example but also all of these.

How would you achieve that? Is there any existing function out there? (Plain, because a solution without a framework is preferred)

Btw: Yes, I've seen this question but it doesn't address my need.

Schertz answered 30/8, 2009 at 15:5 Comment(0)
S
65

With the help of bucabay and the advice to create my own function i created this one which works for me. What do you guys think, is there a better solution somewhere?

if(typeof escapeHtmlEntities == 'undefined') {
        escapeHtmlEntities = function (text) {
            return text.replace(/[\u00A0-\u2666<>\&]/g, function(c) {
                return '&' + 
                (escapeHtmlEntities.entityTable[c.charCodeAt(0)] || '#'+c.charCodeAt(0)) + ';';
            });
        };

        // all HTML4 entities as defined here: http://www.w3.org/TR/html4/sgml/entities.html
        // added: amp, lt, gt, quot and apos
        escapeHtmlEntities.entityTable = {
            34 : 'quot', 
            38 : 'amp', 
            39 : 'apos', 
            60 : 'lt', 
            62 : 'gt', 
            160 : 'nbsp', 
            161 : 'iexcl', 
            162 : 'cent', 
            163 : 'pound', 
            164 : 'curren', 
            165 : 'yen', 
            166 : 'brvbar', 
            167 : 'sect', 
            168 : 'uml', 
            169 : 'copy', 
            170 : 'ordf', 
            171 : 'laquo', 
            172 : 'not', 
            173 : 'shy', 
            174 : 'reg', 
            175 : 'macr', 
            176 : 'deg', 
            177 : 'plusmn', 
            178 : 'sup2', 
            179 : 'sup3', 
            180 : 'acute', 
            181 : 'micro', 
            182 : 'para', 
            183 : 'middot', 
            184 : 'cedil', 
            185 : 'sup1', 
            186 : 'ordm', 
            187 : 'raquo', 
            188 : 'frac14', 
            189 : 'frac12', 
            190 : 'frac34', 
            191 : 'iquest', 
            192 : 'Agrave', 
            193 : 'Aacute', 
            194 : 'Acirc', 
            195 : 'Atilde', 
            196 : 'Auml', 
            197 : 'Aring', 
            198 : 'AElig', 
            199 : 'Ccedil', 
            200 : 'Egrave', 
            201 : 'Eacute', 
            202 : 'Ecirc', 
            203 : 'Euml', 
            204 : 'Igrave', 
            205 : 'Iacute', 
            206 : 'Icirc', 
            207 : 'Iuml', 
            208 : 'ETH', 
            209 : 'Ntilde', 
            210 : 'Ograve', 
            211 : 'Oacute', 
            212 : 'Ocirc', 
            213 : 'Otilde', 
            214 : 'Ouml', 
            215 : 'times', 
            216 : 'Oslash', 
            217 : 'Ugrave', 
            218 : 'Uacute', 
            219 : 'Ucirc', 
            220 : 'Uuml', 
            221 : 'Yacute', 
            222 : 'THORN', 
            223 : 'szlig', 
            224 : 'agrave', 
            225 : 'aacute', 
            226 : 'acirc', 
            227 : 'atilde', 
            228 : 'auml', 
            229 : 'aring', 
            230 : 'aelig', 
            231 : 'ccedil', 
            232 : 'egrave', 
            233 : 'eacute', 
            234 : 'ecirc', 
            235 : 'euml', 
            236 : 'igrave', 
            237 : 'iacute', 
            238 : 'icirc', 
            239 : 'iuml', 
            240 : 'eth', 
            241 : 'ntilde', 
            242 : 'ograve', 
            243 : 'oacute', 
            244 : 'ocirc', 
            245 : 'otilde', 
            246 : 'ouml', 
            247 : 'divide', 
            248 : 'oslash', 
            249 : 'ugrave', 
            250 : 'uacute', 
            251 : 'ucirc', 
            252 : 'uuml', 
            253 : 'yacute', 
            254 : 'thorn', 
            255 : 'yuml', 
            402 : 'fnof', 
            913 : 'Alpha', 
            914 : 'Beta', 
            915 : 'Gamma', 
            916 : 'Delta', 
            917 : 'Epsilon', 
            918 : 'Zeta', 
            919 : 'Eta', 
            920 : 'Theta', 
            921 : 'Iota', 
            922 : 'Kappa', 
            923 : 'Lambda', 
            924 : 'Mu', 
            925 : 'Nu', 
            926 : 'Xi', 
            927 : 'Omicron', 
            928 : 'Pi', 
            929 : 'Rho', 
            931 : 'Sigma', 
            932 : 'Tau', 
            933 : 'Upsilon', 
            934 : 'Phi', 
            935 : 'Chi', 
            936 : 'Psi', 
            937 : 'Omega', 
            945 : 'alpha', 
            946 : 'beta', 
            947 : 'gamma', 
            948 : 'delta', 
            949 : 'epsilon', 
            950 : 'zeta', 
            951 : 'eta', 
            952 : 'theta', 
            953 : 'iota', 
            954 : 'kappa', 
            955 : 'lambda', 
            956 : 'mu', 
            957 : 'nu', 
            958 : 'xi', 
            959 : 'omicron', 
            960 : 'pi', 
            961 : 'rho', 
            962 : 'sigmaf', 
            963 : 'sigma', 
            964 : 'tau', 
            965 : 'upsilon', 
            966 : 'phi', 
            967 : 'chi', 
            968 : 'psi', 
            969 : 'omega', 
            977 : 'thetasym', 
            978 : 'upsih', 
            982 : 'piv', 
            8226 : 'bull', 
            8230 : 'hellip', 
            8242 : 'prime', 
            8243 : 'Prime', 
            8254 : 'oline', 
            8260 : 'frasl', 
            8472 : 'weierp', 
            8465 : 'image', 
            8476 : 'real', 
            8482 : 'trade', 
            8501 : 'alefsym', 
            8592 : 'larr', 
            8593 : 'uarr', 
            8594 : 'rarr', 
            8595 : 'darr', 
            8596 : 'harr', 
            8629 : 'crarr', 
            8656 : 'lArr', 
            8657 : 'uArr', 
            8658 : 'rArr', 
            8659 : 'dArr', 
            8660 : 'hArr', 
            8704 : 'forall', 
            8706 : 'part', 
            8707 : 'exist', 
            8709 : 'empty', 
            8711 : 'nabla', 
            8712 : 'isin', 
            8713 : 'notin', 
            8715 : 'ni', 
            8719 : 'prod', 
            8721 : 'sum', 
            8722 : 'minus', 
            8727 : 'lowast', 
            8730 : 'radic', 
            8733 : 'prop', 
            8734 : 'infin', 
            8736 : 'ang', 
            8743 : 'and', 
            8744 : 'or', 
            8745 : 'cap', 
            8746 : 'cup', 
            8747 : 'int', 
            8756 : 'there4', 
            8764 : 'sim', 
            8773 : 'cong', 
            8776 : 'asymp', 
            8800 : 'ne', 
            8801 : 'equiv', 
            8804 : 'le', 
            8805 : 'ge', 
            8834 : 'sub', 
            8835 : 'sup', 
            8836 : 'nsub', 
            8838 : 'sube', 
            8839 : 'supe', 
            8853 : 'oplus', 
            8855 : 'otimes', 
            8869 : 'perp', 
            8901 : 'sdot', 
            8968 : 'lceil', 
            8969 : 'rceil', 
            8970 : 'lfloor', 
            8971 : 'rfloor', 
            9001 : 'lang', 
            9002 : 'rang', 
            9674 : 'loz', 
            9824 : 'spades', 
            9827 : 'clubs', 
            9829 : 'hearts', 
            9830 : 'diams', 
            338 : 'OElig', 
            339 : 'oelig', 
            352 : 'Scaron', 
            353 : 'scaron', 
            376 : 'Yuml', 
            710 : 'circ', 
            732 : 'tilde', 
            8194 : 'ensp', 
            8195 : 'emsp', 
            8201 : 'thinsp', 
            8204 : 'zwnj', 
            8205 : 'zwj', 
            8206 : 'lrm', 
            8207 : 'rlm', 
            8211 : 'ndash', 
            8212 : 'mdash', 
            8216 : 'lsquo', 
            8217 : 'rsquo', 
            8218 : 'sbquo', 
            8220 : 'ldquo', 
            8221 : 'rdquo', 
            8222 : 'bdquo', 
            8224 : 'dagger', 
            8225 : 'Dagger', 
            8240 : 'permil', 
            8249 : 'lsaquo', 
            8250 : 'rsaquo', 
            8364 : 'euro'
        };
    }

usage example:

var text = "Übergroße Äpfel mit Würmern";
alert(escapeHtmlEntities (text));

result:

&Uuml;bergro&szlig;e &Auml;pfel mit W&uuml;rmern
Schertz answered 30/8, 2009 at 19:31 Comment(13)
looks good. I'd go for: escapeHtmlEntities.entityTable[c.charCodeAt(0)] || '#'+c.charCodeAt(0) so you can catch those charCode's not in entityTable.Swagsman
This is a great solution, good balance of capturing all extended Unicode characters but still providing named entities for the most common ones. You should probably add amp, gt, and lt to the entityTable. One small caveat: some older browsers may not support all of the named entities you have in that dictionary.Yellowthroat
@Schertz How about making this into a library? :)Thrombokinase
If anyones interested I made a nodejs module out of this, heres the code.Thrombokinase
Love this answer. I simplified it to to omit the entity table, not match <>\& and you get a great function for 7-bit serialization of html where all you care about is not losing data.Paraplegia
Another caveat: the code, as written, will not handle Unicode characters U+10000 and greater properly. To handle those, it would be necessary to add code to combine each UTF-16 surrogate pair into a single value.Bairn
Your code is better/more descriptive than this gist (which is similar): gist.github.com/jonathantneal/6093551. I converted to a singleton and added to my library: github.com/centurianii/jsutilsBibliology
Why stop at U+2666? Seems arbitrary. For a solution that correctly encodes all except safe & printable ASCII symbols in the input (including astral symbols!), implements all named character references (not just those in HTML4), and works in both Node.js and the browser, see my answer.Loxodromic
@Bairn For what it’s worth, the he library handles those just fine.Loxodromic
@Mathias Bynens create an answer and show a better solution, kid!Schertz
@Schertz What do you mean exactly? My comment linked to my answer :)Loxodromic
@Mathias Bynens i added a link to your lib in my answer.Schertz
Its not converting " into &quot;Kinsman
L
63

All the other solutions suggested here, as well as most other JavaScript libraries that do HTML entity encoding/decoding, make several mistakes:

For a robust solution that avoids all these issues, use a library I wrote called he for this. From its README:

he (for “HTML entities”) is a robust HTML entity encoder/decoder written in JavaScript. It supports all standardized named character references as per HTML, handles ambiguous ampersands and other edge cases just like a browser would, has an extensive test suite, and — contrary to many other JavaScript solutions — he handles astral Unicode symbols just fine. An online demo is available.

Loxodromic answered 23/5, 2014 at 14:0 Comment(1)
item (1),(3), (4) and (5) talk about decoding, not encoding and miss the point of the quesiton. the he library is great anyways, just not really needed for this question. you can look at my solution for a short standalone javascript implementation (encode only).Exhume
S
21

Using escape() should work with the character code range 0x00 to 0xFF (UTF-8 range).

If you go beyond 0xFF (255), such as 0x100 (256) then escape() will not work:

escape("\u0100"); // %u0100

and:

text = "\u0100"; // Ā
html = escape(text).replace(/%(..)/g,"&#x$1;"); // &#xu0;100

So, if you want to cover all Unicode charachacters as defined on http://www.w3.org/TR/html4/sgml/entities.html , then you could use something like:

var html = text.replace(/[\u00A0-\u00FF]/g, function(c) {
   return '&#'+c.charCodeAt(0)+';';
});

Note here the range is between: \u00A0-\u00FF.

Thats the first character code range defined in http://www.w3.org/TR/html4/sgml/entities.html which is the same as what escape() covers.

You'll need to add the other ranges you want to cover as well, or all of them.

Example: UTF-8 range with general punctuations (\u00A0-\u00FF and \u2022-\u2135)

var html = text.replace(/[\u00A0-\u00FF\u2022-\u2135]/g, function(c) {
   return '&#'+c.charCodeAt(0)+';';
});

Edit:

BTW: \u00A0-\u2666 should convert every Unicode character code not within ASCII range to HTML entities blindly:

var html = text.replace(/[\u00A0-\u2666]/g, function(c) {
   return '&#'+c.charCodeAt(0)+';';
});
Swagsman answered 30/8, 2009 at 18:10 Comment(4)
Very good point, bucabay... I was handling the simplest case of UTF8 with a quick hack, but this is definitely a more robust solution. Great use of a passed function for handling RegEx replacement, I forgot about being able to do that. Upvoted. Needs a quick fix, however, to add ampersand, less-than, and greater-than to the character range so it can completely replace my code.Yellowthroat
this is way easier than those htmlencode lookup services. try it as a bookmarklet? alert(prompt('Enter characters to htmlEncode', '').replace(/[\u00A0-\u2666]/g, function(c) { return '&#'+c.charCodeAt(0)+';'; }));Fritter
This approach doesn’t take into account the character references overrides in the HTML Standard. For example, htmlEncode('\x80') should not return &#x80; or &#128;. In fact, it shouldn’t return an HTML entity at all; there is no valid way to represent that character in HTML. See my answer for more information, and for a better solution.Loxodromic
That last edit saved the day. Thank you. I was taking some HTML dumped back from the server and trying to open it in a popup window. This came in handy.Segregationist
E
8

The he library is the only 100% reliable solution that I know of!

He is written by Mathias Bynens - one of the world's most renowned JavaScript gurus - and has the following features :


Example use

he.encode('foo © bar ≠ baz 𝌆 qux'); 
// Output : 'foo &#xA9; bar &#x2260; baz &#x1D306; qux'

he.decode('foo &copy; bar &ne; baz &#x1D306; qux');
// Output : 'foo © bar ≠ baz 𝌆 qux'
Eschatology answered 24/2, 2017 at 11:20 Comment(1)
question is a native solution without libraries.Glassman
L
7

You can use:

function encodeHTML(str){
 var aStr = str.split(''),
     i = aStr.length,
     aRet = [];

   while (i--) {
    var iC = aStr[i].charCodeAt();
    if (iC < 65 || iC > 127 || (iC>90 && iC<97)) {
      aRet.push('&#'+iC+';');
    } else {
      aRet.push(aStr[i]);
    }
  }
 return aRet.reverse().join('');
}

This function HTMLEncodes everything that is not a-z/A-Z.

[Edit] A rather old answer. Let's add a simpler String extension to encode all extended characters:

String.prototype.encodeHTML = function () {
  return this.replace(/[\u0080-\u024F]/g, 
          function (v) {return '&#'+v.charCodeAt()+';';}
         );
}
// usage
log('Übergroße Äpfel mit Würmern'.encodeHTML());
//=> '&#220;bergro&#223;e &#196;pfel mit W&#252;rmern'
Lettuce answered 30/8, 2009 at 18:10 Comment(5)
this should be a lot faster then using text.replace(). I love how while(--i) is used instead of for() loop. I'm assuming the theory is that for large text/loops the fast condition test offsets the Array.reverse().join('') outside the loop. Otherwise you would have just used string concatenation?Swagsman
Thanks. Decrementing loops are faster than incrementing indeed, it's an optimization step I read about long time ago and I use it most of the time (it's also less code). I'm not sure if the 'reverse'-part weighs out the speed gain. For shorter strings it may be still faster to use aRet[i] = [value] in stead of aRet.push (as is very well explained by "olliej" in #614626).Lettuce
--i will equal 0 when reaching the first character. Your condition should be while (--i >= 0) or you'll lose the first character of the input string.Deandreadeane
@AliGangji, yes, should've been i--, adjusted the answerLettuce
This sample seems flawed for unicode characters; see https://mcmap.net/q/37658/-how-to-convert-unicode-characters-to-html-numeric-entities-using-plain-javascript for a better encodeHTML() optionStereometry
Y
4

Having a lookup table with a bazillion replace() calls is slow and not maintainable.

Fortunately, the build-in escape() function also encodes most of the same characters, and puts them in a consistent format (%XX, where XX is the hex value of the character).

So, you can let escape() method do most of the work for you and just change its answer to be HTML entities instead of URL-escaped characters:

htmlescaped = escape(mystring).replace(/%(..)/g,"&#x$1;");

This uses the hex format for escaping values rather than the named entities, but for storing and displaying the values, it works just as well as named entities.

Of course, escape also escapes characters you don't need to escape in HTML (spaces, for instance), but you can unescape them with a few replace calls.

Edit: I like bucabay's answer better than my own... handles a larger range of characters, and requires no hacking afterward to get spaces, slashes, etc. unescaped.

Yellowthroat answered 30/8, 2009 at 15:23 Comment(2)
This approach doesn’t take into account the character references overrides in the HTML Standard. For example, htmlEncode('\x80') should not return &#x80; or &#128;. In fact, it shouldn’t return an HTML entity at all; there is no valid way to represent that character in HTML. See my answer for more information, and for a better solution.Loxodromic
actually this doesn't work for characters outside the \u00FF rangeExhume
H
3

I fixed my problem by using encodeURIComponent() instead of escape().

This might be the fix for you if the problem happens when sending your string in a URL.

Try this with the phrase ("hi & % ‘")

escape() returns

"hi%20%26%20%25%20%u2018"

Notice the %u2018 isn't very url friendly and can break the rest of the query string.

encodeURI() returns

"hi%20&%20%25%20%E2%80%98"

Notice the ampersand is still there.

encodeURIComponent() returns

"hi%20%26%20%25%20%E2%80%98"

Finally, all of our characters are properly encoded.

Hendrick answered 10/4, 2015 at 20:30 Comment(0)
E
3

Demo on JSFiddle

here's a tiny stand alone method that:

  • attempts to consolidate the answers on this page, without using a library
  • works in older browsers
  • supports surrogate pairs (like emojis)
  • applies character overrides (what's that? not sure exactly)

i don't know too much about unicode, but it seems to be working well.

// escape a string for display in html
// see also: 
// polyfill for String.prototype.codePointAt
//   https://raw.githubusercontent.com/mathiasbynens/String.prototype.codePointAt/master/codepointat.js
// how to convert characters to html entities
//     https://mcmap.net/q/37583/-how-to-convert-characters-to-html-entities-using-plain-javascript
// html overrides from 
//   https://html.spec.whatwg.org/multipage/syntax.html#table-charref-overrides / https://mcmap.net/q/37583/-how-to-convert-characters-to-html-entities-using-plain-javascript/23831239#comment36668052_1354098

var _escape_overrides = { 0x00:'\uFFFD',0x80:'\u20AC',0x82:'\u201A',0x83:'\u0192',0x84:'\u201E',0x85:'\u2026',0x86:'\u2020',0x87:'\u2021',0x88:'\u02C6',0x89:'\u2030',0x8A:'\u0160',0x8B:'\u2039',0x8C:'\u0152',0x8E:'\u017D',0x91:'\u2018',0x92:'\u2019',0x93:'\u201C',0x94:'\u201D',0x95:'\u2022',0x96:'\u2013',0x97:'\u2014',0x98:'\u02DC',0x99:'\u2122',0x9A:'\u0161',0x9B:'\u203A',0x9C:'\u0153',0x9E:'\u017E',0x9F:'\u0178' }; 

function escapeHtml(str){
    return str.replace(/([\u0000-\uD799]|[\uD800-\uDBFF][\uDC00-\uFFFF])/g, function(c) {
        var c1 = c.charCodeAt(0);
        // ascii character, use override or escape
        if( c1 <= 0xFF ) return (c1=_escape_overrides[c1])?c1:escape(c).replace(/%(..)/g,"&#x$1;");
        // utf8/16 character
        else if( c.length == 1 ) return "&#" + c1 + ";"; 
        // surrogate pair
        else if( c.length == 2 && c1 >= 0xD800 && c1 <= 0xDBFF ) return "&#" + ((c1-0xD800)*0x400 + c.charCodeAt(1) - 0xDC00 + 0x10000) + ";"
        // no clue .. 
        else return "";
    });
}
Exhume answered 19/5, 2015 at 0:47 Comment(1)
This seems to fail when using str = &#9888;&#65039; Use the encodeHTML (https://mcmap.net/q/37583/-how-to-convert-characters-to-html-entities-using-plain-javascript) from KooiInc below insteadStereometry
F
1

Just reposting @bucababy's answer as a "bookmarklet", as it's sometimes easier than using those lookup pages:

alert(prompt('Enter characters to htmlEncode', '').replace(/[\u00A0-\u2666]/g, function(c) {
   return '&#'+c.charCodeAt(0)+';';
}));
Fritter answered 14/8, 2013 at 16:22 Comment(4)
This doesn’t work for astral symbols (try 𝌆, for example) and it has some other issues too. Instead of this bookmarklet, consider using my online HTML entity encoder/decoder.Loxodromic
@MathiasBynens you have obviously given this more thought and are correct, but it's still not a bookmarklet ;)Fritter
p.s. linking to astral symbols because I had no idea what @MathiasBynens was talking aboutFritter
IMHO adding http://mothereff.in/html-entities#%s as a custom search engine to my browser is much easier, but if you insist: javascript:void (function(){location='http://mothereff.in/html-entities#'+encodeURIComponent(pro‌​mpt('Enter text to HTML-encode:',''))}())Loxodromic
S
1

I recommend to use the JS library entities. Using the library is quite simple. See the examples from docs:

const entities = require("entities");
//encoding
entities.escape("&#38;"); // "&#x26;#38;"
entities.encodeXML("&#38;"); // "&amp;#38;"
entities.encodeHTML("&#38;"); // "&amp;&num;38&semi;"
//decoding
entities.decodeXML("asdf &amp; &#xFF; &#xFC; &apos;"); // "asdf & ÿ ü '"
entities.decodeHTML("asdf &amp; &yuml; &uuml; &apos;"); // "asdf & ÿ ü '"
Sunnisunnite answered 29/8, 2019 at 13:46 Comment(0)
W
0

Best solution is posted at phpjs.org implementation of PHP function htmlentities

The format is htmlentities(string, quote_style, charset, double_encode) Full documentation on the PHP function which is identical can be read here

Weighin answered 15/11, 2011 at 7:44 Comment(0)
J
-1

I adapted one of the answers from the referenced question, but added the ability to define an explicit mapping for character names.

var char_names = {
    160:'nbsp',
    161:'iexcl',
    220:'Uuml',
    223:'szlig',
    196:'Auml',
    252:'uuml',
    };

function HTMLEncode(str){
     var aStr = str.split(''),
         i = aStr.length,
         aRet = [];

     while (--i >= 0) {
      var iC = aStr[i].charCodeAt();
       if (iC < 32 || (iC > 32 && iC < 65) || iC > 127 || (iC>90 && iC<97)) {
        if(char_names[iC]!=undefined) {
         aRet.push('&'+char_names[iC]+';');
        }
        else {
         aRet.push('&#'+iC+';');
        }
       } else {
        aRet.push(aStr[i]);
       }
    }
    return aRet.reverse().join('');
   }

var text = "Übergroße Äpfel mit Würmer";

alert(HTMLEncode(text));
Jolinejoliotcurie answered 30/8, 2009 at 15:43 Comment(0)

© 2022 - 2024 — McMap. All rights reserved.