Here is code to transliterate Unicode characters to their closest ASCII equivalents where possible. It removes or fixes accents, macrons, typesetter's colons, dashes, curly quotes, apostrophes, invisible spaces, and other bad characters.
This is useful if you need to feed data into another system that does not support Unicode. The code is fast because it uses a StringBuilder and a single simple loop (tested: an 8,000-character string processed 10,000 times took 1.1 sec).
Address:123 East Tāmaki – Tāmaki“ ” GötheФ€ O’Briens ‘hello’ he said!
outputs ->
Address:123 East Tamaki - Tamaki" " Gothe O'Briens 'hello' he said!
/// <summary>
/// Transliterates Unicode text to its closest 7-bit ASCII equivalent.
/// Removes/fixes accents, Maori macrons, typesetter's punctuation, dashes, curly quotes,
/// apostrophes, invisible spaces, and other characters that have no ASCII form.
/// 1. removes accents but keeps the base letters
/// 2. maps punctuation to the closest ASCII punctuation
/// 3. removes any remaining non-ASCII chars (everything above U+007F)
/// 4. removes control chars (this includes tab), except line breaks when they are kept
/// </summary>
/// <param name="text">The text to clean. May be null.</param>
/// <param name="removeLineBreaks">
/// When true, carriage returns and line feeds are dropped; when false they are kept as-is.
/// </param>
/// <returns>
/// The cleaned text containing only ASCII characters, or null when <paramref name="text"/> is null.
/// </returns>
/// <example>"CHASSIS NO.:LC0CE4CB3N0345426 East Tāmaki – East Tāmaki“ ” GötheФ€ O’Briens ‘hello’ he said!" outputs "CHASSIS NO.:LC0CE4CB3N0345426 East Tamaki - East Tamaki" " Gothe O'Briens 'hello' he said!"</example>
public static string CleanUnicodeTransliterateToAscii(string text, bool removeLineBreaks) {
    if (text == null) return null;

    // Compatibility decomposition: splits accented letters into base letter + combining mark
    // and folds compatibility characters (e.g. non-breaking space, ligatures) to plain forms.
    var normalized = text.Normalize(NormalizationForm.FormKD);

    // The output can never be longer than the normalized input, so presize the buffer.
    var stringBuilder = new StringBuilder(normalized.Length);
    foreach (var c in normalized) {
        // Line breaks are checked first because they are also in the Control category.
        if (c == '\r' || c == '\n') {
            if (!removeLineBreaks) {
                stringBuilder.Append(c);
            }
            continue;
        }

        var unicodeCategory = CharUnicodeInfo.GetUnicodeCategory(c);
        if (unicodeCategory == UnicodeCategory.Control) {
            // control char - skip
        } else if (unicodeCategory == UnicodeCategory.NonSpacingMark) {
            // diacritic mark/accent left over from the decomposition - skip
        } else if (c == '\u2018' || c == '\u2019') {
            // single curly quote or apostrophe -> straight apostrophe
            stringBuilder.Append('\'');
        } else if (unicodeCategory == UnicodeCategory.InitialQuotePunctuation || unicodeCategory == UnicodeCategory.FinalQuotePunctuation) {
            // any other typographic quote -> straight double quote
            stringBuilder.Append('"');
        } else if (unicodeCategory == UnicodeCategory.DashPunctuation) {
            // en dash, em dash, etc. -> hyphen-minus
            stringBuilder.Append('-');
        } else if (unicodeCategory == UnicodeCategory.SpaceSeparator) {
            // any kind of space -> normal space
            stringBuilder.Append(' ');
        } else if (c > 127) {
            // ASCII ends at U+007F; skip everything else, including Latin-1 symbols
            // and letters (e.g. ©, ß) that have no decomposition to an ASCII character.
        } else {
            stringBuilder.Append(c);
        }
    }

    return stringBuilder.ToString();
}