Brains Coders

Brains Engineering Blog

Encoding issue on accented (and other) characters in Latin alphabet

If you have uploaded data to SQL Server using the wrong collation, or imported a text file with the wrong encoding, you could end with data full of strange characters.

The problem is being caused by UTF-8/16 bytes being interpreted as Windows-1252 (or ISO 8859-1) bytes


Example: SQL Script to update three different table containing geographic names.

--ADMIN 0

Update [Adm0GeoBoundaries] Set [Adm0NameLocalized] =[Adm0Name];


Update [Adm0GeoBoundaries] Set [Adm0Name] = replace([Adm0Name], 'ä', 'a'),[Adm0NameLocalized] = replace([Adm0NameLocalized], 'ä', N'ä');

Update [Adm0GeoBoundaries] Set [Adm0Name] = replace([Adm0Name], 'á', 'a'),[Adm0NameLocalized] = replace([Adm0NameLocalized], 'á', N'á');

Update [Adm0GeoBoundaries] Set [Adm0Name] = replace([Adm0Name], 'â', 'a'),[Adm0NameLocalized] = replace([Adm0NameLocalized], 'â', N'â');

Update [Adm0GeoBoundaries] Set [Adm0Name] = replace([Adm0Name], 'ã', 'a'),[Adm0NameLocalized] = replace([Adm0NameLocalized], 'ã', N'ã');

Update [Adm0GeoBoundaries] Set [Adm0Name] = replace([Adm0Name], 'Ã¥', 'a'),[Adm0NameLocalized] = replace([Adm0NameLocalized], 'Ã¥', N'å');

Update [Adm0GeoBoundaries] Set [Adm0Name] = replace([Adm0Name], 'æ', 'a'),[Adm0NameLocalized] = replace([Adm0NameLocalized], 'æ', N'æ');


Update [Adm0GeoBoundaries] Set [Adm0Name] = replace([Adm0Name], 'é', 'e'),[Adm0NameLocalized] = replace([Adm0NameLocalized], 'é', N'é');

Update [Adm0GeoBoundaries] Set [Adm0Name] = replace([Adm0Name], 'è', 'e'),[Adm0NameLocalized] = replace([Adm0NameLocalized], 'è', N'è');

Update [Adm0GeoBoundaries] Set [Adm0Name] = replace([Adm0Name], 'ê', 'e'),[Adm0NameLocalized] = replace([Adm0NameLocalized], 'é', N'ê');

Update [Adm0GeoBoundaries] Set [Adm0Name] = replace([Adm0Name], 'ë', 'e'),[Adm0NameLocalized] = replace([Adm0NameLocalized], 'ë', N'ë');


Update [Adm0GeoBoundaries] Set [Adm0Name] = replace([Adm0Name], 'ì', 'i'),[Adm0NameLocalized] = replace([Adm0NameLocalized], 'ì', N'ì');

Update [adm0GeoBoundaries] Set [adm0Name] = replace([adm0Name], 'í', 'i'),[adm0NameLocalized] = replace([adm0NameLocalized], 'í', N'í');

Update [Adm0GeoBoundaries] Set [Adm0Name] = replace([Adm0Name], 'î', 'i'),[Adm0NameLocalized] = replace([Adm0NameLocalized], 'î', N'î');

Update [Adm0GeoBoundaries] Set [Adm0Name] = replace([Adm0Name], 'ï', 'i'),[Adm0NameLocalized] = replace([Adm0NameLocalized], 'ï', N'ï');


Update [Adm0GeoBoundaries] Set [Adm0Name] = replace([Adm0Name], 'ò', 'o'),[Adm0NameLocalized] = replace([Adm0NameLocalized], 'ò', N'ò');

Update [Adm0GeoBoundaries] Set [Adm0Name] = replace([Adm0Name], 'ó', 'o'),[Adm0NameLocalized] = replace([Adm0NameLocalized], 'ó', N'ó');

Update [Adm0GeoBoundaries] Set [Adm0Name] = replace([Adm0Name], 'ô', 'o'),[Adm0NameLocalized] = replace([Adm0NameLocalized], 'ô', N'ô');

Update [Adm0GeoBoundaries] Set [Adm0Name] = replace([Adm0Name], 'õ', 'o'),[Adm0NameLocalized] = replace([Adm0NameLocalized], 'õ', N'õ');

Update [Adm0GeoBoundaries] Set [Adm0Name] = replace([Adm0Name], 'ö', 'o'),[Adm0NameLocalized] = replace([Adm0NameLocalized], 'ö', N'ö');


Update [Adm0GeoBoundaries] Set [Adm0Name] = replace([Adm0Name], 'ù', 'u'),[Adm0NameLocalized] = replace([Adm0NameLocalized], 'ù', N'ù');

Update [Adm0GeoBoundaries] Set [Adm0Name] = replace([Adm0Name], 'ú', 'u'),[Adm0NameLocalized] = replace([Adm0NameLocalized], 'ú', N'ú');

Update [Adm0GeoBoundaries] Set [Adm0Name] = replace([Adm0Name], 'û', 'u'),[Adm0NameLocalized] = replace([Adm0NameLocalized], 'û', N'û');

Update [Adm0GeoBoundaries] Set [Adm0Name] = replace([Adm0Name], 'ü', 'u'),[Adm0NameLocalized] = replace([Adm0NameLocalized], 'ü', N'ü');


Update [Adm0GeoBoundaries] Set [Adm0Name] = replace([Adm0Name], 'ç', N'ç'),[Adm0NameLocalized] = replace([Adm0NameLocalized], 'ç', N'ç');

Update [Adm0GeoBoundaries] Set [Adm0Name] = replace([Adm0Name], 'ñ', N'ñ'),[Adm0NameLocalized] = replace([Adm0NameLocalized], 'ñ', N'ñ');

Update [Adm0GeoBoundaries] Set [Adm0Name] = replace([Adm0Name], '°', N'°'),[Adm0NameLocalized] = replace([Adm0NameLocalized], '°', N'°');


--ADMIN1

Update [adm1GeoBoundaries] Set [adm1NameLocalized] =[adm1Name];

BEGIN TRAN T1;

Update [adm1GeoBoundaries] Set [adm1Name] = replace([adm1Name], 'ä', 'a'),[adm1NameLocalized] = replace([adm1NameLocalized], 'ä', N'ä');

Update [adm1GeoBoundaries] Set [adm1Name] = replace([adm1Name], 'á', 'a'),[adm1NameLocalized] = replace([adm1NameLocalized], 'á', N'á');

Update [adm1GeoBoundaries] Set [adm1Name] = replace([adm1Name], 'â', 'a'),[adm1NameLocalized] = replace([adm1NameLocalized], 'â', N'â');

Update [adm1GeoBoundaries] Set [adm1Name] = replace([adm1Name], 'ã', 'a'),[adm1NameLocalized] = replace([adm1NameLocalized], 'ã', N'ã');

Update [adm1GeoBoundaries] Set [adm1Name] = replace([adm1Name], 'Ã¥', 'a'),[adm1NameLocalized] = replace([adm1NameLocalized], 'Ã¥', N'å');

Update [adm1GeoBoundaries] Set [adm1Name] = replace([adm1Name], 'æ', 'a'),[adm1NameLocalized] = replace([adm1NameLocalized], 'æ', N'æ');


Update [adm1GeoBoundaries] Set [adm1Name] = replace([adm1Name], 'é', 'e'),[adm1NameLocalized] = replace([adm1NameLocalized], 'é', N'é');

Update [adm1GeoBoundaries] Set [adm1Name] = replace([adm1Name], 'è', 'e'),[adm1NameLocalized] = replace([adm1NameLocalized], 'è', N'è');

Update [adm1GeoBoundaries] Set [adm1Name] = replace([adm1Name], 'ê', 'e'),[adm1NameLocalized] = replace([adm1NameLocalized], 'é', N'ê');

Update [adm1GeoBoundaries] Set [adm1Name] = replace([adm1Name], 'ë', 'e'),[adm1NameLocalized] = replace([adm1NameLocalized], 'ë', N'ë');


Update [adm1GeoBoundaries] Set [adm1Name] = replace([adm1Name], 'ì', 'i'),[adm1NameLocalized] = replace([adm1NameLocalized], 'ì', N'ì');

Update [adm1GeoBoundaries] Set [adm1Name] = replace([adm1Name], 'í', 'i'),[adm1NameLocalized] = replace([adm1NameLocalized], 'í', N'í');

Update [adm1GeoBoundaries] Set [adm1Name] = replace([adm1Name], 'î', 'i'),[adm1NameLocalized] = replace([adm1NameLocalized], 'î', N'î');

Update [adm1GeoBoundaries] Set [adm1Name] = replace([adm1Name], 'ï', 'i'),[adm1NameLocalized] = replace([adm1NameLocalized], 'ï', N'ï');

COMMIT TRAN T1;

BEGIN TRAN T2;

Update [adm1GeoBoundaries] Set [adm1Name] = replace([adm1Name], 'ò', 'o'),[adm1NameLocalized] = replace([adm1NameLocalized], 'ò', N'ò');

Update [adm1GeoBoundaries] Set [adm1Name] = replace([adm1Name], 'ó', 'o'),[adm1NameLocalized] = replace([adm1NameLocalized], 'ó', N'ó');

Update [adm1GeoBoundaries] Set [adm1Name] = replace([adm1Name], 'ô', 'o'),[adm1NameLocalized] = replace([adm1NameLocalized], 'ô', N'ô');

Update [adm1GeoBoundaries] Set [adm1Name] = replace([adm1Name], 'õ', 'o'),[adm1NameLocalized] = replace([adm1NameLocalized], 'õ', N'õ');

Update [adm1GeoBoundaries] Set [adm1Name] = replace([adm1Name], 'ö', 'o'),[adm1NameLocalized] = replace([adm1NameLocalized], 'ö', N'ö');


Update [adm1GeoBoundaries] Set [adm1Name] = replace([adm1Name], 'ù', 'u'),[adm1NameLocalized] = replace([adm1NameLocalized], 'ù', N'ù');

Update [adm1GeoBoundaries] Set [adm1Name] = replace([adm1Name], 'ú', 'u'),[adm1NameLocalized] = replace([adm1NameLocalized], 'ú', N'ú');

Update [adm1GeoBoundaries] Set [adm1Name] = replace([adm1Name], 'û', 'u'),[adm1NameLocalized] = replace([adm1NameLocalized], 'û', N'û');

Update [adm1GeoBoundaries] Set [adm1Name] = replace([adm1Name], 'ü', 'u'),[adm1NameLocalized] = replace([adm1NameLocalized], 'ü', N'ü');


Update [adm1GeoBoundaries] Set [adm1Name] = replace([adm1Name], 'ç', N'ç'),[adm1NameLocalized] = replace([adm1NameLocalized], 'ç', N'ç');

Update [adm1GeoBoundaries] Set [adm1Name] = replace([adm1Name], 'ñ', N'ñ'),[adm1NameLocalized] = replace([adm1NameLocalized], 'ñ', N'ñ');

Update [adm1GeoBoundaries] Set [adm1Name] = replace([adm1Name], '°', N'°'),[adm1NameLocalized] = replace([adm1NameLocalized], '°', N'°');

COMMIT TRAN T2;


--ADMIN2

Update [adm2GeoBoundaries] Set [adm2NameLocalized] =[adm2Name];


BEGIN TRAN T3;

Update [adm2GeoBoundaries] Set [adm2Name] = replace([adm2Name], 'ä', 'a'),[adm2NameLocalized] = replace([adm2NameLocalized], 'ä', N'ä');

Update [adm2GeoBoundaries] Set [adm2Name] = replace([adm2Name], 'á', 'a'),[adm2NameLocalized] = replace([adm2NameLocalized], 'á', N'á');

Update [adm2GeoBoundaries] Set [adm2Name] = replace([adm2Name], 'â', 'a'),[adm2NameLocalized] = replace([adm2NameLocalized], 'â', N'â');

Update [adm2GeoBoundaries] Set [adm2Name] = replace([adm2Name], 'ã', 'a'),[adm2NameLocalized] = replace([adm2NameLocalized], 'ã', N'ã');

Update [adm2GeoBoundaries] Set [adm2Name] = replace([adm2Name], 'Ã¥', 'a'),[adm2NameLocalized] = replace([adm2NameLocalized], 'Ã¥', N'å');

Update [adm2GeoBoundaries] Set [adm2Name] = replace([adm2Name], 'æ', 'a'),[adm2NameLocalized] = replace([adm2NameLocalized], 'æ', N'æ');


Update [adm2GeoBoundaries] Set [adm2Name] = replace([adm2Name], 'é', 'e'),[adm2NameLocalized] = replace([adm2NameLocalized], 'é', N'é');

Update [adm2GeoBoundaries] Set [adm2Name] = replace([adm2Name], 'è', 'e'),[adm2NameLocalized] = replace([adm2NameLocalized], 'è', N'è');

Update [adm2GeoBoundaries] Set [adm2Name] = replace([adm2Name], 'ê', 'e'),[adm2NameLocalized] = replace([adm2NameLocalized], 'é', N'ê');

Update [adm2GeoBoundaries] Set [adm2Name] = replace([adm2Name], 'ë', 'e'),[adm2NameLocalized] = replace([adm2NameLocalized], 'ë', N'ë');


Update [adm2GeoBoundaries] Set [adm2Name] = replace([adm2Name], 'ì', 'i'),[adm2NameLocalized] = replace([adm2NameLocalized], 'ì', N'ì');

Update [adm2GeoBoundaries] Set [adm2Name] = replace([adm2Name], 'í', 'i'),[adm2NameLocalized] = replace([adm2NameLocalized], 'í', N'í');

Update [adm2GeoBoundaries] Set [adm2Name] = replace([adm2Name], 'î', 'i'),[adm2NameLocalized] = replace([adm2NameLocalized], 'î', N'î');

Update [adm2GeoBoundaries] Set [adm2Name] = replace([adm2Name], 'ï', 'i'),[adm2NameLocalized] = replace([adm2NameLocalized], 'ï', N'ï');

COMMIT TRAN T3;

BEGIN TRAN T4;

Update [adm2GeoBoundaries] Set [adm2Name] = replace([adm2Name], 'ò', 'o'),[adm2NameLocalized] = replace([adm2NameLocalized], 'ò', N'ò');

Update [adm2GeoBoundaries] Set [adm2Name] = replace([adm2Name], 'ó', 'o'),[adm2NameLocalized] = replace([adm2NameLocalized], 'ó', N'ó');

Update [adm2GeoBoundaries] Set [adm2Name] = replace([adm2Name], 'ô', 'o'),[adm2NameLocalized] = replace([adm2NameLocalized], 'ô', N'ô');

Update [adm2GeoBoundaries] Set [adm2Name] = replace([adm2Name], 'õ', 'o'),[adm2NameLocalized] = replace([adm2NameLocalized], 'õ', N'õ');

Update [adm2GeoBoundaries] Set [adm2Name] = replace([adm2Name], 'ö', 'o'),[adm2NameLocalized] = replace([adm2NameLocalized], 'ö', N'ö');


Update [adm2GeoBoundaries] Set [adm2Name] = replace([adm2Name], 'ù', 'u'),[adm2NameLocalized] = replace([adm2NameLocalized], 'ù', N'ù');

Update [adm2GeoBoundaries] Set [adm2Name] = replace([adm2Name], 'ú', 'u'),[adm2NameLocalized] = replace([adm2NameLocalized], 'ú', N'ú');

Update [adm2GeoBoundaries] Set [adm2Name] = replace([adm2Name], 'û', 'u'),[adm2NameLocalized] = replace([adm2NameLocalized], 'û', N'û');

Update [adm2GeoBoundaries] Set [adm2Name] = replace([adm2Name], 'ü', 'u'),[adm2NameLocalized] = replace([adm2NameLocalized], 'ü', N'ü');


Update [adm2GeoBoundaries] Set [adm2Name] = replace([adm2Name], 'ç', N'ç'),[adm2NameLocalized] = replace([adm2NameLocalized], 'ç', N'ç');

Update [adm2GeoBoundaries] Set [adm2Name] = replace([adm2Name], 'ñ', N'ñ'),[adm2NameLocalized] = replace([adm2NameLocalized], 'ñ', N'ñ');

Update [adm2GeoBoundaries] Set [adm2Name] = replace([adm2Name], '°', N'°'),[adm2NameLocalized] = replace([adm2NameLocalized], '°', N'°');

COMMIT TRAN T4;


The script has been built on the following table:






Table for Debugging Common UTF-8 Character Encoding Problems.
Code Point CharactersUTF-8 Bytes  Code Point CharactersUTF-8 Bytes
UnicodeWindows
1252
ExpectedActual UnicodeWindows
1252
ExpectedActual
U+20AC 0x80 € %E2 %82 %AC   U+00C0 0xC0 ÀÀ %C3 %80
0x81   U+00C1 0xC1 ÁÃ %C3 %81
U+201A 0x82 ‚ %E2 %80 %9A   U+00C2 0xC2 ÂÂ %C3 %82
U+0192 0x83 ƒÆ’ %C6 %92   U+00C3 0xC3 Ãà %C3 %83
U+201E 0x84 „ %E2 %80 %9E   U+00C4 0xC4 ÄÄ %C3 %84
U+2026 0x85 … %E2 %80 %A6   U+00C5 0xC5 ÅÃ… %C3 %85
U+2020 0x86 †%E2 %80 %A0   U+00C6 0xC6 ÆÆ %C3 %86
U+2021 0x87 ‡ %E2 %80 %A1   U+00C7 0xC7 ÇÇ %C3 %87
U+02C6 0x88 ˆË† %CB %86   U+00C8 0xC8 ÈÈ %C3 %88
U+2030 0x89 ‰ %E2 %80 %B0   U+00C9 0xC9 ÉÉ %C3 %89
U+0160 0x8A ŠÅ %C5 %A0   U+00CA 0xCA ÊÊ %C3 %8A
U+2039 0x8B ‹ %E2 %80 %B9   U+00CB 0xCB ËË %C3 %8B
U+0152 0x8C ŒÅ’ %C5 %92   U+00CC 0xCC ÌÃŒ %C3 %8C
0x8D   U+00CD 0xCD ÍÃ %C3 %8D
U+017D 0x8E ŽÅ½ %C5 %BD   U+00CE 0xCE ÎÃŽ %C3 %8E
0x8F   U+00CF 0xCF ÏÃ %C3 %8F
0x90   U+00D0 0xD0 ÐÃ %C3 %90
U+2018 0x91 ‘ %E2 %80 %98   U+00D1 0xD1 ÑÑ %C3 %91
U+2019 0x92 ’ %E2 %80 %99   U+00D2 0xD2 ÒÃ’ %C3 %92
U+201C 0x93 “ %E2 %80 %9C   U+00D3 0xD3 ÓÓ %C3 %93
U+201D 0x94 †%E2 %80 %9D   U+00D4 0xD4 ÔÔ %C3 %94
U+2022 0x95 • %E2 %80 %A2   U+00D5 0xD5 ÕÕ %C3 %95
U+2013 0x96 – %E2 %80 %93   U+00D6 0xD6 ÖÖ %C3 %96
U+2014 0x97 — %E2 %80 %94   U+00D7 0xD7 ×× %C3 %97
U+02DC 0x98 ˜Ëœ %CB %9C   U+00D8 0xD8 ØØ %C3 %98
U+2122 0x99 â„¢ %E2 %84 %A2   U+00D9 0xD9 ÙÙ %C3 %99
U+0161 0x9A šÅ¡ %C5 %A1   U+00DA 0xDA ÚÚ %C3 %9A
U+203A 0x9B › %E2 %80 %BA   U+00DB 0xDB ÛÛ %C3 %9B
U+0153 0x9C œÅ“ %C5 %93   U+00DC 0xDC ÜÃœ %C3 %9C
0x9D   U+00DD 0xDD ÝÃ %C3 %9D
U+017E 0x9E žÅ¾ %C5 %BE   U+00DE 0xDE ÞÞ %C3 %9E
U+0178 0x9F ŸÅ¸ %C5 %B8   U+00DF 0xDF ßß %C3 %9F
U+00A0 0xA0  %C2 %A0   U+00E0 0xE0 àà %C3 %A0
U+00A1 0xA1 ¡Â¡ %C2 %A1   U+00E1 0xE1 áá %C3 %A1
U+00A2 0xA2 ¢Â¢ %C2 %A2   U+00E2 0xE2 ââ %C3 %A2
U+00A3 0xA3 £Â£ %C2 %A3   U+00E3 0xE3 ãã %C3 %A3
U+00A4 0xA4 ¤Â¤ %C2 %A4   U+00E4 0xE4 ää %C3 %A4
U+00A5 0xA5 ¥Â¥ %C2 %A5   U+00E5 0xE5 åÃ¥ %C3 %A5
U+00A6 0xA6 ¦Â¦ %C2 %A6   U+00E6 0xE6 ææ %C3 %A6
U+00A7 0xA7 §Â§ %C2 %A7   U+00E7 0xE7 çç %C3 %A7
U+00A8 0xA8 ¨Â¨ %C2 %A8   U+00E8 0xE8 èè %C3 %A8
U+00A9 0xA9 ©Â© %C2 %A9   U+00E9 0xE9 éé %C3 %A9
U+00AA 0xAA ªÂª %C2 %AA   U+00EA 0xEA êê %C3 %AA
U+00AB 0xAB «Â« %C2 %AB   U+00EB 0xEB ëë %C3 %AB
U+00AC 0xAC ¬Â¬ %C2 %AC   U+00EC 0xEC ìì %C3 %AC
U+00AD 0xAD ­Â­ %C2 %AD   U+00ED 0xED íÃ-­ %C3 %AD
U+00AE 0xAE ®Â® %C2 %AE   U+00EE 0xEE îî %C3 %AE
U+00AF 0xAF ¯Â¯ %C2 %AF   U+00EF 0xEF ïï %C3 %AF
U+00B0 0xB0 °Â° %C2 %B0   U+00F0 0xF0 ðð %C3 %B0
U+00B1 0xB1 ±Â± %C2 %B1   U+00F1 0xF1 ññ %C3 %B1
U+00B2 0xB2 ²Â² %C2 %B2   U+00F2 0xF2 òò %C3 %B2
U+00B3 0xB3 ³Â³ %C2 %B3   U+00F3 0xF3 óó %C3 %B3
U+00B4 0xB4 ´Â´ %C2 %B4   U+00F4 0xF4 ôô %C3 %B4
U+00B5 0xB5 µÂµ %C2 %B5   U+00F5 0xF5 õõ %C3 %B5
U+00B6 0xB6 ¶ %C2 %B6   U+00F6 0xF6 öö %C3 %B6
U+00B7 0xB7 ·Â· %C2 %B7   U+00F7 0xF7 ÷÷ %C3 %B7
U+00B8 0xB8 ¸Â¸ %C2 %B8   U+00F8 0xF8 øø %C3 %B8
U+00B9 0xB9 ¹Â¹ %C2 %B9   U+00F9 0xF9 ùù %C3 %B9
U+00BA 0xBA ºÂº %C2 %BA   U+00FA 0xFA úú %C3 %BA
U+00BB 0xBB »Â» %C2 %BB   U+00FB 0xFB ûû %C3 %BB
U+00BC 0xBC ¼Â¼ %C2 %BC   U+00FC 0xFC üü %C3 %BC
U+00BD 0xBD ½Â½ %C2 %BD   U+00FD 0xFD ýý %C3 %BD
U+00BE 0xBE ¾Â¾ %C2 %BE   U+00FE 0xFE þþ %C3 %BE
U+00BF 0xBF ¿Â¿ %C2 %BF   U+00FF 0xFF ÿÿ %C3 %BF
Loading