Windows 1252 to utf 8 javascript

I have some strings in the Dutch language. I know how to convert them using PHP: $str = iconv( "Windows-1252", "UTF-8", $str ); What would be the equivalent in JavaScript?

I did this using brute force, probably not the most elegant, but it works:

/**
 * Repairs text whose UTF-8 bytes were each turned into a separate character
 * (for example "P\xc3\xa8re" instead of "Père") by replacing every known
 * byte sequence with the single character it was meant to be.
 *
 * @param {string} s - Text that may contain mis-decoded sequences.
 * @returns {string} The text with every sequence listed in
 *     globalWin1252toUTF16table replaced by its intended character.
 */
function bruteForceWindows1252toUTF16(s) {
    let repaired = s;
    // Entries are applied in table order. replaceAll() returns the text
    // unchanged when nothing matches, so no prior includes() check is needed.
    for (const { win1252, utf16 } of globalWin1252toUTF16table) {
        repaired = repaired.replaceAll(win1252, utf16);
    }
    return repaired;
}


/**
 * Lookup table used by bruteForceWindows1252toUTF16.
 *
 * Each entry pairs `utf16` (the intended character) with `win1252` (that
 * character's UTF-8 bytes, one character per byte). It covers the 27
 * printable characters Windows-1252 places in 0x80-0x9F, followed by
 * U+00A0 through U+00FF.
 *
 * The entries are derived from the code points rather than typed by hand, so
 * the escape sequences cannot be lost or mistyped.
 */
const globalWin1252toUTF16table = (() => {
    // Printable characters of the Windows-1252 0x80-0x9F range, in byte order.
    const highPunctuation =
        '\u20AC\u201A\u0192\u201E\u2026\u2020\u2021\u02C6\u2030\u0160\u2039\u0152\u017D' +
        '\u2018\u2019\u201C\u201D\u2022\u2013\u2014\u02DC\u2122\u0161\u203A\u0153\u017E\u0178';
    // U+00A0..U+00FF, which Windows-1252 shares with ISO-8859-1.
    const latin1Supplement = Array.from(
        { length: 0x60 },
        (_, offset) => String.fromCharCode(0xA0 + offset),
    );
    const encoder = new TextEncoder();

    return [...highPunctuation, ...latin1Supplement].map((utf16) => ({
        // UTF-8 bytes of the character, stored as one code unit per byte.
        win1252: String.fromCharCode(...encoder.encode(utf16)),
        utf16,
    }));
})();

How do I convert the below string:

var string = "Bouchard+P%E8re+et+Fils"

using javascript into UTF-8, so that %E8 would become %C3%A8?

The reason is that this character seems to be tripping up decodeURIComponent.

You can test it out by dropping the string into http://meyerweb.com/eric/tools/dencoder/ and seeing the console error that says Uncaught URIError: URI malformed

I’m looking specifically for something that can decode an entire html document, that claims to be windows-1252 encoded which is where I assume this %E8 character is coming from, into UTF-8.

Thanks!

asked Aug 24, 2015 at 5:43

dot-punto-dot's user avatar

First create a map of Windows-1252. You can find references to the encoding using your search engine of choice.

For the sake of this example, I’m going to include only the character in your sample data.

Then find all the percentage signs followed by two hexadecimal characters, convert them to numbers, and convert them using the map (to get raw data), then convert them again using encodeURIComponent (to get the encoded data).

var string = "Bouchard+P%E8re+et+Fils"

// Characters for bytes 0x80-0x9F of Windows-1252, in byte order. The five
// bytes the code page leaves unassigned (0x81, 0x8D, 0x8F, 0x90, 0x9D) map to
// the control character with the same value, as the WHATWG Encoding Standard
// index does.
const w1252HighChars =
  '\u20AC\u0081\u201A\u0192\u201E\u2026\u2020\u2021\u02C6\u2030\u0160\u2039\u0152\u008D\u017D\u008F' +
  '\u0090\u2018\u2019\u201C\u201D\u2022\u2013\u2014\u02DC\u2122\u0161\u203A\u0153\u009D\u017E\u0178';

// Complete byte -> character map for Windows-1252. Outside 0x80-0x9F the byte
// value equals the Unicode code point.
const w2512chars = Array.from({ length: 256 }, (_, byte) =>
  byte >= 0x80 && byte <= 0x9f
    ? w1252HighChars.charAt(byte - 0x80)
    : String.fromCharCode(byte),
);

// One percent sign followed by exactly two hexadecimal digits.
const percent_encoded = /(%[a-fA-F0-9]{2})/g;

/**
 * Replacement callback for String.prototype.replace: re-encodes one
 * Windows-1252 percent escape as its UTF-8 percent-encoded form.
 *
 * @param {string} match - The whole matched text, e.g. "%E8".
 * @param {string} group - The first capture group (same text as `match`).
 * @returns {string} The character for that byte, passed through
 *     encodeURIComponent (e.g. "%C3%A8").
 */
function filter(match, group) {
  // Drop the leading "%" and read the two hex digits as the byte value.
  const number = Number.parseInt(group.slice(1), 16);
  return encodeURIComponent(w2512chars[number]);
}

string = string.replace(percent_encoded, filter);
alert(string);

answered Aug 24, 2015 at 5:59

Quentin's user avatar

QuentinQuentin

892k122 gold badges1194 silver badges1314 bronze badges

1

text-encoding

This is a polyfill for the Encoding Living
Standard API for the Web, allowing
encoding and decoding of textual data to and from Typed Array buffers
for binary data in JavaScript.

By default it adheres to the spec and does not support encoding to
legacy encodings, only decoding. It is also implemented to match the
specification’s algorithms, rather than for performance. The intended
use is within Web pages, so it has no dependency on server frameworks
or particular module schemes.

Basic examples and tests are included.

Install

There are a few ways you can get and use the text-encoding library.

HTML Page Usage

Clone the repo and include the files directly:

  <!-- Required for non-UTF encodings -->
  <script src="encoding-indexes.js"></script>
  <script src="encoding.js"></script>

This is the only use case the developer cares about. If you want those
fancy module and/or package manager things that are popular these days
you should probably use a different library.

Package Managers

The package is published to npm and bower as text-encoding.
Use through these is not really supported, since they aren’t used by
the developer of the library. Using require() in interesting ways
probably breaks. Patches welcome, as long as they don’t break the
basic use of the files via <script>.

API Overview

Basic Usage

  var uint8array = new TextEncoder().encode(string);
  var string = new TextDecoder(encoding).decode(uint8array);

Streaming Decode

  // Accumulate decoded text chunk by chunk. `encoding` and `next_chunk()` are
  // supplied by the surrounding application; the loop ends when next_chunk()
  // returns a falsy value.
  var string = "", decoder = new TextDecoder(encoding), buffer;
  while (buffer = next_chunk()) {
    // NOTE(review): {stream:true} presumably tells the decoder more input
    // follows, so a character split across chunks is not cut off. Confirm
    // against the Encoding Standard.
    string += decoder.decode(buffer, {stream:true});
  }
  string += decoder.decode(); // finish the stream

Encodings

All encodings from the Encoding specification are supported:

utf-8 ibm866 iso-8859-2 iso-8859-3 iso-8859-4 iso-8859-5 iso-8859-6
iso-8859-7 iso-8859-8 iso-8859-8-i iso-8859-10 iso-8859-13 iso-8859-14
iso-8859-15 iso-8859-16 koi8-r koi8-u macintosh windows-874
windows-1250 windows-1251 windows-1252 windows-1253 windows-1254
windows-1255 windows-1256 windows-1257 windows-1258 x-mac-cyrillic
gb18030 hz-gb-2312 big5 euc-jp iso-2022-jp shift_jis euc-kr
replacement utf-16be utf-16le x-user-defined

(Some encodings may be supported under other names, e.g. ascii,
iso-8859-1, etc. See Encoding for
additional labels for each encoding.)

Encodings other than utf-8, utf-16le and utf-16be require
an additional encoding-indexes.js file to be included. It is rather
large (596kB uncompressed, 188kB gzipped); portions may be deleted if
support for some encodings is not required.

Non-Standard Behavior

As required by the specification, only encoding to utf-8 is
supported. If you want to try it out, you can force a non-standard
behavior by passing the NONSTANDARD_allowLegacyEncoding option to
TextEncoder and a label. For example:

var uint8array = new TextEncoder(
  'windows-1252', { NONSTANDARD_allowLegacyEncoding: true }).encode(text);

But note that the above won’t work if you’re using the polyfill in a
browser that natively supports the TextEncoder API, since the
polyfill won’t be used!

You can force the polyfill to be used by using this before the polyfill:

<script>
window.TextEncoder = window.TextDecoder = null;
</script>

To support the legacy encodings (which may be stateful), the
TextEncoder encode() method accepts an optional dictionary and
stream option, e.g. encoder.encode(string, {stream: true}); This
is not needed for standard encoding since the input is always in
complete code points.

Если ISO-8859-1 достаточно близок, есть специальный ярлык для преобразования ISO-8859-1-байтов в кодовых единицах в символы Unicode из-за простого сопоставления байт = кодовая точка:

var chars= decodeURIComponent(escape(bytes));

Для любой другой кодировки встроенного функционала нет; вам нужно будет включить свои собственные таблицы поиска. Например:

/**
 * Decoding tables for single-byte encodings. Each value is a 256-character
 * string in which the character at index N is the Unicode character for byte
 * value N. Bytes with no assigned character hold U+FFFD.
 */
const encodings = (() => {
    // Builds the string of consecutive code points first..last (inclusive).
    const run = (first, last) =>
        Array.from({ length: last - first + 1 }, (_, i) => String.fromCharCode(first + i)).join('');

    // Bytes 0x00-0x7F are plain ASCII in both code pages.
    const ascii = run(0x00, 0x7f);

    return {
        // Windows code page 1252 Western European
        //
        cp1252: ascii +
            '\u20ac\ufffd\u201a\u0192\u201e\u2026\u2020\u2021\u02c6\u2030\u0160\u2039\u0152\ufffd\u017d\ufffd' +
            '\ufffd\u2018\u2019\u201c\u201d\u2022\u2013\u2014\u02dc\u2122\u0161\u203a\u0153\ufffd\u017e\u0178' +
            run(0xa0, 0xff),

        // Windows code page 1251 Cyrillic
        //
        cp1251: ascii +
            '\u0402\u0403\u201a\u0453\u201e\u2026\u2020\u2021\u20ac\u2030\u0409\u2039\u040a\u040c\u040b\u040f' +
            '\u0452\u2018\u2019\u201c\u201d\u2022\u2013\u2014\ufffd\u2122\u0459\u203a\u045a\u045c\u045b\u045f' +
            '\xa0\u040e\u045e\u0408\xa4\u0490\xa6\xa7\u0401\xa9\u0404\xab\xac\xad\xae\u0407' +
            '\xb0\xb1\u0406\u0456\u0491\xb5\xb6\xb7\u0451\u2116\u0454\xbb\u0458\u0405\u0455\u0457' +
            run(0x0410, 0x044f),
    };
})();

/**
 * Decodes a "byte string" (one character per byte, code 0-255) using one of
 * the tables in `encodings`.
 *
 * @param {string} bytes - Input in which each character code is a byte value.
 * @param {string} encoding - Key of the table to use, e.g. 'cp1251'.
 * @returns {string} The decoded text. Character codes above 0xFF are not
 *     bytes and are decoded as U+FFFD.
 * @throws {TypeError} If `encoding` is not a key of `encodings`.
 */
function decodeBytes(bytes, encoding) {
    const table = encodings[encoding];
    // The typeof check also rejects inherited keys such as 'toString'.
    if (typeof table !== 'string') {
        throw new TypeError(`Unsupported encoding: ${encoding}`);
    }

    let chars = '';
    for (let i = 0; i < bytes.length; i++) {
        const byte = bytes.charCodeAt(i);
        // charAt() past the table end would return '' and silently drop the
        // character, so substitute the replacement character instead.
        chars += byte < table.length ? table.charAt(byte) : '\ufffd';
    }
    return chars;
}

// Demo: decode a Windows-1251 byte string and show the result.
alert(decodeBytes('\xc7\xe4\xf0\xe0\xe2\xf1\xf2\xe2\xf3\xe9 \xec\xe8\xf0', 'cp1251'));
// '\u0417\u0434\u0440\u0430\u0432\u0441\u0442\u0432\u0443\u0439 \u043c\u0438\u0440'
// Здравствуй мир  (Russian for "Hello, world")

ETA:

Итак, я получаю необработанную страницу html сразу после ее загрузки (в моем аддоне firefox), запускаю функцию javascript, которая анализирует эту страницу с помощью регулярного выражения.

Да, не делай этого. Вы не можете анализировать HTML с помощью регулярного выражения.

Почему бы не позволить Firefox позаботиться о парсинге страницы для данной кодировки?

ETA (2):

'koi8-r': 'x00x01x02x03x04x05x06x07x08tnx0bx0crx0ex0fx10x11x12x13x14x15x16x17x18x19x1ax1bx1cx1dx1ex1f !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~x7fu2500u2502u250cu2510u2514u2518u251cu2524u252cu2534u253cu2580u2584u2588u258cu2590u2591u2592u2593u2320u25a0u2219u221au2248u2264u2265xa0u2321xb0xb2xb7xf7u2550u2551u2552u0451u2553u2554u2555u2556u2557u2558u2559u255au255bu255cu255du255eu255fu2560u2561u0401u2562u2563u2564u2565u2566u2567u2568u2569u256au256bu256cxa9u044eu0430u0431u0446u0434u0435u0444u0433u0445u0438u0439u043au043bu043cu043du043eu043fu044fu0440u0441u0442u0443u0436u0432u044cu044bu0437u0448u044du0449u0447u044au042eu0410u0411u0426u0414u0415u0424u0413u0425u0418u0419u041au041bu041cu041du041eu041fu042fu0420u0421u0422u0423u0416u0412u042cu042bu0417u0428u042du0429u0427u042a'

(Вы можете получить сопоставления для однобайтовых кодировок из Python, сказав что-то вроде :)

>>> ''.join(map(chr, range(256))).decode('koi8-r', 'replace')

Я не знаю, как вы собираетесь читать входной поток, но обычно вам не нужно выполнять такого рода распутывание кодировки вручную.

Понравилась статья? Поделиться с друзьями:
  • Windows 1252 is not a supported encoding name
  • Windows 1251 таблица кодов символов windows win 1251
  • Windows 1251 сколько бит на символ
  • Windows 1251 сколько байт на символ
  • Windows 1251 кодировка что это такое