Javascript convert windows 1251 to utf 8

I need to convert a string from Windows-1251 to UTF-8.

I tried to do this with iconv, but all I get is something like this:

пїЅпїЅпїЅпїЅпїЅ пїЅпїЅпїЅпїЅпїЅпїЅ пїЅпїЅпїЅпїЅпїЅпїЅпїЅпїЅ

// Build a converter from Windows-1251 to UTF-8.
var iconv = new Iconv('windows-1251', 'utf-8')
// Convert `title` and render the converted result as a UTF-8 string.
title = iconv.convert(title).toString('utf-8')

Pang

9,344146 gold badges85 silver badges121 bronze badges

asked Jan 1, 2012 at 13:52

Here is a working solution to your problem. You have to use a Buffer and convert your string to binary first.

const Iconv = require('iconv').Iconv;

// Request the page with the 'binary' encoding so the body arrives as a string
// in which every character stands for exactly one raw byte, then transcode
// those bytes from Windows-1251 to UTF-8.
request({
    uri: website_url,
    method: 'GET',
    encoding: 'binary'
}, function (error, response, body) {
    if (error) {
        // Nothing to convert when the request itself failed.
        console.error(error);
        return;
    }

    // Rebuild the raw bytes from the binary string. A new name is required:
    // `body` is already declared as the callback parameter.
    const rawBytes = Buffer.from(body, 'binary');
    const conv = new Iconv('windows-1251', 'utf8');
    // `utf8Body` holds the page text as a regular JavaScript string.
    const utf8Body = conv.convert(rawBytes).toString();

});

Ahmet Şimşek

1,3111 gold badge14 silver badges23 bronze badges

answered Jan 29, 2012 at 0:20

Alex KolarskiAlex Kolarski

3,1851 gold badge25 silver badges35 bronze badges

If you’re reading from file, you could use something like that:

const iconv = require('iconv-lite');
const fs = require("fs");

// Read the file as raw bytes (a null encoding yields a Buffer), decode them
// from Windows-1251 into a string, re-encode as UTF-8 and write the result.
fs.readFile("filename.xml", null, (err, data) => {
    if (err) {
        console.log(err);
        return;
    }

    const encodedData = iconv.encode(iconv.decode(data, 'win1251'), 'utf8');
    fs.writeFile("result_filename.xml", encodedData, (writeErr) => {
        // Report a failed write instead of silently dropping it.
        if (writeErr) {
            console.log(writeErr);
        }
    });
});

answered Jul 14, 2021 at 18:27

I use Node version 16 and the code below works fine. You don’t need to use Buffer; Node will print warnings if you do. You need to install the iconv package first.

        // Declare `fs` explicitly; an undeclared assignment creates a global
        // and throws in strict mode / ES modules.
        const fs = require('fs');
        // No encoding is passed, so `data` arrives as a Buffer of raw bytes.
        fs.readFile('printed_document.txt', function (err, data) {
            if (err) {
                return console.log(err);
            }
            // Transcode the Windows-1251 bytes to UTF-8 and print the text.
            const converter = new (require('iconv').Iconv)('windows-1251', 'utf-8');
            console.log(converter.convert(data).toString());
        });

answered Oct 13, 2022 at 13:44

Orlov ConstOrlov Const

3123 silver badges10 bronze badges

Источник

The TextDecoder interface represents a decoder for a specific text encoding, such as UTF-8, ISO-8859-2, KOI8-R, GBK, etc. A decoder takes a stream of bytes as input and emits a stream of code points.

Note: This feature is available in Web Workers

Constructor

TextDecoder(): Returns a newly constructed TextDecoder that will generate a code point stream with the decoding method specified in parameters.

Instance properties

The TextDecoder interface doesn’t inherit any properties.

TextDecoder.encoding Read only: A string containing the name of the decoder, that is a string describing the method the TextDecoder will use.
TextDecoder.fatal Read only: A Boolean indicating whether the error mode is fatal.
TextDecoder.ignoreBOM Read only: A Boolean indicating whether the byte order mark is ignored.

Instance methods

The TextDecoder interface doesn’t inherit any methods.

TextDecoder.decode(): Returns a string containing the text decoded with the method of the specific TextDecoder object.

Examples

Representing text with typed arrays

This example shows how to decode a Chinese/Japanese character 𠮷, as represented by five different typed arrays: Uint8Array, Int8Array, Uint16Array, Int16Array, and Int32Array.

// A single UTF-8 decoder is reused for every typed-array view below.
const utf8decoder = new TextDecoder(); // default 'utf-8' or 'utf8'

// The same character expressed through five typed-array element types
// (the 16- and 32-bit variants line up with the 8-bit bytes on a
// little-endian platform).
const u8arr = Uint8Array.of(240, 160, 174, 183);
const i8arr = Int8Array.of(-16, -96, -82, -73);
const u16arr = Uint16Array.of(41200, 47022);
const i16arr = Int16Array.of(-24336, -18514);
const i32arr = Int32Array.of(-1213292304);

// Decode and print each view in turn.
for (const view of [u8arr, i8arr, u16arr, i16arr, i32arr]) {
  console.log(utf8decoder.decode(view));
}

Handling non-UTF8 text

In this example, we decode the Russian text «Привет, мир!», which means «Hello, world.» In our TextDecoder() constructor, we specify the Windows-1251 character encoding, which is appropriate for Cyrillic script.

// Windows-1251 bytes of the greeting, one byte per character.
const bytes = Uint8Array.of(
  207, 240, 232, 226, 229, 242, 44, 32, 236, 232, 240, 33
);
// Decoder configured for the Windows-1251 (Cyrillic) encoding.
const win1251decoder = new TextDecoder("windows-1251");
const greeting = win1251decoder.decode(bytes);
console.log(greeting); // Привет, мир!

Specifications

Specification
Encoding Standard # interface-textdecoder

Browser compatibility

BCD tables only load in the browser

iconv-lite: Pure JS character encoding conversion

No need for native code compilation. Quick to install, works on Windows, Web, and in sandboxed environments.
Used in popular projects like Express.js (body_parser),
Grunt, Nodemailer, Yeoman and others.
Faster than node-iconv (see below for performance comparison).
Intuitive encode/decode API, including Streaming support.
In-browser usage via browserify or webpack (~180kb gzip compressed with Buffer shim included).
Typescript type definition file included.
React Native is supported (need to install stream module to enable Streaming API).
License: MIT.

npm

Usage

Basic API

const iconv = require('iconv-lite');

// Convert from an encoded buffer to a js string.
// (Declared with const: a bare `str = ...` creates an implicit global and
// throws in strict mode / ES modules.)
const str = iconv.decode(Buffer.from([0x68, 0x65, 0x6c, 0x6c, 0x6f]), 'win1251');

// Convert from a js string to an encoded buffer.
const buf = iconv.encode("Sample input string", 'win1251');

// Check if encoding is supported
iconv.encodingExists("us-ascii");

Streaming API

// Decode stream (from binary data stream to js strings)
http.createServer((req, res) => {
    const converterStream = iconv.decodeStream('win1251');
    req.pipe(converterStream);

    // Do something with decoded strings, chunk-by-chunk.
    converterStream.on('data', (str) => {
        console.log(str);
    });
});

// Convert encoding streaming example:
// read win1251 bytes -> decode to strings -> encode as ucs2 -> write out.
const win1251Source = fs.createReadStream('file-in-win1251.txt');
const ucs2Target = fs.createWriteStream('file-in-ucs2.txt');
win1251Source
    .pipe(iconv.decodeStream('win1251'))
    .pipe(iconv.encodeStream('ucs2'))
    .pipe(ucs2Target);

// Sugar: all encode/decode streams have .collect(cb) method to accumulate data.
http.createServer((req, res) => {
    const decoded = req.pipe(iconv.decodeStream('win1251'));
    decoded.collect((err, body) => {
        assert(typeof body === 'string');
        console.log(body); // full request body string
    });
});

Supported encodings

All node.js native encodings: utf8, ucs2 / utf16-le, ascii, binary, base64, hex.
Additional unicode encodings: utf16, utf16-be, utf-7, utf-7-imap, utf32, utf32-le, and utf32-be.
All widespread singlebyte encodings: Windows 125x family, ISO-8859 family,
IBM/DOS codepages, Macintosh family, KOI8 family, all others supported by iconv library.
Aliases like ‘latin1’, ‘us-ascii’ also supported.
All widespread multibyte encodings: CP932, CP936, CP949, CP950, GB2312, GBK, GB18030, Big5, Shift_JIS, EUC-JP.

See all supported encodings on wiki.

Most singlebyte encodings are generated automatically from node-iconv. Thank you Ben Noordhuis and libiconv authors!

Multibyte encodings are generated from Unicode.org mappings and WHATWG Encoding Standard mappings. Thank you, respective authors!

Encoding/decoding speed

Comparison with node-iconv module (1000x256kb, on MacBook Pro, Core i5/2.6 GHz, Node v0.12.0).
Note: your results may vary, so please always check on your hardware.

operation             iconv@2.1.4   iconv-lite@0.4.7
----------------------------------------------------------
encode('win1251')     ~96 Mb/s      ~320 Mb/s
decode('win1251')     ~95 Mb/s      ~246 Mb/s

BOM handling

Decoding: BOM is stripped by default, unless overridden by passing stripBOM: false in options
(f.ex. iconv.decode(buf, enc, {stripBOM: false})).
A callback might also be given as a stripBOM parameter — it’ll be called if BOM character was actually found.
If you want to detect UTF-8 BOM when decoding other encodings, use node-autodetect-decoder-stream module.
Encoding: No BOM added, unless overridden by addBOM: true option.

UTF-16 Encodings

This library supports UTF-16LE, UTF-16BE and UTF-16 encodings. First two are straightforward, but UTF-16 is trying to be
smart about endianness in the following ways:

Decoding: uses BOM and ‘spaces heuristic’ to determine input endianness. Default is UTF-16LE, but can be
overridden with defaultEncoding: 'utf-16be' option. Strips BOM unless stripBOM: false.
Encoding: uses UTF-16LE and writes BOM by default. Use addBOM: false to override.

UTF-32 Encodings

This library supports UTF-32LE, UTF-32BE and UTF-32 encodings. Like the UTF-16 encoding above, UTF-32 defaults to UTF-32LE, but uses BOM and ‘spaces heuristics’ to determine input endianness.

The default of UTF-32LE can be overridden with the defaultEncoding: 'utf-32be' option. Strips BOM unless stripBOM: false.
Encoding: uses UTF-32LE and writes BOM by default. Use addBOM: false to override. (defaultEncoding: 'utf-32be' can also be used here to change encoding.)

Other notes

When decoding, be sure to supply a Buffer to decode() method, otherwise bad things usually happen.
Untranslatable characters are set to � or ?. No transliteration is currently supported.
Node versions 0.10.31 and 0.11.13 are buggy, don’t use them (see #65, #77).

Testing

$ git clone git@github.com:ashtuchkin/iconv-lite.git
$ cd iconv-lite
$ npm install
$ npm test
    
$ # To view performance:
$ node test/performance.js

$ # To view test coverage:
$ npm run coverage
$ open coverage/lcov-report/index.html

Источник

Ответ, отмеченный, как принятый — полный привет логике, и полное досвидание производительности.

Проблема решается неочевидно. Помимо кодировки исходника (исходные данные на источнике), многое зависит от того, каким инструментом забираете источник: node-fetch, request, axios, unirest,… В случае, если данные читаются из файла, там данное решение тоже пройдет, но… там отдельная история.

Суть проблемы в том, что Привет может прилететь и из заголовков (headers) ответа (response) и даже из содержимого ответа (в случае XML — обязательно). Я двое суток смотрел на буквы э на местах всех кириллических знаков, пока не расковырял исходники всех этих фетчей и реквестов, которые думать не думают о других кодировках, кроме utf8 и других форматах данных, кроме json, и то — JSON обязательно должен быть utf8, даже Unicode ему нельзя быть. Как в песенке про папу, который может быть кем угодно, но мамой не может быть. Хуже всего, если все-таки — думают, но полагают, что все решено.

Далее, важно в какой консоли вы смотрите ответы Ноды: Windows (XP, Vista, 7|8, 10 — ждут сюрпризы), xterm? У Вас LINUX! О! Как хорошо, что Вы не знаете, что такое KOI-8, а Ваши учителя даже про KOI-7. Относительно ровно предсказать вывод без танцев с большим шаманским бубном можно в консолях RHEL^7(CentOs^7, Fedora^17), Ubuntu^12, MacOs^X. С другими не знаком, либо неоднозначно.

Еще вопрос — удалённо если смотрите на терминал, то какой протокол, какой терминальный клиент? Допустим, что с терминалом и кодировками на терминале хорошо.

Ожидаемое решение можно получить только ручками через нативный http или node-fetch, и только тогда, когда входные данные обрабатываются как буфер.

Вот рабочий макет для песочницы. Просто поиграйтесь с вариантами (ответы функции cnw8), которых на просторах интернетов вагон. Почти все они — неправильные, работают только два: один здесь, а другой у Майкла Джексона.

const url = 'http://www.cbr.ru/scripts/XML_daily.asp';
// If the remote end also sends the header 'Content-Type: application/xml; charset=windows-1251',
// tools that take the initiative and process the data themselves are hopeless here,
// i.e. axios, unirest, fetch, node-fetch in text mode -- none of that WILL WORK

const fs = require('fs-extra'); // promisified, no-fuss output to disk
// I am simply too lazy to spell out a promise as well, and there is no point cluttering the example

// plain fetch, no fuss; the promises are native Node ones, i.e. V8
// body is a buffer that has to be processed by hand
// even better -- use the native http module
const fetch = require('node-fetch'); 

const iconv = require('iconv-lite'); // the converter itself

/**
 * Write the received data to a file as-is and pass it through unchanged.
 *
 * The write is started but not awaited, so the caller gets `buf` back
 * immediately. A failed write is logged instead of being left as an
 * unhandled promise rejection.
 *
 * @param {*} buf - data to save (the fetch chain passes the response buffer)
 * @param {string} [fileName='data.xml'] - destination path
 * @returns {*} the same `buf`, so a promise chain can continue with it
 */
function saveResult(buf, fileName = 'data.xml') {
  fs.outputFile(fileName, buf).catch((err) => console.error(err));
  return buf;
}
/**
 * Print the received value followed by a separator line, then return the
 * value unchanged so a promise chain is not broken.
 *
 * @param {*} res - any value to inspect
 * @returns {*} the same `res`
 */
function showResult(res) {
  console.log(res);
  // The separator is wrapped in real newlines; the backslashes of the "\n"
  // escapes had been lost, leaving literal "n" characters.
  console.log('\n============================================\n');
  return res;
}
/**
 * The actual conversion step: Windows-1251 data --> decoded text.
 *
 * @param {*} buf - Windows-1251 data; it is passed to Buffer.from with the
 *   'binary' encoding before decoding
 * @returns {string} the result of decoding as 'cp1251', converted to a string
 */
function cnw8(buf) {
  const rawBytes = Buffer.from(buf, 'binary');
  const decoded = iconv.decode(rawBytes, 'cp1251');
  return decoded.toString();
}

// Pipeline: fetch -> print the response object -> take the body as a buffer
// -> save it to disk -> print the raw buffer -> transcode -> print the text.
fetch(url)
  .then(res => showResult(res))
  .then(res => res.buffer())
  .then(res => saveResult(res))
  .then(res => showResult(res))
  .then(res => cnw8(res))
  .then(res => showResult(res))
  // The "\n" escapes are restored here; their backslashes had been lost,
  // which printed literal "n" characters around the message.
  .catch(err => console.error('\n--------------------\nБля!\n\n', err));

Источник

13 / 13 / 2

Регистрация: 22.04.2010

Сообщений: 637

15.09.2013, 20:29. Показов 30069. Ответов 5

Вместо русского «Яндекс» пишутся такие кракозябры «Ð¯Ð½Ð´ÐµÐºÑ»
как поменять кодировку, чтобы увидеть исходное значение???
ЗЫ вроде как эти кракозябры в Win1252 или еще чем-то.
Заранее спасибо.

__________________
Помощь в написании контрольных, курсовых и дипломных работ, диссертаций здесь

0 / 0 / 0

Регистрация: 28.02.2010

Сообщений: 23

15.09.2013, 20:43

Насколько я понял проблема с кодировкой в файле программы. Как вариант открыть через редактор нотепад++, скопировать весь код в буфер обмена, выставить нужную кодировку в разделе программы нотепад++ и вставить текст с буфера обмена.

13 / 13 / 2

Регистрация: 22.04.2010

Сообщений: 637

15.09.2013, 22:17

[ТС]

Сообщение от hao

Нет. Проблема в получаемых данных путем запроса, не отображаются русские символы через dump/alert

0 / 0 / 0

Регистрация: 28.02.2010

Сообщений: 23

15.09.2013, 22:21

Мда

831 / 639 / 100

Регистрация: 20.08.2013

Сообщений: 2,524

15.09.2013, 23:39

Сообщение от koc94ok

посмотри плз.

Посмотрел. А почему именно мне вопрос?

Сообщение от koc94ok

Вместо русского «Яндекс» пишутся такие кракозябры «Ð¯Ð½Ð´ÐµÐºÑ»

Не вместо, тут неправильное число символов.

Сообщение от koc94ok

как поменять кодировку, чтобы увидеть исходное значение???

Код JavaScript
// Each character of the garbled string carries one raw UTF-8 byte in its char code:
// percent-encode those bytes and let decodeURIComponent decode them as UTF-8.
decodeURIComponent("Ð¯Ð½Ð´ÐµÐº".split("").map(function(ch) { return "%"+ch.charCodeAt(0).toString(16); }).join("")) // "Яндек"

PS: Почему в теге с кодом нельзя нормально использовать юникодные символы??

Добавлено через 2 минуты
Более нормальных способов перекодирования в js не знаю.

Qwertiy

831 / 639 / 100

Регистрация: 20.08.2013

Сообщений: 2,524

20.09.2013, 01:10

Случайно наткнулся на такой код:

Javascript

/**
*
*  UTF-8 data encode / decode
*  http://www.webtoolkit.info/
*
*  Both methods work with "byte strings": strings in which every character
*  code is in the range 0-255 and stands for one UTF-8 byte.
*
**/

var Utf8 = {

    /**
     * Encode a JavaScript string into a UTF-8 byte string.
     * CRLF line endings are normalized to LF before encoding.
     *
     * @param {string} string - text to encode
     * @returns {string} byte string, one character (code 0-255) per UTF-8 byte
     */
    encode : function (string) {
        // The pattern must be /\r\n/: without the backslashes it would
        // rewrite every literal "rn" (e.g. "turn" -> "tun").
        var text = string.replace(/\r\n/g, "\n");
        var utftext = "";

        for (var n = 0; n < text.length; n++) {

            // codePointAt joins a valid surrogate pair into one code point;
            // a lone surrogate is returned as-is and encoded with 3 bytes.
            var c = text.codePointAt(n);

            if (c < 128) {
                utftext += String.fromCharCode(c);
            }
            else if (c < 2048) {
                utftext += String.fromCharCode((c >> 6) | 192);
                utftext += String.fromCharCode((c & 63) | 128);
            }
            else if (c < 65536) {
                utftext += String.fromCharCode((c >> 12) | 224);
                utftext += String.fromCharCode(((c >> 6) & 63) | 128);
                utftext += String.fromCharCode((c & 63) | 128);
            }
            else {
                // Code points above U+FFFF take four bytes.
                utftext += String.fromCharCode((c >> 18) | 240);
                utftext += String.fromCharCode(((c >> 12) & 63) | 128);
                utftext += String.fromCharCode(((c >> 6) & 63) | 128);
                utftext += String.fromCharCode((c & 63) | 128);
                n++; // the low surrogate was consumed together with the high one
            }

        }

        return utftext;
    },

    /**
     * Decode a UTF-8 byte string back into a JavaScript string.
     *
     * @param {string} utftext - byte string, one character per UTF-8 byte
     * @returns {string} decoded text
     */
    decode : function (utftext) {
        var string = "";
        var i = 0;
        // All working variables are declared locally so nothing leaks into
        // the global scope (and nothing throws in strict mode).
        var c = 0, c2 = 0, c3 = 0, c4 = 0, codePoint = 0;

        while ( i < utftext.length ) {

            c = utftext.charCodeAt(i);

            if (c < 128) {
                string += String.fromCharCode(c);
                i++;
            }
            else if((c > 191) && (c < 224)) {
                c2 = utftext.charCodeAt(i+1);
                string += String.fromCharCode(((c & 31) << 6) | (c2 & 63));
                i += 2;
            }
            else if((c > 239) && (c < 248)) {
                // Four-byte sequence: a code point above U+FFFF.
                c2 = utftext.charCodeAt(i+1);
                c3 = utftext.charCodeAt(i+2);
                c4 = utftext.charCodeAt(i+3);
                codePoint = ((c & 7) << 18) | ((c2 & 63) << 12) | ((c3 & 63) << 6) | (c4 & 63);
                // String.fromCodePoint throws above U+10FFFF; substitute U+FFFD instead.
                string += codePoint > 0x10FFFF ? "\uFFFD" : String.fromCodePoint(codePoint);
                i += 4;
            }
            else {
                c2 = utftext.charCodeAt(i+1);
                c3 = utftext.charCodeAt(i+2);
                string += String.fromCharCode(((c & 15) << 12) | ((c2 & 63) << 6) | (c3 & 63));
                i += 3;
            }

        }

        return string;
    }

}

Добавлено через 53 секунды

Сообщение от koc94ok

ЗЫ вроде как эти кракозябры в Win1252 или еще чем-то.

Нет, это просто байты, записанные в символах…

Источник

Constructor

Instance properties

Instance methods

Examples

Representing text with typed arrays

Handling non-UTF8 text

Specifications

Browser compatibility

See also

iconv-lite: Pure JS character encoding conversion

Usage

Basic API

Streaming API

Supported encodings

Encoding/decoding speed

BOM handling

UTF-16 Encodings

UTF-32 Encodings

Other notes

Testing

Вот еще несколько интересных статей: