Skip to content

Commit

Permalink
Extended text encoding to many iso-8859-x encodings (#201 to fix #205)
Browse files Browse the repository at this point in the history
  • Loading branch information
wingman-jr-addon authored Jul 8, 2024
1 parent 3d3f308 commit 0b7cbdb
Show file tree
Hide file tree
Showing 38 changed files with 120 additions and 80 deletions.
130 changes: 50 additions & 80 deletions background.js
Original file line number Diff line number Diff line change
Expand Up @@ -730,7 +730,7 @@ function bkDetectCharsetAndSetupDecoderEncoder(details) {
WJR_DEBUG && console.debug('CHARSET: No detected charset, but content type was application/xhtml+xml so using UTF-8');
} else {
decodingCharset = undefined;
WJR_DEBUG && console.debug('CHARSET: No detected charset, moving ahead with UTF-8 until decoding error encountered!');
WJR_DEBUG && console.debug('CHARSET: No detected charset, moving ahead with UTF-8 until sniff finds an encoding or decoding error encountered!');
}

let decoder = new TextDecoderWithSniffing(decodingCharset);
Expand All @@ -750,21 +750,26 @@ function bkConcatBuffersToUint8Array(buffers) {
return result;
}

/*
* UTF-8 test functions.
* If you change one, check to make sure another doesn't need to change.
*/
/**
 * Tell whether a charset label names UTF-8.
 *
 * The pattern accepts all 6 UTF-8 labels listed at
 * https://encoding.spec.whatwg.org/#names-and-labels
 *
 * @param {string} declType charset label to examine; non-string values are
 *     coerced to a string by RegExp.prototype.test.
 * @returns {boolean} true when the label contains "utf", at most one
 *     arbitrary character, then "8" (case-insensitive).
 */
function bkIsUtf8Alias(declType) {
    // A fresh literal per call, so the stateful lastIndex of the /g flag
    // never carries over between invocations.
    const utf8LabelPattern = /.*utf.?8/gmi;
    const looksLikeUtf8 = utf8LabelPattern.test(declType);
    return looksLikeUtf8;
}

function bkDoesSniffStringIndicateUtf8(sniffString) {
return (
/<\?xml\sversion="1\.0"\s+encoding="utf-8"\?>/gm.test(sniffString)
|| /<meta[^>]+[^<]*utf.?8/igm.test(sniffString));
/**
 * Extract a declared character encoding from the start of a document.
 *
 * Looks first for an XML declaration (`<?xml ... encoding="..."?>`) and then
 * for a `<meta>` tag carrying a `charset=` value, either as its own attribute
 * or inside a `content="text/html; charset=..."` attribute.
 *
 * The label may be double-quoted, single-quoted, or unquoted. Only the label
 * itself is captured, so an unquoted `<meta charset=utf-8>` no longer swallows
 * the following markup up to the next double quote.
 *
 * @param {string} sniffString leading portion of the document, already decoded
 *     to a string.
 * @returns {?string} the encoding label exactly as written in the document
 *     (case preserved), or null when no declaration is found.
 */
function bkSniffExtractEncoding(sniffString) {
    try {
        // Any version string and either quote style are accepted; attributes
        // that follow the encoding (e.g. standalone="yes") are tolerated.
        const xmlParts = /<\?xml\s+version\s*=\s*["'][^"']*["']\s+encoding\s*=\s*["']([^"']+)["']/i.exec(sniffString);
        if(xmlParts) {
            return xmlParts[1];
        }
        // The label ends at whitespace, a quote, ';', '/' or '>' so that
        // quoted, unquoted and self-closing forms all yield just the label.
        const metaParts = /<meta\b[^>]*?\bcharset\s*=\s*["']?\s*([^\s"'>;\/]+)/i.exec(sniffString);
        if(metaParts) {
            return metaParts[1];
        }
    } catch (ex) {
        // Sniffing is best-effort: report the problem and act as if nothing was declared.
        console.error('CHARSET: Sniff extraction exception: '+ex);
    }
    return null;
}
/* End UTF-8 test function */

function TextDecoderWithSniffing(declType)
{
Expand All @@ -783,7 +788,7 @@ function TextDecoderWithSniffing(declType)
if(self.sniffCount == 0 && buffer.byteLength >= 3) {
let bom = new Uint8Array(buffer, 0, 3);
if(bom[0] == 0xEF && bom[1] == 0xBB && bom[2] == 0xBF) {
WJR_DEBUG && console.debug('CHARSET: Sniff found utf-8 BOM');
WJR_DEBUG && console.log('CHARSET: Sniff found utf-8 BOM');
self.currentType = 'utf-8';
}
}
Expand All @@ -797,20 +802,25 @@ function TextDecoderWithSniffing(declType)
self.sniffBufferList = null;
let tmpDecoder = new TextDecoder('iso-8859-1');
let sniffString = tmpDecoder.decode(fullSniffBuffer);
if(sniffString.length > 512) {
sniffString = sniffString.substring(0, 512);
}
WJR_DEBUG && console.debug('CHARSET: Sniff string constructed: '+sniffString);
if(bkDoesSniffStringIndicateUtf8(sniffString)) {
WJR_DEBUG && console.debug('CHARSET: Sniff found decoding of utf-8 by examining header');
self.currentType = 'utf-8';
let extractedEncoding = bkSniffExtractEncoding(sniffString);
if(extractedEncoding) {
WJR_DEBUG && console.log('CHARSET: Sniff found decoding of '+extractedEncoding+' by examining header, changing decoder');
self.currentType = extractedEncoding.toLowerCase();
self.decoder = new TextDecoder(self.currentType);
} else {
WJR_DEBUG && console.debug('CHARSET: Sniff string did not indicate UTF-8');
WJR_DEBUG && console.log('CHARSET: Sniff string did not indicate encoding');
}
}
}
}
WJR_DEBUG && console.debug('CHARSET: Sniffing decoding of utf-8');
WJR_DEBUG && console.debug('CHARSET: Sniff received a chunk, current decoding type '+self.currentType);
return self.decoder.decode(buffer, options);
} catch {
WJR_DEBUG && console.warn('CHARSET: Falling back from '+self.currentType+' to iso-8859-1');
} catch (ex) {
WJR_DEBUG && console.warn('CHARSET: Falling back from '+self.currentType+' to iso-8859-1 (Exception: '+ex+')');
self.decoder = new TextDecoder('iso-8859-1');
self.currentType = 'iso-8859-1';
return self.decoder.decode(buffer, options);
Expand All @@ -825,76 +835,36 @@ function TextDecoderWithSniffing(declType)
function TextEncoderWithSniffing(decoder) {
let self = this;
self.utf8Encoder = new TextEncoder();
self.iso_8859_1_Encoder = new TextEncoderISO_8859_1();
self.linkedDecoder = decoder;

self.encode = function(str) {
WJR_DEBUG && console.debug('CHARSET: Encoding with decoder current type '+self.linkedDecoder.currentType);
if(self.linkedDecoder.currentType === undefined) {
WJR_DEBUG && console.debug('CHARSET: Effective encoding iso-8859-1');
return self.iso_8859_1_Encoder.encode(str);
} else if(bkIsUtf8Alias(self.linkedDecoder.currentType)) {
WJR_DEBUG && console.debug('CHARSET: Effective encoding utf-8');

if(bkIsUtf8Alias(self.linkedDecoder.currentType)) {
WJR_DEBUG && console.debug('CHARSET: Encoding utf-8');
return self.utf8Encoder.encode(str);
} else {
WJR_DEBUG && console.debug('CHARSET: Effective encoding iso-8859-1');
return self.iso_8859_1_Encoder.encode(str);
}
}
}

//https://www.i18nqa.com/debug/table-iso8859-1-vs-windows-1252.html
BK_Windows1252_special_chars =
{
0x20AC : 0x80,
0x201A : 0x82,
0x192 : 0x83,
0x201E : 0x84,
0x2026 : 0x85,
0x2020 : 0x86,
0x2021 : 0x87,
0x2C6 : 0x88,
0x2030 : 0x89,
0x160 : 0x8A,
0x2039 : 0x8B,
0x152 : 0x8C,
0x17D : 0x8E,
0x2018 : 0x91,
0x2019 : 0x92,
0x201C : 0x93,
0x201D : 0x94,
0x2022 : 0x95,
0x2013 : 0x96,
0x2014 : 0x97,
0x2DC : 0x98,
0x2122 : 0x99,
0x161 : 0x9A,
0x203A : 0x9B,
0x153 : 0x9C,
0x17E : 0x9E,
0x178 : 0x9F
}

function TextEncoderISO_8859_1()
{
this.encode = function(str) {
var result = new Uint8Array(str.length);
for(let i=0; i<str.length; i++) {
let charCode = str.charCodeAt(i);

//So this is subtle. ISO-8859-1 is actually interpreted as Windows-1252 on decoding by browsers.
//When the TextDecoder got instantiated with 'iso-8859-1', it actually used Windows-1252.
//This means that for the characters in the (decimal) range 128-159, the Unicode point is not
//in the normal 0-255 range and we need to detect these characters specially to back-convert into
//Windows-1252 raw encoding masquerading as ISO-8859-1.
charCode = BK_Windows1252_special_chars[charCode] || charCode;

if(charCode > 255) {
WJR_DEBUG && console.log(`CHARSET: Warning - likely mistranslation of character ${str[i]}`);
charCode = 255;
console.log('CHARSET: Test '+TEXT_ENCODINGS[self.linkedDecoder.currentType]);
let effectiveEncoding = TEXT_ENCODINGS[self.linkedDecoder.currentType] ?? TEXT_ENCODINGS['iso-8859-1'];
WJR_DEBUG && console.debug('CHARSET: Effective encoding ' + effectiveEncoding.name);
let outputRaw = [];
let untranslatableCount = 0;
for(const codePoint of str) {
let initialCodePoint = codePoint.codePointAt(0);
let bytes = effectiveEncoding.codePointsToBytes[initialCodePoint];
if(bytes !== undefined) {
for(let i=0; i<bytes.length; i++) {
outputRaw.push(bytes[i]);
}
} else {
if(untranslatableCount == 0) {
console.warn('CHARSET: untranslatable code point '+initialCodePoint+' found while charset='+self.linkedDecoder.currentType);
}
untranslatableCount++;
}
result[i] = charCode;
}
let result = new Uint8Array(outputRaw);
WJR_DEBUG && console.log('CHARSET: re-encoded '+result.length+' bytes ('+untranslatableCount+' untranslated code points) with effective encoding '+ effectiveEncoding.name);
return result;
}
}
Expand Down
46 changes: 46 additions & 0 deletions encoders.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
/* Works in conjunction with the auto-generated encoders_data.js.
   The optimal data structures in those objects cannot be generated
   directly due to limitations of JSON int key maps, so the first
   thing to do is repack them quickly. */

/**
 * Repack one raw encoding record in place.
 *
 * Builds `encoding.codePointsToBytes` (code point -> byte sequence) from the
 * parallel `codePoints` / `bytesForCodePoints` arrays, then releases those
 * arrays by setting them to null.
 *
 * @param {Object} encoding raw record from TEXT_ENCODINGS_RAW.
 * @returns {boolean} true when the record now carries a usable
 *     `codePointsToBytes` map, false when it was rejected (and logged).
 */
function bkRepackEncoding(encoding) {
    if(encoding == null) {
        return false;
    }
    if(encoding.dotnet_name == null) {
        console.warn('CHARSET: Encoding had no correspondence: '+encoding.name);
        return false;
    }
    if(encoding.codePoints == null || encoding.bytesForCodePoints == null) {
        console.error('CHARSET: Encoding failed to provide a map '+encoding.name);
        return false;
    }
    if(encoding.codePoints.length != encoding.bytesForCodePoints.length) {
        console.error('CHARSET: Encoding map codepoint/byte count mismatch: '+encoding.name);
        return false;
    }
    const codePointsToBytes = {};
    for(let i=0; i<encoding.codePoints.length; i++) {
        codePointsToBytes[encoding.codePoints[i]] = encoding.bytesForCodePoints[i];
    }
    encoding.codePointsToBytes = codePointsToBytes;
    //Clean up large data arrays present in map
    encoding.codePoints = null;
    encoding.bytesForCodePoints = null;
    return true;
}

/**
 * Repack every raw encoding and index the usable ones by alias.
 *
 * Note that the encoding *name* is deliberately NOT used as a key, because
 * there are instances where this would create a problem: browsers map what
 * the served page calls "iso-8859-1" to "windows-1252" for historical reasons.
 *
 * Encodings that could not be repacked are left out of the map. Registering
 * them would hand callers a record without `codePointsToBytes`; leaving them
 * out lets a lookup miss be handled by the caller's own fallback instead.
 *
 * @param {Object[]} rawEncodings records in the shape of TEXT_ENCODINGS_RAW.
 * @returns {{byAlias: Object, aliasCount: number}} alias -> encoding map and
 *     the number of alias entries written into it.
 */
function bkBuildTextEncodings(rawEncodings) {
    const byAlias = {};
    let aliasCount = 0;
    for(const encoding of rawEncodings) {
        if(!bkRepackEncoding(encoding)) {
            continue;
        }
        console.log('CHARSET: Repacked encoding '+encoding.name + ' with aliases '+encoding.aliases);
        // A record without an alias list simply contributes no keys.
        for(const alias of encoding.aliases ?? []) {
            byAlias[alias] = encoding;
            aliasCount++;
        }
    }
    return { byAlias, aliasCount };
}

if(typeof TEXT_ENCODINGS_RAW === 'undefined') {
    console.error('CHARSET: TEXT_ENCODINGS_RAW is missing; encoders_data.js must be loaded before encoders.js');
}
const bkRawTextEncodings =
    (typeof TEXT_ENCODINGS_RAW !== 'undefined' && Array.isArray(TEXT_ENCODINGS_RAW)) ? TEXT_ENCODINGS_RAW : [];
console.log('CHARSET: Repacking encoder data for '+bkRawTextEncodings.length+' encodings');
const bkBuiltTextEncodings = bkBuildTextEncodings(bkRawTextEncodings);

/* Alias -> encoding lookup table and the number of alias entries in it. */
let TEXT_ENCODINGS = bkBuiltTextEncodings.byAlias;
let TEXT_ENCODINGS_COUNT = bkBuiltTextEncodings.aliasCount;

console.log('CHARSET: Repacked '+TEXT_ENCODINGS_COUNT+' encodings.');
1 change: 1 addition & 0 deletions encoders_data.js

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions manifest.json
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@
"mp4.js",
"ebml.js",
"status.js",
"encoders_data.js",
"encoders.js",
"background_gif.js",
"background.js",
"background_video.js",
Expand Down
21 changes: 21 additions & 0 deletions tests/launch_server.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
import http.server
import socketserver

class CustomHTTPRequestHandler(http.server.SimpleHTTPRequestHandler):
    """Static-file handler that never emits a Content-Type header.

    Every Content-Type header is discarded in send_header(), including the
    one SimpleHTTPRequestHandler would normally add, so responses carry no
    declared media type or charset.
    NOTE(review): presumably this forces the client to determine the
    encoding from the document itself - confirm against the test plan.
    """

    def send_header(self, keyword, value):
        """Forward every header except Content-Type to the base class.

        keyword -- header name, compared case-insensitively.
        value   -- header value, passed through unchanged.
        """
        # Only send headers that are not 'Content-Type'
        if keyword.lower() != 'content-type':
            super().send_header(keyword, value)


if __name__ == "__main__":
    # Serve the current working directory on all interfaces at PORT.
    PORT = 8000
    handler = CustomHTTPRequestHandler
    # Let the port be rebound immediately after a restart instead of failing
    # with "Address already in use" while the old socket lingers.
    socketserver.TCPServer.allow_reuse_address = True
    # The context manager closes the listening socket on every exit path.
    with socketserver.TCPServer(("", PORT), handler) as httpd:
        print(f"Serving at port {PORT}")
        try:
            httpd.serve_forever()
        except KeyboardInterrupt:
            # Ctrl+C is the expected way to stop the test server.
            print("Shutting down")
Binary file added tests/verify_encodings/big5.html
Binary file not shown.
Binary file added tests/verify_encodings/euc-jp.html
Binary file not shown.
Binary file added tests/verify_encodings/euc-kr.html
Binary file not shown.
Binary file added tests/verify_encodings/gbk.html
Binary file not shown.
Binary file added tests/verify_encodings/ibm866.html
Binary file not shown.
Binary file added tests/verify_encodings/index.html
Binary file not shown.
Binary file added tests/verify_encodings/iso-2022-jp.html
Binary file not shown.
Binary file added tests/verify_encodings/iso-8859-13.html
Binary file not shown.
Binary file added tests/verify_encodings/iso-8859-15.html
Binary file not shown.
Binary file added tests/verify_encodings/iso-8859-2.html
Binary file not shown.
Binary file added tests/verify_encodings/iso-8859-3.html
Binary file not shown.
Binary file added tests/verify_encodings/iso-8859-4.html
Binary file not shown.
Binary file added tests/verify_encodings/iso-8859-5.html
Binary file not shown.
Binary file added tests/verify_encodings/iso-8859-6.html
Binary file not shown.
Binary file added tests/verify_encodings/iso-8859-7.html
Binary file not shown.
Binary file added tests/verify_encodings/iso-8859-8.html
Binary file not shown.
Binary file added tests/verify_encodings/koi8-r.html
Binary file not shown.
Binary file added tests/verify_encodings/koi8-u.html
Binary file not shown.
Binary file added tests/verify_encodings/macintosh.html
Binary file not shown.
Binary file added tests/verify_encodings/shift-jis.html
Binary file not shown.
Binary file added tests/verify_encodings/utf-16be.html
Binary file not shown.
Binary file added tests/verify_encodings/utf-16le.html
Binary file not shown.
Binary file added tests/verify_encodings/windows-1250.html
Binary file not shown.
Binary file added tests/verify_encodings/windows-1251.html
Binary file not shown.
Binary file added tests/verify_encodings/windows-1252.html
Binary file not shown.
Binary file added tests/verify_encodings/windows-1253.html
Binary file not shown.
Binary file added tests/verify_encodings/windows-1254.html
Binary file not shown.
Binary file added tests/verify_encodings/windows-1255.html
Binary file not shown.
Binary file added tests/verify_encodings/windows-1256.html
Binary file not shown.
Binary file added tests/verify_encodings/windows-1257.html
Binary file not shown.
Binary file added tests/verify_encodings/windows-1258.html
Binary file not shown.
Binary file added tests/verify_encodings/windows-874.html
Binary file not shown.
Binary file added tests/verify_encodings/x-mac-cyrillic.html
Binary file not shown.

0 comments on commit 0b7cbdb

Please sign in to comment.