Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
354 changes: 344 additions & 10 deletions src/mhtml2html.js
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,302 @@ function convertAssetToDataURI(asset) {
}
}

const utf_8Decodeer = new TextDecoder('utf-8');
const ibm866Decoder = new TextDecoder('ibm866');
const iso_8859_2Decoder = new TextDecoder('iso-8859-2');
const iso_8859_3Decoder = new TextDecoder('iso-8859-3');
const iso_8859_4Decoder = new TextDecoder('iso-8859-4');
const iso_8859_5Decoder = new TextDecoder('iso-8859-5');
const iso_8859_6Decoder = new TextDecoder('iso-8859-6');
const iso_8859_7Decoder = new TextDecoder('iso-8859-7');
const iso_8859_8Decoder = new TextDecoder('iso-8859-8');
const iso_8859_8_iDecoder = new TextDecoder('iso-8859-8-i');
const iso_8859_10Decoder = new TextDecoder('iso-8859-10');
const iso_8859_13Decoder = new TextDecoder('iso-8859-13');
const iso_8859_14Decoder = new TextDecoder('iso-8859-14');
const iso_8859_15Decoder = new TextDecoder('iso-8859-15');
const koi8_rDecoder = new TextDecoder('koi8-r');
const koi8_uDecoder = new TextDecoder('koi8-u');
const macintoshDecoder = new TextDecoder('macintosh');
const windows_874Decoder = new TextDecoder('windows-874');
const windows_1250Decoder = new TextDecoder('windows-1250');
const windows_1251Decoder = new TextDecoder('windows-1251');
const windows_1252Decoder = new TextDecoder('windows-1252');
const windows_1253Decoder = new TextDecoder('windows-1253');
const windows_1254Decoder = new TextDecoder('windows-1254');
const windows_1255Decoder = new TextDecoder('windows-1255');
const windows_1256Decoder = new TextDecoder('windows-1256');
const windows_1257Decoder = new TextDecoder('windows-1257');
const windows_1258Decoder = new TextDecoder('windows-1258');
const x_mac_cyrillicDecoder = new TextDecoder('x-mac-cyrillic');
const gbkDecoder = new TextDecoder('gbk');
const gb18030Decoder = new TextDecoder('gb18030');
const big5Decoder = new TextDecoder('big5');
const euc_jpDecoder = new TextDecoder('euc-jp');
const iso_2022_jpDecoder = new TextDecoder('iso-2022-jp');
const shift_jisDecoder = new TextDecoder('shift_jis');
const euc_krDecoder = new TextDecoder('euc-kr');

const decodeMap = {
'UTF-8': utf_8Decodeer,
'IBM866': ibm866Decoder,
'866': ibm866Decoder,
'CP866': ibm866Decoder,
'CSIBM866': ibm866Decoder,
'ISO-8859-2': iso_8859_2Decoder,
'CSISOLATIN2': iso_8859_2Decoder,
'ISO-IR-101': iso_8859_2Decoder,
'ISO8859-2': iso_8859_2Decoder,
'ISO88592': iso_8859_2Decoder,
'ISO_8859-2': iso_8859_2Decoder,
'ISO_8859-2:1987': iso_8859_2Decoder,
'L2': iso_8859_2Decoder,
'LATIN2': iso_8859_2Decoder,
'ISO-8859-3': iso_8859_3Decoder,
'CSISOLATIN3': iso_8859_3Decoder,
'ISO-IR-109': iso_8859_3Decoder,
'ISO8859-3': iso_8859_3Decoder,
'ISO88593': iso_8859_3Decoder,
'ISO_8859-3': iso_8859_3Decoder,
'ISO_8859-3:1988': iso_8859_3Decoder,
'L3': iso_8859_3Decoder,
'LATIN3': iso_8859_3Decoder,
'ISO-8859-4': iso_8859_4Decoder,
'CSISOLATIN4': iso_8859_4Decoder,
'ISO-IR-110': iso_8859_4Decoder,
'ISO8859-4': iso_8859_4Decoder,
'ISO88594': iso_8859_4Decoder,
'ISO_8859-4': iso_8859_4Decoder,
'ISO_8859-4:1988': iso_8859_4Decoder,
'L4': iso_8859_4Decoder,
'LATIN4': iso_8859_4Decoder,
'ISO-8859-5': iso_8859_5Decoder,
'CSISOLATINCYRILLIC': iso_8859_5Decoder,
'CYRILLIC': iso_8859_5Decoder,
'ISO-IR-144': iso_8859_5Decoder,
'ISO8859-5': iso_8859_5Decoder,
'ISO88595': iso_8859_5Decoder,
'ISO_8859-5': iso_8859_5Decoder,
'ISO_8859-5:1988': iso_8859_5Decoder,
'ISO-8859-6': iso_8859_6Decoder,
'ARABIC': iso_8859_6Decoder,
'ASMO-708': iso_8859_6Decoder,
'CSISO88596E': iso_8859_6Decoder,
'CSISO88596I': iso_8859_6Decoder,
'CSISOLATINARABIC': iso_8859_6Decoder,
'ECMA-114': iso_8859_6Decoder,
'ISO-8859-6-E': iso_8859_6Decoder,
'ISO-8859-6-I': iso_8859_6Decoder,
'ISO-IR-127': iso_8859_6Decoder,
'ISO8859-6': iso_8859_6Decoder,
'ISO88596': iso_8859_6Decoder,
'ISO_8859-6': iso_8859_6Decoder,
'ISO_8859-6:1987': iso_8859_6Decoder,
'ISO-8859-7': iso_8859_7Decoder,
'CSISOLATINGREEK': iso_8859_7Decoder,
'ECMA-118': iso_8859_7Decoder,
'ELOT_928': iso_8859_7Decoder,
'GREEK': iso_8859_7Decoder,
'GREEK8': iso_8859_7Decoder,
'ISO-IR-126': iso_8859_7Decoder,
'ISO8859-7': iso_8859_7Decoder,
'ISO88597': iso_8859_7Decoder,
'ISO_8859-7': iso_8859_7Decoder,
'ISO_8859-7:1987': iso_8859_7Decoder,
'SUN_EU_GREEK': iso_8859_7Decoder,
'ISO-8859-8': iso_8859_8Decoder,
'CSISO88598E': iso_8859_8Decoder,
'CSISOLATINHEBREW': iso_8859_8Decoder,
'HEBREW': iso_8859_8Decoder,
'ISO-8859-8-E': iso_8859_8Decoder,
'ISO-IR-138': iso_8859_8Decoder,
'ISO8859-8': iso_8859_8Decoder,
'ISO88598': iso_8859_8Decoder,
'ISO_8859-8': iso_8859_8Decoder,
'ISO_8859-8:1988': iso_8859_8Decoder,
'VISUAL': iso_8859_8Decoder,
'ISO-8859-8-I': iso_8859_8_iDecoder,
'CSISO88598I': iso_8859_8_iDecoder,
'LOGICAL': iso_8859_8_iDecoder,
'ISO-8859-10': iso_8859_10Decoder,
'CSISOLATIN6': iso_8859_10Decoder,
'ISO-IR-157': iso_8859_10Decoder,
'ISO8859-10': iso_8859_10Decoder,
'ISO885910': iso_8859_10Decoder,
'L6': iso_8859_10Decoder,
'LATIN6': iso_8859_10Decoder,
'ISO-8859-13': iso_8859_13Decoder,
'ISO8859-13': iso_8859_13Decoder,
'ISO885913': iso_8859_13Decoder,
'ISO-8859-14': iso_8859_14Decoder,
'ISO8859-14': iso_8859_14Decoder,
'ISO885914': iso_8859_14Decoder,
'ISO-8859-15': iso_8859_15Decoder,
'CSISOLATIN9': iso_8859_15Decoder,
'ISO8859-15': iso_8859_15Decoder,
'ISO885915': iso_8859_15Decoder,
'ISO_8859-15': iso_8859_15Decoder,
'L9': iso_8859_15Decoder,
'KOI8-R': koi8_rDecoder,
'CSKOI8R': koi8_rDecoder,
'KOI': koi8_rDecoder,
'KOI8': koi8_rDecoder,
'KOI8_R': koi8_rDecoder,
'KOI8-U': koi8_uDecoder,
'KOI8-RU': koi8_uDecoder,
'MACINTOSH': macintoshDecoder,
'CSMACINTOSH': macintoshDecoder,
'MAC': macintoshDecoder,
'X-MAC-ROMAN': macintoshDecoder,
'WINDOWS-874': windows_874Decoder,
'DOS-874': windows_874Decoder,
'ISO-8859-11': windows_874Decoder,
'ISO8859-11': windows_874Decoder,
'ISO885911': windows_874Decoder,
'TIS-620': windows_874Decoder,
'WINDOWS-1250': windows_1250Decoder,
'CP1250': windows_1250Decoder,
'X-CP1250': windows_1250Decoder,
'WINDOWS-1251': windows_1251Decoder,
'CP1251': windows_1251Decoder,
'X-CP1251': windows_1251Decoder,
'WINDOWS-1252': windows_1252Decoder,
'ANSI_X3.4-1968': windows_1252Decoder,
'ASCII': windows_1252Decoder,
'CP1252': windows_1252Decoder,
'CP819': windows_1252Decoder,
'CSISOLATIN1': windows_1252Decoder,
'IBM819': windows_1252Decoder,
'ISO-8859-1': windows_1252Decoder,
'ISO-IR-100': windows_1252Decoder,
'ISO8859-1': windows_1252Decoder,
'ISO88591': windows_1252Decoder,
'ISO_8859-1': windows_1252Decoder,
'ISO_8859-1:1987': windows_1252Decoder,
'L1': windows_1252Decoder,
'LATIN1': windows_1252Decoder,
'US-ASCII': windows_1252Decoder,
'X-CP1252': windows_1252Decoder,
'WINDOWS-1253': windows_1253Decoder,
'CP1253': windows_1253Decoder,
'X-CP1253': windows_1253Decoder,
'WINDOWS-1254': windows_1254Decoder,
'CP1254': windows_1254Decoder,
'CSISOLATIN5': windows_1254Decoder,
'ISO-8859-9': windows_1254Decoder,
'ISO-IR-148': windows_1254Decoder,
'ISO8859-9': windows_1254Decoder,
'ISO88599': windows_1254Decoder,
'ISO_8859-9': windows_1254Decoder,
'ISO_8859-9:1989': windows_1254Decoder,
'L5': windows_1254Decoder,
'LATIN5': windows_1254Decoder,
'X-CP1254': windows_1254Decoder,
'WINDOWS-1255': windows_1255Decoder,
'CP1255': windows_1255Decoder,
'X-CP1255': windows_1255Decoder,
'WINDOWS-1256': windows_1256Decoder,
'CP1256': windows_1256Decoder,
'X-CP1256': windows_1256Decoder,
'WINDOWS-1257': windows_1257Decoder,
'CP1257': windows_1257Decoder,
'X-CP1257': windows_1257Decoder,
'WINDOWS-1258': windows_1258Decoder,
'CP1258': windows_1258Decoder,
'X-CP1258': windows_1258Decoder,
'X-MAC-CYRILLIC': x_mac_cyrillicDecoder,
'X-MAC-UKRAINIAN': x_mac_cyrillicDecoder,
'GBK': gbkDecoder,
'CHINESE': gbkDecoder,
'CSGB2312': gbkDecoder,
'CSISO58GB231280': gbkDecoder,
'GB2312': gbkDecoder,
'GB_2312': gbkDecoder,
'GB_2312-80': gbkDecoder,
'ISO-IR-58': gbkDecoder,
'X-GBK': gbkDecoder,
'GB18030': gb18030Decoder,
'BIG5': big5Decoder,
'BIG5-HKSCS': big5Decoder,
'CN-BIG5': big5Decoder,
'CSBIG5': big5Decoder,
'X-X-BIG5': big5Decoder,
'EUC-JP': euc_jpDecoder,
'CSEUCPKDFMTJAPANESE': euc_jpDecoder,
'X-EUC-JP': euc_jpDecoder,
'ISO-2022-JP': iso_2022_jpDecoder,
'CSISO2022JP': iso_2022_jpDecoder,
'SHIFT_JIS': shift_jisDecoder,
'CSSHIFTJIS': shift_jisDecoder,
'MS932': shift_jisDecoder,
'MS_KANJI': shift_jisDecoder,
'SHIFT-JIS': shift_jisDecoder,
'SJIS': shift_jisDecoder,
'WINDOWS-31J': shift_jisDecoder,
'X-SJIS': shift_jisDecoder,
'EUC-KR': euc_krDecoder,
'CSEUCKR': euc_krDecoder,
'CSKSC56011987': euc_krDecoder,
'ISO-IR-149': euc_krDecoder,
'KOREAN': euc_krDecoder,
'KS_C_5601-1987': euc_krDecoder,
'KS_C_5601-1989': euc_krDecoder,
'KSC5601': euc_krDecoder,
'KSC_5601': euc_krDecoder,
'WINDOWS-949': euc_krDecoder,
}

/**
* Strings like css will be saved as quoted-printable.
* This function decodes quoted-printable with support for gbk encoding.
* @license MIT
* - edited from `quoted-printable` by Mathias Bynens
* @see {@link https://github.com/mathiasbynens/quoted-printable/blob/master/src/quoted-printable.js}
*/
function decodeQuotedPrintable(input, enc = 'utf-8') {
const oldEncoding = enc.toUpperCase();

let decoder = decodeMap[oldEncoding];
if (!decoder) {
console.error("unknown encoding", enc);
decoder = utf_8Decodeer;
}

return (
input
// https://tools.ietf.org/html/rfc2045#section-6.7, rule 3:
// “Therefore, when decoding a `Quoted-Printable` body, any trailing white
// space on a line must be deleted, as it will necessarily have been added
// by intermediate transport agents.”
.replace(/[\t\x20]$/gm, '')
// Remove hard line breaks preceded by `=`. Proper `Quoted-Printable`-
// encoded data only contains CRLF line endings, but for compatibility
// reasons we support separate CR and LF too.
.replace(/=(?:\r\n?|\n|$)/g, '')
// Decode escape sequences of the form `=XX` where `XX` is any
// combination of two hexidecimal digits. For optimal compatibility,
// lowercase hexadecimal digits are supported as well. See
// https://tools.ietf.org/html/rfc2045#section-6.7, note 1.
/**
* @note The method above only supports utf-8 encoding
* @edit Add support for gbk encoding by using TextDecoder
* @condition input must contains full code points
* @todo may cause performance issue with large input
*/
.replace(/(=[a-fA-F0-9]{2}){1,}/g, function ($0, $1) {
const array = $0
.split('=')
.slice(1)
.map((hex) => parseInt(hex, 16));
const buffer = new Uint8Array(array);
const utf8Str = decoder.decode(buffer);

return utf8Str;
})
)
}


// Main module.
const mhtml2html = {

Expand All @@ -109,7 +405,7 @@ const mhtml2html = {
* @param {options.parseDOM} // The callback to parse an HTML string.
* @returns an html document without resources if htmlOnly === true; an MHTML parsed object otherwise.
*/
parse: (mhtml, { htmlOnly = false, parseDOM = defaultDOMParser } = {}) => {
parse: (mhtml, { htmlOnly = false, parseDOM = defaultDOMParser, enc = "utf-8" } = {}) => {
const MHTML_FSM = {
MHTML_HEADERS : 0,
MTHML_CONTENT : 1,
Expand Down Expand Up @@ -139,19 +435,57 @@ const mhtml2html = {
}

// Returns the next line from the index.
/**
* @edit
* @note merge quoted-printable multi-line content into one line
* - this is required for gbk encoding
*/
function getLine(encoding) {
const j = i;
if (encoding === 'quoted-printable') {
let line = mhtml[i];

while (true) {
i++;
assert(i < mhtml.length, 'Unexpected EOF');

line += mhtml[i];

// 如果结尾是 =\n =\r\n 则删除行尾并继续读取下一行

// In older versions of Mac, line breaks are represented by '\r',
// while in Windows, line breaks are represented by '\r\n'.
// Since Mac is not commonly used as a server, we can ignore '\r'.
if (mhtml[i] === '\r') line = line.slice(0, -1)

if (line.endsWith('=\n')) {
line = line.slice(0, -2);
l++;
continue
}

if (line.endsWith('\n')) {
l++;
break
}
}

i++;
var decoded = decodeQuotedPrintable(line, enc);
return decoded;
}

const j = i

// Wait until a newline character is encountered or when we exceed the str length.
while (mhtml[i] !== '\n' && assert(i++ < mhtml.length - 1, 'Unexpected EOF'));
i++; l++;
while (
mhtml[i] !== '\n' &&
assert(i++ < mhtml.length - 1, 'Unexpected EOF')
);
i++;
l++;

const line = mhtml.substring(j, i);

// Return the (decoded) line.
if (encoding === 'quoted-printable') {
return QuotedPrintable.decode(line);
}
if (encoding === 'base64') {
return line.trim();
}
Expand Down Expand Up @@ -291,13 +625,13 @@ const mhtml2html = {
* @param {options.parseDOM} // The callback to parse an HTML string.
* @returns an html document element.
*/
convert: (mhtml, { convertIframes = false, parseDOM = defaultDOMParser } = {}) => {
convert: (mhtml, { convertIframes = false, parseDOM = defaultDOMParser, enc = "utf-8" } = {}) => {
let index, media, frames; // Record-keeping.
let style, base, img; // DOM objects.
let href, src; // References.

if (typeof mhtml === "string") {
mhtml = mhtml2html.parse(mhtml);
mhtml = mhtml2html.parse(mhtml, { enc });
} else {
assert(typeof mhtml === "object", 'Expected argument of type string or object');
}
Expand Down