diff --git a/src/mhtml2html.js b/src/mhtml2html.js
index b52c14b..dc43ad4 100644
--- a/src/mhtml2html.js
+++ b/src/mhtml2html.js
@@ -97,6 +97,302 @@ function convertAssetToDataURI(asset) {
}
}
+/**
+ * Builds a TextDecoder for `label`, returning `undefined` instead of throwing
+ * when the current runtime does not support that encoding (TextDecoder raises
+ * a RangeError for unknown/unsupported labels). This keeps the module loadable
+ * on runtimes that ship only a subset of the legacy encodings; a missing
+ * decoder is then treated like an unknown encoding at lookup time.
+ * @param {string} label // The encoding label handed to TextDecoder.
+ * @returns a TextDecoder, or undefined if the label is unsupported here.
+ */
+function createOptionalDecoder(label) {
+    try {
+        return new TextDecoder(label);
+    } catch (err) {
+        if (err instanceof RangeError) {
+            return undefined;
+        }
+        throw err;
+    }
+}
+
+// One shared decoder per encoding. The UTF-8 decoder is the fallback used for
+// unknown encodings, so it is created unguarded. Its misspelled name is kept
+// as-is because other code in this file refers to it by this identifier.
+const utf_8Decodeer = new TextDecoder('utf-8');
+const ibm866Decoder = createOptionalDecoder('ibm866');
+const iso_8859_2Decoder = createOptionalDecoder('iso-8859-2');
+const iso_8859_3Decoder = createOptionalDecoder('iso-8859-3');
+const iso_8859_4Decoder = createOptionalDecoder('iso-8859-4');
+const iso_8859_5Decoder = createOptionalDecoder('iso-8859-5');
+const iso_8859_6Decoder = createOptionalDecoder('iso-8859-6');
+const iso_8859_7Decoder = createOptionalDecoder('iso-8859-7');
+const iso_8859_8Decoder = createOptionalDecoder('iso-8859-8');
+const iso_8859_8_iDecoder = createOptionalDecoder('iso-8859-8-i');
+const iso_8859_10Decoder = createOptionalDecoder('iso-8859-10');
+const iso_8859_13Decoder = createOptionalDecoder('iso-8859-13');
+const iso_8859_14Decoder = createOptionalDecoder('iso-8859-14');
+const iso_8859_15Decoder = createOptionalDecoder('iso-8859-15');
+const koi8_rDecoder = createOptionalDecoder('koi8-r');
+const koi8_uDecoder = createOptionalDecoder('koi8-u');
+const macintoshDecoder = createOptionalDecoder('macintosh');
+const windows_874Decoder = createOptionalDecoder('windows-874');
+const windows_1250Decoder = createOptionalDecoder('windows-1250');
+const windows_1251Decoder = createOptionalDecoder('windows-1251');
+const windows_1252Decoder = createOptionalDecoder('windows-1252');
+const windows_1253Decoder = createOptionalDecoder('windows-1253');
+const windows_1254Decoder = createOptionalDecoder('windows-1254');
+const windows_1255Decoder = createOptionalDecoder('windows-1255');
+const windows_1256Decoder = createOptionalDecoder('windows-1256');
+const windows_1257Decoder = createOptionalDecoder('windows-1257');
+const windows_1258Decoder = createOptionalDecoder('windows-1258');
+const x_mac_cyrillicDecoder = createOptionalDecoder('x-mac-cyrillic');
+const gbkDecoder = createOptionalDecoder('gbk');
+const gb18030Decoder = createOptionalDecoder('gb18030');
+const big5Decoder = createOptionalDecoder('big5');
+const euc_jpDecoder = createOptionalDecoder('euc-jp');
+const iso_2022_jpDecoder = createOptionalDecoder('iso-2022-jp');
+const shift_jisDecoder = createOptionalDecoder('shift_jis');
+const euc_krDecoder = createOptionalDecoder('euc-kr');
+
+/**
+ * Lookup table from an upper-cased charset label to the shared decoder for
+ * that encoding. Keys must be upper case because callers upper-case the label
+ * before the lookup. The labels appear to mirror the alias lists of the
+ * WHATWG Encoding Standard; the UTF-8 aliases (e.g. `UTF8`) are included so
+ * that common spellings are not reported as unknown encodings.
+ */
+const decodeMap = {
+    'UTF-8': utf_8Decodeer,
+    'UTF8': utf_8Decodeer,
+    'UNICODE-1-1-UTF-8': utf_8Decodeer,
+    'UNICODE11UTF8': utf_8Decodeer,
+    'UNICODE20UTF8': utf_8Decodeer,
+    'X-UNICODE20UTF8': utf_8Decodeer,
+    'IBM866': ibm866Decoder,
+    '866': ibm866Decoder,
+    'CP866': ibm866Decoder,
+    'CSIBM866': ibm866Decoder,
+    'ISO-8859-2': iso_8859_2Decoder,
+    'CSISOLATIN2': iso_8859_2Decoder,
+    'ISO-IR-101': iso_8859_2Decoder,
+    'ISO8859-2': iso_8859_2Decoder,
+    'ISO88592': iso_8859_2Decoder,
+    'ISO_8859-2': iso_8859_2Decoder,
+    'ISO_8859-2:1987': iso_8859_2Decoder,
+    'L2': iso_8859_2Decoder,
+    'LATIN2': iso_8859_2Decoder,
+    'ISO-8859-3': iso_8859_3Decoder,
+    'CSISOLATIN3': iso_8859_3Decoder,
+    'ISO-IR-109': iso_8859_3Decoder,
+    'ISO8859-3': iso_8859_3Decoder,
+    'ISO88593': iso_8859_3Decoder,
+    'ISO_8859-3': iso_8859_3Decoder,
+    'ISO_8859-3:1988': iso_8859_3Decoder,
+    'L3': iso_8859_3Decoder,
+    'LATIN3': iso_8859_3Decoder,
+    'ISO-8859-4': iso_8859_4Decoder,
+    'CSISOLATIN4': iso_8859_4Decoder,
+    'ISO-IR-110': iso_8859_4Decoder,
+    'ISO8859-4': iso_8859_4Decoder,
+    'ISO88594': iso_8859_4Decoder,
+    'ISO_8859-4': iso_8859_4Decoder,
+    'ISO_8859-4:1988': iso_8859_4Decoder,
+    'L4': iso_8859_4Decoder,
+    'LATIN4': iso_8859_4Decoder,
+    'ISO-8859-5': iso_8859_5Decoder,
+    'CSISOLATINCYRILLIC': iso_8859_5Decoder,
+    'CYRILLIC': iso_8859_5Decoder,
+    'ISO-IR-144': iso_8859_5Decoder,
+    'ISO8859-5': iso_8859_5Decoder,
+    'ISO88595': iso_8859_5Decoder,
+    'ISO_8859-5': iso_8859_5Decoder,
+    'ISO_8859-5:1988': iso_8859_5Decoder,
+    'ISO-8859-6': iso_8859_6Decoder,
+    'ARABIC': iso_8859_6Decoder,
+    'ASMO-708': iso_8859_6Decoder,
+    'CSISO88596E': iso_8859_6Decoder,
+    'CSISO88596I': iso_8859_6Decoder,
+    'CSISOLATINARABIC': iso_8859_6Decoder,
+    'ECMA-114': iso_8859_6Decoder,
+    'ISO-8859-6-E': iso_8859_6Decoder,
+    'ISO-8859-6-I': iso_8859_6Decoder,
+    'ISO-IR-127': iso_8859_6Decoder,
+    'ISO8859-6': iso_8859_6Decoder,
+    'ISO88596': iso_8859_6Decoder,
+    'ISO_8859-6': iso_8859_6Decoder,
+    'ISO_8859-6:1987': iso_8859_6Decoder,
+    'ISO-8859-7': iso_8859_7Decoder,
+    'CSISOLATINGREEK': iso_8859_7Decoder,
+    'ECMA-118': iso_8859_7Decoder,
+    'ELOT_928': iso_8859_7Decoder,
+    'GREEK': iso_8859_7Decoder,
+    'GREEK8': iso_8859_7Decoder,
+    'ISO-IR-126': iso_8859_7Decoder,
+    'ISO8859-7': iso_8859_7Decoder,
+    'ISO88597': iso_8859_7Decoder,
+    'ISO_8859-7': iso_8859_7Decoder,
+    'ISO_8859-7:1987': iso_8859_7Decoder,
+    'SUN_EU_GREEK': iso_8859_7Decoder,
+    'ISO-8859-8': iso_8859_8Decoder,
+    'CSISO88598E': iso_8859_8Decoder,
+    'CSISOLATINHEBREW': iso_8859_8Decoder,
+    'HEBREW': iso_8859_8Decoder,
+    'ISO-8859-8-E': iso_8859_8Decoder,
+    'ISO-IR-138': iso_8859_8Decoder,
+    'ISO8859-8': iso_8859_8Decoder,
+    'ISO88598': iso_8859_8Decoder,
+    'ISO_8859-8': iso_8859_8Decoder,
+    'ISO_8859-8:1988': iso_8859_8Decoder,
+    'VISUAL': iso_8859_8Decoder,
+    'ISO-8859-8-I': iso_8859_8_iDecoder,
+    'CSISO88598I': iso_8859_8_iDecoder,
+    'LOGICAL': iso_8859_8_iDecoder,
+    'ISO-8859-10': iso_8859_10Decoder,
+    'CSISOLATIN6': iso_8859_10Decoder,
+    'ISO-IR-157': iso_8859_10Decoder,
+    'ISO8859-10': iso_8859_10Decoder,
+    'ISO885910': iso_8859_10Decoder,
+    'L6': iso_8859_10Decoder,
+    'LATIN6': iso_8859_10Decoder,
+    'ISO-8859-13': iso_8859_13Decoder,
+    'ISO8859-13': iso_8859_13Decoder,
+    'ISO885913': iso_8859_13Decoder,
+    'ISO-8859-14': iso_8859_14Decoder,
+    'ISO8859-14': iso_8859_14Decoder,
+    'ISO885914': iso_8859_14Decoder,
+    'ISO-8859-15': iso_8859_15Decoder,
+    'CSISOLATIN9': iso_8859_15Decoder,
+    'ISO8859-15': iso_8859_15Decoder,
+    'ISO885915': iso_8859_15Decoder,
+    'ISO_8859-15': iso_8859_15Decoder,
+    'L9': iso_8859_15Decoder,
+    'KOI8-R': koi8_rDecoder,
+    'CSKOI8R': koi8_rDecoder,
+    'KOI': koi8_rDecoder,
+    'KOI8': koi8_rDecoder,
+    'KOI8_R': koi8_rDecoder,
+    'KOI8-U': koi8_uDecoder,
+    'KOI8-RU': koi8_uDecoder,
+    'MACINTOSH': macintoshDecoder,
+    'CSMACINTOSH': macintoshDecoder,
+    'MAC': macintoshDecoder,
+    'X-MAC-ROMAN': macintoshDecoder,
+    'WINDOWS-874': windows_874Decoder,
+    'DOS-874': windows_874Decoder,
+    'ISO-8859-11': windows_874Decoder,
+    'ISO8859-11': windows_874Decoder,
+    'ISO885911': windows_874Decoder,
+    'TIS-620': windows_874Decoder,
+    'WINDOWS-1250': windows_1250Decoder,
+    'CP1250': windows_1250Decoder,
+    'X-CP1250': windows_1250Decoder,
+    'WINDOWS-1251': windows_1251Decoder,
+    'CP1251': windows_1251Decoder,
+    'X-CP1251': windows_1251Decoder,
+    'WINDOWS-1252': windows_1252Decoder,
+    'ANSI_X3.4-1968': windows_1252Decoder,
+    'ASCII': windows_1252Decoder,
+    'CP1252': windows_1252Decoder,
+    'CP819': windows_1252Decoder,
+    'CSISOLATIN1': windows_1252Decoder,
+    'IBM819': windows_1252Decoder,
+    'ISO-8859-1': windows_1252Decoder,
+    'ISO-IR-100': windows_1252Decoder,
+    'ISO8859-1': windows_1252Decoder,
+    'ISO88591': windows_1252Decoder,
+    'ISO_8859-1': windows_1252Decoder,
+    'ISO_8859-1:1987': windows_1252Decoder,
+    'L1': windows_1252Decoder,
+    'LATIN1': windows_1252Decoder,
+    'US-ASCII': windows_1252Decoder,
+    'X-CP1252': windows_1252Decoder,
+    'WINDOWS-1253': windows_1253Decoder,
+    'CP1253': windows_1253Decoder,
+    'X-CP1253': windows_1253Decoder,
+    'WINDOWS-1254': windows_1254Decoder,
+    'CP1254': windows_1254Decoder,
+    'CSISOLATIN5': windows_1254Decoder,
+    'ISO-8859-9': windows_1254Decoder,
+    'ISO-IR-148': windows_1254Decoder,
+    'ISO8859-9': windows_1254Decoder,
+    'ISO88599': windows_1254Decoder,
+    'ISO_8859-9': windows_1254Decoder,
+    'ISO_8859-9:1989': windows_1254Decoder,
+    'L5': windows_1254Decoder,
+    'LATIN5': windows_1254Decoder,
+    'X-CP1254': windows_1254Decoder,
+    'WINDOWS-1255': windows_1255Decoder,
+    'CP1255': windows_1255Decoder,
+    'X-CP1255': windows_1255Decoder,
+    'WINDOWS-1256': windows_1256Decoder,
+    'CP1256': windows_1256Decoder,
+    'X-CP1256': windows_1256Decoder,
+    'WINDOWS-1257': windows_1257Decoder,
+    'CP1257': windows_1257Decoder,
+    'X-CP1257': windows_1257Decoder,
+    'WINDOWS-1258': windows_1258Decoder,
+    'CP1258': windows_1258Decoder,
+    'X-CP1258': windows_1258Decoder,
+    'X-MAC-CYRILLIC': x_mac_cyrillicDecoder,
+    'X-MAC-UKRAINIAN': x_mac_cyrillicDecoder,
+    'GBK': gbkDecoder,
+    'CHINESE': gbkDecoder,
+    'CSGB2312': gbkDecoder,
+    'CSISO58GB231280': gbkDecoder,
+    'GB2312': gbkDecoder,
+    'GB_2312': gbkDecoder,
+    'GB_2312-80': gbkDecoder,
+    'ISO-IR-58': gbkDecoder,
+    'X-GBK': gbkDecoder,
+    'GB18030': gb18030Decoder,
+    'BIG5': big5Decoder,
+    'BIG5-HKSCS': big5Decoder,
+    'CN-BIG5': big5Decoder,
+    'CSBIG5': big5Decoder,
+    'X-X-BIG5': big5Decoder,
+    'EUC-JP': euc_jpDecoder,
+    'CSEUCPKDFMTJAPANESE': euc_jpDecoder,
+    'X-EUC-JP': euc_jpDecoder,
+    'ISO-2022-JP': iso_2022_jpDecoder,
+    'CSISO2022JP': iso_2022_jpDecoder,
+    'SHIFT_JIS': shift_jisDecoder,
+    'CSSHIFTJIS': shift_jisDecoder,
+    'MS932': shift_jisDecoder,
+    'MS_KANJI': shift_jisDecoder,
+    'SHIFT-JIS': shift_jisDecoder,
+    'SJIS': shift_jisDecoder,
+    'WINDOWS-31J': shift_jisDecoder,
+    'X-SJIS': shift_jisDecoder,
+    'EUC-KR': euc_krDecoder,
+    'CSEUCKR': euc_krDecoder,
+    'CSKSC56011987': euc_krDecoder,
+    'ISO-IR-149': euc_krDecoder,
+    'KOREAN': euc_krDecoder,
+    'KS_C_5601-1987': euc_krDecoder,
+    'KS_C_5601-1989': euc_krDecoder,
+    'KSC5601': euc_krDecoder,
+    'KSC_5601': euc_krDecoder,
+    'WINDOWS-949': euc_krDecoder,
+};
+
+/**
+ * Decodes a quoted-printable string (used for text parts such as CSS/HTML)
+ * into text, interpreting the decoded bytes in the given character encoding.
+ *
+ * Literal ASCII characters and `=XX` escapes are collected into one byte
+ * stream and decoded together. This matters for multi-byte encodings (GBK,
+ * Big5, Shift_JIS, ...) whose trail bytes may be printable ASCII and are
+ * therefore left unescaped by the encoder: decoding only the `=XX` runs would
+ * split such characters and produce U+FFFD.
+ *
+ * @license MIT
+ * - edited from `quoted-printable` by Mathias Bynens
+ * @see {@link https://github.com/mathiasbynens/quoted-printable/blob/master/src/quoted-printable.js}
+ * @param {string} input // The quoted-printable encoded text.
+ * @param {string} enc // Charset label (case-insensitive); defaults to utf-8.
+ *   Unknown labels are logged via console.error and decoded as utf-8.
+ * @returns the decoded string.
+ */
+function decodeQuotedPrintable(input, enc = 'utf-8') {
+    // A default parameter only covers `undefined`; guard `null` as well and
+    // tolerate surrounding whitespace in the label.
+    const label = (enc == null ? 'utf-8' : String(enc)).trim().toUpperCase();
+
+    let decoder = decodeMap[label];
+    if (!decoder) {
+        console.error("unknown encoding", enc);
+        decoder = utf_8Decodeer;
+    }
+
+    const text = input
+        // https://tools.ietf.org/html/rfc2045#section-6.7, rule 3:
+        // "Therefore, when decoding a `Quoted-Printable` body, any trailing white
+        // space on a line must be deleted, as it will necessarily have been added
+        // by intermediate transport agents." Note the `+`: all of it, not just
+        // the last character.
+        .replace(/[\t\x20]+$/gm, '')
+        // Remove soft line breaks (`=` followed by a line ending). Proper
+        // `Quoted-Printable` data only contains CRLF line endings, but for
+        // compatibility separate CR and LF are supported too.
+        .replace(/=(?:\r\n?|\n|$)/g, '');
+
+    // Maps the char code of a hexadecimal digit to its value, or -1 otherwise
+    // (including NaN when reading past the end of the string).
+    const hexDigitValue = (code) => {
+        if (code >= 0x30 && code <= 0x39) return code - 0x30; // 0-9
+        if (code >= 0x41 && code <= 0x46) return code - 0x41 + 10; // A-F
+        if (code >= 0x61 && code <= 0x66) return code - 0x61 + 10; // a-f
+        return -1;
+    };
+
+    // Every input character yields at most one byte, so this buffer never overflows.
+    const bytes = new Uint8Array(text.length);
+    let pendingCount = 0;
+    let output = '';
+
+    // Decodes the bytes gathered so far and appends them to the output.
+    const flush = () => {
+        if (pendingCount > 0) {
+            output += decoder.decode(bytes.subarray(0, pendingCount));
+            pendingCount = 0;
+        }
+    };
+
+    // A manual scan (instead of one big regular expression) keeps this safe
+    // for very long lines.
+    for (let k = 0; k < text.length; k++) {
+        const code = text.charCodeAt(k);
+
+        if (code === 0x3d) {
+            // `=XX` escape; upper- and lower-case hex digits are both accepted
+            // (https://tools.ietf.org/html/rfc2045#section-6.7, note 1).
+            const high = hexDigitValue(text.charCodeAt(k + 1));
+            const low = hexDigitValue(text.charCodeAt(k + 2));
+            if (high >= 0 && low >= 0) {
+                bytes[pendingCount++] = high * 16 + low;
+                k += 2;
+                continue;
+            }
+            // A stray `=` falls through and is kept as a literal character.
+        }
+
+        if (code < 0x80) {
+            // Literal ASCII character: it stands for the byte of the same value.
+            bytes[pendingCount++] = code;
+        } else {
+            // Non-ASCII characters are not valid quoted-printable; pass them
+            // through unchanged rather than corrupting them.
+            flush();
+            output += text[k];
+        }
+    }
+    flush();
+
+    return output;
+}
+
+
// Main module.
const mhtml2html = {
@@ -109,7 +405,7 @@ const mhtml2html = {
* @param {options.parseDOM} // The callback to parse an HTML string.
* @returns an html document without resources if htmlOnly === true; an MHTML parsed object otherwise.
*/
- parse: (mhtml, { htmlOnly = false, parseDOM = defaultDOMParser } = {}) => {
+ parse: (mhtml, { htmlOnly = false, parseDOM = defaultDOMParser, enc = "utf-8" } = {}) => {
const MHTML_FSM = {
MHTML_HEADERS : 0,
MTHML_CONTENT : 1,
@@ -139,19 +435,57 @@ const mhtml2html = {
}
// Returns the next line from the index.
+ /**
+ * @edit
+ * @note merge quoted-printable multi-line content into one line
+ * - this is required for gbk encoding
+ */
function getLine(encoding) {
- const j = i;
+ if (encoding === 'quoted-printable') {
+ let line = mhtml[i];
+
+ while (true) {
+ i++;
+ assert(i < mhtml.length, 'Unexpected EOF');
+
+ line += mhtml[i];
+
+ // If the line ends with =\n or =\r\n, strip that soft line break and continue reading the next line.
+
+ // In older versions of Mac, line breaks are represented by '\r',
+ // while in Windows, line breaks are represented by '\r\n'.
+ // Since Mac is not commonly used as a server, we can ignore '\r'.
+ if (mhtml[i] === '\r') line = line.slice(0, -1)
+
+ if (line.endsWith('=\n')) {
+ line = line.slice(0, -2);
+ l++;
+ continue
+ }
+
+ if (line.endsWith('\n')) {
+ l++;
+ break
+ }
+ }
+
+ i++;
+ var decoded = decodeQuotedPrintable(line, enc);
+ return decoded;
+ }
+
+ const j = i
// Wait until a newline character is encountered or when we exceed the str length.
- while (mhtml[i] !== '\n' && assert(i++ < mhtml.length - 1, 'Unexpected EOF'));
- i++; l++;
+ while (
+ mhtml[i] !== '\n' &&
+ assert(i++ < mhtml.length - 1, 'Unexpected EOF')
+ );
+ i++;
+ l++;
const line = mhtml.substring(j, i);
- // Return the (decoded) line.
- if (encoding === 'quoted-printable') {
- return QuotedPrintable.decode(line);
- }
if (encoding === 'base64') {
return line.trim();
}
@@ -291,13 +625,13 @@ const mhtml2html = {
* @param {options.parseDOM} // The callback to parse an HTML string.
* @returns an html document element.
*/
- convert: (mhtml, { convertIframes = false, parseDOM = defaultDOMParser } = {}) => {
+ convert: (mhtml, { convertIframes = false, parseDOM = defaultDOMParser, enc = "utf-8" } = {}) => {
let index, media, frames; // Record-keeping.
let style, base, img; // DOM objects.
let href, src; // References.
if (typeof mhtml === "string") {
- mhtml = mhtml2html.parse(mhtml);
+ mhtml = mhtml2html.parse(mhtml, { enc });
} else {
assert(typeof mhtml === "object", 'Expected argument of type string or object');
}