diff --git a/src/mhtml2html.js b/src/mhtml2html.js
index b52c14b..259da73 100644
--- a/src/mhtml2html.js
+++ b/src/mhtml2html.js
@@ -64,7 +64,7 @@ function replaceReferences(media, base, asset) {
reference = asset.substring(i, asset.indexOf(')', i));
// Get the absolute path of the referenced asset.
- const path = absoluteURL(base, reference.replace(/(\"|\')/g,''));
+ const path = absoluteURL(base, reference.replace(/(\"|\')/g, ''));
if (media[path] != null) {
if (media[path].type === 'text/css') {
media[path].data = replaceReferences(media, base, media[path].data);
@@ -77,7 +77,7 @@ function replaceReferences(media, base, asset) {
Base64.encode(media[path].data)
)}'`;
asset = `${asset.substring(0, i)}${embeddedAsset}${asset.substring(i + reference.length)}`;
- } catch(e) {
+ } catch (e) {
console.warn(e);
}
}
@@ -87,7 +87,7 @@ function replaceReferences(media, base, asset) {
// Converts the provided asset to a data URI based on the encoding.
function convertAssetToDataURI(asset) {
- switch(asset.encoding) {
+ switch (asset.encoding) {
case 'quoted-printable':
return `data:${asset.type};utf8,${escape(QuotedPrintable.decode(asset.data))}`;
case 'base64':
@@ -97,6 +97,11 @@ function convertAssetToDataURI(asset) {
}
}
+// Returns `mhtml` (a string) with every literal "This is a multi-part message in MIME format." preamble sentence found in IE-generated documents removed, together with the CRLF/LF that precedes it; the final dot is escaped so that only the exact sentence is stripped.
+function removeMimeFormatHeaderFromIE(mhtml) {
+    return mhtml.replace(/\r?\nThis is a multi-part message in MIME format\./g, '');
+}
+
// Main module.
const mhtml2html = {
@@ -109,12 +114,13 @@ const mhtml2html = {
* @param {options.parseDOM} // The callback to parse an HTML string.
* @returns an html document without resources if htmlOnly === true; an MHTML parsed object otherwise.
*/
- parse: (mhtml, { htmlOnly = false, parseDOM = defaultDOMParser } = {}) => {
+
+ parse: (mhtml, { htmlOnly = false, parseDOM = defaultDOMParser } = {}) => {
const MHTML_FSM = {
- MHTML_HEADERS : 0,
- MTHML_CONTENT : 1,
- MHTML_DATA : 2,
- MHTML_END : 3
+ MHTML_HEADERS: 0,
+ MTHML_CONTENT: 1,
+ MHTML_DATA: 2,
+ MHTML_END: 3
};
let asset, headers, content, media, frames; // Record-keeping.
@@ -122,10 +128,11 @@ const mhtml2html = {
let state, key, next, index, i, l; // States.
let boundary; // Boundaries.
- headers = { };
- content = { };
- media = { };
- frames = { };
+ mhtml = removeMimeFormatHeaderFromIE(mhtml); // fix for IE-made documents
+ headers = {};
+ content = {};
+ media = {};
+ frames = {};
// Initial state and index.
state = MHTML_FSM.MHTML_HEADERS;
@@ -171,7 +178,7 @@ const mhtml2html = {
}
while (state != MHTML_FSM.MHTML_END) {
- switch(state) {
+ switch (state) {
// Fetch document headers including the boundary to use.
case MHTML_FSM.MHTML_HEADERS: {
next = getLine();
@@ -185,14 +192,14 @@ const mhtml2html = {
// Ensure the extracted boundary exists.
assert(matches != null, `Missing boundary from document headers; Line ${l}`);
- boundary = matches[1].replace(/\"/g,'');
+ boundary = matches[1].replace(/\"/g, '');
trim();
next = getLine();
// Expect the next boundary to appear.
assert(next.includes(boundary), `Expected boundary; Line ${l}`);
- content = { };
+ content = {};
state = MHTML_FSM.MTHML_CONTENT;
}
break;
@@ -208,27 +215,27 @@ const mhtml2html = {
splitHeaders(next, content);
} else {
encoding = content['Content-Transfer-Encoding'];
- type = content['Content-Type'];
- id = content['Content-ID'];
+ type = content['Content-Type'];
+ id = content['Content-ID'];
location = content['Content-Location'];
// Assume the first boundary to be the document.
if (typeof index === 'undefined') {
index = location;
- assert(typeof index !== 'undefined' && type === "text/html", `Index not found; Line ${l}`);
+ assert(typeof index !== 'undefined' && type.startsWith("text/html"), `Index not found; Line ${l}`);
}
// Ensure the extracted information exists.
assert(typeof id !== 'undefined' || typeof location !== 'undefined',
`ID or location header not provided; Line ${l}`);
assert(typeof encoding !== 'undefined', `Content-Transfer-Encoding not provided; Line ${l}`);
- assert(typeof type !== 'undefined', `Content-Type not provided; Line ${l}`);
+ assert(typeof type !== 'undefined', `Content-Type not provided; Line ${l}`);
asset = {
- encoding : encoding,
- type : type,
- data : '',
- id : id
+ encoding: encoding,
+ type: type,
+ data: '',
+ id: id
};
// Keep track of frames by ID.
@@ -242,7 +249,7 @@ const mhtml2html = {
}
trim();
- content = { };
+ content = {};
state = MHTML_FSM.MHTML_DATA;
}
break;
@@ -297,38 +304,39 @@ const mhtml2html = {
let href, src; // References.
if (typeof mhtml === "string") {
+ mhtml = removeMimeFormatHeaderFromIE(mhtml); // fix for IE-made documents
mhtml = mhtml2html.parse(mhtml);
} else {
assert(typeof mhtml === "object", 'Expected argument of type string or object');
}
frames = mhtml.frames;
- media = mhtml.media;
- index = mhtml.index;
+ media = mhtml.media;
+ index = mhtml.index;
assert(typeof frames === "object", 'MHTML error: invalid frames');
- assert(typeof media === "object", 'MHTML error: invalid media' );
- assert(typeof index === "string", 'MHTML error: invalid index' );
- assert(media[index] && media[index].type === "text/html", 'MHTML error: invalid index');
+ assert(typeof media === "object", 'MHTML error: invalid media');
+ assert(typeof index === "string", 'MHTML error: invalid index');
+ assert(media[index] && media[index].type.startsWith("text/html"), 'MHTML error: invalid index');
const dom = parseDOM(media[index].data);
const documentElem = dom.window.document;
- const nodes = [ documentElem ];
+ const nodes = [documentElem];
// Merge resources into the document.
while (nodes.length) {
const childNode = nodes.shift();
// Resolve each node.
- childNode.childNodes.forEach(function(child) {
+ childNode.childNodes.forEach(function (child) {
if (child.getAttribute) {
href = child.getAttribute('href');
- src = child.getAttribute('src');
+ src = child.getAttribute('src');
}
if (child.removeAttribute) {
child.removeAttribute('integrity');
}
- switch(child.tagName) {
+ switch (child.tagName) {
case 'HEAD':
// Link targets should be directed to the outer frame.
base = documentElem.createElement("base");
@@ -360,7 +368,7 @@ const mhtml2html = {
// Embed the image into the document.
try {
img = convertAssetToDataURI(media[src]);
- } catch(e) {
+ } catch (e) {
console.warn(e);
}
if (img !== null) {
@@ -377,7 +385,7 @@ const mhtml2html = {
if (frame && frame.type === 'text/html') {
const iframe = mhtml2html.convert({
- media: Object.assign({}, media, { [id] : frame }),
+ media: Object.assign({}, media, { [id]: frame }),
frames: frames,
index: id,
}, { convertIframes, parseDOM });