Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
78 changes: 43 additions & 35 deletions src/mhtml2html.js
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ function replaceReferences(media, base, asset) {
reference = asset.substring(i, asset.indexOf(')', i));

// Get the absolute path of the referenced asset.
const path = absoluteURL(base, reference.replace(/(\"|\')/g,''));
const path = absoluteURL(base, reference.replace(/(\"|\')/g, ''));
if (media[path] != null) {
if (media[path].type === 'text/css') {
media[path].data = replaceReferences(media, base, media[path].data);
Expand All @@ -77,7 +77,7 @@ function replaceReferences(media, base, asset) {
Base64.encode(media[path].data)
)}'`;
asset = `${asset.substring(0, i)}${embeddedAsset}${asset.substring(i + reference.length)}`;
} catch(e) {
} catch (e) {
console.warn(e);
}
}
Expand All @@ -87,7 +87,7 @@ function replaceReferences(media, base, asset) {

// Converts the provided asset to a data URI based on the encoding.
function convertAssetToDataURI(asset) {
switch(asset.encoding) {
switch (asset.encoding) {
case 'quoted-printable':
return `data:${asset.type};utf8,${escape(QuotedPrintable.decode(asset.data))}`;
case 'base64':
Expand All @@ -97,6 +97,11 @@ function convertAssetToDataURI(asset) {
}
}

// Strips the MIME preamble sentence "This is a multi-part message in MIME format."
// from an MHTML string; callers apply this as a fix for IE-made documents.
// Every occurrence of a line break (CRLF or LF) followed by that exact sentence is
// removed together with its leading line break; all other text is left untouched.
//
// @param {string} mhtml // The raw MHTML document text.
// @returns {string} the MHTML text with the preamble sentence removed.
function removeMimeFormatHeaderFromIE(mhtml) {
    // The final period is escaped so that it matches a literal '.' only. Unescaped,
    // it would match any character and could consume an unrelated character that
    // happens to follow the phrase (for example the '\r' of the next CRLF).
    return mhtml.replace(/(\r\n|\n)This is a multi-part message in MIME format\./g, '');
}

// Main module.
const mhtml2html = {

Expand All @@ -109,23 +114,25 @@ const mhtml2html = {
* @param {options.parseDOM} // The callback to parse an HTML string.
* @returns an html document without resources if htmlOnly === true; an MHTML parsed object otherwise.
*/
parse: (mhtml, { htmlOnly = false, parseDOM = defaultDOMParser } = {}) => {

parse: (mhtml, { htmlOnly = false, parseDOM = defaultDOMParser } = {}) => {
const MHTML_FSM = {
MHTML_HEADERS : 0,
MTHML_CONTENT : 1,
MHTML_DATA : 2,
MHTML_END : 3
MHTML_HEADERS: 0,
MTHML_CONTENT: 1,
MHTML_DATA: 2,
MHTML_END: 3
};

let asset, headers, content, media, frames; // Record-keeping.
let location, encoding, type, id; // Content properties.
let state, key, next, index, i, l; // States.
let boundary; // Boundaries.

headers = { };
content = { };
media = { };
frames = { };
mhtml = removeMimeFormatHeaderFromIE(mhtml); // fix for IE-made documents
headers = {};
content = {};
media = {};
frames = {};

// Initial state and index.
state = MHTML_FSM.MHTML_HEADERS;
Expand Down Expand Up @@ -171,7 +178,7 @@ const mhtml2html = {
}

while (state != MHTML_FSM.MHTML_END) {
switch(state) {
switch (state) {
// Fetch document headers including the boundary to use.
case MHTML_FSM.MHTML_HEADERS: {
next = getLine();
Expand All @@ -185,14 +192,14 @@ const mhtml2html = {

// Ensure the extracted boundary exists.
assert(matches != null, `Missing boundary from document headers; Line ${l}`);
boundary = matches[1].replace(/\"/g,'');
boundary = matches[1].replace(/\"/g, '');

trim();
next = getLine();

// Expect the next boundary to appear.
assert(next.includes(boundary), `Expected boundary; Line ${l}`);
content = { };
content = {};
state = MHTML_FSM.MTHML_CONTENT;
}
break;
Expand All @@ -208,27 +215,27 @@ const mhtml2html = {
splitHeaders(next, content);
} else {
encoding = content['Content-Transfer-Encoding'];
type = content['Content-Type'];
id = content['Content-ID'];
type = content['Content-Type'];
id = content['Content-ID'];
location = content['Content-Location'];

// Assume the first boundary to be the document.
if (typeof index === 'undefined') {
index = location;
assert(typeof index !== 'undefined' && type === "text/html", `Index not found; Line ${l}`);
assert(typeof index !== 'undefined' && type.startsWith("text/html"), `Index not found; Line ${l}`);
}

// Ensure the extracted information exists.
assert(typeof id !== 'undefined' || typeof location !== 'undefined',
`ID or location header not provided; Line ${l}`);
assert(typeof encoding !== 'undefined', `Content-Transfer-Encoding not provided; Line ${l}`);
assert(typeof type !== 'undefined', `Content-Type not provided; Line ${l}`);
assert(typeof type !== 'undefined', `Content-Type not provided; Line ${l}`);

asset = {
encoding : encoding,
type : type,
data : '',
id : id
encoding: encoding,
type: type,
data: '',
id: id
};

// Keep track of frames by ID.
Expand All @@ -242,7 +249,7 @@ const mhtml2html = {
}

trim();
content = { };
content = {};
state = MHTML_FSM.MHTML_DATA;
}
break;
Expand Down Expand Up @@ -297,38 +304,39 @@ const mhtml2html = {
let href, src; // References.

if (typeof mhtml === "string") {
mhtml = removeMimeFormatHeaderFromIE(mhtml); // fix for IE-made documents
mhtml = mhtml2html.parse(mhtml);
} else {
assert(typeof mhtml === "object", 'Expected argument of type string or object');
}

frames = mhtml.frames;
media = mhtml.media;
index = mhtml.index;
media = mhtml.media;
index = mhtml.index;

assert(typeof frames === "object", 'MHTML error: invalid frames');
assert(typeof media === "object", 'MHTML error: invalid media' );
assert(typeof index === "string", 'MHTML error: invalid index' );
assert(media[index] && media[index].type === "text/html", 'MHTML error: invalid index');
assert(typeof media === "object", 'MHTML error: invalid media');
assert(typeof index === "string", 'MHTML error: invalid index');
assert(media[index] && media[index].type.startsWith("text/html"), 'MHTML error: invalid index');

const dom = parseDOM(media[index].data);
const documentElem = dom.window.document;
const nodes = [ documentElem ];
const nodes = [documentElem];

// Merge resources into the document.
while (nodes.length) {
const childNode = nodes.shift();

// Resolve each node.
childNode.childNodes.forEach(function(child) {
childNode.childNodes.forEach(function (child) {
if (child.getAttribute) {
href = child.getAttribute('href');
src = child.getAttribute('src');
src = child.getAttribute('src');
}
if (child.removeAttribute) {
child.removeAttribute('integrity');
}
switch(child.tagName) {
switch (child.tagName) {
case 'HEAD':
// Link targets should be directed to the outer frame.
base = documentElem.createElement("base");
Expand Down Expand Up @@ -360,7 +368,7 @@ const mhtml2html = {
// Embed the image into the document.
try {
img = convertAssetToDataURI(media[src]);
} catch(e) {
} catch (e) {
console.warn(e);
}
if (img !== null) {
Expand All @@ -377,7 +385,7 @@ const mhtml2html = {

if (frame && frame.type === 'text/html') {
const iframe = mhtml2html.convert({
media: Object.assign({}, media, { [id] : frame }),
media: Object.assign({}, media, { [id]: frame }),
frames: frames,
index: id,
}, { convertIframes, parseDOM });
Expand Down