diff --git a/src/utils.js b/src/utils.js
index 63f3f18..b1154f2 100644
--- a/src/utils.js
+++ b/src/utils.js
@@ -5,26 +5,78 @@ function isWhitespaceTextNode(node) {
 }
 
+/**
+ * Replaces every <p> whose only meaningful child is a custom element (a tag
+ * name containing '-') with that element, then serializes the result.
+ *
+ * @param {string} html - An HTML fragment or a complete HTML document.
+ * @returns {string} The serialized HTML with wrapping paragraphs removed.
+ * @throws {Error} If the input is a full document but no <body> node is found.
+ */
 export const stripWrappingParagraphs = (html) => {
-  const parsedHtml = parse5.parseFragment(html);
-  parsedHtml.childNodes = parsedHtml.childNodes.flatMap((node) => {
-    if (node.nodeName !== 'p') {
-      return node;
-    }
+  // Documents and fragments use different parse5 entry points. Leading
+  // whitespace, any doctype spacing and attributes on <html> are tolerated.
+  const isFullHtmlDoc = /^\s*<(!DOCTYPE\s+html|html)[\s>]/i.test(html);
+  const parsedHtml = isFullHtmlDoc ? parse5.parse(html) : parse5.parseFragment(html);
+
+  // The flag must be forwarded: without it chooseRootNode never takes its
+  // document branch, leaving the <body> lookup and its error as dead code.
+  const rootNode = chooseRootNode(parsedHtml, isFullHtmlDoc);
+  rootNode.childNodes = rootNode.childNodes.map(traverseNodes);
+
+  return parse5.serialize(parsedHtml);
+};
 
-    // Ignore whitespace-only text nodes
-    const meaningfulChildren = node.childNodes.filter(
-      (child) => !isWhitespaceTextNode(child)
-    );
+/**
+ * Picks the node whose children are rewritten: <body> for a full document,
+ * the parsed tree itself for a fragment.
+ */
+function chooseRootNode(parsedHtml, isFullHtmlDoc) {
+  if (isFullHtmlDoc) {
+    const rootNode = parsedHtml.childNodes
+      .find((x) => x.nodeName === 'html')
+      ?.childNodes?.find((x) => x.nodeName === 'body');
 
-    if (
-      meaningfulChildren.length === 1 &&
-      meaningfulChildren[0].nodeName.includes('-')
-    ) {
-      return meaningfulChildren[0];
+    if (!rootNode) {
+      throw new Error('html output is missing the body tag');
     }
 
+    return rootNode;
+  } else {
+    return parsedHtml;
+  }
+}
+
+/** Strips a wrapping paragraph from the node, then recurses into its children. */
+function traverseNodes(node) {
+  node = stripWrappingParagraph(node);
+
+  // Don't traverse children of custom elements
+  if (node.childNodes && !node.nodeName.includes('-')) {
+    node.childNodes = node.childNodes.map(traverseNodes);
+  }
+
+  return node;
+}
+
+/**
+ * Returns the lone custom-element child of a <p>, ignoring children that
+ * isWhitespaceTextNode reports; any other node is returned unchanged.
+ */
+function stripWrappingParagraph(node) {
+  if (node.nodeName !== 'p') {
     return node;
-  });
+  }
+
+  // Ignore whitespace-only text nodes
+  const meaningfulChildren = node.childNodes.filter(
+    (child) => !isWhitespaceTextNode(child)
+  );
+
+  if (meaningfulChildren.length === 1 &&
+    meaningfulChildren[0].nodeName.includes('-')) {
+    return meaningfulChildren[0];
+  }
+
+  return node;
+}
 
-  return parse5.serialize(parsedHtml);
-};
diff 
--git a/test/utils.test.js b/test/utils.test.js index 9ff32c7..e5da916 100644 --- a/test/utils.test.js +++ b/test/utils.test.js @@ -3,56 +3,96 @@ import assert from 'node:assert/strict'; import { describe, it } from 'node:test'; describe('stripWrappingParagraphs', () => { - it('removes wrapping p tags', async () => { - const input = '

'; - const expected = ''; + describe('Html fragment', () => { + it('removes wrapping p tags', async () => { + const input = '

'; + const expected = ''; - const result = stripWrappingParagraphs(input); + const result = stripWrappingParagraphs(input); - assert.equal(result, expected); - }); + assert.equal(result, expected); + }); - it('removes wrapping p tags (inner content)', async () => { - const input = '

inner content

'; - const expected = 'inner content'; + it('removes wrapping p tags (inner content)', async () => { + const input = '

inner content

'; + const expected = 'inner content'; - const result = stripWrappingParagraphs(input); + const result = stripWrappingParagraphs(input); - assert.equal(result, expected); - }); + assert.equal(result, expected); + }); - it('removes wrapping p tags (inner content, newlines)', async () => { - const input = '

inner\ncontent

'; - const expected = 'inner\ncontent'; + it('removes wrapping p tags (inner content, newlines)', async () => { + const input = '

inner\ncontent

'; + const expected = 'inner\ncontent'; - const result = stripWrappingParagraphs(input); + const result = stripWrappingParagraphs(input); - assert.equal(result, expected); - }); + assert.equal(result, expected); + }); - it('removes wrapping p tags (whitespace)', async () => { - const input = '

\n\t \n

'; - const expected = ''; + it('removes wrapping p tags (whitespace)', async () => { + const input = '

\n\t \n

'; + const expected = ''; - const result = stripWrappingParagraphs(input); + const result = stripWrappingParagraphs(input); - assert.equal(result, expected); - }); + assert.equal(result, expected); + }); + + it('does not remove wrapping p tags if it includes other content', async () => { + const input = '

Hello

'; + + const result = stripWrappingParagraphs(input); + + assert.equal(result, input); + }); + + it('removes wrapping p tags (multiple)', async () => { + const input = + '

\n

'; + const expected = '\n'; - it('does not remove wrapping p tags if it includes other content', async () => { - const input = '

Hello

'; + const result = stripWrappingParagraphs(input); - const result = stripWrappingParagraphs(input); + assert.equal(result, expected); + }); - assert.equal(result, input); + it('removes nested wrapping p tags', async () => { + const input = '

'; + const expected = '
'; + + const result = stripWrappingParagraphs(input); + + assert.equal(result, expected); + }); + + it('removes double nested wrapping p tags', async () => { + const input = '

'; + const expected = '
'; + + const result = stripWrappingParagraphs(input); + + assert.equal(result, expected); + }); }); - it('removes wrapping p tags (multiple)', async () => { - const input = '

\n

'; - const expected = '\n'; + describe('Html document', () => { + it('preserves html, head, and body tags', () => { + const input = '

Hello

'; + + const result = stripWrappingParagraphs(input); + + assert.equal(result, input); + }); + + it('strips p tags in body', () => { + const input = '

'; + const expected = ''; - const result = stripWrappingParagraphs(input); + const result = stripWrappingParagraphs(input); - assert.equal(result, expected); + assert.equal(result, expected); + }); }); });