diff --git a/src/main/java/org/codelibs/nekohtml/sax/SimpleHTMLScanner.java b/src/main/java/org/codelibs/nekohtml/sax/SimpleHTMLScanner.java index a61caa2..dca976f 100644 --- a/src/main/java/org/codelibs/nekohtml/sax/SimpleHTMLScanner.java +++ b/src/main/java/org/codelibs/nekohtml/sax/SimpleHTMLScanner.java @@ -26,6 +26,7 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; +import org.codelibs.nekohtml.HTMLEntities; import org.xml.sax.Attributes; import org.xml.sax.ContentHandler; import org.xml.sax.DTDHandler; @@ -353,11 +354,12 @@ protected void parseHTML(final String html) throws SAXException { // Text content final int nextTag = html.indexOf('<', pos); final int endPos = nextTag >= 0 ? nextTag : length; - final String text = html.substring(pos, endPos); + final String rawText = html.substring(pos, endPos); // Always emit text content, including whitespace // This preserves spacing between elements for proper text extraction - if (text.length() > 0) { + if (rawText.length() > 0) { + final String text = resolveEntities(rawText); fContentHandler.characters(text.toCharArray(), 0, text.length()); } @@ -398,7 +400,7 @@ protected AttributesImpl parseAttributes(final String attrString) { value = ""; // No value } - attrs.addAttribute("", name, name, "CDATA", value); + attrs.addAttribute("", name, name, "CDATA", resolveEntities(value, true)); } return attrs; @@ -436,6 +438,125 @@ protected String normalizeAttributeName(final String name) { return "upper".equals(fAttributeCase) ? name.toUpperCase() : "lower".equals(fAttributeCase) ? name.toLowerCase() : name; } + // Pattern for HTML character references: &#decimal; or &#xhex; or &name; + // Semicolon is optional to handle common malformed HTML + private static final Pattern ENTITY_PATTERN = Pattern.compile("&(?:#([0-9]+)|#[xX]([0-9a-fA-F]+)|([a-zA-Z][a-zA-Z0-9]*));?"); + + /** + * Resolves HTML character entities in text content. + * Semicolon-less named entities are decoded in text context. 
+ * + * @param text The text containing entities + * @return The text with entities resolved to their character equivalents + */ + protected String resolveEntities(final String text) { + return resolveEntities(text, false); + } + + /** + * Resolves HTML character entities in the given text. + * Handles numeric decimal ({@code &#214;}), numeric hex ({@code &#xD6;}), and named ({@code &Ouml;}) entities. + * In attribute context, semicolon-less named entities followed by [A-Za-z0-9=] are not decoded + * per HTML5 attribute value state rules, preventing corruption of URLs like {@code &not=}, {@code &copy=}. + * + * @param text The text containing entities + * @param inAttribute Whether this text is an attribute value + * @return The text with entities resolved to their character equivalents + */ + protected String resolveEntities(final String text, final boolean inAttribute) { + if (text == null || text.indexOf('&') < 0) { + return text; + } + + final Matcher m = ENTITY_PATTERN.matcher(text); + final StringBuilder sb = new StringBuilder(text.length()); + int lastEnd = 0; + + while (m.find()) { + sb.append(text, lastEnd, m.start()); + + if (m.group(1) != null) { + // Numeric decimal: &#214; (digits that overflow int are far above U+10FFFF, so emit U+FFFD) + try { + final int codePoint = Integer.parseInt(m.group(1)); + sb.append(resolveCodePoint(codePoint, m.group(0))); + } catch (final NumberFormatException e) { + sb.append("\uFFFD"); + } + } else if (m.group(2) != null) { + // Numeric hex: &#xD6; (digits that overflow int are far above U+10FFFF, so emit U+FFFD) + try { + final int codePoint = Integer.parseInt(m.group(2), 16); + sb.append(resolveCodePoint(codePoint, m.group(0))); + } catch (final NumberFormatException e) { + sb.append("\uFFFD"); + } + } else if (m.group(3) != null) { + // Named entity: &Ouml; + final String matched = m.group(0); + final boolean hasSemicolon = matched.endsWith(";"); + + // HTML5 attribute value state: if no semicolon and next char is [A-Za-z0-9=], + // do not decode (prevents corruption of URLs like &not=2, &copy=, &reg=) + if (inAttribute && !hasSemicolon) { + final int afterEnd = m.end(); + if (afterEnd < text.length()) { + final char nextChar = 
text.charAt(afterEnd); + if (nextChar == '=' || (nextChar < 0x80 && Character.isLetterOrDigit(nextChar))) { + sb.append(matched); + lastEnd = m.end(); + continue; + } + } + } + + final int c = HTMLEntities.get(m.group(3)); + if (c != -1) { + sb.appendCodePoint(c); + } else { + sb.append(matched); + } + } + + lastEnd = m.end(); + } + + sb.append(text, lastEnd, text.length()); + return sb.toString(); + } + + /** + * Validates a numeric code point and returns the resolved character or replacement. + * Invalid code points (null char, surrogates, out of range, XML-illegal) are replaced with U+FFFD. + */ + private static String resolveCodePoint(final int codePoint, final String original) { + if (codePoint == 0) { + // Null character: replace with U+FFFD per HTML5 spec + return "\uFFFD"; + } + if (codePoint >= 0xD800 && codePoint <= 0xDFFF) { + // Surrogate range: invalid Unicode scalar value + return "\uFFFD"; + } + if (codePoint > 0x10FFFF) { + // Out of Unicode range + return "\uFFFD"; + } + // XML 1.0 illegal characters (except tab, newline, carriage return) + if (codePoint < 0x20 && codePoint != 0x9 && codePoint != 0xA && codePoint != 0xD) { + return "\uFFFD"; + } + if (codePoint >= 0xFDD0 && codePoint <= 0xFDEF) { + // Unicode noncharacters + return "\uFFFD"; + } + if ((codePoint & 0xFFFE) == 0xFFFE) { + // U+xFFFE and U+xFFFF are noncharacters + return "\uFFFD"; + } + return new String(Character.toChars(codePoint)); + } + @Override public boolean getFeature(final String name) throws SAXNotRecognizedException, SAXNotSupportedException { throw new SAXNotRecognizedException("Feature not recognized: " + name); diff --git a/src/test/java/org/codelibs/nekohtml/EncodingEdgeCasesTest.java b/src/test/java/org/codelibs/nekohtml/EncodingEdgeCasesTest.java index 13d549f..be2fb92 100644 --- a/src/test/java/org/codelibs/nekohtml/EncodingEdgeCasesTest.java +++ b/src/test/java/org/codelibs/nekohtml/EncodingEdgeCasesTest.java @@ -125,11 +125,9 @@ public void testUTF16LEBOM() throws Exception { @Test 
public void testZeroWidthCharacters() throws Exception { // Given: HTML with zero-width characters - final String html = "" - + "Zero\u200BWidth\u200BSpace " - + "Zero\u200CWidth\u200CNon\u200CJoiner " - + "Zero\u200DWidth\u200DJoiner" - + ""; + final String html = + "" + "Zero\u200BWidth\u200BSpace " + "Zero\u200CWidth\u200CNon\u200CJoiner " + "Zero\u200DWidth\u200DJoiner" + + ""; // When: Parsing final Document doc = parseHTML(html); @@ -145,10 +143,8 @@ public void testZeroWidthCharacters() throws Exception { @Test public void testRightToLeftMarks() throws Exception { // Given: HTML with RTL and LTR marks - final String html = "" - + "Text\u200Ewith\u200ELTR\u200Emarks " - + "Text\u200Fwith\u200FRTL\u200Fmarks" - + ""; + final String html = + "" + "Text\u200Ewith\u200ELTR\u200Emarks " + "Text\u200Fwith\u200FRTL\u200Fmarks" + ""; // When: Parsing final Document doc = parseHTML(html); @@ -163,8 +159,7 @@ public void testRightToLeftMarks() throws Exception { @Test public void testCombiningCharacters() throws Exception { // Given: HTML with combining diacritics - final String html = "" - + "e\u0301 " // é (e + combining acute) + final String html = "" + "e\u0301 " // é (e + combining acute) + "n\u0303 " // ñ (n + combining tilde) + "a\u0308 " // ä (a + combining diaeresis) + ""; @@ -183,11 +178,7 @@ public void testCombiningCharacters() throws Exception { @Test public void testEmojiAndSupplementaryCharacters() throws Exception { // Given: HTML with emoji (supplementary characters) - final String html = "" - + "😀😁😂🤣😃😄😅😆😉😊 " - + "👍👎👏🙌🎉🎊🎈 " - + "🌟⭐✨💫" - + ""; + final String html = "" + "😀😁😂🤣😃😄😅😆😉😊 " + "👍👎👏🙌🎉🎊🎈 " + "🌟⭐✨💫" + ""; // When: Parsing final Document doc = parseHTML(html); @@ -203,8 +194,7 @@ public void testEmojiAndSupplementaryCharacters() throws Exception { @Test public void testSurrogatePairs() throws Exception { // Given: HTML with characters requiring surrogate pairs - final String html = "" - + "\uD834\uDD1E" // Musical symbol G clef (U+1D11E) + final 
String html = "" + "\uD834\uDD1E" // Musical symbol G clef (U+1D11E) + "\uD835\uDC00" // Mathematical bold capital A (U+1D400) + "\uD83D\uDE00" // Grinning face emoji (U+1F600) + ""; @@ -221,9 +211,7 @@ public void testSurrogatePairs() throws Exception { @Test public void testControlCharacters() throws Exception { // Given: HTML with control characters (allowed ones) - final String html = "" - + "Tab:\t Newline:\n CarriageReturn:\r" - + ""; + final String html = "" + "Tab:\t Newline:\n CarriageReturn:\r" + ""; // When: Parsing final Document doc = parseHTML(html); @@ -237,15 +225,10 @@ public void testControlCharacters() throws Exception { @Test public void testMultilingualContent() throws Exception { // Given: HTML with multiple languages - final String html = "" - + "

日本語のテキスト

" - + "

中文文本

" - + "

한국어 텍스트

" - + "

نص عربي

" - + "

Русский текст

" - + "

Ελληνικό κείμενο

" - + "

טקסט עברי

" - + ""; + final String html = + "" + "

日本語のテキスト

" + "

中文文本

" + "

한국어 텍스트

" + + "

نص عربي

" + "

Русский текст

" + "

Ελληνικό κείμενο

" + + "

טקסט עברי

" + ""; // When: Parsing final Document doc = parseHTML(html); @@ -275,10 +258,7 @@ public void testMultilingualContent() throws Exception { @Test public void testMalformedEntities() throws Exception { // Given: HTML with malformed entities - final String html = "" - + "&invalid; " - + "¬anentity; " - + "< " // missing semicolon + final String html = "" + "&invalid; " + "¬anentity; " + "< " // missing semicolon + "> " // missing semicolon + "&" // missing semicolon + ""; @@ -295,8 +275,7 @@ public void testMalformedEntities() throws Exception { @Test public void testNumericEntitiesOutOfRange() throws Exception { // Given: HTML with out-of-range numeric entities - final String html = "" - + "� " // way out of range + final String html = "" + "� " // way out of range + "� " // huge decimal + "� " // just beyond Unicode range + ""; @@ -325,11 +304,10 @@ public void testIncompleteEntityAtEOF() throws Exception { @Test public void testEntitiesInAttributeValues() throws Exception { // Given: HTML with entities in attribute values - final String html = "" - + "
Content
" - + "Link" - + "" - + ""; + final String html = + "" + "
Content
" + + "Link" + "" + + ""; // When: Parsing final Document doc = parseHTML(html); @@ -355,15 +333,10 @@ public void testEntitiesInAttributeValues() throws Exception { @Test public void testAllCommonHTMLEntities() throws Exception { // Given: HTML with all common entities - final String html = "" - + "  < > & " ' " - + "© ® ™ " - + "€ £ ¥ ¢ " - + "— – … " - + "« » “ ” ‘ ’ " - + "° ± × ÷ " - + "¶ § † ‡ " - + ""; + final String html = + "" + "  < > & " ' " + "© ® ™ " + "€ £ ¥ ¢ " + + "— – … " + "« » “ ” ‘ ’ " + + "° ± × ÷ " + "¶ § † ‡ " + ""; // When: Parsing final Document doc = parseHTML(html); @@ -381,8 +354,7 @@ public void testAllCommonHTMLEntities() throws Exception { @Test public void testNumericCharacterReferences() throws Exception { // Given: HTML with numeric character references - final String html = "" - + "A " // A + final String html = "" + "A " // A + "A " // A (hex) + "© " // © + "© " // © (hex) @@ -405,9 +377,7 @@ public void testNumericCharacterReferences() throws Exception { @Test public void testEntitiesWithoutSemicolon() throws Exception { // Given: HTML with entities without semicolons (legacy) - final String html = "" - + "< > & © ®" - + ""; + final String html = "" + "< > & © ®" + ""; // When: Parsing final Document doc = parseHTML(html); @@ -424,11 +394,10 @@ public void testEntitiesWithoutSemicolon() throws Exception { @Test public void testMultipleMetaCharsetDeclarations() throws Exception { // Given: HTML with multiple conflicting charset declarations - final String html = "" - + "" - + "" - + "" - + "Content"; + final String html = + "" + "" + "" + + "" + + "Content"; // When: Parsing final Document doc = parseHTML(html); @@ -442,11 +411,9 @@ public void testMultipleMetaCharsetDeclarations() throws Exception { @Test public void testMetaCharsetVariations() throws Exception { // Given: HTML with various meta charset formats - final String html = "" - + "" - + "" - + "" - + "Test"; + final String html = + "" + "" + "" + + "" + "Test"; // 
When: Parsing final Document doc = parseHTML(html); @@ -460,8 +427,9 @@ public void testMetaCharsetVariations() throws Exception { @Test public void testXMLDeclarationVsMetaCharset() throws Exception { // Given: HTML with both XML declaration and meta charset - final String html = "" - + "Content"; + final String html = + "" + + "Content"; // When: Parsing final Document doc = parseHTML(html); @@ -478,14 +446,9 @@ public void testXMLDeclarationVsMetaCharset() throws Exception { @Test public void testNonBreakingSpaces() throws Exception { // Given: HTML with various types of spaces - final String html = "" - + "Regular space " - + "Non-breaking space " - + "En\u2002space " - + "Em\u2003space " - + "Thin\u2009space " - + "Hair\u200Aspace" - + ""; + final String html = + "" + "Regular space " + "Non-breaking space " + "En\u2002space " + "Em\u2003space " + "Thin\u2009space " + + "Hair\u200Aspace" + ""; // When: Parsing final Document doc = parseHTML(html); @@ -504,12 +467,9 @@ public void testNonBreakingSpaces() throws Exception { @Test public void testSpecialPunctuationCharacters() throws Exception { // Given: HTML with special punctuation - final String html = "" - + "Quotes: \u201C\u201D \u2018\u2019 " - + "Dashes: \u2013 \u2014 " - + "Ellipsis: \u2026 " - + "Bullet: \u2022 " - + ""; + final String html = + "" + "Quotes: \u201C\u201D \u2018\u2019 " + "Dashes: \u2013 \u2014 " + "Ellipsis: \u2026 " + "Bullet: \u2022 " + + ""; // When: Parsing final Document doc = parseHTML(html); @@ -526,9 +486,7 @@ public void testSpecialPunctuationCharacters() throws Exception { @Test public void testMathematicalSymbols() throws Exception { // Given: HTML with mathematical symbols - final String html = "" - + "∞ ≠ ≤ ≥ ± × ÷ √ ∑ ∏ ∫ ∂ ∇" - + ""; + final String html = "" + "∞ ≠ ≤ ≥ ± × ÷ √ ∑ ∏ ∫ ∂ ∇" + ""; // When: Parsing final Document doc = parseHTML(html); @@ -544,9 +502,7 @@ public void testMathematicalSymbols() throws Exception { @Test public void testCurrencySymbols() throws 
Exception { // Given: HTML with various currency symbols - final String html = "" - + "$ € £ ¥ ₹ ₽ ₩ ¢ ฿" - + ""; + final String html = "" + "$ € £ ¥ ₹ ₽ ₩ ¢ ฿" + ""; // When: Parsing final Document doc = parseHTML(html); @@ -562,11 +518,9 @@ public void testCurrencySymbols() throws Exception { @Test public void testMixedDirectionalText() throws Exception { // Given: HTML with mixed LTR and RTL text - final String html = "" - + "

Left-to-right text with עברית embedded

" - + "

טקסט מימין לשמאל with English embedded

" - + "

Mixed: Hello שלום مرحبا

" - + ""; + final String html = + "" + "

Left-to-right text with עברית embedded

" + + "

טקסט מימין לשמאל with English embedded

" + "

Mixed: Hello שלום مرحبا

" + ""; // When: Parsing final Document doc = parseHTML(html); diff --git a/src/test/java/org/codelibs/nekohtml/HTML5SemanticElementsIntegrationTest.java b/src/test/java/org/codelibs/nekohtml/HTML5SemanticElementsIntegrationTest.java index e87b42d..960cad5 100644 --- a/src/test/java/org/codelibs/nekohtml/HTML5SemanticElementsIntegrationTest.java +++ b/src/test/java/org/codelibs/nekohtml/HTML5SemanticElementsIntegrationTest.java @@ -91,7 +91,8 @@ public void testSearchElementInHeader() throws Exception { @Test public void testSearchElementInAside() throws Exception { // Given: HTML with SEARCH in ASIDE (common pattern for sidebar search) - final String html = "
Main content
"; + final String html = + "
Main content
"; // When: Parsing final Document doc = parseHTML(html); @@ -108,10 +109,10 @@ public void testSearchElementInAside() throws Exception { @Test public void testMultipleSearchElements() throws Exception { // Given: HTML with multiple SEARCH elements - final String html = "" - + "
" - + "
" - + ""; + final String html = + "" + "
" + + "
" + + ""; // When: Parsing final Document doc = parseHTML(html); @@ -124,14 +125,10 @@ public void testMultipleSearchElements() throws Exception { @Test public void testSearchWithAutocompleteAndDatalist() throws Exception { // Given: HTML with SEARCH containing autocomplete and datalist - final String html = "" - + "" - + "" - + "" - + ""; + final String html = + "" + "" + + "" + "" + ""; // When: Parsing final Document doc = parseHTML(html); @@ -155,11 +152,9 @@ public void testSearchWithAutocompleteAndDatalist() throws Exception { @Test public void testSearchClosingWithAdjacentBlockElements() throws Exception { // Given: HTML with SEARCH followed by other block elements - final String html = "" - + "" - + "" - + "
Content
" - + ""; + final String html = + "" + "" + "" + + "
Content
" + ""; // When: Parsing final Document doc = parseHTML(html); @@ -239,11 +234,10 @@ public void testSlotWithFallbackContent() throws Exception { @Test public void testMultipleSlotsInTemplate() throws Exception { // Given: HTML with multiple SLOTs in TEMPLATE - final String html = ""; + final String html = + ""; // When: Parsing final Document doc = parseHTML(html); @@ -265,7 +259,8 @@ public void testMultipleSlotsInTemplate() throws Exception { @Test public void testNestedSlotElements() throws Exception { // Given: HTML with nested SLOTs (edge case) - final String html = ""; + final String html = + ""; // When: Parsing final Document doc = parseHTML(html); @@ -302,14 +297,9 @@ public void testHgroupWithMultipleHeadings() throws Exception { @Test public void testHgroupWithAllHeadingLevels() throws Exception { // Given: HTML with HGROUP containing H1-H6 - final String html = "
" - + "

Level 1

" - + "

Level 2

" - + "

Level 3

" - + "

Level 4

" - + "
Level 5
" - + "
Level 6
" - + "
"; + final String html = + "
" + "

Level 1

" + "

Level 2

" + "

Level 3

" + "

Level 4

" + + "
Level 5
" + "
Level 6
" + "
"; // When: Parsing final Document doc = parseHTML(html); @@ -362,7 +352,8 @@ public void testHgroupInHeader() throws Exception { @Test public void testHgroupInArticle() throws Exception { // Given: HTML with HGROUP in ARTICLE - final String html = "

Article Title

Author Name

Article content

"; + final String html = + "

Article Title

Author Name

Article content

"; // When: Parsing final Document doc = parseHTML(html); @@ -379,7 +370,8 @@ public void testHgroupInArticle() throws Exception { @Test public void testHgroupInSection() throws Exception { // Given: HTML with HGROUP in SECTION - final String html = "

Section Title

Section Subtitle

Content

"; + final String html = + "

Section Title

Section Subtitle

Content

"; // When: Parsing final Document doc = parseHTML(html); @@ -396,10 +388,10 @@ public void testHgroupInSection() throws Exception { @Test public void testMultipleHgroups() throws Exception { // Given: HTML with multiple HGROUPs - final String html = "" - + "

First Article

First Subtitle

Content 1

" - + "

Second Article

Second Subtitle

Content 2

" - + ""; + final String html = + "" + "

First Article

First Subtitle

Content 1

" + + "

Second Article

Second Subtitle

Content 2

" + + ""; // When: Parsing final Document doc = parseHTML(html); @@ -437,25 +429,13 @@ public void testHgroupWithBlockElement() throws Exception { @Test public void testComplexSemanticStructureWithNewElements() throws Exception { // Given: Complex HTML using SEARCH, SLOT, HGROUP together - final String html = "" - + "
" - + "

Website Title

Tagline

" - + "" - + "
" - + "
" - + "
" - + "

Article Title

Article Subtitle

" - + "

Article content here

" - + "
" - + "" - + "
" - + "" - + ""; + final String html = + "" + "
" + "

Website Title

Tagline

" + + "" + "
" + "
" + "
" + + "

Article Title

Article Subtitle

" + "

Article content here

" + "
" + + "" + + "
" + "" + ""; // When: Parsing final Document doc = parseHTML(html); @@ -477,7 +457,8 @@ public void testComplexSemanticStructureWithNewElements() throws Exception { @Test public void testSearchInMain() throws Exception { // Given: SEARCH in MAIN element - final String html = "
"; + final String html = + "
"; // When: Parsing final Document doc = parseHTML(html); @@ -494,7 +475,8 @@ public void testSearchInMain() throws Exception { @Test public void testHgroupInFooter() throws Exception { // Given: HGROUP in FOOTER (less common but valid) - final String html = ""; + final String html = + ""; // When: Parsing final Document doc = parseHTML(html); @@ -511,15 +493,12 @@ public void testHgroupInFooter() throws Exception { @Test public void testSearchWithComplexFormElements() throws Exception { // Given: SEARCH with complex form containing multiple inputs - final String html = "
" - + "
" - + "Advanced Search" - + "" - + "" - + "" - + "" - + "
" - + "
"; + final String html = + "
" + "
" + "Advanced Search" + + "" + + "" + + "" + "" + + "
" + "
"; // When: Parsing final Document doc = parseHTML(html); @@ -539,7 +518,8 @@ public void testSearchWithComplexFormElements() throws Exception { @Test public void testSlotInCustomElement() throws Exception { // Given: SLOT in custom element context - final String html = ""; + final String html = + ""; // When: Parsing final Document doc = parseHTML(html); diff --git a/src/test/java/org/codelibs/nekohtml/PerformanceStressTest.java b/src/test/java/org/codelibs/nekohtml/PerformanceStressTest.java index 487d4dd..ffa41a0 100644 --- a/src/test/java/org/codelibs/nekohtml/PerformanceStressTest.java +++ b/src/test/java/org/codelibs/nekohtml/PerformanceStressTest.java @@ -439,15 +439,10 @@ public void testComplexNestedStructure() throws Exception { // Given: Complex realistic document structure final StringBuilder html = new StringBuilder("Test"); for (int i = 0; i < 100; i++) { - html.append("
") - .append("

Title ").append(i).append("

") - .append("
") - .append("

Paragraph 1

") - .append("

Paragraph 2

") - .append("") - .append("
") - .append("") - .append("
"); + html.append("
").append("

Title ").append(i).append("

").append("
") + .append("

Paragraph 1

").append("

Paragraph 2

") + .append("").append("
") + .append("").append("
"); } html.append(""); diff --git a/src/test/java/org/codelibs/nekohtml/ThreadSafetyTest.java b/src/test/java/org/codelibs/nekohtml/ThreadSafetyTest.java index b089f35..405f3ae 100644 --- a/src/test/java/org/codelibs/nekohtml/ThreadSafetyTest.java +++ b/src/test/java/org/codelibs/nekohtml/ThreadSafetyTest.java @@ -44,11 +44,8 @@ public class ThreadSafetyTest { private static final String SIMPLE_HTML = "
Content
"; - private static final String COMPLEX_HTML = "Test" - + "

Title

" - + "

Paragraph 1

Paragraph 2

" - + "
" - + ""; + private static final String COMPLEX_HTML = "Test" + "

Title

" + + "

Paragraph 1

Paragraph 2

" + "
" + ""; // ======================================================================== // Concurrent Parsing with Separate Parser Instances @@ -104,8 +101,8 @@ public void testConcurrentParsingWithSeparateParsers100Threads() throws Exceptio public Boolean call() throws Exception { try { final DOMParser parser = new DOMParser(); - final String html = "

Thread " + threadId + "

" - + "

Paragraph " + threadId + "

"; + final String html = + "

Thread " + threadId + "

" + "

Paragraph " + threadId + "

"; parser.parse(new InputSource(new StringReader(html))); final Document doc = parser.getDocument(); @@ -137,10 +134,10 @@ public void testConcurrentParsingDifferentDocumentTypes() throws Exception { final ExecutorService executor = Executors.newFixedThreadPool(10); final List> futures = new ArrayList<>(); - final String[] htmlTypes = new String[] { SIMPLE_HTML, COMPLEX_HTML, - "
Table
", - "
", - "" }; + final String[] htmlTypes = + new String[] { SIMPLE_HTML, COMPLEX_HTML, "
Table
", + "
", + "" }; // When: Threads parse different document types for (int i = 0; i < threadCount; i++) { @@ -311,9 +308,10 @@ public void testConcurrentParsingWithMalformedHTML() throws Exception { final AtomicInteger successCount = new AtomicInteger(0); final List> futures = new ArrayList<>(); - final String[] malformedHTML = new String[] { "
Unclosed div", "", - "
No TR
", "

Paragraph

Block
continues

", - "
    Wrong nesting
" }; + final String[] malformedHTML = + new String[] { "
Unclosed div", "", + "
No TR
", + "

Paragraph

Block
continues

", "
    Wrong nesting
" }; // When: Threads parse malformed HTML for (int i = 0; i < threadCount; i++) { diff --git a/src/test/java/org/codelibs/nekohtml/parsers/AttributeEdgeCasesTest.java b/src/test/java/org/codelibs/nekohtml/parsers/AttributeEdgeCasesTest.java index 5398821..9d6be7f 100644 --- a/src/test/java/org/codelibs/nekohtml/parsers/AttributeEdgeCasesTest.java +++ b/src/test/java/org/codelibs/nekohtml/parsers/AttributeEdgeCasesTest.java @@ -89,7 +89,8 @@ public void testDuplicateAttributeNames() throws Exception { @Test public void testAttributesWithColons() throws Exception { // Given: Attributes with colons (namespace-like) - final String html = "
Content
"; + final String html = + "
Content
"; // When: Parsing final Document doc = parseHTML(html); @@ -317,13 +318,9 @@ public void testAttributeValueWithBackslashes() throws Exception { @Test public void testBooleanAttributesWithoutValues() throws Exception { // Given: Boolean attributes without values - final String html = "" - + "" - + "" - + "" - + "" - + "" - + ""; + final String html = + "" + "" + "" + "" + + "" + "" + ""; // When: Parsing final Document doc = parseHTML(html); @@ -337,12 +334,10 @@ public void testBooleanAttributesWithoutValues() throws Exception { @Test public void testBooleanAttributesWithValues() throws Exception { // Given: Boolean attributes with values (valid in HTML) - final String html = "" - + "" - + "" - + "" - + "" - + ""; + final String html = + "" + "" + "" + + "" + "" + + ""; // When: Parsing final Document doc = parseHTML(html); @@ -356,11 +351,9 @@ public void testBooleanAttributesWithValues() throws Exception { @Test public void testBooleanAttributesWithInvalidValues() throws Exception { // Given: Boolean attributes with invalid values - final String html = "" - + "" - + "" - + "" - + ""; + final String html = + "" + "" + "" + + "" + ""; // When: Parsing final Document doc = parseHTML(html); @@ -374,18 +367,11 @@ public void testBooleanAttributesWithInvalidValues() throws Exception { @Test public void testAllBooleanHTMLAttributes() throws Exception { // Given: All common boolean HTML attributes - final String html = "" - + "" - + "" - + "" - + "
    List
" - + "
Details
" - + "" - + "" - + "" - + "" - + "Group" - + ""; + final String html = + "" + "" + "" + + "" + "
    List
" + + "
Details
" + "" + "" + + "" + "" + "Group" + ""; // When: Parsing final Document doc = parseHTML(html); @@ -434,11 +420,8 @@ public void testMultipleSpacesBetweenAttributes() throws Exception { @Test public void testAttributesWithNewlinesBetweenThem() throws Exception { // Given: Attributes on multiple lines - final String html = "Content
"; + final String html = + "Content
"; // When: Parsing final Document doc = parseHTML(html); @@ -474,9 +457,10 @@ public void testManyAttributesOnSingleElement() throws Exception { @Test public void testDataAttributes() throws Exception { // Given: Various data attributes - final String html = "" - + "
Content
" - + ""; + final String html = + "" + + "
Content
" + + ""; // When: Parsing final Document doc = parseHTML(html); @@ -493,9 +477,10 @@ public void testDataAttributes() throws Exception { @Test public void testAriaAttributes() throws Exception { // Given: ARIA attributes - final String html = "" - + "
Button
" - + ""; + final String html = + "" + + "
Button
" + + ""; // When: Parsing final Document doc = parseHTML(html); @@ -512,9 +497,10 @@ public void testAriaAttributes() throws Exception { @Test public void testEventHandlerAttributes() throws Exception { // Given: Event handler attributes - final String html = "" - + "" - + ""; + final String html = + "" + + "" + + ""; // When: Parsing final Document doc = parseHTML(html); @@ -530,9 +516,10 @@ public void testEventHandlerAttributes() throws Exception { @Test public void testStyleAttribute() throws Exception { // Given: Style attribute with complex CSS - final String html = "" - + "
Content
" - + ""; + final String html = + "" + + "
Content
" + + ""; // When: Parsing final Document doc = parseHTML(html); @@ -565,11 +552,10 @@ public void testClassAttributeWithMultipleValues() throws Exception { @Test public void testAttributeWithURL() throws Exception { // Given: Attributes with URLs - final String html = "" - + "Link" - + "\"Photo\"" - + "" - + ""; + final String html = + "" + "Link" + + "\"Photo\"" + "" + + ""; // When: Parsing final Document doc = parseHTML(html); diff --git a/src/test/java/org/codelibs/nekohtml/parsers/BrowserQuirksIntegrationTest.java b/src/test/java/org/codelibs/nekohtml/parsers/BrowserQuirksIntegrationTest.java index 4d8de62..117008d 100644 --- a/src/test/java/org/codelibs/nekohtml/parsers/BrowserQuirksIntegrationTest.java +++ b/src/test/java/org/codelibs/nekohtml/parsers/BrowserQuirksIntegrationTest.java @@ -123,8 +123,9 @@ public void testImpliedColgroup() throws Exception { @Test public void testTableWithMissingCloseTags() throws Exception { // Given: Table with missing close tags (common in legacy HTML) - final String html = "" + "
Row 1, Cell 1Row 1, Cell 2" + "
Row 2, Cell 1Row 2, Cell 2" - + "
"; + final String html = + "" + "
Row 1, Cell 1Row 1, Cell 2" + "
Row 2, Cell 1Row 2, Cell 2" + + "
"; // When: Parsing final Document doc = parseHTML(html); @@ -138,8 +139,9 @@ public void testTableWithMissingCloseTags() throws Exception { @Test public void testNestedTables() throws Exception { // Given: Nested tables - final String html = "" + "
" + "
Inner cell
" + "
" - + ""; + final String html = + "" + "
" + "
Inner cell
" + "
" + + ""; // When: Parsing final Document doc = parseHTML(html); @@ -151,7 +153,8 @@ public void testNestedTables() throws Exception { @Test public void testTableWithCaptionAfterRows() throws Exception { // Given: Table with CAPTION after TR (invalid but common) - final String html = "" + "" + "" + "
Cell
Table Caption
"; + final String html = + "" + "" + "" + "
Cell
Table Caption
"; // When: Parsing final Document doc = parseHTML(html); @@ -168,8 +171,9 @@ public void testTableWithCaptionAfterRows() throws Exception { @Test public void testFormWithOrphanedInputs() throws Exception { // Given: Form with inputs outside form tag - final String html = "" + "
" + "" + "
" - + "" + ""; + final String html = + "" + "
" + "" + "
" + + "" + ""; // When: Parsing final Document doc = parseHTML(html); @@ -182,8 +186,9 @@ public void testFormWithOrphanedInputs() throws Exception { @Test public void testNestedForms() throws Exception { // Given: Nested forms (invalid HTML but may appear) - final String html = "" + "
" + "" + "" + "
" - + "" + ""; + final String html = + "" + "
" + "" + "" + "
" + "" + + ""; // When: Parsing final Document doc = parseHTML(html); @@ -196,8 +201,8 @@ public void testNestedForms() throws Exception { @Test public void testFormWithSelectWithoutClosingOption() throws Exception { // Given: SELECT with unclosed OPTION tags - final String html = "" + "" - + ""; + final String html = + "" + "" + ""; // When: Parsing final Document doc = parseHTML(html); @@ -227,8 +232,9 @@ public void testUnclosedListItems() throws Exception { @Test public void testNestedListsWithUnclosedItems() throws Exception { // Given: Nested lists with unclosed LI - final String html = "" + "" + ""; + final String html = + "" + "" + + ""; // When: Parsing final Document doc = parseHTML(html); @@ -241,8 +247,9 @@ public void testNestedListsWithUnclosedItems() throws Exception { @Test public void testDefinitionListQuirks() throws Exception { // Given: DL with unclosed DT/DD - final String html = "" + "
" + "
Term 1" + "
Definition 1" + "
Term 2" + "
Definition 2" + "
" - + ""; + final String html = + "" + "
" + "
Term 1" + "
Definition 1" + "
Term 2" + "
Definition 2" + "
" + + ""; // When: Parsing final Document doc = parseHTML(html); @@ -333,8 +340,8 @@ public void testBlockInsideInline() throws Exception { @Test public void testScriptInBody() throws Exception { // Given: Script in body (valid but tested for quirks) - final String html = "" + "

Before script

" + "" + "

After script

" - + ""; + final String html = + "" + "

Before script

" + "" + "

After script

" + ""; // When: Parsing final Document doc = parseHTML(html); @@ -431,8 +438,9 @@ public void testHTML5Doctype() throws Exception { @Test public void testHTML4StrictDoctype() throws Exception { // Given: HTML 4.01 Strict DOCTYPE - final String html = "" + "

HTML 4.01

"; + final String html = + "" + + "

HTML 4.01

"; // When: Parsing final Document doc = parseHTML(html); @@ -444,8 +452,9 @@ public void testHTML4StrictDoctype() throws Exception { @Test public void testXHTMLDoctype() throws Exception { // Given: XHTML DOCTYPE - final String html = "" + "

XHTML

"; + final String html = + "" + + "

XHTML

"; // When: Parsing final Document doc = parseHTML(html); @@ -486,7 +495,8 @@ public void testMetaCharsetHTML5() throws Exception { @Test public void testMetaContentType() throws Exception { // Given: Legacy content-type meta - final String html = "Test"; + final String html = + "Test"; // When: Parsing final Document doc = parseHTML(html); @@ -516,7 +526,8 @@ public void testVoidElementsWithClosingTags() throws Exception { @Test public void testVoidElementsWithSlash() throws Exception { // Given: Void elements with trailing slash (XHTML style) - final String html = "" + "
" + "
" + "" + "" + ""; + final String html = + "" + "
" + "
" + "" + "" + ""; // When: Parsing final Document doc = parseHTML(html); @@ -533,12 +544,13 @@ public void testVoidElementsWithSlash() throws Exception { @Test public void testTypicalWebPageStructure() throws Exception { // Given: Typical web page structure - final String html = "" + "" + "" + "" - + "" + "Test Page" - + "" + "" + "" + "" + "
" - + "" + "
" + "
" + "
" - + "

Article Title

" + "

Article content.

" + "
" + "
" + "
" - + "

© 2024

" + "
" + "" + ""; + final String html = + "" + "" + "" + "" + + "" + "Test Page" + + "" + "" + "" + "" + + "
" + "" + "
" + "
" + "
" + + "

Article Title

" + "

Article content.

" + "
" + "
" + "
" + + "

© 2024

" + "
" + "" + ""; // When: Parsing final Document doc = parseHTML(html); @@ -553,11 +565,11 @@ public void testTypicalWebPageStructure() throws Exception { @Test public void testTypicalFormStructure() throws Exception { // Given: Typical form structure - final String html = "" + "
" + "
" - + "User Information" + "" - + "" + "" - + "" + "
" + "" - + "
" + ""; + final String html = + "" + "
" + "
" + "User Information" + + "" + "" + + "" + "" + "
" + + "" + "
" + ""; // When: Parsing final Document doc = parseHTML(html); diff --git a/src/test/java/org/codelibs/nekohtml/parsers/ComplexTableStructuresTest.java b/src/test/java/org/codelibs/nekohtml/parsers/ComplexTableStructuresTest.java index fb5ca19..dcfbb71 100644 --- a/src/test/java/org/codelibs/nekohtml/parsers/ComplexTableStructuresTest.java +++ b/src/test/java/org/codelibs/nekohtml/parsers/ComplexTableStructuresTest.java @@ -54,11 +54,9 @@ private Document parseHTML(final String html) throws Exception { @Test public void testTableWithAllSections() throws Exception { // Given: Table with THEAD, TBODY, TFOOT - final String html = "" - + "" - + "" - + "" - + "
Header
Body
Footer
"; + final String html = + "" + "" + "" + + "" + "
Header
Body
Footer
"; // When: Parsing final Document doc = parseHTML(html); @@ -74,11 +72,9 @@ public void testTableWithAllSections() throws Exception { @Test public void testTableWithMultipleTbody() throws Exception { // Given: Table with multiple TBODY elements (valid HTML5) - final String html = "" - + "" - + "" - + "" - + "
Group 1 Row 1
Group 2 Row 1
Group 3 Row 1
"; + final String html = + "" + "" + "" + + "" + "
Group 1 Row 1
Group 2 Row 1
Group 3 Row 1
"; // When: Parsing final Document doc = parseHTML(html); @@ -92,10 +88,8 @@ public void testTableWithMultipleTbody() throws Exception { @Test public void testTableWithCaption() throws Exception { // Given: Table with CAPTION - final String html = "" - + "" - + "" - + "
Table Caption
Cell
"; + final String html = + "" + "" + "" + "
Table Caption
Cell
"; // When: Parsing final Document doc = parseHTML(html); @@ -109,14 +103,10 @@ public void testTableWithCaption() throws Exception { @Test public void testTableWithColgroup() throws Exception { // Given: Table with COLGROUP and COL - final String html = "" - + "" - + "" - + "" - + "" - + "" - + "" - + "
Cell 1Cell 2Cell 3
"; + final String html = + "" + "" + "" + "" + + "" + "" + + "" + "
Cell 1Cell 2Cell 3
"; // When: Parsing final Document doc = parseHTML(html); @@ -130,10 +120,9 @@ public void testTableWithColgroup() throws Exception { @Test public void testTableWithColgroupSpan() throws Exception { // Given: COLGROUP with span attribute - final String html = "" - + "" - + "" - + "
Cell 1Cell 2Cell 3
"; + final String html = + "" + "" + + "" + "
Cell 1Cell 2Cell 3
"; // When: Parsing final Document doc = parseHTML(html); @@ -148,19 +137,11 @@ public void testTableWithColgroupSpan() throws Exception { @Test public void testComplexTableWithAllElements() throws Exception { // Given: Complex table with all possible elements - final String html = "" - + "" - + "" - + "" - + "" - + "" - + "" - + "" - + "" - + "" - + "" - + "" - + "
Complete Table
Header 1Header 2
Data 1-1Data 1-2
Data 2-1Data 2-2
Footer 1Footer 2
"; + final String html = + "" + "" + "" + "" + + "" + "" + "" + + "" + "" + "" + + "" + "" + "
Complete Table
Header 1Header 2
Data 1-1Data 1-2
Data 2-1Data 2-2
Footer 1Footer 2
"; // When: Parsing final Document doc = parseHTML(html); @@ -182,10 +163,7 @@ public void testComplexTableWithAllElements() throws Exception { @Test public void testTableWithTrDirectlyInTable() throws Exception { // Given: TR directly in TABLE (missing TBODY) - final String html = "" - + "" - + "" - + "
Cell 1
Cell 2
"; + final String html = "" + "" + "" + "
Cell 1
Cell 2
"; // When: Parsing final Document doc = parseHTML(html); @@ -200,10 +178,7 @@ public void testTableWithTrDirectlyInTable() throws Exception { @Test public void testTableWithTdDirectlyInTable() throws Exception { // Given: TD directly in TABLE (missing TBODY and TR) - final String html = "" - + "" - + "" - + "
Cell 1Cell 2
"; + final String html = "" + "" + "" + "
Cell 1Cell 2
"; // When: Parsing final Document doc = parseHTML(html); @@ -217,10 +192,9 @@ public void testTableWithTdDirectlyInTable() throws Exception { @Test public void testTableWithTheadAfterTbody() throws Exception { // Given: THEAD after TBODY (incorrect order) - final String html = "" - + "" - + "" - + "
Body
Header
"; + final String html = + "" + "" + "" + + "
Body
Header
"; // When: Parsing final Document doc = parseHTML(html); @@ -234,13 +208,9 @@ public void testTableWithTheadAfterTbody() throws Exception { @Test public void testTableWithMixedElements() throws Exception { // Given: Table with elements in mixed/wrong order - final String html = "" - + "" - + "" - + "" - + "" - + "" - + "
Row 1
Header
Row 2
Footer
Row 3
"; + final String html = + "" + "" + "" + "" + + "" + "" + "
Row 1
Header
Row 2
Footer
Row 3
"; // When: Parsing final Document doc = parseHTML(html); @@ -253,11 +223,9 @@ public void testTableWithMixedElements() throws Exception { @Test public void testTableWithUnclosedRows() throws Exception { // Given: Table with unclosed TR tags - final String html = "" - + "" - + "" - + "" - + "
Cell 1-1Cell 1-2
Cell 2-1Cell 2-2
Cell 3-1Cell 3-2
"; + final String html = + "" + "" + "" + + "" + "
Cell 1-1Cell 1-2
Cell 2-1Cell 2-2
Cell 3-1Cell 3-2
"; // When: Parsing final Document doc = parseHTML(html); @@ -271,9 +239,7 @@ public void testTableWithUnclosedRows() throws Exception { @Test public void testTableWithUnclosedCells() throws Exception { // Given: Table with unclosed TD tags - final String html = "" - + "" - + "
Cell 1Cell 2Cell 3
"; + final String html = "" + "" + "
Cell 1Cell 2Cell 3
"; // When: Parsing final Document doc = parseHTML(html); @@ -290,10 +256,9 @@ public void testTableWithUnclosedCells() throws Exception { @Test public void testTableWithColspan() throws Exception { // Given: Table with COLSPAN - final String html = "" - + "" - + "" - + "
Spans 2 columnsCell 3
Cell 1Cell 2Cell 3
"; + final String html = + "" + "" + + "" + "
Spans 2 columnsCell 3
Cell 1Cell 2Cell 3
"; // When: Parsing final Document doc = parseHTML(html); @@ -310,10 +275,9 @@ public void testTableWithColspan() throws Exception { @Test public void testTableWithRowspan() throws Exception { // Given: Table with ROWSPAN - final String html = "" - + "" - + "" - + "
Spans 2 rowsCell 1-2
Cell 2-2
"; + final String html = + "" + "" + "" + + "
Spans 2 rowsCell 1-2
Cell 2-2
"; // When: Parsing final Document doc = parseHTML(html); @@ -330,11 +294,10 @@ public void testTableWithRowspan() throws Exception { @Test public void testTableWithComplexSpans() throws Exception { // Given: Table with complex COLSPAN and ROWSPAN - final String html = "" - + "" - + "" - + "" - + "
Spans 2x2Cell 1-3
Cell 2-3
Cell 3-1Cell 3-2Cell 3-3
"; + final String html = + "" + "" + + "" + "" + + "
Spans 2x2Cell 1-3
Cell 2-3
Cell 3-1Cell 3-2Cell 3-3
"; // When: Parsing final Document doc = parseHTML(html); @@ -352,10 +315,9 @@ public void testTableWithComplexSpans() throws Exception { @Test public void testTableWithZeroSpan() throws Exception { // Given: Table with colspan/rowspan=0 (special value) - final String html = "" - + "" - + "" - + "
Spans to end
Cell 1Cell 2Cell 3
"; + final String html = + "" + "" + + "" + "
Spans to end
Cell 1Cell 2Cell 3
"; // When: Parsing final Document doc = parseHTML(html); @@ -369,9 +331,7 @@ public void testTableWithZeroSpan() throws Exception { @Test public void testTableWithVeryLargeSpan() throws Exception { // Given: Table with very large span value - final String html = "" - + "" - + "
Large span
"; + final String html = "" + "" + "
Large span
"; // When: Parsing final Document doc = parseHTML(html); @@ -389,10 +349,9 @@ public void testTableWithVeryLargeSpan() throws Exception { @Test public void testNestedTables2Levels() throws Exception { // Given: 2 levels of nested tables - final String html = "" - + "" - + "" - + "
Outer cell 1
Inner cell
"; + final String html = + "" + "" + "" + + "
Outer cell 1
Inner cell
"; // When: Parsing final Document doc = parseHTML(html); @@ -405,15 +364,10 @@ public void testNestedTables2Levels() throws Exception { @Test public void testNestedTables5Levels() throws Exception { // Given: 5 levels of nested tables - final String html = "
" - + "
" - + "
" - + "
" - + "
Deep
" - + "
" - + "
" - + "
" - + "
"; + final String html = + "
" + "
" + "
" + "
" + + "
Deep
" + "
" + "
" + "
" + + "
"; // When: Parsing final Document doc = parseHTML(html); @@ -426,11 +380,10 @@ public void testNestedTables5Levels() throws Exception { @Test public void testNestedTableInTheadTbodyTfoot() throws Exception { // Given: Nested tables in different sections - final String html = "" - + "" - + "" - + "" - + "
Header table
Body table
Footer table
"; + final String html = + "" + "" + + "" + + "" + "
Header table
Body table
Footer table
"; // When: Parsing final Document doc = parseHTML(html); @@ -460,10 +413,9 @@ public void testEmptyTable() throws Exception { @Test public void testTableWithEmptyCells() throws Exception { // Given: Table with empty cells - final String html = "" - + "" - + "" - + "
Content
"; + final String html = + "" + "" + "" + + "
Content
"; // When: Parsing final Document doc = parseHTML(html); @@ -476,10 +428,7 @@ public void testTableWithEmptyCells() throws Exception { @Test public void testTableWithOnlyTheadEmpty() throws Exception { // Given: Table with empty THEAD - final String html = "" - + "" - + "" - + "
Body
"; + final String html = "" + "" + "" + "
Body
"; // When: Parsing final Document doc = parseHTML(html); @@ -496,12 +445,9 @@ public void testTableWithOnlyTheadEmpty() throws Exception { @Test public void testTableWithThInTbody() throws Exception { // Given: TH elements in TBODY (valid for row headers) - final String html = "" - + "" - + "" - + "" - + "" - + "
Row 1 HeaderData 1
Row 2 HeaderData 2
"; + final String html = + "" + "" + "" + + "" + "" + "
Row 1 HeaderData 1
Row 2 HeaderData 2
"; // When: Parsing final Document doc = parseHTML(html); @@ -515,9 +461,8 @@ public void testTableWithThInTbody() throws Exception { @Test public void testTableWithOnlyTh() throws Exception { // Given: Table with only TH elements - final String html = "" - + "" - + "
Header 1Header 2Header 3
"; + final String html = + "" + "" + "
Header 1Header 2Header 3
"; // When: Parsing final Document doc = parseHTML(html); @@ -531,10 +476,9 @@ public void testTableWithOnlyTh() throws Exception { @Test public void testTableWithThAttributes() throws Exception { // Given: TH with scope, headers, and colspan attributes - final String html = "" - + "" - + "" - + "
Header
Data 1Data 2
"; + final String html = + "" + "" + + "" + "
Header
Data 1Data 2
"; // When: Parsing final Document doc = parseHTML(html); @@ -554,17 +498,11 @@ public void testTableWithThAttributes() throws Exception { @Test public void testDataTableWithSorting() throws Exception { // Given: Data table with sorting attributes - final String html = "" - + "" - + "" - + "" - + "" - + "" - + "" - + "" - + "" - + "" - + "
NameAgeDate
John302025-01-01
Jane252025-01-02
"; + final String html = + "" + "" + "" + "" + + "" + "" + "" + + "" + "" + + "" + "
NameAgeDate
John302025-01-01
Jane252025-01-02
"; // When: Parsing final Document doc = parseHTML(html); @@ -581,11 +519,10 @@ public void testDataTableWithSorting() throws Exception { @Test public void testTableWithFormElements() throws Exception { // Given: Table containing form elements - final String html = "" - + "" - + "" - + "" - + "
"; + final String html = + "" + "" + + "" + + "" + "
"; // When: Parsing final Document doc = parseHTML(html); @@ -600,12 +537,10 @@ public void testTableWithFormElements() throws Exception { @Test public void testTableWithComplexContent() throws Exception { // Given: Table with various content types - final String html = "" - + "" - + "" - + "" - + "" - + "
\"Image\"
Link
  • List item

Nested content

"; + final String html = + "" + "" + + "" + "" + + "" + "
\"Image\"
Link
  • List item

Nested content

"; // When: Parsing final Document doc = parseHTML(html); @@ -621,11 +556,9 @@ public void testTableWithComplexContent() throws Exception { @Test public void testTableWithMultipleCaption() throws Exception { // Given: Table with multiple CAPTION elements (invalid but should handle) - final String html = "" - + "" - + "" - + "" - + "
Caption 1Caption 2
Cell
"; + final String html = + "" + "" + "" + "" + + "
Caption 1Caption 2
Cell
"; // When: Parsing final Document doc = parseHTML(html); @@ -638,13 +571,10 @@ public void testTableWithMultipleCaption() throws Exception { @Test public void testTableWithIrregularRows() throws Exception { // Given: Table with rows of different cell counts - final String html = "" - + "" - + "" - + "" - + "" - + "" - + "
Cell 1
Cell 1Cell 2
Cell 1Cell 2Cell 3
Cell 1Cell 2
Cell 1
"; + final String html = + "" + "" + "" + + "" + "" + + "" + "
Cell 1
Cell 1Cell 2
Cell 1Cell 2Cell 3
Cell 1Cell 2
Cell 1
"; // When: Parsing final Document doc = parseHTML(html); diff --git a/src/test/java/org/codelibs/nekohtml/parsers/StrictModeTest.java b/src/test/java/org/codelibs/nekohtml/parsers/StrictModeTest.java index 9c8c453..204a806 100644 --- a/src/test/java/org/codelibs/nekohtml/parsers/StrictModeTest.java +++ b/src/test/java/org/codelibs/nekohtml/parsers/StrictModeTest.java @@ -260,8 +260,7 @@ public void testHandlerMismatchedEndTag() throws Exception { handler.startElement("", "div", "DIV", new org.xml.sax.helpers.AttributesImpl()); // End with wrong tag - should not throw in lenient mode - assertDoesNotThrow(() -> handler.endElement("", "span", "SPAN"), - "Lenient mode should handle mismatched end tag"); + assertDoesNotThrow(() -> handler.endElement("", "span", "SPAN"), "Lenient mode should handle mismatched end tag"); } @Test @@ -275,8 +274,7 @@ public void testHandlerEndTagEmptyStack() throws Exception { handler.endDocument(); // Clear stack // End element with empty stack - should not throw - assertDoesNotThrow(() -> handler.endElement("", "div", "DIV"), - "Lenient mode should handle end tag with empty stack"); + assertDoesNotThrow(() -> handler.endElement("", "div", "DIV"), "Lenient mode should handle end tag with empty stack"); } @Test @@ -287,8 +285,7 @@ public void testHandlerCharactersBeforeStartDocument() throws Exception { final SAXToDOMHandler handler = new SAXToDOMHandler(builder); // Characters before startDocument - should not throw - assertDoesNotThrow(() -> handler.characters("test".toCharArray(), 0, 4), - "Should handle characters before startDocument"); + assertDoesNotThrow(() -> handler.characters("test".toCharArray(), 0, 4), "Should handle characters before startDocument"); } @Test @@ -301,8 +298,7 @@ public void testHandlerCommentInDocument() throws Exception { handler.startDocument(); handler.startElement("", "html", "HTML", new org.xml.sax.helpers.AttributesImpl()); - assertDoesNotThrow(() -> handler.comment("This is a comment".toCharArray(), 0, 
17), - "Should handle comment in document"); + assertDoesNotThrow(() -> handler.comment("This is a comment".toCharArray(), 0, 17), "Should handle comment in document"); } @Test @@ -345,9 +341,8 @@ public void testSkipDepthInLenientMode() throws Exception { handler.endElement("", "p", "P"); handler.endElement("", "div", "DIV"); - // Should complete without throwing - assertDoesNotThrow(() -> { - }, "Skip depth should handle nested skipped elements"); + // Should complete without throwing - verify handler accepted nested elements in skip mode + assertDoesNotThrow(() -> handler.endDocument(), "Skip depth should handle nested skipped elements"); } // ========================================================================= @@ -358,8 +353,9 @@ public void testSkipDepthInLenientMode() throws Exception { public void testDOMParserStrictModeWellFormed() throws Exception { System.setProperty(PROPERTY_DOM_STRICT, "true"); - final String html = "" + "Well Formed" + "" - + "
" + "

Hello World

" + "
" + "" + ""; + final String html = + "" + "Well Formed" + "" + "
" + + "

Hello World

" + "
" + "" + ""; final DOMParser parser = new DOMParser(); parser.parse(new InputSource(new StringReader(html))); @@ -451,8 +447,7 @@ public void close() throws SecurityException { } public boolean hasWarningContaining(String substring) { - return records.stream() - .filter(r -> r.getLevel() == Level.WARNING) + return records.stream().filter(r -> r.getLevel() == Level.WARNING) .anyMatch(r -> r.getMessage() != null && r.getMessage().contains(substring)); } diff --git a/src/test/java/org/codelibs/nekohtml/sax/AdoptionAgencyAlgorithmExtendedTest.java b/src/test/java/org/codelibs/nekohtml/sax/AdoptionAgencyAlgorithmExtendedTest.java index 71125db..bb2b500 100644 --- a/src/test/java/org/codelibs/nekohtml/sax/AdoptionAgencyAlgorithmExtendedTest.java +++ b/src/test/java/org/codelibs/nekohtml/sax/AdoptionAgencyAlgorithmExtendedTest.java @@ -58,22 +58,11 @@ private Document parseHTML(final String html) throws Exception { @Test public void testAllFormattingElementsBasic() throws Exception { // Given: HTML with all formatting elements - final String html = "" - + "Link" - + "Bold" - + "Big" - + "Code" - + "Emphasis" - + "Font" - + "Italic" - + "NoBreak" - + "Strike" - + "Small" - + "Strike" - + "Strong" - + "Teletype" - + "Underline" - + ""; + final String html = + "" + "Link" + "Bold" + "Big" + "Code" + "Emphasis" + + "Font" + "Italic" + "NoBreak" + "Strike" + "Small" + + "Strike" + "Strong" + "Teletype" + "Underline" + + ""; // When: Parsing final Document doc = parseHTML(html); @@ -170,15 +159,9 @@ public void testDeeplyNestedFormattingElements() throws Exception { @Test public void testFormattingElementsWithComplexBlockStructure() throws Exception { // Given: Formatting elements with complex block structure - final String html = "" - + "Start" - + "
Div 1" - + "

Para 1

" - + "
Quote
" - + "
" - + "
Section
" - + "End
" - + ""; + final String html = + "" + "Start" + "
Div 1" + "

Para 1

" + "
Quote
" + "
" + + "
Section
" + "End
" + ""; // When: Parsing final Document doc = parseHTML(html); @@ -199,10 +182,9 @@ public void testFormattingElementsWithComplexBlockStructure() throws Exception { @Test public void testFormattingElementsWithIdenticalAttributes() throws Exception { // Given: Multiple formatting elements with same attributes - final String html = "" - + "Link 1
Block
continues
" - + "Link 2

Para

continues
" - + ""; + final String html = + "" + "Link 1
Block
continues
" + + "Link 2

Para

continues
" + ""; // When: Parsing final Document doc = parseHTML(html); @@ -216,9 +198,7 @@ public void testFormattingElementsWithIdenticalAttributes() throws Exception { @Test public void testFontElementWithAttributes() throws Exception { // Given: FONT element with attributes crossing block - final String html = "" - + "Red text
Block
continues
" - + ""; + final String html = "" + "Red text
Block
continues
" + ""; // When: Parsing final Document doc = parseHTML(html); @@ -271,10 +251,10 @@ public void testFormattingElementInsideTableCell() throws Exception { @Test public void testComplexTableWithFormattingElements() throws Exception { // Given: Complex table with formatting elements - final String html = "" - + "" - + "" - + "
BoldItalic
Strong

Para

Em
Div
"; + final String html = + "" + "" + + "" + + "
BoldItalic
Strong

Para

Em
Div
"; // When: Parsing final Document doc = parseHTML(html); @@ -309,10 +289,9 @@ public void testFormattingElementAcrossList() throws Exception { @Test public void testFormattingElementInListItems() throws Exception { // Given: Formatting elements in list items - final String html = ""; + final String html = + ""; // When: Parsing final Document doc = parseHTML(html); @@ -345,11 +324,9 @@ public void testNestedListsWithFormattingElements() throws Exception { @Test public void testFormattingElementsAcrossSemanticElements() throws Exception { // Given: Formatting elements crossing semantic boundaries - final String html = "Bold " - + "
Article
" - + "
Section
" - + " " - + "end
"; + final String html = + "Bold " + "
Article
" + "
Section
" + " " + + "end
"; // When: Parsing final Document doc = parseHTML(html); @@ -365,13 +342,10 @@ public void testFormattingElementsAcrossSemanticElements() throws Exception { @Test public void testComplexSemanticStructureWithFormatting() throws Exception { // Given: Complex semantic structure with formatting - final String html = "" - + "
" - + "
Header

Title

continues
" - + "
Section

Para

continues
" - + "
Footer
Div
continues
" - + "
" - + ""; + final String html = + "" + "
" + "
Header

Title

continues
" + + "
Section

Para

continues
" + + "
Footer
Div
continues
" + "
" + ""; // When: Parsing final Document doc = parseHTML(html); @@ -455,9 +429,7 @@ public void testMisnestingWithBlockElements() throws Exception { @Test public void testComplexMisnestingPattern() throws Exception { // Given: Very complex misnesting - final String html = "" - + "Text
Block
" - + ""; + final String html = "" + "Text
Block
" + ""; // When: Parsing final Document doc = parseHTML(html); @@ -537,11 +509,9 @@ public void testFormattingElementsWithNOBR() throws Exception { @Test public void testAllFormattingElementsTogether() throws Exception { // Given: All formatting elements used together - final String html = "" - + "" - + "Text
Block
continues" - + "
" - + ""; + final String html = + "" + "" + "Text
Block
continues" + + "
" + ""; // When: Parsing final Document doc = parseHTML(html); @@ -554,13 +524,9 @@ public void testAllFormattingElementsTogether() throws Exception { @Test public void testFormattingElementsAcrossNestedBlocks() throws Exception { // Given: Formatting elements across deeply nested blocks - final String html = "Start" - + "
Level 1" - + "
Level 2" - + "
Level 3" - + "

Para

" - + "
" - + "End
"; + final String html = + "Start" + "
Level 1" + "
Level 2" + "
Level 3" + "

Para

" + "
" + + "End
"; // When: Parsing final Document doc = parseHTML(html); @@ -578,12 +544,8 @@ public void testFormattingElementsAcrossNestedBlocks() throws Exception { @Test public void testAAAOuterLoopWithManyFormattingElements() throws Exception { // Given: More than 8 nested formatting elements to test outer loop limit - final String html = "" - + "" - + "Text" - + "" // Close B early to trigger AAA - + "" - + ""; + final String html = "" + "" + "Text" + "" // Close B early to trigger AAA + + "" + ""; // When: Parsing final Document doc = parseHTML(html); @@ -618,12 +580,8 @@ public void testAAAWithExtremeNesting() throws Exception { @Test public void testAAAWithAlternatingFormattingElements() throws Exception { // Given: Alternating formatting elements beyond loop limit - final String html = "" - + "" - + "Deep text" - + "" // Early close - + "" - + ""; + final String html = "" + "" + "Deep text" + "" // Early close + + "" + ""; // When: Parsing final Document doc = parseHTML(html); @@ -639,14 +597,9 @@ public void testAAAWithAlternatingFormattingElements() throws Exception { @Test public void testFormattingMarkerInTable() throws Exception { // Given: Formatting elements crossing table cell boundaries - final String html = "" - + "Before table" - + "" - + "" - + "" - + "
Cell 1 Italic in cellCell 2
" - + "After table
" - + ""; + final String html = + "" + "Before table" + "" + "" + "" + + "
Cell 1 Italic in cellCell 2
" + "After table
" + ""; // When: Parsing final Document doc = parseHTML(html); @@ -660,12 +613,9 @@ public void testFormattingMarkerInTable() throws Exception { @Test public void testFormattingMarkerInCaption() throws Exception { // Given: Formatting elements in table caption - final String html = "" - + "" - + "" - + "" - + "
Bold caption
Block in caption
continues
Cell
" - + ""; + final String html = + "" + "" + "" + + "" + "
Bold caption
Block in caption
continues
Cell
" + ""; // When: Parsing final Document doc = parseHTML(html); @@ -678,12 +628,9 @@ public void testFormattingMarkerInCaption() throws Exception { @Test public void testFormattingMarkerInTH() throws Exception { // Given: Formatting elements in table header - final String html = "" - + "" - + "" - + "" - + "
Header

Para in header

continues
Cell
" - + ""; + final String html = + "" + "" + "" + "" + + "
Header

Para in header

continues
Cell
" + ""; // When: Parsing final Document doc = parseHTML(html); @@ -696,15 +643,9 @@ public void testFormattingMarkerInTH() throws Exception { @Test public void testFormattingAcrossMultipleTableCells() throws Exception { // Given: Formatting spanning multiple cells (invalid but should be handled) - final String html = "" - + "" - + "" - + "" - + "" - + "" - + "" - + "
Start boldMiddle cellThird cell
" - + ""; + final String html = + "" + "" + "" + "" + "" + "" + + "" + "
Start boldMiddle cellThird cell
" + ""; // When: Parsing final Document doc = parseHTML(html); @@ -721,12 +662,9 @@ public void testFormattingAcrossMultipleTableCells() throws Exception { @Test public void testFormattingInSelectOption() throws Exception { // Given: Formatting in select option (should be stripped) - final String html = "" - + "" - + ""; + final String html = + "" + "" + + ""; // When: Parsing final Document doc = parseHTML(html); @@ -740,11 +678,8 @@ public void testFormattingInSelectOption() throws Exception { @Test public void testFormattingSpanningSelect() throws Exception { // Given: Formatting spanning across select (invalid) - final String html = "" - + "Before select" - + "" - + "After select" - + ""; + final String html = + "" + "Before select" + "" + "After select" + ""; // When: Parsing final Document doc = parseHTML(html); @@ -761,9 +696,7 @@ public void testFormattingSpanningSelect() throws Exception { @Test public void testFormattingInButton() throws Exception { // Given: Formatting inside button - final String html = "" - + "" - + ""; + final String html = "" + "" + ""; // When: Parsing final Document doc = parseHTML(html); @@ -776,11 +709,7 @@ public void testFormattingInButton() throws Exception { @Test public void testFormattingSpanningButton() throws Exception { // Given: Formatting spanning across button - final String html = "" - + "Before button" - + "" - + "After button" - + ""; + final String html = "" + "Before button" + "" + "After button" + ""; // When: Parsing final Document doc = parseHTML(html); @@ -797,9 +726,7 @@ public void testFormattingSpanningButton() throws Exception { @Test public void testFormattingInObject() throws Exception { // Given: Formatting inside object element - final String html = "" - + "Fallback
Block
content
" - + ""; + final String html = "" + "Fallback
Block
content
" + ""; // When: Parsing final Document doc = parseHTML(html); @@ -812,9 +739,7 @@ public void testFormattingInObject() throws Exception { @Test public void testFormattingInMarquee() throws Exception { // Given: Formatting in marquee (deprecated but may appear) - final String html = "" - + "Scrolling
Block
text
" - + ""; + final String html = "" + "Scrolling
Block
text
" + ""; // When: Parsing final Document doc = parseHTML(html); @@ -831,11 +756,9 @@ public void testFormattingInMarquee() throws Exception { @Test public void testFormattingWithFormElement() throws Exception { // Given: Formatting crossing form boundary - final String html = "" - + "Before form" - + "
Form content
" - + "After form
" - + ""; + final String html = + "" + "Before form" + "
Form content
" + "After form
" + + ""; // When: Parsing final Document doc = parseHTML(html); @@ -848,12 +771,9 @@ public void testFormattingWithFormElement() throws Exception { @Test public void testFormattingWithFieldset() throws Exception { // Given: Formatting in fieldset with legend - final String html = "" - + "
" - + "Bold legend
Block
" - + "" - + "
" - + ""; + final String html = + "" + "
" + "Bold legend
Block
" + "" + + "
" + ""; // When: Parsing final Document doc = parseHTML(html); @@ -899,9 +819,7 @@ public void testAAAInnerLoopWithManyActiveElements() throws Exception { @Test public void testAAAWithMultipleFurthestBlockCandidates() throws Exception { // Given: Multiple potential furthest blocks - final String html = "" - + "Bold

Para 1

Div

Para 2

continues
" - + ""; + final String html = "" + "Bold

Para 1

Div

Para 2

continues
" + ""; // When: Parsing final Document doc = parseHTML(html); diff --git a/src/test/java/org/codelibs/nekohtml/sax/HTMLSAXConfigurationTest.java b/src/test/java/org/codelibs/nekohtml/sax/HTMLSAXConfigurationTest.java index e17eb20..b7ab543 100644 --- a/src/test/java/org/codelibs/nekohtml/sax/HTMLSAXConfigurationTest.java +++ b/src/test/java/org/codelibs/nekohtml/sax/HTMLSAXConfigurationTest.java @@ -214,13 +214,11 @@ public void testSimpleErrorFormatFeature() throws Exception { final HTMLSAXConfiguration config = new HTMLSAXConfiguration(); // Default should be false - assertFalse(config.getFeature(HTMLSAXConfiguration.SIMPLE_ERROR_FORMAT), - "Simple error format should be disabled by default"); + assertFalse(config.getFeature(HTMLSAXConfiguration.SIMPLE_ERROR_FORMAT), "Simple error format should be disabled by default"); // Enable simple error format config.setFeature(HTMLSAXConfiguration.SIMPLE_ERROR_FORMAT, true); - assertTrue(config.getFeature(HTMLSAXConfiguration.SIMPLE_ERROR_FORMAT), - "Should be able to enable simple error format"); + assertTrue(config.getFeature(HTMLSAXConfiguration.SIMPLE_ERROR_FORMAT), "Should be able to enable simple error format"); } @Test @@ -292,13 +290,11 @@ public void testNamesElemsProperty() throws Exception { final HTMLSAXConfiguration config = new HTMLSAXConfiguration(); // Default should be "upper" - assertEquals("upper", config.getProperty(HTMLSAXConfiguration.NAMES_ELEMS), - "Element names should default to upper case"); + assertEquals("upper", config.getProperty(HTMLSAXConfiguration.NAMES_ELEMS), "Element names should default to upper case"); // Change to lower config.setProperty(HTMLSAXConfiguration.NAMES_ELEMS, "lower"); - assertEquals("lower", config.getProperty(HTMLSAXConfiguration.NAMES_ELEMS), - "Should be able to set element names to lower case"); + assertEquals("lower", config.getProperty(HTMLSAXConfiguration.NAMES_ELEMS), "Should be able to set element names to lower case"); } @Test @@ -306,13 +302,11 @@ public 
void testNamesAttrsProperty() throws Exception { final HTMLSAXConfiguration config = new HTMLSAXConfiguration(); // Default should be "lower" - assertEquals("lower", config.getProperty(HTMLSAXConfiguration.NAMES_ATTRS), - "Attribute names should default to lower case"); + assertEquals("lower", config.getProperty(HTMLSAXConfiguration.NAMES_ATTRS), "Attribute names should default to lower case"); // Change to upper config.setProperty(HTMLSAXConfiguration.NAMES_ATTRS, "upper"); - assertEquals("upper", config.getProperty(HTMLSAXConfiguration.NAMES_ATTRS), - "Should be able to set attribute names to upper case"); + assertEquals("upper", config.getProperty(HTMLSAXConfiguration.NAMES_ATTRS), "Should be able to set attribute names to upper case"); } @Test @@ -328,16 +322,14 @@ public void testLexicalHandlerProperty() throws Exception { "Lexical handler should be retrievable via property"); // Also get via getter - assertSame(lexHandler, config.getLexicalHandler(), - "Lexical handler should be retrievable via getter"); + assertSame(lexHandler, config.getLexicalHandler(), "Lexical handler should be retrievable via getter"); } @Test public void testUnrecognizedFeature() { final HTMLSAXConfiguration config = new HTMLSAXConfiguration(); - assertThrows(org.xml.sax.SAXNotRecognizedException.class, - () -> config.getFeature("http://example.com/unknown-feature"), + assertThrows(org.xml.sax.SAXNotRecognizedException.class, () -> config.getFeature("http://example.com/unknown-feature"), "Should throw for unrecognized feature"); } @@ -345,8 +337,7 @@ public void testUnrecognizedFeature() { public void testUnrecognizedProperty() { final HTMLSAXConfiguration config = new HTMLSAXConfiguration(); - assertThrows(org.xml.sax.SAXNotRecognizedException.class, - () -> config.getProperty("http://example.com/unknown-property"), + assertThrows(org.xml.sax.SAXNotRecognizedException.class, () -> config.getProperty("http://example.com/unknown-property"), "Should throw for unrecognized property"); } 
diff --git a/src/test/java/org/codelibs/nekohtml/sax/HTMLTagBalancerFilterEnhancementsTest.java b/src/test/java/org/codelibs/nekohtml/sax/HTMLTagBalancerFilterEnhancementsTest.java index 98b6e64..36449ec 100644 --- a/src/test/java/org/codelibs/nekohtml/sax/HTMLTagBalancerFilterEnhancementsTest.java +++ b/src/test/java/org/codelibs/nekohtml/sax/HTMLTagBalancerFilterEnhancementsTest.java @@ -46,8 +46,7 @@ public void setUp() { filter.setContentHandler(new DefaultHandler() { @Override - public void startElement(String uri, String localName, String qName, org.xml.sax.Attributes atts) - throws SAXException { + public void startElement(String uri, String localName, String qName, org.xml.sax.Attributes atts) throws SAXException { startElements.add(qName); } diff --git a/src/test/java/org/codelibs/nekohtml/sax/RawTextElementsTest.java b/src/test/java/org/codelibs/nekohtml/sax/RawTextElementsTest.java index c8d5e92..b3a269a 100644 --- a/src/test/java/org/codelibs/nekohtml/sax/RawTextElementsTest.java +++ b/src/test/java/org/codelibs/nekohtml/sax/RawTextElementsTest.java @@ -88,8 +88,9 @@ public void testScriptWithTypeAttribute() throws Exception { @Test public void testMultipleScriptElements() throws Exception { - final String html = "" + "" + "" - + "" + ""; + final String html = + "" + "" + "" + "" + + ""; final DOMParser parser = new DOMParser(); parser.parse(new InputSource(new StringReader(html))); @@ -139,8 +140,7 @@ public void testStyleWithSelectors() throws Exception { assertEquals(1, styles.getLength(), "Should have one style element"); final String styleContent = styles.item(0).getTextContent(); - assertTrue(styleContent.contains("color: red") || styleContent.contains("color:"), - "Style content should contain CSS rules"); + assertTrue(styleContent.contains("color: red") || styleContent.contains("color:"), "Style content should contain CSS rules"); } @Test diff --git a/src/test/java/org/codelibs/nekohtml/sax/SimpleHTMLScannerEnhancementsTest.java 
b/src/test/java/org/codelibs/nekohtml/sax/SimpleHTMLScannerEnhancementsTest.java index 217dc6e..ad28529 100644 --- a/src/test/java/org/codelibs/nekohtml/sax/SimpleHTMLScannerEnhancementsTest.java +++ b/src/test/java/org/codelibs/nekohtml/sax/SimpleHTMLScannerEnhancementsTest.java @@ -477,9 +477,13 @@ public void characters(char[] ch, int start, int length) { scanner.parse(input); - // Entities may be passed through or decoded depending on implementation - assertTrue(result.toString().contains("&") || result.toString().contains("&"), - "Should handle ampersand entity"); + // Entity decoding should produce actual characters + final String decoded = result.toString(); + assertTrue(decoded.contains("&"), "Should decode & to &"); + assertTrue(decoded.contains("<"), "Should decode < to <"); + assertTrue(decoded.contains(">"), "Should decode > to >"); + assertTrue(decoded.contains("\""), "Should decode " to \""); + assertTrue(decoded.contains("\u00A0"), "Should decode   to non-breaking space"); } /** @@ -502,7 +506,11 @@ public void characters(char[] ch, int start, int length) { scanner.parse(input); - assertNotNull(result.toString(), "Should parse decimal numeric entities"); + // < = <, > = >, & = & + final String decoded = result.toString(); + assertTrue(decoded.contains("<"), "Should decode < to <"); + assertTrue(decoded.contains(">"), "Should decode > to >"); + assertTrue(decoded.contains("&"), "Should decode & to &"); } /** @@ -525,7 +533,11 @@ public void characters(char[] ch, int start, int length) { scanner.parse(input); - assertNotNull(result.toString(), "Should parse hexadecimal numeric entities"); + // < = <, > = >, & = & + final String decoded = result.toString(); + assertTrue(decoded.contains("<"), "Should decode < to <"); + assertTrue(decoded.contains(">"), "Should decode > to >"); + assertTrue(decoded.contains("&"), "Should decode & to &"); } /** @@ -550,7 +562,8 @@ public void startElement(String uri, String localName, String qName, org.xml.sax 
scanner.parse(input); - assertTrue(attrValues.toString().contains("href"), "Should parse attribute with entity"); + final String attrs = attrValues.toString(); + assertTrue(attrs.contains("href=test?a=1&b=2"), "Should decode & in attribute to &"); } /** @@ -574,8 +587,7 @@ public void characters(char[] ch, int start, int length) { scanner.parse(input); - assertTrue(result.toString().contains("&") || result.toString().contains("A"), - "Should handle incomplete entity"); + assertTrue(result.toString().contains("A & B"), "Should preserve incomplete entity as literal text"); } /** @@ -598,7 +610,91 @@ public void characters(char[] ch, int start, int length) { scanner.parse(input); - assertNotNull(result.toString(), "Should handle unknown entity"); + assertTrue(result.toString().contains("&unknown;"), "Should preserve unknown entity as literal text"); + } + + /** + * Test that semicolon-less named entities in URL attributes are NOT decoded + * (HTML5 attribute value state rule: ¬=, ©=, ®= must be preserved) + */ + @Test + public void testSemicolonlessEntitiesInUrlAttributes() throws Exception { + final SimpleHTMLScanner scanner = new SimpleHTMLScanner(); + final StringBuilder attrValues = new StringBuilder(); + + scanner.setContentHandler(new DefaultHandler() { + @Override + public void startElement(String uri, String localName, String qName, org.xml.sax.Attributes atts) { + for (int i = 0; i < atts.getLength(); i++) { + attrValues.append(atts.getQName(i)).append("=").append(atts.getValue(i)).append("|"); + } + } + }); + + final String html = "Link"; + final InputSource input = new InputSource(new StringReader(html)); + + scanner.parse(input); + + final String attrs = attrValues.toString(); + assertTrue(attrs.contains("href=/x?a=1¬=2©=3®=4"), + "Semicolon-less named entities in attributes should be preserved as-is, got: " + attrs); + } + + /** + * Test that invalid numeric references produce U+FFFD replacement character + */ + @Test + public void 
testInvalidNumericReferences() throws Exception { + final SimpleHTMLScanner scanner = new SimpleHTMLScanner(); + final StringBuilder result = new StringBuilder(); + + scanner.setContentHandler(new DefaultHandler() { + @Override + public void characters(char[] ch, int start, int length) { + result.append(new String(ch, start, length)); + } + }); + + // � (null), � (surrogate),  (control char) + final String html = "� � "; + final InputSource input = new InputSource(new StringReader(html)); + + scanner.parse(input); + + final String decoded = result.toString(); + // All invalid code points should be replaced with U+FFFD + assertEquals( + 3, + decoded.chars().filter(c -> c == 0xFFFD).count(), + "Invalid numeric references should be replaced with U+FFFD, got: " + + decoded.codePoints().mapToObj(cp -> String.format("U+%04X", cp)).reduce("", (a, b) -> a + " " + b)); + } + + /** + * Test that semicolon-less named entities ARE decoded in text context + */ + @Test + public void testSemicolonlessEntitiesInTextContent() throws Exception { + final SimpleHTMLScanner scanner = new SimpleHTMLScanner(); + final StringBuilder result = new StringBuilder(); + + scanner.setContentHandler(new DefaultHandler() { + @Override + public void characters(char[] ch, int start, int length) { + result.append(new String(ch, start, length)); + } + }); + + final String html = "& < >"; + final InputSource input = new InputSource(new StringReader(html)); + + scanner.parse(input); + + final String decoded = result.toString(); + assertTrue(decoded.contains("&"), "Should decode & (without semicolon) in text"); + assertTrue(decoded.contains("<"), "Should decode < (without semicolon) in text"); + assertTrue(decoded.contains(">"), "Should decode > (without semicolon) in text"); } // ========================================================================= @@ -622,15 +718,13 @@ public void characters(char[] ch, int start, int length) { // ISO-8859-1 encoded content with accented characters final String 
content = "café résumé"; - final ByteArrayInputStream stream = new ByteArrayInputStream( - ("" + content + "").getBytes("ISO-8859-1")); + final ByteArrayInputStream stream = new ByteArrayInputStream(("" + content + "").getBytes("ISO-8859-1")); final InputSource input = new InputSource(stream); input.setEncoding("ISO-8859-1"); scanner.parse(input); - assertTrue(result.toString().contains("caf") || result.toString().contains("é"), - "ISO-8859-1 encoded content should be parsed"); + assertTrue(result.toString().contains("caf") || result.toString().contains("é"), "ISO-8859-1 encoded content should be parsed"); } /** @@ -655,8 +749,7 @@ public void characters(char[] ch, int start, int length) { scanner.parse(input); - assertTrue(result.toString().contains("Unicode") || result.toString().contains("\u4E2D"), - "UTF-16 encoded content should be parsed"); + assertTrue(result.toString().contains("Unicode") || result.toString().contains("\u4E2D"), "UTF-16 encoded content should be parsed"); } /** @@ -681,8 +774,7 @@ public void characters(char[] ch, int start, int length) { scanner.parse(input); - assertTrue(result.toString().contains("UTF-8") || result.toString().contains("ä"), - "Default UTF-8 encoding should work"); + assertTrue(result.toString().contains("UTF-8") || result.toString().contains("ä"), "Default UTF-8 encoding should work"); } /** @@ -709,8 +801,7 @@ public void characters(char[] ch, int start, int length) { scanner.parse(input); - assertTrue(result.toString().contains("From character stream"), - "Character stream should take precedence over byte stream"); + assertTrue(result.toString().contains("From character stream"), "Character stream should take precedence over byte stream"); } // ========================================================================= @@ -745,8 +836,7 @@ public void characters(char[] ch, int start, int length) { scanner.parse(input); - assertTrue(result.toString().contains("File URL content"), - "Should parse content from file:// URL"); 
+ assertTrue(result.toString().contains("File URL content"), "Should parse content from file:// URL"); } /** @@ -760,8 +850,7 @@ public void testNoValidInputSource() { // InputSource with nothing set final InputSource input = new InputSource(); - assertThrows(SAXException.class, () -> scanner.parse(input), - "Should throw when no valid input source is available"); + assertThrows(SAXException.class, () -> scanner.parse(input), "Should throw when no valid input source is available"); } /** @@ -788,8 +877,7 @@ public void characters(char[] ch, int start, int length) { scanner.parse(tempFile.getAbsolutePath()); - assertTrue(result.toString().contains("SystemId parse"), - "Should parse using String systemId parameter"); + assertTrue(result.toString().contains("SystemId parse"), "Should parse using String systemId parameter"); } // ========================================================================= @@ -900,8 +988,7 @@ public void comment(char[] ch, int start, int length) { public void testGetUnrecognizedFeature() { final SimpleHTMLScanner scanner = new SimpleHTMLScanner(); - assertThrows(org.xml.sax.SAXNotRecognizedException.class, - () -> scanner.getFeature("http://example.com/unknown-feature"), + assertThrows(org.xml.sax.SAXNotRecognizedException.class, () -> scanner.getFeature("http://example.com/unknown-feature"), "Should throw SAXNotRecognizedException for unknown feature"); } @@ -912,8 +999,7 @@ public void testGetUnrecognizedFeature() { public void testGetUnrecognizedProperty() { final SimpleHTMLScanner scanner = new SimpleHTMLScanner(); - assertThrows(org.xml.sax.SAXNotRecognizedException.class, - () -> scanner.getProperty("http://example.com/unknown-property"), + assertThrows(org.xml.sax.SAXNotRecognizedException.class, () -> scanner.getProperty("http://example.com/unknown-property"), "Should throw SAXNotRecognizedException for unknown property"); } @@ -929,7 +1015,6 @@ public void testParsingWithoutContentHandler() throws Exception { final InputSource 
input = new InputSource(new StringReader(html)); // Should return early without error - assertDoesNotThrow(() -> scanner.parse(input), - "Parsing without content handler should not throw"); + assertDoesNotThrow(() -> scanner.parse(input), "Parsing without content handler should not throw"); } } diff --git a/src/test/java/org/codelibs/nekohtml/sax/SimpleHTMLScannerTest.java b/src/test/java/org/codelibs/nekohtml/sax/SimpleHTMLScannerTest.java index 8a377fa..2653056 100644 --- a/src/test/java/org/codelibs/nekohtml/sax/SimpleHTMLScannerTest.java +++ b/src/test/java/org/codelibs/nekohtml/sax/SimpleHTMLScannerTest.java @@ -407,8 +407,9 @@ public void characters(final char[] ch, final int start, final int length) { // When: Parsing the HTML scanner.parse(input); - // Then: Should capture special characters (as-is, no entity decoding) - assertTrue(textContent.stream().anyMatch(text -> text.contains("<>&"))); + // Then: Should capture decoded special characters (join chunks since characters() may be called multiple times) + final String allText = String.join("", textContent); + assertTrue(allText.contains("<>&"), "Should decode <>& to <>&"); } @Test @@ -499,8 +500,8 @@ public void testParseSystemIdNotSupported() { final SAXException exception = assertThrows(SAXException.class, () -> { scanner.parse(input); }); - assertTrue(exception.getMessage().contains("Cannot open SystemId"), - "Expected message about unable to open SystemId, got: " + exception.getMessage()); + assertTrue(exception.getMessage().contains("Cannot open SystemId"), "Expected message about unable to open SystemId, got: " + + exception.getMessage()); } @Test @@ -525,8 +526,8 @@ public void testParseStringSystemId() throws Exception { final SAXException exception = assertThrows(SAXException.class, () -> { scanner.parse("http://example.com/nonexistent.html"); }); - assertTrue(exception.getMessage().contains("Cannot open SystemId"), - "Expected message about unable to open SystemId, got: " + 
exception.getMessage()); + assertTrue(exception.getMessage().contains("Cannot open SystemId"), "Expected message about unable to open SystemId, got: " + + exception.getMessage()); } @Test