diff --git a/src/main/java/org/codelibs/nekohtml/sax/SimpleHTMLScanner.java b/src/main/java/org/codelibs/nekohtml/sax/SimpleHTMLScanner.java
index a61caa2..dca976f 100644
--- a/src/main/java/org/codelibs/nekohtml/sax/SimpleHTMLScanner.java
+++ b/src/main/java/org/codelibs/nekohtml/sax/SimpleHTMLScanner.java
@@ -26,6 +26,7 @@
import java.util.regex.Matcher;
import java.util.regex.Pattern;
+import org.codelibs.nekohtml.HTMLEntities;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.DTDHandler;
@@ -353,11 +354,12 @@ protected void parseHTML(final String html) throws SAXException {
// Text content
final int nextTag = html.indexOf('<', pos);
final int endPos = nextTag >= 0 ? nextTag : length;
- final String text = html.substring(pos, endPos);
+ final String rawText = html.substring(pos, endPos);
// Always emit text content, including whitespace
// This preserves spacing between elements for proper text extraction
- if (text.length() > 0) {
+ if (rawText.length() > 0) {
+ final String text = resolveEntities(rawText);
fContentHandler.characters(text.toCharArray(), 0, text.length());
}
@@ -398,7 +400,7 @@ protected AttributesImpl parseAttributes(final String attrString) {
value = ""; // No value
}
- attrs.addAttribute("", name, name, "CDATA", value);
+ attrs.addAttribute("", name, name, "CDATA", resolveEntities(value, true));
}
return attrs;
@@ -436,6 +438,125 @@ protected String normalizeAttributeName(final String name) {
return "upper".equals(fAttributeCase) ? name.toUpperCase() : "lower".equals(fAttributeCase) ? name.toLowerCase() : name;
}
+ // Pattern for HTML character references: &#decimal; (group 1), &#xhex; (group 2) or &name; (group 3).
+ // The trailing semicolon is optional to handle common malformed HTML.
+ private static final Pattern ENTITY_PATTERN = Pattern.compile("&(?:#([0-9]+)|#[xX]([0-9a-fA-F]+)|([a-zA-Z][a-zA-Z0-9]*));?");
+
+    /**
+     * Resolves HTML character entities in text content (non-attribute context), where semicolon-less named entities are decoded too.
+     *
+     * @param text The text containing entities
+     * @return The text with entities resolved to their character equivalents
+     */
+    protected String resolveEntities(final String text) {
+        final boolean attributeContext = false;
+        return resolveEntities(text, attributeContext);
+    }
+
+    /**
+     * Resolves HTML character references in the given text.
+     * Handles numeric decimal ({@code &#214;}), numeric hex ({@code &#xD6;}), and named ({@code &Ouml;}) references.
+     * In attribute context, a semicolon-less named reference followed by {@code [A-Za-z0-9=]} is not decoded
+     * per HTML5 attribute value state rules, preventing corruption of URLs containing {@code &not=} or {@code &copy=}.
+     * Numeric values that overflow an {@code int} become U+FFFD; names for which the lookup returns -1 are kept as written.
+     *
+     * @param text The text containing entities, may be {@code null}
+     * @param inAttribute Whether this text is an attribute value
+     * @return The text with entities resolved; the input itself when it is {@code null} or contains no {@code '&'}
+     */
+    protected String resolveEntities(final String text, final boolean inAttribute) {
+        if (text == null || text.indexOf('&') < 0) {
+            return text;
+        }
+
+        final Matcher m = ENTITY_PATTERN.matcher(text);
+        final StringBuilder sb = new StringBuilder(text.length());
+        int lastEnd = 0;
+
+        while (m.find()) {
+            sb.append(text, lastEnd, m.start());
+
+            if (m.group(1) != null) {
+                // Numeric decimal: &#214;
+                try {
+                    final int codePoint = Integer.parseInt(m.group(1));
+                    sb.append(resolveCodePoint(codePoint, m.group(0)));
+                } catch (final NumberFormatException ignored) {
+                    // Digits-only input fails to parse only on int overflow, i.e. a value far beyond U+10FFFF.
+                    sb.append('\uFFFD');
+                }
+            } else if (m.group(2) != null) {
+                // Numeric hex: &#xD6;
+                try {
+                    final int codePoint = Integer.parseInt(m.group(2), 16);
+                    sb.append(resolveCodePoint(codePoint, m.group(0)));
+                } catch (final NumberFormatException ignored) {
+                    sb.append('\uFFFD'); // int overflow: treated like any other out-of-range value
+                }
+            } else if (m.group(3) != null) {
+                // Named entity: &Ouml;
+                final String matched = m.group(0);
+                final boolean hasSemicolon = matched.endsWith(";");
+
+                // HTML5 attribute value state: if no semicolon and next char is [A-Za-z0-9=],
+                // do not decode (prevents corruption of URLs like &not=2, &copy=, &reg=).
+                // The test is ASCII-only so that a following non-ASCII letter does not block decoding.
+                if (inAttribute && !hasSemicolon && m.end() < text.length()) {
+                    final char nextChar = text.charAt(m.end());
+                    if (nextChar == '=' || (nextChar < 0x80 && Character.isLetterOrDigit(nextChar))) {
+                        sb.append(matched);
+                        lastEnd = m.end();
+                        continue;
+                    }
+                }
+
+                final int c = HTMLEntities.get(m.group(3));
+                if (c != -1) {
+                    sb.appendCodePoint(c);
+                } else {
+                    sb.append(matched);
+                }
+            }
+
+            lastEnd = m.end();
+        }
+
+        sb.append(text, lastEnd, text.length());
+        return sb.toString();
+    }
+
+    /**
+     * Maps the numeric value of a character reference to the string emitted for it.
+     * Yields U+FFFD for NUL, surrogates, values above U+10FFFF, values below U+0020 other than
+     * tab, LF and CR, and Unicode noncharacters; any other value is returned as its character(s).
+     * The {@code original} argument is accepted but not consulted.
+     */
+    private static String resolveCodePoint(final int codePoint, final String original) {
+        final String replacement = "\uFFFD";
+        // NUL and surrogate halves are replaced, as are values past the last Unicode code point.
+        final boolean nul = codePoint == 0;
+        final boolean surrogate = 0xD800 <= codePoint && codePoint <= 0xDFFF;
+        final boolean beyondUnicode = codePoint > 0x10FFFF;
+        if (nul || surrogate || beyondUnicode) {
+            return replacement;
+        }
+        // XML 1.0 illegal characters: anything below U+0020 except tab, newline, carriage return.
+        final boolean allowedWhitespace = codePoint == 0x9 || codePoint == 0xA || codePoint == 0xD;
+        if (codePoint < 0x20 && !allowedWhitespace) {
+            return replacement;
+        }
+        // Noncharacters: U+FDD0..U+FDEF plus every U+xFFFE and U+xFFFF.
+        final boolean inNoncharacterBlock = codePoint >= 0xFDD0 && codePoint <= 0xFDEF;
+        final boolean planeFinalPair = (codePoint & 0xFFFE) == 0xFFFE;
+        if (inNoncharacterBlock || planeFinalPair) {
+            return replacement;
+        }
+        // Accepted value: emit it (two chars for supplementary code points).
+        final StringBuilder out = new StringBuilder(2);
+        out.appendCodePoint(codePoint);
+        return out.toString();
+    }
+
@Override
public boolean getFeature(final String name) throws SAXNotRecognizedException, SAXNotSupportedException {
throw new SAXNotRecognizedException("Feature not recognized: " + name);
diff --git a/src/test/java/org/codelibs/nekohtml/EncodingEdgeCasesTest.java b/src/test/java/org/codelibs/nekohtml/EncodingEdgeCasesTest.java
index 13d549f..be2fb92 100644
--- a/src/test/java/org/codelibs/nekohtml/EncodingEdgeCasesTest.java
+++ b/src/test/java/org/codelibs/nekohtml/EncodingEdgeCasesTest.java
@@ -125,11 +125,9 @@ public void testUTF16LEBOM() throws Exception {
@Test
public void testZeroWidthCharacters() throws Exception {
// Given: HTML with zero-width characters
- final String html = "
"
- + "Zero\u200BWidth\u200BSpace "
- + "Zero\u200CWidth\u200CNon\u200CJoiner "
- + "Zero\u200DWidth\u200DJoiner"
- + "";
+ final String html =
+ "" + "Zero\u200BWidth\u200BSpace " + "Zero\u200CWidth\u200CNon\u200CJoiner " + "Zero\u200DWidth\u200DJoiner"
+ + "";
// When: Parsing
final Document doc = parseHTML(html);
@@ -145,10 +143,8 @@ public void testZeroWidthCharacters() throws Exception {
@Test
public void testRightToLeftMarks() throws Exception {
// Given: HTML with RTL and LTR marks
- final String html = ""
- + "Text\u200Ewith\u200ELTR\u200Emarks "
- + "Text\u200Fwith\u200FRTL\u200Fmarks"
- + "";
+ final String html =
+ "" + "Text\u200Ewith\u200ELTR\u200Emarks " + "Text\u200Fwith\u200FRTL\u200Fmarks" + "";
// When: Parsing
final Document doc = parseHTML(html);
@@ -163,8 +159,7 @@ public void testRightToLeftMarks() throws Exception {
@Test
public void testCombiningCharacters() throws Exception {
// Given: HTML with combining diacritics
- final String html = ""
- + "e\u0301 " // é (e + combining acute)
+ final String html = "" + "e\u0301 " // é (e + combining acute)
+ "n\u0303 " // ñ (n + combining tilde)
+ "a\u0308 " // ä (a + combining diaeresis)
+ "";
@@ -183,11 +178,7 @@ public void testCombiningCharacters() throws Exception {
@Test
public void testEmojiAndSupplementaryCharacters() throws Exception {
// Given: HTML with emoji (supplementary characters)
- final String html = ""
- + "😀😁😂🤣😃😄😅😆😉😊 "
- + "👍👎👏🙌🎉🎊🎈 "
- + "🌟⭐✨💫"
- + "";
+ final String html = "" + "😀😁😂🤣😃😄😅😆😉😊 " + "👍👎👏🙌🎉🎊🎈 " + "🌟⭐✨💫" + "";
// When: Parsing
final Document doc = parseHTML(html);
@@ -203,8 +194,7 @@ public void testEmojiAndSupplementaryCharacters() throws Exception {
@Test
public void testSurrogatePairs() throws Exception {
// Given: HTML with characters requiring surrogate pairs
- final String html = ""
- + "\uD834\uDD1E" // Musical symbol G clef (U+1D11E)
+ final String html = "" + "\uD834\uDD1E" // Musical symbol G clef (U+1D11E)
+ "\uD835\uDC00" // Mathematical bold capital A (U+1D400)
+ "\uD83D\uDE00" // Grinning face emoji (U+1F600)
+ "";
@@ -221,9 +211,7 @@ public void testSurrogatePairs() throws Exception {
@Test
public void testControlCharacters() throws Exception {
// Given: HTML with control characters (allowed ones)
- final String html = ""
- + "Tab:\t Newline:\n CarriageReturn:\r"
- + "";
+ final String html = "" + "Tab:\t Newline:\n CarriageReturn:\r" + "";
// When: Parsing
final Document doc = parseHTML(html);
@@ -237,15 +225,10 @@ public void testControlCharacters() throws Exception {
@Test
public void testMultilingualContent() throws Exception {
// Given: HTML with multiple languages
- final String html = ""
- + "日本語のテキスト
"
- + "中文文本
"
- + "한국어 텍스트
"
- + "نص عربي
"
- + "Русский текст
"
- + "Ελληνικό κείμενο
"
- + "טקסט עברי
"
- + "";
+ final String html =
+ "" + "日本語のテキスト
" + "中文文本
" + "한국어 텍스트
"
+ + "نص عربي
" + "Русский текст
" + "Ελληνικό κείμενο
"
+ + "טקסט עברי
" + "";
// When: Parsing
final Document doc = parseHTML(html);
@@ -275,10 +258,7 @@ public void testMultilingualContent() throws Exception {
@Test
public void testMalformedEntities() throws Exception {
// Given: HTML with malformed entities
- final String html = ""
- + "&invalid; "
- + "¬anentity; "
- + "< " // missing semicolon
+ final String html = "" + "&invalid; " + "¬anentity; " + "< " // missing semicolon
+ "> " // missing semicolon
+ "&" // missing semicolon
+ "";
@@ -295,8 +275,7 @@ public void testMalformedEntities() throws Exception {
@Test
public void testNumericEntitiesOutOfRange() throws Exception {
// Given: HTML with out-of-range numeric entities
- final String html = ""
- + " " // way out of range
+ final String html = "" + " " // way out of range
+ " " // huge decimal
+ " " // just beyond Unicode range
+ "";
@@ -325,11 +304,10 @@ public void testIncompleteEntityAtEOF() throws Exception {
@Test
public void testEntitiesInAttributeValues() throws Exception {
// Given: HTML with entities in attribute values
- final String html = ""
- + "Content
"
- + "Link "
- + " "
- + "";
+ final String html =
+ "" + "Content
"
+ + "Link " + " "
+ + "";
// When: Parsing
final Document doc = parseHTML(html);
@@ -355,15 +333,10 @@ public void testEntitiesInAttributeValues() throws Exception {
@Test
public void testAllCommonHTMLEntities() throws Exception {
// Given: HTML with all common entities
- final String html = ""
- + " < > & " ' "
- + "© ® ™ "
- + "€ £ ¥ ¢ "
- + "— – … "
- + "« » “ ” ‘ ’ "
- + "° ± × ÷ "
- + "¶ § † ‡ "
- + "";
+ final String html =
+ "" + " < > & " ' " + "© ® ™ " + "€ £ ¥ ¢ "
+ + "— – … " + "« » “ ” ‘ ’ "
+ + "° ± × ÷ " + "¶ § † ‡ " + "";
// When: Parsing
final Document doc = parseHTML(html);
@@ -381,8 +354,7 @@ public void testAllCommonHTMLEntities() throws Exception {
@Test
public void testNumericCharacterReferences() throws Exception {
// Given: HTML with numeric character references
- final String html = ""
- + "A " // A
+ final String html = "" + "A " // A
+ "A " // A (hex)
+ "© " // ©
+ "© " // © (hex)
@@ -405,9 +377,7 @@ public void testNumericCharacterReferences() throws Exception {
@Test
public void testEntitiesWithoutSemicolon() throws Exception {
// Given: HTML with entities without semicolons (legacy)
- final String html = ""
- + "< > & © ®"
- + "";
+ final String html = "" + "< > & © ®" + "";
// When: Parsing
final Document doc = parseHTML(html);
@@ -424,11 +394,10 @@ public void testEntitiesWithoutSemicolon() throws Exception {
@Test
public void testMultipleMetaCharsetDeclarations() throws Exception {
// Given: HTML with multiple conflicting charset declarations
- final String html = ""
- + " "
- + " "
- + " "
- + "Content";
+ final String html =
+ "" + " " + " "
+ + " "
+ + "Content";
// When: Parsing
final Document doc = parseHTML(html);
@@ -442,11 +411,9 @@ public void testMultipleMetaCharsetDeclarations() throws Exception {
@Test
public void testMetaCharsetVariations() throws Exception {
// Given: HTML with various meta charset formats
- final String html = ""
- + " "
- + " "
- + " "
- + "Test";
+ final String html =
+ "" + " " + " "
+ + " " + "Test";
// When: Parsing
final Document doc = parseHTML(html);
@@ -460,8 +427,9 @@ public void testMetaCharsetVariations() throws Exception {
@Test
public void testXMLDeclarationVsMetaCharset() throws Exception {
// Given: HTML with both XML declaration and meta charset
- final String html = ""
- + " Content";
+ final String html =
+ ""
+ + " Content";
// When: Parsing
final Document doc = parseHTML(html);
@@ -478,14 +446,9 @@ public void testXMLDeclarationVsMetaCharset() throws Exception {
@Test
public void testNonBreakingSpaces() throws Exception {
// Given: HTML with various types of spaces
- final String html = ""
- + "Regular space "
- + "Non-breaking space "
- + "En\u2002space "
- + "Em\u2003space "
- + "Thin\u2009space "
- + "Hair\u200Aspace"
- + "";
+ final String html =
+ "" + "Regular space " + "Non-breaking space " + "En\u2002space " + "Em\u2003space " + "Thin\u2009space "
+ + "Hair\u200Aspace" + "";
// When: Parsing
final Document doc = parseHTML(html);
@@ -504,12 +467,9 @@ public void testNonBreakingSpaces() throws Exception {
@Test
public void testSpecialPunctuationCharacters() throws Exception {
// Given: HTML with special punctuation
- final String html = ""
- + "Quotes: \u201C\u201D \u2018\u2019 "
- + "Dashes: \u2013 \u2014 "
- + "Ellipsis: \u2026 "
- + "Bullet: \u2022 "
- + "";
+ final String html =
+ "" + "Quotes: \u201C\u201D \u2018\u2019 " + "Dashes: \u2013 \u2014 " + "Ellipsis: \u2026 " + "Bullet: \u2022 "
+ + "";
// When: Parsing
final Document doc = parseHTML(html);
@@ -526,9 +486,7 @@ public void testSpecialPunctuationCharacters() throws Exception {
@Test
public void testMathematicalSymbols() throws Exception {
// Given: HTML with mathematical symbols
- final String html = ""
- + "∞ ≠ ≤ ≥ ± × ÷ √ ∑ ∏ ∫ ∂ ∇"
- + "";
+ final String html = "" + "∞ ≠ ≤ ≥ ± × ÷ √ ∑ ∏ ∫ ∂ ∇" + "";
// When: Parsing
final Document doc = parseHTML(html);
@@ -544,9 +502,7 @@ public void testMathematicalSymbols() throws Exception {
@Test
public void testCurrencySymbols() throws Exception {
// Given: HTML with various currency symbols
- final String html = ""
- + "$ € £ ¥ ₹ ₽ ₩ ¢ ฿"
- + "";
+ final String html = "" + "$ € £ ¥ ₹ ₽ ₩ ¢ ฿" + "";
// When: Parsing
final Document doc = parseHTML(html);
@@ -562,11 +518,9 @@ public void testCurrencySymbols() throws Exception {
@Test
public void testMixedDirectionalText() throws Exception {
// Given: HTML with mixed LTR and RTL text
- final String html = ""
- + "Left-to-right text with עברית embedded
"
- + "טקסט מימין לשמאל with English embedded
"
- + "Mixed: Hello שלום مرحبا
"
- + "";
+ final String html =
+ "" + "Left-to-right text with עברית embedded
"
+ + "טקסט מימין לשמאל with English embedded
" + "Mixed: Hello שלום مرحبا
" + "";
// When: Parsing
final Document doc = parseHTML(html);
diff --git a/src/test/java/org/codelibs/nekohtml/HTML5SemanticElementsIntegrationTest.java b/src/test/java/org/codelibs/nekohtml/HTML5SemanticElementsIntegrationTest.java
index e87b42d..960cad5 100644
--- a/src/test/java/org/codelibs/nekohtml/HTML5SemanticElementsIntegrationTest.java
+++ b/src/test/java/org/codelibs/nekohtml/HTML5SemanticElementsIntegrationTest.java
@@ -91,7 +91,8 @@ public void testSearchElementInHeader() throws Exception {
@Test
public void testSearchElementInAside() throws Exception {
// Given: HTML with SEARCH in ASIDE (common pattern for sidebar search)
- final String html = "Main content ";
+ final String html =
+ "Main content ";
// When: Parsing
final Document doc = parseHTML(html);
@@ -108,10 +109,10 @@ public void testSearchElementInAside() throws Exception {
@Test
public void testMultipleSearchElements() throws Exception {
// Given: HTML with multiple SEARCH elements
- final String html = ""
- + ""
- + " "
- + "";
+ final String html =
+ "" + ""
+ + " "
+ + "";
// When: Parsing
final Document doc = parseHTML(html);
@@ -124,14 +125,10 @@ public void testMultipleSearchElements() throws Exception {
@Test
public void testSearchWithAutocompleteAndDatalist() throws Exception {
// Given: HTML with SEARCH containing autocomplete and datalist
- final String html = ""
- + " "
- + ""
- + ""
- + " "
- + " "
- + " "
- + " ";
+ final String html =
+ "" + " "
+ + "" + "" + " "
+ + " " + " " + " ";
// When: Parsing
final Document doc = parseHTML(html);
@@ -155,11 +152,9 @@ public void testSearchWithAutocompleteAndDatalist() throws Exception {
@Test
public void testSearchClosingWithAdjacentBlockElements() throws Exception {
// Given: HTML with SEARCH followed by other block elements
- final String html = ""
- + " "
- + " "
- + "Content "
- + "";
+ final String html =
+ "" + " " + " "
+ + "Content " + "";
// When: Parsing
final Document doc = parseHTML(html);
@@ -239,11 +234,10 @@ public void testSlotWithFallbackContent() throws Exception {
@Test
public void testMultipleSlotsInTemplate() throws Exception {
// Given: HTML with multiple SLOTs in TEMPLATE
- final String html = ""
- + ""
- + "Default Content "
- + ""
- + " ";
+ final String html =
+ "" + ""
+ + "Default Content " + ""
+ + " ";
// When: Parsing
final Document doc = parseHTML(html);
@@ -265,7 +259,8 @@ public void testMultipleSlotsInTemplate() throws Exception {
@Test
public void testNestedSlotElements() throws Exception {
// Given: HTML with nested SLOTs (edge case)
- final String html = " ";
+ final String html =
+ " ";
// When: Parsing
final Document doc = parseHTML(html);
@@ -302,14 +297,9 @@ public void testHgroupWithMultipleHeadings() throws Exception {
@Test
public void testHgroupWithAllHeadingLevels() throws Exception {
// Given: HTML with HGROUP containing H1-H6
- final String html = ""
- + "Level 1 "
- + "Level 2 "
- + "Level 3 "
- + "Level 4 "
- + "Level 5 "
- + "Level 6 "
- + " ";
+ final String html =
+ "" + "Level 1 " + "Level 2 " + "Level 3 " + "Level 4 "
+ + "Level 5 " + "Level 6 " + " ";
// When: Parsing
final Document doc = parseHTML(html);
@@ -362,7 +352,8 @@ public void testHgroupInHeader() throws Exception {
@Test
public void testHgroupInArticle() throws Exception {
// Given: HTML with HGROUP in ARTICLE
- final String html = "Article Title Author Name Article content
";
+ final String html =
+ "Article Title Author Name Article content
";
// When: Parsing
final Document doc = parseHTML(html);
@@ -379,7 +370,8 @@ public void testHgroupInArticle() throws Exception {
@Test
public void testHgroupInSection() throws Exception {
// Given: HTML with HGROUP in SECTION
- final String html = "Section Title Section Subtitle Content
";
+ final String html =
+ "Section Title Section Subtitle Content
";
// When: Parsing
final Document doc = parseHTML(html);
@@ -396,10 +388,10 @@ public void testHgroupInSection() throws Exception {
@Test
public void testMultipleHgroups() throws Exception {
// Given: HTML with multiple HGROUPs
- final String html = ""
- + "First Article First Subtitle Content 1
"
- + "Second Article Second Subtitle Content 2
"
- + "";
+ final String html =
+ "" + "First Article First Subtitle Content 1
"
+ + "Second Article Second Subtitle Content 2
"
+ + "";
// When: Parsing
final Document doc = parseHTML(html);
@@ -437,25 +429,13 @@ public void testHgroupWithBlockElement() throws Exception {
@Test
public void testComplexSemanticStructureWithNewElements() throws Exception {
// Given: Complex HTML using SEARCH, SLOT, HGROUP together
- final String html = ""
- + ""
- + ""
- + ""
- + "Article Title Article Subtitle "
- + "Article content here
"
- + " "
- + ""
- + " "
- + ""
- + "Default Title "
- + "Default content
"
- + " "
- + "";
+ final String html =
+ "" + "" + "" + ""
+ + "Article Title Article Subtitle " + "Article content here
" + " "
+ + ""
+ + " " + "" + "Default Title "
+ + "Default content
" + " " + "";
// When: Parsing
final Document doc = parseHTML(html);
@@ -477,7 +457,8 @@ public void testComplexSemanticStructureWithNewElements() throws Exception {
@Test
public void testSearchInMain() throws Exception {
// Given: SEARCH in MAIN element
- final String html = " ";
+ final String html =
+ " ";
// When: Parsing
final Document doc = parseHTML(html);
@@ -494,7 +475,8 @@ public void testSearchInMain() throws Exception {
@Test
public void testHgroupInFooter() throws Exception {
// Given: HGROUP in FOOTER (less common but valid)
- final String html = "Footer Section Additional Info Footer content
";
+ final String html =
+ "Footer Section Additional Info Footer content
";
// When: Parsing
final Document doc = parseHTML(html);
@@ -511,15 +493,12 @@ public void testHgroupInFooter() throws Exception {
@Test
public void testSearchWithComplexFormElements() throws Exception {
// Given: SEARCH with complex form containing multiple inputs
- final String html = " ";
+ final String html =
+ " ";
// When: Parsing
final Document doc = parseHTML(html);
@@ -539,7 +518,8 @@ public void testSearchWithComplexFormElements() throws Exception {
@Test
public void testSlotInCustomElement() throws Exception {
// Given: SLOT in custom element context
- final String html = " ";
+ final String html =
+ " ";
// When: Parsing
final Document doc = parseHTML(html);
diff --git a/src/test/java/org/codelibs/nekohtml/PerformanceStressTest.java b/src/test/java/org/codelibs/nekohtml/PerformanceStressTest.java
index 487d4dd..ffa41a0 100644
--- a/src/test/java/org/codelibs/nekohtml/PerformanceStressTest.java
+++ b/src/test/java/org/codelibs/nekohtml/PerformanceStressTest.java
@@ -439,15 +439,10 @@ public void testComplexNestedStructure() throws Exception {
// Given: Complex realistic document structure
final StringBuilder html = new StringBuilder("Test ");
for (int i = 0; i < 100; i++) {
- html.append("")
- .append("Title ").append(i).append(" ")
- .append("")
- .append("Paragraph 1
")
- .append("Paragraph 2
")
- .append("")
- .append(" ")
- .append("")
- .append(" ");
+ html.append("").append("Title ").append(i).append(" ").append("")
+ .append("Paragraph 1
").append("Paragraph 2
")
+ .append("").append(" ")
+ .append("").append(" ");
}
html.append("");
diff --git a/src/test/java/org/codelibs/nekohtml/ThreadSafetyTest.java b/src/test/java/org/codelibs/nekohtml/ThreadSafetyTest.java
index b089f35..405f3ae 100644
--- a/src/test/java/org/codelibs/nekohtml/ThreadSafetyTest.java
+++ b/src/test/java/org/codelibs/nekohtml/ThreadSafetyTest.java
@@ -44,11 +44,8 @@
public class ThreadSafetyTest {
private static final String SIMPLE_HTML = "Content
";
- private static final String COMPLEX_HTML = "Test "
- + ""
- + ""
- + " "
- + "";
+ private static final String COMPLEX_HTML = "Test " + ""
+ + "" + " " + "";
// ========================================================================
// Concurrent Parsing with Separate Parser Instances
@@ -104,8 +101,8 @@ public void testConcurrentParsingWithSeparateParsers100Threads() throws Exceptio
public Boolean call() throws Exception {
try {
final DOMParser parser = new DOMParser();
- final String html = "Thread " + threadId + " "
- + "Paragraph " + threadId + "
";
+ final String html =
+ "Thread " + threadId + " " + "Paragraph " + threadId + "
";
parser.parse(new InputSource(new StringReader(html)));
final Document doc = parser.getDocument();
@@ -137,10 +134,10 @@ public void testConcurrentParsingDifferentDocumentTypes() throws Exception {
final ExecutorService executor = Executors.newFixedThreadPool(10);
final List> futures = new ArrayList<>();
- final String[] htmlTypes = new String[] { SIMPLE_HTML, COMPLEX_HTML,
- "",
- "",
- "" };
+ final String[] htmlTypes =
+ new String[] { SIMPLE_HTML, COMPLEX_HTML, "",
+ "",
+ "" };
// When: Threads parse different document types
for (int i = 0; i < threadCount; i++) {
@@ -311,9 +308,10 @@ public void testConcurrentParsingWithMalformedHTML() throws Exception {
final AtomicInteger successCount = new AtomicInteger(0);
final List> futures = new ArrayList<>();
- final String[] malformedHTML = new String[] { "Unclosed div", "
",
- "
", "
Paragraph
Block
continues",
- "
" };
+ final String[] malformedHTML =
+ new String[] { "
Unclosed div", "
",
+ "
",
+ "
Paragraph
Block
continues", "
" };
// When: Threads parse malformed HTML
for (int i = 0; i < threadCount; i++) {
diff --git a/src/test/java/org/codelibs/nekohtml/parsers/AttributeEdgeCasesTest.java b/src/test/java/org/codelibs/nekohtml/parsers/AttributeEdgeCasesTest.java
index 5398821..9d6be7f 100644
--- a/src/test/java/org/codelibs/nekohtml/parsers/AttributeEdgeCasesTest.java
+++ b/src/test/java/org/codelibs/nekohtml/parsers/AttributeEdgeCasesTest.java
@@ -89,7 +89,8 @@ public void testDuplicateAttributeNames() throws Exception {
@Test
public void testAttributesWithColons() throws Exception {
// Given: Attributes with colons (namespace-like)
- final String html = "
Content
";
+ final String html =
+ "
Content
";
// When: Parsing
final Document doc = parseHTML(html);
@@ -317,13 +318,9 @@ public void testAttributeValueWithBackslashes() throws Exception {
@Test
public void testBooleanAttributesWithoutValues() throws Exception {
// Given: Boolean attributes without values
- final String html = ""
- + "
"
- + "
"
- + "
"
- + "
Button "
- + ""
- + "";
+ final String html =
+ "" + "
" + "
" + "
"
+ + "
Button " + "" + "";
// When: Parsing
final Document doc = parseHTML(html);
@@ -337,12 +334,10 @@ public void testBooleanAttributesWithoutValues() throws Exception {
@Test
public void testBooleanAttributesWithValues() throws Exception {
// Given: Boolean attributes with values (valid in HTML)
- final String html = ""
- + "
"
- + "
"
- + "
"
- + "
Option "
- + "";
+ final String html =
+ "" + "
" + "
"
+ + "
" + "
Option "
+ + "";
// When: Parsing
final Document doc = parseHTML(html);
@@ -356,11 +351,9 @@ public void testBooleanAttributesWithValues() throws Exception {
@Test
public void testBooleanAttributesWithInvalidValues() throws Exception {
// Given: Boolean attributes with invalid values
- final String html = ""
- + "
"
- + "
"
- + "
"
- + "";
+ final String html =
+ "" + "
" + "
"
+ + "
" + "";
// When: Parsing
final Document doc = parseHTML(html);
@@ -374,18 +367,11 @@ public void testBooleanAttributesWithInvalidValues() throws Exception {
@Test
public void testAllBooleanHTMLAttributes() throws Exception {
// Given: All common boolean HTML attributes
- final String html = ""
- + "
"
- + "
Button "
- + "
Video "
- + "
List "
- + "
Details "
- + ""
- + "
"
- + "
"
- + "Option "
- + "Group "
- + "";
+ final String html =
+ "" + " " + "Button "
+ + "Video " + "List "
+ + "Details " + "" + ""
+ + "" + "Option " + "Group " + "";
// When: Parsing
final Document doc = parseHTML(html);
@@ -434,11 +420,8 @@ public void testMultipleSpacesBetweenAttributes() throws Exception {
@Test
public void testAttributesWithNewlinesBetweenThem() throws Exception {
// Given: Attributes on multiple lines
- final String html = "Content
";
+ final String html =
+ "Content
";
// When: Parsing
final Document doc = parseHTML(html);
@@ -474,9 +457,10 @@ public void testManyAttributesOnSingleElement() throws Exception {
@Test
public void testDataAttributes() throws Exception {
// Given: Various data attributes
- final String html = ""
- + "Content
"
- + "";
+ final String html =
+ ""
+ + "Content
"
+ + "";
// When: Parsing
final Document doc = parseHTML(html);
@@ -493,9 +477,10 @@ public void testDataAttributes() throws Exception {
@Test
public void testAriaAttributes() throws Exception {
// Given: ARIA attributes
- final String html = ""
- + "Button
"
- + "";
+ final String html =
+ ""
+ + "Button
"
+ + "";
// When: Parsing
final Document doc = parseHTML(html);
@@ -512,9 +497,10 @@ public void testAriaAttributes() throws Exception {
@Test
public void testEventHandlerAttributes() throws Exception {
// Given: Event handler attributes
- final String html = ""
- + "Button "
- + "";
+ final String html =
+ ""
+ + "Button "
+ + "";
// When: Parsing
final Document doc = parseHTML(html);
@@ -530,9 +516,10 @@ public void testEventHandlerAttributes() throws Exception {
@Test
public void testStyleAttribute() throws Exception {
// Given: Style attribute with complex CSS
- final String html = ""
- + "Content
"
- + "";
+ final String html =
+ ""
+ + "Content
"
+ + "";
// When: Parsing
final Document doc = parseHTML(html);
@@ -565,11 +552,10 @@ public void testClassAttributeWithMultipleValues() throws Exception {
@Test
public void testAttributeWithURL() throws Exception {
// Given: Attributes with URLs
- final String html = ""
- + "Link "
- + " "
- + " "
- + "";
+ final String html =
+ "" + "Link "
+ + " " + " "
+ + "";
// When: Parsing
final Document doc = parseHTML(html);
diff --git a/src/test/java/org/codelibs/nekohtml/parsers/BrowserQuirksIntegrationTest.java b/src/test/java/org/codelibs/nekohtml/parsers/BrowserQuirksIntegrationTest.java
index 4d8de62..117008d 100644
--- a/src/test/java/org/codelibs/nekohtml/parsers/BrowserQuirksIntegrationTest.java
+++ b/src/test/java/org/codelibs/nekohtml/parsers/BrowserQuirksIntegrationTest.java
@@ -123,8 +123,9 @@ public void testImpliedColgroup() throws Exception {
@Test
public void testTableWithMissingCloseTags() throws Exception {
// Given: Table with missing close tags (common in legacy HTML)
- final String html = "" + "Row 1, Cell 1 Row 1, Cell 2" + " Row 2, Cell 1 Row 2, Cell 2"
- + "
";
+ final String html =
+ "" + "Row 1, Cell 1 Row 1, Cell 2" + " Row 2, Cell 1 Row 2, Cell 2"
+ + "
";
// When: Parsing
final Document doc = parseHTML(html);
@@ -138,8 +139,9 @@ public void testTableWithMissingCloseTags() throws Exception {
@Test
public void testNestedTables() throws Exception {
// Given: Nested tables
- final String html = "" + ""
- + "";
+ final String html =
+ "" + ""
+ + "";
// When: Parsing
final Document doc = parseHTML(html);
@@ -151,7 +153,8 @@ public void testNestedTables() throws Exception {
@Test
public void testTableWithCaptionAfterRows() throws Exception {
// Given: Table with CAPTION after TR (invalid but common)
- final String html = "" + "Cell " + "Table Caption " + "
";
+ final String html =
+ "" + "Cell " + "Table Caption " + "
";
// When: Parsing
final Document doc = parseHTML(html);
@@ -168,8 +171,9 @@ public void testTableWithCaptionAfterRows() throws Exception {
@Test
public void testFormWithOrphanedInputs() throws Exception {
// Given: Form with inputs outside form tag
- final String html = "" + ""
- + " " + "";
+ final String html =
+ "" + ""
+ + " " + "";
// When: Parsing
final Document doc = parseHTML(html);
@@ -182,8 +186,9 @@ public void testFormWithOrphanedInputs() throws Exception {
@Test
public void testNestedForms() throws Exception {
// Given: Nested forms (invalid HTML but may appear)
- final String html = "" + ""
- + "" + "";
+ final String html =
+ "" + "" + ""
+ + "";
// When: Parsing
final Document doc = parseHTML(html);
@@ -196,8 +201,8 @@ public void testNestedForms() throws Exception {
@Test
public void testFormWithSelectWithoutClosingOption() throws Exception {
// Given: SELECT with unclosed OPTION tags
- final String html = "" + "" + "Option 1" + " Option 2" + " Option 3" + " "
- + "";
+ final String html =
+ "" + "" + "Option 1" + " Option 2" + " Option 3" + " " + "";
// When: Parsing
final Document doc = parseHTML(html);
@@ -227,8 +232,9 @@ public void testUnclosedListItems() throws Exception {
@Test
public void testNestedListsWithUnclosedItems() throws Exception {
// Given: Nested lists with unclosed LI
- final String html = "" + "" + "Item 1" + "" + "Nested 1" + " Nested 2" + " " + " Item 2"
- + " " + "";
+ final String html =
+ "" + "" + "Item 1" + "" + "Nested 1" + " Nested 2" + " " + " Item 2" + " "
+ + "";
// When: Parsing
final Document doc = parseHTML(html);
@@ -241,8 +247,9 @@ public void testNestedListsWithUnclosedItems() throws Exception {
@Test
public void testDefinitionListQuirks() throws Exception {
// Given: DL with unclosed DT/DD
- final String html = "" + "" + "Term 1" + " Definition 1" + " Term 2" + " Definition 2" + " "
- + "";
+ final String html =
+ "" + "" + "Term 1" + " Definition 1" + " Term 2" + " Definition 2" + " "
+ + "";
// When: Parsing
final Document doc = parseHTML(html);
@@ -333,8 +340,8 @@ public void testBlockInsideInline() throws Exception {
@Test
public void testScriptInBody() throws Exception {
// Given: Script in body (valid but tested for quirks)
- final String html = "" + "Before script
" + "" + "After script
"
- + "";
+ final String html =
+ "" + "Before script
" + "" + "After script
" + "";
// When: Parsing
final Document doc = parseHTML(html);
@@ -431,8 +438,9 @@ public void testHTML5Doctype() throws Exception {
@Test
public void testHTML4StrictDoctype() throws Exception {
// Given: HTML 4.01 Strict DOCTYPE
- final String html = "" + "HTML 4.01
";
+ final String html =
+ ""
+ + "HTML 4.01
";
// When: Parsing
final Document doc = parseHTML(html);
@@ -444,8 +452,9 @@ public void testHTML4StrictDoctype() throws Exception {
@Test
public void testXHTMLDoctype() throws Exception {
// Given: XHTML DOCTYPE
- final String html = "" + "XHTML
";
+ final String html =
+ ""
+ + "XHTML
";
// When: Parsing
final Document doc = parseHTML(html);
@@ -486,7 +495,8 @@ public void testMetaCharsetHTML5() throws Exception {
@Test
public void testMetaContentType() throws Exception {
// Given: Legacy content-type meta
- final String html = " Test";
+ final String html =
+ " Test";
// When: Parsing
final Document doc = parseHTML(html);
@@ -516,7 +526,8 @@ public void testVoidElementsWithClosingTags() throws Exception {
@Test
public void testVoidElementsWithSlash() throws Exception {
// Given: Void elements with trailing slash (XHTML style)
- final String html = "" + " " + " " + " " + " " + "";
+ final String html =
+ "" + " " + " " + " " + " " + "";
// When: Parsing
final Document doc = parseHTML(html);
@@ -533,12 +544,13 @@ public void testVoidElementsWithSlash() throws Exception {
@Test
public void testTypicalWebPageStructure() throws Exception {
// Given: Typical web page structure
- final String html = "" + "" + "" + " "
- + " " + "Test Page "
- + " " + "" + "" + "" + "" + "" + ""
- + "Article Title " + "Article content.
" + " " + " " + "" + "" + "";
+ final String html =
+ "" + "" + "" + " "
+ + " " + "Test Page "
+ + " " + "" + "" + ""
+ + "" + "" + ""
+ + "Article Title " + "Article content.
" + " " + " " + "" + "" + "";
// When: Parsing
final Document doc = parseHTML(html);
@@ -553,11 +565,11 @@ public void testTypicalWebPageStructure() throws Exception {
@Test
public void testTypicalFormStructure() throws Exception {
// Given: Typical form structure
- final String html = "" + "" + "";
+ final String html =
+ "" + "" + "";
// When: Parsing
final Document doc = parseHTML(html);
diff --git a/src/test/java/org/codelibs/nekohtml/parsers/ComplexTableStructuresTest.java b/src/test/java/org/codelibs/nekohtml/parsers/ComplexTableStructuresTest.java
index fb5ca19..dcfbb71 100644
--- a/src/test/java/org/codelibs/nekohtml/parsers/ComplexTableStructuresTest.java
+++ b/src/test/java/org/codelibs/nekohtml/parsers/ComplexTableStructuresTest.java
@@ -54,11 +54,9 @@ private Document parseHTML(final String html) throws Exception {
@Test
public void testTableWithAllSections() throws Exception {
// Given: Table with THEAD, TBODY, TFOOT
- final String html = ""
- + "Header "
- + "Body "
- + "Footer "
- + "
";
+ final String html =
+ "" + "Header " + "Body "
+ + "Footer " + "
";
// When: Parsing
final Document doc = parseHTML(html);
@@ -74,11 +72,9 @@ public void testTableWithAllSections() throws Exception {
@Test
public void testTableWithMultipleTbody() throws Exception {
// Given: Table with multiple TBODY elements (valid HTML5)
- final String html = ""
- + "Group 1 Row 1 "
- + "Group 2 Row 1 "
- + "Group 3 Row 1 "
- + "
";
+ final String html =
+ "" + "Group 1 Row 1 " + "Group 2 Row 1 "
+ + "Group 3 Row 1 " + "
";
// When: Parsing
final Document doc = parseHTML(html);
@@ -92,10 +88,8 @@ public void testTableWithMultipleTbody() throws Exception {
@Test
public void testTableWithCaption() throws Exception {
// Given: Table with CAPTION
- final String html = ""
- + "Table Caption "
- + "Cell "
- + "
";
+ final String html =
+ "" + "Table Caption " + "Cell " + "
";
// When: Parsing
final Document doc = parseHTML(html);
@@ -109,14 +103,10 @@ public void testTableWithCaption() throws Exception {
@Test
public void testTableWithColgroup() throws Exception {
// Given: Table with COLGROUP and COL
- final String html = ""
- + ""
- + " "
- + " "
- + " "
- + " "
- + "Cell 1 Cell 2 Cell 3 "
- + "
";
+ final String html =
+ "" + "" + " " + " "
+ + " " + " "
+ + "Cell 1 Cell 2 Cell 3 " + "
";
// When: Parsing
final Document doc = parseHTML(html);
@@ -130,10 +120,9 @@ public void testTableWithColgroup() throws Exception {
@Test
public void testTableWithColgroupSpan() throws Exception {
// Given: COLGROUP with span attribute
- final String html = ""
- + " "
- + "Cell 1 Cell 2 Cell 3 "
- + "
";
+ final String html =
+ "" + " "
+ + "Cell 1 Cell 2 Cell 3 " + "
";
// When: Parsing
final Document doc = parseHTML(html);
@@ -148,19 +137,11 @@ public void testTableWithColgroupSpan() throws Exception {
@Test
public void testComplexTableWithAllElements() throws Exception {
// Given: Complex table with all possible elements
- final String html = ""
- + "Complete Table "
- + ""
- + " "
- + " "
- + " "
- + "Header 1 Header 2 "
- + ""
- + "Data 1-1 Data 1-2 "
- + "Data 2-1 Data 2-2 "
- + " "
- + "Footer 1 Footer 2 "
- + "
";
+ final String html =
+ "" + "Complete Table " + "" + " "
+ + " " + " " + "Header 1 Header 2 "
+ + "" + "Data 1-1 Data 1-2 " + "Data 2-1 Data 2-2 "
+ + " " + "Footer 1 Footer 2 " + "
";
// When: Parsing
final Document doc = parseHTML(html);
@@ -182,10 +163,7 @@ public void testComplexTableWithAllElements() throws Exception {
@Test
public void testTableWithTrDirectlyInTable() throws Exception {
// Given: TR directly in TABLE (missing TBODY)
- final String html = ""
- + "Cell 1 "
- + "Cell 2 "
- + "
";
+ final String html = "" + "Cell 1 " + "Cell 2 " + "
";
// When: Parsing
final Document doc = parseHTML(html);
@@ -200,10 +178,7 @@ public void testTableWithTrDirectlyInTable() throws Exception {
@Test
public void testTableWithTdDirectlyInTable() throws Exception {
// Given: TD directly in TABLE (missing TBODY and TR)
- final String html = ""
- + "Cell 1 "
- + "Cell 2 "
- + "
";
+ final String html = "" + "Cell 1 " + "Cell 2 " + "
";
// When: Parsing
final Document doc = parseHTML(html);
@@ -217,10 +192,9 @@ public void testTableWithTdDirectlyInTable() throws Exception {
@Test
public void testTableWithTheadAfterTbody() throws Exception {
// Given: THEAD after TBODY (incorrect order)
- final String html = ""
- + "Body "
- + "Header "
- + "
";
+ final String html =
+ "" + "Body " + "Header "
+ + "
";
// When: Parsing
final Document doc = parseHTML(html);
@@ -234,13 +208,9 @@ public void testTableWithTheadAfterTbody() throws Exception {
@Test
public void testTableWithMixedElements() throws Exception {
// Given: Table with elements in mixed/wrong order
- final String html = ""
- + "Row 1 "
- + "Header "
- + "Row 2 "
- + "Footer "
- + "Row 3 "
- + "
";
+ final String html =
+ "" + "Row 1 " + "Header " + "Row 2 "
+ + "Footer " + "Row 3 " + "
";
// When: Parsing
final Document doc = parseHTML(html);
@@ -253,11 +223,9 @@ public void testTableWithMixedElements() throws Exception {
@Test
public void testTableWithUnclosedRows() throws Exception {
// Given: Table with unclosed TR tags
- final String html = ""
- + "Cell 1-1 Cell 1-2 "
- + "Cell 2-1 Cell 2-2 "
- + "Cell 3-1 Cell 3-2 "
- + "
";
+ final String html =
+ "" + "Cell 1-1 Cell 1-2 " + "Cell 2-1 Cell 2-2 "
+ + "Cell 3-1 Cell 3-2 " + "
";
// When: Parsing
final Document doc = parseHTML(html);
@@ -271,9 +239,7 @@ public void testTableWithUnclosedRows() throws Exception {
@Test
public void testTableWithUnclosedCells() throws Exception {
// Given: Table with unclosed TD tags
- final String html = ""
- + "Cell 1 Cell 2 Cell 3 "
- + "
";
+ final String html = "" + "Cell 1 Cell 2 Cell 3 " + "
";
// When: Parsing
final Document doc = parseHTML(html);
@@ -290,10 +256,9 @@ public void testTableWithUnclosedCells() throws Exception {
@Test
public void testTableWithColspan() throws Exception {
// Given: Table with COLSPAN
- final String html = ""
- + "Spans 2 columns Cell 3 "
- + "Cell 1 Cell 2 Cell 3 "
- + "
";
+ final String html =
+ "" + "Spans 2 columns Cell 3 "
+ + "Cell 1 Cell 2 Cell 3 " + "
";
// When: Parsing
final Document doc = parseHTML(html);
@@ -310,10 +275,9 @@ public void testTableWithColspan() throws Exception {
@Test
public void testTableWithRowspan() throws Exception {
// Given: Table with ROWSPAN
- final String html = ""
- + "Spans 2 rows Cell 1-2 "
- + "Cell 2-2 "
- + "
";
+ final String html =
+ "" + "Spans 2 rows Cell 1-2 " + "Cell 2-2 "
+ + "
";
// When: Parsing
final Document doc = parseHTML(html);
@@ -330,11 +294,10 @@ public void testTableWithRowspan() throws Exception {
@Test
public void testTableWithComplexSpans() throws Exception {
// Given: Table with complex COLSPAN and ROWSPAN
- final String html = ""
- + "Spans 2x2 Cell 1-3 "
- + "Cell 2-3 "
- + "Cell 3-1 Cell 3-2 Cell 3-3 "
- + "
";
+ final String html =
+ "" + "Spans 2x2 Cell 1-3 "
+ + "Cell 2-3 " + "Cell 3-1 Cell 3-2 Cell 3-3 "
+ + "
";
// When: Parsing
final Document doc = parseHTML(html);
@@ -352,10 +315,9 @@ public void testTableWithComplexSpans() throws Exception {
@Test
public void testTableWithZeroSpan() throws Exception {
// Given: Table with colspan/rowspan=0 (special value)
- final String html = ""
- + "Spans to end "
- + "Cell 1 Cell 2 Cell 3 "
- + "
";
+ final String html =
+ "" + "Spans to end "
+ + "Cell 1 Cell 2 Cell 3 " + "
";
// When: Parsing
final Document doc = parseHTML(html);
@@ -369,9 +331,7 @@ public void testTableWithZeroSpan() throws Exception {
@Test
public void testTableWithVeryLargeSpan() throws Exception {
// Given: Table with very large span value
- final String html = "";
+ final String html = "";
// When: Parsing
final Document doc = parseHTML(html);
@@ -389,10 +349,9 @@ public void testTableWithVeryLargeSpan() throws Exception {
@Test
public void testNestedTables2Levels() throws Exception {
// Given: 2 levels of nested tables
- final String html = ""
- + "Outer cell 1 "
- + " "
- + "
";
+ final String html =
+ "" + "Outer cell 1 " + " "
+ + "
";
// When: Parsing
final Document doc = parseHTML(html);
@@ -405,15 +364,10 @@ public void testNestedTables2Levels() throws Exception {
@Test
public void testNestedTables5Levels() throws Exception {
// Given: 5 levels of nested tables
- final String html = "";
+ final String html =
+ "";
// When: Parsing
final Document doc = parseHTML(html);
@@ -426,11 +380,10 @@ public void testNestedTables5Levels() throws Exception {
@Test
public void testNestedTableInTheadTbodyTfoot() throws Exception {
// Given: Nested tables in different sections
- final String html = ""
- + " "
- + " "
- + " "
- + "
";
+ final String html =
+ "";
// When: Parsing
final Document doc = parseHTML(html);
@@ -460,10 +413,9 @@ public void testEmptyTable() throws Exception {
@Test
public void testTableWithEmptyCells() throws Exception {
// Given: Table with empty cells
- final String html = ""
- + " "
- + "Content "
- + "
";
+ final String html =
+ "";
// When: Parsing
final Document doc = parseHTML(html);
@@ -476,10 +428,7 @@ public void testTableWithEmptyCells() throws Exception {
@Test
public void testTableWithOnlyTheadEmpty() throws Exception {
// Given: Table with empty THEAD
- final String html = ""
- + " "
- + "Body "
- + "
";
+ final String html = "";
// When: Parsing
final Document doc = parseHTML(html);
@@ -496,12 +445,9 @@ public void testTableWithOnlyTheadEmpty() throws Exception {
@Test
public void testTableWithThInTbody() throws Exception {
// Given: TH elements in TBODY (valid for row headers)
- final String html = ""
- + ""
- + "Row 1 Header Data 1 "
- + "Row 2 Header Data 2 "
- + " "
- + "
";
+ final String html =
+ "" + "" + "Row 1 Header Data 1 "
+ + "Row 2 Header Data 2 " + " " + "
";
// When: Parsing
final Document doc = parseHTML(html);
@@ -515,9 +461,8 @@ public void testTableWithThInTbody() throws Exception {
@Test
public void testTableWithOnlyTh() throws Exception {
// Given: Table with only TH elements
- final String html = ""
- + "Header 1 Header 2 Header 3 "
- + "
";
+ final String html =
+ "" + "Header 1 Header 2 Header 3 " + "
";
// When: Parsing
final Document doc = parseHTML(html);
@@ -531,10 +476,9 @@ public void testTableWithOnlyTh() throws Exception {
@Test
public void testTableWithThAttributes() throws Exception {
// Given: TH with scope, headers, and colspan attributes
- final String html = ""
- + "Header "
- + "Data 1 Data 2 "
- + "
";
+ final String html =
+ "" + "Header "
+ + "Data 1 Data 2 " + "
";
// When: Parsing
final Document doc = parseHTML(html);
@@ -554,17 +498,11 @@ public void testTableWithThAttributes() throws Exception {
@Test
public void testDataTableWithSorting() throws Exception {
// Given: Data table with sorting attributes
- final String html = ""
- + ""
- + "Name "
- + "Age "
- + "Date "
- + " "
- + ""
- + "John 30 2025-01-01 "
- + "Jane 25 2025-01-02 "
- + " "
- + "
";
+ final String html =
+ "" + "" + "Name " + "Age "
+ + "Date " + " " + ""
+ + "John 30 2025-01-01 " + "Jane 25 2025-01-02 "
+ + " " + "
";
// When: Parsing
final Document doc = parseHTML(html);
@@ -581,11 +519,10 @@ public void testDataTableWithSorting() throws Exception {
@Test
public void testTableWithFormElements() throws Exception {
// Given: Table containing form elements
- final String html = "";
+ final String html =
+ "";
// When: Parsing
final Document doc = parseHTML(html);
@@ -600,12 +537,10 @@ public void testTableWithFormElements() throws Exception {
@Test
public void testTableWithComplexContent() throws Exception {
// Given: Table with various content types
- final String html = ""
- + " "
- + "Link "
- + " "
- + " "
- + "
";
+ final String html =
+ "" + " "
+ + "Link " + " "
+ + " " + "
";
// When: Parsing
final Document doc = parseHTML(html);
@@ -621,11 +556,9 @@ public void testTableWithComplexContent() throws Exception {
@Test
public void testTableWithMultipleCaption() throws Exception {
// Given: Table with multiple CAPTION elements (invalid but should handle)
- final String html = ""
- + "Caption 1 "
- + "Caption 2 "
- + "Cell "
- + "
";
+ final String html =
+ "" + "Caption 1 " + "Caption 2 " + "Cell "
+ + "
";
// When: Parsing
final Document doc = parseHTML(html);
@@ -638,13 +571,10 @@ public void testTableWithMultipleCaption() throws Exception {
@Test
public void testTableWithIrregularRows() throws Exception {
// Given: Table with rows of different cell counts
- final String html = ""
- + "Cell 1 "
- + "Cell 1 Cell 2 "
- + "Cell 1 Cell 2 Cell 3 "
- + "Cell 1 Cell 2 "
- + "Cell 1 "
- + "
";
+ final String html =
+ "" + "Cell 1 " + "Cell 1 Cell 2 "
+ + "Cell 1 Cell 2 Cell 3 " + "Cell 1 Cell 2 "
+ + "Cell 1 " + "
";
// When: Parsing
final Document doc = parseHTML(html);
diff --git a/src/test/java/org/codelibs/nekohtml/parsers/StrictModeTest.java b/src/test/java/org/codelibs/nekohtml/parsers/StrictModeTest.java
index 9c8c453..204a806 100644
--- a/src/test/java/org/codelibs/nekohtml/parsers/StrictModeTest.java
+++ b/src/test/java/org/codelibs/nekohtml/parsers/StrictModeTest.java
@@ -260,8 +260,7 @@ public void testHandlerMismatchedEndTag() throws Exception {
handler.startElement("", "div", "DIV", new org.xml.sax.helpers.AttributesImpl());
// End with wrong tag - should not throw in lenient mode
- assertDoesNotThrow(() -> handler.endElement("", "span", "SPAN"),
- "Lenient mode should handle mismatched end tag");
+ assertDoesNotThrow(() -> handler.endElement("", "span", "SPAN"), "Lenient mode should handle mismatched end tag");
}
@Test
@@ -275,8 +274,7 @@ public void testHandlerEndTagEmptyStack() throws Exception {
handler.endDocument(); // Clear stack
// End element with empty stack - should not throw
- assertDoesNotThrow(() -> handler.endElement("", "div", "DIV"),
- "Lenient mode should handle end tag with empty stack");
+ assertDoesNotThrow(() -> handler.endElement("", "div", "DIV"), "Lenient mode should handle end tag with empty stack");
}
@Test
@@ -287,8 +285,7 @@ public void testHandlerCharactersBeforeStartDocument() throws Exception {
final SAXToDOMHandler handler = new SAXToDOMHandler(builder);
// Characters before startDocument - should not throw
- assertDoesNotThrow(() -> handler.characters("test".toCharArray(), 0, 4),
- "Should handle characters before startDocument");
+ assertDoesNotThrow(() -> handler.characters("test".toCharArray(), 0, 4), "Should handle characters before startDocument");
}
@Test
@@ -301,8 +298,7 @@ public void testHandlerCommentInDocument() throws Exception {
handler.startDocument();
handler.startElement("", "html", "HTML", new org.xml.sax.helpers.AttributesImpl());
- assertDoesNotThrow(() -> handler.comment("This is a comment".toCharArray(), 0, 17),
- "Should handle comment in document");
+ assertDoesNotThrow(() -> handler.comment("This is a comment".toCharArray(), 0, 17), "Should handle comment in document");
}
@Test
@@ -345,9 +341,8 @@ public void testSkipDepthInLenientMode() throws Exception {
handler.endElement("", "p", "P");
handler.endElement("", "div", "DIV");
- // Should complete without throwing
- assertDoesNotThrow(() -> {
- }, "Skip depth should handle nested skipped elements");
+ // Should complete without throwing - verify handler accepted nested elements in skip mode
+ assertDoesNotThrow(() -> handler.endDocument(), "Skip depth should handle nested skipped elements");
}
// =========================================================================
@@ -358,8 +353,9 @@ public void testSkipDepthInLenientMode() throws Exception {
public void testDOMParserStrictModeWellFormed() throws Exception {
System.setProperty(PROPERTY_DOM_STRICT, "true");
- final String html = "" + "Well Formed " + ""
- + "" + "" + "";
+ final String html =
+ "" + "Well Formed " + "" + "" + "" + "";
final DOMParser parser = new DOMParser();
parser.parse(new InputSource(new StringReader(html)));
@@ -451,8 +447,7 @@ public void close() throws SecurityException {
}
public boolean hasWarningContaining(String substring) {
- return records.stream()
- .filter(r -> r.getLevel() == Level.WARNING)
+ return records.stream().filter(r -> r.getLevel() == Level.WARNING)
.anyMatch(r -> r.getMessage() != null && r.getMessage().contains(substring));
}
diff --git a/src/test/java/org/codelibs/nekohtml/sax/AdoptionAgencyAlgorithmExtendedTest.java b/src/test/java/org/codelibs/nekohtml/sax/AdoptionAgencyAlgorithmExtendedTest.java
index 71125db..bb2b500 100644
--- a/src/test/java/org/codelibs/nekohtml/sax/AdoptionAgencyAlgorithmExtendedTest.java
+++ b/src/test/java/org/codelibs/nekohtml/sax/AdoptionAgencyAlgorithmExtendedTest.java
@@ -58,22 +58,11 @@ private Document parseHTML(final String html) throws Exception {
@Test
public void testAllFormattingElementsBasic() throws Exception {
// Given: HTML with all formatting elements
- final String html = ""
- + "Link "
- + "Bold "
- + "Big "
- + "Code"
- + "Emphasis "
- + "Font "
- + "Italic "
- + "NoBreak "
- + "Strike "
- + "Small "
- + "Strike "
- + "Strong "
- + "Teletype "
- + "Underline "
- + "";
+ final String html =
+ "" + "Link " + "Bold " + "Big " + "Code" + "Emphasis "
+ + "Font " + "Italic " + "NoBreak " + "Strike " + "Small "
+ + "Strike " + "Strong " + "Teletype " + "Underline "
+ + "";
// When: Parsing
final Document doc = parseHTML(html);
@@ -170,15 +159,9 @@ public void testDeeplyNestedFormattingElements() throws Exception {
@Test
public void testFormattingElementsWithComplexBlockStructure() throws Exception {
// Given: Formatting elements with complex block structure
- final String html = ""
- + "Start"
- + "Div 1"
- + "
Para 1
"
- + "
Quote "
- + "
"
- + ""
- + "End "
- + "";
+ final String html =
+ "" + "Start" + "Div 1" + "
Para 1
" + "
Quote " + "
"
+ + "" + "End " + "";
// When: Parsing
final Document doc = parseHTML(html);
@@ -199,10 +182,9 @@ public void testFormattingElementsWithComplexBlockStructure() throws Exception {
@Test
public void testFormattingElementsWithIdenticalAttributes() throws Exception {
// Given: Multiple formatting elements with same attributes
- final String html = ""
- + "Link 1 Block
continues "
- + "Link 2 Para
continues "
- + "";
+ final String html =
+ "" + "Link 1 Block
continues "
+ + "Link 2 Para
continues " + "";
// When: Parsing
final Document doc = parseHTML(html);
@@ -216,9 +198,7 @@ public void testFormattingElementsWithIdenticalAttributes() throws Exception {
@Test
public void testFontElementWithAttributes() throws Exception {
// Given: FONT element with attributes crossing block
- final String html = ""
- + "Red text Block
continues "
- + "";
+ final String html = "" + "Red text Block
continues " + "";
// When: Parsing
final Document doc = parseHTML(html);
@@ -271,10 +251,10 @@ public void testFormattingElementInsideTableCell() throws Exception {
@Test
public void testComplexTableWithFormattingElements() throws Exception {
// Given: Complex table with formatting elements
- final String html = ""
- + "Bold Italic "
- + "Strong Para
Em Div
"
- + "
";
+ final String html =
+ "" + "Bold Italic "
+ + "Strong Para
Em Div
"
+ + "
";
// When: Parsing
final Document doc = parseHTML(html);
@@ -309,10 +289,9 @@ public void testFormattingElementAcrossList() throws Exception {
@Test
public void testFormattingElementInListItems() throws Exception {
// Given: Formatting elements in list items
- final String html = ""
- + "Bold Block
continues "
- + "Italic Para
continues "
- + " ";
+ final String html =
+ "" + "Bold Block
continues " + "Italic Para
continues "
+ + " ";
// When: Parsing
final Document doc = parseHTML(html);
@@ -345,11 +324,9 @@ public void testNestedListsWithFormattingElements() throws Exception {
@Test
public void testFormattingElementsAcrossSemanticElements() throws Exception {
// Given: Formatting elements crossing semantic boundaries
- final String html = "Bold "
- + "Article "
- + " "
- + "Nav "
- + "end ";
+ final String html =
+ "Bold " + "Article " + " " + "Nav "
+ + "end ";
// When: Parsing
final Document doc = parseHTML(html);
@@ -365,13 +342,10 @@ public void testFormattingElementsAcrossSemanticElements() throws Exception {
@Test
public void testComplexSemanticStructureWithFormatting() throws Exception {
// Given: Complex semantic structure with formatting
- final String html = ""
- + ""
- + ""
- + ""
- + ""
- + " "
- + "";
+ final String html =
+ "" + "" + ""
+ + ""
+ + "" + " " + "";
// When: Parsing
final Document doc = parseHTML(html);
@@ -455,9 +429,7 @@ public void testMisnestingWithBlockElements() throws Exception {
@Test
public void testComplexMisnestingPattern() throws Exception {
// Given: Very complex misnesting
- final String html = ""
- + "TextBlock
"
- + "";
+ final String html = "" + "TextBlock
" + "";
// When: Parsing
final Document doc = parseHTML(html);
@@ -537,11 +509,9 @@ public void testFormattingElementsWithNOBR() throws Exception {
@Test
public void testAllFormattingElementsTogether() throws Exception {
// Given: All formatting elements used together
- final String html = ""
- + ""
- + "Text Block
continues"
- + " "
- + "";
+ final String html =
+ "" + "" + "Text Block
continues"
+ + " " + "";
// When: Parsing
final Document doc = parseHTML(html);
@@ -554,13 +524,9 @@ public void testAllFormattingElementsTogether() throws Exception {
@Test
public void testFormattingElementsAcrossNestedBlocks() throws Exception {
// Given: Formatting elements across deeply nested blocks
- final String html = "Start"
- + "Level 1"
- + "
Level 2"
- + "
Level 3"
- + "
Para
"
- + "
"
- + "End ";
+ final String html =
+ "Start" + ""
+ + "End ";
// When: Parsing
final Document doc = parseHTML(html);
@@ -578,12 +544,8 @@ public void testFormattingElementsAcrossNestedBlocks() throws Exception {
@Test
public void testAAAOuterLoopWithManyFormattingElements() throws Exception {
// Given: More than 8 nested formatting elements to test outer loop limit
- final String html = ""
- + ""
- + "Text"
- + " " // Close B early to trigger AAA
- + ""
- + "";
+ final String html = "" + "" + "Text" + " " // Close B early to trigger AAA
+ + "" + "";
// When: Parsing
final Document doc = parseHTML(html);
@@ -618,12 +580,8 @@ public void testAAAWithExtremeNesting() throws Exception {
@Test
public void testAAAWithAlternatingFormattingElements() throws Exception {
// Given: Alternating formatting elements beyond loop limit
- final String html = ""
- + ""
- + "Deep text"
- + " " // Early close
- + " "
- + "";
+ final String html = "" + "" + "Deep text" + " " // Early close
+ + " " + "";
// When: Parsing
final Document doc = parseHTML(html);
@@ -639,14 +597,9 @@ public void testAAAWithAlternatingFormattingElements() throws Exception {
@Test
public void testFormattingMarkerInTable() throws Exception {
// Given: Formatting elements crossing table cell boundaries
- final String html = ""
- + "Before table"
- + ""
- + "Cell 1 Italic in cell "
- + "Cell 2 "
- + "
"
- + "After table "
- + "";
+ final String html =
+ "" + "Before table" + "" + "Cell 1 Italic in cell " + "Cell 2 "
+ + "
" + "After table " + "";
// When: Parsing
final Document doc = parseHTML(html);
@@ -660,12 +613,9 @@ public void testFormattingMarkerInTable() throws Exception {
@Test
public void testFormattingMarkerInCaption() throws Exception {
// Given: Formatting elements in table caption
- final String html = ""
- + ""
- + "Bold caption Block in caption
continues "
- + "Cell "
- + "
"
- + "";
+ final String html =
+ "" + "" + "Bold caption Block in caption
continues "
+ + "Cell " + "
" + "";
// When: Parsing
final Document doc = parseHTML(html);
@@ -678,12 +628,9 @@ public void testFormattingMarkerInCaption() throws Exception {
@Test
public void testFormattingMarkerInTH() throws Exception {
// Given: Formatting elements in table header
- final String html = ""
- + ""
- + "Header Para in header
continues "
- + "Cell "
- + "
"
- + "";
+ final String html =
+ "" + "" + "Header Para in header
continues " + "Cell "
+ + "
" + "";
// When: Parsing
final Document doc = parseHTML(html);
@@ -696,15 +643,9 @@ public void testFormattingMarkerInTH() throws Exception {
@Test
public void testFormattingAcrossMultipleTableCells() throws Exception {
// Given: Formatting spanning multiple cells (invalid but should be handled)
- final String html = ""
- + ""
- + ""
- + "Start bold "
- + "Middle cell "
- + "Third cell "
- + " "
- + "
"
- + "";
+ final String html =
+ "" + "" + "" + "Start bold " + "Middle cell " + "Third cell "
+ + " " + "
" + "";
// When: Parsing
final Document doc = parseHTML(html);
@@ -721,12 +662,9 @@ public void testFormattingAcrossMultipleTableCells() throws Exception {
@Test
public void testFormattingInSelectOption() throws Exception {
// Given: Formatting in select option (should be stripped)
- final String html = ""
- + ""
- + "Bold option "
- + "Italic option "
- + " "
- + "";
+ final String html =
+ "" + "" + "Bold option " + "Italic option " + " "
+ + "";
// When: Parsing
final Document doc = parseHTML(html);
@@ -740,11 +678,8 @@ public void testFormattingInSelectOption() throws Exception {
@Test
public void testFormattingSpanningSelect() throws Exception {
// Given: Formatting spanning across select (invalid)
- final String html = ""
- + "Before select"
- + "Option 1 "
- + "After select "
- + "";
+ final String html =
+ "" + "Before select" + "Option 1 " + "After select " + "";
// When: Parsing
final Document doc = parseHTML(html);
@@ -761,9 +696,7 @@ public void testFormattingSpanningSelect() throws Exception {
@Test
public void testFormattingInButton() throws Exception {
// Given: Formatting inside button
- final String html = ""
- + "Bold button Block in button
text "
- + "";
+ final String html = "" + "Bold button Block in button
text " + "";
// When: Parsing
final Document doc = parseHTML(html);
@@ -776,11 +709,7 @@ public void testFormattingInButton() throws Exception {
@Test
public void testFormattingSpanningButton() throws Exception {
// Given: Formatting spanning across button
- final String html = ""
- + "Before button"
- + "Click me "
- + "After button "
- + "";
+ final String html = "" + "Before button" + "Click me " + "After button " + "";
// When: Parsing
final Document doc = parseHTML(html);
@@ -797,9 +726,7 @@ public void testFormattingSpanningButton() throws Exception {
@Test
public void testFormattingInObject() throws Exception {
// Given: Formatting inside object element
- final String html = ""
- + "Fallback Block
content "
- + "";
+ final String html = "" + "Fallback Block
content " + "";
// When: Parsing
final Document doc = parseHTML(html);
@@ -812,9 +739,7 @@ public void testFormattingInObject() throws Exception {
@Test
public void testFormattingInMarquee() throws Exception {
// Given: Formatting in marquee (deprecated but may appear)
- final String html = ""
- + "Scrolling Block
text "
- + "";
+ final String html = "" + "Scrolling Block
text " + "";
// When: Parsing
final Document doc = parseHTML(html);
@@ -831,11 +756,9 @@ public void testFormattingInMarquee() throws Exception {
@Test
public void testFormattingWithFormElement() throws Exception {
// Given: Formatting crossing form boundary
- final String html = ""
- + "Before form"
- + ""
- + "After form "
- + "";
+ final String html =
+ "" + "Before form" + "" + "After form "
+ + "";
// When: Parsing
final Document doc = parseHTML(html);
@@ -848,12 +771,9 @@ public void testFormattingWithFormElement() throws Exception {
@Test
public void testFormattingWithFieldset() throws Exception {
// Given: Formatting in fieldset with legend
- final String html = ""
- + ""
- + "Bold legend Block
"
- + " "
- + " "
- + "";
+ final String html =
+ "" + "" + "Bold legend Block
" + " "
+ + " " + "";
// When: Parsing
final Document doc = parseHTML(html);
@@ -899,9 +819,7 @@ public void testAAAInnerLoopWithManyActiveElements() throws Exception {
@Test
public void testAAAWithMultipleFurthestBlockCandidates() throws Exception {
// Given: Multiple potential furthest blocks
- final String html = ""
- + "Bold Para 1
Div
Para 2
continues"
- + "";
+ final String html = "" + "Bold Para 1
Div
Para 2
continues" + "";
// When: Parsing
final Document doc = parseHTML(html);
diff --git a/src/test/java/org/codelibs/nekohtml/sax/HTMLSAXConfigurationTest.java b/src/test/java/org/codelibs/nekohtml/sax/HTMLSAXConfigurationTest.java
index e17eb20..b7ab543 100644
--- a/src/test/java/org/codelibs/nekohtml/sax/HTMLSAXConfigurationTest.java
+++ b/src/test/java/org/codelibs/nekohtml/sax/HTMLSAXConfigurationTest.java
@@ -214,13 +214,11 @@ public void testSimpleErrorFormatFeature() throws Exception {
final HTMLSAXConfiguration config = new HTMLSAXConfiguration();
// Default should be false
- assertFalse(config.getFeature(HTMLSAXConfiguration.SIMPLE_ERROR_FORMAT),
- "Simple error format should be disabled by default");
+ assertFalse(config.getFeature(HTMLSAXConfiguration.SIMPLE_ERROR_FORMAT), "Simple error format should be disabled by default");
// Enable simple error format
config.setFeature(HTMLSAXConfiguration.SIMPLE_ERROR_FORMAT, true);
- assertTrue(config.getFeature(HTMLSAXConfiguration.SIMPLE_ERROR_FORMAT),
- "Should be able to enable simple error format");
+ assertTrue(config.getFeature(HTMLSAXConfiguration.SIMPLE_ERROR_FORMAT), "Should be able to enable simple error format");
}
@Test
@@ -292,13 +290,11 @@ public void testNamesElemsProperty() throws Exception {
final HTMLSAXConfiguration config = new HTMLSAXConfiguration();
// Default should be "upper"
- assertEquals("upper", config.getProperty(HTMLSAXConfiguration.NAMES_ELEMS),
- "Element names should default to upper case");
+ assertEquals("upper", config.getProperty(HTMLSAXConfiguration.NAMES_ELEMS), "Element names should default to upper case");
// Change to lower
config.setProperty(HTMLSAXConfiguration.NAMES_ELEMS, "lower");
- assertEquals("lower", config.getProperty(HTMLSAXConfiguration.NAMES_ELEMS),
- "Should be able to set element names to lower case");
+ assertEquals("lower", config.getProperty(HTMLSAXConfiguration.NAMES_ELEMS), "Should be able to set element names to lower case");
}
@Test
@@ -306,13 +302,11 @@ public void testNamesAttrsProperty() throws Exception {
final HTMLSAXConfiguration config = new HTMLSAXConfiguration();
// Default should be "lower"
- assertEquals("lower", config.getProperty(HTMLSAXConfiguration.NAMES_ATTRS),
- "Attribute names should default to lower case");
+ assertEquals("lower", config.getProperty(HTMLSAXConfiguration.NAMES_ATTRS), "Attribute names should default to lower case");
// Change to upper
config.setProperty(HTMLSAXConfiguration.NAMES_ATTRS, "upper");
- assertEquals("upper", config.getProperty(HTMLSAXConfiguration.NAMES_ATTRS),
- "Should be able to set attribute names to upper case");
+ assertEquals("upper", config.getProperty(HTMLSAXConfiguration.NAMES_ATTRS), "Should be able to set attribute names to upper case");
}
@Test
@@ -328,16 +322,14 @@ public void testLexicalHandlerProperty() throws Exception {
"Lexical handler should be retrievable via property");
// Also get via getter
- assertSame(lexHandler, config.getLexicalHandler(),
- "Lexical handler should be retrievable via getter");
+ assertSame(lexHandler, config.getLexicalHandler(), "Lexical handler should be retrievable via getter");
}
@Test
public void testUnrecognizedFeature() {
final HTMLSAXConfiguration config = new HTMLSAXConfiguration();
- assertThrows(org.xml.sax.SAXNotRecognizedException.class,
- () -> config.getFeature("http://example.com/unknown-feature"),
+ assertThrows(org.xml.sax.SAXNotRecognizedException.class, () -> config.getFeature("http://example.com/unknown-feature"),
"Should throw for unrecognized feature");
}
@@ -345,8 +337,7 @@ public void testUnrecognizedFeature() {
public void testUnrecognizedProperty() {
final HTMLSAXConfiguration config = new HTMLSAXConfiguration();
- assertThrows(org.xml.sax.SAXNotRecognizedException.class,
- () -> config.getProperty("http://example.com/unknown-property"),
+ assertThrows(org.xml.sax.SAXNotRecognizedException.class, () -> config.getProperty("http://example.com/unknown-property"),
"Should throw for unrecognized property");
}
diff --git a/src/test/java/org/codelibs/nekohtml/sax/HTMLTagBalancerFilterEnhancementsTest.java b/src/test/java/org/codelibs/nekohtml/sax/HTMLTagBalancerFilterEnhancementsTest.java
index 98b6e64..36449ec 100644
--- a/src/test/java/org/codelibs/nekohtml/sax/HTMLTagBalancerFilterEnhancementsTest.java
+++ b/src/test/java/org/codelibs/nekohtml/sax/HTMLTagBalancerFilterEnhancementsTest.java
@@ -46,8 +46,7 @@ public void setUp() {
filter.setContentHandler(new DefaultHandler() {
@Override
- public void startElement(String uri, String localName, String qName, org.xml.sax.Attributes atts)
- throws SAXException {
+ public void startElement(String uri, String localName, String qName, org.xml.sax.Attributes atts) throws SAXException {
startElements.add(qName);
}
diff --git a/src/test/java/org/codelibs/nekohtml/sax/RawTextElementsTest.java b/src/test/java/org/codelibs/nekohtml/sax/RawTextElementsTest.java
index c8d5e92..b3a269a 100644
--- a/src/test/java/org/codelibs/nekohtml/sax/RawTextElementsTest.java
+++ b/src/test/java/org/codelibs/nekohtml/sax/RawTextElementsTest.java
@@ -88,8 +88,9 @@ public void testScriptWithTypeAttribute() throws Exception {
@Test
public void testMultipleScriptElements() throws Exception {
- final String html = "" + "" + ""
- + "" + "";
+ final String html =
+ "" + "" + "" + ""
+ + "";
final DOMParser parser = new DOMParser();
parser.parse(new InputSource(new StringReader(html)));
@@ -139,8 +140,7 @@ public void testStyleWithSelectors() throws Exception {
assertEquals(1, styles.getLength(), "Should have one style element");
final String styleContent = styles.item(0).getTextContent();
- assertTrue(styleContent.contains("color: red") || styleContent.contains("color:"),
- "Style content should contain CSS rules");
+ assertTrue(styleContent.contains("color: red") || styleContent.contains("color:"), "Style content should contain CSS rules");
}
@Test
diff --git a/src/test/java/org/codelibs/nekohtml/sax/SimpleHTMLScannerEnhancementsTest.java b/src/test/java/org/codelibs/nekohtml/sax/SimpleHTMLScannerEnhancementsTest.java
index 217dc6e..ad28529 100644
--- a/src/test/java/org/codelibs/nekohtml/sax/SimpleHTMLScannerEnhancementsTest.java
+++ b/src/test/java/org/codelibs/nekohtml/sax/SimpleHTMLScannerEnhancementsTest.java
@@ -477,9 +477,13 @@ public void characters(char[] ch, int start, int length) {
scanner.parse(input);
- // Entities may be passed through or decoded depending on implementation
- assertTrue(result.toString().contains("&amp;") || result.toString().contains("&"),
- "Should handle ampersand entity");
+ // Entity decoding should produce actual characters
+ final String decoded = result.toString();
+ assertTrue(decoded.contains("&"), "Should decode &amp; to &");
+ assertTrue(decoded.contains("<"), "Should decode &lt; to <");
+ assertTrue(decoded.contains(">"), "Should decode &gt; to >");
+ assertTrue(decoded.contains("\""), "Should decode &quot; to \"");
+ assertTrue(decoded.contains("\u00A0"), "Should decode &nbsp; to non-breaking space");
}
/**
@@ -502,7 +506,11 @@ public void characters(char[] ch, int start, int length) {
scanner.parse(input);
- assertNotNull(result.toString(), "Should parse decimal numeric entities");
+ // &#60; = <, &#62; = >, &#38; = &
+ final String decoded = result.toString();
+ assertTrue(decoded.contains("<"), "Should decode &#60; to <");
+ assertTrue(decoded.contains(">"), "Should decode &#62; to >");
+ assertTrue(decoded.contains("&"), "Should decode &#38; to &");
}
/**
@@ -525,7 +533,11 @@ public void characters(char[] ch, int start, int length) {
scanner.parse(input);
- assertNotNull(result.toString(), "Should parse hexadecimal numeric entities");
+ // &#x3C; = <, &#x3E; = >, &#x26; = &
+ final String decoded = result.toString();
+ assertTrue(decoded.contains("<"), "Should decode &#x3C; to <");
+ assertTrue(decoded.contains(">"), "Should decode &#x3E; to >");
+ assertTrue(decoded.contains("&"), "Should decode &#x26; to &");
}
/**
@@ -550,7 +562,8 @@ public void startElement(String uri, String localName, String qName, org.xml.sax
scanner.parse(input);
- assertTrue(attrValues.toString().contains("href"), "Should parse attribute with entity");
+ final String attrs = attrValues.toString();
+ assertTrue(attrs.contains("href=test?a=1&b=2"), "Should decode &amp; in attribute to &");
}
/**
@@ -574,8 +587,7 @@ public void characters(char[] ch, int start, int length) {
scanner.parse(input);
- assertTrue(result.toString().contains("&") || result.toString().contains("A"),
- "Should handle incomplete entity");
+ assertTrue(result.toString().contains("A & B"), "Should preserve incomplete entity as literal text");
}
/**
@@ -598,7 +610,91 @@ public void characters(char[] ch, int start, int length) {
scanner.parse(input);
- assertNotNull(result.toString(), "Should handle unknown entity");
+ assertTrue(result.toString().contains("&unknown;"), "Should preserve unknown entity as literal text");
+ }
+
+ /**
+ * Test that semicolon-less named entities in URL attributes are NOT decoded
+ * (HTML5 attribute value state rule: &not=, &copy=, &reg= must be preserved)
+ */
+ @Test
+ public void testSemicolonlessEntitiesInUrlAttributes() throws Exception {
+ final SimpleHTMLScanner scanner = new SimpleHTMLScanner();
+ final StringBuilder attrValues = new StringBuilder();
+
+ scanner.setContentHandler(new DefaultHandler() {
+ @Override
+ public void startElement(String uri, String localName, String qName, org.xml.sax.Attributes atts) {
+ for (int i = 0; i < atts.getLength(); i++) {
+ attrValues.append(atts.getQName(i)).append("=").append(atts.getValue(i)).append("|");
+ }
+ }
+ });
+
+ final String html = "<html><body><a href=\"/x?a=1&not=2&copy=3&reg=4\">Link</a></body></html>";
+ final InputSource input = new InputSource(new StringReader(html));
+
+ scanner.parse(input);
+
+ final String attrs = attrValues.toString();
+ assertTrue(attrs.contains("href=/x?a=1&not=2&copy=3&reg=4"),
+ "Semicolon-less named entities in attributes should be preserved as-is, got: " + attrs);
+ }
+
+ /**
+ * Test that invalid numeric references produce U+FFFD replacement character
+ */
+ @Test
+ public void testInvalidNumericReferences() throws Exception {
+ final SimpleHTMLScanner scanner = new SimpleHTMLScanner();
+ final StringBuilder result = new StringBuilder();
+
+ scanner.setContentHandler(new DefaultHandler() {
+ @Override
+ public void characters(char[] ch, int start, int length) {
+ result.append(new String(ch, start, length));
+ }
+ });
+
+ // (null), (surrogate), (control char)
+ final String html = " ";
+ final InputSource input = new InputSource(new StringReader(html));
+
+ scanner.parse(input);
+
+ final String decoded = result.toString();
+ // All invalid code points should be replaced with U+FFFD
+ assertEquals(
+ 3,
+ decoded.chars().filter(c -> c == 0xFFFD).count(),
+ "Invalid numeric references should be replaced with U+FFFD, got: "
+ + decoded.codePoints().mapToObj(cp -> String.format("U+%04X", cp)).reduce("", (a, b) -> a + " " + b));
+ }
+
+ /**
+ * Test that semicolon-less named entities ARE decoded in text context
+ */
+ @Test
+ public void testSemicolonlessEntitiesInTextContent() throws Exception {
+ final SimpleHTMLScanner scanner = new SimpleHTMLScanner();
+ final StringBuilder result = new StringBuilder();
+
+ scanner.setContentHandler(new DefaultHandler() {
+ @Override
+ public void characters(char[] ch, int start, int length) {
+ result.append(new String(ch, start, length));
+ }
+ });
+
+ final String html = "<html><body>&amp &lt &gt</body></html>";
+ final InputSource input = new InputSource(new StringReader(html));
+
+ scanner.parse(input);
+
+ final String decoded = result.toString();
+ assertTrue(decoded.contains("&"), "Should decode &amp (without semicolon) in text");
+ assertTrue(decoded.contains("<"), "Should decode &lt (without semicolon) in text");
+ assertTrue(decoded.contains(">"), "Should decode &gt (without semicolon) in text");
}
// =========================================================================
@@ -622,15 +718,13 @@ public void characters(char[] ch, int start, int length) {
// ISO-8859-1 encoded content with accented characters
final String content = "café résumé";
- final ByteArrayInputStream stream = new ByteArrayInputStream(
- ("" + content + "").getBytes("ISO-8859-1"));
+ final ByteArrayInputStream stream = new ByteArrayInputStream(("" + content + "").getBytes("ISO-8859-1"));
final InputSource input = new InputSource(stream);
input.setEncoding("ISO-8859-1");
scanner.parse(input);
- assertTrue(result.toString().contains("caf") || result.toString().contains("é"),
- "ISO-8859-1 encoded content should be parsed");
+ assertTrue(result.toString().contains("caf") || result.toString().contains("é"), "ISO-8859-1 encoded content should be parsed");
}
/**
@@ -655,8 +749,7 @@ public void characters(char[] ch, int start, int length) {
scanner.parse(input);
- assertTrue(result.toString().contains("Unicode") || result.toString().contains("\u4E2D"),
- "UTF-16 encoded content should be parsed");
+ assertTrue(result.toString().contains("Unicode") || result.toString().contains("\u4E2D"), "UTF-16 encoded content should be parsed");
}
/**
@@ -681,8 +774,7 @@ public void characters(char[] ch, int start, int length) {
scanner.parse(input);
- assertTrue(result.toString().contains("UTF-8") || result.toString().contains("ä"),
- "Default UTF-8 encoding should work");
+ assertTrue(result.toString().contains("UTF-8") || result.toString().contains("ä"), "Default UTF-8 encoding should work");
}
/**
@@ -709,8 +801,7 @@ public void characters(char[] ch, int start, int length) {
scanner.parse(input);
- assertTrue(result.toString().contains("From character stream"),
- "Character stream should take precedence over byte stream");
+ assertTrue(result.toString().contains("From character stream"), "Character stream should take precedence over byte stream");
}
// =========================================================================
@@ -745,8 +836,7 @@ public void characters(char[] ch, int start, int length) {
scanner.parse(input);
- assertTrue(result.toString().contains("File URL content"),
- "Should parse content from file:// URL");
+ assertTrue(result.toString().contains("File URL content"), "Should parse content from file:// URL");
}
/**
@@ -760,8 +850,7 @@ public void testNoValidInputSource() {
// InputSource with nothing set
final InputSource input = new InputSource();
- assertThrows(SAXException.class, () -> scanner.parse(input),
- "Should throw when no valid input source is available");
+ assertThrows(SAXException.class, () -> scanner.parse(input), "Should throw when no valid input source is available");
}
/**
@@ -788,8 +877,7 @@ public void characters(char[] ch, int start, int length) {
scanner.parse(tempFile.getAbsolutePath());
- assertTrue(result.toString().contains("SystemId parse"),
- "Should parse using String systemId parameter");
+ assertTrue(result.toString().contains("SystemId parse"), "Should parse using String systemId parameter");
}
// =========================================================================
@@ -900,8 +988,7 @@ public void comment(char[] ch, int start, int length) {
public void testGetUnrecognizedFeature() {
final SimpleHTMLScanner scanner = new SimpleHTMLScanner();
- assertThrows(org.xml.sax.SAXNotRecognizedException.class,
- () -> scanner.getFeature("http://example.com/unknown-feature"),
+ assertThrows(org.xml.sax.SAXNotRecognizedException.class, () -> scanner.getFeature("http://example.com/unknown-feature"),
"Should throw SAXNotRecognizedException for unknown feature");
}
@@ -912,8 +999,7 @@ public void testGetUnrecognizedFeature() {
public void testGetUnrecognizedProperty() {
final SimpleHTMLScanner scanner = new SimpleHTMLScanner();
- assertThrows(org.xml.sax.SAXNotRecognizedException.class,
- () -> scanner.getProperty("http://example.com/unknown-property"),
+ assertThrows(org.xml.sax.SAXNotRecognizedException.class, () -> scanner.getProperty("http://example.com/unknown-property"),
"Should throw SAXNotRecognizedException for unknown property");
}
@@ -929,7 +1015,6 @@ public void testParsingWithoutContentHandler() throws Exception {
final InputSource input = new InputSource(new StringReader(html));
// Should return early without error
- assertDoesNotThrow(() -> scanner.parse(input),
- "Parsing without content handler should not throw");
+ assertDoesNotThrow(() -> scanner.parse(input), "Parsing without content handler should not throw");
}
}
diff --git a/src/test/java/org/codelibs/nekohtml/sax/SimpleHTMLScannerTest.java b/src/test/java/org/codelibs/nekohtml/sax/SimpleHTMLScannerTest.java
index 8a377fa..2653056 100644
--- a/src/test/java/org/codelibs/nekohtml/sax/SimpleHTMLScannerTest.java
+++ b/src/test/java/org/codelibs/nekohtml/sax/SimpleHTMLScannerTest.java
@@ -407,8 +407,9 @@ public void characters(final char[] ch, final int start, final int length) {
// When: Parsing the HTML
scanner.parse(input);
- // Then: Should capture special characters (as-is, no entity decoding)
- assertTrue(textContent.stream().anyMatch(text -> text.contains("&lt;&gt;&amp;")));
+ // Then: Should capture decoded special characters (join chunks since characters() may be called multiple times)
+ final String allText = String.join("", textContent);
+ assertTrue(allText.contains("<>&"), "Should decode &lt;&gt;&amp; to <>&");
}
@Test
@@ -499,8 +500,8 @@ public void testParseSystemIdNotSupported() {
final SAXException exception = assertThrows(SAXException.class, () -> {
scanner.parse(input);
});
- assertTrue(exception.getMessage().contains("Cannot open SystemId"),
- "Expected message about unable to open SystemId, got: " + exception.getMessage());
+ assertTrue(exception.getMessage().contains("Cannot open SystemId"), "Expected message about unable to open SystemId, got: "
+ + exception.getMessage());
}
@Test
@@ -525,8 +526,8 @@ public void testParseStringSystemId() throws Exception {
final SAXException exception = assertThrows(SAXException.class, () -> {
scanner.parse("http://example.com/nonexistent.html");
});
- assertTrue(exception.getMessage().contains("Cannot open SystemId"),
- "Expected message about unable to open SystemId, got: " + exception.getMessage());
+ assertTrue(exception.getMessage().contains("Cannot open SystemId"), "Expected message about unable to open SystemId, got: "
+ + exception.getMessage());
}
@Test