From 1257fa5dbfe8b6c92ac51f1793ee601bb3be0181 Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 10 Jan 2026 00:21:22 +0000 Subject: [PATCH] Add comprehensive test coverage improvements - Add RawTextElementsTest.java for script/style/textarea/title element handling - Add StrictModeTest.java for nekohtml.dom.strict system property testing - Add BrowserQuirksIntegrationTest.java for browser quirks and implied elements - Extend SimpleHTMLScannerEnhancementsTest with entity and encoding tests - Extend AdoptionAgencyAlgorithmExtendedTest with AAA loop limit, formatting marker, and table context tests - Extend HTMLSAXConfigurationTest with feature combination and property tests - Extend PerformanceStressTest with concurrent parsing, stress patterns, and real-world simulation tests --- .../nekohtml/PerformanceStressTest.java | 296 +++++++++ .../parsers/BrowserQuirksIntegrationTest.java | 573 ++++++++++++++++++ .../nekohtml/parsers/StrictModeTest.java | 464 ++++++++++++++ .../AdoptionAgencyAlgorithmExtendedTest.java | 416 +++++++++++++ .../sax/HTMLSAXConfigurationTest.java | 429 +++++++++++++ .../nekohtml/sax/RawTextElementsTest.java | 451 ++++++++++++++ .../SimpleHTMLScannerEnhancementsTest.java | 480 +++++++++++++++ 7 files changed, 3109 insertions(+) create mode 100644 src/test/java/org/codelibs/nekohtml/parsers/BrowserQuirksIntegrationTest.java create mode 100644 src/test/java/org/codelibs/nekohtml/parsers/StrictModeTest.java create mode 100644 src/test/java/org/codelibs/nekohtml/sax/RawTextElementsTest.java diff --git a/src/test/java/org/codelibs/nekohtml/PerformanceStressTest.java b/src/test/java/org/codelibs/nekohtml/PerformanceStressTest.java index f929288..487d4dd 100644 --- a/src/test/java/org/codelibs/nekohtml/PerformanceStressTest.java +++ b/src/test/java/org/codelibs/nekohtml/PerformanceStressTest.java @@ -580,4 +580,300 @@ public void testMixedContentStressTest() throws Exception { assertTrue(doc.getElementsByTagName("UL").getLength() > 0, "Should have lists"); 
assertTrue(doc.getElementsByTagName("TABLE").getLength() > 0, "Should have tables"); } + + // ======================================================================== + // Additional Stress Tests + // ======================================================================== + + @Test + @Timeout(15) + public void testVeryDeeplyNestedFormatting1000Levels() throws Exception { + // Given: 1000 levels of nested formatting elements + final StringBuilder html = new StringBuilder(""); + for (int i = 0; i < 1000; i++) { + html.append(""); + } + html.append("Deep text"); + for (int i = 0; i < 1000; i++) { + html.append(""); + } + html.append(""); + + // When: Parsing + final Document doc = parseHTML(html.toString()); + + // Then: Should handle extreme nesting without stack overflow + assertNotNull(doc, "Document should be parsed without stack overflow"); + } + + @Test + @Timeout(10) + public void testParseEmptyDocument() throws Exception { + // Given: Empty document + final String html = ""; + + // When: Parsing + final Document doc = parseHTML(html); + + // Then: Should create document + assertNotNull(doc, "Empty document should be parsed"); + } + + @Test + @Timeout(10) + public void testParseWhitespaceOnlyDocument() throws Exception { + // Given: Whitespace-only document + final String html = " \n\t\n "; + + // When: Parsing + final Document doc = parseHTML(html); + + // Then: Should create document + assertNotNull(doc, "Whitespace document should be parsed"); + } + + @Test + @Timeout(15) + public void testManyNestedFormsWithInputs() throws Exception { + // Given: Many forms with inputs (stress test for form handling) + final StringBuilder html = new StringBuilder(""); + for (int i = 0; i < 500; i++) { + html.append("
"); + html.append(""); + html.append(""); + html.append(""); + html.append("
"); + } + html.append(""); + + // When: Parsing + final Document doc = parseHTML(html.toString()); + + // Then: Should handle many forms + assertNotNull(doc, "Document should be parsed"); + assertEquals(500, doc.getElementsByTagName("FORM").getLength(), "Should have 500 forms"); + assertEquals(1500, doc.getElementsByTagName("INPUT").getLength(), "Should have 1500 inputs"); + } + + @Test + @Timeout(10) + public void testHugeAttributeCount() throws Exception { + // Given: Element with 5000 attributes + final StringBuilder html = new StringBuilder("
Content
"); + + // When: Parsing + final Document doc = parseHTML(html.toString()); + + // Then: Should handle huge attribute count + assertNotNull(doc, "Document should be parsed"); + final Element div = (Element) doc.getElementsByTagName("DIV").item(0); + assertNotNull(div, "DIV should exist"); + } + + @Test + @Timeout(20) + public void testConcurrentParsing() throws Exception { + // Given: Multiple parse operations + final int threadCount = 10; + final java.util.concurrent.CountDownLatch latch = new java.util.concurrent.CountDownLatch(threadCount); + final java.util.concurrent.atomic.AtomicInteger successCount = new java.util.concurrent.atomic.AtomicInteger(0); + final java.util.concurrent.atomic.AtomicInteger errorCount = new java.util.concurrent.atomic.AtomicInteger(0); + + for (int t = 0; t < threadCount; t++) { + final int threadId = t; + new Thread(() -> { + try { + final DOMParser threadParser = new DOMParser(); + final StringBuilder html = new StringBuilder(""); + for (int i = 0; i < 100; i++) { + html.append("
Thread ").append(threadId).append(" Element ").append(i).append("
"); + } + html.append(""); + + threadParser.parse(new InputSource(new StringReader(html.toString()))); + final Document doc = threadParser.getDocument(); + + if (doc != null && doc.getElementsByTagName("DIV").getLength() == 100) { + successCount.incrementAndGet(); + } else { + errorCount.incrementAndGet(); + } + } catch (final Exception e) { + errorCount.incrementAndGet(); + } finally { + latch.countDown(); + } + }).start(); + } + + latch.await(); + + // Then: All threads should succeed + assertEquals(threadCount, successCount.get(), "All threads should succeed"); + assertEquals(0, errorCount.get(), "No errors should occur"); + } + + @Test + @Timeout(15) + public void testMisnestingStressTest() throws Exception { + // Given: Extremely misnested document + final StringBuilder html = new StringBuilder(""); + for (int i = 0; i < 100; i++) { + html.append("
"); + } + html.append("Content"); + // Close in completely wrong order + for (int i = 0; i < 100; i++) { + html.append("
"); + } + html.append(""); + + // When: Parsing + final Document doc = parseHTML(html.toString()); + + // Then: Should handle extreme misnesting + assertNotNull(doc, "Document should be parsed despite misnesting"); + } + + @Test + @Timeout(15) + public void testAlternatingBlockInline() throws Exception { + // Given: Rapidly alternating block and inline elements + final StringBuilder html = new StringBuilder(""); + for (int i = 0; i < 1000; i++) { + if (i % 2 == 0) { + html.append("
Block ").append(i).append("
"); + } else { + html.append("Inline ").append(i).append(""); + } + } + html.append(""); + + // When: Parsing + final Document doc = parseHTML(html.toString()); + + // Then: Should handle alternating pattern + assertNotNull(doc, "Document should be parsed"); + assertEquals(500, doc.getElementsByTagName("DIV").getLength(), "Should have 500 DIVs"); + assertEquals(500, doc.getElementsByTagName("SPAN").getLength(), "Should have 500 SPANs"); + } + + @Test + @Timeout(15) + public void testVeryLongElementName() throws Exception { + // Given: Element with very long custom name (1000 chars) + final StringBuilder longName = new StringBuilder(); + for (int i = 0; i < 1000; i++) { + longName.append("x"); + } + final String html = "<" + longName + ">Content"; + + // When: Parsing + final Document doc = parseHTML(html); + + // Then: Should handle long element name + assertNotNull(doc, "Document should be parsed"); + } + + @Test + @Timeout(15) + public void testManyVoidElements() throws Exception { + // Given: Many void elements in sequence + final StringBuilder html = new StringBuilder(""); + for (int i = 0; i < 5000; i++) { + html.append("

"); + } + html.append(""); + + // When: Parsing + final Document doc = parseHTML(html.toString()); + + // Then: Should handle many void elements + assertNotNull(doc, "Document should be parsed"); + assertEquals(5000, doc.getElementsByTagName("BR").getLength(), "Should have 5000 BRs"); + assertEquals(5000, doc.getElementsByTagName("HR").getLength(), "Should have 5000 HRs"); + } + + @Test + @Timeout(20) + public void testComplexRealWorldSimulation() throws Exception { + // Given: Simulated real-world complex page + final StringBuilder html = new StringBuilder(); + html.append(""); + html.append(""); + html.append("Complex Page"); + html.append(""); + html.append(""); + html.append(""); + + // Header with nav + html.append("
"); + + // Main content with multiple sections + html.append("
"); + for (int section = 0; section < 50; section++) { + html.append("
"); + html.append("

Section ").append(section).append("

"); + for (int para = 0; para < 5; para++) { + html.append("

Lorem ipsum dolor sit amet, consectetur adipiscing elit.

"); + } + html.append(""); + for (int row = 0; row < 5; row++) { + html.append(""); + } + html.append("
Col1Col2
Data ").append(row).append("Value ").append(row).append("
"); + html.append("
"); + } + html.append("
"); + + // Sidebar + html.append(""); + + // Footer + html.append(""); + html.append(""); + + // When: Parsing + final long startTime = System.currentTimeMillis(); + final Document doc = parseHTML(html.toString()); + final long elapsedTime = System.currentTimeMillis() - startTime; + + // Then: Should handle complex real-world structure + assertNotNull(doc, "Document should be parsed"); + assertEquals(1, doc.getElementsByTagName("HEADER").getLength(), "Should have HEADER"); + assertEquals(1, doc.getElementsByTagName("MAIN").getLength(), "Should have MAIN"); + assertEquals(1, doc.getElementsByTagName("FOOTER").getLength(), "Should have FOOTER"); + assertEquals(50, doc.getElementsByTagName("SECTION").getLength(), "Should have 50 SECTIONs"); + assertEquals(50, doc.getElementsByTagName("TABLE").getLength(), "Should have 50 TABLEs"); + + System.out.println("Parsed complex real-world simulation in " + elapsedTime + "ms"); + } + + @Test + @Timeout(10) + public void testRepeatedParsingWithSameParser() throws Exception { + // Given: Same parser instance used multiple times + final DOMParser reusableParser = new DOMParser(); + + for (int iteration = 0; iteration < 100; iteration++) { + final String html = "
Iteration " + iteration + "
"; + reusableParser.parse(new InputSource(new StringReader(html))); + final Document doc = reusableParser.getDocument(); + + assertNotNull(doc, "Document should be parsed on iteration " + iteration); + assertEquals(1, doc.getElementsByTagName("DIV").getLength(), "Should have 1 DIV on iteration " + iteration); + } + } } diff --git a/src/test/java/org/codelibs/nekohtml/parsers/BrowserQuirksIntegrationTest.java b/src/test/java/org/codelibs/nekohtml/parsers/BrowserQuirksIntegrationTest.java new file mode 100644 index 0000000..4d8de62 --- /dev/null +++ b/src/test/java/org/codelibs/nekohtml/parsers/BrowserQuirksIntegrationTest.java @@ -0,0 +1,573 @@ +/* + * Copyright 2012-2025 CodeLibs Project and the Others. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language + * governing permissions and limitations under the License. + */ +package org.codelibs.nekohtml.parsers; + +import static org.junit.jupiter.api.Assertions.*; + +import java.io.StringReader; + +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.w3c.dom.Document; +import org.w3c.dom.Element; +import org.w3c.dom.Node; +import org.w3c.dom.NodeList; +import org.xml.sax.InputSource; + +/** + * Integration tests for browser quirks mode compatibility and implied element handling. + * Tests common HTML patterns that browsers handle gracefully but are technically invalid. 
+ * + * @author CodeLibs Project + */ +public class BrowserQuirksIntegrationTest { + + private DOMParser parser; + + @BeforeEach + public void setUp() throws Exception { + parser = new DOMParser(); + } + + private Document parseHTML(final String html) throws Exception { + parser.parse(new InputSource(new StringReader(html))); + return parser.getDocument(); + } + + // ========================================================================= + // Implied Element Tests + // ========================================================================= + + @Test + public void testImpliedHtmlElement() throws Exception { + // Given: Document without explicit HTML tag + final String html = "Test

Content

"; + + // When: Parsing + final Document doc = parseHTML(html); + + // Then: HTML element should exist + assertNotNull(doc.getDocumentElement(), "Document should have root element"); + } + + @Test + public void testImpliedHeadElement() throws Exception { + // Given: Document without explicit HEAD tag but with HEAD content + final String html = "Test

Content

"; + + // When: Parsing + final Document doc = parseHTML(html); + + // Then: Should parse successfully + assertNotNull(doc, "Document should be parsed"); + assertEquals(1, doc.getElementsByTagName("TITLE").getLength(), "TITLE should exist"); + } + + @Test + public void testImpliedBodyElement() throws Exception { + // Given: Document without explicit BODY tag + final String html = "Test

Content

"; + + // When: Parsing + final Document doc = parseHTML(html); + + // Then: Should parse successfully + assertNotNull(doc, "Document should be parsed"); + assertEquals(1, doc.getElementsByTagName("P").getLength(), "P should exist"); + } + + @Test + public void testImpliedTbodyInTable() throws Exception { + // Given: Table without explicit TBODY + final String html = "
Cell 1Cell 2
"; + + // When: Parsing + final Document doc = parseHTML(html); + + // Then: Table should parse successfully + assertEquals(1, doc.getElementsByTagName("TABLE").getLength(), "TABLE should exist"); + assertEquals(1, doc.getElementsByTagName("TR").getLength(), "TR should exist"); + assertEquals(2, doc.getElementsByTagName("TD").getLength(), "TD elements should exist"); + } + + @Test + public void testImpliedColgroup() throws Exception { + // Given: Table with COL but no COLGROUP + final String html = "
Cell
"; + + // When: Parsing + final Document doc = parseHTML(html); + + // Then: Should parse successfully + assertEquals(1, doc.getElementsByTagName("TABLE").getLength(), "TABLE should exist"); + assertEquals(1, doc.getElementsByTagName("COL").getLength(), "COL should exist"); + } + + // ========================================================================= + // Table Quirks Tests + // ========================================================================= + + @Test + public void testTableWithMissingCloseTags() throws Exception { + // Given: Table with missing close tags (common in legacy HTML) + final String html = "" + "
Row 1, Cell 1Row 1, Cell 2" + "
Row 2, Cell 1Row 2, Cell 2" + + "
"; + + // When: Parsing + final Document doc = parseHTML(html); + + // Then: Table should be properly structured + assertEquals(1, doc.getElementsByTagName("TABLE").getLength(), "TABLE should exist"); + assertEquals(2, doc.getElementsByTagName("TR").getLength(), "Should have 2 rows"); + assertEquals(4, doc.getElementsByTagName("TD").getLength(), "Should have 4 cells"); + } + + @Test + public void testNestedTables() throws Exception { + // Given: Nested tables + final String html = "" + "
" + "
Inner cell
" + "
" + + ""; + + // When: Parsing + final Document doc = parseHTML(html); + + // Then: Both tables should exist + assertEquals(2, doc.getElementsByTagName("TABLE").getLength(), "Should have 2 tables"); + } + + @Test + public void testTableWithCaptionAfterRows() throws Exception { + // Given: Table with CAPTION after TR (invalid but common) + final String html = "" + "" + "" + "
Cell
Table Caption
"; + + // When: Parsing + final Document doc = parseHTML(html); + + // Then: Should parse without error + assertEquals(1, doc.getElementsByTagName("TABLE").getLength(), "TABLE should exist"); + assertEquals(1, doc.getElementsByTagName("CAPTION").getLength(), "CAPTION should exist"); + } + + // ========================================================================= + // Form Element Quirks + // ========================================================================= + + @Test + public void testFormWithOrphanedInputs() throws Exception { + // Given: Form with inputs outside form tag + final String html = "" + "
" + "" + "
" + + "" + ""; + + // When: Parsing + final Document doc = parseHTML(html); + + // Then: Both inputs should exist + assertEquals(2, doc.getElementsByTagName("INPUT").getLength(), "Should have 2 inputs"); + assertEquals(1, doc.getElementsByTagName("FORM").getLength(), "FORM should exist"); + } + + @Test + public void testNestedForms() throws Exception { + // Given: Nested forms (invalid HTML but may appear) + final String html = "" + "
" + "" + "" + "
" + + "" + ""; + + // When: Parsing + final Document doc = parseHTML(html); + + // Then: Should parse without throwing + assertNotNull(doc, "Document should be parsed"); + assertTrue(doc.getElementsByTagName("INPUT").getLength() >= 1, "Input should exist"); + } + + @Test + public void testFormWithSelectWithoutClosingOption() throws Exception { + // Given: SELECT with unclosed OPTION tags + final String html = "" + "" + + ""; + + // When: Parsing + final Document doc = parseHTML(html); + + // Then: All options should exist + assertEquals(1, doc.getElementsByTagName("SELECT").getLength(), "SELECT should exist"); + assertEquals(3, doc.getElementsByTagName("OPTION").getLength(), "Should have 3 options"); + } + + // ========================================================================= + // List Element Quirks + // ========================================================================= + + @Test + public void testUnclosedListItems() throws Exception { + // Given: List with unclosed LI tags + final String html = "" + "" + ""; + + // When: Parsing + final Document doc = parseHTML(html); + + // Then: All list items should exist + assertEquals(1, doc.getElementsByTagName("UL").getLength(), "UL should exist"); + assertEquals(3, doc.getElementsByTagName("LI").getLength(), "Should have 3 list items"); + } + + @Test + public void testNestedListsWithUnclosedItems() throws Exception { + // Given: Nested lists with unclosed LI + final String html = "" + "" + ""; + + // When: Parsing + final Document doc = parseHTML(html); + + // Then: Should parse correctly + assertEquals(2, doc.getElementsByTagName("UL").getLength(), "Should have 2 ULs"); + assertEquals(4, doc.getElementsByTagName("LI").getLength(), "Should have 4 LIs"); + } + + @Test + public void testDefinitionListQuirks() throws Exception { + // Given: DL with unclosed DT/DD + final String html = "" + "
" + "
Term 1" + "
Definition 1" + "
Term 2" + "
Definition 2" + "
" + + ""; + + // When: Parsing + final Document doc = parseHTML(html); + + // Then: All elements should exist + assertEquals(1, doc.getElementsByTagName("DL").getLength(), "DL should exist"); + assertEquals(2, doc.getElementsByTagName("DT").getLength(), "Should have 2 DTs"); + assertEquals(2, doc.getElementsByTagName("DD").getLength(), "Should have 2 DDs"); + } + + // ========================================================================= + // Paragraph and Block Element Quirks + // ========================================================================= + + @Test + public void testParagraphAutoClose() throws Exception { + // Given: Paragraphs auto-closing when block element starts + final String html = "" + "

Para 1" + "

Para 2" + "

Para 3" + ""; + + // When: Parsing + final Document doc = parseHTML(html); + + // Then: All paragraphs should exist + assertEquals(3, doc.getElementsByTagName("P").getLength(), "Should have 3 paragraphs"); + } + + @Test + public void testDivInsideParagraph() throws Exception { + // Given: DIV inside P (should auto-close P) + final String html = "" + "

Before" + "

Block content
" + "After" + ""; + + // When: Parsing + final Document doc = parseHTML(html); + + // Then: Should have both elements + assertTrue(doc.getElementsByTagName("P").getLength() >= 1, "P should exist"); + assertEquals(1, doc.getElementsByTagName("DIV").getLength(), "DIV should exist"); + } + + @Test + public void testHeadingsAutoClose() throws Exception { + // Given: Unclosed headings + final String html = "" + "

Heading 1" + "

Heading 2" + "

Heading 3" + ""; + + // When: Parsing + final Document doc = parseHTML(html); + + // Then: All headings should exist + assertEquals(1, doc.getElementsByTagName("H1").getLength(), "H1 should exist"); + assertEquals(1, doc.getElementsByTagName("H2").getLength(), "H2 should exist"); + assertEquals(1, doc.getElementsByTagName("H3").getLength(), "H3 should exist"); + } + + // ========================================================================= + // Inline Element in Block Context + // ========================================================================= + + @Test + public void testInlineElementsInBody() throws Exception { + // Given: Inline elements directly in body + final String html = "Text bold more text italic"; + + // When: Parsing + final Document doc = parseHTML(html); + + // Then: Inline elements should be preserved + assertEquals(1, doc.getElementsByTagName("B").getLength(), "B should exist"); + assertEquals(1, doc.getElementsByTagName("I").getLength(), "I should exist"); + } + + @Test + public void testBlockInsideInline() throws Exception { + // Given: Block element inside inline (invalid) + final String html = "
Block in span
"; + + // When: Parsing + final Document doc = parseHTML(html); + + // Then: Should parse (may restructure) + assertTrue(doc.getElementsByTagName("SPAN").getLength() >= 1, "SPAN should exist"); + assertEquals(1, doc.getElementsByTagName("DIV").getLength(), "DIV should exist"); + } + + // ========================================================================= + // Script and Style in Wrong Places + // ========================================================================= + + @Test + public void testScriptInBody() throws Exception { + // Given: Script in body (valid but tested for quirks) + final String html = "" + "

Before script

" + "" + "

After script

" + + ""; + + // When: Parsing + final Document doc = parseHTML(html); + + // Then: All elements should exist + assertEquals(2, doc.getElementsByTagName("P").getLength(), "Should have 2 paragraphs"); + assertEquals(1, doc.getElementsByTagName("SCRIPT").getLength(), "SCRIPT should exist"); + } + + @Test + public void testStyleInBody() throws Exception { + // Given: Style in body (quirks mode) + final String html = "" + "" + "

Styled paragraph

" + ""; + + // When: Parsing + final Document doc = parseHTML(html); + + // Then: Should parse + assertEquals(1, doc.getElementsByTagName("STYLE").getLength(), "STYLE should exist"); + assertEquals(1, doc.getElementsByTagName("P").getLength(), "P should exist"); + } + + // ========================================================================= + // Attribute Quirks + // ========================================================================= + + @Test + public void testBooleanAttributes() throws Exception { + // Given: Boolean attributes without values + final String html = "" + "" + ""; + + // When: Parsing + final Document doc = parseHTML(html); + + // Then: Attributes should exist + final NodeList inputs = doc.getElementsByTagName("INPUT"); + assertEquals(1, inputs.getLength(), "INPUT should exist"); + + final Element input = (Element) inputs.item(0); + assertTrue(input.hasAttribute("checked") || input.hasAttribute("CHECKED"), "checked attribute should exist"); + } + + @Test + public void testUnquotedAttributeValues() throws Exception { + // Given: Unquoted attribute values + final String html = "" + "
" + "Content" + "
" + ""; + + // When: Parsing + final Document doc = parseHTML(html); + + // Then: Should parse attributes + final NodeList divs = doc.getElementsByTagName("DIV"); + assertEquals(1, divs.getLength(), "DIV should exist"); + + final Element div = (Element) divs.item(0); + assertEquals("myid", div.getAttribute("id"), "id should be parsed"); + assertEquals("myclass", div.getAttribute("class"), "class should be parsed"); + } + + @Test + public void testMixedQuoteStyles() throws Exception { + // Given: Mixed single and double quotes + final String html = "" + "Link" + ""; + + // When: Parsing + final Document doc = parseHTML(html); + + // Then: Should parse both + final NodeList links = doc.getElementsByTagName("A"); + assertEquals(1, links.getLength(), "A should exist"); + + final Element a = (Element) links.item(0); + assertEquals("http://example.com", a.getAttribute("href"), "href should be parsed"); + assertEquals("Example Site", a.getAttribute("title"), "title should be parsed"); + } + + // ========================================================================= + // DOCTYPE Variations + // ========================================================================= + + @Test + public void testHTML5Doctype() throws Exception { + // Given: HTML5 DOCTYPE + final String html = "

HTML5

"; + + // When: Parsing + final Document doc = parseHTML(html); + + // Then: Should parse + assertNotNull(doc, "Document should be parsed"); + assertEquals(1, doc.getElementsByTagName("P").getLength(), "P should exist"); + } + + @Test + public void testHTML4StrictDoctype() throws Exception { + // Given: HTML 4.01 Strict DOCTYPE + final String html = "" + "

HTML 4.01

"; + + // When: Parsing + final Document doc = parseHTML(html); + + // Then: Should parse + assertNotNull(doc, "Document should be parsed"); + } + + @Test + public void testXHTMLDoctype() throws Exception { + // Given: XHTML DOCTYPE + final String html = "" + "

XHTML

"; + + // When: Parsing + final Document doc = parseHTML(html); + + // Then: Should parse + assertNotNull(doc, "Document should be parsed"); + } + + @Test + public void testMissingDoctype() throws Exception { + // Given: No DOCTYPE (quirks mode trigger in browsers) + final String html = "

No DOCTYPE

"; + + // When: Parsing + final Document doc = parseHTML(html); + + // Then: Should parse in quirks mode + assertNotNull(doc, "Document should be parsed"); + assertEquals(1, doc.getElementsByTagName("P").getLength(), "P should exist"); + } + + // ========================================================================= + // Character Encoding Edge Cases + // ========================================================================= + + @Test + public void testMetaCharsetHTML5() throws Exception { + // Given: HTML5 charset meta + final String html = "Test"; + + // When: Parsing + final Document doc = parseHTML(html); + + // Then: Should parse + assertEquals(1, doc.getElementsByTagName("META").getLength(), "META should exist"); + } + + @Test + public void testMetaContentType() throws Exception { + // Given: Legacy content-type meta + final String html = "Test"; + + // When: Parsing + final Document doc = parseHTML(html); + + // Then: Should parse + assertEquals(1, doc.getElementsByTagName("META").getLength(), "META should exist"); + } + + // ========================================================================= + // Void Element Handling + // ========================================================================= + + @Test + public void testVoidElementsWithClosingTags() throws Exception { + // Given: Void elements with explicit closing tags (allowed in HTML) + final String html = "" + "

" + "
" + "" + ""; + + // When: Parsing + final Document doc = parseHTML(html); + + // Then: Void elements should exist + assertTrue(doc.getElementsByTagName("BR").getLength() >= 1, "BR should exist"); + assertTrue(doc.getElementsByTagName("HR").getLength() >= 1, "HR should exist"); + assertTrue(doc.getElementsByTagName("IMG").getLength() >= 1, "IMG should exist"); + } + + @Test + public void testVoidElementsWithSlash() throws Exception { + // Given: Void elements with trailing slash (XHTML style) + final String html = "" + "
" + "
" + "" + "" + ""; + + // When: Parsing + final Document doc = parseHTML(html); + + // Then: Should parse correctly + assertTrue(doc.getElementsByTagName("BR").getLength() >= 1, "BR should exist"); + assertTrue(doc.getElementsByTagName("IMG").getLength() >= 1, "IMG should exist"); + } + + // ========================================================================= + // Real-World HTML Patterns + // ========================================================================= + + @Test + public void testTypicalWebPageStructure() throws Exception { + // Given: Typical web page structure + final String html = "" + "" + "" + "" + + "" + "Test Page" + + "" + "" + "" + "" + "
" + + "" + "
" + "
" + "
" + + "

Article Title

" + "

Article content.

" + "
" + "
" + "
" + + "

© 2024

" + "
" + "" + ""; + + // When: Parsing + final Document doc = parseHTML(html); + + // Then: All major sections should exist + assertEquals(1, doc.getElementsByTagName("HEADER").getLength(), "HEADER should exist"); + assertEquals(1, doc.getElementsByTagName("MAIN").getLength(), "MAIN should exist"); + assertEquals(1, doc.getElementsByTagName("FOOTER").getLength(), "FOOTER should exist"); + assertEquals(1, doc.getElementsByTagName("ARTICLE").getLength(), "ARTICLE should exist"); + } + + @Test + public void testTypicalFormStructure() throws Exception { + // Given: Typical form structure + final String html = "" + "
" + "
" + + "User Information" + "" + + "" + "" + + "" + "
" + "" + + "
" + ""; + + // When: Parsing + final Document doc = parseHTML(html); + + // Then: Form elements should exist + assertEquals(1, doc.getElementsByTagName("FORM").getLength(), "FORM should exist"); + assertEquals(1, doc.getElementsByTagName("FIELDSET").getLength(), "FIELDSET should exist"); + assertEquals(1, doc.getElementsByTagName("LEGEND").getLength(), "LEGEND should exist"); + assertEquals(2, doc.getElementsByTagName("LABEL").getLength(), "Should have 2 LABELs"); + assertEquals(2, doc.getElementsByTagName("INPUT").getLength(), "Should have 2 INPUTs"); + } + +} // class BrowserQuirksIntegrationTest diff --git a/src/test/java/org/codelibs/nekohtml/parsers/StrictModeTest.java b/src/test/java/org/codelibs/nekohtml/parsers/StrictModeTest.java new file mode 100644 index 0000000..9c8c453 --- /dev/null +++ b/src/test/java/org/codelibs/nekohtml/parsers/StrictModeTest.java @@ -0,0 +1,464 @@ +/* + * Copyright 2012-2025 CodeLibs Project and the Others. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language + * governing permissions and limitations under the License. 
+ */ +package org.codelibs.nekohtml.parsers; + +import static org.junit.jupiter.api.Assertions.*; + +import java.io.StringReader; +import java.util.logging.Handler; +import java.util.logging.Level; +import java.util.logging.LogRecord; +import java.util.logging.Logger; + +import javax.xml.parsers.DocumentBuilderFactory; + +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.w3c.dom.Document; +import org.xml.sax.InputSource; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.DefaultHandler; + +/** + * Tests for the nekohtml.dom.strict system property behavior. + * Tests the three modes: NOT_SET (default), FALSE (explicit lenient), TRUE (strict). + * + * @author CodeLibs Project + */ +public class StrictModeTest { + + private static final String PROPERTY_DOM_STRICT = "nekohtml.dom.strict"; + + private String originalPropertyValue; + private TestLogHandler logHandler; + private Logger saxToDomLogger; + + @BeforeEach + public void setUp() { + // Save original property value + originalPropertyValue = System.getProperty(PROPERTY_DOM_STRICT); + + // Set up log capture for SAXToDOMHandler + saxToDomLogger = Logger.getLogger("org.codelibs.nekohtml.parsers.SAXToDOMHandler"); + logHandler = new TestLogHandler(); + saxToDomLogger.addHandler(logHandler); + saxToDomLogger.setLevel(Level.ALL); + } + + @AfterEach + public void tearDown() { + // Restore original property value + if (originalPropertyValue != null) { + System.setProperty(PROPERTY_DOM_STRICT, originalPropertyValue); + } else { + System.clearProperty(PROPERTY_DOM_STRICT); + } + + // Remove log handler + if (saxToDomLogger != null && logHandler != null) { + saxToDomLogger.removeHandler(logHandler); + } + } + + // ========================================================================= + // Default Mode Tests (Property NOT_SET) + // ========================================================================= + + @Test + public void 
testDefaultModeParseNormalHtml() throws Exception { + System.clearProperty(PROPERTY_DOM_STRICT); + + final String html = "Test

Hello

"; + + final DOMParser parser = new DOMParser(); + parser.parse(new InputSource(new StringReader(html))); + + final Document doc = parser.getDocument(); + assertNotNull(doc, "Document should be parsed successfully in default mode"); + assertNotNull(doc.getDocumentElement(), "Root element should exist"); + } + + @Test + public void testDefaultModeParsesMalformedHtml() throws Exception { + System.clearProperty(PROPERTY_DOM_STRICT); + + // Malformed HTML with unclosed tags + final String html = "

Unclosed paragraph

Nested div"; + + final DOMParser parser = new DOMParser(); + + // Should not throw in default mode + assertDoesNotThrow(() -> parser.parse(new InputSource(new StringReader(html)))); + + final Document doc = parser.getDocument(); + assertNotNull(doc, "Document should be parsed in default mode"); + } + + @Test + public void testDefaultModeMismatchedTags() throws Exception { + System.clearProperty(PROPERTY_DOM_STRICT); + + // Tags closed in wrong order + final String html = "text"; + + final DOMParser parser = new DOMParser(); + assertDoesNotThrow(() -> parser.parse(new InputSource(new StringReader(html)))); + + final Document doc = parser.getDocument(); + assertNotNull(doc, "Document should be parsed in default mode with mismatched tags"); + } + + // ========================================================================= + // Lenient Mode Tests (Property set to FALSE) + // ========================================================================= + + @Test + public void testLenientModeParseNormalHtml() throws Exception { + System.setProperty(PROPERTY_DOM_STRICT, "false"); + + final String html = "Test

Hello

"; + + final DOMParser parser = new DOMParser(); + parser.parse(new InputSource(new StringReader(html))); + + final Document doc = parser.getDocument(); + assertNotNull(doc, "Document should be parsed successfully in lenient mode"); + } + + @Test + public void testLenientModeLogsWarnings() throws Exception { + System.setProperty(PROPERTY_DOM_STRICT, "false"); + + // HTML that might trigger warnings (mismatched tags handled by tag balancer) + final String html = "

"; + + final DOMParser parser = new DOMParser(); + parser.parse(new InputSource(new StringReader(html))); + + final Document doc = parser.getDocument(); + assertNotNull(doc, "Document should be parsed in lenient mode"); + } + + @Test + public void testLenientModeParsesEmptyHtml() throws Exception { + System.setProperty(PROPERTY_DOM_STRICT, "false"); + + final String html = ""; + + final DOMParser parser = new DOMParser(); + parser.parse(new InputSource(new StringReader(html))); + + final Document doc = parser.getDocument(); + assertNotNull(doc, "Document should be created even for empty HTML in lenient mode"); + } + + @Test + public void testLenientModeExplicitFalse() throws Exception { + System.setProperty(PROPERTY_DOM_STRICT, "FALSE"); + + final String html = "

Test

"; + + final DOMParser parser = new DOMParser(); + parser.parse(new InputSource(new StringReader(html))); + + final Document doc = parser.getDocument(); + assertNotNull(doc, "Document should be parsed with uppercase FALSE"); + } + + // ========================================================================= + // Strict Mode Tests (Property set to TRUE) + // ========================================================================= + + @Test + public void testStrictModeParseNormalHtml() throws Exception { + System.setProperty(PROPERTY_DOM_STRICT, "true"); + + final String html = "Test

Hello

"; + + final DOMParser parser = new DOMParser(); + parser.parse(new InputSource(new StringReader(html))); + + final Document doc = parser.getDocument(); + assertNotNull(doc, "Document should be parsed successfully in strict mode for valid HTML"); + } + + @Test + public void testStrictModeExplicitTrue() throws Exception { + System.setProperty(PROPERTY_DOM_STRICT, "TRUE"); + + final String html = "

Test

"; + + final DOMParser parser = new DOMParser(); + parser.parse(new InputSource(new StringReader(html))); + + final Document doc = parser.getDocument(); + assertNotNull(doc, "Document should be parsed with uppercase TRUE"); + } + + @Test + public void testStrictModeWithWellFormedHtml() throws Exception { + System.setProperty(PROPERTY_DOM_STRICT, "true"); + + // Properly nested HTML + final String html = "Title

Paragraph

"; + + final DOMParser parser = new DOMParser(); + assertDoesNotThrow(() -> parser.parse(new InputSource(new StringReader(html)))); + } + + // ========================================================================= + // Direct SAXToDOMHandler Tests + // ========================================================================= + + @Test + public void testHandlerWithoutStartDocument() throws Exception { + System.setProperty(PROPERTY_DOM_STRICT, "false"); + + // Create handler directly to test edge cases + final javax.xml.parsers.DocumentBuilder builder = DocumentBuilderFactory.newInstance().newDocumentBuilder(); + final SAXToDOMHandler handler = new SAXToDOMHandler(builder); + + // Call startElement without startDocument + assertDoesNotThrow(() -> handler.startElement("", "div", "DIV", new org.xml.sax.helpers.AttributesImpl()), + "Lenient mode should not throw when startElement called before startDocument"); + } + + @Test + public void testHandlerWithoutStartDocumentStrict() throws Exception { + System.setProperty(PROPERTY_DOM_STRICT, "true"); + + final javax.xml.parsers.DocumentBuilder builder = DocumentBuilderFactory.newInstance().newDocumentBuilder(); + final SAXToDOMHandler handler = new SAXToDOMHandler(builder); + + // Call startElement without startDocument - should throw in strict mode + assertThrows(SAXException.class, () -> handler.startElement("", "div", "DIV", new org.xml.sax.helpers.AttributesImpl()), + "Strict mode should throw when startElement called before startDocument"); + } + + @Test + public void testHandlerMismatchedEndTag() throws Exception { + System.setProperty(PROPERTY_DOM_STRICT, "false"); + + final javax.xml.parsers.DocumentBuilder builder = DocumentBuilderFactory.newInstance().newDocumentBuilder(); + final SAXToDOMHandler handler = new SAXToDOMHandler(builder); + + handler.startDocument(); + handler.startElement("", "div", "DIV", new org.xml.sax.helpers.AttributesImpl()); + + // End with wrong tag - should not throw in lenient mode + 
assertDoesNotThrow(() -> handler.endElement("", "span", "SPAN"),
+                "Lenient mode should handle mismatched end tag");
+    }
+
+    /**
+     * Lenient mode: an end tag delivered after startDocument()/endDocument(),
+     * with no element ever opened, must not throw.
+     */
+    @Test
+    public void testHandlerEndTagEmptyStack() throws Exception {
+        System.setProperty(PROPERTY_DOM_STRICT, "false");
+
+        final javax.xml.parsers.DocumentBuilder builder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
+        final SAXToDOMHandler handler = new SAXToDOMHandler(builder);
+
+        handler.startDocument();
+        handler.endDocument(); // Clear stack
+
+        // End element with empty stack - should not throw
+        assertDoesNotThrow(() -> handler.endElement("", "div", "DIV"),
+                "Lenient mode should handle end tag with empty stack");
+    }
+
+    /**
+     * Property unset: character data delivered before startDocument() must not throw.
+     */
+    @Test
+    public void testHandlerCharactersBeforeStartDocument() throws Exception {
+        System.clearProperty(PROPERTY_DOM_STRICT);
+
+        final javax.xml.parsers.DocumentBuilder builder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
+        final SAXToDOMHandler handler = new SAXToDOMHandler(builder);
+
+        // Characters before startDocument - should not throw
+        assertDoesNotThrow(() -> handler.characters("test".toCharArray(), 0, 4),
+                "Should handle characters before startDocument");
+    }
+
+    /**
+     * Property unset: a comment event inside an open HTML element must not throw.
+     */
+    @Test
+    public void testHandlerCommentInDocument() throws Exception {
+        System.clearProperty(PROPERTY_DOM_STRICT);
+
+        final javax.xml.parsers.DocumentBuilder builder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
+        final SAXToDOMHandler handler = new SAXToDOMHandler(builder);
+
+        handler.startDocument();
+        handler.startElement("", "html", "HTML", new org.xml.sax.helpers.AttributesImpl());
+
+        assertDoesNotThrow(() -> handler.comment("This is a comment".toCharArray(), 0, 17),
+                "Should handle comment in document");
+    }
+
+    /**
+     * Feeds a balanced HTML/BODY/DIV event sequence with text content to the
+     * handler and checks that the resulting document's root element is HTML.
+     */
+    @Test
+    public void testHandlerNestedElements() throws Exception {
+        System.clearProperty(PROPERTY_DOM_STRICT);
+
+        final javax.xml.parsers.DocumentBuilder builder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
+        final SAXToDOMHandler handler = new SAXToDOMHandler(builder);
+
+        handler.startDocument();
+        handler.startElement("", "html", "HTML", new org.xml.sax.helpers.AttributesImpl());
+        handler.startElement("", "body", "BODY", new org.xml.sax.helpers.AttributesImpl());
+        handler.startElement("", "div", "DIV", new org.xml.sax.helpers.AttributesImpl());
+        handler.characters("Hello World".toCharArray(), 0, 11);
+        handler.endElement("", "div", "DIV");
+        handler.endElement("", "body", "BODY");
+        handler.endElement("", "html", "HTML");
+        handler.endDocument();
+
+        final Document doc = handler.getDocument();
+        assertNotNull(doc, "Document should be built");
+        assertEquals("HTML", doc.getDocumentElement().getNodeName(), "Root should be HTML");
+    }
+
+    // =========================================================================
+    // Skip Depth Tests
+    // =========================================================================
+
+    /**
+     * Lenient mode: nested element and character events delivered without a
+     * preceding startDocument() must all complete without throwing.
+     */
+    @Test
+    public void testSkipDepthInLenientMode() throws Exception {
+        System.setProperty(PROPERTY_DOM_STRICT, "false");
+
+        final javax.xml.parsers.DocumentBuilder builder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
+        final SAXToDOMHandler handler = new SAXToDOMHandler(builder);
+
+        // Don't call startDocument - this triggers skip mode.
+        // The events run inside the assertion so that it actually guards them;
+        // an empty lambda after the calls would assert nothing.
+        assertDoesNotThrow(() -> {
+            handler.startElement("", "div", "DIV", new org.xml.sax.helpers.AttributesImpl());
+            handler.startElement("", "p", "P", new org.xml.sax.helpers.AttributesImpl());
+            handler.characters("Skipped content".toCharArray(), 0, 15);
+            handler.endElement("", "p", "P");
+            handler.endElement("", "div", "DIV");
+        }, "Skip depth should handle nested skipped elements");
+    }
+
+    // =========================================================================
+    // Integration Tests with DOMParser
+    // =========================================================================
+
+    @Test
+    public void testDOMParserStrictModeWellFormed() throws Exception {
+        System.setProperty(PROPERTY_DOM_STRICT,
"true"); + + final String html = "" + "Well Formed" + "" + + "
" + "

Hello World

" + "
" + "" + ""; + + final DOMParser parser = new DOMParser(); + parser.parse(new InputSource(new StringReader(html))); + + final Document doc = parser.getDocument(); + assertNotNull(doc, "Well-formed document should parse in strict mode"); + assertEquals("HTML", doc.getDocumentElement().getNodeName()); + } + + @Test + public void testDOMParserLenientModeWithErrors() throws Exception { + System.setProperty(PROPERTY_DOM_STRICT, "false"); + + // Intentionally bad HTML + final String html = "
"; + + final DOMParser parser = new DOMParser(); + + // Should not throw in lenient mode + assertDoesNotThrow(() -> parser.parse(new InputSource(new StringReader(html)))); + } + + // ========================================================================= + // Edge Cases + // ========================================================================= + + @Test + public void testEmptyPropertyValue() throws Exception { + System.setProperty(PROPERTY_DOM_STRICT, ""); + + final String html = "

Test

"; + + final DOMParser parser = new DOMParser(); + parser.parse(new InputSource(new StringReader(html))); + + final Document doc = parser.getDocument(); + assertNotNull(doc, "Empty property value should be treated as FALSE/lenient"); + } + + @Test + public void testInvalidPropertyValue() throws Exception { + System.setProperty(PROPERTY_DOM_STRICT, "maybe"); + + final String html = "

Test

";
+
+        final DOMParser parser = new DOMParser();
+        parser.parse(new InputSource(new StringReader(html)));
+
+        final Document doc = parser.getDocument();
+        assertNotNull(doc, "Invalid property value should be treated as FALSE/lenient");
+    }
+
+    /**
+     * Parses once with the property set to "false" and once, using a fresh
+     * parser, with it set to "true"; both parses must yield a document.
+     */
+    @Test
+    public void testPropertyChangeBetweenParses() throws Exception {
+        // First parse in lenient mode
+        System.setProperty(PROPERTY_DOM_STRICT, "false");
+        final DOMParser parser1 = new DOMParser();
+        parser1.parse(new InputSource(new StringReader("")));
+        assertNotNull(parser1.getDocument());
+
+        // Switch to strict mode
+        System.setProperty(PROPERTY_DOM_STRICT, "true");
+        final DOMParser parser2 = new DOMParser();
+        parser2.parse(new InputSource(new StringReader("")));
+        assertNotNull(parser2.getDocument());
+    }
+
+    // =========================================================================
+    // Helper Classes
+    // =========================================================================
+
+    /**
+     * Test handler to capture log messages.
+     * Every non-null record passed to {@link #publish(LogRecord)} is kept in
+     * memory so tests can query what was logged.
+     */
+    private static class TestLogHandler extends Handler {
+        // Typed list: the lambdas below call LogRecord methods on the elements,
+        // which does not compile against a raw List
+        private final java.util.List<LogRecord> records = new java.util.ArrayList<>();
+
+        @Override
+        public void publish(LogRecord record) {
+            // Handler.publish specifies that a null record is silently ignored
+            if (record != null) {
+                records.add(record);
+            }
+        }
+
+        @Override
+        public void flush() {
+            // Nothing is buffered outside the in-memory list
+        }
+
+        @Override
+        public void close() throws SecurityException {
+            // No resources to release
+        }
+
+        /**
+         * @param substring text to look for
+         * @return true if a captured WARNING record has a non-null message containing {@code substring}
+         */
+        public boolean hasWarningContaining(String substring) {
+            return records.stream()
+                    .filter(r -> Level.WARNING.equals(r.getLevel()))
+                    .anyMatch(r -> r.getMessage() != null && r.getMessage().contains(substring));
+        }
+
+        /**
+         * @return true if at least one record was captured
+         */
+        public boolean hasAnyMessage() {
+            return !records.isEmpty();
+        }
+    }
+
+} // class StrictModeTest
diff --git a/src/test/java/org/codelibs/nekohtml/sax/AdoptionAgencyAlgorithmExtendedTest.java b/src/test/java/org/codelibs/nekohtml/sax/AdoptionAgencyAlgorithmExtendedTest.java
index 61d8a62..71125db 100644
--- a/src/test/java/org/codelibs/nekohtml/sax/AdoptionAgencyAlgorithmExtendedTest.java
+++
b/src/test/java/org/codelibs/nekohtml/sax/AdoptionAgencyAlgorithmExtendedTest.java @@ -570,4 +570,420 @@ public void testFormattingElementsAcrossNestedBlocks() throws Exception { assertTrue(doc.getElementsByTagName("B").getLength() >= 1, "Should have B elements"); assertEquals(3, doc.getElementsByTagName("DIV").getLength(), "Should have 3 DIVs"); } + + // ======================================================================== + // AAA Outer Loop Limit Tests (HTML5 spec: max 8 iterations) + // ======================================================================== + + @Test + public void testAAAOuterLoopWithManyFormattingElements() throws Exception { + // Given: More than 8 nested formatting elements to test outer loop limit + final String html = "" + + "" + + "Text" + + "" // Close B early to trigger AAA + + "" + + ""; + + // When: Parsing + final Document doc = parseHTML(html); + + // Then: Should handle without infinite loop + assertNotNull(doc, "Document should be parsed without infinite loop"); + } + + @Test + public void testAAAWithExtremeNesting() throws Exception { + // Given: Very deeply nested formatting elements + final StringBuilder html = new StringBuilder(""); + final int depth = 20; // Well beyond the 8 iteration limit + + for (int i = 0; i < depth; i++) { + html.append(""); + } + html.append("Text"); + // Close only some, leaving mismatched structure + for (int i = 0; i < depth / 2; i++) { + html.append(""); + } + html.append(""); + + // When: Parsing + final Document doc = parseHTML(html.toString()); + + // Then: Should handle extreme nesting gracefully + assertNotNull(doc, "Document should be parsed with extreme nesting"); + } + + @Test + public void testAAAWithAlternatingFormattingElements() throws Exception { + // Given: Alternating formatting elements beyond loop limit + final String html = "" + + "" + + "Deep text" + + "" // Early close + + "" + + ""; + + // When: Parsing + final Document doc = parseHTML(html); + + // Then: Should handle 
alternating pattern + assertNotNull(doc, "Document should be parsed with alternating patterns"); + } + + // ======================================================================== + // Formatting Marker Tests (Table/Caption/TD/TH context boundaries) + // ======================================================================== + + @Test + public void testFormattingMarkerInTable() throws Exception { + // Given: Formatting elements crossing table cell boundaries + final String html = "" + + "Before table" + + "" + + "" + + "" + + "
Cell 1 Italic in cellCell 2
" + + "After table
" + + ""; + + // When: Parsing + final Document doc = parseHTML(html); + + // Then: Table should act as a formatting marker boundary + assertNotNull(doc, "Document should be parsed"); + assertEquals(1, doc.getElementsByTagName("TABLE").getLength(), "Should have TABLE"); + assertEquals(2, doc.getElementsByTagName("TD").getLength(), "Should have 2 TDs"); + } + + @Test + public void testFormattingMarkerInCaption() throws Exception { + // Given: Formatting elements in table caption + final String html = "" + + "" + + "" + + "" + + "
Bold caption
Block in caption
continues
Cell
" + + ""; + + // When: Parsing + final Document doc = parseHTML(html); + + // Then: Should handle caption as marker boundary + assertNotNull(doc, "Document should be parsed"); + assertEquals(1, doc.getElementsByTagName("CAPTION").getLength(), "Should have CAPTION"); + } + + @Test + public void testFormattingMarkerInTH() throws Exception { + // Given: Formatting elements in table header + final String html = "" + + "" + + "" + + "" + + "
Header

Para in header

continues
Cell
" + + ""; + + // When: Parsing + final Document doc = parseHTML(html); + + // Then: Should handle TH as marker boundary + assertNotNull(doc, "Document should be parsed"); + assertEquals(1, doc.getElementsByTagName("TH").getLength(), "Should have TH"); + } + + @Test + public void testFormattingAcrossMultipleTableCells() throws Exception { + // Given: Formatting spanning multiple cells (invalid but should be handled) + final String html = "" + + "" + + "" + + "" + + "" + + "" + + "" + + "
Start boldMiddle cellThird cell
" + + ""; + + // When: Parsing + final Document doc = parseHTML(html); + + // Then: Should handle cross-cell formatting + assertNotNull(doc, "Document should be parsed"); + assertEquals(3, doc.getElementsByTagName("TD").getLength(), "Should have 3 TDs"); + } + + // ======================================================================== + // AAA with Select Elements + // ======================================================================== + + @Test + public void testFormattingInSelectOption() throws Exception { + // Given: Formatting in select option (should be stripped) + final String html = "" + + "" + + ""; + + // When: Parsing + final Document doc = parseHTML(html); + + // Then: Should parse select with options + assertNotNull(doc, "Document should be parsed"); + assertEquals(1, doc.getElementsByTagName("SELECT").getLength(), "Should have SELECT"); + assertEquals(2, doc.getElementsByTagName("OPTION").getLength(), "Should have 2 OPTIONs"); + } + + @Test + public void testFormattingSpanningSelect() throws Exception { + // Given: Formatting spanning across select (invalid) + final String html = "" + + "Before select" + + "" + + "After select" + + ""; + + // When: Parsing + final Document doc = parseHTML(html); + + // Then: Should handle select boundary + assertNotNull(doc, "Document should be parsed"); + assertEquals(1, doc.getElementsByTagName("SELECT").getLength(), "Should have SELECT"); + } + + // ======================================================================== + // AAA with Button Elements + // ======================================================================== + + @Test + public void testFormattingInButton() throws Exception { + // Given: Formatting inside button + final String html = "" + + "" + + ""; + + // When: Parsing + final Document doc = parseHTML(html); + + // Then: Should handle button content + assertNotNull(doc, "Document should be parsed"); + assertEquals(1, doc.getElementsByTagName("BUTTON").getLength(), "Should have BUTTON"); + 
} + + @Test + public void testFormattingSpanningButton() throws Exception { + // Given: Formatting spanning across button + final String html = "" + + "Before button" + + "" + + "After button" + + ""; + + // When: Parsing + final Document doc = parseHTML(html); + + // Then: Should handle button boundary + assertNotNull(doc, "Document should be parsed"); + assertEquals(1, doc.getElementsByTagName("BUTTON").getLength(), "Should have BUTTON"); + } + + // ======================================================================== + // AAA with Applet/Object/Marquee (obsolete marker elements) + // ======================================================================== + + @Test + public void testFormattingInObject() throws Exception { + // Given: Formatting inside object element + final String html = "" + + "Fallback
Block
content
" + + ""; + + // When: Parsing + final Document doc = parseHTML(html); + + // Then: Should handle object content + assertNotNull(doc, "Document should be parsed"); + assertEquals(1, doc.getElementsByTagName("OBJECT").getLength(), "Should have OBJECT"); + } + + @Test + public void testFormattingInMarquee() throws Exception { + // Given: Formatting in marquee (deprecated but may appear) + final String html = "" + + "Scrolling
Block
text
" + + ""; + + // When: Parsing + final Document doc = parseHTML(html); + + // Then: Should handle marquee content + assertNotNull(doc, "Document should be parsed"); + assertEquals(1, doc.getElementsByTagName("MARQUEE").getLength(), "Should have MARQUEE"); + } + + // ======================================================================== + // AAA with Special Scope Elements + // ======================================================================== + + @Test + public void testFormattingWithFormElement() throws Exception { + // Given: Formatting crossing form boundary + final String html = "" + + "Before form" + + "
Form content
" + + "After form
" + + ""; + + // When: Parsing + final Document doc = parseHTML(html); + + // Then: Should handle form boundary + assertNotNull(doc, "Document should be parsed"); + assertEquals(1, doc.getElementsByTagName("FORM").getLength(), "Should have FORM"); + } + + @Test + public void testFormattingWithFieldset() throws Exception { + // Given: Formatting in fieldset with legend + final String html = "" + + "
" + + "Bold legend
Block
" + + "" + + "
" + + ""; + + // When: Parsing + final Document doc = parseHTML(html); + + // Then: Should handle fieldset content + assertNotNull(doc, "Document should be parsed"); + assertEquals(1, doc.getElementsByTagName("FIELDSET").getLength(), "Should have FIELDSET"); + assertEquals(1, doc.getElementsByTagName("LEGEND").getLength(), "Should have LEGEND"); + } + + // ======================================================================== + // AAA Inner Loop Limit Tests + // ======================================================================== + + @Test + public void testAAAInnerLoopWithManyActiveElements() throws Exception { + // Given: Many active formatting elements in list + final StringBuilder html = new StringBuilder(""); + // Create many formatting elements + for (int i = 0; i < 10; i++) { + html.append("b").append(i).append(" "); + } + // Add a block to trigger reconstruction + html.append("
Block
"); + // Close in reverse + for (int i = 0; i < 10; i++) { + html.append("
"); + } + html.append(""); + + // When: Parsing + final Document doc = parseHTML(html.toString()); + + // Then: Should handle many active elements + assertNotNull(doc, "Document should be parsed"); + assertEquals(1, doc.getElementsByTagName("DIV").getLength(), "Should have DIV"); + } + + // ======================================================================== + // AAA with Furthest Block Variations + // ======================================================================== + + @Test + public void testAAAWithMultipleFurthestBlockCandidates() throws Exception { + // Given: Multiple potential furthest blocks + final String html = "" + + "Bold

Para 1

Div

Para 2

continues
" + + ""; + + // When: Parsing + final Document doc = parseHTML(html); + + // Then: Should handle multiple blocks + assertNotNull(doc, "Document should be parsed"); + assertEquals(2, doc.getElementsByTagName("P").getLength(), "Should have 2 P elements"); + assertEquals(1, doc.getElementsByTagName("DIV").getLength(), "Should have DIV"); + } + + @Test + public void testAAAWithNoFurthestBlock() throws Exception { + // Given: Formatting element closed when no block is present + final String html = "Text"; + + // When: Parsing + final Document doc = parseHTML(html); + + // Then: Should close normally (no AAA needed) + assertNotNull(doc, "Document should be parsed"); + assertEquals(1, doc.getElementsByTagName("B").getLength(), "Should have B"); + assertEquals(1, doc.getElementsByTagName("I").getLength(), "Should have I"); + assertEquals(1, doc.getElementsByTagName("U").getLength(), "Should have U"); + } + + @Test + public void testAAAFurthestBlockIsImmediateChild() throws Exception { + // Given: Furthest block is immediate child of formatting element + final String html = "
Direct block child
"; + + // When: Parsing + final Document doc = parseHTML(html); + + // Then: Should handle direct block child + assertNotNull(doc, "Document should be parsed"); + assertEquals(1, doc.getElementsByTagName("DIV").getLength(), "Should have DIV"); + } + + // ======================================================================== + // Stress Tests for AAA + // ======================================================================== + + @Test + public void testAAAWithManyInterleavedElements() throws Exception { + // Given: Many interleaved formatting and block elements + final StringBuilder html = new StringBuilder(""); + for (int i = 0; i < 50; i++) { + if (i % 2 == 0) { + html.append("b").append(i); + } else { + html.append("
d").append(i).append("
"); + } + } + for (int i = 0; i < 25; i++) { + html.append("
"); + } + html.append(""); + + // When: Parsing + final Document doc = parseHTML(html.toString()); + + // Then: Should handle interleaved pattern + assertNotNull(doc, "Document should be parsed with interleaved elements"); + } + + @Test + public void testAAAWithVeryLongFormattingElementList() throws Exception { + // Given: Very long active formatting element list + final StringBuilder html = new StringBuilder(""); + for (int i = 0; i < 100; i++) { + html.append(""); + } + html.append("
Block
"); + for (int i = 0; i < 100; i++) { + html.append("
"); + } + html.append(""); + + // When: Parsing + final Document doc = parseHTML(html.toString()); + + // Then: Should handle long list without stack overflow + assertNotNull(doc, "Document should be parsed without stack overflow"); + } } diff --git a/src/test/java/org/codelibs/nekohtml/sax/HTMLSAXConfigurationTest.java b/src/test/java/org/codelibs/nekohtml/sax/HTMLSAXConfigurationTest.java index 1affdb7..e17eb20 100644 --- a/src/test/java/org/codelibs/nekohtml/sax/HTMLSAXConfigurationTest.java +++ b/src/test/java/org/codelibs/nekohtml/sax/HTMLSAXConfigurationTest.java @@ -15,9 +15,12 @@ */ package org.codelibs.nekohtml.sax; +import static org.junit.jupiter.api.Assertions.assertDoesNotThrow; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertSame; +import static org.junit.jupiter.api.Assertions.assertThrows; import static org.junit.jupiter.api.Assertions.assertTrue; import java.io.StringReader; @@ -178,4 +181,430 @@ public void comment(final char[] ch, final int start, final int length) throws S } } + // ========================================================================= + // Feature Combination Tests + // ========================================================================= + + @Test + public void testAugmentationsFeature() throws Exception { + final HTMLSAXConfiguration config = new HTMLSAXConfiguration(); + + // Default should be false + assertFalse(config.getFeature(HTMLSAXConfiguration.AUGMENTATIONS), "Augmentations should be disabled by default"); + + // Enable augmentations + config.setFeature(HTMLSAXConfiguration.AUGMENTATIONS, true); + assertTrue(config.getFeature(HTMLSAXConfiguration.AUGMENTATIONS), "Should be able to enable augmentations"); + } + + @Test + public void testReportErrorsFeature() throws Exception { + final HTMLSAXConfiguration config = new 
HTMLSAXConfiguration(); + + // Default should be false + assertFalse(config.getFeature(HTMLSAXConfiguration.REPORT_ERRORS), "Report errors should be disabled by default"); + + // Enable error reporting + config.setFeature(HTMLSAXConfiguration.REPORT_ERRORS, true); + assertTrue(config.getFeature(HTMLSAXConfiguration.REPORT_ERRORS), "Should be able to enable error reporting"); + } + + @Test + public void testSimpleErrorFormatFeature() throws Exception { + final HTMLSAXConfiguration config = new HTMLSAXConfiguration(); + + // Default should be false + assertFalse(config.getFeature(HTMLSAXConfiguration.SIMPLE_ERROR_FORMAT), + "Simple error format should be disabled by default"); + + // Enable simple error format + config.setFeature(HTMLSAXConfiguration.SIMPLE_ERROR_FORMAT, true); + assertTrue(config.getFeature(HTMLSAXConfiguration.SIMPLE_ERROR_FORMAT), + "Should be able to enable simple error format"); + } + + @Test + public void testHTML5ModeFeature() throws Exception { + final HTMLSAXConfiguration config = new HTMLSAXConfiguration(); + + // Default should be false + assertFalse(config.getFeature(HTMLSAXConfiguration.HTML5_MODE), "HTML5 mode should be disabled by default"); + + // Enable HTML5 mode + config.setFeature(HTMLSAXConfiguration.HTML5_MODE, true); + assertTrue(config.getFeature(HTMLSAXConfiguration.HTML5_MODE), "Should be able to enable HTML5 mode"); + } + + @Test + public void testAllFeaturesEnabledTogether() throws Exception { + final HTMLSAXConfiguration config = new HTMLSAXConfiguration(); + + // Enable all features + config.setFeature(HTMLSAXConfiguration.NAMESPACES, true); + config.setFeature(HTMLSAXConfiguration.AUGMENTATIONS, true); + config.setFeature(HTMLSAXConfiguration.REPORT_ERRORS, true); + config.setFeature(HTMLSAXConfiguration.SIMPLE_ERROR_FORMAT, true); + config.setFeature(HTMLSAXConfiguration.BALANCE_TAGS, true); + config.setFeature(HTMLSAXConfiguration.HTML5_MODE, true); + + // Verify all are enabled + 
assertTrue(config.getFeature(HTMLSAXConfiguration.NAMESPACES), "Namespaces should be enabled"); + assertTrue(config.getFeature(HTMLSAXConfiguration.AUGMENTATIONS), "Augmentations should be enabled"); + assertTrue(config.getFeature(HTMLSAXConfiguration.REPORT_ERRORS), "Report errors should be enabled"); + assertTrue(config.getFeature(HTMLSAXConfiguration.SIMPLE_ERROR_FORMAT), "Simple error format should be enabled"); + assertTrue(config.getFeature(HTMLSAXConfiguration.BALANCE_TAGS), "Balance tags should be enabled"); + assertTrue(config.getFeature(HTMLSAXConfiguration.HTML5_MODE), "HTML5 mode should be enabled"); + + // Verify parsing still works + final TestHandler handler = new TestHandler(); + config.setContentHandler(handler); + config.parse(new InputSource(new StringReader("

Test

"))); + + assertTrue(handler.events.size() > 0, "Should parse with all features enabled"); + } + + @Test + public void testAllFeaturesDisabled() throws Exception { + final HTMLSAXConfiguration config = new HTMLSAXConfiguration(); + + // Disable all features + config.setFeature(HTMLSAXConfiguration.NAMESPACES, false); + config.setFeature(HTMLSAXConfiguration.AUGMENTATIONS, false); + config.setFeature(HTMLSAXConfiguration.REPORT_ERRORS, false); + config.setFeature(HTMLSAXConfiguration.SIMPLE_ERROR_FORMAT, false); + config.setFeature(HTMLSAXConfiguration.BALANCE_TAGS, false); + config.setFeature(HTMLSAXConfiguration.HTML5_MODE, false); + + // Verify parsing still works + final TestHandler handler = new TestHandler(); + config.setContentHandler(handler); + config.parse(new InputSource(new StringReader("

Test

"))); + + assertTrue(handler.events.size() > 0, "Should parse with all features disabled"); + } + + // ========================================================================= + // Property Tests + // ========================================================================= + + @Test + public void testNamesElemsProperty() throws Exception { + final HTMLSAXConfiguration config = new HTMLSAXConfiguration(); + + // Default should be "upper" + assertEquals("upper", config.getProperty(HTMLSAXConfiguration.NAMES_ELEMS), + "Element names should default to upper case"); + + // Change to lower + config.setProperty(HTMLSAXConfiguration.NAMES_ELEMS, "lower"); + assertEquals("lower", config.getProperty(HTMLSAXConfiguration.NAMES_ELEMS), + "Should be able to set element names to lower case"); + } + + @Test + public void testNamesAttrsProperty() throws Exception { + final HTMLSAXConfiguration config = new HTMLSAXConfiguration(); + + // Default should be "lower" + assertEquals("lower", config.getProperty(HTMLSAXConfiguration.NAMES_ATTRS), + "Attribute names should default to lower case"); + + // Change to upper + config.setProperty(HTMLSAXConfiguration.NAMES_ATTRS, "upper"); + assertEquals("upper", config.getProperty(HTMLSAXConfiguration.NAMES_ATTRS), + "Should be able to set attribute names to upper case"); + } + + @Test + public void testLexicalHandlerProperty() throws Exception { + final HTMLSAXConfiguration config = new HTMLSAXConfiguration(); + final TestLexicalHandler lexHandler = new TestLexicalHandler(); + + // Set via property + config.setProperty("http://xml.org/sax/properties/lexical-handler", lexHandler); + + // Get via property + assertSame(lexHandler, config.getProperty("http://xml.org/sax/properties/lexical-handler"), + "Lexical handler should be retrievable via property"); + + // Also get via getter + assertSame(lexHandler, config.getLexicalHandler(), + "Lexical handler should be retrievable via getter"); + } + + @Test + public void testUnrecognizedFeature() { + 
final HTMLSAXConfiguration config = new HTMLSAXConfiguration(); + + assertThrows(org.xml.sax.SAXNotRecognizedException.class, + () -> config.getFeature("http://example.com/unknown-feature"), + "Should throw for unrecognized feature"); + } + + @Test + public void testUnrecognizedProperty() { + final HTMLSAXConfiguration config = new HTMLSAXConfiguration(); + + assertThrows(org.xml.sax.SAXNotRecognizedException.class, + () -> config.getProperty("http://example.com/unknown-property"), + "Should throw for unrecognized property"); + } + + // ========================================================================= + // Pipeline Configuration Tests + // ========================================================================= + + @Test + public void testTagBalancingAffectsPipeline() throws Exception { + final HTMLSAXConfiguration config = new HTMLSAXConfiguration(); + + // Start with tag balancing enabled (default) + assertTrue(config.getFeature(HTMLSAXConfiguration.BALANCE_TAGS)); + final int initialSize = config.fPipeline.size(); + + // Disable tag balancing + config.setFeature(HTMLSAXConfiguration.BALANCE_TAGS, false); + final int disabledSize = config.fPipeline.size(); + + // Pipeline should be smaller without tag balancer + assertTrue(disabledSize <= initialSize, "Pipeline should be smaller without tag balancer"); + + // Re-enable tag balancing + config.setFeature(HTMLSAXConfiguration.BALANCE_TAGS, true); + final int reenableSize = config.fPipeline.size(); + + assertEquals(initialSize, reenableSize, "Pipeline should be restored when re-enabling"); + } + + @Test + public void testParsingWithTagBalancingDisabled() throws Exception { + final HTMLSAXConfiguration config = new HTMLSAXConfiguration(); + config.setFeature(HTMLSAXConfiguration.BALANCE_TAGS, false); + + final TestHandler handler = new TestHandler(); + config.setContentHandler(handler); + + // Parse with imbalanced tags + config.parse(new InputSource(new StringReader("

Unclosed"))); + + // Should still parse (but without auto-closing) + assertTrue(handler.events.size() > 0, "Should parse without tag balancing"); + } + + @Test + public void testParsingWithTagBalancingEnabled() throws Exception { + final HTMLSAXConfiguration config = new HTMLSAXConfiguration(); + config.setFeature(HTMLSAXConfiguration.BALANCE_TAGS, true); + + final TestHandler handler = new TestHandler(); + config.setContentHandler(handler); + + // Parse with imbalanced tags + config.parse(new InputSource(new StringReader("

Auto-close test

"))); + + // Should parse with auto-closing + assertTrue(handler.events.size() > 0, "Should parse with tag balancing"); + // Tag balancer should auto-close

+ assertTrue(handler.events.contains("endElement:P") || handler.events.contains("endElement:DIV"), + "Tag balancer should close elements"); + } + + // ========================================================================= + // Handler Management Tests + // ========================================================================= + + @Test + public void testDTDHandlerGetterSetter() { + final HTMLSAXConfiguration config = new HTMLSAXConfiguration(); + final org.xml.sax.DTDHandler handler = new org.xml.sax.DTDHandler() { + @Override + public void notationDecl(String name, String publicId, String systemId) { + } + + @Override + public void unparsedEntityDecl(String name, String publicId, String systemId, String notationName) { + } + }; + + config.setDTDHandler(handler); + assertSame(handler, config.getDTDHandler(), "DTD handler should be retrievable"); + } + + @Test + public void testEntityResolverGetterSetter() { + final HTMLSAXConfiguration config = new HTMLSAXConfiguration(); + final org.xml.sax.EntityResolver resolver = (publicId, systemId) -> null; + + config.setEntityResolver(resolver); + assertSame(resolver, config.getEntityResolver(), "Entity resolver should be retrievable"); + } + + @Test + public void testErrorHandlerGetterSetter() { + final HTMLSAXConfiguration config = new HTMLSAXConfiguration(); + final org.xml.sax.ErrorHandler handler = new org.xml.sax.ErrorHandler() { + @Override + public void warning(org.xml.sax.SAXParseException exception) { + } + + @Override + public void error(org.xml.sax.SAXParseException exception) { + } + + @Override + public void fatalError(org.xml.sax.SAXParseException exception) { + } + }; + + config.setErrorHandler(handler); + assertSame(handler, config.getErrorHandler(), "Error handler should be retrievable"); + } + + @Test + public void testLocaleGetterSetter() { + final HTMLSAXConfiguration config = new HTMLSAXConfiguration(); + + config.setLocale(java.util.Locale.JAPANESE); + assertEquals(java.util.Locale.JAPANESE, 
config.getLocale(), "Locale should be retrievable"); + + config.setLocale(java.util.Locale.FRENCH); + assertEquals(java.util.Locale.FRENCH, config.getLocale(), "Locale should be changeable"); + } + + // ========================================================================= + // Parse Method Variations + // ========================================================================= + + @Test + public void testParseWithSystemId() throws Exception { + // Create temp file + final java.io.File tempFile = java.io.File.createTempFile("test", ".html"); + tempFile.deleteOnExit(); + + try (java.io.FileWriter writer = new java.io.FileWriter(tempFile)) { + writer.write("

From file

"); + } + + final HTMLSAXConfiguration config = new HTMLSAXConfiguration(); + final TestHandler handler = new TestHandler(); + config.setContentHandler(handler); + + config.parse(tempFile.getAbsolutePath()); + + assertTrue(handler.events.contains("startElement:P"), "Should parse from file path"); + } + + @Test + public void testParseWithNullContentHandler() throws Exception { + final HTMLSAXConfiguration config = new HTMLSAXConfiguration(); + // Don't set content handler + + // Should not throw - uses DefaultHandler + assertDoesNotThrow(() -> config.parse(new InputSource(new StringReader("")))); + } + + // ========================================================================= + // Complex Feature Interactions + // ========================================================================= + + @Test + public void testNamespacesWithTagBalancing() throws Exception { + final HTMLSAXConfiguration config = new HTMLSAXConfiguration(); + config.setFeature(HTMLSAXConfiguration.NAMESPACES, true); + config.setFeature(HTMLSAXConfiguration.BALANCE_TAGS, true); + + final TestHandler handler = new TestHandler(); + config.setContentHandler(handler); + + config.parse(new InputSource(new StringReader("

Test

"))); + + assertTrue(handler.events.size() > 0, "Should parse with namespaces and tag balancing"); + } + + @Test + public void testReportErrorsWithSimpleFormat() throws Exception { + final HTMLSAXConfiguration config = new HTMLSAXConfiguration(); + config.setFeature(HTMLSAXConfiguration.REPORT_ERRORS, true); + config.setFeature(HTMLSAXConfiguration.SIMPLE_ERROR_FORMAT, true); + + final TestHandler handler = new TestHandler(); + config.setContentHandler(handler); + + // Parse malformed HTML + config.parse(new InputSource(new StringReader("

"))); + + // Should parse without throwing + assertTrue(handler.events.size() > 0, "Should parse with error reporting enabled"); + } + + @Test + public void testHTML5ModeWithTagBalancing() throws Exception { + final HTMLSAXConfiguration config = new HTMLSAXConfiguration(); + config.setFeature(HTMLSAXConfiguration.HTML5_MODE, true); + config.setFeature(HTMLSAXConfiguration.BALANCE_TAGS, true); + + final TestHandler handler = new TestHandler(); + config.setContentHandler(handler); + + // Parse HTML5 document + config.parse(new InputSource(new StringReader( + "
Content
"))); + + assertTrue(handler.events.contains("startElement:ARTICLE"), "Should parse HTML5 elements"); + assertTrue(handler.events.contains("startElement:SECTION"), "Should parse HTML5 elements"); + } + + // ========================================================================= + // Edge Cases + // ========================================================================= + + @Test + public void testEmptyPipeline() throws Exception { + final HTMLSAXConfiguration config = new HTMLSAXConfiguration(); + + // The pipeline should never be empty due to scanner + assertFalse(config.fPipeline.isEmpty(), "Pipeline should not be empty"); + } + + @Test + public void testMultipleFeatureChanges() throws Exception { + final HTMLSAXConfiguration config = new HTMLSAXConfiguration(); + + // Toggle features multiple times + for (int i = 0; i < 5; i++) { + config.setFeature(HTMLSAXConfiguration.BALANCE_TAGS, true); + config.setFeature(HTMLSAXConfiguration.BALANCE_TAGS, false); + } + config.setFeature(HTMLSAXConfiguration.BALANCE_TAGS, true); + + // Should still work correctly + final TestHandler handler = new TestHandler(); + config.setContentHandler(handler); + config.parse(new InputSource(new StringReader(""))); + + assertTrue(handler.events.size() > 0, "Should parse after multiple feature changes"); + } + + @Test + public void testContentHandlerReassignment() throws Exception { + final HTMLSAXConfiguration config = new HTMLSAXConfiguration(); + + final TestHandler handler1 = new TestHandler(); + final TestHandler handler2 = new TestHandler(); + + // Set first handler + config.setContentHandler(handler1); + config.parse(new InputSource(new StringReader("

First

"))); + + // Set second handler + config.setContentHandler(handler2); + config.parse(new InputSource(new StringReader("
Second
"))); + + // First handler should have

+ assertTrue(handler1.events.contains("startElement:P"), "First handler should receive first parse"); + + // Second handler should have

+ assertTrue(handler2.events.contains("startElement:DIV"), "Second handler should receive second parse"); + } + } // class HTMLSAXConfigurationTest diff --git a/src/test/java/org/codelibs/nekohtml/sax/RawTextElementsTest.java b/src/test/java/org/codelibs/nekohtml/sax/RawTextElementsTest.java new file mode 100644 index 0000000..c8d5e92 --- /dev/null +++ b/src/test/java/org/codelibs/nekohtml/sax/RawTextElementsTest.java @@ -0,0 +1,451 @@ +/* + * Copyright 2012-2025 CodeLibs Project and the Others. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language + * governing permissions and limitations under the License. + */ +package org.codelibs.nekohtml.sax; + +import static org.junit.jupiter.api.Assertions.*; + +import java.io.StringReader; +import java.util.ArrayList; +import java.util.List; + +import org.codelibs.nekohtml.parsers.DOMParser; +import org.junit.jupiter.api.Test; +import org.w3c.dom.Document; +import org.w3c.dom.Element; +import org.w3c.dom.NodeList; +import org.xml.sax.Attributes; +import org.xml.sax.InputSource; +import org.xml.sax.helpers.DefaultHandler; + +/** + * Tests for raw text element handling (script, style, textarea, title, etc.). + * These elements contain raw text that should not be parsed as HTML. 
+ * + * @author CodeLibs Project + */ +public class RawTextElementsTest { + + // ========================================================================= + // Script Element Tests + // ========================================================================= + + @Test + public void testScriptWithHtmlLikeTags() throws Exception { + final String html = ""; + + final DOMParser parser = new DOMParser(); + parser.parse(new InputSource(new StringReader(html))); + final Document doc = parser.getDocument(); + + final NodeList scripts = doc.getElementsByTagName("SCRIPT"); + assertEquals(1, scripts.getLength(), "Should have one script element"); + + // The script content should contain the HTML-like string + final String scriptContent = scripts.item(0).getTextContent(); + assertTrue(scriptContent.contains("
") || scriptContent.contains("not a tag"), + "Script content should preserve HTML-like content"); + } + + @Test + public void testScriptWithComments() throws Exception { + final String html = ""; + + final DOMParser parser = new DOMParser(); + parser.parse(new InputSource(new StringReader(html))); + final Document doc = parser.getDocument(); + + final NodeList scripts = doc.getElementsByTagName("SCRIPT"); + assertEquals(1, scripts.getLength(), "Should have one script element"); + } + + @Test + public void testScriptWithTypeAttribute() throws Exception { + final String html = ""; + + final DOMParser parser = new DOMParser(); + parser.parse(new InputSource(new StringReader(html))); + final Document doc = parser.getDocument(); + + final NodeList scripts = doc.getElementsByTagName("SCRIPT"); + assertEquals(1, scripts.getLength(), "Should have one script element"); + + final Element script = (Element) scripts.item(0); + assertEquals("text/javascript", script.getAttribute("type"), "Should preserve type attribute"); + } + + @Test + public void testMultipleScriptElements() throws Exception { + final String html = "" + "" + "" + + "" + ""; + + final DOMParser parser = new DOMParser(); + parser.parse(new InputSource(new StringReader(html))); + final Document doc = parser.getDocument(); + + final NodeList scripts = doc.getElementsByTagName("SCRIPT"); + assertEquals(3, scripts.getLength(), "Should have three script elements"); + } + + @Test + public void testEmptyScript() throws Exception { + final String html = ""; + + final DOMParser parser = new DOMParser(); + parser.parse(new InputSource(new StringReader(html))); + final Document doc = parser.getDocument(); + + final NodeList scripts = doc.getElementsByTagName("SCRIPT"); + assertEquals(1, scripts.getLength(), "Should have one empty script element"); + } + + @Test + public void testScriptInBody() throws Exception { + final String html = ""; + + final DOMParser parser = new DOMParser(); + parser.parse(new InputSource(new 
StringReader(html))); + final Document doc = parser.getDocument(); + + final NodeList scripts = doc.getElementsByTagName("SCRIPT"); + assertEquals(1, scripts.getLength(), "Should have one script element in body"); + } + + // ========================================================================= + // Style Element Tests + // ========================================================================= + + @Test + public void testStyleWithSelectors() throws Exception { + final String html = ""; + + final DOMParser parser = new DOMParser(); + parser.parse(new InputSource(new StringReader(html))); + final Document doc = parser.getDocument(); + + final NodeList styles = doc.getElementsByTagName("STYLE"); + assertEquals(1, styles.getLength(), "Should have one style element"); + + final String styleContent = styles.item(0).getTextContent(); + assertTrue(styleContent.contains("color: red") || styleContent.contains("color:"), + "Style content should contain CSS rules"); + } + + @Test + public void testStyleWithMediaQuery() throws Exception { + final String html = ""; + + final DOMParser parser = new DOMParser(); + parser.parse(new InputSource(new StringReader(html))); + final Document doc = parser.getDocument(); + + final NodeList styles = doc.getElementsByTagName("STYLE"); + assertEquals(1, styles.getLength(), "Should have one style element"); + } + + @Test + public void testStyleWithTypeAttribute() throws Exception { + final String html = ""; + + final DOMParser parser = new DOMParser(); + parser.parse(new InputSource(new StringReader(html))); + final Document doc = parser.getDocument(); + + final NodeList styles = doc.getElementsByTagName("STYLE"); + assertEquals(1, styles.getLength(), "Should have one style element"); + + final Element style = (Element) styles.item(0); + assertEquals("text/css", style.getAttribute("type"), "Should preserve type attribute"); + } + + @Test + public void testEmptyStyle() throws Exception { + final String html = ""; + + final DOMParser 
parser = new DOMParser(); + parser.parse(new InputSource(new StringReader(html))); + final Document doc = parser.getDocument(); + + final NodeList styles = doc.getElementsByTagName("STYLE"); + assertEquals(1, styles.getLength(), "Should have one empty style element"); + } + + // ========================================================================= + // Textarea Element Tests + // ========================================================================= + + @Test + public void testTextareaWithHtmlContent() throws Exception { + final String html = ""; + + final DOMParser parser = new DOMParser(); + parser.parse(new InputSource(new StringReader(html))); + final Document doc = parser.getDocument(); + + final NodeList textareas = doc.getElementsByTagName("TEXTAREA"); + assertEquals(1, textareas.getLength(), "Should have one textarea element"); + } + + @Test + public void testTextareaWithAttributes() throws Exception { + final String html = ""; + + final DOMParser parser = new DOMParser(); + parser.parse(new InputSource(new StringReader(html))); + final Document doc = parser.getDocument(); + + final NodeList textareas = doc.getElementsByTagName("TEXTAREA"); + assertEquals(1, textareas.getLength(), "Should have one textarea element"); + + final Element textarea = (Element) textareas.item(0); + assertEquals("content", textarea.getAttribute("name"), "Should preserve name attribute"); + assertEquals("10", textarea.getAttribute("rows"), "Should preserve rows attribute"); + assertEquals("50", textarea.getAttribute("cols"), "Should preserve cols attribute"); + } + + @Test + public void testMultilineTextarea() throws Exception { + final String html = ""; + + final DOMParser parser = new DOMParser(); + parser.parse(new InputSource(new StringReader(html))); + final Document doc = parser.getDocument(); + + final NodeList textareas = doc.getElementsByTagName("TEXTAREA"); + assertEquals(1, textareas.getLength(), "Should have one textarea element"); + } + + // 
========================================================================= + // Title Element Tests + // ========================================================================= + + @Test + public void testTitleElement() throws Exception { + final String html = "Test Page Title"; + + final DOMParser parser = new DOMParser(); + parser.parse(new InputSource(new StringReader(html))); + final Document doc = parser.getDocument(); + + final NodeList titles = doc.getElementsByTagName("TITLE"); + assertEquals(1, titles.getLength(), "Should have one title element"); + assertEquals("Test Page Title", titles.item(0).getTextContent(), "Should preserve title text"); + } + + @Test + public void testTitleWithSpecialChars() throws Exception { + final String html = "Test < > & \" Page"; + + final DOMParser parser = new DOMParser(); + parser.parse(new InputSource(new StringReader(html))); + final Document doc = parser.getDocument(); + + final NodeList titles = doc.getElementsByTagName("TITLE"); + assertEquals(1, titles.getLength(), "Should have one title element"); + } + + // ========================================================================= + // XMP and Listing Element Tests (deprecated but may appear in legacy HTML) + // ========================================================================= + + @Test + public void testXmpElement() throws Exception { + final String html = "<b>This is not bold</b>"; + + final DOMParser parser = new DOMParser(); + parser.parse(new InputSource(new StringReader(html))); + final Document doc = parser.getDocument(); + + final NodeList xmps = doc.getElementsByTagName("XMP"); + // XMP may or may not be supported, just verify parsing doesn't fail + assertNotNull(doc, "Document should be parsed"); + } + + // ========================================================================= + // Entity Handling in Raw Text Elements + // ========================================================================= + + @Test + public void 
testScriptWithEntityLikeContent() throws Exception { + final String html = ""; + + final DOMParser parser = new DOMParser(); + parser.parse(new InputSource(new StringReader(html))); + final Document doc = parser.getDocument(); + + final NodeList scripts = doc.getElementsByTagName("SCRIPT"); + assertEquals(1, scripts.getLength(), "Should have one script element"); + } + + @Test + public void testStyleWithEntityLikeContent() throws Exception { + final String html = ""; + + final DOMParser parser = new DOMParser(); + parser.parse(new InputSource(new StringReader(html))); + final Document doc = parser.getDocument(); + + final NodeList styles = doc.getElementsByTagName("STYLE"); + assertEquals(1, styles.getLength(), "Should have one style element"); + } + + // ========================================================================= + // SAX-Level Tests for Raw Text Content + // ========================================================================= + + @Test + public void testSAXScriptContent() throws Exception { + final SimpleHTMLScanner scanner = new SimpleHTMLScanner(); + final List elements = new ArrayList<>(); + final StringBuilder content = new StringBuilder(); + + scanner.setContentHandler(new DefaultHandler() { + @Override + public void startElement(String uri, String localName, String qName, Attributes atts) { + elements.add("START:" + qName); + } + + @Override + public void endElement(String uri, String localName, String qName) { + elements.add("END:" + qName); + } + + @Override + public void characters(char[] ch, int start, int length) { + content.append(new String(ch, start, length)); + } + }); + + final String html = ""; + scanner.parse(new InputSource(new StringReader(html))); + + assertTrue(elements.contains("START:SCRIPT"), "Should have script start element"); + assertTrue(elements.contains("END:SCRIPT"), "Should have script end element"); + } + + @Test + public void testSAXStyleContent() throws Exception { + final SimpleHTMLScanner scanner = new 
SimpleHTMLScanner(); + final List elements = new ArrayList<>(); + + scanner.setContentHandler(new DefaultHandler() { + @Override + public void startElement(String uri, String localName, String qName, Attributes atts) { + elements.add("START:" + qName); + } + + @Override + public void endElement(String uri, String localName, String qName) { + elements.add("END:" + qName); + } + }); + + final String html = ""; + scanner.parse(new InputSource(new StringReader(html))); + + assertTrue(elements.contains("START:STYLE"), "Should have style start element"); + assertTrue(elements.contains("END:STYLE"), "Should have style end element"); + } + + // ========================================================================= + // Noscript Element Tests + // ========================================================================= + + @Test + public void testNoscriptElement() throws Exception { + final String html = ""; + + final DOMParser parser = new DOMParser(); + parser.parse(new InputSource(new StringReader(html))); + final Document doc = parser.getDocument(); + + final NodeList noscripts = doc.getElementsByTagName("NOSCRIPT"); + assertEquals(1, noscripts.getLength(), "Should have one noscript element"); + } + + // ========================================================================= + // Inline Script/Style Tests + // ========================================================================= + + @Test + public void testInlineEventHandler() throws Exception { + final String html = ""; + + final DOMParser parser = new DOMParser(); + parser.parse(new InputSource(new StringReader(html))); + final Document doc = parser.getDocument(); + + final NodeList buttons = doc.getElementsByTagName("BUTTON"); + assertEquals(1, buttons.getLength(), "Should have one button element"); + + final Element button = (Element) buttons.item(0); + assertEquals("alert('clicked')", button.getAttribute("onclick"), "Should preserve onclick attribute"); + } + + @Test + public void testInlineStyle() throws 
Exception { + final String html = "
Styled
"; + + final DOMParser parser = new DOMParser(); + parser.parse(new InputSource(new StringReader(html))); + final Document doc = parser.getDocument(); + + final NodeList divs = doc.getElementsByTagName("DIV"); + assertEquals(1, divs.getLength(), "Should have one div element"); + + final Element div = (Element) divs.item(0); + assertTrue(div.getAttribute("style").contains("color"), "Should preserve style attribute"); + } + + // ========================================================================= + // Complex Script Content Tests + // ========================================================================= + + @Test + public void testScriptWithRegex() throws Exception { + final String html = ""; + + final DOMParser parser = new DOMParser(); + // This is a tricky case - script end tag detection + assertDoesNotThrow(() -> parser.parse(new InputSource(new StringReader(html)))); + } + + @Test + public void testScriptWithTemplateStrings() throws Exception { + final String html = ""; + + final DOMParser parser = new DOMParser(); + parser.parse(new InputSource(new StringReader(html))); + final Document doc = parser.getDocument(); + + assertNotNull(doc, "Document should be parsed"); + } + + @Test + public void testScriptWithJsonData() throws Exception { + final String html = ""; + + final DOMParser parser = new DOMParser(); + parser.parse(new InputSource(new StringReader(html))); + final Document doc = parser.getDocument(); + + final NodeList scripts = doc.getElementsByTagName("SCRIPT"); + assertEquals(1, scripts.getLength(), "Should have one script element"); + + final Element script = (Element) scripts.item(0); + assertEquals("application/json", script.getAttribute("type"), "Should preserve type attribute"); + } + +} // class RawTextElementsTest diff --git a/src/test/java/org/codelibs/nekohtml/sax/SimpleHTMLScannerEnhancementsTest.java b/src/test/java/org/codelibs/nekohtml/sax/SimpleHTMLScannerEnhancementsTest.java index 9dc19fa..217dc6e 100644 --- 
a/src/test/java/org/codelibs/nekohtml/sax/SimpleHTMLScannerEnhancementsTest.java +++ b/src/test/java/org/codelibs/nekohtml/sax/SimpleHTMLScannerEnhancementsTest.java @@ -452,4 +452,484 @@ public void endElement(String uri, String localName, String qName) { assertTrue(elementNames.toString().contains("END:DIV"), "Should parse end tag with trailing whitespace"); } + + // ========================================================================= + // Entity Handling Tests + // ========================================================================= + + /** + * Test named HTML entities in content: collects every characters() chunk and asserts the collected text contains an ampersand. NOTE(review): as rendered in this patch both operands of the || are the same contains check and the html literal has unbalanced quotes - presumably entity markup was lost when the patch text was extracted; confirm against the real file. + */ + @Test + public void testNamedEntitiesInContent() throws Exception { + final SimpleHTMLScanner scanner = new SimpleHTMLScanner(); + final StringBuilder result = new StringBuilder(); + + scanner.setContentHandler(new DefaultHandler() { + @Override + public void characters(char[] ch, int start, int length) { + result.append(new String(ch, start, length)); + } + }); + + final String html = "& < > "  "; + final InputSource input = new InputSource(new StringReader(html)); + + scanner.parse(input); + + // Entities may be passed through or decoded depending on implementation + assertTrue(result.toString().contains("&") || result.toString().contains("&"), + "Should handle ampersand entity"); + } + + /** + * Test numeric entities (decimal). NOTE(review): the only assertion is assertNotNull on StringBuilder.toString(), which never returns null, so this test can fail only if parse() throws; the decoded characters are not checked. + */ + @Test + public void testDecimalNumericEntities() throws Exception { + final SimpleHTMLScanner scanner = new SimpleHTMLScanner(); + final StringBuilder result = new StringBuilder(); + + scanner.setContentHandler(new DefaultHandler() { + @Override + public void characters(char[] ch, int start, int length) { + result.append(new String(ch, start, length)); + } + }); + + final String html = "< > &"; + final InputSource input = new InputSource(new StringReader(html)); + + scanner.parse(input); + + assertNotNull(result.toString(), "Should parse decimal numeric entities"); + } + + /** + * Test numeric entities (hexadecimal). NOTE(review): the only assertion is assertNotNull on StringBuilder.toString(), which never returns null, so only the absence of an exception from parse() is verified. + */ + @Test + public 
void testHexNumericEntities() throws Exception { + final SimpleHTMLScanner scanner = new SimpleHTMLScanner(); + final StringBuilder result = new StringBuilder(); + + scanner.setContentHandler(new DefaultHandler() { + @Override + public void characters(char[] ch, int start, int length) { + result.append(new String(ch, start, length)); + } + }); + + final String html = "< > &"; + final InputSource input = new InputSource(new StringReader(html)); + + scanner.parse(input); + + assertNotNull(result.toString(), "Should parse hexadecimal numeric entities"); + } + + /** + * Test entities in attribute values: records each attribute as qName=value followed by a pipe in startElement and asserts the record mentions href. NOTE(review): only the attribute name is checked, not the decoded value; the html literal as rendered here carries no markup - presumably stripped in extraction, confirm against the real file. + */ + @Test + public void testEntitiesInAttributes() throws Exception { + final SimpleHTMLScanner scanner = new SimpleHTMLScanner(); + final StringBuilder attrValues = new StringBuilder(); + + scanner.setContentHandler(new DefaultHandler() { + @Override + public void startElement(String uri, String localName, String qName, org.xml.sax.Attributes atts) { + for (int i = 0; i < atts.getLength(); i++) { + attrValues.append(atts.getQName(i)).append("=").append(atts.getValue(i)).append("|"); + } + } + }); + + final String html = "Link"; + final InputSource input = new InputSource(new StringReader(html)); + + scanner.parse(input); + + assertTrue(attrValues.toString().contains("href"), "Should parse attribute with entity"); + } + + /** + * Test invalid/incomplete entities: a bare ampersand with no terminating semicolon. NOTE(review): the assertion also passes when only the letter A is emitted, so it does not pin how the ampersand itself is reported. + */ + @Test + public void testIncompleteEntity() throws Exception { + final SimpleHTMLScanner scanner = new SimpleHTMLScanner(); + final StringBuilder result = new StringBuilder(); + + scanner.setContentHandler(new DefaultHandler() { + @Override + public void characters(char[] ch, int start, int length) { + result.append(new String(ch, start, length)); + } + }); + + // Incomplete entity - ampersand without semicolon + final String html = "A & B"; + final InputSource input = new InputSource(new StringReader(html)); + + scanner.parse(input); + + assertTrue(result.toString().contains("&") || 
result.toString().contains("A"), + "Should handle incomplete entity"); + } + + /** + * Test unknown entity names. NOTE(review): assertNotNull on StringBuilder.toString() cannot fail, so this only verifies that parse() does not throw. + */ + @Test + public void testUnknownEntity() throws Exception { + final SimpleHTMLScanner scanner = new SimpleHTMLScanner(); + final StringBuilder result = new StringBuilder(); + + scanner.setContentHandler(new DefaultHandler() { + @Override + public void characters(char[] ch, int start, int length) { + result.append(new String(ch, start, length)); + } + }); + + final String html = "&unknown;"; + final InputSource input = new InputSource(new StringReader(html)); + + scanner.parse(input); + + assertNotNull(result.toString(), "Should handle unknown entity"); + } + + // ========================================================================= + // Advanced Encoding Tests + // ========================================================================= + + /** + * Test ISO-8859-1 (Latin-1) encoding: encodes the content as ISO-8859-1 bytes, declares that encoding on the InputSource and checks the collected text. NOTE(review): the caf alternative is plain ASCII and would presumably still match under a wrong ASCII-compatible decoder, so only the accented-letter alternative really exercises the charset. + */ + @Test + public void testISO88591Encoding() throws Exception { + final SimpleHTMLScanner scanner = new SimpleHTMLScanner(); + final StringBuilder result = new StringBuilder(); + + scanner.setContentHandler(new DefaultHandler() { + @Override + public void characters(char[] ch, int start, int length) { + result.append(new String(ch, start, length)); + } + }); + + // ISO-8859-1 encoded content with accented characters + final String content = "café résumé"; + final ByteArrayInputStream stream = new ByteArrayInputStream( + ("" + content + "").getBytes("ISO-8859-1")); + final InputSource input = new InputSource(stream); + input.setEncoding("ISO-8859-1"); + + scanner.parse(input); + + assertTrue(result.toString().contains("caf") || result.toString().contains("é"), + "ISO-8859-1 encoded content should be parsed"); + } + + /** + * Test UTF-16 encoding: encodes the markup with the UTF-16 charset, declares that encoding on the InputSource and expects either the ASCII word or the first CJK character to come through. + */ + @Test + public void testUTF16Encoding() throws Exception { + final SimpleHTMLScanner scanner = new SimpleHTMLScanner(); + final StringBuilder result = new StringBuilder(); + + scanner.setContentHandler(new 
DefaultHandler() { + @Override + public void characters(char[] ch, int start, int length) { + result.append(new String(ch, start, length)); + } + }); + + final String html = "Unicode: \u4E2D\u6587"; + final ByteArrayInputStream stream = new ByteArrayInputStream(html.getBytes("UTF-16")); + final InputSource input = new InputSource(stream); + input.setEncoding("UTF-16"); + + scanner.parse(input); + + assertTrue(result.toString().contains("Unicode") || result.toString().contains("\u4E2D"), + "UTF-16 encoded content should be parsed"); + } + + /** + * Test default encoding when not specified: UTF-8 bytes are supplied through a byte stream and no encoding is set on the InputSource. NOTE(review): the UTF-8 alternative in the assertion is ASCII text from the literal itself, so it presumably matches under any ASCII-compatible decoder; only the umlaut alternative depends on the default charset. + */ + @Test + public void testDefaultEncodingUTF8() throws Exception { + final SimpleHTMLScanner scanner = new SimpleHTMLScanner(); + final StringBuilder result = new StringBuilder(); + + scanner.setContentHandler(new DefaultHandler() { + @Override + public void characters(char[] ch, int start, int length) { + result.append(new String(ch, start, length)); + } + }); + + final String html = "UTF-8 default: äöü"; + final ByteArrayInputStream stream = new ByteArrayInputStream(html.getBytes("UTF-8")); + final InputSource input = new InputSource(stream); + // Don't set encoding - should default to UTF-8 + + scanner.parse(input); + + assertTrue(result.toString().contains("UTF-8") || result.toString().contains("ä"), + "Default UTF-8 encoding should work"); + } + + /** + * Test parsing with character stream (Reader): the InputSource carries both a Reader and a byte stream with different content, and the text from the Reader must be what gets parsed. + */ + @Test + public void testCharacterStreamTakesPrecedence() throws Exception { + final SimpleHTMLScanner scanner = new SimpleHTMLScanner(); + final StringBuilder result = new StringBuilder(); + + scanner.setContentHandler(new DefaultHandler() { + @Override + public void characters(char[] ch, int start, int length) { + result.append(new String(ch, start, length)); + } + }); + + final String html = "From character stream"; + + // Set both byte stream and character stream - character stream should take precedence + final InputSource input = new InputSource(); + 
input.setCharacterStream(new StringReader(html)); + input.setByteStream(new ByteArrayInputStream("Different content".getBytes())); + + scanner.parse(input); + + assertTrue(result.toString().contains("From character stream"), + "Character stream should take precedence over byte stream"); + } + + // ========================================================================= + // Input Source Variations + // ========================================================================= + + /** + * Test file:// URL SystemId: writes a temporary file and parses it through a systemId built by prefixing file:// to the absolute path. NOTE(review): plain concatenation is not a well-formed URL for Windows drive paths or paths needing percent-escaping - File.toURI() looks like the portable form, confirm against the platforms this suite runs on. + */ + @Test + public void testFileUrlSystemId() throws Exception { + // Create a temporary HTML file + final java.io.File tempFile = java.io.File.createTempFile("test", ".html"); + tempFile.deleteOnExit(); + + try (java.io.FileWriter writer = new java.io.FileWriter(tempFile)) { + writer.write("File URL content"); + } + + final SimpleHTMLScanner scanner = new SimpleHTMLScanner(); + final StringBuilder result = new StringBuilder(); + + scanner.setContentHandler(new DefaultHandler() { + @Override + public void characters(char[] ch, int start, int length) { + result.append(new String(ch, start, length)); + } + }); + + final InputSource input = new InputSource(); + input.setSystemId("file://" + tempFile.getAbsolutePath()); + + scanner.parse(input); + + assertTrue(result.toString().contains("File URL content"), + "Should parse content from file:// URL"); + } + + /** + * Test input source with no valid source: an InputSource with no stream, reader or systemId is expected to make parse() throw SAXException. + */ + @Test + public void testNoValidInputSource() { + final SimpleHTMLScanner scanner = new SimpleHTMLScanner(); + scanner.setContentHandler(new DefaultHandler()); + + // InputSource with nothing set + final InputSource input = new InputSource(); + + assertThrows(SAXException.class, () -> scanner.parse(input), + "Should throw when no valid input source is available"); + } + + /** + * Test parse(String systemId) convenience method, passing the absolute path of a temporary file (no URL scheme) as the systemId. + */ + @Test + public void testParseBySystemId() throws Exception { + final java.io.File tempFile = 
java.io.File.createTempFile("test", ".html"); + tempFile.deleteOnExit(); + + try (java.io.FileWriter writer = new java.io.FileWriter(tempFile)) { + writer.write("SystemId parse"); + } + + final SimpleHTMLScanner scanner = new SimpleHTMLScanner(); + final StringBuilder result = new StringBuilder(); + + scanner.setContentHandler(new DefaultHandler() { + @Override + public void characters(char[] ch, int start, int length) { + result.append(new String(ch, start, length)); + } + }); + + scanner.parse(tempFile.getAbsolutePath()); + + assertTrue(result.toString().contains("SystemId parse"), + "Should parse using String systemId parameter"); + } + + // ========================================================================= + // Handler Management Tests + // ========================================================================= + + /** + * Test DTDHandler getter/setter: the no-op handler passed to setDTDHandler must be the same instance returned by getDTDHandler. + */ + @Test + public void testDTDHandler() { + final SimpleHTMLScanner scanner = new SimpleHTMLScanner(); + final org.xml.sax.DTDHandler handler = new org.xml.sax.DTDHandler() { + @Override + public void notationDecl(String name, String publicId, String systemId) { + } + + @Override + public void unparsedEntityDecl(String name, String publicId, String systemId, String notationName) { + } + }; + + scanner.setDTDHandler(handler); + assertSame(handler, scanner.getDTDHandler(), "DTDHandler should be retrievable"); + } + + /** + * Test EntityResolver getter/setter: the lambda resolver passed to setEntityResolver must be the same instance returned by getEntityResolver. + */ + @Test + public void testEntityResolver() { + final SimpleHTMLScanner scanner = new SimpleHTMLScanner(); + final org.xml.sax.EntityResolver resolver = (publicId, systemId) -> null; + + scanner.setEntityResolver(resolver); + assertSame(resolver, scanner.getEntityResolver(), "EntityResolver should be retrievable"); + } + + /** + * Test ErrorHandler getter/setter: the no-op handler passed to setErrorHandler must be the same instance returned by getErrorHandler. + */ + @Test + public void testErrorHandler() { + final SimpleHTMLScanner scanner = new SimpleHTMLScanner(); + final org.xml.sax.ErrorHandler handler = new org.xml.sax.ErrorHandler() { 
+ @Override + public void warning(org.xml.sax.SAXParseException exception) { + } + + @Override + public void error(org.xml.sax.SAXParseException exception) { + } + + @Override + public void fatalError(org.xml.sax.SAXParseException exception) { + } + }; + + scanner.setErrorHandler(handler); + assertSame(handler, scanner.getErrorHandler(), "ErrorHandler should be retrievable"); + } + + /** + * Test LexicalHandler via property: a no-op handler is stored with setProperty under the SAX lexical-handler property URI and must come back as the same instance from getProperty. + */ + @Test + public void testLexicalHandlerProperty() throws Exception { + final SimpleHTMLScanner scanner = new SimpleHTMLScanner(); + final LexicalHandler handler = new LexicalHandler() { + @Override + public void startDTD(String name, String publicId, String systemId) { + } + + @Override + public void endDTD() { + } + + @Override + public void startEntity(String name) { + } + + @Override + public void endEntity(String name) { + } + + @Override + public void startCDATA() { + } + + @Override + public void endCDATA() { + } + + @Override + public void comment(char[] ch, int start, int length) { + } + }; + + scanner.setProperty("http://xml.org/sax/properties/lexical-handler", handler); + assertSame(handler, scanner.getProperty("http://xml.org/sax/properties/lexical-handler"), + "LexicalHandler should be retrievable via property"); + } + + /** + * Test getFeature throws for unrecognized feature: an unknown feature URI must raise SAXNotRecognizedException. + */ + @Test + public void testGetUnrecognizedFeature() { + final SimpleHTMLScanner scanner = new SimpleHTMLScanner(); + + assertThrows(org.xml.sax.SAXNotRecognizedException.class, + () -> scanner.getFeature("http://example.com/unknown-feature"), + "Should throw SAXNotRecognizedException for unknown feature"); + } + + /** + * Test getProperty throws for unrecognized property: an unknown property URI must raise SAXNotRecognizedException. + */ + @Test + public void testGetUnrecognizedProperty() { + final SimpleHTMLScanner scanner = new SimpleHTMLScanner(); + + assertThrows(org.xml.sax.SAXNotRecognizedException.class, + () -> scanner.getProperty("http://example.com/unknown-property"), + "Should throw 
SAXNotRecognizedException for unknown property"); + } + + /** + * Test parsing without content handler set + */ + @Test + public void testParsingWithoutContentHandler() throws Exception { + final SimpleHTMLScanner scanner = new SimpleHTMLScanner(); + // Don't set content handler + + final String html = "Content"; + final InputSource input = new InputSource(new StringReader(html)); + + // Should return early without error + assertDoesNotThrow(() -> scanner.parse(input), + "Parsing without content handler should not throw"); + } }