diff --git a/src/WebExpress.WebCore.Test/Html/Parser/UnitTestHtmlElementFactory.cs b/src/WebExpress.WebCore.Test/Html/Parser/UnitTestHtmlElementFactory.cs new file mode 100644 index 0000000..b0b162f --- /dev/null +++ b/src/WebExpress.WebCore.Test/Html/Parser/UnitTestHtmlElementFactory.cs @@ -0,0 +1,150 @@ +using WebExpress.WebCore.WebHtml; +using WebExpress.WebCore.WebHtml.Parser; + +namespace WebExpress.WebCore.Test.Html.Parser +{ + /// + /// Unit tests for the class. + /// + [Collection("NonParallelTests")] + public class UnitTestHtmlElementFactory + { + /// + /// Known tags are resolved to their specific subclass. + /// + [Fact] + public void KnownTag_Div_ReturnsCorrectType() + { + var element = HtmlElementFactory.Create("div"); + + Assert.IsType(element); + } + + /// + /// Known tags are resolved to their specific subclass. + /// + [Fact] + public void KnownTag_Span_ReturnsCorrectType() + { + var element = HtmlElementFactory.Create("span"); + + Assert.IsType(element); + } + + /// + /// Known tags are resolved to their specific subclass. + /// + [Fact] + public void KnownTag_Img_ReturnsCorrectType() + { + var element = HtmlElementFactory.Create("img"); + + Assert.IsType(element); + } + + /// + /// Known tags are resolved to their specific subclass. + /// + [Fact] + public void KnownTag_Input_ReturnsCorrectType() + { + var element = HtmlElementFactory.Create("input"); + + Assert.IsType(element); + } + + /// + /// Known tags are resolved to their specific subclass. + /// + [Fact] + public void KnownTag_Table_ReturnsCorrectType() + { + var element = HtmlElementFactory.Create("table"); + + Assert.IsType(element); + } + + /// + /// Factory is case-insensitive – upper-case tag names resolve correctly. + /// + [Fact] + public void CaseInsensitive_UpperCase_ReturnsCorrectType() + { + var element = HtmlElementFactory.Create("DIV"); + + Assert.IsType(element); + } + + /// + /// An unknown tag name returns a generic . 
+ /// + [Fact] + public void UnknownTag_ReturnsGenericElement() + { + var element = HtmlElementFactory.Create("x-custom-widget"); + + Assert.IsType(element); + } + + /// + /// returns true for known tags. + /// + [Fact] + public void IsKnown_KnownTag_ReturnsTrue() + { + Assert.True(HtmlElementFactory.IsKnown("div")); + } + + /// + /// returns false for unknown tags. + /// + [Fact] + public void IsKnown_UnknownTag_ReturnsFalse() + { + Assert.False(HtmlElementFactory.IsKnown("x-unknown")); + } + + /// + /// Passing null to throws an + /// . + /// + [Fact] + public void NullTagName_Throws() + { + Assert.Throws(() => HtmlElementFactory.Create(null)); + } + + /// + /// Both 'kbd' and 'kdb' map to the existing . + /// + [Fact] + public void KbdTag_MapsToKdbElement() + { + var element = HtmlElementFactory.Create("kbd"); + + Assert.IsType(element); + } + + /// + /// The 'keygen' tag is resolved to . + /// + [Fact] + public void KnownTag_Keygen_ReturnsCorrectType() + { + var element = HtmlElementFactory.Create("keygen"); + + Assert.IsType(element); + } + + /// + /// The 'command' tag is resolved to . + /// + [Fact] + public void KnownTag_Command_ReturnsCorrectType() + { + var element = HtmlElementFactory.Create("command"); + + Assert.IsType(element); + } + } +} diff --git a/src/WebExpress.WebCore.Test/Html/Parser/UnitTestHtmlParser.cs b/src/WebExpress.WebCore.Test/Html/Parser/UnitTestHtmlParser.cs new file mode 100644 index 0000000..5738752 --- /dev/null +++ b/src/WebExpress.WebCore.Test/Html/Parser/UnitTestHtmlParser.cs @@ -0,0 +1,500 @@ +using System.Linq; +using WebExpress.WebCore.WebHtml; +using WebExpress.WebCore.WebHtml.Parser; + +namespace WebExpress.WebCore.Test.Html.Parser +{ + /// + /// Unit tests for the class. 
+ /// + [Collection("NonParallelTests")] + public class UnitTestHtmlParser + { + private static readonly HtmlParser Parser = new(); + + // ------------------------------------------------------------------ + // Simple elements + // ------------------------------------------------------------------ + + /// + /// A simple element is correctly reconstructed. + /// + [Fact] + public void SimpleElement_Div_IsReconstructed() + { + var nodes = Parser.Parse("
"); + var div = nodes.OfType().Single(); + + Assert.NotNull(div); + } + + /// + /// An element with a class attribute retains its attribute. + /// + [Fact] + public void ElementWithClass_RetainsAttribute() + { + var nodes = Parser.Parse("
"); + var div = nodes.OfType().Single(); + + Assert.Equal("container", div.Class); + } + + /// + /// An element with an id attribute retains its attribute. + /// + [Fact] + public void ElementWithId_RetainsAttribute() + { + var nodes = Parser.Parse("

text

"); + var p = nodes.OfType().Single(); + + Assert.Equal("intro", p.Id); + } + + // ------------------------------------------------------------------ + // Nested structures + // ------------------------------------------------------------------ + + /// + /// A nested element hierarchy is correctly reconstructed. + /// + [Fact] + public void NestedElements_AreReconstructed() + { + var nodes = Parser.Parse("
text
"); + var div = nodes.OfType().Single(); + var span = div.Elements.OfType().Single(); + + Assert.NotNull(span); + } + + /// + /// A deeply nested structure is correctly reconstructed. + /// + [Fact] + public void DeepNesting_IsReconstructed() + { + var nodes = Parser.Parse("
  • item
"); + var ul = nodes.OfType().Single(); + var li = ul.Elements.OfType().Single(); + var span = li.Elements.OfType().Single(); + + Assert.NotNull(span); + } + + // ------------------------------------------------------------------ + // Text nodes + // ------------------------------------------------------------------ + + /// + /// A text node inside an element is preserved. + /// + [Fact] + public void TextNode_IsPreserved() + { + var nodes = Parser.Parse("

Hello World

"); + var p = nodes.OfType().Single(); + var text = p.Elements.OfType().Single(); + + Assert.Equal("Hello World", text.Value); + } + + /// + /// A bare text node at the top level is returned as a text node. + /// + [Fact] + public void BareText_ReturnsTextNode() + { + var nodes = Parser.Parse("Hello"); + var text = nodes.OfType().Single(); + + Assert.Equal("Hello", text.Value); + } + + // ------------------------------------------------------------------ + // Self-closing tags + // ------------------------------------------------------------------ + + /// + /// A self-closing <br/> tag produces a . + /// + [Fact] + public void SelfClosingBr_IsReconstructed() + { + var nodes = Parser.Parse("
"); + var br = nodes.OfType().Single(); + + Assert.NotNull(br); + } + + /// + /// A void <img> tag (no trailing slash) produces a . + /// + [Fact] + public void VoidImg_IsReconstructed() + { + var nodes = Parser.Parse("\"photo\""); + var img = nodes.OfType().Single(); + + Assert.Equal("photo.png", img.Src); + Assert.Equal("photo", img.Alt); + } + + // ------------------------------------------------------------------ + // Attributes + // ------------------------------------------------------------------ + + /// + /// Boolean attributes are applied to the element. + /// + [Fact] + public void BooleanAttribute_IsApplied() + { + var nodes = Parser.Parse(""); + var input = nodes.OfType().Single(); + + Assert.True(input.HasUserAttribute("disabled")); + } + + /// + /// data-* attributes are preserved on the element. + /// + [Fact] + public void DataAttribute_IsPreserved() + { + var nodes = Parser.Parse("
"); + var div = nodes.OfType().Single(); + + Assert.Equal("modal", div.GetUserAttribute("data-toggle")); + } + + /// + /// ARIA attributes are preserved on the element. + /// + [Fact] + public void AriaAttribute_IsPreserved() + { + var nodes = Parser.Parse(""); + var btn = nodes.OfType().Single(); + + Assert.Equal("Close", btn.GetUserAttribute("aria-label")); + } + + // ------------------------------------------------------------------ + // Comments + // ------------------------------------------------------------------ + + /// + /// An HTML comment is reconstructed as a node. + /// + [Fact] + public void Comment_IsReconstructed() + { + var nodes = Parser.Parse(""); + var comment = nodes.OfType().Single(); + + Assert.Equal("remark", comment.Text); + } + + // ------------------------------------------------------------------ + // DOCTYPE + // ------------------------------------------------------------------ + + /// + /// A DOCTYPE declaration does not produce a node in the tree (it is + /// informational only). + /// + [Fact] + public void Doctype_ProducesNoNode() + { + var nodes = Parser.Parse(""); + + Assert.DoesNotContain(nodes, n => n is HtmlText t && t.Value.Contains("DOCTYPE")); + } + + // ------------------------------------------------------------------ + // Unknown tags + // ------------------------------------------------------------------ + + /// + /// An unknown tag is mapped to a generic . + /// + [Fact] + public void UnknownTag_MapsToGenericElement() + { + var nodes = Parser.Parse(""); + var element = nodes.OfType().Single(); + + Assert.Equal("bar", element.GetUserAttribute("foo")); + } + + // ------------------------------------------------------------------ + // Malformed HTML + // ------------------------------------------------------------------ + + /// + /// An unclosed tag is handled gracefully and the element is still returned. + /// + [Fact] + public void UnclosedTag_IsHandledGracefully() + { + var nodes = Parser.Parse("

text"); + + var div = nodes.OfType().Single(); + Assert.NotNull(div); + } + + ///

+ /// Parsing an empty string does not throw. + /// + [Fact] + public void EmptyInput_ReturnsEmptyList() + { + var nodes = Parser.Parse(""); + + Assert.Empty(nodes); + } + + /// + /// Passing null to throws + /// . + /// + [Fact] + public void NullInput_Throws() + { + Assert.Throws(() => Parser.Parse(null)); + } + + // ------------------------------------------------------------------ + // Round-trip tests + // ------------------------------------------------------------------ + + /// + /// Parsing the HTML produced by the renderer reconstructs the same + /// element type. + /// + [Fact] + public void RoundTrip_SimpleDiv_PreservesType() + { + // arrange + var original = new HtmlElementTextContentDiv(); + original.Id = "main"; + original.AddClass("container"); + + // act + var html = original.ToString().Trim(); + var parsed = Parser.Parse(html); + + var restored = parsed.OfType().Single(); + + // validation + Assert.Equal(original.Id, restored.Id); + Assert.Equal(original.Class, restored.Class); + } + + /// + /// Parsing the HTML produced by the renderer for a nested structure + /// reconstructs the hierarchy. + /// + [Fact] + public void RoundTrip_NestedStructure_PreservesHierarchy() + { + // arrange + var original = new HtmlElementTextContentDiv( + new HtmlElementTextSemanticsSpan(new HtmlText("hello")) + ); + + // act + var html = original.ToString().Trim(); + var parsed = Parser.Parse(html); + + var div = parsed.OfType().Single(); + var span = div.Elements.OfType().Single(); + var text = span.Elements.OfType().Single(); + + // validation + Assert.Equal("hello", text.Value); + } + + /// + /// Rendering the parsed HTML of an <img> element produces equivalent HTML. 
+ /// + [Fact] + public void RoundTrip_Img_ProducesEquivalentHtml() + { + // arrange + var original = new HtmlElementMultimediaImg + { + Src = "logo.png", + Alt = "Logo" + }; + + // act + var html = original.ToString().Trim(); + var parsed = Parser.Parse(html); + + var img = parsed.OfType().Single(); + var restoredHtml = img.ToString().Trim(); + + // validation + Assert.Equal(html, restoredHtml); + } + + // ------------------------------------------------------------------ + // Additional tests + // ------------------------------------------------------------------ + + /// + /// ParseSingle returns the first node. + /// + [Fact] + public void ParseSingle_ReturnsFirstNode() + { + var node = Parser.ParseSingle("
"); + + Assert.IsType(node); + } + + /// + /// ParseSingle returns null for an empty input. + /// + [Fact] + public void ParseSingle_EmptyInput_ReturnsNull() + { + var node = Parser.ParseSingle(""); + + Assert.Null(node); + } + + /// + /// An element with an inline style attribute retains its value. + /// + [Fact] + public void InlineStyleAttribute_IsPreserved() + { + var nodes = Parser.Parse("
"); + var div = nodes.OfType().Single(); + + Assert.Equal("color: red;", div.Style); + } + + /// + /// A table structure with thead, tbody, and rows is correctly reconstructed. + /// + [Fact] + public void TableStructure_IsReconstructed() + { + var nodes = Parser.Parse("
Header
Cell
"); + var table = nodes.OfType().Single(); + var thead = table.Elements.OfType().Single(); + var tbody = table.Elements.OfType().Single(); + + Assert.NotNull(thead); + Assert.NotNull(tbody); + } + + /// + /// Multiple top-level elements are all returned. + /// + [Fact] + public void MultipleRoots_AreAllReturned() + { + var nodes = Parser.Parse("

one

two

"); + + Assert.Equal(2, nodes.Count); + Assert.All(nodes, n => Assert.IsType(n)); + } + + /// + /// A mismatched end tag is handled gracefully without throwing. + /// + [Fact] + public void MismatchedEndTag_IsHandledGracefully() + { + var nodes = Parser.Parse("
text
"); + + var div = nodes.OfType().Single(); + Assert.NotNull(div); + } + + /// + /// Mixed text and element children are preserved in order. + /// + [Fact] + public void MixedContent_TextAndElements_ArePreserved() + { + var nodes = Parser.Parse("

Hello World!

"); + var p = nodes.OfType().Single(); + + Assert.Equal(3, p.Elements.Count()); + } + + /// + /// Roundtrip of a styled element preserves the style attribute. + /// + [Fact] + public void RoundTrip_StyleAttribute_IsPreserved() + { + // arrange + var original = new HtmlElementTextContentDiv(); + original.Style = "color: red;"; + + // act + var html = original.ToString().Trim(); + var parsed = Parser.Parse(html); + + var restored = parsed.OfType().Single(); + + // validation + Assert.Equal("color: red;", restored.Style); + } + + /// + /// Roundtrip of an anchor element preserves href and text content. + /// + [Fact] + public void RoundTrip_Anchor_PreservesHrefAndText() + { + // arrange + var original = new HtmlElementTextSemanticsA(new HtmlText("click me")); + original.Href = "https://example.com"; + + // act + var html = original.ToString().Trim(); + var parsed = Parser.Parse(html); + + var a = parsed.OfType().Single(); + var text = a.Elements.OfType().Single(); + + // validation + Assert.Equal("https://example.com", a.Href); + Assert.Equal("click me", text.Value); + } + + /// + /// A form with input fields is correctly reconstructed. + /// + [Fact] + public void FormWithInputs_IsReconstructed() + { + var nodes = Parser.Parse("
"); + var form = nodes.OfType().Single(); + var input = form.Elements.OfType().Single(); + + Assert.NotNull(input); + } + + /// + /// The kbd tag (standard HTML) maps to HtmlElementTextSemanticsKdb. + /// + [Fact] + public void KbdTag_MapsToKdbElement() + { + var nodes = Parser.Parse("Ctrl+C"); + var kbd = nodes.OfType().Single(); + + Assert.NotNull(kbd); + } + } +} diff --git a/src/WebExpress.WebCore.Test/Html/Parser/UnitTestHtmlTokenizer.cs b/src/WebExpress.WebCore.Test/Html/Parser/UnitTestHtmlTokenizer.cs new file mode 100644 index 0000000..3dda876 --- /dev/null +++ b/src/WebExpress.WebCore.Test/Html/Parser/UnitTestHtmlTokenizer.cs @@ -0,0 +1,321 @@ +using System.Linq; +using WebExpress.WebCore.WebHtml.Parser; + +namespace WebExpress.WebCore.Test.Html.Parser +{ + /// + /// Unit tests for the class. + /// + [Collection("NonParallelTests")] + public class UnitTestHtmlTokenizer + { + // ------------------------------------------------------------------ + // Helper + // ------------------------------------------------------------------ + + private static HtmlToken[] Tokenize(string html) + { + var tokenizer = new HtmlTokenizer(html); + return [.. tokenizer.Tokenize()]; + } + + // ------------------------------------------------------------------ + // Basic tokens + // ------------------------------------------------------------------ + + /// + /// An empty input produces only an EOF token. + /// + [Fact] + public void EmptyInput_ReturnsEof() + { + var tokens = Tokenize(""); + + Assert.Single(tokens); + Assert.Equal(HtmlTokenType.EndOfFile, tokens[0].Type); + } + + /// + /// A plain text input produces a text token. + /// + [Fact] + public void PlainText_ReturnsTextToken() + { + var tokens = Tokenize("Hello World"); + + Assert.Equal(HtmlTokenType.Text, tokens[0].Type); + Assert.Equal("Hello World", tokens[0].Value); + } + + /// + /// A simple start tag produces a start-tag token. 
+ /// + [Fact] + public void SimpleStartTag_ReturnsStartTag() + { + var tokens = Tokenize("
"); + + Assert.Equal(HtmlTokenType.StartTag, tokens[0].Type); + Assert.Equal("div", tokens[0].TagName); + } + + /// + /// A simple end tag produces an end-tag token. + /// + [Fact] + public void SimpleEndTag_ReturnsEndTag() + { + var tokens = Tokenize("
"); + + Assert.Equal(HtmlTokenType.EndTag, tokens[0].Type); + Assert.Equal("div", tokens[0].TagName); + } + + /// + /// An explicit self-closing tag produces a self-closing token. + /// + [Fact] + public void ExplicitSelfClosingTag_ReturnsSelfClosing() + { + var tokens = Tokenize("
"); + + Assert.Equal(HtmlTokenType.SelfClosingTag, tokens[0].Type); + Assert.Equal("br", tokens[0].TagName); + } + + /// + /// A void element without a trailing slash is still emitted as self-closing. + /// + [Fact] + public void VoidElement_ReturnsSelfClosing() + { + var tokens = Tokenize(""); + + Assert.Equal(HtmlTokenType.SelfClosingTag, tokens[0].Type); + Assert.Equal("img", tokens[0].TagName); + } + + // ------------------------------------------------------------------ + // Attributes + // ------------------------------------------------------------------ + + /// + /// Quoted attribute values are correctly extracted. + /// + [Fact] + public void TagWithQuotedAttribute_ExtractsAttribute() + { + var tokens = Tokenize("
"); + + Assert.Equal(HtmlTokenType.StartTag, tokens[0].Type); + var attr = tokens[0].Attributes.Single(); + Assert.Equal("class", attr.Name); + Assert.Equal("foo", attr.Value); + } + + /// + /// Single-quoted attribute values are correctly extracted. + /// + [Fact] + public void TagWithSingleQuotedAttribute_ExtractsAttribute() + { + var tokens = Tokenize("
"); + + var attr = tokens[0].Attributes.Single(); + Assert.Equal("class", attr.Name); + Assert.Equal("bar", attr.Value); + } + + /// + /// Multiple attributes on a single tag are all extracted. + /// + [Fact] + public void TagWithMultipleAttributes_ExtractsAll() + { + var tokens = Tokenize(""); + + var attrs = tokens[0].Attributes; + Assert.Equal(3, attrs.Count); + Assert.Equal("id", attrs[0].Name); + Assert.Equal("x", attrs[0].Value); + Assert.Equal("type", attrs[1].Name); + Assert.Equal("text", attrs[1].Value); + Assert.Equal("disabled", attrs[2].Name); + Assert.True(attrs[2].IsBoolean); + } + + /// + /// Boolean (valueless) attributes are represented without a value. + /// + [Fact] + public void BooleanAttribute_IsBoolean() + { + var tokens = Tokenize(""); + + var attr = tokens[0].Attributes.Single(); + Assert.True(attr.IsBoolean); + Assert.Null(attr.Value); + } + + /// + /// Data attributes are preserved as-is. + /// + [Fact] + public void DataAttribute_IsPreserved() + { + var tokens = Tokenize("
"); + + var attr = tokens[0].Attributes.Single(); + Assert.Equal("data-toggle", attr.Name); + Assert.Equal("modal", attr.Value); + } + + /// + /// ARIA attributes are preserved as-is. + /// + [Fact] + public void AriaAttribute_IsPreserved() + { + var tokens = Tokenize(""); + + var attr = tokens[0].Attributes.Single(); + Assert.Equal("aria-label", attr.Name); + Assert.Equal("Close", attr.Value); + } + + // ------------------------------------------------------------------ + // Comments and DOCTYPE + // ------------------------------------------------------------------ + + /// + /// An HTML comment produces a comment token whose value does not include the delimiters. + /// + [Fact] + public void Comment_ReturnsCommentToken() + { + var tokens = Tokenize(""); + + Assert.Equal(HtmlTokenType.Comment, tokens[0].Type); + Assert.Equal("remark", tokens[0].Value); + } + + /// + /// A DOCTYPE declaration produces a doctype token. + /// + [Fact] + public void Doctype_ReturnsDoctypeToken() + { + var tokens = Tokenize(""); + + Assert.Equal(HtmlTokenType.Doctype, tokens[0].Type); + Assert.Equal("html", tokens[0].TagName); + } + + // ------------------------------------------------------------------ + // Compound input + // ------------------------------------------------------------------ + + /// + /// A typical HTML snippet produces the expected sequence of tokens. + /// + [Fact] + public void CompoundInput_ProducesExpectedSequence() + { + var tokens = Tokenize("

Hello

"); + + Assert.Equal(HtmlTokenType.StartTag, tokens[0].Type); + Assert.Equal("p", tokens[0].TagName); + + Assert.Equal(HtmlTokenType.Text, tokens[1].Type); + Assert.Equal("Hello", tokens[1].Value); + + Assert.Equal(HtmlTokenType.EndTag, tokens[2].Type); + Assert.Equal("p", tokens[2].TagName); + + Assert.Equal(HtmlTokenType.EndOfFile, tokens[3].Type); + } + + /// + /// Tags names are normalised to lower case. + /// + [Fact] + public void TagNameNormalisation_IsLowerCase() + { + var tokens = Tokenize("
"); + + Assert.Equal("div", tokens[0].TagName); + Assert.Equal("class", tokens[0].Attributes[0].Name); + } + + // ------------------------------------------------------------------ + // Whitespace and edge cases + // ------------------------------------------------------------------ + + /// + /// Whitespace-only text between tags is preserved as a text token. + /// + [Fact] + public void WhitespaceText_IsPreservedAsTextToken() + { + var tokens = Tokenize("
"); + + Assert.Equal(HtmlTokenType.StartTag, tokens[0].Type); + Assert.Equal(HtmlTokenType.Text, tokens[1].Type); + Assert.Equal(" ", tokens[1].Value); + Assert.Equal(HtmlTokenType.EndTag, tokens[2].Type); + } + + /// + /// An unquoted attribute value is read until whitespace or closing bracket. + /// + [Fact] + public void UnquotedAttributeValue_IsExtracted() + { + var tokens = Tokenize("
"); + + var attr = tokens[0].Attributes.Single(); + Assert.Equal("class", attr.Name); + Assert.Equal("foo", attr.Value); + } + + /// + /// An inline style attribute is preserved in its entirety. + /// + [Fact] + public void InlineStyleAttribute_IsPreserved() + { + var tokens = Tokenize("
"); + + var attr = tokens[0].Attributes.Single(); + Assert.Equal("style", attr.Name); + Assert.Equal("color: red; font-size: 14px;", attr.Value); + } + + /// + /// A stray less-than character is emitted as text. + /// + [Fact] + public void StrayLessThan_IsEmittedAsText() + { + var tokens = Tokenize("a < b"); + + Assert.Equal(HtmlTokenType.Text, tokens[0].Type); + Assert.Equal("a ", tokens[0].Value); + Assert.Equal(HtmlTokenType.Text, tokens[1].Type); + Assert.Equal("<", tokens[1].Value); + Assert.Equal(HtmlTokenType.Text, tokens[2].Type); + } + + /// + /// A keygen void element without slash is emitted as self-closing. + /// + [Fact] + public void KeygenVoidElement_ReturnsSelfClosing() + { + var tokens = Tokenize(""); + + Assert.Equal(HtmlTokenType.SelfClosingTag, tokens[0].Type); + Assert.Equal("keygen", tokens[0].TagName); + } + } +} diff --git a/src/WebExpress.WebCore/WebHtml/Parser/HtmlElementFactory.cs b/src/WebExpress.WebCore/WebHtml/Parser/HtmlElementFactory.cs new file mode 100644 index 0000000..b44c04b --- /dev/null +++ b/src/WebExpress.WebCore/WebHtml/Parser/HtmlElementFactory.cs @@ -0,0 +1,201 @@ +using System; +using System.Collections.Generic; + +namespace WebExpress.WebCore.WebHtml.Parser +{ + /// + /// Maps HTML tag names to their corresponding subclass instances. + /// + /// + /// When a tag name is not recognised, the factory returns a generic + /// instance whose ElementName is preserved so + /// that the tag name survives a round-trip through the renderer. 
+ /// + public class HtmlElementFactory + { + private static readonly Dictionary> _registry = + new(StringComparer.OrdinalIgnoreCase) + { + // Root + ["html"] = () => new HtmlElementRootHtml(), + + // Metadata + ["head"] = () => new HtmlElementMetadataHead(), + ["base"] = () => new HtmlElementMetadataBase(), + ["link"] = () => new HtmlElementMetadataLink(), + ["meta"] = () => new HtmlElementMetadataMeta(), + ["style"] = () => new HtmlElementMetadataStyle(), + ["title"] = () => new HtmlElementMetadataTitle(), + + // Scripting + ["script"] = () => new HtmlElementScriptingScript(), + ["noscript"] = () => new HtmlElementScriptingNoscript(), + ["canvas"] = () => new HtmlElementScriptingCanvas(), + + // Sections + ["body"] = () => new HtmlElementSectionBody(), + ["address"] = () => new HtmlElementSectionAddress(), + ["article"] = () => new HtmlElementSectionArticle(), + ["aside"] = () => new HtmlElementSectionAside(), + ["footer"] = () => new HtmlElementSectionFooter(), + ["h1"] = () => new HtmlElementSectionH1(), + ["h2"] = () => new HtmlElementSectionH2(), + ["h3"] = () => new HtmlElementSectionH3(), + ["h4"] = () => new HtmlElementSectionH4(), + ["h5"] = () => new HtmlElementSectionH5(), + ["h6"] = () => new HtmlElementSectionH6(), + ["header"] = () => new HtmlElementSectionHeader(), + ["main"] = () => new HtmlElementSectionMain(), + ["nav"] = () => new HtmlElementSectionNav(), + ["section"] = () => new HtmlElementSectionSection(), + + // Text content + ["blockquote"] = () => new HtmlElementTextContentBlockquote(), + ["dd"] = () => new HtmlElementTextContentDd(), + ["div"] = () => new HtmlElementTextContentDiv(), + ["dl"] = () => new HtmlElementTextContentDl(), + ["dt"] = () => new HtmlElementTextContentDt(), + ["figcaption"] = () => new HtmlElementTextContentFigcaption(), + ["figure"] = () => new HtmlElementTextContentFigure(), + ["hr"] = () => new HtmlElementTextContentHr(), + ["li"] = () => new HtmlElementTextContentLi(), + ["ol"] = () => new 
HtmlElementTextContentOl(),
+                ["p"] = () => new HtmlElementTextContentP(),
+                ["pre"] = () => new HtmlElementTextContentPre(),
+                ["ul"] = () => new HtmlElementTextContentUl(),
+
+                // Inline text semantics
+                ["a"] = () => new HtmlElementTextSemanticsA(),
+                ["abbr"] = () => new HtmlElementTextSemanticsAbbr(),
+                ["b"] = () => new HtmlElementTextSemanticsB(),
+                ["bdi"] = () => new HtmlElementTextSemanticsBdi(),
+                ["bdo"] = () => new HtmlElementTextSemanticsBdo(),
+                ["br"] = () => new HtmlElementTextSemanticsBr(),
+                ["cite"] = () => new HtmlElementTextSemanticsCite(),
+                ["code"] = () => new HtmlElementTextSemanticsCode(),
+                ["data"] = () => new HtmlElementTextSemanticsData(),
+                ["dfn"] = () => new HtmlElementTextSemanticsDfn(),
+                ["em"] = () => new HtmlElementTextSemanticsEm(),
+                ["i"] = () => new HtmlElementTextSemanticsI(),
+                // The standard HTML tag name is "kbd", but the existing class
+                // (HtmlElementTextSemanticsKdb) uses the misspelled element name "kdb".
+                // Both spellings are registered exactly once each, so that the parser
+                // accepts real-world HTML ("kbd") as well as the project's own
+                // renderer output ("kdb"). Do not add a second entry for either key:
+                // index initializers assign through the indexer, so a duplicate key
+                // silently overwrites the earlier entry instead of failing.
+                ["kbd"] = () => new HtmlElementTextSemanticsKdb(),
+                ["kdb"] = () => new HtmlElementTextSemanticsKdb(),
+                ["mark"] = () => new HtmlElementTextSemanticsMark(),
+                ["q"] = () => new HtmlElementTextSemanticsQ(),
+                ["rp"] = () => new HtmlElementTextSemanticsRp(),
+                ["rt"] = () => new HtmlElementTextSemanticsRt(),
+                ["ruby"] = () => new HtmlElementTextSemanticsRuby(),
+                ["s"] = () => new HtmlElementTextSemanticsS(),
+                ["samp"] = () => new HtmlElementTextSemanticsSamp(),
+                ["small"] = () => new HtmlElementTextSemanticsSmall(),
+                ["span"] = () => new HtmlElementTextSemanticsSpan(),
+                ["strong"] = () => new HtmlElementTextSemanticsStrong(),
+                ["sub"] = () => new HtmlElementTextSemanticsSub(),
+                ["sup"] = () => new HtmlElementTextSemanticsSup(),
+                ["time"] = () => new HtmlElementTextSemanticsTime(),
+                ["u"] = () => new HtmlElementTextSemanticsU(),
+                ["var"] = () => new HtmlElementTextSemanticsVar(),
+                ["wbr"] = () => new HtmlElementTextSemanticsWbr(),
+
+                // Edits
+                ["del"] = () => new HtmlElementEditDel(),
+                ["ins"] = () => new HtmlElementEditIns(),
+
+                // Embedded content
+                ["embed"] = () => new HtmlElementEmbeddedEmbed(),
+                ["iframe"] = () => new HtmlElementEmbeddedIframe(),
+                ["object"] = () => new HtmlElementEmbeddedObject(),
+                ["param"] = () => new HtmlElementEmbeddedParam(),
+                ["picture"] = () => new HtmlElementEmbeddedPicture(),
+                ["source"] = () => new HtmlElementEmbeddedSource(),
+
+                // Multimedia
+                ["area"] = () => new HtmlElementMultimediaArea(),
+                ["audio"] = () => new HtmlElementMultimediaAudio(),
+                ["img"] = () => new HtmlElementMultimediaImg(),
+                ["map"] = () => new HtmlElementMultimediaMap(),
+                ["math"] = () => new HtmlElementMultimediaMath(),
+                ["svg"] = () => new HtmlElementMultimediaSvg(),
+                ["track"] = () => new HtmlElementMultimediaTrack(),
+                ["video"] = () => new HtmlElementMultimediaVideo(),
+
+                // Table
+                ["caption"] = () => new HtmlElementTableCaption(),
+                ["col"] = () => new HtmlElementTableCol(),
+                ["colgroup"] = () => new HtmlElementTableColgroup(),
+                ["table"] = () => new 
HtmlElementTableTable(), + ["tbody"] = () => new HtmlElementTableTbody(), + ["td"] = () => new HtmlElementTableTd(), + ["tfoot"] = () => new HtmlElementTableTfoot(), + ["th"] = () => new HtmlElementTableTh(), + ["thead"] = () => new HtmlElementTableThead(), + ["tr"] = () => new HtmlElementTableTr(), + + // Forms + ["button"] = () => new HtmlElementFieldButton(), + ["input"] = () => new HtmlElementFieldInput(), + ["label"] = () => new HtmlElementFieldLabel(), + ["legend"] = () => new HtmlElementFieldLegend(), + ["select"] = () => new HtmlElementFieldSelect(), + ["datalist"] = () => new HtmlElementFormDatalist(), + ["fieldset"] = () => new HtmlElementFormFieldset(), + ["form"] = () => new HtmlElementFormForm(), + ["keygen"] = () => new HtmlElementFormKeygen(), + ["meter"] = () => new HtmlElementFormMeter(), + ["optgroup"] = () => new HtmlElementFormOptgroup(), + ["option"] = () => new HtmlElementFormOption(), + ["output"] = () => new HtmlElementFormOutput(), + ["progress"] = () => new HtmlElementFormProgress(), + ["textarea"] = () => new HtmlElementFormTextarea(), + + // Interactive + ["command"] = () => new HtmlElementInteractiveCommand(), + ["details"] = () => new HtmlElementInteractiveDetails(), + ["menu"] = () => new HtmlElementInteractiveMenu(), + ["summary"] = () => new HtmlElementInteractiveSummary(), + + // Web fragments + ["slot"] = () => new HtmlElementWebFragmentsSlot(), + ["template"] = () => new HtmlElementWebFragmentsTemplate(), + }; + + /// + /// Creates an instance for the specified HTML tag name. + /// + /// The lower-case HTML tag name (e.g. "div"). + /// + /// An instance of the most specific subclass that + /// corresponds to . If the tag name is unknown, a generic + /// is returned so that the parser remains robust. + /// + /// + /// Thrown when is null. 
+ /// + public static HtmlElement Create(string tagName) + { + if (tagName is null) + { + throw new ArgumentNullException(nameof(tagName)); + } + + if (_registry.TryGetValue(tagName, out var factory)) + { + return factory(); + } + + // Unknown tag – return a generic element so parsing remains robust. + return new HtmlElement(tagName); + } + + /// + /// Returns true if the specified tag name is registered in the factory. + /// + /// The HTML tag name to look up (case-insensitive). + public static bool IsKnown(string tagName) => + tagName is not null && _registry.ContainsKey(tagName); + } +} diff --git a/src/WebExpress.WebCore/WebHtml/Parser/HtmlParseException.cs b/src/WebExpress.WebCore/WebHtml/Parser/HtmlParseException.cs new file mode 100644 index 0000000..68de726 --- /dev/null +++ b/src/WebExpress.WebCore/WebHtml/Parser/HtmlParseException.cs @@ -0,0 +1,54 @@ +using System; + +namespace WebExpress.WebCore.WebHtml.Parser +{ + /// + /// Represents an error that occurs while tokenizing or parsing an HTML string. + /// + public class HtmlParseException : Exception + { + /// + /// Returns the zero-based character position in the input at which the + /// error was detected, or -1 if the position is unknown. + /// + public int Position { get; } + + /// + /// Initializes a new instance of the class + /// with the specified error message. + /// + /// A descriptive message that explains the reason for the failure. + public HtmlParseException(string message) + : base(message) + { + Position = -1; + } + + /// + /// Initializes a new instance of the class + /// with the specified error message and the position in the input where the + /// error occurred. + /// + /// A descriptive message that explains the reason for the failure. + /// The zero-based character position in the input where the error was detected. 
public HtmlParseException(string message, int position)
            : base(message) => Position = position;

        /// <summary>
        /// Initializes a new instance of the <see cref="HtmlParseException"/> class
        /// with the specified error message and a reference to the inner exception
        /// that caused this exception. The position is recorded as unknown (-1).
        /// </summary>
        /// <param name="message">A descriptive message that explains the reason for the failure.</param>
        /// <param name="innerException">The exception that is the cause of the current exception,
        /// or null if no inner exception is specified.</param>
        public HtmlParseException(string message, Exception innerException)
            : base(message, innerException) => Position = -1;
    }
}
diff --git a/src/WebExpress.WebCore/WebHtml/Parser/HtmlParser.cs b/src/WebExpress.WebCore/WebHtml/Parser/HtmlParser.cs
new file mode 100644
index 0000000..a3567f5
--- /dev/null
+++ b/src/WebExpress.WebCore/WebHtml/Parser/HtmlParser.cs
@@ -0,0 +1,176 @@
using System;
using System.Collections.Generic;
using System.Linq;

namespace WebExpress.WebCore.WebHtml.Parser
{
    /// <summary>
    /// Parses an HTML string and reconstructs the corresponding
    /// <see cref="IHtmlNode"/> object tree using the classes from
    /// WebExpress.WebCore.WebHtml.
    /// </summary>
    /// <remarks>
    /// The parser consumes the token stream produced by <see cref="HtmlTokenizer"/>
    /// and builds a DOM-like hierarchy of <see cref="IHtmlNode"/> objects. It is
    /// deliberately tolerant: unknown tags are mapped to a generic
    /// <see cref="HtmlElement"/>, and unclosed tags are automatically closed when
    /// an end token for an ancestor is encountered (or when the token stream ends).
    /// </remarks>
    public class HtmlParser
    {
        // ------------------------------------------------------------------
        // Public entry points
        // ------------------------------------------------------------------

        /// <summary>
        /// Parses the supplied HTML string and returns the top-level nodes.
        /// </summary>
        /// <param name="html">The HTML string to parse.</param>
        /// <returns>
        /// A read-only list of <see cref="IHtmlNode"/> objects that form the
        /// top-level content of the parsed document.
        /// </returns>
        /// <exception cref="ArgumentNullException">Thrown when <paramref name="html"/> is null.</exception>
        /// <exception cref="HtmlParseException">Thrown when the HTML is so malformed that recovery is impossible.</exception>
public IReadOnlyList<IHtmlNode> Parse(string html)
        {
            // Checked here (not only in the tokenizer) so the exception names
            // this method's own parameter.
            if (html is null)
            {
                throw new ArgumentNullException(nameof(html));
            }

            var tokens = new HtmlTokenizer(html).Tokenize();
            return ParseTokens(tokens);
        }

        /// <summary>
        /// Parses the supplied HTML string and returns the single root node.
        /// </summary>
        /// <param name="html">The HTML string to parse.</param>
        /// <returns>
        /// The first top-level <see cref="IHtmlNode"/>, or null if the
        /// input produces no nodes.
        /// </returns>
        /// <exception cref="ArgumentNullException">Thrown when <paramref name="html"/> is null.</exception>
        public IHtmlNode ParseSingle(string html)
        {
            var nodes = Parse(html);
            return nodes.Count > 0 ? nodes[0] : null;
        }

        // ------------------------------------------------------------------
        // Core parsing logic
        // ------------------------------------------------------------------

        /// <summary>
        /// Builds the list of top-level nodes from a complete token stream.
        /// </summary>
        /// <param name="tokens">The tokens produced by the tokenizer.</param>
        /// <returns>The top-level nodes in document order.</returns>
        private IReadOnlyList<IHtmlNode> ParseTokens(IReadOnlyList<HtmlToken> tokens)
        {
            var index = 0;
            var roots = new List<IHtmlNode>();

            // The stack starts empty: at the top level no element is open.
            ParseNodes(tokens, ref index, new Stack<string>(), roots);

            return roots;
        }

        /// <summary>
        /// Recursively consumes tokens and appends the resulting nodes to
        /// <paramref name="target"/>. Parsing of the current level stops when an
        /// end tag for the innermost open element (consumed) or for one of its
        /// ancestors (left for the ancestor's frame) is found, or when the stream ends.
        /// </summary>
        /// <param name="tokens">The complete token stream.</param>
        /// <param name="index">The position of the next token to consume; advanced as tokens are read.</param>
        /// <param name="openTags">
        /// The tag names of all elements that are currently open, innermost on top.
        /// </param>
        /// <param name="target">The list that receives the nodes parsed at this level.</param>
        private void ParseNodes(
            IReadOnlyList<HtmlToken> tokens,
            ref int index,
            Stack<string> openTags,
            List<IHtmlNode> target)
        {
            while (index < tokens.Count)
            {
                var token = tokens[index];

                switch (token.Type)
                {
                    case HtmlTokenType.EndOfFile:
                        return;

                    case HtmlTokenType.Doctype:
                        index++;
                        // DOCTYPE tokens are informational; no node is emitted.
                        break;

                    case HtmlTokenType.Comment:
                        index++;
                        target.Add(new HtmlComment(token.Value));
                        break;

                    case HtmlTokenType.Text:
                        index++;
                        var textValue = token.Value;
                        if (!string.IsNullOrEmpty(textValue))
                        {
                            target.Add(new HtmlText(textValue));
                        }
                        break;

                    case HtmlTokenType.SelfClosingTag:
                        index++;
                        var selfClosing = HtmlElementFactory.Create(token.TagName);
                        ApplyAttributes(selfClosing, token.Attributes);
                        target.Add(selfClosing);
                        break;

                    case HtmlTokenType.StartTag:
                        index++;
                        var element = HtmlElementFactory.Create(token.TagName);
                        ApplyAttributes(element, token.Attributes);

                        // Only elements that have a closing tag can own children.
                        if (element.CloseTag)
                        {
                            var children = new List<IHtmlNode>();
                            openTags.Push(token.TagName);
                            ParseNodes(tokens, ref index, openTags, children);
                            openTags.Pop();
                            element.Add(children.ToArray());
                        }

                        target.Add(element);
                        break;

                    case HtmlTokenType.EndTag:
                        if (openTags.Count > 0 &&
                            string.Equals(token.TagName, openTags.Peek(), StringComparison.OrdinalIgnoreCase))
                        {
                            index++; // consume the matching end tag
                            return;
                        }

                        if (openTags.Contains(token.TagName, StringComparer.OrdinalIgnoreCase))
                        {
                            // The tag closes an ancestor: end this element implicitly
                            // and leave the token for the ancestor's frame.
                            return;
                        }

                        // Stray end tag that closes nothing. Skip it; returning here
                        // would unwind every frame and silently drop the remainder
                        // of the document.
                        index++;
                        break;

                    default:
                        index++;
                        break;
                }
            }
        }

        // ------------------------------------------------------------------
        // Attribute mapping
        // ------------------------------------------------------------------

        /// <summary>
        /// Copies the attributes of a tag token onto the created element.
        /// Boolean attributes are added by name only; all others with their value.
        /// </summary>
        /// <param name="element">The element that receives the attributes.</param>
        /// <param name="attributes">The token attributes; may be null or empty.</param>
        private static void ApplyAttributes(HtmlElement element, IReadOnlyList<HtmlTokenAttribute> attributes)
        {
            if (attributes is null || attributes.Count == 0)
            {
                return;
            }

            foreach (var attr in attributes)
            {
                if (attr.IsBoolean)
                {
                    element.AddUserAttribute(attr.Name);
                }
                else
                {
                    element.AddUserAttribute(attr.Name, attr.Value);
                }
            }
        }
    }
}
diff --git a/src/WebExpress.WebCore/WebHtml/Parser/HtmlToken.cs b/src/WebExpress.WebCore/WebHtml/Parser/HtmlToken.cs
new file mode 100644
index 0000000..ed21c05
--- /dev/null
+++ b/src/WebExpress.WebCore/WebHtml/Parser/HtmlToken.cs
@@ -0,0 +1,85 @@
using System.Collections.Generic;

namespace WebExpress.WebCore.WebHtml.Parser
{
    /// <summary>
    /// Represents a single token produced by the <see cref="HtmlTokenizer"/>.
    /// </summary>
    public class HtmlToken
    {
        /// <summary>
        /// Returns the type of this token.
        /// </summary>
        public HtmlTokenType Type { get; }

        /// <summary>
        /// Returns the tag name for start, end, and self-closing tokens, or the
        /// doctype name for doctype tokens. Returns null for text, comment,
        /// and end-of-file tokens.
        /// </summary>
        public string TagName { get; }

        /// <summary>
        /// Returns the raw text value for text and comment tokens. For start and
        /// self-closing tokens this property is unused (null).
        /// </summary>
        public string Value { get; }

        /// <summary>
        /// Returns the attributes associated with start and self-closing tokens.
        /// Each entry holds the attribute name (lower-case) and the attribute
        /// value, or null as value for boolean (valueless) attributes.
        /// The list is never null.
        /// </summary>
        public IReadOnlyList<HtmlTokenAttribute> Attributes { get; }

        /// <summary>
        /// Initializes a new end-of-file token.
        /// </summary>
        public HtmlToken()
        {
            Type = HtmlTokenType.EndOfFile;
            Attributes = [];
        }

        /// <summary>
        /// Initializes a new text or comment token.
        /// </summary>
        /// <param name="type">The token type. Must be <see cref="HtmlTokenType.Text"/> or
        /// <see cref="HtmlTokenType.Comment"/>.</param>
/// <param name="value">The raw text or comment content.</param>
        public HtmlToken(HtmlTokenType type, string value)
        {
            Type = type;
            Value = value;
            Attributes = [];
        }

        /// <summary>
        /// Initializes a new tag or doctype token.
        /// </summary>
        /// <param name="type">The token type.</param>
        /// <param name="tagName">The tag or doctype name.</param>
        /// <param name="attributes">The list of attributes. A null list is replaced by an empty one.</param>
        public HtmlToken(HtmlTokenType type, string tagName, IReadOnlyList<HtmlTokenAttribute> attributes)
        {
            Type = type;
            TagName = tagName;
            Attributes = attributes ?? [];
        }

        /// <summary>
        /// Returns a human-readable description of this token for debugging purposes.
        /// Every token type renders in a form that resembles its HTML source so the
        /// kinds can be told apart in logs and in the debugger.
        /// </summary>
        public override string ToString() => Type switch
        {
            HtmlTokenType.StartTag => $"<{TagName}>",
            HtmlTokenType.EndTag => $"</{TagName}>",
            HtmlTokenType.SelfClosingTag => $"<{TagName}/>",
            HtmlTokenType.Doctype => $"<!DOCTYPE {TagName}>",
            HtmlTokenType.Text => $"\"{Value}\"",
            HtmlTokenType.Comment => $"<!-- {Value} -->",
            HtmlTokenType.EndOfFile => "<EOF>",
            _ => base.ToString()
        };
    }
}
diff --git a/src/WebExpress.WebCore/WebHtml/Parser/HtmlTokenAttribute.cs b/src/WebExpress.WebCore/WebHtml/Parser/HtmlTokenAttribute.cs
new file mode 100644
index 0000000..04980e5
--- /dev/null
+++ b/src/WebExpress.WebCore/WebHtml/Parser/HtmlTokenAttribute.cs
@@ -0,0 +1,51 @@
namespace WebExpress.WebCore.WebHtml.Parser
{
    /// <summary>
    /// Represents a single attribute within an HTML tag token.
    /// </summary>
    public class HtmlTokenAttribute
    {
        /// <summary>
        /// Returns the attribute name (lower-cased).
        /// </summary>
        public string Name { get; }

        /// <summary>
        /// Returns the attribute value, or null if the attribute is boolean
        /// (has no explicit value, e.g. <c>disabled</c>).
        /// </summary>
        public string Value { get; }

        /// <summary>
        /// Returns a value indicating whether this is a boolean (valueless) attribute.
        /// </summary>
        public bool IsBoolean => Value is null;

        /// <summary>
        /// Initializes a new valueless (boolean) attribute.
        /// </summary>
        /// <param name="name">The attribute name.</param>
        public HtmlTokenAttribute(string name)
        {
            Name = name;
            Value = null;
        }

        /// <summary>Initializes a new attribute with a value.</summary>
///
        /// <param name="name">The attribute name.</param>
        /// <param name="value">The attribute value.</param>
        public HtmlTokenAttribute(string name, string value)
            => (Name, Value) = (name, value);

        /// <summary>
        /// Returns a human-readable description of this attribute for debugging purposes:
        /// the bare name for boolean attributes, otherwise name="value".
        /// </summary>
        public override string ToString()
        {
            if (IsBoolean)
            {
                return Name;
            }

            return $"{Name}=\"{Value}\"";
        }
    }
}
diff --git a/src/WebExpress.WebCore/WebHtml/Parser/HtmlTokenType.cs b/src/WebExpress.WebCore/WebHtml/Parser/HtmlTokenType.cs
new file mode 100644
index 0000000..32a287a
--- /dev/null
+++ b/src/WebExpress.WebCore/WebHtml/Parser/HtmlTokenType.cs
@@ -0,0 +1,43 @@
namespace WebExpress.WebCore.WebHtml.Parser
{
    /// <summary>
    /// Specifies the type of an HTML token produced by the tokenizer.
    /// </summary>
    public enum HtmlTokenType
    {
        /// <summary>
        /// Represents a DOCTYPE declaration, e.g. <c>&lt;!DOCTYPE html&gt;</c>.
        /// </summary>
        Doctype,

        /// <summary>
        /// Represents an opening tag, e.g. <c>&lt;div class="foo"&gt;</c>.
        /// </summary>
        StartTag,

        /// <summary>
        /// Represents a closing tag, e.g. <c>&lt;/div&gt;</c>.
        /// </summary>
        EndTag,

        /// <summary>
        /// Represents a self-closing tag, e.g. <c>&lt;br/&gt;</c> or void elements
        /// like <c>&lt;img&gt;</c>.
        /// </summary>
        SelfClosingTag,

        /// <summary>
        /// Represents a text node.
        /// </summary>
        Text,

        /// <summary>
        /// Represents an HTML comment, e.g. <c>&lt;!-- remark --&gt;</c>.
        /// </summary>
        Comment,

        /// <summary>
        /// Represents the end of the token stream.
        /// </summary>
        EndOfFile
    }
}
diff --git a/src/WebExpress.WebCore/WebHtml/Parser/HtmlTokenizer.cs b/src/WebExpress.WebCore/WebHtml/Parser/HtmlTokenizer.cs
new file mode 100644
index 0000000..b96e085
--- /dev/null
+++ b/src/WebExpress.WebCore/WebHtml/Parser/HtmlTokenizer.cs
@@ -0,0 +1,350 @@
using System;
using System.Collections.Generic;
using System.Text;

namespace WebExpress.WebCore.WebHtml.Parser
{
    /// <summary>
    /// Breaks an HTML string into a flat sequence of <see cref="HtmlToken"/> objects.
    /// </summary>
    ///
    /// The tokenizer is deliberately lenient: it makes a best-effort attempt to
    /// produce useful tokens even when the input HTML is malformed. Recoverable
    /// situations (e.g.
/// an unclosed tag or an unquoted attribute value) are handled
    /// silently; only situations that make further tokenization impossible result in a
    /// <see cref="HtmlParseException"/>.
    ///
    public class HtmlTokenizer
    {
        // Keyword that follows "<!" in a document type declaration.
        private const string DoctypeKeyword = "DOCTYPE";

        private readonly string _input;
        private int _position;

        /// <summary>
        /// Returns the set of HTML void-element tag names that are always
        /// treated as self-closing even when the input does not include a
        /// trailing slash.
        /// </summary>
        private static readonly HashSet<string> VoidElements =
            new(StringComparer.OrdinalIgnoreCase)
            {
                "area", "base", "br", "col", "embed", "hr", "img", "input",
                "keygen", "link", "meta", "param", "source", "track", "wbr"
            };

        /// <summary>
        /// Initializes a new instance of the <see cref="HtmlTokenizer"/> class.
        /// </summary>
        /// <param name="input">The HTML string to tokenize.</param>
        /// <exception cref="ArgumentNullException">Thrown when <paramref name="input"/> is null.</exception>
        public HtmlTokenizer(string input)
        {
            _input = input ?? throw new ArgumentNullException(nameof(input));
        }

        /// <summary>
        /// Tokenizes the entire input and returns all tokens, including a final
        /// <see cref="HtmlTokenType.EndOfFile"/> token. Each call restarts at the
        /// beginning of the input.
        /// </summary>
        /// <returns>A list of <see cref="HtmlToken"/> objects.</returns>
        public IReadOnlyList<HtmlToken> Tokenize()
        {
            var tokens = new List<HtmlToken>();
            _position = 0;

            // Every reader below consumes at least one character, so the loop terminates.
            while (_position < _input.Length)
            {
                var token = ReadNextToken();
                if (token != null)
                {
                    tokens.Add(token);
                }
            }

            tokens.Add(new HtmlToken());
            return tokens;
        }

        // ------------------------------------------------------------------
        // Private helpers
        // ------------------------------------------------------------------

        // Character at the read position, or '\0' once the input is exhausted.
        private char Current => _position < _input.Length ? _input[_position] : '\0';

        // Character 'offset' positions ahead of the read position, or '\0' past the end.
        private char Peek(int offset = 1) =>
            (_position + offset) < _input.Length ? _input[_position + offset] : '\0';

        // Dispatches on the current character: '<' starts markup, anything else is text.
        private HtmlToken ReadNextToken()
        {
            if (Current == '<')
            {
                return ReadTagOrSpecial();
            }

            return ReadText();
        }

        // Reads whatever construct begins with the '<' at the current position.
        private HtmlToken ReadTagOrSpecial()
        {
            var start = _position;
            _position++; // consume '<'

            if (Current == '!')
            {
                return ReadBangToken(start);
            }

            if (Current == '/')
            {
                return ReadEndTag();
            }

            if (char.IsLetter(Current) || Current == '_')
            {
                return ReadStartTag();
            }

            // Anything else – treat the stray '<' as text.
            return new HtmlToken(HtmlTokenType.Text, "<");
        }

        // Handles <!-- comments --> and <!DOCTYPE ...>; 'start' is the index of the '<'.
        private HtmlToken ReadBangToken(int start)
        {
            _position++; // consume '!'

            if (Current == '-' && Peek() == '-')
            {
                return ReadComment();
            }

            // Compare in place at the current position. An IndexOf(...) == _position
            // test would scan the whole remaining input whenever the keyword is absent.
            if (string.Compare(_input, _position, DoctypeKeyword, 0, DoctypeKeyword.Length,
                    StringComparison.OrdinalIgnoreCase) == 0)
            {
                return ReadDoctype();
            }

            // Unknown bang construct – consume up to and including '>' and emit
            // the raw source (starting at "<!") as text.
            SkipPast('>');
            return new HtmlToken(HtmlTokenType.Text, _input.Substring(start, _position - start));
        }

        // Reads a comment body; the position is at the "--" that follows "<!".
        private HtmlToken ReadComment()
        {
            _position += 2; // consume '--'

            var end = _input.IndexOf("-->", _position, StringComparison.Ordinal);
            string body;
            if (end < 0)
            {
                // Unterminated comment: everything up to the end of input belongs to it.
                body = _input.Substring(_position);
                _position = _input.Length;
            }
            else
            {
                body = _input.Substring(_position, end - _position);
                _position = end + 3; // consume '-->'
            }

            return new HtmlToken(HtmlTokenType.Comment, body.Trim());
        }

        // Reads a doctype declaration; only the first word after the keyword is kept.
        private HtmlToken ReadDoctype()
        {
            _position += DoctypeKeyword.Length; // consume 'DOCTYPE'
            SkipWhitespace();

            var name = TakeWhile(c => c != '>' && !char.IsWhiteSpace(c));

            // Skip anything remaining up to and including '>'.
            SkipPast('>');

            return new HtmlToken(HtmlTokenType.Doctype, name, []);
        }

        // Reads an end tag; the position is at the '/' that follows '<'.
        private HtmlToken ReadEndTag()
        {
            _position++; // consume '/'
            SkipWhitespace();

            var tagName = ReadTagName();

            // Discard everything up to the closing '>'. Attributes or junk inside
            // an end tag (e.g. "</div foo>") carry no meaning and must not leak
            // into the output as text.
            SkipPast('>');

            return new HtmlToken(HtmlTokenType.EndTag, tagName.ToLowerInvariant(), []);
        }

        // Reads a start tag (name, attributes, optional "/>"); the position is at the name.
        private HtmlToken ReadStartTag()
        {
            var tagName = ReadTagName().ToLowerInvariant();
            var attributes = new List<HtmlTokenAttribute>();
            var selfClose = false;

            while (true)
            {
                // Stops at '>', '/', '\0' or the end of the input.
                attributes.AddRange(ReadAttributes());

                if (Current != '/')
                {
                    break;
                }

                _position++; // consume '/'

                if (Current == '>' || _position >= _input.Length)
                {
                    // "/>" (or a trailing '/' at the end of input) marks a self-closing tag.
                    selfClose = true;
                    break;
                }

                // A '/' that is not directly followed by '>' is a stray character:
                // ignore it and keep reading attributes instead of ending the tag
                // early and leaking the remainder as text.
            }

            if (Current == '>')
            {
                _position++; // consume '>'
            }

            if (selfClose || VoidElements.Contains(tagName))
            {
                return new HtmlToken(HtmlTokenType.SelfClosingTag, tagName, attributes);
            }

            return new HtmlToken(HtmlTokenType.StartTag, tagName, attributes);
        }

        // Reads a tag name: everything up to whitespace, '>', '/' or a NUL character.
        private string ReadTagName() =>
            TakeWhile(c => !char.IsWhiteSpace(c) && c != '>' && c != '/' && c != '\0');

        // Reads attributes until '>', '/', a NUL character or the end of the input.
        private IReadOnlyList<HtmlTokenAttribute> ReadAttributes()
        {
            var list = new List<HtmlTokenAttribute>();

            while (_position < _input.Length)
            {
                SkipWhitespace();

                if (Current == '>' || Current == '/' || Current == '\0')
                {
                    break;
                }

                var attr = ReadAttribute();
                if (attr != null)
                {
                    list.Add(attr);
                }
            }

            return list;
        }

        // Reads one attribute; returns null after skipping a character that cannot start a name.
        private HtmlTokenAttribute ReadAttribute()
        {
            var name = ReadAttributeName();
            if (string.IsNullOrEmpty(name))
            {
                // Skip an unexpected character and continue.
                if (_position < _input.Length)
                {
                    _position++;
                }
                return null;
            }

            SkipWhitespace();

            if (Current != '=')
            {
                // Boolean attribute.
                return new HtmlTokenAttribute(name.ToLowerInvariant());
            }

            _position++; // consume '='
            SkipWhitespace();

            var value = ReadAttributeValue();
            return new HtmlTokenAttribute(name.ToLowerInvariant(), value);
        }

        // Reads an attribute name: everything up to '=', '>', '/', whitespace or NUL.
        private string ReadAttributeName() =>
            TakeWhile(c => c != '=' && c != '>' && c != '/' && !char.IsWhiteSpace(c) && c != '\0');

        // Reads a quoted (single or double) or unquoted attribute value.
        private string ReadAttributeValue()
        {
            if (Current == '"' || Current == '\'')
            {
                var quote = Current;
                _position++; // consume opening quote

                var close = _input.IndexOf(quote, _position);
                var end = close < 0 ? _input.Length : close;
                var quoted = _input.Substring(_position, end - _position);

                // Step over the closing quote when there is one; an unterminated
                // value simply runs to the end of the input.
                _position = close < 0 ? end : close + 1;
                return quoted;
            }

            // Unquoted value – read until whitespace or '>'.
            return TakeWhile(c => !char.IsWhiteSpace(c) && c != '>' && c != '\0');
        }

        // Reads character data up to (not including) the next '<'.
        private HtmlToken ReadText()
        {
            var end = _input.IndexOf('<', _position);
            if (end < 0)
            {
                end = _input.Length;
            }

            var text = _input.Substring(_position, end - _position);
            _position = end;

            return text.Length > 0 ? new HtmlToken(HtmlTokenType.Text, text) : null;
        }

        // Advances past any whitespace at the current position.
        private void SkipWhitespace()
        {
            while (_position < _input.Length && char.IsWhiteSpace(_input[_position]))
            {
                _position++;
            }
        }

        // Consumes characters while 'accept' holds and returns the consumed text.
        private string TakeWhile(Func<char, bool> accept)
        {
            var start = _position;
            while (_position < _input.Length && accept(_input[_position]))
            {
                _position++;
            }
            return _input.Substring(start, _position - start);
        }

        // Advances just past the next occurrence of 'terminator', or to the end
        // of the input when it does not occur.
        private void SkipPast(char terminator)
        {
            var at = _input.IndexOf(terminator, _position);
            _position = at < 0 ? _input.Length : at + 1;
        }
    }
}