From 57364ccdf7245c06471680f9b2b4a3ebfb8878f3 Mon Sep 17 00:00:00 2001 From: Brett Vickers Date: Thu, 21 Aug 2025 19:54:38 -0700 Subject: [PATCH 1/2] Add iterator versions of element queries Introduced 4 new functions to support iterator-based traversal of etree hierarchies: * ChildElementsSeq * SelectElementsSeq * FindElementsSeq * FindElementsPathSeq These functions work similarly to their non-Seq counterparts but use go 1.23's iterators to provide better performance during traversal, especially if you need to early-out. The existing single-element query functions (SelectElement, FindElement and FindElementPath) also benefit from this change, because they now early-out after finding the first matching element. --- etree.go | 72 +++++++++++++++------- etree_test.go | 166 ++++++++++++++++++++++++++++++++++++++++++++++++++ go.mod | 2 +- path.go | 54 +++++++++------- 4 files changed, 248 insertions(+), 46 deletions(-) diff --git a/etree.go b/etree.go index 8393013..bfe1f06 100644 --- a/etree.go +++ b/etree.go @@ -12,6 +12,7 @@ import ( "encoding/xml" "errors" "io" + "iter" "os" "slices" "strings" @@ -1018,24 +1019,29 @@ func (e *Element) SelectAttrValue(key, dflt string) string { // ChildElements returns all elements that are children of this element. func (e *Element) ChildElements() []*Element { - var elements []*Element - for _, t := range e.Child { - if c, ok := t.(*Element); ok { - elements = append(elements, c) + return slices.Collect(e.ChildElementsSeq()) +} + +// ChildElementsSeq returns an iterator over all child elements of this +// element. +func (e *Element) ChildElementsSeq() iter.Seq[*Element] { + return func(yield func(*Element) bool) { + for _, t := range e.Child { + if c, ok := t.(*Element); ok { + if !yield(c) { + return + } + } } } - return elements } // SelectElement returns the first child element with the given 'tag' (i.e., // name). The function returns nil if no child element matching the tag is // found. The tag may include a namespace prefix followed by a colon. func (e *Element) SelectElement(tag string) *Element { - space, stag := spaceDecompose(tag) - for _, t := range e.Child { - if c, ok := t.(*Element); ok && spaceMatch(space, c.Space) && stag == c.Tag { - return c - } + for element := range e.SelectElementsSeq(tag) { + return element } return nil } @@ -1043,14 +1049,23 @@ func (e *Element) SelectElement(tag string) *Element { // SelectElements returns a slice of all child elements with the given 'tag' // (i.e., name). The tag may include a namespace prefix followed by a colon. func (e *Element) SelectElements(tag string) []*Element { - space, stag := spaceDecompose(tag) - var elements []*Element - for _, t := range e.Child { - if c, ok := t.(*Element); ok && spaceMatch(space, c.Space) && stag == c.Tag { - elements = append(elements, c) + return slices.Collect(e.SelectElementsSeq(tag)) +} + +// SelectElementsSeq returns an iterator over all child elements with the +// given 'tag' (i.e., name). The tag may include a namespace prefix followed +// by a colon. +func (e *Element) SelectElementsSeq(tag string) iter.Seq[*Element] { + return func(yield func(*Element) bool) { + space, stag := spaceDecompose(tag) + for _, t := range e.Child { + if c, ok := t.(*Element); ok && spaceMatch(space, c.Space) && stag == c.Tag { + if !yield(c) { + return + } + } } } - return elements } // FindElement returns the first element matched by the XPath-like 'path' @@ -1063,10 +1078,8 @@ func (e *Element) FindElement(path string) *Element { // FindElementPath returns the first element matched by the 'path' object. The // function returns nil if no element is found using the path. func (e *Element) FindElementPath(path Path) *Element { - p := newPather() - elements := p.traverse(e, path) - if len(elements) > 0 { - return elements[0] + for element := range path.traverse(e) { + return element } return nil } @@ -1075,13 +1088,26 @@ func (e *Element) FindElementPath(path Path) *Element { // string. The function returns nil if no child element is found using the // path. It panics if an invalid path string is supplied. func (e *Element) FindElements(path string) []*Element { - return e.FindElementsPath(MustCompilePath(path)) + return slices.Collect(e.FindElementsSeq(path)) +} + +// FindElementsSeq returns an iterator over elements matched by the XPath-like +// 'path' string. This function uses Go's iterator support for +// memory-efficient traversal. It panics if an invalid path string is +// supplied. +func (e *Element) FindElementsSeq(path string) iter.Seq[*Element] { + return e.FindElementsPathSeq(MustCompilePath(path)) } // FindElementsPath returns a slice of elements matched by the 'path' object. func (e *Element) FindElementsPath(path Path) []*Element { - p := newPather() - return p.traverse(e, path) + return slices.Collect(e.FindElementsPathSeq(path)) +} + +// FindElementsPathSeq returns an iterator over elements matched by the 'path' +// object. +func (e *Element) FindElementsPathSeq(path Path) iter.Seq[*Element] { + return path.traverse(e) } // NotNil returns the receiver element if it isn't nil; otherwise, it returns diff --git a/etree_test.go b/etree_test.go index 057a893..baeae12 100644 --- a/etree_test.go +++ b/etree_test.go @@ -225,6 +225,172 @@ func TestDocument(t *testing.T) { } } +func TestSelectElementsSeq(t *testing.T) { + doc := NewDocument() + store := doc.CreateElement("store") + book1 := store.CreateElement("book") + book1.CreateAttr("id", "1") + book2 := store.CreateElement("book") + book2.CreateAttr("id", "2") + other := store.CreateElement("other") + other.CreateAttr("id", "3") + book3 := store.CreateElement("book") + book3.CreateAttr("id", "4") + + expectedBooks := store.SelectElements("book") + if len(expectedBooks) != 3 { + t.Error("etree: setup error - expected 3 books") + } + + var iterBooks []*Element + for book := range store.SelectElementsSeq("book") { + iterBooks = append(iterBooks, book) + } + + if len(iterBooks) != len(expectedBooks) { + t.Errorf("etree: SelectElementsSeq returned %d elements, expected %d", len(iterBooks), len(expectedBooks)) + } + + for i, book := range iterBooks { + if book != expectedBooks[i] { + t.Errorf("etree: SelectElementsSeq element %d mismatch", i) + } + if book.SelectAttrValue("id", "") == "" { + t.Errorf("etree: SelectElementsSeq element %d missing id attribute", i) + } + } + + count := 0 + for book := range store.SelectElementsSeq("book") { + count++ + if count == 2 { + break + } + _ = book + } + if count != 2 { + t.Errorf("etree: early termination failed, got %d iterations", count) + } + + nonExistentCount := 0 + for range store.SelectElementsSeq("nonexistent") { + nonExistentCount++ + } + if nonExistentCount != 0 { + t.Errorf("etree: SelectElementsSeq found %d non-existent elements", nonExistentCount) + } +} + +func TestFindElementsSeq(t *testing.T) { + doc := NewDocument() + store := doc.CreateElement("store") + + book1 := store.CreateElement("book") + book1.CreateAttr("category", "fiction") + title1 := book1.CreateElement("title") + title1.SetText("Book One") + author1 := book1.CreateElement("author") + author1.SetText("Author A") + + book2 := store.CreateElement("book") + book2.CreateAttr("category", "nonfiction") + title2 := book2.CreateElement("title") + title2.SetText("Book Two") + author2 := book2.CreateElement("author") + author2.SetText("Author B") + + magazine := store.CreateElement("magazine") + magazine.CreateAttr("category", "tech") + magTitle := magazine.CreateElement("title") + magTitle.SetText("Tech Magazine") + + section := store.CreateElement("section") + section.CreateAttr("name", "classics") + book3 := section.CreateElement("book") + book3.CreateAttr("category", "classic") + title3 := book3.CreateElement("title") + title3.SetText("Classic Book") + + testCases := []struct { + name string + path string + }{ + {"direct children", "book"}, + {"all descendants", "//book"}, + {"all titles", "//title"}, + {"books with specific category", "book[@category='fiction']"}, + {"nested path", "section/book"}, + {"all elements", "//*"}, + {"specific attribute", "//book[@category='classic']"}, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + expected := store.FindElements(tc.path) + + var iterResults []*Element + for elem := range store.FindElementsSeq(tc.path) { + iterResults = append(iterResults, elem) + } + + if len(iterResults) != len(expected) { + t.Errorf("FindElementsSeq returned %d elements, expected %d for path %s", + len(iterResults), len(expected), tc.path) + } + + expectedMap := make(map[*Element]bool) + for _, e := range expected { + expectedMap[e] = true + } + + for _, e := range iterResults { + if !expectedMap[e] { + t.Errorf("FindElementsSeq returned unexpected element for path %s", tc.path) + } + } + }) + } + + t.Run("early termination", func(t *testing.T) { + count := 0 + for elem := range store.FindElementsSeq("//title") { + count++ + if count == 2 { + break + } + _ = elem + } + if count != 2 { + t.Errorf("Early termination failed, got %d iterations", count) + } + }) + + t.Run("compiled path", func(t *testing.T) { + path := MustCompilePath("//book") + expected := store.FindElementsPath(path) + + var iterResults []*Element + for elem := range store.FindElementsPathSeq(path) { + iterResults = append(iterResults, elem) + } + + if len(iterResults) != len(expected) { + t.Errorf("FindElementsPathSeq returned %d elements, expected %d", + len(iterResults), len(expected)) + } + }) + + t.Run("empty results", func(t *testing.T) { + count := 0 + for range store.FindElementsSeq("//nonexistent") { + count++ + } + if count != 0 { + t.Errorf("FindElementsSeq found %d non-existent elements", count) + } + }) +} + func TestImbalancedXML(t *testing.T) { cases := []string{ ``, diff --git a/go.mod b/go.mod index 49abab8..e807cc1 100644 --- a/go.mod +++ b/go.mod @@ -1,3 +1,3 @@ module github.com/beevik/etree -go 1.21.0 +go 1.23.0 diff --git a/path.go b/path.go index 8d63096..21760d3 100644 --- a/path.go +++ b/path.go @@ -5,6 +5,7 @@ package etree import ( + "iter" "strconv" "strings" ) @@ -122,6 +123,20 @@ func MustCompilePath(path string) Path { return p } +// traverse follows the path from the element e, yielding elements that match +// the path's selectors and filters using iterators. +func (p Path) traverse(e *Element) iter.Seq[*Element] { + pather := newPather() + return func(yield func(*Element) bool) { + pather.queue.add(node{e, p.segments}) + for pather.queue.len() > 0 { + if cont := pather.eval(pather.queue.remove(), yield); !cont { + return + } + } + } +} + // A segment is a portion of a path between "/" characters. // It contains one selector and zero or more [filters]. type segment struct { @@ -148,6 +163,13 @@ type filter interface { apply(p *pather) } +// A node represents an element and the remaining path segments that +// should be applied against it by the pather. +type node struct { + e *Element + segments []segment +} + // A pather is helper object that traverses an element tree using // a Path object. It collects and deduplicates all elements matching // the path query. @@ -159,13 +181,7 @@ type pather struct { scratch []*Element // used by filters } -// A node represents an element and the remaining path segments that -// should be applied against it by the pather. -type node struct { - e *Element - segments []segment -} - +// newPather creates a new pather instance. func newPather() *pather { return &pather{ results: make([]*Element, 0), @@ -175,20 +191,11 @@ func newPather() *pather { } } -// traverse follows the path from the element e, collecting -// and then returning all elements that match the path's selectors -// and filters. -func (p *pather) traverse(e *Element, path Path) []*Element { - for p.queue.add(node{e, path.segments}); p.queue.len() > 0; { - p.eval(p.queue.remove()) - } - return p.results -} - -// eval evaluates the current path node by applying the remaining -// path's selector rules against the node's element. -func (p *pather) eval(n node) { - p.candidates = p.candidates[0:0] +// eval evaluates the current path node by applying the remaining path's +// selector rules against the node's element, yielding results via iterator. +// Returns false if early termination is requested. +func (p *pather) eval(n node, yield func(*Element) bool) bool { + p.candidates = p.candidates[:0] seg, remain := n.segments[0], n.segments[1:] seg.apply(n.e, p) @@ -196,7 +203,9 @@ func (p *pather) eval(n node) { for _, c := range p.candidates { if in := p.inResults[c]; !in { p.inResults[c] = true - p.results = append(p.results, c) + if !yield(c) { + return false + } } } } else { @@ -204,6 +213,7 @@ func (p *pather) eval(n node) { p.queue.add(node{c, remain}) } } + return true } // A compiler generates a compiled path from a path string. From 9c2943f36a4c7ee0d6d9047e88572ceec8d8b30c Mon Sep 17 00:00:00 2001 From: Brett Vickers Date: Thu, 21 Aug 2025 19:59:43 -0700 Subject: [PATCH 2/2] Update github actions to use go 1.23+ Also, update actions steps to use latest versions. --- .github/workflows/go.yml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml index 73fbf55..1278536 100644 --- a/.github/workflows/go.yml +++ b/.github/workflows/go.yml @@ -23,18 +23,18 @@ jobs: steps: - name: Checkout repository - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 - name: Initialize CodeQL - uses: github/codeql-action/init@4fa2a7953630fd2f3fb380f21be14ede0169dd4f # v3.25.12 + uses: github/codeql-action/init@3c3833e0f8c1c83d449a7478aa59c036a9165498 # v3.29.11 with: languages: ${{ matrix.language }} - name: Autobuild - uses: github/codeql-action/autobuild@4fa2a7953630fd2f3fb380f21be14ede0169dd4f # v3.25.12 + uses: github/codeql-action/autobuild@3c3833e0f8c1c83d449a7478aa59c036a9165498 # v3.29.11 - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@4fa2a7953630fd2f3fb380f21be14ede0169dd4f # v3.25.12 + uses: github/codeql-action/analyze@3c3833e0f8c1c83d449a7478aa59c036a9165498 # v3.29.11 with: category: "/language:${{matrix.language}}" @@ -44,14 +44,14 @@ jobs: strategy: matrix: - go-version: [ '1.21', '1.22.x' ] + go-version: [ '1.23', '1.25.x' ] steps: - name: Checkout repository - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 - name: Setup Go ${{ matrix.go-version }} - uses: actions/setup-go@0a12ed9d6a96ab950c8f026ed9f722fe0da7ef32 # v5.0.2 + uses: actions/setup-go@d35c59abb061a4a6fb18e82ac0862c26744d6ab5 # v5.5.0 with: go-version: ${{ matrix.go-version }}