From 0c671958d120cc68db67c0700fdbf68f2d7de416 Mon Sep 17 00:00:00 2001 From: Sai Asish Y Date: Wed, 17 Jun 2026 19:32:15 -0700 Subject: [PATCH 1/3] Honor byte-order mark when parsing UTF-16 and UTF-8 input Signed-off-by: Sai Asish Y --- src/raw.jl | 16 ++++++++++++++++ test/runtests.jl | 13 +++++++++++++ 2 files changed, 29 insertions(+) diff --git a/src/raw.jl b/src/raw.jl index 29d0a10..00ec743 100644 --- a/src/raw.jl +++ b/src/raw.jl @@ -67,7 +67,23 @@ struct Raw ctx::Vector{Bool} # Context for xml:space (Vector to support inheritance of context) has_xml_space::Bool # Whether data contains `xml:space` attribute at least once end +# Honor a leading byte-order mark (XML 1.0 §4.3.3): decode UTF-16 to UTF-8 and +# strip a UTF-8 BOM so the byte-oriented tokenizer always sees UTF-8. +function _normalize_bom(data::Vector{UInt8}) + n = length(data) + if n >= 2 && data[1] == 0xFF && data[2] == 0xFE + units = reinterpret(UInt16, data[3:end]) + return Vector{UInt8}(transcode(String, units)) + elseif n >= 2 && data[1] == 0xFE && data[2] == 0xFF + units = bswap.(reinterpret(UInt16, data[3:end])) + return Vector{UInt8}(transcode(String, units)) + elseif n >= 3 && data[1] == 0xEF && data[2] == 0xBB && data[3] == 0xBF + return data[4:end] + end + return data +end function Raw(data::Vector{UInt8})#, ctx::Vector{Bool}=Bool[false]) + data = _normalize_bom(data) needle = Vector{UInt8}("xml:space") has_xml_space = findfirst(needle, data) !== nothing return Raw(RawDocument, 0, 0, 0, data, [false], has_xml_space) diff --git a/test/runtests.jl b/test/runtests.jl index 89978eb..66c7dfb 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -642,5 +642,18 @@ end kw = NamedTuple(OrderedDict(Symbol(k) => Int(k) for k in 'a':'z')) xyz = XML.Element("point"; kw...) 
@test collect(keys(attributes(xyz))) == string.(collect('a':'z')) + + # https://github.com/JuliaComputing/XML.jl/issues/62 (UTF-16/BOM input) + text = """hello""" + units = transcode(UInt16, text) + le = vcat(UInt8[0xFF, 0xFE], reinterpret(UInt8, units)) + be = vcat(UInt8[0xFE, 0xFF], reinterpret(UInt8, bswap.(units))) + utf8_bom = vcat(UInt8[0xEF, 0xBB, 0xBF], Vector{UInt8}(text)) + for bytes in (le, be, utf8_bom) + doc = Node(XML.Raw(bytes)) + root = only(children(doc)) + @test tag(root) == "root" + @test value(only(children(root))) == "hello" + end end From 3b091a0b23eae63a1bda1d39581eade88f323023 Mon Sep 17 00:00:00 2001 From: Sai Asish Y Date: Wed, 17 Jun 2026 19:40:01 -0700 Subject: [PATCH 2/3] test: select root element when parsing declaration-prefixed BOM input Signed-off-by: Sai Asish Y --- test/runtests.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/runtests.jl b/test/runtests.jl index 66c7dfb..e6b100e 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -651,7 +651,7 @@ end utf8_bom = vcat(UInt8[0xEF, 0xBB, 0xBF], Vector{UInt8}(text)) for bytes in (le, be, utf8_bom) doc = Node(XML.Raw(bytes)) - root = only(children(doc)) + root = only(filter(n -> nodetype(n) == Element, children(doc))) @test tag(root) == "root" @test value(only(children(root))) == "hello" end From fbf90654681b43ed96e5f8979eaa07edd0867b34 Mon Sep 17 00:00:00 2001 From: Sai Asish Y Date: Sat, 20 Jun 2026 13:24:28 -0700 Subject: [PATCH 3/3] test: add read(file) round-trip for BOM-prefixed XML inputs Signed-off-by: Sai Asish Y --- test/runtests.jl | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/test/runtests.jl b/test/runtests.jl index e6b100e..5646a8d 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -654,6 +654,16 @@ end root = only(filter(n -> nodetype(n) == Element, children(doc))) @test tag(root) == "root" @test value(only(children(root))) == "hello" + + # read(file) round-trip exercises the Mmap.mmap path (the real .xlsx case) + 
mktemp() do path, io + write(io, bytes) + close(io) + file_doc = read(path, Node) + file_root = only(filter(n -> nodetype(n) == Element, children(file_doc))) + @test tag(file_root) == "root" + @test value(only(children(file_root))) == "hello" + end end end