diff --git a/src/raw.jl b/src/raw.jl
index 29d0a10..00ec743 100644
--- a/src/raw.jl
+++ b/src/raw.jl
@@ -67,7 +67,34 @@ struct Raw
     ctx::Vector{Bool} # Context for xml:space (Vector to support inheritance of context)
     has_xml_space::Bool # Whether data contains `xml:space` attribute at least once
 end
+# Honor a leading byte-order mark (XML 1.0 §4.3.3): strip a UTF-8 BOM and decode
+# BOM-prefixed UTF-16 to UTF-8 so the byte-oriented tokenizer always sees UTF-8.
+# Data without a recognized BOM is returned as-is (the same array, not a copy).
+# NOTE: UTF-32 BOMs are not detected; `FF FE 00 00` is treated as UTF-16LE.
+function _normalize_bom(data::Vector{UInt8})
+    n = length(data)
+    if n >= 3 && data[1] == 0xEF && data[2] == 0xBB && data[3] == 0xBF
+        return data[4:end]
+    elseif n >= 2 && data[1] == 0xFF && data[2] == 0xFE
+        return _utf16_to_utf8(data, ltoh)
+    elseif n >= 2 && data[1] == 0xFE && data[2] == 0xFF
+        return _utf16_to_utf8(data, ntoh)
+    end
+    return data
+end
+# Decode BOM-prefixed UTF-16 `data` (BOM in bytes 1:2) to UTF-8 bytes. `tohost` maps
+# each code unit from the stream's byte order to the host's (`ltoh` for little-endian
+# input, `ntoh` for big-endian), so the result does not depend on host endianness.
+function _utf16_to_utf8(data::Vector{UInt8}, tohost)
+    # `reinterpret` rejects odd byte counts with an opaque message; fail clearly instead.
+    if isodd(length(data))
+        msg = "UTF-16 input must have an even number of bytes, got $(length(data))"
+        throw(ArgumentError(msg))
+    end
+    return transcode(UInt8, tohost.(reinterpret(UInt16, data[3:end])))
+end
 function Raw(data::Vector{UInt8})#, ctx::Vector{Bool}=Bool[false])
+    data = _normalize_bom(data)
     needle = Vector{UInt8}("xml:space")
     has_xml_space = findfirst(needle, data) !== nothing
     return Raw(RawDocument, 0, 0, 0, data, [false], has_xml_space)
diff --git a/test/runtests.jl b/test/runtests.jl
index 89978eb..5646a8d 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -642,5 +642,28 @@ end
     kw = NamedTuple(OrderedDict(Symbol(k) => Int(k) for k in 'a':'z'))
     xyz = XML.Element("point"; kw...)
@test collect(keys(attributes(xyz))) == string.(collect('a':'z')) + + # https://github.com/JuliaComputing/XML.jl/issues/62 (UTF-16/BOM input) + text = """hello""" + units = transcode(UInt16, text) + le = vcat(UInt8[0xFF, 0xFE], reinterpret(UInt8, units)) + be = vcat(UInt8[0xFE, 0xFF], reinterpret(UInt8, bswap.(units))) + utf8_bom = vcat(UInt8[0xEF, 0xBB, 0xBF], Vector{UInt8}(text)) + for bytes in (le, be, utf8_bom) + doc = Node(XML.Raw(bytes)) + root = only(filter(n -> nodetype(n) == Element, children(doc))) + @test tag(root) == "root" + @test value(only(children(root))) == "hello" + + # read(file) round-trip exercises the Mmap.mmap path (the real .xlsx case) + mktemp() do path, io + write(io, bytes) + close(io) + file_doc = read(path, Node) + file_root = only(filter(n -> nodetype(n) == Element, children(file_doc))) + @test tag(file_root) == "root" + @test value(only(children(file_root))) == "hello" + end + end end