Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions src/raw.jl
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,23 @@ struct Raw
ctx::Vector{Bool} # Context for xml:space (Vector to support inheritance of context)
has_xml_space::Bool # Whether data contains `xml:space` attribute at least once
end
"""
    _normalize_bom(data::Vector{UInt8}) -> Vector{UInt8}

Honor a leading byte-order mark (XML 1.0 §4.3.3) so the byte-oriented tokenizer
always sees UTF-8.

- `FF FE` (UTF-16 little-endian): the remaining bytes are decoded and re-encoded as UTF-8.
- `FE FF` (UTF-16 big-endian): same, reading each code unit high byte first.
- `EF BB BF` (UTF-8): the three BOM bytes are stripped.
- Anything else: `data` itself is returned (no copy is made).

Throws an `ArgumentError` when a UTF-16 BOM is followed by an odd number of bytes,
since the trailing byte cannot form a complete code unit.
"""
function _normalize_bom(data::Vector{UInt8})
    n = length(data)
    if n >= 2 && data[1] == 0xFF && data[2] == 0xFE
        # `ltoh` maps little-endian storage to host order (a no-op on LE hosts).
        return _utf16_payload_to_utf8(data, ltoh)
    elseif n >= 2 && data[1] == 0xFE && data[2] == 0xFF
        # `ntoh` maps big-endian storage to host order (a byte swap on LE hosts).
        return _utf16_payload_to_utf8(data, ntoh)
    elseif n >= 3 && data[1] == 0xEF && data[2] == 0xBB && data[3] == 0xBF
        return data[4:end]
    end
    return data
end

# Decode the UTF-16 code units that follow a 2-byte BOM in `data` into UTF-8 bytes.
# `tohost` converts a stored code unit to host byte order (`ltoh` or `ntoh`).
function _utf16_payload_to_utf8(data::Vector{UInt8}, tohost)
    payload_len = length(data) - 2
    # Validate here so the caller gets a meaningful message instead of the
    # dimension error `reinterpret` would raise for an odd byte count.
    iseven(payload_len) || throw(ArgumentError(
        "UTF-16 input after the byte-order mark must contain an even number of " *
        "bytes, got $payload_len"))
    units = tohost.(reinterpret(UInt16, data[3:end]))
    # `transcode(UInt8, …)` yields the UTF-8 bytes directly, avoiding a
    # String round-trip and the extra copy it entails.
    return transcode(UInt8, units)
end
function Raw(data::Vector{UInt8})#, ctx::Vector{Bool}=Bool[false])
data = _normalize_bom(data)
needle = Vector{UInt8}("xml:space")
has_xml_space = findfirst(needle, data) !== nothing
return Raw(RawDocument, 0, 0, 0, data, [false], has_xml_space)
Expand Down
23 changes: 23 additions & 0 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -642,5 +642,28 @@ end
kw = NamedTuple(OrderedDict(Symbol(k) => Int(k) for k in 'a':'z'))
xyz = XML.Element("point"; kw...)
@test collect(keys(attributes(xyz))) == string.(collect('a':'z'))

# https://github.com/JuliaComputing/XML.jl/issues/62 (UTF-16/BOM input)
text = """<?xml version="1.0"?><root>hello</root>"""
units = transcode(UInt16, text)
le = vcat(UInt8[0xFF, 0xFE], reinterpret(UInt8, units))
be = vcat(UInt8[0xFE, 0xFF], reinterpret(UInt8, bswap.(units)))
utf8_bom = vcat(UInt8[0xEF, 0xBB, 0xBF], Vector{UInt8}(text))
for bytes in (le, be, utf8_bom)
Comment thread
mathieu17g marked this conversation as resolved.
doc = Node(XML.Raw(bytes))
root = only(filter(n -> nodetype(n) == Element, children(doc)))
@test tag(root) == "root"
@test value(only(children(root))) == "hello"

# read(file) round-trip exercises the Mmap.mmap path (the real .xlsx case)
mktemp() do path, io
write(io, bytes)
close(io)
file_doc = read(path, Node)
file_root = only(filter(n -> nodetype(n) == Element, children(file_doc)))
@test tag(file_root) == "root"
@test value(only(children(file_root))) == "hello"
end
end
end

Loading