From 5fe77772d36978f67c3db3a86268310c9cfcb795 Mon Sep 17 00:00:00 2001 From: Yagiz Nizipli Date: Tue, 17 Feb 2026 11:15:11 -0500 Subject: [PATCH 1/3] handle whatwg encoding standard overrides --- compile_flags.txt | 1 + deps/rust/Cargo.lock | 1 + deps/rust/cargo.bzl | 1 + deps/rust/crates/BUILD.bazel | 12 + deps/rust/crates/defs.bzl | 2 + src/rust/encoding/BUILD.bazel | 11 + src/rust/encoding/lib.rs | 150 +++++++++++ src/workerd/api/BUILD.bazel | 13 +- src/workerd/api/encoding-legacy.c++ | 68 +++++ src/workerd/api/encoding-legacy.h | 51 ++++ src/workerd/api/encoding-shared.h | 81 ++++++ src/workerd/api/encoding.c++ | 166 +++--------- src/workerd/api/encoding.h | 107 +------- src/workerd/api/tests/encoding-test.js | 263 ++++++++++++++++++++ src/workerd/api/tests/encoding-test.wd-test | 2 +- src/workerd/io/compatibility-date.capnp | 7 + src/wpt/BUILD.bazel | 5 +- src/wpt/encoding-test.ts | 234 ++++++++++++++--- 18 files changed, 895 insertions(+), 280 deletions(-) create mode 100644 src/rust/encoding/BUILD.bazel create mode 100644 src/rust/encoding/lib.rs create mode 100644 src/workerd/api/encoding-legacy.c++ create mode 100644 src/workerd/api/encoding-legacy.h create mode 100644 src/workerd/api/encoding-shared.h diff --git a/compile_flags.txt b/compile_flags.txt index 68e865fbfa7..a9b5d8088d1 100644 --- a/compile_flags.txt +++ b/compile_flags.txt @@ -63,6 +63,7 @@ -isystembazel-bin/src/rust/jsg/_virtual_includes/lib.rs@cxx -isystembazel-bin/src/rust/jsg/_virtual_includes/v8.rs@cxx -isystembazel-bin/src/rust/jsg-test/_virtual_includes/ffi-hdrs +-isystembazel-bin/src/rust/encoding/_virtual_includes/lib.rs@cxx -isystembazel-bin/src/rust/jsg-test/_virtual_includes/lib.rs@cxx -isystembazel-bin/src/rust/gen-compile-cache/_virtual_includes/cxx-bridge -isystembazel-bin/src/rust/gen-compile-cache/_virtual_includes/gen-compile-cache@cxx diff --git a/deps/rust/Cargo.lock b/deps/rust/Cargo.lock index efd1e052306..4229c1b1e60 100644 --- a/deps/rust/Cargo.lock +++ 
b/deps/rust/Cargo.lock @@ -451,6 +451,7 @@ dependencies = [ "clang-ast", "clap", "codespan-reporting", + "encoding_rs", "flate2", "foldhash", "futures", diff --git a/deps/rust/cargo.bzl b/deps/rust/cargo.bzl index 904afe40f3c..28e46227e8a 100644 --- a/deps/rust/cargo.bzl +++ b/deps/rust/cargo.bzl @@ -17,6 +17,7 @@ PACKAGES = WORKERD_CXX_PACKAGES | { "clang-ast": crate.spec(version = "0"), "clap": crate.spec(version = "4", default_features = False, features = ["derive", "std", "help"]), "codespan-reporting": crate.spec(version = "0"), + "encoding_rs": crate.spec(version = "0"), "flate2": crate.spec(version = "1"), "futures": crate.spec(version = "0"), "lol_html_c_api": crate.spec(git = "https://github.com/cloudflare/lol-html", tag = "v2.7.1"), diff --git a/deps/rust/crates/BUILD.bazel b/deps/rust/crates/BUILD.bazel index 6836d2e47e7..71c72b11088 100644 --- a/deps/rust/crates/BUILD.bazel +++ b/deps/rust/crates/BUILD.bazel @@ -151,6 +151,18 @@ alias( tags = ["manual"], ) +alias( + name = "encoding_rs-0.8.35", + actual = "@crates_vendor__encoding_rs-0.8.35//:encoding_rs", + tags = ["manual"], +) + +alias( + name = "encoding_rs", + actual = "@crates_vendor__encoding_rs-0.8.35//:encoding_rs", + tags = ["manual"], +) + alias( name = "flate2-1.1.9", actual = "@crates_vendor__flate2-1.1.9//:flate2", diff --git a/deps/rust/crates/defs.bzl b/deps/rust/crates/defs.bzl index 4c647a91c27..a6cd505691c 100644 --- a/deps/rust/crates/defs.bzl +++ b/deps/rust/crates/defs.bzl @@ -305,6 +305,7 @@ _NORMAL_DEPENDENCIES = { "clang-ast": Label("@crates_vendor//:clang-ast-0.1.35"), "clap": Label("@crates_vendor//:clap-4.5.58"), "codespan-reporting": Label("@crates_vendor//:codespan-reporting-0.13.1"), + "encoding_rs": Label("@crates_vendor//:encoding_rs-0.8.35"), "flate2": Label("@crates_vendor//:flate2-1.1.9"), "foldhash": Label("@crates_vendor//:foldhash-0.2.0"), "futures": Label("@crates_vendor//:futures-0.3.31"), @@ -2957,6 +2958,7 @@ def crate_repositories(): struct(repo = 
"crates_vendor__clang-ast-0.1.35", is_dev_dep = False), struct(repo = "crates_vendor__clap-4.5.58", is_dev_dep = False), struct(repo = "crates_vendor__codespan-reporting-0.13.1", is_dev_dep = False), + struct(repo = "crates_vendor__encoding_rs-0.8.35", is_dev_dep = False), struct(repo = "crates_vendor__flate2-1.1.9", is_dev_dep = False), struct(repo = "crates_vendor__foldhash-0.2.0", is_dev_dep = False), struct(repo = "crates_vendor__futures-0.3.31", is_dev_dep = False), diff --git a/src/rust/encoding/BUILD.bazel b/src/rust/encoding/BUILD.bazel new file mode 100644 index 00000000000..0c332064b7b --- /dev/null +++ b/src/rust/encoding/BUILD.bazel @@ -0,0 +1,11 @@ +load("//:build/wd_rust_crate.bzl", "wd_rust_crate") + +wd_rust_crate( + name = "encoding", + cxx_bridge_src = "lib.rs", + visibility = ["//visibility:public"], + deps = [ + "//src/rust/cxx-integration", + "@crates_vendor//:encoding_rs", + ], +) diff --git a/src/rust/encoding/lib.rs b/src/rust/encoding/lib.rs new file mode 100644 index 00000000000..b6b112156ff --- /dev/null +++ b/src/rust/encoding/lib.rs @@ -0,0 +1,150 @@ +// Copyright (c) 2017-2022 Cloudflare, Inc. +// Licensed under the Apache 2.0 license found in the LICENSE file or at: +// https://opensource.org/licenses/Apache-2.0 + +//! WHATWG Encoding Standard legacy decoders via `encoding_rs`. +//! +//! Exposes a streaming decoder to C++ via CXX bridge. All legacy encodings +//! (CJK multi-byte, single-byte windows-1252, and x-user-defined) are handled +//! by a single opaque `Decoder` type backed by `encoding_rs::Decoder`. + +#[cxx::bridge(namespace = "workerd::rust::encoding")] +mod ffi { + /// Legacy encoding types supported by the Rust decoder. + /// Shared between C++ and Rust. + #[derive(Debug, Clone, Copy, PartialEq, Eq)] + #[repr(u16)] + enum Encoding { + Big5, + EucJp, + EucKr, + Gb18030, + Gbk, + Iso2022Jp, + ShiftJis, + Windows1252, + XUserDefined, + } + + /// Result of a decode operation. + struct DecodeResult { + /// UTF-16 output. 
+ output: Vec, + /// True if a fatal decoding error was encountered. Only meaningful + /// when the caller requested fatal mode — in replacement mode errors + /// are silently replaced with U+FFFD and this flag is not set. + had_error: bool, + } + + extern "Rust" { + type Decoder; + + /// Create a new streaming decoder for the given encoding. + // CXX bridge requires Box for opaque types. + #[expect(clippy::unnecessary_box_returns)] + fn new_decoder(encoding: Encoding) -> Box; + + /// Decode a chunk of bytes. Set `flush` to true on the final chunk. + /// When `fatal` is true and an error is encountered, `had_error` is + /// set and the output may be incomplete. + fn decode(decoder: &mut Decoder, input: &[u8], flush: bool, fatal: bool) -> DecodeResult; + + /// Reset the decoder to its initial state. + fn reset(decoder: &mut Decoder); + } +} + +/// Opaque decoder state exposed to C++ via `Box`. +pub struct Decoder { + encoding: &'static encoding_rs::Encoding, + inner: encoding_rs::Decoder, +} + +/// Map a CXX-shared `Encoding` variant to the corresponding +/// `encoding_rs` static. 
+fn to_encoding(enc: ffi::Encoding) -> &'static encoding_rs::Encoding { + match enc { + ffi::Encoding::Big5 => encoding_rs::BIG5, + ffi::Encoding::EucJp => encoding_rs::EUC_JP, + ffi::Encoding::EucKr => encoding_rs::EUC_KR, + ffi::Encoding::Gb18030 => encoding_rs::GB18030, + ffi::Encoding::Gbk => encoding_rs::GBK, + ffi::Encoding::Iso2022Jp => encoding_rs::ISO_2022_JP, + ffi::Encoding::ShiftJis => encoding_rs::SHIFT_JIS, + ffi::Encoding::Windows1252 => encoding_rs::WINDOWS_1252, + ffi::Encoding::XUserDefined => encoding_rs::X_USER_DEFINED, + _ => unreachable!(), + } +} + +pub fn new_decoder(encoding: ffi::Encoding) -> Box { + let encoding = to_encoding(encoding); + Box::new(Decoder { + inner: encoding.new_decoder_without_bom_handling(), + encoding, + }) +} + +pub fn decode(state: &mut Decoder, input: &[u8], flush: bool, fatal: bool) -> ffi::DecodeResult { + let max_len = state + .inner + .max_utf16_buffer_length(input.len()) + .unwrap_or(input.len() + 4); + let mut output = vec![0u16; max_len]; + let mut total_read = 0usize; + let mut total_written = 0usize; + + if fatal { + loop { + let (result, read, written) = state.inner.decode_to_utf16_without_replacement( + &input[total_read..], + &mut output[total_written..], + flush, + ); + total_read += read; + total_written += written; + + match result { + encoding_rs::DecoderResult::InputEmpty => break, + encoding_rs::DecoderResult::OutputFull => { + output.resize(output.len() * 2, 0); + } + encoding_rs::DecoderResult::Malformed(_, _) => { + state.inner = state.encoding.new_decoder_without_bom_handling(); + output.truncate(total_written); + return ffi::DecodeResult { + output, + had_error: true, + }; + } + } + } + } else { + loop { + let (result, read, written, _had_errors) = state.inner.decode_to_utf16( + &input[total_read..], + &mut output[total_written..], + flush, + ); + total_read += read; + total_written += written; + + match result { + encoding_rs::CoderResult::InputEmpty => break, + 
encoding_rs::CoderResult::OutputFull => { + output.resize(output.len() * 2, 0); + } + } + } + } + + output.truncate(total_written); + ffi::DecodeResult { + output, + had_error: false, + } +} + +pub fn reset(state: &mut Decoder) { + state.inner = state.encoding.new_decoder_without_bom_handling(); +} diff --git a/src/workerd/api/BUILD.bazel b/src/workerd/api/BUILD.bazel index bd3b81e18e7..e47e0426e05 100644 --- a/src/workerd/api/BUILD.bazel +++ b/src/workerd/api/BUILD.bazel @@ -430,8 +430,15 @@ wd_cc_library( wd_cc_library( name = "encoding", - srcs = ["encoding.c++"], - hdrs = ["encoding.h"], + srcs = [ + "encoding.c++", + "encoding-legacy.c++", + ], + hdrs = [ + "encoding.h", + "encoding-legacy.h", + "encoding-shared.h", + ], implementation_deps = [ "//src/workerd/io:features", "//src/workerd/util:strings", @@ -439,10 +446,12 @@ wd_cc_library( visibility = ["//visibility:public"], deps = [ ":util", + "//src/rust/encoding", "//src/workerd/io:compatibility-date_capnp", "//src/workerd/jsg", "@capnp-cpp//src/kj", "@simdutf", + "@workerd-cxx//kj-rs", ], ) diff --git a/src/workerd/api/encoding-legacy.c++ b/src/workerd/api/encoding-legacy.c++ new file mode 100644 index 00000000000..378afec91b2 --- /dev/null +++ b/src/workerd/api/encoding-legacy.c++ @@ -0,0 +1,68 @@ +// Copyright (c) 2017-2022 Cloudflare, Inc. +// Licensed under the Apache 2.0 license found in the LICENSE file or at: +// https://opensource.org/licenses/Apache-2.0 + +#include "encoding-legacy.h" + +#include +#include + +#include + +namespace workerd::api { + +namespace { + +// Map workerd::api::Encoding to the Rust-side RustEncoding enum. 
+::workerd::rust::encoding::Encoding toRustEncoding(Encoding encoding) { + using RE = ::workerd::rust::encoding::Encoding; + switch (encoding) { + case Encoding::Big5: + return RE::Big5; + case Encoding::Euc_Jp: + return RE::EucJp; + case Encoding::Euc_Kr: + return RE::EucKr; + case Encoding::Gb18030: + return RE::Gb18030; + case Encoding::Gbk: + return RE::Gbk; + case Encoding::Iso2022_Jp: + return RE::Iso2022Jp; + case Encoding::Shift_Jis: + return RE::ShiftJis; + case Encoding::Windows_1252: + return RE::Windows1252; + case Encoding::X_User_Defined: + return RE::XUserDefined; + default: + KJ_UNREACHABLE; + } +} + +} // namespace + +LegacyDecoder::LegacyDecoder(Encoding encoding, DecoderFatal fatal) + : encoding(encoding), + fatal(fatal), + state(::workerd::rust::encoding::new_decoder(toRustEncoding(encoding))) {} + +void LegacyDecoder::reset() { + ::workerd::rust::encoding::reset(*state); +} + +kj::Maybe LegacyDecoder::decode( + jsg::Lock& js, kj::ArrayPtr buffer, bool flush) { + auto result = ::workerd::rust::encoding::decode( + *state, buffer.as(), flush, fatal.toBool()); + + if (fatal.toBool() && result.had_error) { + // Decoder state already reset by the Rust side on fatal error. + return kj::none; + } + + auto output = kj::from(result.output); + return js.str(output); +} + +} // namespace workerd::api diff --git a/src/workerd/api/encoding-legacy.h b/src/workerd/api/encoding-legacy.h new file mode 100644 index 00000000000..9e91e8d275e --- /dev/null +++ b/src/workerd/api/encoding-legacy.h @@ -0,0 +1,51 @@ +// Copyright (c) 2017-2022 Cloudflare, Inc. +// Licensed under the Apache 2.0 license found in the LICENSE file or at: +// https://opensource.org/licenses/Apache-2.0 + +// WHATWG-compliant legacy decoders (CJK multi-byte, windows-1252, +// x-user-defined) implemented via the encoding_rs Rust crate through +// a CXX bridge. A single LegacyDecoder class wraps an opaque Rust-side +// decoder that handles all the encoding-specific state machines. 
+ +#pragma once + +#include "encoding-shared.h" + +#include + +#include + +#include + +namespace workerd::api { + +// Unified legacy decoder using encoding_rs via Rust CXX bridge. +// encoding_rs implements the full WHATWG decoder algorithms for all +// legacy encodings, including streaming, error recovery, and ASCII +// byte pushback. +// +// According to WHATWG spec, any encoding except UTF-8 and UTF-16 is considered legacy. +class LegacyDecoder final: public Decoder { + public: + LegacyDecoder(Encoding encoding, DecoderFatal fatal); + ~LegacyDecoder() noexcept = default; + LegacyDecoder(LegacyDecoder&&) noexcept = default; + LegacyDecoder& operator=(LegacyDecoder&&) noexcept = default; + KJ_DISALLOW_COPY(LegacyDecoder); + + Encoding getEncoding() override { + return encoding; + } + + kj::Maybe decode( + jsg::Lock& js, kj::ArrayPtr buffer, bool flush = false) override; + + void reset() override; + + private: + Encoding encoding; + DecoderFatal fatal; + ::rust::Box<::workerd::rust::encoding::Decoder> state; +}; + +} // namespace workerd::api diff --git a/src/workerd/api/encoding-shared.h b/src/workerd/api/encoding-shared.h new file mode 100644 index 00000000000..77aa8cfd702 --- /dev/null +++ b/src/workerd/api/encoding-shared.h @@ -0,0 +1,81 @@ +// Copyright (c) 2017-2022 Cloudflare, Inc. +// Licensed under the Apache 2.0 license found in the LICENSE file or at: +// https://opensource.org/licenses/Apache-2.0 + +// Shared types used by encoding.h and encoding-legacy.h. +// Extracted to break circular dependencies between the two headers. + +#pragma once + +#include +#include + +namespace workerd::api { + +WD_STRONG_BOOL(DecoderFatal); +WD_STRONG_BOOL(DecoderIgnoreBom); + +// The encodings listed here are defined as required by the Encoding spec. +// The first label is enum we use to identify the encoding in code, while +// the second label is the public identifier. 
+#define EW_ENCODINGS(V) \ + V(Utf8, "utf-8") \ + V(Ibm866, "ibm866") \ + V(Iso8859_2, "iso-8859-2") \ + V(Iso8859_3, "iso-8859-3") \ + V(Iso8859_4, "iso-8859-4") \ + V(Iso8859_5, "iso-8859-5") \ + V(Iso8859_6, "iso-8859-6") \ + V(Iso8859_7, "iso-8859-7") \ + V(Iso8859_8, "iso-8859-8") \ + V(Iso8859_8i, "iso-8859-8-i") \ + V(Iso8859_10, "iso-8859-10") \ + V(Iso8859_13, "iso-8859-13") \ + V(Iso8859_14, "iso-8859-14") \ + V(Iso8859_15, "iso-8859-15") \ + V(Iso8859_16, "iso-8859-16") \ + V(Ko18_r, "koi8-r") \ + V(Koi8_u, "koi8-u") \ + V(Macintosh, "macintosh") \ + V(Windows_874, "windows-874") \ + V(Windows_1250, "windows-1250") \ + V(Windows_1251, "windows-1251") \ + V(Windows_1252, "windows-1252") \ + V(Windows_1253, "windows-1253") \ + V(Windows_1254, "windows-1254") \ + V(Windows_1255, "windows-1255") \ + V(Windows_1256, "windows-1256") \ + V(Windows_1257, "windows-1257") \ + V(Windows_1258, "windows-1258") \ + V(X_Mac_Cyrillic, "x-mac-cyrillic") \ + V(Gbk, "gbk") \ + V(Gb18030, "gb18030") \ + V(Big5, "big5") \ + V(Euc_Jp, "euc-jp") \ + V(Iso2022_Jp, "iso-2022-jp") \ + V(Shift_Jis, "shift_jis") \ + V(Euc_Kr, "euc-kr") \ + V(Replacement, "replacement") \ + V(Utf16be, "utf-16be") \ + V(Utf16le, "utf-16le") \ + V(X_User_Defined, "x-user-defined") + +enum class Encoding { + INVALID, +#define V(name, _) name, + EW_ENCODINGS(V) +#undef V +}; + +// A Decoder provides the underlying implementation of a TextDecoder. 
+class Decoder { + public: + virtual ~Decoder() noexcept(true) {} + virtual Encoding getEncoding() = 0; + virtual kj::Maybe decode( + jsg::Lock& js, kj::ArrayPtr buffer, bool flush = false) = 0; + + virtual void reset() {} +}; + +} // namespace workerd::api diff --git a/src/workerd/api/encoding.c++ b/src/workerd/api/encoding.c++ index f44b1215fb3..e4590818a66 100644 --- a/src/workerd/api/encoding.c++ +++ b/src/workerd/api/encoding.c++ @@ -14,7 +14,7 @@ #include #include -#include +#include namespace workerd::api { @@ -303,6 +303,7 @@ kj::Maybe IcuDecoder::create(Encoding encoding, bool fatal, bool ign kj::Maybe IcuDecoder::decode( jsg::Lock& js, kj::ArrayPtr buffer, bool flush) { UErrorCode status = U_ZERO_ERROR; + kj::Maybe> merged; const auto maxCharSize = [this]() { return ucnv_getMaxCharSize(inner.get()); }; const auto isUnicode = [this]() { @@ -318,8 +319,6 @@ kj::Maybe IcuDecoder::decode( KJ_UNREACHABLE; }; - const auto isUsAscii = [](const auto& b) { return b <= 0x7f; }; - KJ_DEFER({ if (flush) reset(); }); @@ -329,7 +328,8 @@ kj::Maybe IcuDecoder::decode( // conversions are being handled by v8 directly rather than by the ICU converter). if (buffer.size() > 0 && ucnv_toUCountPending(inner.get(), &status) == 0) { KJ_ASSERT(U_SUCCESS(status)); - if (encoding == Encoding::Utf8 && std::all_of(buffer.begin(), buffer.end(), isUsAscii)) { + if (encoding == Encoding::Utf8 && + simdutf::validate_ascii(buffer.asChars().begin(), buffer.size())) { // This is a fast-path option for UTF-8 that can be taken when there // are no buffered inputs and the non-empty input buffer contains only // codepoints <= 0x7f. This path is safe because with ASCII range codepoints @@ -391,7 +391,7 @@ kj::Maybe IcuDecoder::decode( status = U_ZERO_ERROR; auto limit = 2 * maxCharSize() * (!flush ? 
buffer.size() - : std::max(buffer.size(), + : kj::max(buffer.size(), static_cast(ucnv_toUCountPending(inner.get(), &status)))); KJ_STACK_ARRAY(UChar, result, limit, 512, 4096); @@ -414,121 +414,6 @@ kj::Maybe IcuDecoder::decode( return js.str(result.slice(omitInitialBom ? 1 : 0, length)); } -// Full 256-entry windows-1252 byte-to-Unicode lookup table. -// For most entries table[i] == i (identity mapping). Bytes 0x80-0x9F -// differ from Latin-1 and map to their correct windows-1252 code points. -// Undefined bytes (0x81, 0x8D, 0x8F, 0x90, 0x9D) map to 0x0000 as a sentinel. -// See: https://encoding.spec.whatwg.org/index-windows-1252.txt -// clang-format off -static constexpr uint16_t WIN1252_TABLE[256] = { - // 0x00-0x0F - 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, - 0x0008, 0x0009, 0x000A, 0x000B, 0x000C, 0x000D, 0x000E, 0x000F, - // 0x10-0x1F - 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, - 0x0018, 0x0019, 0x001A, 0x001B, 0x001C, 0x001D, 0x001E, 0x001F, - // 0x20-0x2F - 0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, - 0x0028, 0x0029, 0x002A, 0x002B, 0x002C, 0x002D, 0x002E, 0x002F, - // 0x30-0x3F - 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, - 0x0038, 0x0039, 0x003A, 0x003B, 0x003C, 0x003D, 0x003E, 0x003F, - // 0x40-0x4F - 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, - 0x0048, 0x0049, 0x004A, 0x004B, 0x004C, 0x004D, 0x004E, 0x004F, - // 0x50-0x5F - 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, - 0x0058, 0x0059, 0x005A, 0x005B, 0x005C, 0x005D, 0x005E, 0x005F, - // 0x60-0x6F - 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, - 0x0068, 0x0069, 0x006A, 0x006B, 0x006C, 0x006D, 0x006E, 0x006F, - // 0x70-0x7F - 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, - 0x0078, 0x0079, 0x007A, 0x007B, 0x007C, 0x007D, 0x007E, 0x007F, - // 0x80-0x8F — windows-1252 diverges from Latin-1 here - 0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 
0x2020, 0x2021, - 0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F, - // 0x90-0x9F - 0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, - 0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178, - // 0xA0-0xAF - 0x00A0, 0x00A1, 0x00A2, 0x00A3, 0x00A4, 0x00A5, 0x00A6, 0x00A7, - 0x00A8, 0x00A9, 0x00AA, 0x00AB, 0x00AC, 0x00AD, 0x00AE, 0x00AF, - // 0xB0-0xBF - 0x00B0, 0x00B1, 0x00B2, 0x00B3, 0x00B4, 0x00B5, 0x00B6, 0x00B7, - 0x00B8, 0x00B9, 0x00BA, 0x00BB, 0x00BC, 0x00BD, 0x00BE, 0x00BF, - // 0xC0-0xCF - 0x00C0, 0x00C1, 0x00C2, 0x00C3, 0x00C4, 0x00C5, 0x00C6, 0x00C7, - 0x00C8, 0x00C9, 0x00CA, 0x00CB, 0x00CC, 0x00CD, 0x00CE, 0x00CF, - // 0xD0-0xDF - 0x00D0, 0x00D1, 0x00D2, 0x00D3, 0x00D4, 0x00D5, 0x00D6, 0x00D7, - 0x00D8, 0x00D9, 0x00DA, 0x00DB, 0x00DC, 0x00DD, 0x00DE, 0x00DF, - // 0xE0-0xEF - 0x00E0, 0x00E1, 0x00E2, 0x00E3, 0x00E4, 0x00E5, 0x00E6, 0x00E7, - 0x00E8, 0x00E9, 0x00EA, 0x00EB, 0x00EC, 0x00ED, 0x00EE, 0x00EF, - // 0xF0-0xFF - 0x00F0, 0x00F1, 0x00F2, 0x00F3, 0x00F4, 0x00F5, 0x00F6, 0x00F7, - 0x00F8, 0x00F9, 0x00FA, 0x00FB, 0x00FC, 0x00FD, 0x00FE, 0x00FF, -}; -// clang-format on - -kj::Maybe AsciiDecoder::decode( - jsg::Lock& js, kj::ArrayPtr buffer, bool flush) { - // Single branchless scan: accumulate whether any byte maps to a - // different code point than its raw value. For bytes outside 0x80-0x9F - // the table is identity so the XOR is zero and contributes nothing. - uint16_t diff = 0; - for (auto byte: buffer) { - diff |= WIN1252_TABLE[byte] ^ byte; - } - - if (diff == 0) { - // Fast path: all bytes mapped to their own value (pure ASCII or - // 0xA0-0xFF range), so Latin-1 identity decoding is correct. - return js.str(buffer); - } - - // Slow path: at least one byte in 0x80-0x9F needs remapping. - // Since some windows-1252 code points are > 0xFF we use uint16_t. 
- auto result = kj::heapArray(buffer.size()); - for (size_t i = 0; i < buffer.size(); i++) { - result[i] = WIN1252_TABLE[buffer[i]]; - } - - return js.str(result.asPtr()); -} - -kj::Maybe XUserDefinedDecoder::decode( - jsg::Lock& js, kj::ArrayPtr buffer, bool flush) { - // x-user-defined encoding per WHATWG spec: - // https://encoding.spec.whatwg.org/#x-user-defined-decoder - // - 0x00-0x7F: code point = byte (ASCII identity) - // - 0x80-0xFF: code point = 0xF780 + (byte - 0x80) = 0xF700 + byte - - // Check if we have any high bytes that need remapping - bool hasHighBytes = - !simdutf::validate_ascii(reinterpret_cast(buffer.begin()), buffer.size()); - - if (!hasHighBytes) { - // Fast path: all ASCII bytes, identity mapping - return js.str(buffer); - } - - // Slow path: at least one byte >= 0x80, need uint16_t for PUA mapping - auto result = kj::heapArray(buffer.size()); - for (size_t i = 0; i < buffer.size(); i++) { - auto byte = buffer[i]; - if (byte < 0x80) { - result[i] = byte; - } else { - // Map 0x80-0xFF to U+F780-U+F7FF (Private Use Area) - result[i] = 0xF700 + byte; - } - } - - return js.str(result.asPtr()); -} - void IcuDecoder::reset() { bomSeen = false; return ucnv_reset(inner.get()); @@ -536,15 +421,12 @@ void IcuDecoder::reset() { Decoder& TextDecoder::getImpl() { KJ_SWITCH_ONEOF(decoder) { - KJ_CASE_ONEOF(dec, AsciiDecoder) { + KJ_CASE_ONEOF(dec, LegacyDecoder) { return dec; } KJ_CASE_ONEOF(dec, IcuDecoder) { return dec; } - KJ_CASE_ONEOF(dec, XUserDefinedDecoder) { - return dec; - } } KJ_UNREACHABLE; } @@ -566,12 +448,19 @@ jsg::Ref TextDecoder::constructor(jsg::Lock& js, errorMessage(label)); } - if (encoding == Encoding::Windows_1252) { - return js.alloc(AsciiDecoder(), options); - } - - if (encoding == Encoding::X_User_Defined) { - return js.alloc(XUserDefinedDecoder(), options); + switch (encoding) { + case Encoding::Big5: + case Encoding::Euc_Jp: + case Encoding::Euc_Kr: + case Encoding::Gb18030: + case Encoding::Gbk: + case 
Encoding::Iso2022_Jp: + case Encoding::Shift_Jis: + case Encoding::Windows_1252: + case Encoding::X_User_Defined: + return js.alloc(LegacyDecoder(encoding, DecoderFatal(options.fatal)), options); + default: + break; } return js.alloc( @@ -588,23 +477,28 @@ jsg::JsString TextDecoder::decode(jsg::Lock& js, jsg::Optional> maybeInput, jsg::Optional maybeOptions) { auto options = maybeOptions.orDefault(DEFAULT_OPTIONS); + // Per spec, omitting input is end-of-queue, so we must flush pending bytes. + const auto flush = maybeInput == kj::none || !options.stream; auto& input = maybeInput.orDefault(EMPTY); - return JSG_REQUIRE_NONNULL( - getImpl().decode(js, input, !options.stream), TypeError, "Failed to decode input."); + auto result = + JSG_REQUIRE_NONNULL(getImpl().decode(js, input, flush), TypeError, "Failed to decode input."); + // Per WHATWG spec, when flush is set the decoder is reset to a new instance + // so subsequent calls start with clean state. + if (flush) { + getImpl().reset(); + } + return kj::mv(result); } kj::Maybe TextDecoder::decodePtr( jsg::Lock& js, kj::ArrayPtr buffer, bool flush) { KJ_SWITCH_ONEOF(decoder) { - KJ_CASE_ONEOF(dec, AsciiDecoder) { + KJ_CASE_ONEOF(dec, LegacyDecoder) { return dec.decode(js, buffer, flush); } KJ_CASE_ONEOF(dec, IcuDecoder) { return dec.decode(js, buffer, flush); } - KJ_CASE_ONEOF(dec, XUserDefinedDecoder) { - return dec.decode(js, buffer, flush); - } } KJ_UNREACHABLE; } diff --git a/src/workerd/api/encoding.h b/src/workerd/api/encoding.h index 37720124a6f..adaae247e89 100644 --- a/src/workerd/api/encoding.h +++ b/src/workerd/api/encoding.h @@ -4,6 +4,9 @@ #pragma once +#include "encoding-legacy.h" +#include "encoding-shared.h" + #include #include @@ -11,108 +14,6 @@ namespace workerd::api { -// The encodings listed here are defined as required by the Encoding spec. -// The first label is enum we use to identify the encoding in code, while -// the second label is the public identifier. 
-#define EW_ENCODINGS(V) \ - V(Utf8, "utf-8") \ - V(Ibm866, "ibm866") \ - V(Iso8859_2, "iso-8859-2") \ - V(Iso8859_3, "iso-8859-3") \ - V(Iso8859_4, "iso-8859-4") \ - V(Iso8859_5, "iso-8859-5") \ - V(Iso8859_6, "iso-8859-6") \ - V(Iso8859_7, "iso-8859-7") \ - V(Iso8859_8, "iso-8859-8") \ - V(Iso8859_8i, "iso-8859-8-i") \ - V(Iso8859_10, "iso-8859-10") \ - V(Iso8859_13, "iso-8859-13") \ - V(Iso8859_14, "iso-8859-14") \ - V(Iso8859_15, "iso-8859-15") \ - V(Iso8859_16, "iso-8859-16") \ - V(Ko18_r, "koi8-r") \ - V(Koi8_u, "koi8-u") \ - V(Macintosh, "macintosh") \ - V(Windows_874, "windows-874") \ - V(Windows_1250, "windows-1250") \ - V(Windows_1251, "windows-1251") \ - V(Windows_1252, "windows-1252") \ - V(Windows_1253, "windows-1253") \ - V(Windows_1254, "windows-1254") \ - V(Windows_1255, "windows-1255") \ - V(Windows_1256, "windows-1256") \ - V(Windows_1257, "windows-1257") \ - V(Windows_1258, "windows-1258") \ - V(X_Mac_Cyrillic, "x-mac-cyrillic") \ - V(Gbk, "gbk") \ - V(Gb18030, "gb18030") \ - V(Big5, "big5") \ - V(Euc_Jp, "euc-jp") \ - V(Iso2022_Jp, "iso-2022-jp") \ - V(Shift_Jis, "shift_jis") \ - V(Euc_Kr, "euc-kr") \ - V(Replacement, "replacement") \ - V(Utf16be, "utf-16be") \ - V(Utf16le, "utf-16le") \ - V(X_User_Defined, "x-user-defined") - -enum class Encoding { - INVALID, -#define V(name, _) name, - EW_ENCODINGS(V) -#undef V -}; - -// A Decoder provides the underlying implementation of a TextDecoder. -class Decoder { - public: - virtual ~Decoder() noexcept(true) {} - virtual Encoding getEncoding() = 0; - virtual kj::Maybe decode( - jsg::Lock& js, kj::ArrayPtr buffer, bool flush = false) = 0; - - virtual void reset() {} -}; - -// Decoder implementation that provides a fast-track for windows-1252. -// When the input contains only bytes <= 0x7F or >= 0xA0, these are -// identical between Latin-1 and windows-1252 so we can use V8's -// efficient NewFromOneByte. 
For bytes in 0x80-0x9F, we remap them -// to the correct windows-1252 code points using NewFromTwoByte. -class AsciiDecoder final: public Decoder { - public: - AsciiDecoder() = default; - AsciiDecoder(AsciiDecoder&&) = default; - AsciiDecoder& operator=(AsciiDecoder&&) = default; - KJ_DISALLOW_COPY(AsciiDecoder); - - Encoding getEncoding() override { - return Encoding::Windows_1252; - } - - kj::Maybe decode( - jsg::Lock& js, kj::ArrayPtr buffer, bool flush = false) override; -}; - -// Decoder implementation for x-user-defined encoding. -// Per WHATWG spec (https://encoding.spec.whatwg.org/#x-user-defined-decoder): -// - Bytes 0x00-0x7F map to themselves (ASCII identity) -// - Bytes 0x80-0xFF map to U+F780 + (byte - 0x80) = U+F700 + byte -class XUserDefinedDecoder final: public Decoder { - public: - XUserDefinedDecoder() = default; - XUserDefinedDecoder(XUserDefinedDecoder&&) = default; - XUserDefinedDecoder& operator=(XUserDefinedDecoder&&) = default; - KJ_DISALLOW_COPY(XUserDefinedDecoder); - - Encoding getEncoding() override { - return Encoding::X_User_Defined; - } - - kj::Maybe decode( - jsg::Lock& js, kj::ArrayPtr buffer, bool flush = false) override; -}; - // Decoder implementation that uses ICU's built-in conversion APIs. // ICU's decoder is fairly comprehensive, covering the full range // of encodings required by the Encoding specification. 
@@ -157,7 +58,7 @@ class IcuDecoder final: public Decoder { // https://encoding.spec.whatwg.org/#interface-textdecoder class TextDecoder final: public jsg::Object { public: - using DecoderImpl = kj::OneOf; + using DecoderImpl = kj::OneOf; struct ConstructorOptions { bool fatal = false; diff --git a/src/workerd/api/tests/encoding-test.js b/src/workerd/api/tests/encoding-test.js index bcbff6f199d..c5940d429e6 100644 --- a/src/workerd/api/tests/encoding-test.js +++ b/src/workerd/api/tests/encoding-test.js @@ -15,6 +15,8 @@ function decodeStreaming(decoder, input) { return x; } +const u = (...args) => Uint8Array.of(...args); + // From https://developer.mozilla.org/en-US/docs/Web/API/Encoding_API/Encodings const windows1252Labels = [ 'ansi_x3.4-1968', @@ -764,6 +766,161 @@ export const gbkDecoderIsGb18030Decoder = { }, }; +const gbVersionAndRangesTest = (encoding) => { + const loose = new TextDecoder(encoding); + const checkAll = (...list) => list.forEach((x) => check(...x)); + const check = (bytes, str, invalid = false) => { + const fatal = new TextDecoder(encoding, { fatal: true }); + const u8 = Uint8Array.from(bytes); + strictEqual(loose.decode(u8), str); + if (!invalid) strictEqual(fatal.decode(u8), str); + if (invalid) throws(() => fatal.decode(u8)); + }; + + check([0x84, 0x31, 0xa4, 0x36], '\uFFFC'); + check([0x84, 0x31, 0xa4, 0x37], '\uFFFD'); + check([0x84, 0x31, 0xa4, 0x38], '\uFFFE'); + check([0x84, 0x31, 0xa4, 0x39], '\uFFFF'); + check([0x84, 0x31, 0xa5, 0x30], '\uFFFD', true); + check([0x8f, 0x39, 0xfe, 0x39], '\uFFFD', true); + check([0x90, 0x30, 0x81, 0x30], String.fromCodePoint(0x1_00_00)); + check([0x90, 0x30, 0x81, 0x31], String.fromCodePoint(0x1_00_01)); + + check([0xe3, 0x32, 0x9a, 0x35], String.fromCodePoint(0x10_ff_ff)); + check([0xe3, 0x32, 0x9a, 0x36], '\uFFFD', true); + check([0xe3, 0x32, 0x9a, 0x37], '\uFFFD', true); + + check([0xfe, 0x39, 0xfe, 0x39], '\uFFFD', true); + check([0xff, 0x39, 0xfe, 0x39], '\uFFFD9\uFFFD', true); + check([0xfe, 
0x40, 0xfe, 0x39], '\uFA0C\uFFFD', true); + check([0xfe, 0x39, 0xff, 0x39], '\uFFFD9\uFFFD9', true); + check([0xfe, 0x39, 0xfe, 0x40], '\uFFFD9\uFA0C', true); + + checkAll( + [[0xa8, 0xbb], '\u0251'], + [[0xa8, 0xbc], '\u1E3F'], + [[0xa8, 0xbd], '\u0144'] + ); + check([0x81, 0x35, 0xf4, 0x36], '\u1E3E'); + check([0x81, 0x35, 0xf4, 0x37], '\uE7C7'); + check([0x81, 0x35, 0xf4, 0x38], '\u1E40'); + + checkAll( + [[0xa6, 0xd9], '\uFE10'], + [[0xa6, 0xed], '\uFE18'], + [[0xa6, 0xf3], '\uFE19'] + ); + checkAll([[0xfe, 0x59], '\u9FB4'], [[0xfe, 0xa0], '\u9FBB']); +}; + +export const gb18030VersionAndRanges = { + test() { + gbVersionAndRangesTest('gb18030'); + }, +}; + +export const gbkVersionAndRanges = { + test() { + gbVersionAndRangesTest('gbk'); + }, +}; + +// Verify that the WHATWG-required mapping corrections also produce the +// correct output when the corrected byte sequences appear inside a larger +// buffer (surrounded by ASCII), not only when they are the entire input. +export const gb18030OverridesEmbedded = { + test() { + const d = new TextDecoder('gb18030'); + + // 0x80 → U+20AC (Euro sign) surrounded by ASCII + strictEqual(d.decode(Uint8Array.of(0x41, 0x80, 0x42)), 'A\u20ACB'); + + // Two-byte mapping corrections surrounded by ASCII + strictEqual(d.decode(Uint8Array.of(0x41, 0xa8, 0xbb, 0x42)), 'A\u0251B'); + strictEqual(d.decode(Uint8Array.of(0x41, 0xa8, 0xbc, 0x42)), 'A\u1E3FB'); + strictEqual(d.decode(Uint8Array.of(0x41, 0xa8, 0xbd, 0x42)), 'A\u0144B'); + + // Vertical form corrections surrounded by ASCII + strictEqual(d.decode(Uint8Array.of(0x41, 0xa6, 0xd9, 0x42)), 'A\uFE10B'); + + // CJK extension corrections surrounded by ASCII + strictEqual(d.decode(Uint8Array.of(0x41, 0xfe, 0x59, 0x42)), 'A\u9FB4B'); + }, +}; + +export const replacementPushbackAsciiCharactersLoose = { + test() { + const vectors = { + big5: [ + [[0x80], '\uFFFD'], + [[0x81, 0x40], '\uFFFD@'], + [[0x83, 0x5c], '\uFFFD\\'], + [[0x87, 0x87, 0x40], '\uFFFD@'], + [[0x81, 0x81], '\uFFFD'], 
+ ], + 'iso-2022-jp': [ + [[0x1b, 0x24], '\uFFFD$'], + [[0x1b, 0x24, 0x40, 0x1b, 0x24], '\uFFFD\uFFFD'], + ], + 'euc-jp': [ + [[0x80], '\uFFFD'], + [[0x8d, 0x8d], '\uFFFD\uFFFD'], + [[0x8e, 0x8e], '\uFFFD'], + ], + }; + + for (const [encoding, list] of Object.entries(vectors)) { + const d = new TextDecoder(encoding); + for (const [bytes, text] of list) { + strictEqual(d.decode(Uint8Array.from(bytes)), text); + } + } + }, +}; + +export const stickyMultibyteStateIso2022JpLoose = { + test() { + const vectors = [ + [[27], '\uFFFD'], + [[27, 0x28], '\uFFFD('], + [[0x1b, 0x28, 0x49], ''], + ]; + + const d = new TextDecoder('iso-2022-jp'); + for (const [bytes, text] of vectors) { + strictEqual(d.decode(u(0x40)), '@'); + strictEqual(d.decode(Uint8Array.from(bytes)), text); + strictEqual(d.decode(u(0x40)), '@'); + strictEqual(d.decode(u(0x2a)), '*'); + strictEqual(d.decode(u(0x42)), 'B'); + } + }, +}; + +export const fatalStreamGb18030Gbk = { + test() { + for (const encoding of ['gb18030', 'gbk']) { + { + const d = new TextDecoder(encoding, { fatal: true }); + strictEqual(d.decode(Uint8Array.of(0x80), { stream: true }), '\u20AC'); + throws(() => + d.decode(u(0x81, 0x30, 0x21, 0x21, 0x21), { stream: true }) + ); + strictEqual(d.decode(Uint8Array.of(0x80)), '\u20AC'); + } + + { + const d = new TextDecoder(encoding, { fatal: true }); + strictEqual(d.decode(Uint8Array.of(0x80), { stream: true }), '\u20AC'); + throws(() => + d.decode(u(0x81, 0x30, 0x81, 0x42, 0x42), { stream: true }) + ); + strictEqual(d.decode(Uint8Array.of(0x80)), '\u20AC'); + } + } + }, +}; + export const textDecoderStream = { test() { const stream = new TextDecoderStream('utf-16', { @@ -779,6 +936,55 @@ export const textDecoderStream = { }, }; +// Per WHATWG Big5 decoder step 1, when end-of-queue is reached with a +// pending lead byte, the decoder must return error (U+FFFD in replacement +// mode, throw in fatal mode). 
This tests the streaming case where a lead +// byte is buffered in one call and then flushed without a trail byte. +export const big5OrphanedLeadOnFlush = { + test() { + // 0xA4 is a valid Big5 lead byte (e.g., first byte of 中 = 0xA4 0xA4). + // Streaming it alone, then flushing, must produce U+FFFD. + { + const dec = new TextDecoder('big5'); + strictEqual(dec.decode(Uint8Array.of(0xa4), { stream: true }), ''); + strictEqual(dec.decode(), '\uFFFD'); + } + + // Fatal mode must throw on the orphaned lead. + { + const dec = new TextDecoder('big5', { fatal: true }); + strictEqual(dec.decode(Uint8Array.of(0xa4), { stream: true }), ''); + throws(() => dec.decode()); + } + + // Orphaned lead followed by an invalid trail byte on flush: the lead + // must produce U+FFFD. 0x20 (space) is not a valid Big5 trail byte + // (valid trails are 0x40-0x7E and 0xA1-0xFE). + { + const dec = new TextDecoder('big5'); + strictEqual(dec.decode(Uint8Array.of(0xa4), { stream: true }), ''); + const result = dec.decode(Uint8Array.of(0x20)); + // The orphaned lead must produce at least one U+FFFD. + ok( + result.includes('\uFFFD'), + `expected U+FFFD in output, got: ${JSON.stringify(result)}` + ); + // The space byte must not be swallowed. + ok( + result.includes(' '), + `expected space in output, got: ${JSON.stringify(result)}` + ); + } + + // Streaming a complete pair across two calls must still work. + { + const dec = new TextDecoder('big5'); + strictEqual(dec.decode(Uint8Array.of(0xa4), { stream: true }), ''); + strictEqual(dec.decode(Uint8Array.of(0xa4)), '中'); + } + }, +}; + // Test x-user-defined encoding per WHATWG spec // https://encoding.spec.whatwg.org/#x-user-defined-decoder export const xUserDefinedDecode = { @@ -840,3 +1046,60 @@ export const xUserDefinedFatal = { } }, }; + +// Verify that streaming with zero-length input works for every legacy +// encoding handled by the Rust LegacyDecoder. 
An empty chunk in streaming +// mode must produce an empty string and leave the decoder in a valid state +// for subsequent calls. +export const legacyStreamEmptyInput = { + test() { + const encodings = [ + 'big5', + 'euc-jp', + 'euc-kr', + 'gb18030', + 'gbk', + 'iso-2022-jp', + 'shift_jis', + 'windows-1252', + 'x-user-defined', + ]; + + const empty = new Uint8Array(0); + + for (const label of encodings) { + for (const fatal of [false, true]) { + const dec = new TextDecoder(label, { fatal }); + + // Empty stream chunk must produce empty string. + strictEqual( + dec.decode(empty, { stream: true }), + '', + `${label} (fatal=${fatal}): empty stream chunk should be ''` + ); + + // A second empty stream chunk must also be fine. + strictEqual( + dec.decode(empty, { stream: true }), + '', + `${label} (fatal=${fatal}): second empty stream chunk should be ''` + ); + + // Final flush with no pending bytes must produce empty string. + strictEqual( + dec.decode(), + '', + `${label} (fatal=${fatal}): flush after empty chunks should be ''` + ); + + // Decoder must still work normally after the empty-stream sequence. + // Feed a single ASCII byte to verify. 
+ strictEqual( + dec.decode(Uint8Array.of(0x41)), + 'A', + `${label} (fatal=${fatal}): decode 'A' after empty stream should work` + ); + } + } + }, +}; diff --git a/src/workerd/api/tests/encoding-test.wd-test b/src/workerd/api/tests/encoding-test.wd-test index 6ec29f86389..016035b74e7 100644 --- a/src/workerd/api/tests/encoding-test.wd-test +++ b/src/workerd/api/tests/encoding-test.wd-test @@ -7,7 +7,7 @@ const unitTests :Workerd.Config = ( modules = [ (name = "worker", esModule = embed "encoding-test.js") ], - compatibilityFlags = ["nodejs_compat"] + compatibilityFlags = ["nodejs_compat", "text_decoder_cjk_decoder"] ) ), ], diff --git a/src/workerd/io/compatibility-date.capnp b/src/workerd/io/compatibility-date.capnp index 83bba3ca604..c73f9480369 100644 --- a/src/workerd/io/compatibility-date.capnp +++ b/src/workerd/io/compatibility-date.capnp @@ -1403,4 +1403,11 @@ struct CompatibilityFlags @0x8f8c1b68151b6cef { $compatEnableDate("2026-03-03"); # When enabled, unhandledrejection processing is deferred until the microtask # checkpoint completes, avoiding misfires on multi-tick promise chains. + + textDecoderCjkDecoder @163 :Bool + $compatEnableFlag("text_decoder_cjk_decoder") + $compatDisableFlag("disable_text_decoder_cjk_decoder") + $compatEnableDate("2026-03-03"); + # Enables the dedicated CJK TextDecoder implementation for overrides and + # Big5 lead-byte handling instead of the legacy ICU-only path. 
} diff --git a/src/wpt/BUILD.bazel b/src/wpt/BUILD.bazel index a7382c9ba7d..6914ac125a7 100644 --- a/src/wpt/BUILD.bazel +++ b/src/wpt/BUILD.bazel @@ -75,7 +75,10 @@ wpt_test( wpt_test( name = "encoding", - compat_flags = ["pedantic_wpt"], + compat_flags = [ + "pedantic_wpt", + "text_decoder_cjk_decoder", + ], config = "encoding-test.ts", wpt_directory = "@wpt//:encoding@module", ) diff --git a/src/wpt/encoding-test.ts b/src/wpt/encoding-test.ts index b73f7d43660..29186e73001 100644 --- a/src/wpt/encoding-test.ts +++ b/src/wpt/encoding-test.ts @@ -2,8 +2,24 @@ // Licensed under the Apache 2.0 license found in the LICENSE file or at: // https://opensource.org/licenses/Apache-2.0 +import path from 'node:path'; +import { getBindingPath } from 'harness/common'; import { type TestRunnerConfig } from 'harness/harness'; +function loadWptResource(relativePath: string): void { + const bindingPath = getBindingPath( + path.dirname(globalThis.state.testFileName), + relativePath + ); + const code = globalThis.state.env[bindingPath]; + if (typeof code !== 'string') { + throw new Error( + `Test file ${bindingPath} not found. 
Update wpt_test.bzl to handle this case.` + ); + } + globalThis.state.env.unsafe.eval(code); +} + export default { 'api-basics.any.js': {}, 'api-invalid-label.any.js': {}, @@ -68,53 +84,44 @@ export default { 'TextEncoderStream interface: attribute encoding', ], }, - 'iso-2022-jp-decoder.any.js': { - comment: 'TODO investigate this', - expectedFailures: [ - 'iso-2022-jp decoder: Error ESC', - 'iso-2022-jp decoder: Katakana ESC, SO / SI', - 'iso-2022-jp decoder: character, error ESC #2', - ], - }, + 'iso-2022-jp-decoder.any.js': {}, 'legacy-mb-japanese/euc-jp/eucjp-decoder.js': {}, 'legacy-mb-japanese/euc-jp/eucjp-encoder.js': { - comment: 'ReferenceError: jis0208 is not defined', - omittedTests: true, + before: (): void => { + loadWptResource('./jis0208_index.js'); + loadWptResource('./jis0212_index.js'); + }, }, 'legacy-mb-japanese/euc-jp/jis0208_index.js': {}, 'legacy-mb-japanese/euc-jp/jis0212_index.js': {}, 'legacy-mb-japanese/iso-2022-jp/iso2022jp-decoder.js': {}, 'legacy-mb-japanese/iso-2022-jp/iso2022jp-encoder.js': { - comment: - 'This file is meant to be included by tests and cannot run on its own', - omittedTests: true, + before: (): void => { + loadWptResource('./jis0208_index.js'); + }, }, 'legacy-mb-japanese/iso-2022-jp/jis0208_index.js': {}, 'legacy-mb-japanese/shift_jis/jis0208_index.js': {}, 'legacy-mb-japanese/shift_jis/sjis-decoder.js': {}, 'legacy-mb-japanese/shift_jis/sjis-encoder.js': { - comment: - 'This file is meant to be included by tests and cannot run on its own', - omittedTests: true, + before: (): void => { + loadWptResource('./jis0208_index.js'); + }, }, 'legacy-mb-korean/euc-kr/euckr-decoder.js': {}, 'legacy-mb-korean/euc-kr/euckr-encoder.js': { - comment: 'ReferenceError: euckr is not defined', - omittedTests: true, + before: (): void => { + loadWptResource('./euckr_index.js'); + }, }, 'legacy-mb-korean/euc-kr/euckr_index.js': {}, - 'legacy-mb-schinese/gb18030/gb18030-decoder.any.js': { - comment: 'Too many failures to list 
individually', - omittedTests: true, - }, - 'legacy-mb-schinese/gbk/gbk-decoder.any.js': { - comment: 'Too many failures to list individually', - omittedTests: true, - }, + 'legacy-mb-schinese/gb18030/gb18030-decoder.any.js': {}, + 'legacy-mb-schinese/gbk/gbk-decoder.any.js': {}, 'legacy-mb-tchinese/big5/big5-decoder.js': {}, 'legacy-mb-tchinese/big5/big5-encoder.js': { - comment: 'big5 is not defined', - omittedTests: true, + before: (): void => { + loadWptResource('./big5_index.js'); + }, }, 'legacy-mb-tchinese/big5/big5_index.js': {}, 'replacement-encodings.any.js': { @@ -511,18 +518,12 @@ export default { 'streams/readable-writable-properties.any.js': {}, 'streams/realms.window.js': { comment: 'ReferenceError: window is not defined', - disabledTests: true, + omittedTests: true, }, 'textdecoder-arguments.any.js': {}, 'textdecoder-byte-order-marks.any.js': {}, 'textdecoder-copy.any.js': {}, - 'textdecoder-eof.any.js': { - comment: 'TextDecoder end-of-queue handling differs from spec', - expectedFailures: [ - 'TextDecoder end-of-queue handling', - 'TextDecoder end-of-queue handling using stream: true', - ], - }, + 'textdecoder-eof.any.js': {}, 'textdecoder-fatal-single-byte.any.js': {}, 'textdecoder-fatal-streaming.any.js': {}, 'textdecoder-fatal.any.js': {}, @@ -553,6 +554,165 @@ export default { }, 'unsupported-labels.window.js': { comment: 'Too many failures to list by name', - disabledTests: true, + expectedFailures: [ + '437 is not supported by the Encoding Standard', + 'adobe-standard-encoding is not supported by the Encoding Standard', + 'armscii-8 is not supported by the Encoding Standard', + 'bocu-1 is not supported by the Encoding Standard', + 'cesu-8 is not supported by the Encoding Standard', + 'cp1025 is not supported by the Encoding Standard', + 'cp437 is not supported by the Encoding Standard', + 'cp737 is not supported by the Encoding Standard', + 'cp851 is not supported by the Encoding Standard', + 'cp858 is not supported by the Encoding 
Standard', + 'cp862 is not supported by the Encoding Standard', + 'cp864 is not supported by the Encoding Standard', + 'cp869 is not supported by the Encoding Standard', + 'cp875 is not supported by the Encoding Standard', + 'cp950 is not supported by the Encoding Standard', + 'csiso103t618bit is not supported by the Encoding Standard', + 'csiso111ecmacyrillic is not supported by the Encoding Standard', + 'cspc8codepage437 is not supported by the Encoding Standard', + 'csviscii is not supported by the Encoding Standard', + 'dos-720 is not supported by the Encoding Standard', + 'dos-862 is not supported by the Encoding Standard', + 'ecma-cyrillic is not supported by the Encoding Standard', + 'euc-tw is not supported by the Encoding Standard', + 'german is not supported by the Encoding Standard', + 'geostd8 is not supported by the Encoding Standard', + 'hp-roman8 is not supported by the Encoding Standard', + 'ibm-thai is not supported by the Encoding Standard', + 'ibm00858 is not supported by the Encoding Standard', + 'ibm00924 is not supported by the Encoding Standard', + 'ibm01047 is not supported by the Encoding Standard', + 'ibm01140 is not supported by the Encoding Standard', + 'ibm01141 is not supported by the Encoding Standard', + 'ibm01142 is not supported by the Encoding Standard', + 'ibm01143 is not supported by the Encoding Standard', + 'ibm01144 is not supported by the Encoding Standard', + 'ibm01145 is not supported by the Encoding Standard', + 'ibm01146 is not supported by the Encoding Standard', + 'ibm01147 is not supported by the Encoding Standard', + 'ibm01148 is not supported by the Encoding Standard', + 'ibm01149 is not supported by the Encoding Standard', + 'ibm037 is not supported by the Encoding Standard', + 'ibm1026 is not supported by the Encoding Standard', + 'ibm1047 is not supported by the Encoding Standard', + 'ibm273 is not supported by the Encoding Standard', + 'ibm277 is not supported by the Encoding Standard', + 'ibm278 is not 
supported by the Encoding Standard', + 'ibm280 is not supported by the Encoding Standard', + 'ibm284 is not supported by the Encoding Standard', + 'ibm285 is not supported by the Encoding Standard', + 'ibm290 is not supported by the Encoding Standard', + 'ibm297 is not supported by the Encoding Standard', + 'ibm367 is not supported by the Encoding Standard', + 'ibm420 is not supported by the Encoding Standard', + 'ibm423 is not supported by the Encoding Standard', + 'ibm424 is not supported by the Encoding Standard', + 'ibm437 is not supported by the Encoding Standard', + 'ibm500 is not supported by the Encoding Standard', + 'ibm737 is not supported by the Encoding Standard', + 'ibm775 is not supported by the Encoding Standard', + 'ibm850 is not supported by the Encoding Standard', + 'ibm852 is not supported by the Encoding Standard', + 'ibm855 is not supported by the Encoding Standard', + 'ibm857 is not supported by the Encoding Standard', + 'ibm860 is not supported by the Encoding Standard', + 'ibm861 is not supported by the Encoding Standard', + 'ibm862 is not supported by the Encoding Standard', + 'ibm863 is not supported by the Encoding Standard', + 'ibm864 is not supported by the Encoding Standard', + 'ibm864i is not supported by the Encoding Standard', + 'ibm865 is not supported by the Encoding Standard', + 'ibm868 is not supported by the Encoding Standard', + 'ibm869 is not supported by the Encoding Standard', + 'ibm870 is not supported by the Encoding Standard', + 'ibm871 is not supported by the Encoding Standard', + 'ibm880 is not supported by the Encoding Standard', + 'ibm905 is not supported by the Encoding Standard', + 'ibm918 is not supported by the Encoding Standard', + 'iso-2022-jp-1 is not supported by the Encoding Standard', + 'iso-2022-jp-2 is not supported by the Encoding Standard', + 'iso-2022-jp-3 is not supported by the Encoding Standard', + 'iso-8859-8 visual is not supported by the Encoding Standard', + 'jis_c6226-1978 is not supported by 
the Encoding Standard', + 'jis_x0208-1983 is not supported by the Encoding Standard', + 'jis_x0208-1990 is not supported by the Encoding Standard', + 'jis_x0212-1990 is not supported by the Encoding Standard', + 'johab is not supported by the Encoding Standard', + 'latin9 is not supported by the Encoding Standard', + 'norwegian is not supported by the Encoding Standard', + 'sami-ws2 is not supported by the Encoding Standard', + 'scsu is not supported by the Encoding Standard', + 'shift_jis_x0213-2000 is not supported by the Encoding Standard', + 'swedish is not supported by the Encoding Standard', + 'tcvn is not supported by the Encoding Standard', + 'tis-620-2533 is not supported by the Encoding Standard', + 'utf-7 is not supported by the Encoding Standard', + 'utf-32 is not supported by the Encoding Standard', + 'viscii is not supported by the Encoding Standard', + 'windows-936-2000 is not supported by the Encoding Standard', + 'windows-sami-2 is not supported by the Encoding Standard', + 'ws2 is not supported by the Encoding Standard', + 'x-chinese-cns is not supported by the Encoding Standard', + 'x-chinese-eten is not supported by the Encoding Standard', + 'x-cp20001 is not supported by the Encoding Standard', + 'x-cp20003 is not supported by the Encoding Standard', + 'x-cp20004 is not supported by the Encoding Standard', + 'x-cp20005 is not supported by the Encoding Standard', + 'x-cp20261 is not supported by the Encoding Standard', + 'x-cp20269 is not supported by the Encoding Standard', + 'x-cp20936 is not supported by the Encoding Standard', + 'x-cp20949 is not supported by the Encoding Standard', + 'x-cp21027 is not supported by the Encoding Standard', + 'x-cp50227 is not supported by the Encoding Standard', + 'x-cp50229 is not supported by the Encoding Standard', + 'x-ebcdic-koreanextended is not supported by the Encoding Standard', + 'x-europa is not supported by the Encoding Standard', + 'x-ia5 is not supported by the Encoding Standard', + 
'x-ia5-german is not supported by the Encoding Standard', + 'x-ia5-norwegian is not supported by the Encoding Standard', + 'x-ia5-swedish is not supported by the Encoding Standard', + 'x-iscii-as is not supported by the Encoding Standard', + 'x-iscii-be is not supported by the Encoding Standard', + 'x-iscii-de is not supported by the Encoding Standard', + 'x-iscii-gu is not supported by the Encoding Standard', + 'x-iscii-ka is not supported by the Encoding Standard', + 'x-iscii-ma is not supported by the Encoding Standard', + 'x-iscii-or is not supported by the Encoding Standard', + 'x-iscii-pa is not supported by the Encoding Standard', + 'x-iscii-t is not supported by the Encoding Standard', + 'x-iscii-ta is not supported by the Encoding Standard', + 'x-iscii-te is not supported by the Encoding Standard', + 'x-mac-arabic is not supported by the Encoding Standard', + 'x-mac-ce is not supported by the Encoding Standard', + 'x-mac-centraleurroman is not supported by the Encoding Standard', + 'x-mac-chinesesimp is not supported by the Encoding Standard', + 'x-mac-chinesetrad is not supported by the Encoding Standard', + 'x-mac-croatian is not supported by the Encoding Standard', + 'x-mac-devanagari is not supported by the Encoding Standard', + 'x-mac-dingbats is not supported by the Encoding Standard', + 'x-mac-farsi is not supported by the Encoding Standard', + 'x-mac-greek is not supported by the Encoding Standard', + 'x-mac-gujarati is not supported by the Encoding Standard', + 'x-mac-gurmukhi is not supported by the Encoding Standard', + 'x-mac-hebrew is not supported by the Encoding Standard', + 'x-mac-icelandic is not supported by the Encoding Standard', + 'x-mac-japanese is not supported by the Encoding Standard', + 'x-mac-korean is not supported by the Encoding Standard', + 'x-mac-roman-latin1 is not supported by the Encoding Standard', + 'x-mac-romanian is not supported by the Encoding Standard', + 'x-mac-symbol is not supported by the Encoding Standard', + 
'x-mac-thai is not supported by the Encoding Standard', + 'x-mac-tibetan is not supported by the Encoding Standard', + 'x-mac-turkish is not supported by the Encoding Standard', + 'x-mac-vt100 is not supported by the Encoding Standard', + 'x-nextstep is not supported by the Encoding Standard', + 'x-vps is not supported by the Encoding Standard', + '_autodetect is not supported by the Encoding Standard', + '_autodetect_all is not supported by the Encoding Standard', + '_autodetect_kr is not supported by the Encoding Standard', + ], }, } satisfies TestRunnerConfig; From 79b3f4b6a7f9d24908bbde01d7891001cd55d5e2 Mon Sep 17 00:00:00 2001 From: Yagiz Nizipli Date: Fri, 20 Feb 2026 17:12:59 -0500 Subject: [PATCH 2/3] put textdecoder behind compat flag --- src/workerd/api/encoding.c++ | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/workerd/api/encoding.c++ b/src/workerd/api/encoding.c++ index e4590818a66..6b5c84693e5 100644 --- a/src/workerd/api/encoding.c++ +++ b/src/workerd/api/encoding.c++ @@ -455,9 +455,17 @@ jsg::Ref TextDecoder::constructor(jsg::Lock& js, case Encoding::Gb18030: case Encoding::Gbk: case Encoding::Iso2022_Jp: - case Encoding::Shift_Jis: - case Encoding::Windows_1252: + case Encoding::Shift_Jis: { + // If the feature flag is disabled, we use the ICU decoder. + if (!FeatureFlags::get(js).getTextDecoderCjkDecoder()) { + break; + } + + // Fall through to the LegacyDecoder cases below; only the CJK encodings above are gated by the flag.
+ [[fallthrough]]; + } case Encoding::X_User_Defined: + case Encoding::Windows_1252: return js.alloc(LegacyDecoder(encoding, DecoderFatal(options.fatal)), options); default: break; From 6460a2727a385933dc1a553509c63bb9967240e8 Mon Sep 17 00:00:00 2001 From: Yagiz Nizipli Date: Tue, 24 Feb 2026 12:54:20 -0500 Subject: [PATCH 3/3] improve code consistency --- src/rust/encoding/lib.rs | 24 ++++++++++++++++++------ src/workerd/api/encoding-legacy.c++ | 12 ++++++++++-- src/workerd/api/encoding.c++ | 13 ++----------- src/workerd/api/tests/encoding-test.js | 18 ++++++++++-------- 4 files changed, 40 insertions(+), 27 deletions(-) diff --git a/src/rust/encoding/lib.rs b/src/rust/encoding/lib.rs index b6b112156ff..3a89198de4a 100644 --- a/src/rust/encoding/lib.rs +++ b/src/rust/encoding/lib.rs @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2022 Cloudflare, Inc. +// Copyright (c) 2026 Cloudflare, Inc. // Licensed under the Apache 2.0 license found in the LICENSE file or at: // https://opensource.org/licenses/Apache-2.0 @@ -36,6 +36,11 @@ mod ffi { had_error: bool, } + struct DecodeOptions { + flush: bool, + fatal: bool, + } + extern "Rust" { type Decoder; @@ -47,7 +52,7 @@ mod ffi { /// Decode a chunk of bytes. Set `flush` to true on the final chunk. /// When `fatal` is true and an error is encountered, `had_error` is /// set and the output may be incomplete. - fn decode(decoder: &mut Decoder, input: &[u8], flush: bool, fatal: bool) -> DecodeResult; + fn decode(decoder: &mut Decoder, input: &[u8], options: &DecodeOptions) -> DecodeResult; /// Reset the decoder to its initial state. fn reset(decoder: &mut Decoder); @@ -85,7 +90,14 @@ pub fn new_decoder(encoding: ffi::Encoding) -> Box { }) } -pub fn decode(state: &mut Decoder, input: &[u8], flush: bool, fatal: bool) -> ffi::DecodeResult { +pub fn decode( + state: &mut Decoder, + input: &[u8], + options: &ffi::DecodeOptions, +) -> ffi::DecodeResult { + // max_utf16_buffer_length() returns None on usize overflow. 
The +4 covers extra + // UTF-16 code units from decoder state. Safe even if slightly short since the decode loop + // below resizes on OutputFull. let max_len = state .inner .max_utf16_buffer_length(input.len()) @@ -94,12 +106,12 @@ pub fn decode(state: &mut Decoder, input: &[u8], flush: bool, fatal: bool) -> ff let mut total_read = 0usize; let mut total_written = 0usize; - if fatal { + if options.fatal { loop { let (result, read, written) = state.inner.decode_to_utf16_without_replacement( &input[total_read..], &mut output[total_written..], - flush, + options.flush, ); total_read += read; total_written += written; @@ -124,7 +136,7 @@ pub fn decode(state: &mut Decoder, input: &[u8], flush: bool, fatal: bool) -> ff let (result, read, written, _had_errors) = state.inner.decode_to_utf16( &input[total_read..], &mut output[total_written..], - flush, + options.flush, ); total_read += read; total_written += written; diff --git a/src/workerd/api/encoding-legacy.c++ b/src/workerd/api/encoding-legacy.c++ index 378afec91b2..aef0562ff53 100644 --- a/src/workerd/api/encoding-legacy.c++ +++ b/src/workerd/api/encoding-legacy.c++ @@ -53,8 +53,16 @@ void LegacyDecoder::reset() { kj::Maybe LegacyDecoder::decode( jsg::Lock& js, kj::ArrayPtr buffer, bool flush) { - auto result = ::workerd::rust::encoding::decode( - *state, buffer.as(), flush, fatal.toBool()); + // Reset decoder state after flush, matching IcuDecoder's KJ_DEFER contract. + // This ensures decodePtr() (used by TextDecoderStream) resets correctly on flush. + KJ_DEFER({ + if (flush) reset(); + }); + + ::workerd::rust::encoding::DecodeOptions options{.flush = flush, .fatal = fatal.toBool()}; + // kj_rs::RustMutable is used to avoid a copy of the underlying buffer. + auto result = + ::workerd::rust::encoding::decode(*state, buffer.as(), kj::mv(options)); if (fatal.toBool() && result.had_error) { // Decoder state already reset by the Rust side on fatal error. 
diff --git a/src/workerd/api/encoding.c++ b/src/workerd/api/encoding.c++ index 6b5c84693e5..6ef6b46589a 100644 --- a/src/workerd/api/encoding.c++ +++ b/src/workerd/api/encoding.c++ @@ -303,7 +303,6 @@ kj::Maybe IcuDecoder::create(Encoding encoding, bool fatal, bool ign kj::Maybe IcuDecoder::decode( jsg::Lock& js, kj::ArrayPtr buffer, bool flush) { UErrorCode status = U_ZERO_ERROR; - kj::Maybe> merged; const auto maxCharSize = [this]() { return ucnv_getMaxCharSize(inner.get()); }; const auto isUnicode = [this]() { @@ -485,17 +484,9 @@ jsg::JsString TextDecoder::decode(jsg::Lock& js, jsg::Optional> maybeInput, jsg::Optional maybeOptions) { auto options = maybeOptions.orDefault(DEFAULT_OPTIONS); - // Per spec, omitting input is end-of-queue, so we must flush pending bytes. - const auto flush = maybeInput == kj::none || !options.stream; auto& input = maybeInput.orDefault(EMPTY); - auto result = - JSG_REQUIRE_NONNULL(getImpl().decode(js, input, flush), TypeError, "Failed to decode input."); - // Per WHATWG spec, when flush is set the decoder is reset to a new instance - // so subsequent calls start with clean state. 
- if (flush) { - getImpl().reset(); - } - return kj::mv(result); + return JSG_REQUIRE_NONNULL( + getImpl().decode(js, input, !options.stream), TypeError, "Failed to decode input."); } kj::Maybe TextDecoder::decodePtr( diff --git a/src/workerd/api/tests/encoding-test.js b/src/workerd/api/tests/encoding-test.js index c5940d429e6..7fd8ae9f02c 100644 --- a/src/workerd/api/tests/encoding-test.js +++ b/src/workerd/api/tests/encoding-test.js @@ -15,8 +15,6 @@ function decodeStreaming(decoder, input) { return x; } -const u = (...args) => Uint8Array.of(...args); - // From https://developer.mozilla.org/en-US/docs/Web/API/Encoding_API/Encodings const windows1252Labels = [ 'ansi_x3.4-1968', @@ -888,11 +886,11 @@ export const stickyMultibyteStateIso2022JpLoose = { const d = new TextDecoder('iso-2022-jp'); for (const [bytes, text] of vectors) { - strictEqual(d.decode(u(0x40)), '@'); + strictEqual(d.decode(Uint8Array.of(0x40)), '@'); strictEqual(d.decode(Uint8Array.from(bytes)), text); - strictEqual(d.decode(u(0x40)), '@'); - strictEqual(d.decode(u(0x2a)), '*'); - strictEqual(d.decode(u(0x42)), 'B'); + strictEqual(d.decode(Uint8Array.of(0x40)), '@'); + strictEqual(d.decode(Uint8Array.of(0x2a)), '*'); + strictEqual(d.decode(Uint8Array.of(0x42)), 'B'); } }, }; @@ -904,7 +902,9 @@ export const fatalStreamGb18030Gbk = { const d = new TextDecoder(encoding, { fatal: true }); strictEqual(d.decode(Uint8Array.of(0x80), { stream: true }), '\u20AC'); throws(() => - d.decode(u(0x81, 0x30, 0x21, 0x21, 0x21), { stream: true }) + d.decode(Uint8Array.of(0x81, 0x30, 0x21, 0x21, 0x21), { + stream: true, + }) ); strictEqual(d.decode(Uint8Array.of(0x80)), '\u20AC'); } @@ -913,7 +913,9 @@ export const fatalStreamGb18030Gbk = { const d = new TextDecoder(encoding, { fatal: true }); strictEqual(d.decode(Uint8Array.of(0x80), { stream: true }), '\u20AC'); throws(() => - d.decode(u(0x81, 0x30, 0x81, 0x42, 0x42), { stream: true }) + d.decode(Uint8Array.of(0x81, 0x30, 0x81, 0x42, 0x42), { + stream: true, + 
}) ); strictEqual(d.decode(Uint8Array.of(0x80)), '\u20AC'); }