diff --git a/sjsonnet/src-js/sjsonnet/CharSWAR.scala b/sjsonnet/src-js/sjsonnet/CharSWAR.scala index 659e1233..cc16f6fa 100644 --- a/sjsonnet/src-js/sjsonnet/CharSWAR.scala +++ b/sjsonnet/src-js/sjsonnet/CharSWAR.scala @@ -7,7 +7,7 @@ object CharSWAR { val len = s.length while (i < len) { val c = s.charAt(i) - if (c < 32 || c == '"' || c == '\\') return true + if (c < 32 || c == '"' || c == '\\' || (c >= 0x7f && c <= 0x9f)) return true i += 1 } false @@ -17,7 +17,7 @@ object CharSWAR { var i = from while (i < to) { val c = arr(i) - if (c < 32 || c == '"' || c == '\\') return true + if (c < 32 || c == '"' || c == '\\' || (c >= 0x7f && c <= 0x9f)) return true i += 1 } false @@ -29,7 +29,7 @@ object CharSWAR { var i = from while (i < to) { val c = s.charAt(i) - if (c < 32 || c == '"' || c == '\\' || c >= 128) return false + if (c < 32 || c == '"' || c == '\\' || c >= 0x7f) return false i += 1 } true @@ -40,7 +40,7 @@ object CharSWAR { var i = from while (i < to) { val b = arr(i) & 0xff - if (b < 32 || b == '"' || b == '\\') return true + if (b < 32 || b == '"' || b == '\\' || b == 0x7f) return true i += 1 } false @@ -50,7 +50,7 @@ object CharSWAR { var i = from while (i < to) { val b = arr(i) & 0xff - if (b < 32 || b == '"' || b == '\\') return i + if (b < 32 || b == '"' || b == '\\' || b == 0x7f) return i i += 1 } -1 diff --git a/sjsonnet/src-jvm/sjsonnet/CharSWAR.java b/sjsonnet/src-jvm/sjsonnet/CharSWAR.java index 5a22470c..11710136 100644 --- a/sjsonnet/src-jvm/sjsonnet/CharSWAR.java +++ b/sjsonnet/src-jvm/sjsonnet/CharSWAR.java @@ -3,17 +3,16 @@ import java.lang.invoke.MethodHandles; import java.lang.invoke.VarHandle; import java.nio.ByteOrder; -import java.nio.charset.StandardCharsets; /** * SWAR (SIMD Within A Register) escape-char scanner for JSON string rendering. * *

Detects characters requiring JSON escaping: control chars ({@code < 32}), - * double-quote ({@code '"'}), and backslash ({@code '\\'}). + * double-quote ({@code '"'}), backslash ({@code '\\'}), DEL ({@code 0x7F}), + * and C1 control characters ({@code 0x80–0x9F}). * - *

For strings above a threshold length, converts to ISO-8859-1 bytes and - * processes 8 bytes at a time using {@link VarHandle} bulk reads + Hacker's - * Delight zero-detection formula. For shorter strings, uses a scalar charAt loop. + *

String scans use char-level semantics. Byte-array scans process 8 bytes at a time using + * {@link VarHandle} bulk reads + Hacker's Delight zero-detection formula. * *

Based on the SWAR technique from Hacker's Delight Ch. 6, as used by * @@ -51,9 +50,6 @@ private CharSWAR() {} /** Mask for bits 5-7 of each byte; zero result means byte < 32. */ private static final long CTRL = 0xE0E0_E0E0_E0E0_E0E0L; - /** Below this length, scalar charAt is faster than SWAR + byte[] conversion. */ - private static final int SWAR_THRESHOLD = 128; - private static final long U16_HOLE = 0x7FFF_7FFF_7FFF_7FFFL; private static final long U16_QUOTE = 0x0022_0022_0022_0022L; private static final long U16_BSLAS = 0x005C_005C_005C_005CL; @@ -66,21 +62,14 @@ private CharSWAR() {} */ static boolean hasEscapeChar(String str) { int len = str.length(); - if (len < SWAR_THRESHOLD) { - return hasEscapeCharScalar(str, len); - } - // ISO-8859-1 encoding is a JVM intrinsic for LATIN1 compact strings — - // essentially a memcpy of the internal byte[]. Chars > 255 map to '?' - // (0x3F), which is safe (not a control char, not '"', not '\\'). - byte[] bytes = str.getBytes(StandardCharsets.ISO_8859_1); - return hasEscapeCharSWAR(bytes, 0, bytes.length); + return hasEscapeCharScalar(str, len); } /** * Check if any byte in {@code arr[from..to)} needs JSON string escaping. - * Used by ByteRenderer for in-place SWAR scan on byte[] buffers. - * UTF-8 multi-byte sequences never produce bytes matching '"', '\\', or < 0x20, - * so this is safe for scanning UTF-8 encoded data. + * Used by ByteRenderer for in-place SWAR scan on UTF-8 byte[] buffers. UTF-8 multi-byte + * sequences can contain high-bit bytes, but those are data bytes, not JSON escapes; callers that + * need C1 detection must scan the original chars before encoding. */ static boolean hasEscapeChar(byte[] arr, int from, int to) { return hasEscapeCharSWAR(arr, from, to); @@ -116,7 +105,7 @@ static boolean isAsciiJsonSafe(String str, int from, int to) { } while (i < to) { char c = str.charAt(i); - if (c < 32 || c == '"' || c == '\\' || c >= 128) return false; + if (c < 32 || c == '"' || c == '\\' || c >= 0x7F) return false; i++; } return true; @@ -128,7 +117,7 @@ static boolean isAsciiJsonSafe(String str, int from, int to) { static boolean hasEscapeChar(char[] arr, int from, int to) { for (int i = from; i < to; i++) { char c = arr[i]; - if (c < 32 || c == '"' || c == '\\') return true; + if (c < 32 || c == '"' || c == '\\' || (c >= 0x7F && c <= 0x9F)) return true; } return false; } @@ -150,7 +139,7 @@ static int findFirstEscapeChar(byte[] arr, int from, int to) { } while (i < to) { int b = arr[i] & 0xFF; - if (b < 32 || b == '"' || b == '\\') return i; + if (b < 32 || b == '"' || b == '\\' || b == 0x7F) return i; i++; } return -1; @@ -167,18 +156,18 @@ private static boolean hasEscapeCharSWAR(byte[] arr, int from, int to) { // Tail: remaining 0-7 bytes while (i < to) { int b = arr[i] & 0xFF; - if (b < 32 || b == '"' || b == '\\') return true; + if (b < 32 || b == '"' || b == '\\' || b == 0x7F) return true; i++; } return false; } /** - * 8-bit SWAR: returns true if any byte lane in {@code word} - * contains '"' (0x22), '\\' (0x5C), or a control char (< 0x20). + * 8-bit SWAR mask for bytes requiring JSON escaping: control chars ({@code < 0x20}), + * double-quote ({@code '"'}), backslash ({@code '\\'}), or DEL ({@code 0x7F}). * - *

Uses Netty/Pekko pattern: XOR to produce zero lanes, then - * Hacker's Delight formula to detect zero bytes. + *

Uses Netty/Pekko pattern: XOR to produce zero lanes, then Hacker's Delight formula to + * detect zero bytes. */ private static long swarMatchMask(long word) { // 1. Detect '"' via XOR + zero-detection (Netty SWARUtil.applyPattern) @@ -193,7 +182,11 @@ private static long swarMatchMask(long word) { long c = word & CTRL; long cz = ~((c & HOLE) + HOLE | c | HOLE); - return qz | bz | cz; + // 4. Detect DEL (0x7F) via XOR + zero-detection (HOLE == DEL broadcast pattern) + long d = word ^ HOLE; + long dz = ~((d & HOLE) + HOLE | d | HOLE); + + return qz | bz | cz | dz; } private static int firstMatchedByte(long mask) { @@ -202,13 +195,16 @@ private static int firstMatchedByte(long mask) { : Long.numberOfLeadingZeros(mask)) >>> 3; } + private static final long U16_DEL = 0x007F_007F_007F_007FL; + private static boolean swarHasUnsafeAsciiChar(long word) { if ((word & U16_ASCII) != 0L) return true; long qz = zero16(word ^ U16_QUOTE); long bz = zero16(word ^ U16_BSLAS); long cz = zero16(word & U16_CTRL); - return (qz | bz | cz) != 0L; + long dz = zero16(word ^ U16_DEL); + return (qz | bz | cz | dz) != 0L; } private static long zero16(long word) { @@ -219,7 +215,7 @@ private static long zero16(long word) { private static boolean hasEscapeCharScalar(String s, int len) { for (int i = 0; i < len; i++) { char c = s.charAt(i); - if (c < 32 || c == '"' || c == '\\') return true; + if (c < 32 || c == '"' || c == '\\' || (c >= 0x7F && c <= 0x9F)) return true; } return false; } @@ -227,7 +223,7 @@ private static boolean hasEscapeCharScalar(String s, int len) { private static boolean isAsciiJsonSafeScalar(String s, int from, int to) { for (int i = from; i < to; i++) { char c = s.charAt(i); - if (c < 32 || c == '"' || c == '\\' || c >= 128) return false; + if (c < 32 || c == '"' || c == '\\' || c >= 0x7F) return false; } return true; } diff --git a/sjsonnet/src-native/sjsonnet/CharSWAR.scala b/sjsonnet/src-native/sjsonnet/CharSWAR.scala index 63cde049..b2b55e9f 100644 --- a/sjsonnet/src-native/sjsonnet/CharSWAR.scala +++ b/sjsonnet/src-native/sjsonnet/CharSWAR.scala @@ -8,8 +8,8 @@ import scala.scalanative.runtime.{ByteArray, Intrinsics} * Uses Scala Native's `Intrinsics.loadLong` + `ByteArray.atRawUnsafe` for zero-overhead 8-byte bulk * reads directly from Array[Byte] memory, matching the JVM VarHandle SWAR performance. * - * For String scanning, uses `getBytes(UTF-8)` + byte[] SWAR. On Scala Native compact strings are - * UTF-16, so converting to bytes first is necessary. + * String scans use char-level semantics. Byte-array scans process 8 bytes at a time using + * `Intrinsics.loadLong`. * * Inspired by netty's SWARUtil (io.netty.util.SWARUtil) and Hacker's Delight Ch. 6 zero-detection * formula. @@ -28,10 +28,11 @@ object CharSWAR { private final val U16_BSLAS = 0x005c005c005c005cL private final val U16_CTRL = 0xffe0ffe0ffe0ffe0L private final val U16_ASCII = 0xff80ff80ff80ff80L + private final val U16_DEL = 0x007f007f007f007fL /** - * SWAR: returns a mask for byte lanes in `word` containing '"' (0x22), '\\' (0x5C), or a control - * char (< 0x20). + * SWAR: returns a mask for byte lanes in `word` containing '"' (0x22), '\\' (0x5C), a control + * char (< 0x20), or DEL (0x7F). */ @inline private def swarMatchMask(word: Long): Long = { // 1. Detect '"' via XOR + zero-detection @@ -46,7 +47,11 @@ object CharSWAR { val c = word & CTRL val cz = ~((c & HOLE) + HOLE | c | HOLE) - qz | bz | cz + // 4. Detect DEL (0x7F) via XOR + zero-detection (HOLE == DEL broadcast pattern) + val d = word ^ HOLE + val dz = ~((d & HOLE) + HOLE | d | HOLE) + + qz | bz | cz | dz } @inline private def firstMatchedByte(mask: Long): Int = @@ -55,19 +60,14 @@ object CharSWAR { def hasEscapeChar(s: String): Boolean = { val len = s.length - if (len < 128) { - hasEscapeCharScalar(s, len) - } else { - val bytes = s.getBytes(java.nio.charset.StandardCharsets.UTF_8) - hasEscapeChar(bytes, 0, bytes.length) - } + hasEscapeCharScalar(s, len) } def hasEscapeChar(arr: Array[Char], from: Int, to: Int): Boolean = { var i = from while (i < to) { val c = arr(i) - if (c < 32 || c == '"' || c == '\\') return true + if (c < 32 || c == '"' || c == '\\' || (c >= 0x7f && c <= 0x9f)) return true i += 1 } false @@ -92,7 +92,7 @@ object CharSWAR { } while (i < to) { val c = s.charAt(i) - if (c < 32 || c == '"' || c == '\\' || c >= 128) return false + if (c < 32 || c == '"' || c == '\\' || c >= 0x7f) return false i += 1 } true @@ -100,8 +100,9 @@ object CharSWAR { /** * SWAR scan for byte[] using Intrinsics.loadLong for zero-overhead bulk reads. Processes 8 bytes - * per iteration — same throughput as the JVM VarHandle path. UTF-8 multi-byte sequences never - * produce bytes matching '"', '\', or < 0x20. + * per iteration — same throughput as the JVM VarHandle path. UTF-8 multi-byte sequences can + * contain high-bit bytes, but those are data bytes, not JSON escapes; callers that need C1 + * detection must scan the original chars before encoding. */ def hasEscapeChar(arr: Array[Byte], from: Int, to: Int): Boolean = { val len = to - from @@ -119,7 +120,7 @@ object CharSWAR { // Tail: remaining 0-7 bytes while (i < to) { val b = arr(i) & 0xff - if (b < 32 || b == '"' || b == '\\') return true + if (b < 32 || b == '"' || b == '\\' || b == 0x7f) return true i += 1 } false @@ -141,7 +142,7 @@ object CharSWAR { } while (i < to) { val b = arr(i) & 0xff - if (b < 32 || b == '"' || b == '\\') return i + if (b < 32 || b == '"' || b == '\\' || b == 0x7f) return i i += 1 } -1 @@ -151,7 +152,7 @@ object CharSWAR { var i = 0 while (i < len) { val c = s.charAt(i) - if (c < 32 || c == '"' || c == '\\') return true + if (c < 32 || c == '"' || c == '\\' || (c >= 0x7f && c <= 0x9f)) return true i += 1 } false @@ -162,7 +163,8 @@ object CharSWAR { val qz = zero16(word ^ U16_QUOTE) val bz = zero16(word ^ U16_BSLAS) val cz = zero16(word & U16_CTRL) - (qz | bz | cz) != 0L + val dz = zero16(word ^ U16_DEL) + (qz | bz | cz | dz) != 0L } @inline private def zero16(word: Long): Long = @@ -172,7 +174,7 @@ object CharSWAR { var i = from while (i < to) { val c = s.charAt(i) - if (c < 32 || c == '"' || c == '\\' || c >= 128) return false + if (c < 32 || c == '"' || c == '\\' || c >= 0x7f) return false i += 1 } true @@ -182,7 +184,7 @@ object CharSWAR { var i = from while (i < to) { val b = arr(i) & 0xff - if (b < 32 || b == '"' || b == '\\') return true + if (b < 32 || b == '"' || b == '\\' || b == 0x7f) return true i += 1 } false @@ -192,7 +194,7 @@ object CharSWAR { var i = from while (i < to) { val b = arr(i) & 0xff - if (b < 32 || b == '"' || b == '\\') return i + if (b < 32 || b == '"' || b == '\\' || b == 0x7f) return i i += 1 } -1 diff --git a/sjsonnet/src/sjsonnet/BaseByteRenderer.scala b/sjsonnet/src/sjsonnet/BaseByteRenderer.scala index 5389f1a8..a1bcb14f 100644 --- a/sjsonnet/src/sjsonnet/BaseByteRenderer.scala +++ b/sjsonnet/src/sjsonnet/BaseByteRenderer.scala @@ -14,7 +14,7 @@ import upickle.core.{ArrVisitor, ObjVisitor, Visitor} * * String rendering uses a two-tier strategy: * - Short strings (< 128 chars): fused encode+check loop, zero allocation - * - Long strings (>= 128 chars): getBytes(UTF-8) + SWAR bulk scan + arraycopy + * - Long strings (>= 128 chars): char-level escape check, then UTF-8 bytes + SWAR bulk scan */ class BaseByteRenderer[T <: java.io.OutputStream]( out: T, @@ -230,13 +230,7 @@ class BaseByteRenderer[T <: java.io.OutputStream]( if (len < 128) visitShortString(str, len) else visitLongString(str) case _ => - upickle.core.RenderUtils.escapeByte( - unicodeCharBuilder, - elemBuilder, - s, - escapeUnicode = escapeUnicode, - wrapQuotes = true - ) + appendEscapedStringBytes(s, escapeUnicode) } flushByteBuilder() out @@ -285,7 +279,7 @@ class BaseByteRenderer[T <: java.io.OutputStream]( * Zero-allocation fast path for short ASCII strings (the vast majority of JSON keys/values). Uses * getChars to bulk-copy into a reusable char buffer, then scans the buffer directly (avoiding * per-char String.charAt virtual dispatch). If any char needs escaping or is non-ASCII, falls - * back to escapeByte. + * back to appendEscapedStringBytes. */ private def visitShortString(str: String, len: Int): Unit = { // Reuse unicodeCharBuilder's array as temp char buffer (no allocation after warmup) @@ -302,18 +296,12 @@ class BaseByteRenderer[T <: java.io.OutputStream]( var i = 0 while (i < len) { val c = chars(i) - if (c < 0x20 || c == '"' || c == '\\' || c >= 0x80) { + if (c < 0x20 || c == '"' || c == '\\' || c >= 0x7f) { // DO NOT CHANGE // WHY: elemBuilder.length is intentionally NOT updated before this call. - // escapeByte writes from the current elemBuilder.length position, overwriting + // appendEscapedStringBytes writes from the current elemBuilder.length position, overwriting // our partial work in the array. This avoids needing a separate "rollback". - upickle.core.RenderUtils.escapeByte( - unicodeCharBuilder, - elemBuilder, - str, - escapeUnicode = false, - wrapQuotes = true - ) + appendEscapedStringBytes(str, escapeUnicode = false) return } arr(pos) = c.toByte @@ -325,8 +313,8 @@ class BaseByteRenderer[T <: java.io.OutputStream]( } /** - * SWAR-accelerated path for long strings. Converts to UTF-8 bytes once, then bulk-copies clean - * chunks and escapes only the bytes that require it. + * SWAR-accelerated path for long strings. Escapable chars are detected before UTF-8 encoding; + * clean non-ASCII strings then encode once and bulk-copy their bytes. * * Probes the string with a SWAR ASCII-safe scan first. When the string is clean printable ASCII * (no escape chars, no non-ASCII), the entire UTF-8 encode pass (HeapCharBuffer.wrap + @@ -339,6 +327,10 @@ class BaseByteRenderer[T <: java.io.OutputStream]( renderAsciiSafeString(str) return } + if (CharSWAR.hasEscapeChar(str)) { + appendEscapedStringBytes(str, escapeUnicode = false) + return + } val bytes = str.getBytes(java.nio.charset.StandardCharsets.UTF_8) val bLen = bytes.length val firstEscape = CharSWAR.findFirstEscapeChar(bytes, 0, bLen) @@ -379,6 +371,60 @@ class BaseByteRenderer[T <: java.io.OutputStream]( } } + private def appendEscapedStringBytes(s: CharSequence, escapeUnicode: Boolean): Unit = { + elemBuilder.append('"') + var i = 0 + var start = 0 + val len = s.length + while (i < len) { + val c = s.charAt(i) + val needsEscape = + c == '"' || c == '\\' || c < 0x20 || (c >= 0x7f && c <= 0x9f) || (escapeUnicode && c > 0x7e) + if (needsEscape) { + appendUtf8Slice(s, start, i) + c match { + case '"' => appendEscapedAsciiByte('"') + case '\\' => appendEscapedAsciiByte('\\') + case '\b' => appendEscapedAsciiByte('b') + case '\f' => appendEscapedAsciiByte('f') + case '\n' => appendEscapedAsciiByte('n') + case '\r' => appendEscapedAsciiByte('r') + case '\t' => appendEscapedAsciiByte('t') + case _ => appendUnicodeEscapeByte(c) + } + start = i + 1 + } + i += 1 + } + appendUtf8Slice(s, start, len) + elemBuilder.append('"') + } + + private def appendUtf8Slice(s: CharSequence, from: Int, to: Int): Unit = { + if (from < to) { + val bytes = s.subSequence(from, to).toString.getBytes(java.nio.charset.StandardCharsets.UTF_8) + elemBuilder.appendAll(bytes, bytes.length) + } + } + + private def appendEscapedAsciiByte(c: Char): Unit = { + elemBuilder.append('\\') + elemBuilder.append(c) + } + + private def appendUnicodeEscapeByte(c: Char): Unit = { + val outPos = elemBuilder.length + elemBuilder.ensureLength(6) + val arr = elemBuilder.arr + arr(outPos) = '\\'.toByte + arr(outPos + 1) = 'u'.toByte + arr(outPos + 2) = BaseByteRenderer.HEX_BYTES((c >> 12) & 0xf) + arr(outPos + 3) = BaseByteRenderer.HEX_BYTES((c >> 8) & 0xf) + arr(outPos + 4) = BaseByteRenderer.HEX_BYTES((c >> 4) & 0xf) + arr(outPos + 5) = BaseByteRenderer.HEX_BYTES(c & 0xf) + elemBuilder.length = outPos + 6 + } + private def escapedStringLength(bytes: Array[Byte], bLen: Int, firstEscape: Int): Int = { var len = bLen + 2 var from = firstEscape diff --git a/sjsonnet/src/sjsonnet/BaseCharRenderer.scala b/sjsonnet/src/sjsonnet/BaseCharRenderer.scala index f75990fd..e901f2b2 100644 --- a/sjsonnet/src/sjsonnet/BaseCharRenderer.scala +++ b/sjsonnet/src/sjsonnet/BaseCharRenderer.scala @@ -34,6 +34,9 @@ object BaseCharRenderer { while (i < 100) { a(i) = ('0' + i % 10).toChar; i += 1 } a } + + private[sjsonnet] val HEX_CHARS: Array[Char] = + Array('0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f') } class BaseCharRenderer[T <: upickle.core.CharOps.Output]( @@ -291,22 +294,53 @@ class BaseCharRenderer[T <: upickle.core.CharOps.Output]( elemBuilder.length = pos + len elemBuilder.appendUnsafe('"') } else { - upickle.core.RenderUtils - .escapeChar(null, elemBuilder, s, escapeUnicode = escapeUnicode, wrapQuotes = true) + appendEscapedString(s, escapeUnicode) } case _ => - upickle.core.RenderUtils.escapeChar( - null, - elemBuilder, - s, - escapeUnicode = escapeUnicode, - wrapQuotes = true - ) + appendEscapedString(s, escapeUnicode) } flushCharBuilder() out } + private def appendEscapedString(s: CharSequence, escapeUnicode: Boolean): Unit = { + elemBuilder.append('"') + var i = 0 + val len = s.length + while (i < len) { + s.charAt(i) match { + case '"' => appendEscapedAscii('"') + case '\\' => appendEscapedAscii('\\') + case '\b' => appendEscapedAscii('b') + case '\f' => appendEscapedAscii('f') + case '\n' => appendEscapedAscii('n') + case '\r' => appendEscapedAscii('r') + case '\t' => appendEscapedAscii('t') + case c => + if (c < ' ' || (c >= 0x7f && c <= 0x9f) || (escapeUnicode && c > '~')) { + appendUnicodeEscape(c) + } else elemBuilder.append(c) + } + i += 1 + } + elemBuilder.append('"') + } + + private def appendEscapedAscii(c: Char): Unit = { + elemBuilder.append('\\') + elemBuilder.append(c) + } + + private def appendUnicodeEscape(c: Char): Unit = { + val hex = BaseCharRenderer.HEX_CHARS + elemBuilder.append('\\') + elemBuilder.append('u') + elemBuilder.append(hex((c >> 12) & 0xf)) + elemBuilder.append(hex((c >> 8) & 0xf)) + elemBuilder.append(hex((c >> 4) & 0xf)) + elemBuilder.append(hex(c & 0xf)) + } + /** * Fast path for [[Val.AsciiSafeStr]]: the string is statically known to contain only chars in * 0x20-0x7E, excluding `"` and `\`. That means no JSON escaping is ever required — not even under diff --git a/sjsonnet/src/sjsonnet/BaseRenderer.scala b/sjsonnet/src/sjsonnet/BaseRenderer.scala index 6afe4f00..ea1a6bc0 100644 --- a/sjsonnet/src/sjsonnet/BaseRenderer.scala +++ b/sjsonnet/src/sjsonnet/BaseRenderer.scala @@ -144,9 +144,11 @@ object BaseRenderer { * collapses the per-character `Writer.write(int)` loop — which on `StringWriter` synchronizes and * bounds-checks per call — into one `System.arraycopy` per safe run, with no upfront pass. * - * "Safe" characters are everything outside `"`, `\`, control chars `< 0x20`, and — when - * `unicode = true` — chars `> 0x7E` (which would otherwise be escaped to `\\uXXXX`). The mapping - * for the unsafe set is identical to the per-char path it replaces. + * "Safe" characters are everything outside `"`, `\`, control chars `< 0x20`, DEL (0x7F), C1 + * control characters (0x80–0x9F), and — when `unicode = true` — chars `> 0x7E` (which would + * otherwise be escaped to `\\uXXXX`). DEL and C1 are always escaped to match go-jsonnet's + * defensive behavior (RFC 8259 only requires U+0000–U+001F). The mapping for the unsafe set is + * identical to the per-char path it replaces. * * Tight, branch-light, charAt-based loop: friendly to JIT inlining (HotSpot, GraalVM) and to * Scala Native's LLVM backend. Common case (ASCII-clean strings used by config and manifest @@ -177,7 +179,9 @@ object BaseRenderer { val c = str.charAt(i) // Inlined classification, mirroring escapeChars below; `<` on a signed char is fine since // chars are unsigned 16-bit; 0x20 / 0x7E comparisons are valid for all values. - if (c == '"' || c == '\\' || c < 0x20 || (unicode && c > 0x7e)) { + // DEL (0x7F) and C1 control characters (0x80–0x9F) are always escaped to match + // go-jsonnet's defensive behavior (RFC 8259 only requires U+0000–U+001F). + if (c == '"' || c == '\\' || c < 0x20 || (c >= 0x7f && c <= 0x9f) || (unicode && c > 0x7e)) { if (i > start) sb.write(str, start, i - start) (c: @switch) match { case '"' => sb.append("\\\"") @@ -213,7 +217,9 @@ object BaseRenderer { case '\r' => sb.append("\\r") case '\t' => sb.append("\\t") case c => - if (c < ' ' || (c > '~' && unicode)) { + // DEL (0x7F) and C1 control characters (0x80–0x9F) are always escaped to match + // go-jsonnet's defensive behavior (RFC 8259 only requires U+0000–U+001F). + if (c < ' ' || (c >= 0x7f && c <= 0x9f) || (c > '~' && unicode)) { sb.append("\\u") .append(toHex((c >> 12) & 15)) .append(toHex((c >> 8) & 15)) diff --git a/sjsonnet/test/resources/new_test_suite/escape_string_unicode.jsonnet b/sjsonnet/test/resources/new_test_suite/escape_string_unicode.jsonnet index 3117cbfb..9d5633c4 100644 --- a/sjsonnet/test/resources/new_test_suite/escape_string_unicode.jsonnet +++ b/sjsonnet/test/resources/new_test_suite/escape_string_unicode.jsonnet @@ -6,4 +6,15 @@ std.assertEqual(std.escapeStringJson("日本語"), "\"日本語\"") && std.assertEqual(std.escapeStringJson("hello é world"), "\"hello é world\"") && std.assertEqual(std.escapeStringPython("é"), "\"é\"") && std.assertEqual(std.escapeStringPython("日本語"), "\"日本語\"") && -std.assertEqual(std.escapeStringPython("hello é world"), "\"hello é world\"") +std.assertEqual(std.escapeStringPython("hello é world"), "\"hello é world\"") && +// DEL (0x7F) and C1 control characters (0x80–0x9F) must be escaped as \uXXXX +// to match go-jsonnet and jrsonnet. +std.assertEqual(std.escapeStringJson(std.char(127)), "\"\\u007f\"") && +std.assertEqual(std.escapeStringJson(std.char(128)), "\"\\u0080\"") && +std.assertEqual(std.escapeStringJson(std.char(159)), "\"\\u009f\"") && +std.assertEqual(std.escapeStringPython(std.char(127)), "\"\\u007f\"") && +std.assertEqual(std.escapeStringPython(std.char(128)), "\"\\u0080\"") && +std.assertEqual(std.escapeStringPython(std.char(159)), "\"\\u009f\"") && +// Characters above 0x9F (NBSP, accented letters, etc.) stay literal. +std.assertEqual(std.escapeStringJson(std.char(160)), "\"\u00a0\"") && +std.assertEqual(std.escapeStringJson(std.char(233)), "\"é\"") diff --git a/sjsonnet/test/src/sjsonnet/RendererTests.scala b/sjsonnet/test/src/sjsonnet/RendererTests.scala index ee69e771..f6b4eb7a 100644 --- a/sjsonnet/test/src/sjsonnet/RendererTests.scala +++ b/sjsonnet/test/src/sjsonnet/RendererTests.scala @@ -97,7 +97,7 @@ object RendererTests extends TestSuite { assert(out.size() < 15000) } - test("byteRendererRepeatedLongAsciiValues") { + def renderByte(expr: String): String = { val interpreter = new Interpreter( Map(), Map(), @@ -105,11 +105,7 @@ object RendererTests extends TestSuite { Importer.empty, parseCache = new DefaultParseCache ) - val value = interpreter.evaluate( - """local s = std.repeat("x", 2048); - |[s, s, s, s, s, s, s, s]""".stripMargin, - DummyPath("(memory)") - ) match { + val value = interpreter.evaluate(expr, DummyPath("(memory)")) match { case Right(v) => v case Left(err) => throw new Exception(Error.formatError(err)) } @@ -118,11 +114,38 @@ object RendererTests extends TestSuite { case Left(err) => throw new Exception(Error.formatError(err)) case Right(_) => } - val rendered = new String(out.toByteArray, java.nio.charset.StandardCharsets.UTF_8) + new String(out.toByteArray, java.nio.charset.StandardCharsets.UTF_8) + } + + test("byteRendererRepeatedLongAsciiValues") { + val rendered = renderByte( + """local s = std.repeat("x", 2048); + |[s, s, s, s, s, s, s, s]""".stripMargin + ) val elem = "\"" + ("x" * 2048) + "\"" rendered ==> "[" + Array.fill(8)(elem).mkString(", ") + "]" } + test("byteRendererEscapesDelAndC1Controls") { + renderByte("""[std.char(127), std.char(128), std.char(159)]""") ==> + "[\"\\u007f\", \"\\u0080\", \"\\u009f\"]" + } + + test("byteRendererPreservesNonAsciiLiterals") { + renderByte("""[std.char(160), "é", std.repeat("é", 200)]""") ==> + "[\"\u00a0\", \"é\", \"" + ("é" * 200) + "\"]" + } + + test("byteRendererLongC1Controls") { + renderByte("""std.repeat(std.char(128), 130)""") ==> "\"" + ("\\u0080" * 130) + "\"" + } + + test("charRendererEscapesDelAndC1Controls") { + ujson.transform(ujson.Str("\u007f"), new Renderer()).toString ==> "\"\\u007f\"" + ujson.transform(ujson.Str("\u0080"), new Renderer()).toString ==> "\"\\u0080\"" + ujson.transform(ujson.Str("\u009f"), new Renderer()).toString ==> "\"\\u009f\"" + } + test("indentZero") { // indent=0 should produce newlines but no spaces ujson.transform(ujson.Arr(1, 2), new Renderer(indent = 0)).toString ==> @@ -168,13 +191,22 @@ object RendererTests extends TestSuite { escape(" ", unicode = true) ==> "\" \"" escape("~", unicode = true) ==> "\"~\"" - // 0x7F (DEL): escaped under unicode=true, but passes through under unicode=false. + // DEL (0x7F) and C1 control characters (0x80-0x9F) are now always escaped as a + // \\uNNNN sequence (regardless of the unicode flag) to match go-jsonnet and + // C++ jsonnet's defensive behavior. RFC 8259 only requires U+0000-U+001F to be + // escaped, but the 0x7F..0x9F range is treated the same way by the reference + // implementations. escape("\u007f", unicode = true) ==> "\"\\u007f\"" - escape("\u007f", unicode = false) ==> "\"\u007f\"" + escape("\u007f", unicode = false) ==> "\"\\u007f\"" + escape("\u0080", unicode = false) ==> "\"\\u0080\"" + escape("\u009f", unicode = false) ==> "\"\\u009f\"" + escape("\u009f", unicode = true) ==> "\"\\u009f\"" - // Higher BMP: \u00ff escaped under unicode=true, passes through under unicode=false. + // Higher BMP (>= 0xA0): \\u00ff escaped under unicode=true, passes through under + // unicode=false. NBSP (0xA0) and accented letters fall into this category. escape("\u00ff", unicode = true) ==> "\"\\u00ff\"" escape("\u00ff", unicode = false) ==> "\"\u00ff\"" + escape("\u00a0", unicode = false) ==> "\"\u00a0\"" // U+2028 / U+2029 (JS-specific line separators) — pinned to current behaviour: escaped // only when unicode=true. Old per-char path behaved the same way.