@@ -51,9 +50,6 @@ private CharSWAR() {}
/** Mask for bits 5-7 of each byte; zero result means byte < 32. */
private static final long CTRL = 0xE0E0_E0E0_E0E0_E0E0L;
- /** Below this length, scalar charAt is faster than SWAR + byte[] conversion. */
- private static final int SWAR_THRESHOLD = 128;
-
private static final long U16_HOLE = 0x7FFF_7FFF_7FFF_7FFFL;
private static final long U16_QUOTE = 0x0022_0022_0022_0022L;
private static final long U16_BSLAS = 0x005C_005C_005C_005CL;
@@ -66,21 +62,14 @@ private CharSWAR() {}
*/
static boolean hasEscapeChar(String str) {
int len = str.length();
- if (len < SWAR_THRESHOLD) {
- return hasEscapeCharScalar(str, len);
- }
- // ISO-8859-1 encoding is a JVM intrinsic for LATIN1 compact strings —
- // essentially a memcpy of the internal byte[]. Chars > 255 map to '?'
- // (0x3F), which is safe (not a control char, not '"', not '\\').
- byte[] bytes = str.getBytes(StandardCharsets.ISO_8859_1);
- return hasEscapeCharSWAR(bytes, 0, bytes.length);
+ return hasEscapeCharScalar(str, len);
}
/**
* Check if any byte in {@code arr[from..to)} needs JSON string escaping.
- * Used by ByteRenderer for in-place SWAR scan on byte[] buffers.
- * UTF-8 multi-byte sequences never produce bytes matching '"', '\\', or < 0x20,
- * so this is safe for scanning UTF-8 encoded data.
+ * Used by ByteRenderer for in-place SWAR scan on UTF-8 byte[] buffers. UTF-8 multi-byte
+ * sequences can contain high-bit bytes, but those are data bytes, not JSON escapes; callers that
+ * need C1 detection must scan the original chars before encoding.
*/
static boolean hasEscapeChar(byte[] arr, int from, int to) {
return hasEscapeCharSWAR(arr, from, to);
@@ -116,7 +105,7 @@ static boolean isAsciiJsonSafe(String str, int from, int to) {
}
while (i < to) {
char c = str.charAt(i);
- if (c < 32 || c == '"' || c == '\\' || c >= 128) return false;
+ if (c < 32 || c == '"' || c == '\\' || c >= 0x7F) return false;
i++;
}
return true;
@@ -128,7 +117,7 @@ static boolean isAsciiJsonSafe(String str, int from, int to) {
static boolean hasEscapeChar(char[] arr, int from, int to) {
for (int i = from; i < to; i++) {
char c = arr[i];
- if (c < 32 || c == '"' || c == '\\') return true;
+ if (c < 32 || c == '"' || c == '\\' || (c >= 0x7F && c <= 0x9F)) return true;
}
return false;
}
@@ -150,7 +139,7 @@ static int findFirstEscapeChar(byte[] arr, int from, int to) {
}
while (i < to) {
int b = arr[i] & 0xFF;
- if (b < 32 || b == '"' || b == '\\') return i;
+ if (b < 32 || b == '"' || b == '\\' || b == 0x7F) return i;
i++;
}
return -1;
@@ -167,18 +156,18 @@ private static boolean hasEscapeCharSWAR(byte[] arr, int from, int to) {
// Tail: remaining 0-7 bytes
while (i < to) {
int b = arr[i] & 0xFF;
- if (b < 32 || b == '"' || b == '\\') return true;
+ if (b < 32 || b == '"' || b == '\\' || b == 0x7F) return true;
i++;
}
return false;
}
/**
- * 8-bit SWAR: returns true if any byte lane in {@code word}
- * contains '"' (0x22), '\\' (0x5C), or a control char (< 0x20).
+ * 8-bit SWAR mask for bytes requiring JSON escaping: control chars ({@code < 0x20}),
+ * double-quote ({@code '"'}), backslash ({@code '\\'}), or DEL ({@code 0x7F}).
*
- * Uses Netty/Pekko pattern: XOR to produce zero lanes, then
- * Hacker's Delight formula to detect zero bytes.
+ * <p>Uses Netty/Pekko pattern: XOR to produce zero lanes, then Hacker's Delight formula to
+ * detect zero bytes.
*/
private static long swarMatchMask(long word) {
// 1. Detect '"' via XOR + zero-detection (Netty SWARUtil.applyPattern)
@@ -193,7 +182,11 @@ private static long swarMatchMask(long word) {
long c = word & CTRL;
long cz = ~((c & HOLE) + HOLE | c | HOLE);
- return qz | bz | cz;
+ // 4. Detect DEL (0x7F) via XOR + zero-detection (HOLE == DEL broadcast pattern)
+ long d = word ^ HOLE;
+ long dz = ~((d & HOLE) + HOLE | d | HOLE);
+
+ return qz | bz | cz | dz;
}
private static int firstMatchedByte(long mask) {
@@ -202,13 +195,16 @@ private static int firstMatchedByte(long mask) {
: Long.numberOfLeadingZeros(mask)) >>> 3;
}
+ private static final long U16_DEL = 0x007F_007F_007F_007FL;
+
private static boolean swarHasUnsafeAsciiChar(long word) {
if ((word & U16_ASCII) != 0L) return true;
long qz = zero16(word ^ U16_QUOTE);
long bz = zero16(word ^ U16_BSLAS);
long cz = zero16(word & U16_CTRL);
- return (qz | bz | cz) != 0L;
+ long dz = zero16(word ^ U16_DEL);
+ return (qz | bz | cz | dz) != 0L;
}
private static long zero16(long word) {
@@ -219,7 +215,7 @@ private static long zero16(long word) {
private static boolean hasEscapeCharScalar(String s, int len) {
for (int i = 0; i < len; i++) {
char c = s.charAt(i);
- if (c < 32 || c == '"' || c == '\\') return true;
+ if (c < 32 || c == '"' || c == '\\' || (c >= 0x7F && c <= 0x9F)) return true;
}
return false;
}
@@ -227,7 +223,7 @@ private static boolean hasEscapeCharScalar(String s, int len) {
private static boolean isAsciiJsonSafeScalar(String s, int from, int to) {
for (int i = from; i < to; i++) {
char c = s.charAt(i);
- if (c < 32 || c == '"' || c == '\\' || c >= 128) return false;
+ if (c < 32 || c == '"' || c == '\\' || c >= 0x7F) return false;
}
return true;
}
diff --git a/sjsonnet/src-native/sjsonnet/CharSWAR.scala b/sjsonnet/src-native/sjsonnet/CharSWAR.scala
index 63cde049..b2b55e9f 100644
--- a/sjsonnet/src-native/sjsonnet/CharSWAR.scala
+++ b/sjsonnet/src-native/sjsonnet/CharSWAR.scala
@@ -8,8 +8,8 @@ import scala.scalanative.runtime.{ByteArray, Intrinsics}
* Uses Scala Native's `Intrinsics.loadLong` + `ByteArray.atRawUnsafe` for zero-overhead 8-byte bulk
* reads directly from Array[Byte] memory, matching the JVM VarHandle SWAR performance.
*
- * For String scanning, uses `getBytes(UTF-8)` + byte[] SWAR. On Scala Native compact strings are
- * UTF-16, so converting to bytes first is necessary.
+ * String scans use char-level semantics. Byte-array scans process 8 bytes at a time using
+ * `Intrinsics.loadLong`.
*
* Inspired by netty's SWARUtil (io.netty.util.SWARUtil) and Hacker's Delight Ch. 6 zero-detection
* formula.
@@ -28,10 +28,11 @@ object CharSWAR {
private final val U16_BSLAS = 0x005c005c005c005cL
private final val U16_CTRL = 0xffe0ffe0ffe0ffe0L
private final val U16_ASCII = 0xff80ff80ff80ff80L
+ private final val U16_DEL = 0x007f007f007f007fL
/**
- * SWAR: returns a mask for byte lanes in `word` containing '"' (0x22), '\\' (0x5C), or a control
- * char (< 0x20).
+ * SWAR: returns a mask for byte lanes in `word` containing '"' (0x22), '\\' (0x5C), a control
+ * char (< 0x20), or DEL (0x7F).
*/
@inline private def swarMatchMask(word: Long): Long = {
// 1. Detect '"' via XOR + zero-detection
@@ -46,7 +47,11 @@ object CharSWAR {
val c = word & CTRL
val cz = ~((c & HOLE) + HOLE | c | HOLE)
- qz | bz | cz
+ // 4. Detect DEL (0x7F) via XOR + zero-detection (HOLE == DEL broadcast pattern)
+ val d = word ^ HOLE
+ val dz = ~((d & HOLE) + HOLE | d | HOLE)
+
+ qz | bz | cz | dz
}
@inline private def firstMatchedByte(mask: Long): Int =
@@ -55,19 +60,14 @@ object CharSWAR {
def hasEscapeChar(s: String): Boolean = {
val len = s.length
- if (len < 128) {
- hasEscapeCharScalar(s, len)
- } else {
- val bytes = s.getBytes(java.nio.charset.StandardCharsets.UTF_8)
- hasEscapeChar(bytes, 0, bytes.length)
- }
+ hasEscapeCharScalar(s, len)
}
def hasEscapeChar(arr: Array[Char], from: Int, to: Int): Boolean = {
var i = from
while (i < to) {
val c = arr(i)
- if (c < 32 || c == '"' || c == '\\') return true
+ if (c < 32 || c == '"' || c == '\\' || (c >= 0x7f && c <= 0x9f)) return true
i += 1
}
false
@@ -92,7 +92,7 @@ object CharSWAR {
}
while (i < to) {
val c = s.charAt(i)
- if (c < 32 || c == '"' || c == '\\' || c >= 128) return false
+ if (c < 32 || c == '"' || c == '\\' || c >= 0x7f) return false
i += 1
}
true
@@ -100,8 +100,9 @@ object CharSWAR {
/**
* SWAR scan for byte[] using Intrinsics.loadLong for zero-overhead bulk reads. Processes 8 bytes
- * per iteration — same throughput as the JVM VarHandle path. UTF-8 multi-byte sequences never
- * produce bytes matching '"', '\', or < 0x20.
+ * per iteration — same throughput as the JVM VarHandle path. UTF-8 multi-byte sequences can
+ * contain high-bit bytes, but those are data bytes, not JSON escapes; callers that need C1
+ * detection must scan the original chars before encoding.
*/
def hasEscapeChar(arr: Array[Byte], from: Int, to: Int): Boolean = {
val len = to - from
@@ -119,7 +120,7 @@ object CharSWAR {
// Tail: remaining 0-7 bytes
while (i < to) {
val b = arr(i) & 0xff
- if (b < 32 || b == '"' || b == '\\') return true
+ if (b < 32 || b == '"' || b == '\\' || b == 0x7f) return true
i += 1
}
false
@@ -141,7 +142,7 @@ object CharSWAR {
}
while (i < to) {
val b = arr(i) & 0xff
- if (b < 32 || b == '"' || b == '\\') return i
+ if (b < 32 || b == '"' || b == '\\' || b == 0x7f) return i
i += 1
}
-1
@@ -151,7 +152,7 @@ object CharSWAR {
var i = 0
while (i < len) {
val c = s.charAt(i)
- if (c < 32 || c == '"' || c == '\\') return true
+ if (c < 32 || c == '"' || c == '\\' || (c >= 0x7f && c <= 0x9f)) return true
i += 1
}
false
@@ -162,7 +163,8 @@ object CharSWAR {
val qz = zero16(word ^ U16_QUOTE)
val bz = zero16(word ^ U16_BSLAS)
val cz = zero16(word & U16_CTRL)
- (qz | bz | cz) != 0L
+ val dz = zero16(word ^ U16_DEL)
+ (qz | bz | cz | dz) != 0L
}
@inline private def zero16(word: Long): Long =
@@ -172,7 +174,7 @@ object CharSWAR {
var i = from
while (i < to) {
val c = s.charAt(i)
- if (c < 32 || c == '"' || c == '\\' || c >= 128) return false
+ if (c < 32 || c == '"' || c == '\\' || c >= 0x7f) return false
i += 1
}
true
@@ -182,7 +184,7 @@ object CharSWAR {
var i = from
while (i < to) {
val b = arr(i) & 0xff
- if (b < 32 || b == '"' || b == '\\') return true
+ if (b < 32 || b == '"' || b == '\\' || b == 0x7f) return true
i += 1
}
false
@@ -192,7 +194,7 @@ object CharSWAR {
var i = from
while (i < to) {
val b = arr(i) & 0xff
- if (b < 32 || b == '"' || b == '\\') return i
+ if (b < 32 || b == '"' || b == '\\' || b == 0x7f) return i
i += 1
}
-1
diff --git a/sjsonnet/src/sjsonnet/BaseByteRenderer.scala b/sjsonnet/src/sjsonnet/BaseByteRenderer.scala
index 5389f1a8..a1bcb14f 100644
--- a/sjsonnet/src/sjsonnet/BaseByteRenderer.scala
+++ b/sjsonnet/src/sjsonnet/BaseByteRenderer.scala
@@ -14,7 +14,7 @@ import upickle.core.{ArrVisitor, ObjVisitor, Visitor}
*
* String rendering uses a two-tier strategy:
* - Short strings (< 128 chars): fused encode+check loop, zero allocation
- * - Long strings (>= 128 chars): getBytes(UTF-8) + SWAR bulk scan + arraycopy
+ * - Long strings (>= 128 chars): char-level escape check, then UTF-8 bytes + SWAR bulk scan
*/
class BaseByteRenderer[T <: java.io.OutputStream](
out: T,
@@ -230,13 +230,7 @@ class BaseByteRenderer[T <: java.io.OutputStream](
if (len < 128) visitShortString(str, len)
else visitLongString(str)
case _ =>
- upickle.core.RenderUtils.escapeByte(
- unicodeCharBuilder,
- elemBuilder,
- s,
- escapeUnicode = escapeUnicode,
- wrapQuotes = true
- )
+ appendEscapedStringBytes(s, escapeUnicode)
}
flushByteBuilder()
out
@@ -285,7 +279,7 @@ class BaseByteRenderer[T <: java.io.OutputStream](
* Zero-allocation fast path for short ASCII strings (the vast majority of JSON keys/values). Uses
* getChars to bulk-copy into a reusable char buffer, then scans the buffer directly (avoiding
* per-char String.charAt virtual dispatch). If any char needs escaping or is non-ASCII, falls
- * back to escapeByte.
+ * back to appendEscapedStringBytes.
*/
private def visitShortString(str: String, len: Int): Unit = {
// Reuse unicodeCharBuilder's array as temp char buffer (no allocation after warmup)
@@ -302,18 +296,12 @@ class BaseByteRenderer[T <: java.io.OutputStream](
var i = 0
while (i < len) {
val c = chars(i)
- if (c < 0x20 || c == '"' || c == '\\' || c >= 0x80) {
+ if (c < 0x20 || c == '"' || c == '\\' || c >= 0x7f) {
// DO NOT CHANGE
// WHY: elemBuilder.length is intentionally NOT updated before this call.
- // escapeByte writes from the current elemBuilder.length position, overwriting
+ // appendEscapedStringBytes writes from the current elemBuilder.length position, overwriting
// our partial work in the array. This avoids needing a separate "rollback".
- upickle.core.RenderUtils.escapeByte(
- unicodeCharBuilder,
- elemBuilder,
- str,
- escapeUnicode = false,
- wrapQuotes = true
- )
+ appendEscapedStringBytes(str, escapeUnicode = false)
return
}
arr(pos) = c.toByte
@@ -325,8 +313,8 @@ class BaseByteRenderer[T <: java.io.OutputStream](
}
/**
- * SWAR-accelerated path for long strings. Converts to UTF-8 bytes once, then bulk-copies clean
- * chunks and escapes only the bytes that require it.
+ * SWAR-accelerated path for long strings. Escapable chars are detected before UTF-8 encoding;
+ * clean non-ASCII strings then encode once and bulk-copy their bytes.
*
* Probes the string with a SWAR ASCII-safe scan first. When the string is clean printable ASCII
* (no escape chars, no non-ASCII), the entire UTF-8 encode pass (HeapCharBuffer.wrap +
@@ -339,6 +327,10 @@ class BaseByteRenderer[T <: java.io.OutputStream](
renderAsciiSafeString(str)
return
}
+ if (CharSWAR.hasEscapeChar(str)) {
+ appendEscapedStringBytes(str, escapeUnicode = false)
+ return
+ }
val bytes = str.getBytes(java.nio.charset.StandardCharsets.UTF_8)
val bLen = bytes.length
val firstEscape = CharSWAR.findFirstEscapeChar(bytes, 0, bLen)
@@ -379,6 +371,60 @@ class BaseByteRenderer[T <: java.io.OutputStream](
}
}
+ private def appendEscapedStringBytes(s: CharSequence, escapeUnicode: Boolean): Unit = {
+ elemBuilder.append('"')
+ var i = 0
+ var start = 0
+ val len = s.length
+ while (i < len) {
+ val c = s.charAt(i)
+ val needsEscape =
+ c == '"' || c == '\\' || c < 0x20 || (c >= 0x7f && c <= 0x9f) || (escapeUnicode && c > 0x7e)
+ if (needsEscape) {
+ appendUtf8Slice(s, start, i)
+ c match {
+ case '"' => appendEscapedAsciiByte('"')
+ case '\\' => appendEscapedAsciiByte('\\')
+ case '\b' => appendEscapedAsciiByte('b')
+ case '\f' => appendEscapedAsciiByte('f')
+ case '\n' => appendEscapedAsciiByte('n')
+ case '\r' => appendEscapedAsciiByte('r')
+ case '\t' => appendEscapedAsciiByte('t')
+ case _ => appendUnicodeEscapeByte(c)
+ }
+ start = i + 1
+ }
+ i += 1
+ }
+ appendUtf8Slice(s, start, len)
+ elemBuilder.append('"')
+ }
+
+ private def appendUtf8Slice(s: CharSequence, from: Int, to: Int): Unit = {
+ if (from < to) {
+ val bytes = s.subSequence(from, to).toString.getBytes(java.nio.charset.StandardCharsets.UTF_8)
+ elemBuilder.appendAll(bytes, bytes.length)
+ }
+ }
+
+ private def appendEscapedAsciiByte(c: Char): Unit = {
+ elemBuilder.append('\\')
+ elemBuilder.append(c)
+ }
+
+ private def appendUnicodeEscapeByte(c: Char): Unit = {
+ val outPos = elemBuilder.length
+ elemBuilder.ensureLength(6)
+ val arr = elemBuilder.arr
+ arr(outPos) = '\\'.toByte
+ arr(outPos + 1) = 'u'.toByte
+ arr(outPos + 2) = BaseByteRenderer.HEX_BYTES((c >> 12) & 0xf)
+ arr(outPos + 3) = BaseByteRenderer.HEX_BYTES((c >> 8) & 0xf)
+ arr(outPos + 4) = BaseByteRenderer.HEX_BYTES((c >> 4) & 0xf)
+ arr(outPos + 5) = BaseByteRenderer.HEX_BYTES(c & 0xf)
+ elemBuilder.length = outPos + 6
+ }
+
private def escapedStringLength(bytes: Array[Byte], bLen: Int, firstEscape: Int): Int = {
var len = bLen + 2
var from = firstEscape
diff --git a/sjsonnet/src/sjsonnet/BaseCharRenderer.scala b/sjsonnet/src/sjsonnet/BaseCharRenderer.scala
index f75990fd..e901f2b2 100644
--- a/sjsonnet/src/sjsonnet/BaseCharRenderer.scala
+++ b/sjsonnet/src/sjsonnet/BaseCharRenderer.scala
@@ -34,6 +34,9 @@ object BaseCharRenderer {
while (i < 100) { a(i) = ('0' + i % 10).toChar; i += 1 }
a
}
+
+ private[sjsonnet] val HEX_CHARS: Array[Char] =
+ Array('0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f')
}
class BaseCharRenderer[T <: upickle.core.CharOps.Output](
@@ -291,22 +294,53 @@ class BaseCharRenderer[T <: upickle.core.CharOps.Output](
elemBuilder.length = pos + len
elemBuilder.appendUnsafe('"')
} else {
- upickle.core.RenderUtils
- .escapeChar(null, elemBuilder, s, escapeUnicode = escapeUnicode, wrapQuotes = true)
+ appendEscapedString(s, escapeUnicode)
}
case _ =>
- upickle.core.RenderUtils.escapeChar(
- null,
- elemBuilder,
- s,
- escapeUnicode = escapeUnicode,
- wrapQuotes = true
- )
+ appendEscapedString(s, escapeUnicode)
}
flushCharBuilder()
out
}
+ private def appendEscapedString(s: CharSequence, escapeUnicode: Boolean): Unit = {
+ elemBuilder.append('"')
+ var i = 0
+ val len = s.length
+ while (i < len) {
+ s.charAt(i) match {
+ case '"' => appendEscapedAscii('"')
+ case '\\' => appendEscapedAscii('\\')
+ case '\b' => appendEscapedAscii('b')
+ case '\f' => appendEscapedAscii('f')
+ case '\n' => appendEscapedAscii('n')
+ case '\r' => appendEscapedAscii('r')
+ case '\t' => appendEscapedAscii('t')
+ case c =>
+ if (c < ' ' || (c >= 0x7f && c <= 0x9f) || (escapeUnicode && c > '~')) {
+ appendUnicodeEscape(c)
+ } else elemBuilder.append(c)
+ }
+ i += 1
+ }
+ elemBuilder.append('"')
+ }
+
+ private def appendEscapedAscii(c: Char): Unit = {
+ elemBuilder.append('\\')
+ elemBuilder.append(c)
+ }
+
+ private def appendUnicodeEscape(c: Char): Unit = {
+ val hex = BaseCharRenderer.HEX_CHARS
+ elemBuilder.append('\\')
+ elemBuilder.append('u')
+ elemBuilder.append(hex((c >> 12) & 0xf))
+ elemBuilder.append(hex((c >> 8) & 0xf))
+ elemBuilder.append(hex((c >> 4) & 0xf))
+ elemBuilder.append(hex(c & 0xf))
+ }
+
/**
* Fast path for [[Val.AsciiSafeStr]]: the string is statically known to contain only chars in
* 0x20-0x7E, excluding `"` and `\`. That means no JSON escaping is ever required — not even under
diff --git a/sjsonnet/src/sjsonnet/BaseRenderer.scala b/sjsonnet/src/sjsonnet/BaseRenderer.scala
index 6afe4f00..ea1a6bc0 100644
--- a/sjsonnet/src/sjsonnet/BaseRenderer.scala
+++ b/sjsonnet/src/sjsonnet/BaseRenderer.scala
@@ -144,9 +144,11 @@ object BaseRenderer {
* collapses the per-character `Writer.write(int)` loop — which on `StringWriter` synchronizes and
* bounds-checks per call — into one `System.arraycopy` per safe run, with no upfront pass.
*
- * "Safe" characters are everything outside `"`, `\`, control chars `< 0x20`, and — when
- * `unicode = true` — chars `> 0x7E` (which would otherwise be escaped to `\\uXXXX`). The mapping
- * for the unsafe set is identical to the per-char path it replaces.
+ * "Safe" characters are everything outside `"`, `\`, control chars `< 0x20`, DEL (0x7F), C1
+ * control characters (0x80–0x9F), and — when `unicode = true` — chars `> 0x7E` (which would
+ * otherwise be escaped to `\\uXXXX`). DEL and C1 are always escaped to match go-jsonnet's
+ * defensive behavior (RFC 8259 only requires U+0000–U+001F). The mapping for the unsafe set is
+ * identical to the per-char path it replaces.
*
* Tight, branch-light, charAt-based loop: friendly to JIT inlining (HotSpot, GraalVM) and to
* Scala Native's LLVM backend. Common case (ASCII-clean strings used by config and manifest
@@ -177,7 +179,9 @@ object BaseRenderer {
val c = str.charAt(i)
// Inlined classification, mirroring escapeChars below; `<` on a signed char is fine since
// chars are unsigned 16-bit; 0x20 / 0x7E comparisons are valid for all values.
- if (c == '"' || c == '\\' || c < 0x20 || (unicode && c > 0x7e)) {
+ // DEL (0x7F) and C1 control characters (0x80–0x9F) are always escaped to match
+ // go-jsonnet's defensive behavior (RFC 8259 only requires U+0000–U+001F).
+ if (c == '"' || c == '\\' || c < 0x20 || (c >= 0x7f && c <= 0x9f) || (unicode && c > 0x7e)) {
if (i > start) sb.write(str, start, i - start)
(c: @switch) match {
case '"' => sb.append("\\\"")
@@ -213,7 +217,9 @@ object BaseRenderer {
case '\r' => sb.append("\\r")
case '\t' => sb.append("\\t")
case c =>
- if (c < ' ' || (c > '~' && unicode)) {
+ // DEL (0x7F) and C1 control characters (0x80–0x9F) are always escaped to match
+ // go-jsonnet's defensive behavior (RFC 8259 only requires U+0000–U+001F).
+ if (c < ' ' || (c >= 0x7f && c <= 0x9f) || (c > '~' && unicode)) {
sb.append("\\u")
.append(toHex((c >> 12) & 15))
.append(toHex((c >> 8) & 15))
diff --git a/sjsonnet/test/resources/new_test_suite/escape_string_unicode.jsonnet b/sjsonnet/test/resources/new_test_suite/escape_string_unicode.jsonnet
index 3117cbfb..9d5633c4 100644
--- a/sjsonnet/test/resources/new_test_suite/escape_string_unicode.jsonnet
+++ b/sjsonnet/test/resources/new_test_suite/escape_string_unicode.jsonnet
@@ -6,4 +6,15 @@ std.assertEqual(std.escapeStringJson("日本語"), "\"日本語\"") &&
std.assertEqual(std.escapeStringJson("hello é world"), "\"hello é world\"") &&
std.assertEqual(std.escapeStringPython("é"), "\"é\"") &&
std.assertEqual(std.escapeStringPython("日本語"), "\"日本語\"") &&
-std.assertEqual(std.escapeStringPython("hello é world"), "\"hello é world\"")
+std.assertEqual(std.escapeStringPython("hello é world"), "\"hello é world\"") &&
+// DEL (0x7F) and C1 control characters (0x80–0x9F) must be escaped as \uXXXX
+// to match go-jsonnet and jrsonnet.
+std.assertEqual(std.escapeStringJson(std.char(127)), "\"\\u007f\"") &&
+std.assertEqual(std.escapeStringJson(std.char(128)), "\"\\u0080\"") &&
+std.assertEqual(std.escapeStringJson(std.char(159)), "\"\\u009f\"") &&
+std.assertEqual(std.escapeStringPython(std.char(127)), "\"\\u007f\"") &&
+std.assertEqual(std.escapeStringPython(std.char(128)), "\"\\u0080\"") &&
+std.assertEqual(std.escapeStringPython(std.char(159)), "\"\\u009f\"") &&
+// Characters above 0x9F (NBSP, accented letters, etc.) stay literal.
+std.assertEqual(std.escapeStringJson(std.char(160)), "\"\u00a0\"") &&
+std.assertEqual(std.escapeStringJson(std.char(233)), "\"é\"")
diff --git a/sjsonnet/test/src/sjsonnet/RendererTests.scala b/sjsonnet/test/src/sjsonnet/RendererTests.scala
index ee69e771..f6b4eb7a 100644
--- a/sjsonnet/test/src/sjsonnet/RendererTests.scala
+++ b/sjsonnet/test/src/sjsonnet/RendererTests.scala
@@ -97,7 +97,7 @@ object RendererTests extends TestSuite {
assert(out.size() < 15000)
}
- test("byteRendererRepeatedLongAsciiValues") {
+ def renderByte(expr: String): String = {
val interpreter = new Interpreter(
Map(),
Map(),
@@ -105,11 +105,7 @@ object RendererTests extends TestSuite {
Importer.empty,
parseCache = new DefaultParseCache
)
- val value = interpreter.evaluate(
- """local s = std.repeat("x", 2048);
- |[s, s, s, s, s, s, s, s]""".stripMargin,
- DummyPath("(memory)")
- ) match {
+ val value = interpreter.evaluate(expr, DummyPath("(memory)")) match {
case Right(v) => v
case Left(err) => throw new Exception(Error.formatError(err))
}
@@ -118,11 +114,38 @@ object RendererTests extends TestSuite {
case Left(err) => throw new Exception(Error.formatError(err))
case Right(_) =>
}
- val rendered = new String(out.toByteArray, java.nio.charset.StandardCharsets.UTF_8)
+ new String(out.toByteArray, java.nio.charset.StandardCharsets.UTF_8)
+ }
+
+ test("byteRendererRepeatedLongAsciiValues") {
+ val rendered = renderByte(
+ """local s = std.repeat("x", 2048);
+ |[s, s, s, s, s, s, s, s]""".stripMargin
+ )
val elem = "\"" + ("x" * 2048) + "\""
rendered ==> "[" + Array.fill(8)(elem).mkString(", ") + "]"
}
+ test("byteRendererEscapesDelAndC1Controls") {
+ renderByte("""[std.char(127), std.char(128), std.char(159)]""") ==>
+ "[\"\\u007f\", \"\\u0080\", \"\\u009f\"]"
+ }
+
+ test("byteRendererPreservesNonAsciiLiterals") {
+ renderByte("""[std.char(160), "é", std.repeat("é", 200)]""") ==>
+ "[\"\u00a0\", \"é\", \"" + ("é" * 200) + "\"]"
+ }
+
+ test("byteRendererLongC1Controls") {
+ renderByte("""std.repeat(std.char(128), 130)""") ==> "\"" + ("\\u0080" * 130) + "\""
+ }
+
+ test("charRendererEscapesDelAndC1Controls") {
+ ujson.transform(ujson.Str("\u007f"), new Renderer()).toString ==> "\"\\u007f\""
+ ujson.transform(ujson.Str("\u0080"), new Renderer()).toString ==> "\"\\u0080\""
+ ujson.transform(ujson.Str("\u009f"), new Renderer()).toString ==> "\"\\u009f\""
+ }
+
test("indentZero") {
// indent=0 should produce newlines but no spaces
ujson.transform(ujson.Arr(1, 2), new Renderer(indent = 0)).toString ==>
@@ -168,13 +191,22 @@ object RendererTests extends TestSuite {
escape(" ", unicode = true) ==> "\" \""
escape("~", unicode = true) ==> "\"~\""
- // 0x7F (DEL): escaped under unicode=true, but passes through under unicode=false.
+ // DEL (0x7F) and C1 control characters (0x80-0x9F) are now always escaped as a
+ // \\uNNNN sequence (regardless of the unicode flag) to match go-jsonnet and
+ // C++ jsonnet's defensive behavior. RFC 8259 only requires U+0000-U+001F to be
+ // escaped, but the 0x7F..0x9F range is treated the same way by the reference
+ // implementations.
escape("\u007f", unicode = true) ==> "\"\\u007f\""
- escape("\u007f", unicode = false) ==> "\"\u007f\""
+ escape("\u007f", unicode = false) ==> "\"\\u007f\""
+ escape("\u0080", unicode = false) ==> "\"\\u0080\""
+ escape("\u009f", unicode = false) ==> "\"\\u009f\""
+ escape("\u009f", unicode = true) ==> "\"\\u009f\""
- // Higher BMP: \u00ff escaped under unicode=true, passes through under unicode=false.
+ // Higher BMP (>= 0xA0): \\u00ff escaped under unicode=true, passes through under
+ // unicode=false. NBSP (0xA0) and accented letters fall into this category.
escape("\u00ff", unicode = true) ==> "\"\\u00ff\""
escape("\u00ff", unicode = false) ==> "\"\u00ff\""
+ escape("\u00a0", unicode = false) ==> "\"\u00a0\""
// U+2028 / U+2029 (JS-specific line separators) — pinned to current behaviour: escaped
// only when unicode=true. Old per-char path behaved the same way.