@@ -9,6 +9,7 @@ package org.dexpace.sdk.core.http.pipeline.steps
99
1010import org.dexpace.sdk.core.http.common.MediaType
1111import java.nio.charset.Charset
12+ import java.util.concurrent.ConcurrentHashMap
1213
1314/* *
1415 * Renders captured request/response body bytes into a short, log-safe preview string.
@@ -23,8 +24,10 @@ import java.nio.charset.Charset
2324 * names one the JVM does not recognise, the preview falls back to [DEFAULT_CHARSET] (UTF-8),
2425 * matching the HTTP default for most text types and the previous behaviour.
2526 * - **Binary-safe** — a body that does not look like text is never decoded. Instead it is
26- * summarised as `[binary N bytes]`, so an operator sees a stable, greppable marker rather
27- * than a line of replacement characters that can corrupt log viewers.
27+ * summarised as `[binary N bytes captured]`, so an operator sees a stable, greppable marker
28+ * rather than a line of replacement characters that can corrupt log viewers. `N` is the number
29+ * of bytes in the (already bounded) preview snapshot — i.e. how much was captured, not
30+ * necessarily the full body length — so the summary cannot imply a size it didn't observe.
2831 *
2932 * Decoding itself never throws: [Charset]-based decoding substitutes the replacement character
3033 * for malformed input, so a snapshot that ends mid-multibyte-sequence (a real possibility on a
@@ -38,8 +41,9 @@ import java.nio.charset.Charset
3841 * first: when the [MediaType] explicitly declares a *resolvable* charset whose encoding is not
3942 * NUL-free for ASCII (UTF-16/UTF-32 and friends), the body is decoded with that charset and the
4043 * NUL-based heuristic is skipped. When no charset is declared, or the declared one is single-byte
41- * (US-ASCII, ISO-8859-1, UTF-8), the heuristic still runs so a genuinely binary body is summarised
42- * rather than decoded into noise.
44+ * (US-ASCII, ISO-8859-1, UTF-8), the heuristic still runs so a genuinely binary body is
45+ * summarised rather than decoded into noise. The wide-charset probe is memoised per [Charset], so
46+ * the cost is a single small encode the first time a charset is seen and a map lookup thereafter.
4347 *
4448 * ## Best-effort detection
4549 * The heuristic is **best-effort**, not a guarantee. It samples only the first
@@ -87,8 +91,8 @@ internal object BodyPreview {
8791 * Empty input yields the empty string. When [mediaType] explicitly declares a resolvable
8892 * multi-byte charset (UTF-16/UTF-32 and friends), the bytes are decoded with that charset and
8993 * the NUL-based binary heuristic is skipped — see the class KDoc. Otherwise a body that does
90- * not pass [isProbablyText] is rendered as a size-only `[binary N bytes]` summary, and a body
91- * that does is decoded with the charset from [mediaType] (or [DEFAULT_CHARSET] when
94+ * not pass [isProbablyText] is rendered as a size-only `[binary N bytes captured ]` summary, and
95+ * a body that does is decoded with the charset from [mediaType] (or [DEFAULT_CHARSET] when
9296 * absent/unknown).
9397 */
9498 internal fun render (
@@ -108,23 +112,34 @@ internal object BodyPreview {
108112
109113 /* *
110114 * Decodes [bytes] using [charset]. Invalid byte sequences are replaced rather than throwing.
115+ * [render] guarantees a non-empty array before calling this.
111116 */
112117 private fun previewText (
113118 bytes : ByteArray ,
114119 charset : Charset ,
115- ): String {
116- if (bytes.isEmpty()) return " "
117- return String (bytes, charset)
118- }
120+ ): String = String (bytes, charset)
121+
122+ /* * Memoised results of [computeEncodesAsciiWithNul], keyed by [Charset] (a JVM singleton). */
123+ private val asciiWithNulByCharset = ConcurrentHashMap < Charset , Boolean >()
119124
120125 /* *
121126 * True when [charset] encodes a plain ASCII character to a byte sequence that contains a NUL
122127 * byte — the signature of a fixed-width multi-byte charset such as UTF-16 or UTF-32, where
123128 * `'A'` becomes e.g. `0x00 0x41`. Single-byte charsets (US-ASCII, ISO-8859-1) and UTF-8
124- * encode ASCII without NUL padding and return false. The probe is computed from the charset's
125- * own encoder, so it covers any such charset the JVM knows, not a hard-coded name list.
129+ * encode ASCII without NUL padding and return false.
130+ *
131+ * The result is memoised per charset so the encode runs at most once per distinct [Charset];
132+ * subsequent renders for that charset are a plain map lookup with no per-call allocation.
126133 */
127134 private fun encodesAsciiWithNul (charset : Charset ): Boolean =
135+ asciiWithNulByCharset.computeIfAbsent(charset) { computeEncodesAsciiWithNul(it) }
136+
137+ /* *
138+ * Computes the [encodesAsciiWithNul] verdict from the charset's own encoder, so it covers any
139+ * such charset the JVM knows rather than a hard-coded name list. An encoder that rejects the
140+ * probe is treated as non-wide (the heuristic path then applies).
141+ */
142+ private fun computeEncodesAsciiWithNul (charset : Charset ): Boolean =
128143 try {
129144 " A" .toByteArray(charset).any { it.toInt() == NUL }
130145 } catch (_: Exception ) {
@@ -161,6 +176,10 @@ internal object BodyPreview {
161176 private fun isControlByte (b : Int ): Boolean =
162177 (b < FIRST_PRINTABLE && b != TAB && b != LINE_FEED && b != CARRIAGE_RETURN ) || b == DEL
163178
164- /* * The size-only summary emitted for a binary body. */
165- private fun binarySummary (size : Int ): String = " [binary $size bytes]"
179+ /* *
180+ * The size-only summary emitted for a binary body. [size] is the captured/preview byte count
181+ * (the input is an already-bounded snapshot), so the wording says "captured" rather than
182+ * implying it observed the full body length.
183+ */
184+ private fun binarySummary (size : Int ): String = " [binary $size bytes captured]"
166185}
0 commit comments