From 7817b26215445b8ce8827301deeb47a15a6c6bfa Mon Sep 17 00:00:00 2001 From: nap-liu Date: Thu, 23 Apr 2026 16:25:43 +0800 Subject: [PATCH 1/7] feat(llm): add JSON recovery utility for tool_call arguments Pure-Python helper that parses malformed tool_call.arguments strings (trailing commas, unescaped control chars) and produces a canonical JSON string guaranteed to round-trip through json.loads. Will be used by caller._process_tool_call to prevent malformed JSON from leaking into subsequent DashScope requests. --- backend/app/services/llm/json_recovery.py | 115 ++++++++++++++++++++++ backend/tests/test_llm_json_recovery.py | 87 ++++++++++++++++ 2 files changed, 202 insertions(+) create mode 100644 backend/app/services/llm/json_recovery.py create mode 100644 backend/tests/test_llm_json_recovery.py diff --git a/backend/app/services/llm/json_recovery.py b/backend/app/services/llm/json_recovery.py new file mode 100644 index 000000000..b4a5c69be --- /dev/null +++ b/backend/app/services/llm/json_recovery.py @@ -0,0 +1,115 @@ +"""Tool-call JSON argument recovery and canonicalization. + +LLM streaming sometimes produces slightly malformed JSON for tool_call.arguments: +trailing commas, unescaped control characters in string values, truncated tokens. +DashScope validates this field strictly server-side and rejects the request with +HTTP 400 `function.arguments parameter must be in JSON format` on the NEXT round. + +`canonicalize_tool_arguments` accepts any raw string and returns a parsed dict +plus a canonical JSON string that is guaranteed to round-trip through +`json.loads`. It never raises. +""" +from __future__ import annotations + +import json +import re +from typing import Any + +# Control characters that are illegal unescaped in a JSON string +_UNESCAPED_CONTROL_RE = re.compile(r'(? 
str: + """Remove trailing commas before } or ] — the most common LLM mistake.""" + # Match ',' followed by optional whitespace and then ] or } + return re.sub(r',(\s*[}\]])', r'\1', s) + + +def _escape_control_chars_in_strings(s: str) -> str: + """Scan through string and escape unescaped control chars inside JSON string values. + + We can't do this by simple regex because we only want to escape control + chars *inside string values*, not outside. Walk char by char tracking + whether we're inside a string. + """ + out: list[str] = [] + in_string = False + escape_next = False + for ch in s: + if escape_next: + out.append(ch) + escape_next = False + continue + if ch == '\\' and in_string: + out.append(ch) + escape_next = True + continue + if ch == '"': + in_string = not in_string + out.append(ch) + continue + if in_string and ord(ch) < 0x20: + # Escape control chars per JSON spec + if ch == '\n': + out.append('\\n') + elif ch == '\r': + out.append('\\r') + elif ch == '\t': + out.append('\\t') + elif ch == '\b': + out.append('\\b') + elif ch == '\f': + out.append('\\f') + else: + out.append(f'\\u{ord(ch):04x}') + continue + out.append(ch) + return ''.join(out) + + +def canonicalize_tool_arguments(raw: str) -> tuple[dict[str, Any], str, str]: + """Parse and canonicalize a raw tool_call.arguments string. + + Returns: + (parsed_dict, canonical_json_string, repair_method) + + repair_method is one of: "clean", "trailing_comma", "control_char_escape", + "failed". Never raises. 
+ """ + if not raw: + return {}, "{}", "clean" + + # Attempt 1: clean parse + try: + parsed = json.loads(raw) + if not isinstance(parsed, dict): + parsed = {} + canonical = json.dumps(parsed, ensure_ascii=False) + return parsed, canonical, "clean" + except json.JSONDecodeError: + pass + + # Attempt 2: strip trailing commas + cleaned = _strip_trailing_commas(raw) + try: + parsed = json.loads(cleaned) + if not isinstance(parsed, dict): + parsed = {} + canonical = json.dumps(parsed, ensure_ascii=False) + return parsed, canonical, "trailing_comma" + except json.JSONDecodeError: + pass + + # Attempt 3: escape unescaped control chars in strings, then retry + escaped = _escape_control_chars_in_strings(cleaned) + try: + parsed = json.loads(escaped) + if not isinstance(parsed, dict): + parsed = {} + canonical = json.dumps(parsed, ensure_ascii=False) + return parsed, canonical, "control_char_escape" + except json.JSONDecodeError: + pass + + # Gave up — return safe empty + return {}, "{}", "failed" diff --git a/backend/tests/test_llm_json_recovery.py b/backend/tests/test_llm_json_recovery.py new file mode 100644 index 000000000..dbfd8a093 --- /dev/null +++ b/backend/tests/test_llm_json_recovery.py @@ -0,0 +1,87 @@ +"""Unit tests for tool_call.arguments JSON recovery helpers.""" +from app.services.llm.json_recovery import canonicalize_tool_arguments + + +def test_clean_json_passes_through(): + raw = '{"path": "foo.md", "content": "hello"}' + parsed, canonical, method = canonicalize_tool_arguments(raw) + assert parsed == {"path": "foo.md", "content": "hello"} + assert method == "clean" + # canonical is still valid JSON and round-trips + import json + assert json.loads(canonical) == parsed + + +def test_trailing_comma_in_object_is_repaired(): + raw = '{"path": "foo.md", "content": "hi",}' + parsed, canonical, method = canonicalize_tool_arguments(raw) + assert parsed == {"path": "foo.md", "content": "hi"} + assert method == "trailing_comma" + import json + assert 
json.loads(canonical) == parsed + + +def test_trailing_comma_in_array_is_repaired(): + raw = '{"items": [1, 2, 3,]}' + parsed, canonical, method = canonicalize_tool_arguments(raw) + assert parsed == {"items": [1, 2, 3]} + assert method == "trailing_comma" + + +def test_unescaped_newline_inside_string_is_repaired(): + # Qwen streaming sometimes produces raw \n inside a string value + raw = '{"content": "line1\nline2"}' + parsed, canonical, method = canonicalize_tool_arguments(raw) + assert parsed == {"content": "line1\nline2"} + assert method == "control_char_escape" + import json + # canonical round-trip preserves semantic content + assert json.loads(canonical)["content"] == "line1\nline2" + + +def test_unescaped_tab_inside_string_is_repaired(): + raw = '{"content": "a\tb"}' + parsed, canonical, method = canonicalize_tool_arguments(raw) + assert parsed == {"content": "a\tb"} + assert method == "control_char_escape" + + +def test_unicode_is_preserved_without_escaping(): + raw = '{"content": "你好世界测试"}' + parsed, canonical, method = canonicalize_tool_arguments(raw) + assert parsed == {"content": "你好世界测试"} + assert method == "clean" + # canonical must keep Chinese chars unescaped (ensure_ascii=False) + assert "你好" in canonical + + +def test_empty_string_yields_empty_dict(): + parsed, canonical, method = canonicalize_tool_arguments("") + assert parsed == {} + assert canonical == "{}" + assert method == "clean" + + +def test_hopelessly_broken_returns_failed(): + raw = '{"path": "foo" "content": }' # totally broken + parsed, canonical, method = canonicalize_tool_arguments(raw) + assert parsed == {} + assert canonical == "{}" + assert method == "failed" + + +def test_canonical_is_always_valid_json_even_on_failure(): + """Invariant: canonical output must always be parseable JSON.""" + import json + for raw in [ + '', + '{"a": 1}', + '{"a": 1,}', + '{"a": "b\nc"}', + 'not json at all', + '{"broken', + None, + ]: + _, canonical, _ = canonicalize_tool_arguments(raw or "") + # 
Must not raise + json.loads(canonical) From 093bbdbe6eab5869ae376b0ea3c1977c4e98630a Mon Sep 17 00:00:00 2001 From: nap-liu Date: Thu, 23 Apr 2026 16:25:43 +0800 Subject: [PATCH 2/7] fix(llm): string-aware trailing-comma strip + explicit non-dict label Two silent-data-loss paths found in code review: 1. Regex-based trailing-comma strip would eat commas inside string values ({"a": "hello,}", ...} -> {"a": "hello}", ...}). Replaced with a string-aware walker that tracks in_string and escape state. 2. Non-dict top-level parses (array, scalar, null) were silently coerced to {} with method="clean", making the data-loss event invisible to observability. Added explicit method="non_dict_coerced" so the caller can log-and-alert when this happens. Tests: 3 new (string-content comma, escaped quote, non-dict coercion). Total: 12 passed. --- backend/app/services/llm/json_recovery.py | 64 ++++++++++++++++++++--- backend/tests/test_llm_json_recovery.py | 31 +++++++++++ 2 files changed, 88 insertions(+), 7 deletions(-) diff --git a/backend/app/services/llm/json_recovery.py b/backend/app/services/llm/json_recovery.py index b4a5c69be..f15026b77 100644 --- a/backend/app/services/llm/json_recovery.py +++ b/backend/app/services/llm/json_recovery.py @@ -8,6 +8,19 @@ `canonicalize_tool_arguments` accepts any raw string and returns a parsed dict plus a canonical JSON string that is guaranteed to round-trip through `json.loads`. It never raises. + +Repair methods reported back to callers: + +- ``"clean"`` — ``json.loads`` succeeded on the raw input and it was a dict. +- ``"trailing_comma"`` — succeeded after stripping trailing commas before + ``}`` or ``]`` (string-aware so commas inside string literals are kept). +- ``"control_char_escape"`` — succeeded after escaping unescaped control + characters inside JSON string values. +- ``"non_dict_coerced"`` — a parse attempt succeeded but produced a non-dict + top-level value (list, scalar, ``null``). Coerced to ``{}``. 
Callers + should log/alert on this because real user data was dropped. +- ``"failed"`` — every repair attempt raised ``json.JSONDecodeError``. + Returns ``{}`` / ``"{}"``. """ from __future__ import annotations @@ -20,9 +33,46 @@ def _strip_trailing_commas(s: str) -> str: - """Remove trailing commas before } or ] — the most common LLM mistake.""" - # Match ',' followed by optional whitespace and then ] or } - return re.sub(r',(\s*[}\]])', r'\1', s) + """Remove trailing commas before } or ] — but only when OUTSIDE a JSON string. + + Walks the input char by char so that a comma inside a string literal + (e.g. `"hello,}"`) is not confused with a trailing comma in the outer + structure. + """ + out: list[str] = [] + in_string = False + escape_next = False + i = 0 + n = len(s) + while i < n: + ch = s[i] + if escape_next: + out.append(ch) + escape_next = False + i += 1 + continue + if ch == '\\' and in_string: + out.append(ch) + escape_next = True + i += 1 + continue + if ch == '"': + in_string = not in_string + out.append(ch) + i += 1 + continue + if not in_string and ch == ',': + # Peek ahead past whitespace to see if next non-ws is } or ] + j = i + 1 + while j < n and s[j] in ' \t\n\r': + j += 1 + if j < n and s[j] in '}]': + # Drop the comma, keep the whitespace + i += 1 + continue + out.append(ch) + i += 1 + return ''.join(out) def _escape_control_chars_in_strings(s: str) -> str: @@ -74,7 +124,7 @@ def canonicalize_tool_arguments(raw: str) -> tuple[dict[str, Any], str, str]: (parsed_dict, canonical_json_string, repair_method) repair_method is one of: "clean", "trailing_comma", "control_char_escape", - "failed". Never raises. + "non_dict_coerced", "failed". Never raises. 
""" if not raw: return {}, "{}", "clean" @@ -83,7 +133,7 @@ def canonicalize_tool_arguments(raw: str) -> tuple[dict[str, Any], str, str]: try: parsed = json.loads(raw) if not isinstance(parsed, dict): - parsed = {} + return {}, "{}", "non_dict_coerced" canonical = json.dumps(parsed, ensure_ascii=False) return parsed, canonical, "clean" except json.JSONDecodeError: @@ -94,7 +144,7 @@ def canonicalize_tool_arguments(raw: str) -> tuple[dict[str, Any], str, str]: try: parsed = json.loads(cleaned) if not isinstance(parsed, dict): - parsed = {} + return {}, "{}", "non_dict_coerced" canonical = json.dumps(parsed, ensure_ascii=False) return parsed, canonical, "trailing_comma" except json.JSONDecodeError: @@ -105,7 +155,7 @@ def canonicalize_tool_arguments(raw: str) -> tuple[dict[str, Any], str, str]: try: parsed = json.loads(escaped) if not isinstance(parsed, dict): - parsed = {} + return {}, "{}", "non_dict_coerced" canonical = json.dumps(parsed, ensure_ascii=False) return parsed, canonical, "control_char_escape" except json.JSONDecodeError: diff --git a/backend/tests/test_llm_json_recovery.py b/backend/tests/test_llm_json_recovery.py index dbfd8a093..a82f89429 100644 --- a/backend/tests/test_llm_json_recovery.py +++ b/backend/tests/test_llm_json_recovery.py @@ -85,3 +85,34 @@ def test_canonical_is_always_valid_json_even_on_failure(): _, canonical, _ = canonicalize_tool_arguments(raw or "") # Must not raise json.loads(canonical) + + +def test_trailing_comma_inside_string_value_is_not_stripped(): + """Regression: regex-based stripping would silently corrupt + `{"a": "hello,}", "b": 1,}` by eating the comma inside the string value. 
+ The string-aware walker must preserve string content exactly.""" + raw = '{"a": "hello,}", "b": 1,}' + parsed, canonical, method = canonicalize_tool_arguments(raw) + assert parsed == {"a": "hello,}", "b": 1} + assert method == "trailing_comma" + import json + assert json.loads(canonical) == parsed + + +def test_trailing_comma_inside_escaped_string_not_stripped(): + """Escaped quote inside a string must not end the string prematurely.""" + raw = '{"a": "he said \\"hi,}\\"", "b": 1,}' + parsed, canonical, method = canonicalize_tool_arguments(raw) + assert parsed == {"a": 'he said "hi,}"', "b": 1} + assert method == "trailing_comma" + + +def test_non_dict_top_level_is_coerced_with_explicit_method(): + """A top-level array or scalar is not a valid tool_call args object. + Must be coerced to {} AND reported as non_dict_coerced (not "clean") + so observability can flag the data-loss event.""" + for raw in ['[1, 2, 3]', '"just a string"', '42', 'null']: + parsed, canonical, method = canonicalize_tool_arguments(raw) + assert parsed == {} + assert canonical == "{}" + assert method == "non_dict_coerced", f"raw={raw!r} got method={method}" From 2609121a41c49115cbffc9ae619034da5bdded26 Mon Sep 17 00:00:00 2001 From: nap-liu Date: Thu, 23 Apr 2026 16:25:43 +0800 Subject: [PATCH 3/7] feat(llm): add head+tail tool result shaping Large tool results (e.g. execute_code stdout) accumulate across rounds and can push conversation context over Qwen3.5-plus's 983k char input limit. shape_tool_result keeps head + tail with an explicit truncation marker so the LLM knows truncation happened. 
--- .../app/services/llm/tool_result_shaping.py | 31 ++++++++++ backend/tests/test_llm_tool_result_shaping.py | 56 +++++++++++++++++++ 2 files changed, 87 insertions(+) create mode 100644 backend/app/services/llm/tool_result_shaping.py create mode 100644 backend/tests/test_llm_tool_result_shaping.py diff --git a/backend/app/services/llm/tool_result_shaping.py b/backend/app/services/llm/tool_result_shaping.py new file mode 100644 index 000000000..76f769db3 --- /dev/null +++ b/backend/app/services/llm/tool_result_shaping.py @@ -0,0 +1,31 @@ +"""Shape oversized tool results to stay within per-call size budget. + +A single tool result (e.g. a long tool output JSON, or an +`execute_code` stdout dump) can exceed 50KB. Accumulating many such results +across 10+ tool rounds blows past Qwen3.5-plus's ~983k-char input limit and +causes `HTTP 400: Range of input length should be [1, 983616]`. + +This module applies a head+tail truncation with an explicit marker so the +LLM can see that truncation happened and ask for more if needed. +""" +from __future__ import annotations + + +def shape_tool_result(result, max_chars: int) -> tuple[str, bool]: + """Return (possibly-truncated string, was_truncated). + + Strategy for oversized results: keep ~60% head and ~30% tail, with a + marker in between describing how much was dropped. Total output stays + within max_chars plus a small marker overhead (~120 chars). + """ + s = str(result) if not isinstance(result, str) else result + if len(s) <= max_chars: + return s, False + + # Budget split: 60% head, 30% tail, 10% safety + head_budget = int(max_chars * 0.60) + tail_budget = int(max_chars * 0.30) + dropped = len(s) - head_budget - tail_budget + marker = f"\n\n[... 
truncated: {dropped:,} chars omitted (head {head_budget:,} + tail {tail_budget:,} kept) ...]\n\n" + + return s[:head_budget] + marker + s[-tail_budget:], True diff --git a/backend/tests/test_llm_tool_result_shaping.py b/backend/tests/test_llm_tool_result_shaping.py new file mode 100644 index 000000000..fa770d3bc --- /dev/null +++ b/backend/tests/test_llm_tool_result_shaping.py @@ -0,0 +1,56 @@ +"""Unit tests for tool result size-shaping.""" +from app.services.llm.tool_result_shaping import shape_tool_result + + +def test_short_result_passes_through_unchanged(): + short = "hello world" + out, truncated = shape_tool_result(short, max_chars=1000) + assert out == short + assert truncated is False + + +def test_exactly_at_limit_passes_through(): + s = "x" * 1000 + out, truncated = shape_tool_result(s, max_chars=1000) + assert out == s + assert truncated is False + + +def test_oversized_result_is_truncated_with_marker(): + s = "A" * 500 + "B" * 2000 + "C" * 500 + out, truncated = shape_tool_result(s, max_chars=1000) + assert truncated is True + assert len(out) < len(s) + # Marker is present and mentions how much was dropped + assert "truncated" in out.lower() + # Head (starts with A) and tail (ends with C) both preserved + assert out.startswith("A") + assert out.endswith("C") + + +def test_marker_reports_dropped_byte_count(): + s = "A" * 10_000 + out, truncated = shape_tool_result(s, max_chars=1000) + assert truncated is True + # The marker should contain the number of dropped characters + assert "9" in out # ~9000 dropped + + +def test_output_length_respects_budget(): + s = "x" * 100_000 + out, truncated = shape_tool_result(s, max_chars=1000) + # Output should be <= max_chars + reasonable marker overhead (~200 chars) + assert len(out) <= 1000 + 200 + assert truncated is True + + +def test_empty_result(): + out, truncated = shape_tool_result("", max_chars=1000) + assert out == "" + assert truncated is False + + +def test_non_string_coerced_to_string(): + out, truncated = 
shape_tool_result(12345, max_chars=1000) + assert out == "12345" + assert truncated is False From 2221360353c2e8e2f5801540cc9c2154e665788f Mon Sep 17 00:00:00 2001 From: nap-liu Date: Thu, 23 Apr 2026 16:25:43 +0800 Subject: [PATCH 4/7] fix(llm): guard zero/negative max_chars in shape_tool_result MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two silent-corruption edge cases found in code review: 1. Negative max_chars (e.g. -5) produced nonsense via Python's negative slice semantics — overlapping slices + a lying marker claiming more chars dropped than the input had. 2. max_chars=0 returned only the marker, violating the contract that output stays within max_chars + small marker overhead. Both now return empty string with was_truncated accurately reflecting whether any content was dropped. Tests added for both cases plus renamed test_marker_reports_dropped_byte_count → ..._char_count since len(str) counts code points, not UTF-8 bytes. --- .../app/services/llm/tool_result_shaping.py | 12 +++++++++ backend/tests/test_llm_tool_result_shaping.py | 26 ++++++++++++++++++- 2 files changed, 37 insertions(+), 1 deletion(-) diff --git a/backend/app/services/llm/tool_result_shaping.py b/backend/app/services/llm/tool_result_shaping.py index 76f769db3..de7dbd74c 100644 --- a/backend/app/services/llm/tool_result_shaping.py +++ b/backend/app/services/llm/tool_result_shaping.py @@ -7,6 +7,9 @@ This module applies a head+tail truncation with an explicit marker so the LLM can see that truncation happened and ask for more if needed. + +A degenerate budget (``max_chars <= 0``) returns an empty string, with +``was_truncated=True`` iff the input was non-empty. """ from __future__ import annotations @@ -17,8 +20,17 @@ def shape_tool_result(result, max_chars: int) -> tuple[str, bool]: Strategy for oversized results: keep ~60% head and ~30% tail, with a marker in between describing how much was dropped. 
Total output stays within max_chars plus a small marker overhead (~120 chars). + + Edge case: if ``max_chars <= 0`` the budget is degenerate — there is no + room for any content (nor for the marker itself), so an empty string is + returned, with ``was_truncated=True`` iff the input was non-empty. """ s = str(result) if not isinstance(result, str) else result + if max_chars <= 0: + # Degenerate budget — treat as "drop everything", no marker (it would + # exceed max_chars itself). was_truncated reflects whether any content + # was actually dropped. + return "", len(s) > 0 if len(s) <= max_chars: return s, False diff --git a/backend/tests/test_llm_tool_result_shaping.py b/backend/tests/test_llm_tool_result_shaping.py index fa770d3bc..4dcada4b2 100644 --- a/backend/tests/test_llm_tool_result_shaping.py +++ b/backend/tests/test_llm_tool_result_shaping.py @@ -28,7 +28,7 @@ def test_oversized_result_is_truncated_with_marker(): assert out.endswith("C") -def test_marker_reports_dropped_byte_count(): +def test_marker_reports_dropped_char_count(): s = "A" * 10_000 out, truncated = shape_tool_result(s, max_chars=1000) assert truncated is True @@ -36,6 +36,30 @@ def test_marker_reports_dropped_byte_count(): assert "9" in out # ~9000 dropped +def test_zero_budget_returns_empty(): + """max_chars=0 is a degenerate budget; return empty string and + report truncation when any content was dropped.""" + out, truncated = shape_tool_result("hello", 0) + assert out == "" + assert truncated is True + + +def test_zero_budget_with_empty_input(): + """Empty input under a zero budget is still not truncation.""" + out, truncated = shape_tool_result("", 0) + assert out == "" + assert truncated is False + + +def test_negative_budget_degenerates_gracefully(): + """Negative max_chars should not silently produce garbage. 
+ Regression: prior regex-free slicing produced overlapping slices + with a lying marker (output longer than input).""" + out, truncated = shape_tool_result("hello world", -5) + assert out == "" + assert truncated is True + + def test_output_length_respects_budget(): s = "x" * 100_000 out, truncated = shape_tool_result(s, max_chars=1000) From cfce7cf115d510106ddc281793ae1077f8f6120b Mon Sep 17 00:00:00 2001 From: nap-liu Date: Thu, 23 Apr 2026 16:25:43 +0800 Subject: [PATCH 5/7] fix(llm): canonicalize tool_call arguments + cap tool result size _process_tool_call now uses canonicalize_tool_arguments to repair malformed JSON from Qwen streaming (trailing commas, unescaped control chars) and writes the canonical form back to tc.function.arguments so that subsequent LLM rounds receive a history entry that passes DashScope's JSON validator. Prevents HTTP 400 'function.arguments must be in JSON format'. Also caps single tool result content at 20k chars via head+tail shaping. Prevents single-run context blowup that caused HTTP 400 'Range of input length should be [1, 983616]'. Both paths log a structured warning when repair or truncation fires, for observability. 
--- backend/app/services/llm/caller.py | 29 ++++- backend/tests/test_llm_caller_integration.py | 105 +++++++++++++++++++ 2 files changed, 130 insertions(+), 4 deletions(-) create mode 100644 backend/tests/test_llm_caller_integration.py diff --git a/backend/app/services/llm/caller.py b/backend/app/services/llm/caller.py index d3a3b66d1..e14a710f6 100644 --- a/backend/app/services/llm/caller.py +++ b/backend/app/services/llm/caller.py @@ -209,10 +209,17 @@ async def _process_tool_call( raw_args = fn.get("arguments", "{}") logger.info(f"[LLM] Calling tool: {tool_name}({json.dumps(raw_args, ensure_ascii=False)[:100]})") - try: - args = json.loads(raw_args) if raw_args else {} - except json.JSONDecodeError: - args = {} + from app.services.llm.json_recovery import canonicalize_tool_arguments + args, canonical_args, repair_method = canonicalize_tool_arguments(raw_args) + # Overwrite tc.function.arguments with canonical JSON so later LLM rounds + # receive a history entry guaranteed to pass DashScope's JSON validator. + fn["arguments"] = canonical_args + if repair_method != "clean": + logger.warning( + f"[LLM] tool_call args repaired: tool={tool_name} method={repair_method} " + f"orig_len={len(raw_args)} canonical_len={len(canonical_args)} " + f"session={session_id}" + ) # Guard: check if tool requires arguments should_execute, error_msg = _check_tool_requires_args(tool_name, args) @@ -268,6 +275,20 @@ async def _process_tool_call( except Exception: pass + # Shape oversized tool results to protect conversation context budget. + # Only applies to plain-string content — vision content arrays are left alone. 
+ from app.services.llm.tool_result_shaping import shape_tool_result + TOOL_RESULT_MAX_CHARS = 20_000 + if isinstance(tool_content, str): + shaped, was_truncated = shape_tool_result(tool_content, TOOL_RESULT_MAX_CHARS) + if was_truncated: + logger.warning( + f"[LLM] tool_result truncated: tool={tool_name} " + f"orig_len={len(tool_content)} shaped_len={len(shaped)} " + f"session={session_id}" + ) + tool_content = shaped + api_messages.append(LLMMessage( role="tool", tool_call_id=tc["id"], diff --git a/backend/tests/test_llm_caller_integration.py b/backend/tests/test_llm_caller_integration.py new file mode 100644 index 000000000..85498f610 --- /dev/null +++ b/backend/tests/test_llm_caller_integration.py @@ -0,0 +1,105 @@ +"""Integration tests for caller._process_tool_call normalization.""" +import json +import pytest +from unittest.mock import patch + +from app.services.llm.caller import _process_tool_call + + +@pytest.mark.asyncio +async def test_process_tool_call_canonicalizes_malformed_arguments(): + """Malformed arguments (trailing comma) must be rewritten to valid JSON + on tc['function']['arguments'] so later LLM rounds get clean history.""" + tc = { + "id": "call_1", + "function": { + "name": "read_file", + # Trailing comma — Qwen streaming produces this sometimes + "arguments": '{"path": "foo.md",}', + }, + } + api_messages: list = [] + + async def fake_execute_tool(name, args, **kwargs): + assert name == "read_file" + assert args == {"path": "foo.md"} + return "file contents here" + + with patch("app.services.llm.caller.execute_tool", side_effect=fake_execute_tool): + await _process_tool_call( + tc=tc, + api_messages=api_messages, + agent_id="agent-1", + user_id="user-1", + session_id="sess-1", + supports_vision=False, + on_tool_call=None, + full_reasoning_content="", + ) + + # CRITICAL: arguments on the tc object must now be valid JSON + repaired = tc["function"]["arguments"] + parsed = json.loads(repaired) + assert parsed == {"path": "foo.md"} + # And it 
must not have a trailing comma + assert ",}" not in repaired.replace(" ", "") + + +@pytest.mark.asyncio +async def test_process_tool_call_truncates_oversized_result(): + """Tool results over the cap must be head+tail truncated in the + api_messages tool-result entry.""" + tc = { + "id": "call_1", + "function": {"name": "example_tool", "arguments": '{"command": ["report", "list"]}'}, + } + api_messages: list = [] + huge_result = "A" * 200_000 # 200KB result + + async def fake_execute_tool(name, args, **kwargs): + return huge_result + + with patch("app.services.llm.caller.execute_tool", side_effect=fake_execute_tool): + await _process_tool_call( + tc=tc, + api_messages=api_messages, + agent_id="agent-1", + user_id="user-1", + session_id="sess-1", + supports_vision=False, + on_tool_call=None, + full_reasoning_content="", + ) + + tool_msg = api_messages[-1] + # Stored content should be capped and contain the truncation marker + content = tool_msg.content if isinstance(tool_msg.content, str) else str(tool_msg.content) + assert len(content) < 50_000 + assert "truncated" in content.lower() + + +@pytest.mark.asyncio +async def test_process_tool_call_clean_arguments_pass_through_unchanged_semantic(): + """Clean JSON must still work exactly as before (backwards compat).""" + tc = { + "id": "call_1", + "function": { + "name": "read_file", + "arguments": '{"path": "foo.md"}', + }, + } + api_messages: list = [] + + async def fake_execute_tool(name, args, **kwargs): + assert args == {"path": "foo.md"} + return "ok" + + with patch("app.services.llm.caller.execute_tool", side_effect=fake_execute_tool): + await _process_tool_call( + tc=tc, api_messages=api_messages, + agent_id="agent-1", user_id="user-1", session_id="sess-1", + supports_vision=False, on_tool_call=None, full_reasoning_content="", + ) + + # Semantic equivalence (key order / spacing may differ) + assert json.loads(tc["function"]["arguments"]) == {"path": "foo.md"} From 350a073a9b2725662d19db4b681134f77d2b454b Mon Sep 
17 00:00:00 2001 From: nap-liu Date: Thu, 23 Apr 2026 16:25:43 +0800 Subject: [PATCH 6/7] fix(llm): extend canonicalize+shape to call_agent_llm_with_tools path Code review found call_agent_llm_with_tools._try_model has an identical tool loop to _process_tool_call, used by background services (scheduler, task executor, heartbeat). It still ran the naive json.loads path without canonicalize or shaping, leaving the same two 400 errors unfixed on the background-service code path. Extracted shared helpers _canonicalize_tc_arguments and _shape_tool_content_for_context at module level so both callers use the same normalization + logging. Also: - Hoisted json_recovery and tool_result_shaping imports to top of file - Promoted TOOL_RESULT_MAX_CHARS to module constant - Unified log fields (orig_len/new_len) and added dropped= on truncation - Tightened test assertion from <50_000 to <=TOOL_RESULT_MAX_CHARS+500 - Added 3 direct tests for the helpers --- backend/app/services/llm/caller.py | 94 ++++++++++++-------- backend/tests/test_llm_caller_integration.py | 37 +++++++- 2 files changed, 93 insertions(+), 38 deletions(-) diff --git a/backend/app/services/llm/caller.py b/backend/app/services/llm/caller.py index e14a710f6..b0f746b24 100644 --- a/backend/app/services/llm/caller.py +++ b/backend/app/services/llm/caller.py @@ -15,7 +15,7 @@ import json import uuid from pathlib import Path -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Any from loguru import logger from sqlalchemy import select @@ -27,6 +27,8 @@ from .client import LLMError from .failover import classify_error, FailoverErrorType +from .json_recovery import canonicalize_tool_arguments +from .tool_result_shaping import shape_tool_result from .utils import LLMMessage, create_llm_client, get_max_tokens, get_model_api_key if TYPE_CHECKING: @@ -39,6 +41,10 @@ "send_message_to_agent", "send_feishu_message", "send_email" }) +# Cap for any single tool-result entry sent into LLM history. 
+# Phase 1 uses a constant; Phase 2 will make this per-agent configurable. +TOOL_RESULT_MAX_CHARS = 20_000 + # ═══════════════════════════════════════════════════════════════════════════════ # Failover Guard @@ -193,6 +199,47 @@ def _check_tool_requires_args(tool_name: str, args: dict) -> tuple[bool, str]: return True, "" +def _canonicalize_tc_arguments(tc: dict, session_id: str) -> dict[str, Any]: + """Canonicalize ``tc['function']['arguments']`` in place and return the parsed dict. + + The canonical JSON is written back to ``tc['function']['arguments']`` so that + any subsequent LLM round receiving this ``tc`` in conversation history will + pass DashScope's ``function.arguments must be in JSON format`` validation. + Used by both in-flight tool loops (_process_tool_call and _try_model). + """ + fn = tc["function"] + tool_name = fn["name"] + raw_args = fn.get("arguments", "{}") + args, canonical_args, repair_method = canonicalize_tool_arguments(raw_args) + fn["arguments"] = canonical_args + if repair_method != "clean": + logger.warning( + f"[LLM] tool_call args repaired: tool={tool_name} method={repair_method} " + f"orig_len={len(raw_args)} new_len={len(canonical_args)} session={session_id}" + ) + return args + + +def _shape_tool_content_for_context(tool_content, tool_name: str, session_id: str): + """Return tool_content capped at TOOL_RESULT_MAX_CHARS (string content only). + + Vision content is always a list[dict] per vision_inject.try_inject_screenshot_vision — + we pass those through unchanged to preserve base64 image data. + """ + # Invariant: vision content is always a list[dict]; see vision_inject.try_inject_screenshot_vision. 
+ if not isinstance(tool_content, str): + return tool_content + shaped, was_truncated = shape_tool_result(tool_content, TOOL_RESULT_MAX_CHARS) + if was_truncated: + dropped = len(tool_content) - len(shaped) + logger.warning( + f"[LLM] tool_result truncated: tool={tool_name} " + f"orig_len={len(tool_content)} new_len={len(shaped)} " + f"dropped={dropped} session={session_id}" + ) + return shaped + + async def _process_tool_call( tc: dict, api_messages: list, @@ -204,22 +251,11 @@ async def _process_tool_call( full_reasoning_content: str, ) -> str: """Process a single tool call and return result.""" - fn = tc["function"] - tool_name = fn["name"] - raw_args = fn.get("arguments", "{}") - logger.info(f"[LLM] Calling tool: {tool_name}({json.dumps(raw_args, ensure_ascii=False)[:100]})") + raw_args = tc["function"].get("arguments", "{}") + logger.info(f"[LLM] Calling tool: {tc['function']['name']}({json.dumps(raw_args, ensure_ascii=False)[:100]})") - from app.services.llm.json_recovery import canonicalize_tool_arguments - args, canonical_args, repair_method = canonicalize_tool_arguments(raw_args) - # Overwrite tc.function.arguments with canonical JSON so later LLM rounds - # receive a history entry guaranteed to pass DashScope's JSON validator. - fn["arguments"] = canonical_args - if repair_method != "clean": - logger.warning( - f"[LLM] tool_call args repaired: tool={tool_name} method={repair_method} " - f"orig_len={len(raw_args)} canonical_len={len(canonical_args)} " - f"session={session_id}" - ) + args = _canonicalize_tc_arguments(tc, session_id) + tool_name = tc["function"]["name"] # Guard: check if tool requires arguments should_execute, error_msg = _check_tool_requires_args(tool_name, args) @@ -275,19 +311,7 @@ async def _process_tool_call( except Exception: pass - # Shape oversized tool results to protect conversation context budget. - # Only applies to plain-string content — vision content arrays are left alone. 
- from app.services.llm.tool_result_shaping import shape_tool_result - TOOL_RESULT_MAX_CHARS = 20_000 - if isinstance(tool_content, str): - shaped, was_truncated = shape_tool_result(tool_content, TOOL_RESULT_MAX_CHARS) - if was_truncated: - logger.warning( - f"[LLM] tool_result truncated: tool={tool_name} " - f"orig_len={len(tool_content)} shaped_len={len(shaped)} " - f"session={session_id}" - ) - tool_content = shaped + tool_content = _shape_tool_content_for_context(tool_content, tool_name, session_id) api_messages.append(LLMMessage( role="tool", @@ -770,13 +794,8 @@ async def _try_model(model: LLMModel) -> tuple[str, bool, bool]: )) for tc in response.tool_calls: - fn = tc["function"] - tool_name = fn["name"] - raw_args = fn.get("arguments", "{}") - try: - args = json.loads(raw_args) if raw_args else {} - except json.JSONDecodeError: - args = {} + args = _canonicalize_tc_arguments(tc, session_id) + tool_name = tc["function"]["name"] tool_executed = True result = await execute_tool( @@ -785,10 +804,11 @@ async def _try_model(model: LLMModel) -> tuple[str, bool, bool]: user_id=agent.creator_id, session_id=session_id, ) + shaped_content = _shape_tool_content_for_context(str(result), tool_name, session_id) api_messages.append(LLMMessage( role="tool", tool_call_id=tc["id"], - content=str(result), + content=shaped_content, )) if agent_id and _accumulated_tokens > 0: diff --git a/backend/tests/test_llm_caller_integration.py b/backend/tests/test_llm_caller_integration.py index 85498f610..c7abcc411 100644 --- a/backend/tests/test_llm_caller_integration.py +++ b/backend/tests/test_llm_caller_integration.py @@ -74,7 +74,9 @@ async def fake_execute_tool(name, args, **kwargs): tool_msg = api_messages[-1] # Stored content should be capped and contain the truncation marker content = tool_msg.content if isinstance(tool_msg.content, str) else str(tool_msg.content) - assert len(content) < 50_000 + from app.services.llm.caller import TOOL_RESULT_MAX_CHARS + # Cap is 
TOOL_RESULT_MAX_CHARS + small marker overhead (~120 chars). + assert len(content) <= TOOL_RESULT_MAX_CHARS + 500 assert "truncated" in content.lower() @@ -103,3 +105,36 @@ async def fake_execute_tool(name, args, **kwargs): # Semantic equivalence (key order / spacing may differ) assert json.loads(tc["function"]["arguments"]) == {"path": "foo.md"} + + +def test_canonicalize_tc_arguments_helper_rewrites_tc_inplace(): + """Unit test the helper directly — exercised by both _process_tool_call + and call_agent_llm_with_tools._try_model.""" + from app.services.llm.caller import _canonicalize_tc_arguments + tc = { + "id": "call_1", + "function": {"name": "read_file", "arguments": '{"path": "foo.md",}'}, + } + args = _canonicalize_tc_arguments(tc, session_id="sess-x") + assert args == {"path": "foo.md"} + # In-place mutation: tc now carries canonical JSON + import json + parsed = json.loads(tc["function"]["arguments"]) + assert parsed == {"path": "foo.md"} + + +def test_shape_tool_content_for_context_bypasses_list_content(): + """Vision content (list of parts) must pass through untouched.""" + from app.services.llm.caller import _shape_tool_content_for_context + vision_content = [{"type": "text", "text": "see image"}, {"type": "image_url", "image_url": {"url": "data:image/png;base64,..."}}] + out = _shape_tool_content_for_context(vision_content, "screenshot", "sess-x") + assert out is vision_content # identity — no copy, no mutation + + +def test_shape_tool_content_for_context_truncates_long_string(): + """Oversized string content is capped.""" + from app.services.llm.caller import _shape_tool_content_for_context, TOOL_RESULT_MAX_CHARS + out = _shape_tool_content_for_context("A" * 200_000, "example_tool", "sess-x") + assert isinstance(out, str) + assert len(out) <= TOOL_RESULT_MAX_CHARS + 500 + assert "truncated" in out.lower() From 946d70a72d115d83cad2cad81a96935f703ed57e Mon Sep 17 00:00:00 2001 From: nap-liu Date: Thu, 23 Apr 2026 16:25:43 +0800 Subject: [PATCH 7/7] 
refactor(llm): fix pre-existing _try_model tuple bug + cleanup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three small items surfaced by final integrated review: 1. _try_model declared -> tuple[str, bool, bool] but the no-tool-calls happy-path early return produced a 2-tuple. Caller unpacked 3 values so the background-services path would ValueError on any immediate non-tool answer. Pre-existing; folded in here since this PR already edits the function. 2. Added a one-line NB comment above both assistant-with-tool-calls message appends documenting that tc['function'] must remain shared by reference with _canonicalize_tc_arguments in-place write — a future deepcopy refactor would silently carry pre-repair arguments into history. 3. Removed dead 'import re' and '_UNESCAPED_CONTROL_RE' from json_recovery.py — unused after the string-aware walker rewrite. --- backend/app/services/llm/caller.py | 8 +++++++- backend/app/services/llm/json_recovery.py | 4 ---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/backend/app/services/llm/caller.py b/backend/app/services/llm/caller.py index b0f746b24..f8235a546 100644 --- a/backend/app/services/llm/caller.py +++ b/backend/app/services/llm/caller.py @@ -449,6 +449,9 @@ async def call_llm( logger.info(f"[LLM] Round {round_i+1}: {len(response.tool_calls)} tool call(s)") # Add assistant message with tool calls + # NB: tc["function"] is shared by reference with _canonicalize_tc_arguments's + # in-place canonicalization — must stay as a reference (no deepcopy), or + # history entries will carry the pre-repair malformed arguments. 
api_messages.append(LLMMessage( role="assistant", content=response.content or None, @@ -779,9 +782,12 @@ async def _try_model(model: LLMModel) -> tuple[str, bool, bool]: if agent_id and _accumulated_tokens > 0: await record_token_usage(agent_id, _accumulated_tokens) await client.close() - return response.content or "[Empty response]", True + return response.content or "[Empty response]", True, tool_executed # Execute tool calls + # NB: tc["function"] is shared by reference with _canonicalize_tc_arguments's + # in-place canonicalization — must stay as a reference (no deepcopy), or + # history entries will carry the pre-repair malformed arguments. api_messages.append(LLMMessage( role="assistant", content=response.content or None, diff --git a/backend/app/services/llm/json_recovery.py b/backend/app/services/llm/json_recovery.py index f15026b77..f98d9741a 100644 --- a/backend/app/services/llm/json_recovery.py +++ b/backend/app/services/llm/json_recovery.py @@ -25,12 +25,8 @@ from __future__ import annotations import json -import re from typing import Any -# Control characters that are illegal unescaped in a JSON string -_UNESCAPED_CONTROL_RE = re.compile(r'(? str: """Remove trailing commas before } or ] — but only when OUTSIDE a JSON string.