machinefi · Liuhaai · May 8, 2026 · May 8, 2026
diff --git a/src/trio_core/api/routers/inference.py b/src/trio_core/api/routers/inference.py
@@ -705,11 +705,28 @@ async def _crop_describe_inner(req: CropDescribeRequest):
 
     # Try to extract JSON (from any format)
     s, e = clean.find("{"), clean.rfind("}")
+    json_parse_failed = False
     if s >= 0 and e > s:
         try:
             entities = json.loads(clean[s : e + 1])
-        except json.JSONDecodeError:
-            pass
+        except json.JSONDecodeError as exc:
+            json_parse_failed = True
+            logger.warning(
+                "crop_describe: JSON parse failed (%s) text_len=%d head=%r tail=%r",
+                exc.msg,
+                len(clean),
+                clean[:80],
+                clean[-80:],
+            )
+    elif clean.startswith("{"):
+        # No closing brace at all — almost certainly truncation.
+        json_parse_failed = True
+        logger.warning(
+            "crop_describe: JSON-shaped text has no closing brace (truncation suspected) text_len=%d head=%r tail=%r",
+            len(clean),
+            clean[:80],
+            clean[-80:],
+        )
     entities = _normalize_entities(entities)
 
     # Extract description — combine SCENE + ACTIVITIES + NOTABLE into rich description
@@ -749,6 +766,10 @@ async def _crop_describe_inner(req: CropDescribeRequest):
             desc = entities.pop("DESCRIPTION")
         elif "SCENE" in entities:
             desc = entities.pop("SCENE")
+        elif isinstance(entities.get("summary"), str) and entities["summary"]:
+            # SCENE_SCHEMA output (lowercase `summary`); read without popping
+            # so callers that re-parse the entities dict still see it.
+            desc = entities["summary"]
         else:
             parts = []
             for p in entities.get("persons") or []:
@@ -764,6 +785,12 @@ async def _crop_describe_inner(req: CropDescribeRequest):
             )
             if parts:
                 desc += ": " + ", ".join(parts[:5])
+    elif json_parse_failed:
+        # The model emitted JSON-shaped text but it didn't parse. Slicing
+        # `clean[:300]` would store raw garbage in observations.description
+        # (and trip downstream substring-matching). Prefer empty desc — the
+        # fallback below will still synthesize a minimal `entities` dict.
+        desc = ""
     else:
         desc = clean[:300] if clean else ""
 

diff --git a/tests/test_inference_router.py b/tests/test_inference_router.py
@@ -107,6 +107,106 @@ async def test_crop_describe_uses_single_composite_vlm_call(monkeypatch):
     assert response.entities["people_count"] == 1
 
 
+@pytest.mark.asyncio
+async def test_crop_describe_uses_summary_field_from_scene_schema(monkeypatch):
+    """SCENE_SCHEMA output (lowercase `summary`) should populate description.
+
+    Previously the entities branch only looked at uppercase `DESCRIPTION` /
+    `SCENE` keys and fell through to a "0 people, N vehicles: ..." parts
+    template, throwing away the model's natural-language summary.
+    """
+    engine = MagicMock()
+    engine._profile = SimpleNamespace(merge_factor=32)
+    engine.analyze_frame.return_value = VideoResult(
+        text=(
+            '{"summary":"A car wash area with vehicles in motion and parked.",'
+            '"people_count":0,"vehicle_count":3,'
+            '"persons":[],'
+            '"vehicles":['
+            '{"type":"sedan","color":"black","make":"unknown","action":"parked","is_known":false},'
+            '{"type":"suv","color":"white","make":"toyota","action":"parked","is_known":true},'
+            '{"type":"sedan","color":"red","make":"hyundai","action":"moving","is_known":false}'
+            "],"
+            '"scene_type":"car_wash_area","activity_level":"moderate"}'
+        ),
+        metrics=InferenceMetrics(latency_ms=100.0),
+    )
+    monkeypatch.setattr(inference, "_get_vlm", lambda: engine)
+
+    req = inference.CropDescribeRequest(image_b64=_image_b64(), crops=[], max_crops=0)
+    response = await inference._crop_describe_inner(req)
+
+    assert response.description == "A car wash area with vehicles in motion and parked."
+    assert response.entities["scene_type"] == "car_wash_area"
+    # `summary` is read without popping so cortex's own override still sees it
+    assert response.entities.get("summary")
+
+
+@pytest.mark.asyncio
+async def test_crop_describe_truncated_json_does_not_dump_raw_garbage(monkeypatch, caplog):
+    """Truncated JSON (no closing brace) must not leak into description.
+
+    Regression: the `else: desc = clean[:300]` fallback was storing 300-char
+    raw JSON slices into observations.description, ending mid-token.
+    """
+    engine = MagicMock()
+    engine._profile = SimpleNamespace(merge_factor=32)
+    truncated = (
+        '{\n  "summary": "A scene description",\n'
+        '  "scene_type": "car_wash_area",\n'
+        '  "vehicles": [\n    {\n      "action": "parked",\n      "is_know'
+    )
+    engine.analyze_frame.return_value = VideoResult(
+        text=truncated, metrics=InferenceMetrics(latency_ms=100.0)
+    )
+    monkeypatch.setattr(inference, "_get_vlm", lambda: engine)
+
+    req = inference.CropDescribeRequest(image_b64=_image_b64(), crops=[], max_crops=0)
+    with caplog.at_level("WARNING", logger="trio.inference"):
+        response = await inference._crop_describe_inner(req)
+
+    assert response.description == ""
+    assert "{" not in response.description
+    assert any("truncation suspected" in rec.message for rec in caplog.records)
+
+
+@pytest.mark.asyncio
+async def test_crop_describe_malformed_json_does_not_dump_raw_garbage(monkeypatch, caplog):
+    """JSON with a closing brace but invalid syntax also must not leak raw text."""
+    engine = MagicMock()
+    engine._profile = SimpleNamespace(merge_factor=32)
+    # Trailing comma — has both `{` and `}` so json.loads is attempted and fails.
+    malformed = '{"summary": "x", "vehicles": [{"action": "parked",}]}'
+    engine.analyze_frame.return_value = VideoResult(
+        text=malformed, metrics=InferenceMetrics(latency_ms=100.0)
+    )
+    monkeypatch.setattr(inference, "_get_vlm", lambda: engine)
+
+    req = inference.CropDescribeRequest(image_b64=_image_b64(), crops=[], max_crops=0)
+    with caplog.at_level("WARNING", logger="trio.inference"):
+        response = await inference._crop_describe_inner(req)
+
+    assert response.description == ""
+    assert any("JSON parse failed" in rec.message for rec in caplog.records)
+
+
+@pytest.mark.asyncio
+async def test_crop_describe_plain_prose_still_uses_300_char_slice(monkeypatch):
+    """Plain prose (not JSON-shaped) should still fall through to clean[:300]."""
+    engine = MagicMock()
+    engine._profile = SimpleNamespace(merge_factor=32)
+    engine.analyze_frame.return_value = VideoResult(
+        text="A quiet street at dawn with no visible activity.",
+        metrics=InferenceMetrics(latency_ms=100.0),
+    )
+    monkeypatch.setattr(inference, "_get_vlm", lambda: engine)
+
+    req = inference.CropDescribeRequest(image_b64=_image_b64(), crops=[], max_crops=0)
+    response = await inference._crop_describe_inner(req)
+
+    assert response.description == "A quiet street at dawn with no visible activity."
+
+
 @pytest.mark.asyncio
 async def test_crop_describe_max_crops_zero_keeps_single_full_frame(monkeypatch):
     engine = MagicMock()