From 80b3f6cf7b46f783a5b8a51bd00855b0806e3f33 Mon Sep 17 00:00:00 2001
From: Liuhaai <haixiang@iotex.io>
Date: Tue, 19 May 2026 15:26:28 -0700
Subject: [PATCH 1/2] fix(inference): stop polluting schema-mode crop-describe
 responses
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When cortex calls /crop-describe with a json_schema response_format,
trio-core was silently mutating the wire response in two ways that
caused production VLM outputs to lose attributes the model had actually
identified:

1. **`_normalize_entity_item` injected schema-forbidden fields.** For
   every vehicle entity the normalizer ran
   `setdefault("brand", label)` even when the schema didn't declare a
   `brand` key, and `label` fell back to `description` (empty) when
   `make` was empty. Result on prod: model returned
   `{"id":"nv0","type":"suv","make":""}`; cortex received
   `{"id":"nv0","type":"suv","make":"","brand":""}`. The strict
   json_schema contract was bypassed post-VLM.

2. **YOLO-context auto-prepend duplicated the DETECTIONS hint.** Cortex
   already enumerates per-id detections in its scene_prompt. trio-core
   skipped the prepend only when the prompt already contained the
   literal substring `"YOLO detections"`; cortex's prompt does not, so
   a second bbox table was prepended ahead of the caller's prompt.
   Empirical A/B on qwen3-vl-flash with the prod Audi frame: 3/3 trials
   emit `make: "Audi"` with the clean prompt, but only 1/3 trials with
   the duplicate-context prefix.

Fix: gate both behaviors on `req.response_format is None`. Schema
callers get their parsed entities back unchanged and their prompt
without the YOLO prefix (the separate zoom-panel context prepend is not
touched by this change); the legacy free-text default scene_prompt path
(any request without a response_format) keeps the string-to-dict
fallback and the YOLO hint prepend (verified by the existing tests, and
by a new `test_crop_describe_default_prompt_still_gets_yolo_context`).

Tests:
- `test_normalize_entities_schema_mode_preserves_empty_make` — schema
  mode must not inject `brand` or rewrite empty `make`.
- `test_normalize_entities_schema_mode_drops_extra_keys_in_passthrough`
  — despite the name, nothing is dropped: schema mode returns the parsed
  dict as-is and does not iterate `animals` (or any other key).
- `test_crop_describe_schema_mode_skips_yolo_context_prepend` —
  scene_prompt reaches the engine byte-identical.
- `test_crop_describe_default_prompt_still_gets_yolo_context` — the
  legacy callers keep their auto-hint behavior.

419 passed, 7 skipped (no regressions).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 src/trio_core/api/routers/inference.py | 30 ++++++++--
 tests/test_inference_router.py         | 82 ++++++++++++++++++++++++++
 2 files changed, 107 insertions(+), 5 deletions(-)

diff --git a/src/trio_core/api/routers/inference.py b/src/trio_core/api/routers/inference.py
index 420d23e..0a5c97b 100644
--- a/src/trio_core/api/routers/inference.py
+++ b/src/trio_core/api/routers/inference.py
@@ -560,7 +560,11 @@ def _extract_crop_descriptions(text: str, panels: list[dict]) -> list[str]:
 
 
 def _normalize_entity_item(kind: str, item) -> dict:
-    """Normalize model-emitted entity items (strings or dicts) to a standard dict."""
+    """Normalize a free-text-mode entity item into the default-prompt shape.
+
+    Only called when no caller-supplied JSON schema is in effect. With a schema,
+    the caller owns the wire shape and trio-core must not invent fields.
+    """
     if isinstance(item, dict):
         norm = dict(item)
     else:
@@ -581,11 +585,20 @@ def _normalize_entity_item(kind: str, item) -> dict:
     return norm
 
 
-def _normalize_entities(entities: dict | None) -> dict:
-    """Normalize parsed entities payload into the shape downstream code expects."""
+def _normalize_entities(entities: dict | None, *, schema_mode: bool = False) -> dict:
+    """Normalize parsed entities payload into the shape downstream code expects.
+
+    When ``schema_mode`` is True the caller passed a json_schema response_format
+    so the wire shape is contractually fixed: we return the parsed dict
+    unchanged. Otherwise we apply the legacy free-text fallback that wraps
+    string entities into dicts for the default scene_prompt path.
+    """
     if not isinstance(entities, dict):
         return {}
 
+    if schema_mode:
+        return dict(entities)
+
     res = dict(entities)
     for k in ("persons", "vehicles", "animals"):
         items = res.get(k) or []
@@ -686,7 +699,14 @@ async def _crop_describe_inner(req: CropDescribeRequest):
 
     if zoom_panels:
         scene_prompt = _format_zoom_panel_context(zoom_panels) + "\n\n" + scene_prompt
-    elif req.crops and "YOLO detections" not in scene_prompt:
+    elif (
+        req.crops
+        and req.response_format is None
+        and "YOLO detections" not in scene_prompt
+    ):
+        # Schema callers (cortex) already enumerate detections in their own
+        # prompt — auto-prepending a second YOLO bbox table duplicates the
+        # visual hint and the differing format confuses the model.
         yolo_context = _format_yolo_detection_context(req.crops)
         if yolo_context:
             scene_prompt = yolo_context + "\n\n" + scene_prompt
@@ -748,7 +768,7 @@ async def _crop_describe_inner(req: CropDescribeRequest):
             clean[:80],
             clean[-80:],
         )
-    entities = _normalize_entities(entities)
+    entities = _normalize_entities(entities, schema_mode=req.response_format is not None)
 
     # Extract description — combine SCENE + ACTIVITIES + NOTABLE into rich description
     scene_line = ""
diff --git a/tests/test_inference_router.py b/tests/test_inference_router.py
index b4fc0db..de202d3 100644
--- a/tests/test_inference_router.py
+++ b/tests/test_inference_router.py
@@ -57,6 +57,34 @@ def test_normalize_entities_converts_string_persons_to_dicts():
     ]
 
 
+def test_normalize_entities_schema_mode_preserves_empty_make():
+    # Regression: with a caller-supplied json_schema, trio-core must not
+    # invent fields or backfill empty `make` with description. Previously
+    # `setdefault("brand", "")` injected a brand key the schema forbids,
+    # and an empty `make` survived as empty rather than being recovered.
+    raw = {
+        "vehicles": [
+            {"id": "nv0", "type": "suv", "action": "parked", "make": ""}
+        ]
+    }
+    entities = inference._normalize_entities(raw, schema_mode=True)
+
+    assert entities["vehicles"] == [
+        {"id": "nv0", "type": "suv", "action": "parked", "make": ""}
+    ]
+    assert "brand" not in entities["vehicles"][0]
+
+
+def test_normalize_entities_schema_mode_drops_extra_keys_in_passthrough():
+    # Schema mode must not iterate animals (or any key the caller's schema
+    # didn't declare). Pass-through means: whatever the model emitted is
+    # what downstream consumers see, no trio-core invention.
+    raw = {"vehicles": [{"id": "nv0", "make": "Audi"}], "animals": []}
+    entities = inference._normalize_entities(raw, schema_mode=True)
+
+    assert entities == raw
+
+
 def _image_b64(width: int = 120, height: int = 80) -> str:
     image = np.zeros((height, width, 3), dtype=np.uint8)
     image[:] = (20, 40, 60)
@@ -242,6 +270,60 @@ async def test_crop_describe_plain_prose_still_uses_300_char_slice(monkeypatch):
     assert response.description == "A quiet street at dawn with no visible activity."
 
 
+@pytest.mark.asyncio
+async def test_crop_describe_schema_mode_skips_yolo_context_prepend(monkeypatch):
+    # Schema callers (cortex) already enumerate their own DETECTIONS block in
+    # scene_prompt. trio-core must not auto-prepend a second YOLO bbox table
+    # — the duplicate hints degrade attribute extraction.
+    engine = MagicMock()
+    engine._profile = SimpleNamespace(merge_factor=32)
+    engine.analyze_frame.return_value = VideoResult(
+        text='{"summary":"x","scene_type":"car_wash","activity_level":"quiet",'
+        '"persons":[],"vehicles":[],"no_significant_change":false}',
+        metrics=InferenceMetrics(latency_ms=100.0),
+    )
+    monkeypatch.setattr(inference, "_get_vlm", lambda: engine)
+
+    caller_prompt = "Use DETECTIONS as row anchors.\nDETECTIONS:\n  id=nv0 vehicle bbox=[0,0,10,10]"
+    req = inference.CropDescribeRequest(
+        image_b64=_image_b64(),
+        crops=[{"bbox": [0, 0, 10, 10], "class": "car", "confidence": 0.9}],
+        max_crops=0,
+        scene_prompt=caller_prompt,
+        response_format={"type": "json_schema", "json_schema": {"name": "scene", "strict": True, "schema": {}}},
+    )
+
+    await inference._crop_describe_inner(req)
+
+    sent_prompt = engine.analyze_frame.call_args.args[1]
+    assert "YOLO detections to use as visual hints" not in sent_prompt
+    assert sent_prompt == caller_prompt
+
+
+@pytest.mark.asyncio
+async def test_crop_describe_default_prompt_still_gets_yolo_context(monkeypatch):
+    # Without response_format the legacy free-text scene_prompt benefits from
+    # the auto-prepended YOLO context. Preserve that path.
+    engine = MagicMock()
+    engine._profile = SimpleNamespace(merge_factor=32)
+    engine.analyze_frame.return_value = VideoResult(
+        text="SCENE: x\nACTIVITIES: y\nNOTABLE: nothing unusual",
+        metrics=InferenceMetrics(latency_ms=100.0),
+    )
+    monkeypatch.setattr(inference, "_get_vlm", lambda: engine)
+
+    req = inference.CropDescribeRequest(
+        image_b64=_image_b64(),
+        crops=[{"bbox": [0, 0, 10, 10], "class": "car", "confidence": 0.9}],
+        max_crops=0,
+    )
+
+    await inference._crop_describe_inner(req)
+
+    sent_prompt = engine.analyze_frame.call_args.args[1]
+    assert "YOLO detections to use as visual hints" in sent_prompt
+
+
 @pytest.mark.asyncio
 async def test_crop_describe_max_crops_zero_keeps_single_full_frame(monkeypatch):
     engine = MagicMock()

From da74d32f1674a07e9eb3e07a2dc8dce02ec88cb2 Mon Sep 17 00:00:00 2001
From: Liuhaai <haixiang@iotex.io>
Date: Tue, 19 May 2026 15:28:27 -0700
Subject: [PATCH 2/2] style: apply ruff format

---
 src/trio_core/api/routers/inference.py |  6 +-----
 tests/test_inference_router.py         | 15 ++++++---------
 2 files changed, 7 insertions(+), 14 deletions(-)

diff --git a/src/trio_core/api/routers/inference.py b/src/trio_core/api/routers/inference.py
index 0a5c97b..95c490f 100644
--- a/src/trio_core/api/routers/inference.py
+++ b/src/trio_core/api/routers/inference.py
@@ -699,11 +699,7 @@ async def _crop_describe_inner(req: CropDescribeRequest):
 
     if zoom_panels:
         scene_prompt = _format_zoom_panel_context(zoom_panels) + "\n\n" + scene_prompt
-    elif (
-        req.crops
-        and req.response_format is None
-        and "YOLO detections" not in scene_prompt
-    ):
+    elif req.crops and req.response_format is None and "YOLO detections" not in scene_prompt:
         # Schema callers (cortex) already enumerate detections in their own
         # prompt — auto-prepending a second YOLO bbox table duplicates the
         # visual hint and the differing format confuses the model.
diff --git a/tests/test_inference_router.py b/tests/test_inference_router.py
index de202d3..d3a3c7b 100644
--- a/tests/test_inference_router.py
+++ b/tests/test_inference_router.py
@@ -62,16 +62,10 @@ def test_normalize_entities_schema_mode_preserves_empty_make():
     # invent fields or backfill empty `make` with description. Previously
     # `setdefault("brand", "")` injected a brand key the schema forbids,
     # and an empty `make` survived as empty rather than being recovered.
-    raw = {
-        "vehicles": [
-            {"id": "nv0", "type": "suv", "action": "parked", "make": ""}
-        ]
-    }
+    raw = {"vehicles": [{"id": "nv0", "type": "suv", "action": "parked", "make": ""}]}
     entities = inference._normalize_entities(raw, schema_mode=True)
 
-    assert entities["vehicles"] == [
-        {"id": "nv0", "type": "suv", "action": "parked", "make": ""}
-    ]
+    assert entities["vehicles"] == [{"id": "nv0", "type": "suv", "action": "parked", "make": ""}]
     assert "brand" not in entities["vehicles"][0]
 
 
@@ -290,7 +284,10 @@ async def test_crop_describe_schema_mode_skips_yolo_context_prepend(monkeypatch)
         crops=[{"bbox": [0, 0, 10, 10], "class": "car", "confidence": 0.9}],
         max_crops=0,
         scene_prompt=caller_prompt,
-        response_format={"type": "json_schema", "json_schema": {"name": "scene", "strict": True, "schema": {}}},
+        response_format={
+            "type": "json_schema",
+            "json_schema": {"name": "scene", "strict": True, "schema": {}},
+        },
     )
 
     await inference._crop_describe_inner(req)