machinefi · Liuhaai · May 19, 2026 · May 19, 2026 · May 19, 2026
diff --git a/src/trio_core/api/routers/inference.py b/src/trio_core/api/routers/inference.py
@@ -560,7 +560,11 @@ def _extract_crop_descriptions(text: str, panels: list[dict]) -> list[str]:
 
 
 def _normalize_entity_item(kind: str, item) -> dict:
-    """Normalize model-emitted entity items (strings or dicts) to a standard dict."""
+    """Normalize a free-text-mode entity item into the default-prompt shape.
+
+    Only called when no caller-supplied JSON schema is in effect. With a schema,
+    the caller owns the wire shape and trio-core must not invent fields.
+    """
     if isinstance(item, dict):
         norm = dict(item)
     else:
@@ -581,11 +585,20 @@ def _normalize_entity_item(kind: str, item) -> dict:
     return norm
 
 
-def _normalize_entities(entities: dict | None) -> dict:
-    """Normalize parsed entities payload into the shape downstream code expects."""
+def _normalize_entities(entities: dict | None, *, schema_mode: bool = False) -> dict:
+    """Normalize parsed entities payload into the shape downstream code expects.
+
+    Non-dict input yields ``{}``. With ``schema_mode`` True the caller passed a
+    json_schema response_format, so the wire shape is contractually fixed: a
+    shallow copy of the parsed dict is returned with no fields touched. Otherwise
+    the legacy free-text fallback wraps string entities into dicts (default scene_prompt).
+    """
     if not isinstance(entities, dict):
         return {}
 
+    if schema_mode:
+        return dict(entities)
+
     res = dict(entities)
     for k in ("persons", "vehicles", "animals"):
         items = res.get(k) or []
@@ -686,7 +699,10 @@ async def _crop_describe_inner(req: CropDescribeRequest):
 
     if zoom_panels:
         scene_prompt = _format_zoom_panel_context(zoom_panels) + "\n\n" + scene_prompt
-    elif req.crops and "YOLO detections" not in scene_prompt:
+    elif req.crops and req.response_format is None and "YOLO detections" not in scene_prompt:
+        # Schema callers (cortex) already enumerate detections in their own
+        # prompt — auto-prepending a second YOLO bbox table duplicates the
+        # visual hint and the differing format confuses the model.
         yolo_context = _format_yolo_detection_context(req.crops)
         if yolo_context:
             scene_prompt = yolo_context + "\n\n" + scene_prompt
@@ -748,7 +764,7 @@ async def _crop_describe_inner(req: CropDescribeRequest):
             clean[:80],
             clean[-80:],
         )
-    entities = _normalize_entities(entities)
+    entities = _normalize_entities(entities, schema_mode=req.response_format is not None)
 
     # Extract description — combine SCENE + ACTIVITIES + NOTABLE into rich description
     scene_line = ""

diff --git a/tests/test_inference_router.py b/tests/test_inference_router.py
@@ -57,6 +57,28 @@ def test_normalize_entities_converts_string_persons_to_dicts():
     ]
 
 
+def test_normalize_entities_schema_mode_preserves_empty_make():
+    # Regression: with a caller-supplied json_schema, trio-core must not
+    # invent fields or backfill an empty `make` from the description.
+    # Previously `setdefault("brand", "")` injected a brand key the schema
+    # forbids; in schema mode each item, empty `make` included, stays as-is.
+    raw = {"vehicles": [{"id": "nv0", "type": "suv", "action": "parked", "make": ""}]}
+    entities = inference._normalize_entities(raw, schema_mode=True)
+
+    assert entities["vehicles"] == [{"id": "nv0", "type": "suv", "action": "parked", "make": ""}]
+    assert "brand" not in entities["vehicles"][0]
+
+
+def test_normalize_entities_schema_mode_passes_all_keys_through_unchanged():
+    # Schema mode must not rewrite `animals` (or any other key), nor add
+    # keys the caller's schema didn't declare. Pass-through means: whatever
+    # the model emitted is what downstream consumers see, nothing invented.
+    raw = {"vehicles": [{"id": "nv0", "make": "Audi"}], "animals": []}
+    entities = inference._normalize_entities(raw, schema_mode=True)
+
+    assert entities == raw
+
+
 def _image_b64(width: int = 120, height: int = 80) -> str:
     image = np.zeros((height, width, 3), dtype=np.uint8)
     image[:] = (20, 40, 60)
@@ -242,6 +264,63 @@ async def test_crop_describe_plain_prose_still_uses_300_char_slice(monkeypatch):
     assert response.description == "A quiet street at dawn with no visible activity."
 
 
+@pytest.mark.asyncio
+async def test_crop_describe_schema_mode_skips_yolo_context_prepend(monkeypatch):
+    # Schema callers (cortex) already enumerate their own DETECTIONS block in
+    # scene_prompt. trio-core must not auto-prepend a second YOLO bbox table
+    # — the duplicate hints degrade attribute extraction.
+    engine = MagicMock()
+    engine._profile = SimpleNamespace(merge_factor=32)
+    engine.analyze_frame.return_value = VideoResult(
+        text='{"summary":"x","scene_type":"car_wash","activity_level":"quiet",'
+        '"persons":[],"vehicles":[],"no_significant_change":false}',
+        metrics=InferenceMetrics(latency_ms=100.0),
+    )
+    monkeypatch.setattr(inference, "_get_vlm", lambda: engine)
+
+    caller_prompt = "Use DETECTIONS as row anchors.\nDETECTIONS:\n  id=nv0 vehicle bbox=[0,0,10,10]"
+    req = inference.CropDescribeRequest(
+        image_b64=_image_b64(),
+        crops=[{"bbox": [0, 0, 10, 10], "class": "car", "confidence": 0.9}],
+        max_crops=0,  # assumes 0 yields no zoom panels (no zoom prefix) — confirm
+        scene_prompt=caller_prompt,
+        response_format={
+            "type": "json_schema",
+            "json_schema": {"name": "scene", "strict": True, "schema": {}},
+        },
+    )
+
+    await inference._crop_describe_inner(req)
+
+    sent_prompt = engine.analyze_frame.call_args.args[1]  # 2nd positional arg of the call
+    assert "YOLO detections to use as visual hints" not in sent_prompt
+    assert sent_prompt == caller_prompt
+
+
+@pytest.mark.asyncio
+async def test_crop_describe_default_prompt_still_gets_yolo_context(monkeypatch):
+    # Without response_format the legacy free-text scene_prompt benefits from
+    # the auto-prepended YOLO context. Preserve that path.
+    engine = MagicMock()
+    engine._profile = SimpleNamespace(merge_factor=32)
+    engine.analyze_frame.return_value = VideoResult(
+        text="SCENE: x\nACTIVITIES: y\nNOTABLE: nothing unusual",
+        metrics=InferenceMetrics(latency_ms=100.0),
+    )
+    monkeypatch.setattr(inference, "_get_vlm", lambda: engine)
+
+    req = inference.CropDescribeRequest(
+        image_b64=_image_b64(),
+        crops=[{"bbox": [0, 0, 10, 10], "class": "car", "confidence": 0.9}],
+        max_crops=0,  # assumes 0 yields no zoom panels, so the YOLO branch runs — confirm
+    )
+
+    await inference._crop_describe_inner(req)
+
+    sent_prompt = engine.analyze_frame.call_args.args[1]  # 2nd positional arg of the call
+    assert "YOLO detections to use as visual hints" in sent_prompt
+
+
 @pytest.mark.asyncio
 async def test_crop_describe_max_crops_zero_keeps_single_full_frame(monkeypatch):
     engine = MagicMock()