From 80b3f6cf7b46f783a5b8a51bd00855b0806e3f33 Mon Sep 17 00:00:00 2001 From: Liuhaai Date: Tue, 19 May 2026 15:26:28 -0700 Subject: [PATCH 1/2] fix(inference): stop polluting schema-mode crop-describe responses MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When cortex calls /crop-describe with a json_schema response_format, trio-core was silently mutating the exchange in two ways (one on the wire response, one on the prompt sent to the VLM) that caused production VLM outputs to lose attributes the model had actually identified: 1. **`_normalize_entity_item` injected schema-forbidden fields.** For every vehicle entity the normalizer ran `setdefault("brand", label)` even when the schema didn't declare a `brand` key, and `label` fell back to `description` (empty) when `make` was empty. Result on prod: model returned `{"id":"nv0","type":"suv","make":""}`; cortex received `{"id":"nv0","type":"suv","make":"","brand":""}`. The strict json_schema contract was bypassed post-VLM. 2. **YOLO-context auto-prepend duplicated the DETECTIONS hint.** Cortex already enumerates per-id detections in its scene_prompt. trio-core skipped the auto-prepend only when the prompt contained the literal substring `"YOLO detections"` (which cortex's prompt doesn't contain), so it prepended a second bbox table ahead of the caller's prompt. Empirical A/B on qwen3-vl-flash with the prod Audi frame: 3/3 trials emit `make: "Audi"` with the clean prompt, but only 1/3 trials with the duplicate-context prefix. Fix: gate both behaviors on `req.response_format is None`. Schema callers get the wire response back unchanged plus their prompt verbatim; the legacy free-text default scene_prompt path keeps the string-to-dict fallback and the YOLO hint prepend (verified by the existing tests, and by a new `test_crop_describe_default_prompt_still_gets_yolo_context`). Tests: - `test_normalize_entities_schema_mode_preserves_empty_make` — schema mode must not inject `brand` or rewrite empty `make`. 
- `test_normalize_entities_schema_mode_drops_extra_keys_in_passthrough` — schema mode does not iterate `animals` (or any other key). - `test_crop_describe_schema_mode_skips_yolo_context_prepend` — scene_prompt reaches the engine byte-identical. - `test_crop_describe_default_prompt_still_gets_yolo_context` — the legacy callers keep their auto-hint behavior. 419 passed, 7 skipped (no regressions). Co-Authored-By: Claude Opus 4.7 --- src/trio_core/api/routers/inference.py | 30 ++++++++-- tests/test_inference_router.py | 82 ++++++++++++++++++++++++++ 2 files changed, 107 insertions(+), 5 deletions(-) diff --git a/src/trio_core/api/routers/inference.py b/src/trio_core/api/routers/inference.py index 420d23e..0a5c97b 100644 --- a/src/trio_core/api/routers/inference.py +++ b/src/trio_core/api/routers/inference.py @@ -560,7 +560,11 @@ def _extract_crop_descriptions(text: str, panels: list[dict]) -> list[str]: def _normalize_entity_item(kind: str, item) -> dict: - """Normalize model-emitted entity items (strings or dicts) to a standard dict.""" + """Normalize a free-text-mode entity item into the default-prompt shape. + + Only called when no caller-supplied JSON schema is in effect. With a schema, + the caller owns the wire shape and trio-core must not invent fields. + """ if isinstance(item, dict): norm = dict(item) else: @@ -581,11 +585,20 @@ def _normalize_entity_item(kind: str, item) -> dict: return norm -def _normalize_entities(entities: dict | None) -> dict: - """Normalize parsed entities payload into the shape downstream code expects.""" +def _normalize_entities(entities: dict | None, *, schema_mode: bool = False) -> dict: + """Normalize parsed entities payload into the shape downstream code expects. + + When ``schema_mode`` is True the caller passed a json_schema response_format + so the wire shape is contractually fixed: we return the parsed dict + unchanged. 
Otherwise we apply the legacy free-text fallback that wraps + string entities into dicts for the default scene_prompt path. + """ if not isinstance(entities, dict): return {} + if schema_mode: + return dict(entities) + res = dict(entities) for k in ("persons", "vehicles", "animals"): items = res.get(k) or [] @@ -686,7 +699,14 @@ async def _crop_describe_inner(req: CropDescribeRequest): if zoom_panels: scene_prompt = _format_zoom_panel_context(zoom_panels) + "\n\n" + scene_prompt - elif req.crops and "YOLO detections" not in scene_prompt: + elif ( + req.crops + and req.response_format is None + and "YOLO detections" not in scene_prompt + ): + # Schema callers (cortex) already enumerate detections in their own + # prompt — auto-prepending a second YOLO bbox table duplicates the + # visual hint and the differing format confuses the model. yolo_context = _format_yolo_detection_context(req.crops) if yolo_context: scene_prompt = yolo_context + "\n\n" + scene_prompt @@ -748,7 +768,7 @@ async def _crop_describe_inner(req: CropDescribeRequest): clean[:80], clean[-80:], ) - entities = _normalize_entities(entities) + entities = _normalize_entities(entities, schema_mode=req.response_format is not None) # Extract description — combine SCENE + ACTIVITIES + NOTABLE into rich description scene_line = "" diff --git a/tests/test_inference_router.py b/tests/test_inference_router.py index b4fc0db..de202d3 100644 --- a/tests/test_inference_router.py +++ b/tests/test_inference_router.py @@ -57,6 +57,34 @@ def test_normalize_entities_converts_string_persons_to_dicts(): ] +def test_normalize_entities_schema_mode_preserves_empty_make(): + # Regression: with a caller-supplied json_schema, trio-core must not + # invent fields or backfill empty `make` with description. Previously + # `setdefault("brand", "")` injected a brand key the schema forbids, + # and an empty `make` survived as empty rather than being recovered. 
+ raw = { + "vehicles": [ + {"id": "nv0", "type": "suv", "action": "parked", "make": ""} + ] + } + entities = inference._normalize_entities(raw, schema_mode=True) + + assert entities["vehicles"] == [ + {"id": "nv0", "type": "suv", "action": "parked", "make": ""} + ] + assert "brand" not in entities["vehicles"][0] + + +def test_normalize_entities_schema_mode_drops_extra_keys_in_passthrough(): + # Schema mode must not iterate animals (or any key the caller's schema + # didn't declare). Pass-through means: whatever the model emitted is + # what downstream consumers see, no trio-core invention. + raw = {"vehicles": [{"id": "nv0", "make": "Audi"}], "animals": []} + entities = inference._normalize_entities(raw, schema_mode=True) + + assert entities == raw + + def _image_b64(width: int = 120, height: int = 80) -> str: image = np.zeros((height, width, 3), dtype=np.uint8) image[:] = (20, 40, 60) @@ -242,6 +270,60 @@ async def test_crop_describe_plain_prose_still_uses_300_char_slice(monkeypatch): assert response.description == "A quiet street at dawn with no visible activity." +@pytest.mark.asyncio +async def test_crop_describe_schema_mode_skips_yolo_context_prepend(monkeypatch): + # Schema callers (cortex) already enumerate their own DETECTIONS block in + # scene_prompt. trio-core must not auto-prepend a second YOLO bbox table + # — the duplicate hints degrade attribute extraction. 
+ engine = MagicMock() + engine._profile = SimpleNamespace(merge_factor=32) + engine.analyze_frame.return_value = VideoResult( + text='{"summary":"x","scene_type":"car_wash","activity_level":"quiet",' + '"persons":[],"vehicles":[],"no_significant_change":false}', + metrics=InferenceMetrics(latency_ms=100.0), + ) + monkeypatch.setattr(inference, "_get_vlm", lambda: engine) + + caller_prompt = "Use DETECTIONS as row anchors.\nDETECTIONS:\n id=nv0 vehicle bbox=[0,0,10,10]" + req = inference.CropDescribeRequest( + image_b64=_image_b64(), + crops=[{"bbox": [0, 0, 10, 10], "class": "car", "confidence": 0.9}], + max_crops=0, + scene_prompt=caller_prompt, + response_format={"type": "json_schema", "json_schema": {"name": "scene", "strict": True, "schema": {}}}, + ) + + await inference._crop_describe_inner(req) + + sent_prompt = engine.analyze_frame.call_args.args[1] + assert "YOLO detections to use as visual hints" not in sent_prompt + assert sent_prompt == caller_prompt + + +@pytest.mark.asyncio +async def test_crop_describe_default_prompt_still_gets_yolo_context(monkeypatch): + # Without response_format the legacy free-text scene_prompt benefits from + # the auto-prepended YOLO context. Preserve that path. 
+ engine = MagicMock() + engine._profile = SimpleNamespace(merge_factor=32) + engine.analyze_frame.return_value = VideoResult( + text="SCENE: x\nACTIVITIES: y\nNOTABLE: nothing unusual", + metrics=InferenceMetrics(latency_ms=100.0), + ) + monkeypatch.setattr(inference, "_get_vlm", lambda: engine) + + req = inference.CropDescribeRequest( + image_b64=_image_b64(), + crops=[{"bbox": [0, 0, 10, 10], "class": "car", "confidence": 0.9}], + max_crops=0, + ) + + await inference._crop_describe_inner(req) + + sent_prompt = engine.analyze_frame.call_args.args[1] + assert "YOLO detections to use as visual hints" in sent_prompt + + @pytest.mark.asyncio async def test_crop_describe_max_crops_zero_keeps_single_full_frame(monkeypatch): engine = MagicMock() From da74d32f1674a07e9eb3e07a2dc8dce02ec88cb2 Mon Sep 17 00:00:00 2001 From: Liuhaai Date: Tue, 19 May 2026 15:28:27 -0700 Subject: [PATCH 2/2] style: apply ruff format --- src/trio_core/api/routers/inference.py | 6 +----- tests/test_inference_router.py | 15 ++++++--------- 2 files changed, 7 insertions(+), 14 deletions(-) diff --git a/src/trio_core/api/routers/inference.py b/src/trio_core/api/routers/inference.py index 0a5c97b..95c490f 100644 --- a/src/trio_core/api/routers/inference.py +++ b/src/trio_core/api/routers/inference.py @@ -699,11 +699,7 @@ async def _crop_describe_inner(req: CropDescribeRequest): if zoom_panels: scene_prompt = _format_zoom_panel_context(zoom_panels) + "\n\n" + scene_prompt - elif ( - req.crops - and req.response_format is None - and "YOLO detections" not in scene_prompt - ): + elif req.crops and req.response_format is None and "YOLO detections" not in scene_prompt: # Schema callers (cortex) already enumerate detections in their own # prompt — auto-prepending a second YOLO bbox table duplicates the # visual hint and the differing format confuses the model. 
diff --git a/tests/test_inference_router.py b/tests/test_inference_router.py index de202d3..d3a3c7b 100644 --- a/tests/test_inference_router.py +++ b/tests/test_inference_router.py @@ -62,16 +62,10 @@ def test_normalize_entities_schema_mode_preserves_empty_make(): # invent fields or backfill empty `make` with description. Previously # `setdefault("brand", "")` injected a brand key the schema forbids, # and an empty `make` survived as empty rather than being recovered. - raw = { - "vehicles": [ - {"id": "nv0", "type": "suv", "action": "parked", "make": ""} - ] - } + raw = {"vehicles": [{"id": "nv0", "type": "suv", "action": "parked", "make": ""}]} entities = inference._normalize_entities(raw, schema_mode=True) - assert entities["vehicles"] == [ - {"id": "nv0", "type": "suv", "action": "parked", "make": ""} - ] + assert entities["vehicles"] == [{"id": "nv0", "type": "suv", "action": "parked", "make": ""}] assert "brand" not in entities["vehicles"][0] @@ -290,7 +284,10 @@ async def test_crop_describe_schema_mode_skips_yolo_context_prepend(monkeypatch) crops=[{"bbox": [0, 0, 10, 10], "class": "car", "confidence": 0.9}], max_crops=0, scene_prompt=caller_prompt, - response_format={"type": "json_schema", "json_schema": {"name": "scene", "strict": True, "schema": {}}}, + response_format={ + "type": "json_schema", + "json_schema": {"name": "scene", "strict": True, "schema": {}}, + }, ) await inference._crop_describe_inner(req)