diff --git a/src/trio_core/api/routers/inference.py b/src/trio_core/api/routers/inference.py index 420d23e..95c490f 100644 --- a/src/trio_core/api/routers/inference.py +++ b/src/trio_core/api/routers/inference.py @@ -560,7 +560,11 @@ def _extract_crop_descriptions(text: str, panels: list[dict]) -> list[str]: def _normalize_entity_item(kind: str, item) -> dict: - """Normalize model-emitted entity items (strings or dicts) to a standard dict.""" + """Normalize a free-text-mode entity item into the default-prompt shape. + + Only called when no caller-supplied JSON schema is in effect. With a schema, + the caller owns the wire shape and trio-core must not invent fields. + """ if isinstance(item, dict): norm = dict(item) else: @@ -581,11 +585,20 @@ def _normalize_entity_item(kind: str, item) -> dict: return norm -def _normalize_entities(entities: dict | None) -> dict: - """Normalize parsed entities payload into the shape downstream code expects.""" +def _normalize_entities(entities: dict | None, *, schema_mode: bool = False) -> dict: + """Normalize parsed entities payload into the shape downstream code expects. + + When ``schema_mode`` is True the caller passed a json_schema response_format + so the wire shape is contractually fixed: we return a shallow copy of the + parsed dict, unchanged. Otherwise we apply the legacy free-text fallback + that wraps string entities into dicts for the default scene_prompt path. 
+ """ if not isinstance(entities, dict): return {} + if schema_mode: + return dict(entities) + res = dict(entities) for k in ("persons", "vehicles", "animals"): items = res.get(k) or [] @@ -686,7 +699,10 @@ async def _crop_describe_inner(req: CropDescribeRequest): if zoom_panels: scene_prompt = _format_zoom_panel_context(zoom_panels) + "\n\n" + scene_prompt - elif req.crops and "YOLO detections" not in scene_prompt: + elif req.crops and req.response_format is None and "YOLO detections" not in scene_prompt: + # Schema callers (cortex) already enumerate detections in their own + # prompt — auto-prepending a second YOLO bbox table duplicates the + # visual hint and the differing format confuses the model. yolo_context = _format_yolo_detection_context(req.crops) if yolo_context: scene_prompt = yolo_context + "\n\n" + scene_prompt @@ -748,7 +764,7 @@ async def _crop_describe_inner(req: CropDescribeRequest): clean[:80], clean[-80:], ) - entities = _normalize_entities(entities) + entities = _normalize_entities(entities, schema_mode=req.response_format is not None) # Extract description — combine SCENE + ACTIVITIES + NOTABLE into rich description scene_line = "" diff --git a/tests/test_inference_router.py b/tests/test_inference_router.py index b4fc0db..d3a3c7b 100644 --- a/tests/test_inference_router.py +++ b/tests/test_inference_router.py @@ -57,6 +57,28 @@ def test_normalize_entities_converts_string_persons_to_dicts(): ] +def test_normalize_entities_schema_mode_preserves_empty_make(): + # Regression: with a caller-supplied json_schema, trio-core must not + # invent fields or backfill empty `make` with description. Previously + # `setdefault("brand", "")` injected a brand key the schema forbids, + # and an empty `make` survived as empty rather than being recovered. 
+ raw = {"vehicles": [{"id": "nv0", "type": "suv", "action": "parked", "make": ""}]} + entities = inference._normalize_entities(raw, schema_mode=True) + + assert entities["vehicles"] == [{"id": "nv0", "type": "suv", "action": "parked", "make": ""}] + assert "brand" not in entities["vehicles"][0] + + +def test_normalize_entities_schema_mode_drops_extra_keys_in_passthrough(): + # Pass-through check (nothing is dropped, despite the test name): schema + # mode must not iterate animals or any other key. Whatever the model + # emitted is what downstream consumers see, no trio-core invention. + raw = {"vehicles": [{"id": "nv0", "make": "Audi"}], "animals": []} + entities = inference._normalize_entities(raw, schema_mode=True) + + assert entities == raw + + def _image_b64(width: int = 120, height: int = 80) -> str: image = np.zeros((height, width, 3), dtype=np.uint8) image[:] = (20, 40, 60) @@ -242,6 +264,63 @@ async def test_crop_describe_plain_prose_still_uses_300_char_slice(monkeypatch): assert response.description == "A quiet street at dawn with no visible activity." +@pytest.mark.asyncio +async def test_crop_describe_schema_mode_skips_yolo_context_prepend(monkeypatch): + # Schema callers (cortex) already enumerate their own DETECTIONS block in + # scene_prompt. trio-core must not auto-prepend a second YOLO bbox table + # — the duplicate hints degrade attribute extraction. 
+ engine = MagicMock() + engine._profile = SimpleNamespace(merge_factor=32) + engine.analyze_frame.return_value = VideoResult( + text='{"summary":"x","scene_type":"car_wash","activity_level":"quiet",' + '"persons":[],"vehicles":[],"no_significant_change":false}', + metrics=InferenceMetrics(latency_ms=100.0), + ) + monkeypatch.setattr(inference, "_get_vlm", lambda: engine) + + caller_prompt = "Use DETECTIONS as row anchors.\nDETECTIONS:\n id=nv0 vehicle bbox=[0,0,10,10]" + req = inference.CropDescribeRequest( + image_b64=_image_b64(), + crops=[{"bbox": [0, 0, 10, 10], "class": "car", "confidence": 0.9}], + max_crops=0, + scene_prompt=caller_prompt, + response_format={ + "type": "json_schema", + "json_schema": {"name": "scene", "strict": True, "schema": {}}, + }, + ) + + await inference._crop_describe_inner(req) + + sent_prompt = engine.analyze_frame.call_args.args[1] + assert "YOLO detections to use as visual hints" not in sent_prompt + assert sent_prompt == caller_prompt + + +@pytest.mark.asyncio +async def test_crop_describe_default_prompt_still_gets_yolo_context(monkeypatch): + # Without response_format the legacy free-text scene_prompt benefits from + # the auto-prepended YOLO context. Preserve that path. + engine = MagicMock() + engine._profile = SimpleNamespace(merge_factor=32) + engine.analyze_frame.return_value = VideoResult( + text="SCENE: x\nACTIVITIES: y\nNOTABLE: nothing unusual", + metrics=InferenceMetrics(latency_ms=100.0), + ) + monkeypatch.setattr(inference, "_get_vlm", lambda: engine) + + req = inference.CropDescribeRequest( + image_b64=_image_b64(), + crops=[{"bbox": [0, 0, 10, 10], "class": "car", "confidence": 0.9}], + max_crops=0, + ) + + await inference._crop_describe_inner(req) + + sent_prompt = engine.analyze_frame.call_args.args[1] + assert "YOLO detections to use as visual hints" in sent_prompt + + @pytest.mark.asyncio async def test_crop_describe_max_crops_zero_keeps_single_full_frame(monkeypatch): engine = MagicMock()