Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 21 additions & 5 deletions src/trio_core/api/routers/inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -560,7 +560,11 @@ def _extract_crop_descriptions(text: str, panels: list[dict]) -> list[str]:


def _normalize_entity_item(kind: str, item) -> dict:
"""Normalize model-emitted entity items (strings or dicts) to a standard dict."""
"""Normalize a free-text-mode entity item into the default-prompt shape.

Only called when no caller-supplied JSON schema is in effect. With a schema,
the caller owns the wire shape and trio-core must not invent fields.
"""
if isinstance(item, dict):
norm = dict(item)
else:
Expand All @@ -581,11 +585,20 @@ def _normalize_entity_item(kind: str, item) -> dict:
return norm


def _normalize_entities(entities: dict | None) -> dict:
"""Normalize parsed entities payload into the shape downstream code expects."""
def _normalize_entities(entities: dict | None, *, schema_mode: bool = False) -> dict:
"""Normalize parsed entities payload into the shape downstream code expects.

When ``schema_mode`` is True the caller passed a json_schema response_format
so the wire shape is contractually fixed: we return the parsed dict
unchanged. Otherwise we apply the legacy free-text fallback that wraps
string entities into dicts for the default scene_prompt path.
"""
if not isinstance(entities, dict):
return {}

if schema_mode:
return dict(entities)

res = dict(entities)
for k in ("persons", "vehicles", "animals"):
items = res.get(k) or []
Expand Down Expand Up @@ -686,7 +699,10 @@ async def _crop_describe_inner(req: CropDescribeRequest):

if zoom_panels:
scene_prompt = _format_zoom_panel_context(zoom_panels) + "\n\n" + scene_prompt
elif req.crops and "YOLO detections" not in scene_prompt:
elif req.crops and req.response_format is None and "YOLO detections" not in scene_prompt:
# Schema callers (cortex) already enumerate detections in their own
# prompt — auto-prepending a second YOLO bbox table duplicates the
# visual hint and the differing format confuses the model.
yolo_context = _format_yolo_detection_context(req.crops)
if yolo_context:
scene_prompt = yolo_context + "\n\n" + scene_prompt
Expand Down Expand Up @@ -748,7 +764,7 @@ async def _crop_describe_inner(req: CropDescribeRequest):
clean[:80],
clean[-80:],
)
entities = _normalize_entities(entities)
entities = _normalize_entities(entities, schema_mode=req.response_format is not None)

# Extract description — combine SCENE + ACTIVITIES + NOTABLE into rich description
scene_line = ""
Expand Down
79 changes: 79 additions & 0 deletions tests/test_inference_router.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,28 @@ def test_normalize_entities_converts_string_persons_to_dicts():
]


def test_normalize_entities_schema_mode_preserves_empty_make():
    # Regression guard for the caller-supplied json_schema path: the
    # normalizer has to hand back vehicle rows exactly as the model emitted
    # them. An empty `make` stays empty (no backfill) and no `brand` key is
    # added, because the caller's schema owns the wire shape.
    vehicle_row = {"id": "nv0", "type": "suv", "action": "parked", "make": ""}
    payload = {"vehicles": [vehicle_row]}

    normalized = inference._normalize_entities(payload, schema_mode=True)
    vehicles_out = normalized["vehicles"]

    # Compare against a fresh literal so in-place mutation of the input row
    # cannot mask a failure.
    assert vehicles_out == [{"id": "nv0", "type": "suv", "action": "parked", "make": ""}]
    assert "brand" not in vehicles_out[0]


def test_normalize_entities_schema_mode_drops_extra_keys_in_passthrough():
    # What this pins is pure pass-through: in schema mode the normalizer
    # must not walk `animals` (or any other key) and rewrite its items.
    # Whatever the model emitted is exactly what downstream consumers see,
    # so the result has to compare equal to the input payload.
    vehicles = [{"id": "nv0", "make": "Audi"}]
    payload = {"vehicles": vehicles, "animals": []}

    result = inference._normalize_entities(payload, schema_mode=True)

    assert result == payload


def _image_b64(width: int = 120, height: int = 80) -> str:
image = np.zeros((height, width, 3), dtype=np.uint8)
image[:] = (20, 40, 60)
Expand Down Expand Up @@ -242,6 +264,63 @@ async def test_crop_describe_plain_prose_still_uses_300_char_slice(monkeypatch):
assert response.description == "A quiet street at dawn with no visible activity."


@pytest.mark.asyncio
async def test_crop_describe_schema_mode_skips_yolo_context_prepend(monkeypatch):
    # When a response_format is supplied the caller's scene_prompt already
    # carries its own DETECTIONS block, so the prompt handed to the engine
    # must be the caller's text verbatim, with no auto-prepended YOLO bbox table.
    model_reply = (
        '{"summary":"x","scene_type":"car_wash","activity_level":"quiet",'
        '"persons":[],"vehicles":[],"no_significant_change":false}'
    )
    vlm = MagicMock()
    vlm._profile = SimpleNamespace(merge_factor=32)
    vlm.analyze_frame.return_value = VideoResult(
        text=model_reply,
        metrics=InferenceMetrics(latency_ms=100.0),
    )
    monkeypatch.setattr(inference, "_get_vlm", lambda: vlm)

    caller_prompt = "Use DETECTIONS as row anchors.\nDETECTIONS:\n id=nv0 vehicle bbox=[0,0,10,10]"
    detection = {"bbox": [0, 0, 10, 10], "class": "car", "confidence": 0.9}
    schema_format = {
        "type": "json_schema",
        "json_schema": {"name": "scene", "strict": True, "schema": {}},
    }
    request = inference.CropDescribeRequest(
        image_b64=_image_b64(),
        crops=[detection],
        max_crops=0,
        scene_prompt=caller_prompt,
        response_format=schema_format,
    )

    await inference._crop_describe_inner(request)

    # The prompt is the second positional argument of analyze_frame.
    positional = vlm.analyze_frame.call_args.args
    sent_prompt = positional[1]
    assert "YOLO detections to use as visual hints" not in sent_prompt
    assert sent_prompt == caller_prompt


@pytest.mark.asyncio
async def test_crop_describe_default_prompt_still_gets_yolo_context(monkeypatch):
    # Counterpart to the schema-mode test: with no response_format the
    # legacy free-text prompt path must keep receiving the auto-prepended
    # YOLO detection context.
    vlm = MagicMock()
    vlm._profile = SimpleNamespace(merge_factor=32)
    free_text_reply = VideoResult(
        text="SCENE: x\nACTIVITIES: y\nNOTABLE: nothing unusual",
        metrics=InferenceMetrics(latency_ms=100.0),
    )
    vlm.analyze_frame.return_value = free_text_reply
    monkeypatch.setattr(inference, "_get_vlm", lambda: vlm)

    detection = {"bbox": [0, 0, 10, 10], "class": "car", "confidence": 0.9}
    request = inference.CropDescribeRequest(
        image_b64=_image_b64(),
        crops=[detection],
        max_crops=0,
    )

    await inference._crop_describe_inner(request)

    # The prompt is the second positional argument of analyze_frame.
    positional = vlm.analyze_frame.call_args.args
    sent_prompt = positional[1]
    assert "YOLO detections to use as visual hints" in sent_prompt


@pytest.mark.asyncio
async def test_crop_describe_max_crops_zero_keeps_single_full_frame(monkeypatch):
engine = MagicMock()
Expand Down
Loading