machinefi · Liuhaai · May 18, 2026 · May 18, 2026
diff --git a/src/trio_core/api/routers/inference.py b/src/trio_core/api/routers/inference.py
@@ -245,6 +245,25 @@ class CropDescribeRequest(BaseModel):
             "to remote backends; ignored (with a warning) by local backends."
         ),
     )
+    max_tokens: int | None = Field(
+        default=None,
+        ge=1,
+        le=16384,
+        description=(
+            "Maximum tokens to generate. None falls back to engine default "
+            "(512), which is too small for the structured-JSON scene schema "
+            "and silently truncates responses mid-output — observed as "
+            "'Expecting , delimiter' parse failures in cortex."
+        ),
+    )
+    extra_body: dict | None = Field(
+        default=None,
+        description=(
+            "Backend-specific kwargs forwarded as the OpenAI SDK extra_body "
+            "(e.g. DashScope's enable_thinking). Honored by RemoteHTTPBackend; "
+            "ignored by local backends."
+        ),
+    )
 
 
 class CropDescribeResponse(BaseModel):
@@ -680,8 +699,10 @@ async def _crop_describe_inner(req: CropDescribeRequest):
             engine.analyze_frame,
             frame_chw,
             scene_prompt,
+            max_tokens=req.max_tokens,
             response_format=req.response_format,
             model=req.model,
+            extra_body=req.extra_body,
         ),
     )
     text = _strip_thinking(result.text or "")

diff --git a/tests/test_inference_router.py b/tests/test_inference_router.py
@@ -107,6 +107,41 @@ async def test_crop_describe_uses_single_composite_vlm_call(monkeypatch):
     assert response.entities["people_count"] == 1
 
 
+@pytest.mark.asyncio
+async def test_crop_describe_forwards_max_tokens_and_extra_body(monkeypatch):
+    """Regression: req.max_tokens and req.extra_body were silently dropped
+    before reaching engine.analyze_frame, so structured-JSON prompts hit
+    the engine default (512) and truncated mid-output. Cortex observed
+    2131 'Expecting , delimiter' parse failures in /tmp/vlm.log over a
+    single night because of this. Forwarding both keeps crop-describe
+    consistent with /describe at the same router."""
+    engine = MagicMock()  # stand-in engine; records how analyze_frame is called
+    engine._profile = SimpleNamespace(merge_factor=32)  # NOTE(review): presumably read by the router's frame prep - confirm
+    engine.analyze_frame.return_value = VideoResult(
+        text='{"summary":"x","scene_type":"s","activity_level":"quiet"}',
+        metrics=InferenceMetrics(latency_ms=10.0),
+    )
+    monkeypatch.setattr(inference, "_get_vlm", lambda: engine)  # inference._get_vlm() now returns the mock
+
+    req = inference.CropDescribeRequest(
+        image_b64=_image_b64(),
+        crops=[],
+        max_crops=0,
+        max_tokens=4096,  # must arrive at analyze_frame unchanged
+        extra_body={"enable_thinking": False},  # must arrive at analyze_frame unchanged
+    )
+
+    await inference._crop_describe_inner(req)
+
+    assert engine.analyze_frame.call_count == 1  # exactly one engine call is expected
+    kwargs = engine.analyze_frame.call_args.kwargs  # keyword arguments of the most recent call
+    assert kwargs["max_tokens"] == 4096, (
+        "max_tokens must reach analyze_frame; default-512 truncation was the "
+        "root cause of the JSON parse failures."
+    )
+    assert kwargs["extra_body"] == {"enable_thinking": False}
+
+
 @pytest.mark.asyncio
 async def test_crop_describe_uses_summary_field_from_scene_schema(monkeypatch):
     """SCENE_SCHEMA output (lowercase `summary`) should populate description.