brianmeyer · brianmeyer · May 17, 2026 · Mar 28, 2026 · Mar 28, 2026 · Mar 28, 2026
diff --git a/benchmarks/cross_modal_ablation.py b/benchmarks/cross_modal_ablation.py
diff --git a/docs/ENV_VARS.md b/docs/ENV_VARS.md
@@ -4,50 +4,79 @@ This is the canonical reference for all `RECALLFORGE_*` environment variables us
 
 ## Runtime selection
 
-- `RECALLFORGE_BACKEND`  
+- `RECALLFORGE_BACKEND`
   Backend selector: `auto` (default), `torch`, `mlx`.
 
-- `RECALLFORGE_MODE`  
+- `RECALLFORGE_MODE`
   Search mode: `embed` or `hybrid`.
 
-- `RECALLFORGE_MLX_QUANTIZE`  
+- `RECALLFORGE_MLX_QUANTIZE`
   MLX quantization mode: `bf16` or `4bit`.
 
-- `RECALLFORGE_DISABLE_MLX`  
+- `RECALLFORGE_DISABLE_MLX`
   Disable MLX backend probing when set to `1`.
 
-- `RECALLFORGE_STORAGE`  
+## MLX safety knobs
+
+- `RECALLFORGE_MLX_HEAVY_OP_CONCURRENCY`
+  Concurrency ceiling for the heaviest MLX multimodal operations. Default is `1` for local safety.
+
+- `RECALLFORGE_MLX_VIDEO_SAMPLE_FPS`
+  Sampling rate, in frames per second, for MLX raw-video processing. Lower values reduce memory pressure.
+
+- `RECALLFORGE_MLX_VIDEO_MAX_FRAMES`
+  Frame cap for MLX raw-video processing. The shipped default is intentionally conservative for local-agent use.
+
+- `RECALLFORGE_MLX_VIDEO_FALLBACK_MAX_FRAMES`
+  Frame cap for the ffmpeg-based frame-averaging fallback used when native video embedding is unavailable or downgraded.
+
+- `RECALLFORGE_MLX_MIN_PIXELS`
+  Lower bound for MLX processor visual resolution budgeting.
+
+- `RECALLFORGE_MLX_MAX_PIXELS`
+  Upper bound for MLX processor visual resolution budgeting.
+
+- `RECALLFORGE_ENABLE_MLX_NATIVE_VIDEO_PROCESSING`
+  Enable qwen-vl-utils native video decoding on MLX. Disabled by default for local safety; if you opt in, prefer `FORCE_QWENVL_VIDEO_READER=torchcodec`.
+
+- `RECALLFORGE_STORAGE`
   Storage backend selector (currently `lancedb`).
 
-- `RECALLFORGE_STORE_PATH`  
+- `RECALLFORGE_STORE_PATH`
   Path to the RecallForge data store.
 
 ## Search pipeline tuning
 
-- `RECALLFORGE_OVERFETCH_FACTOR`  
+- `RECALLFORGE_OVERFETCH_FACTOR`
   Candidate overfetch multiplier before final trim.
 
-- `RECALLFORGE_MAX_CANDIDATES`  
+- `RECALLFORGE_MAX_CANDIDATES`
   Hard cap for candidate pool size before reranking.
 
-- `RECALLFORGE_RERANK_TOP_K`  
+- `RECALLFORGE_RERANK_TOP_K`
   Number of top RRF candidates to rerank.
 
+- `RECALLFORGE_ENABLE_MEDIA_RERANKING`
+  Enable multimodal reranking for image/video-involved searches. Disabled by default.
+
+- `RECALLFORGE_ENABLE_RAW_VIDEO_QUERY_EMBEDDING`
+  Enable raw video query embedding. On MLX, RecallForge now defaults to safer caption/transcript-first retrieval unless you explicitly enable this.
+
 ## Server behavior
 
-- `RECALLFORGE_TRACE`  
+- `RECALLFORGE_TRACE`
   Enables trace logging for MCP tools when set to `1`.
 
-- `RECALLFORGE_MCP_MAX_CONCURRENCY`  
+- `RECALLFORGE_MCP_MAX_CONCURRENCY`
   Maximum number of blocking MCP tool operations run concurrently.
 
 ## Storage/FTS internals
 
-- `RECALLFORGE_BM25_FALLBACK_MAX_ROWS`  
+- `RECALLFORGE_BM25_FALLBACK_MAX_ROWS`
   Row limit used by BM25 fallback recovery paths.
 
-- `RECALLFORGE_BULK_FLUSH_DOCS`  
+- `RECALLFORGE_BULK_FLUSH_DOCS`
   Batch flush threshold for document table writes.
 
-- `RECALLFORGE_BULK_FLUSH_EMBEDDINGS`  
+- `RECALLFORGE_BULK_FLUSH_EMBEDDINGS`
   Batch flush threshold for embedding table writes.
diff --git a/docs/RELEASE.md b/docs/RELEASE.md
@@ -40,11 +40,61 @@ UAT_MCP_LIVE=1 .venv/bin/python -m pytest -q tests/uat/test_uat_comprehensive.py
 Then run the expanded benchmark:
 
 ```bash
-.venv/bin/python benchmarks/cross_modal_ablation.py --backend mlx --output benchmarks/results/cross_modal_ablation_results.json
+.venv/bin/python benchmarks/cross_modal_ablation.py --backend mlx --expansion-profile caption_only --output benchmarks/results/cross_modal_ablation_results.json
 ```
 
 The benchmark now checkpoints to JSON as it runs. If the run is interrupted, the output file still contains partial results plus progress metadata.
 
+For safer local validation after the MLX hardening work, prefer the bounded smoke lane first:
+
+```bash
+.venv/bin/python benchmarks/cross_modal_ablation.py --backend mlx --smoke-profile safe --expansion-profile caption_only
+```
+
+That profile defaults to a smaller stage/query footprint and can enforce an RSS stop condition:
+
+```bash
+.venv/bin/python benchmarks/cross_modal_ablation.py --backend mlx --smoke-profile safe --rss-limit-mb 6144
+```
+
+The output JSON now records:
+
+- `configuration.smoke_profile`
+- `configuration.rss_limit_mb`
+- `telemetry.peak_rss_mb`
+
+Use the full benchmark only after the safe smoke completes cleanly on the target machine.
+
+For query-expansion release decisions, compare at least these profiles:
+
+```bash
+.venv/bin/python benchmarks/cross_modal_ablation.py --backend mlx --expansion-profile caption_only
+.venv/bin/python benchmarks/cross_modal_ablation.py --backend mlx --expansion-profile heuristic
+.venv/bin/python benchmarks/cross_modal_ablation.py --backend mlx --expansion-profile qwen
+```
+
+Profile meanings:
+
+- `caption_only`: shipped default baseline for media queries. Text queries do not expand; image/video queries still use caption or transcript BM25 probes.
+- `heuristic`: opt-in expansion branches using the legacy heuristic rewrite fallback.
+- `qwen`: opt-in expansion branches using the backend `generate_text()` path when available.
+- `off`: pure no-expansion baseline, including no media caption probe, useful for measuring the value of caption/transcript query text itself.
+
+When you omit `--output`, the benchmark now uses profile-specific filenames for non-default runs, for example `cross_modal_ablation_results_qwen.json`.
+
+### MLX safety notes
+
+- MLX heavy multimodal operations are now intentionally serialized by default via `RECALLFORGE_MLX_HEAVY_OP_CONCURRENCY=1`.
+- On MLX, raw video query embedding is no longer the default hot path. RecallForge prefers caption/transcript-first retrieval unless `RECALLFORGE_ENABLE_RAW_VIDEO_QUERY_EMBEDDING=1` is set.
+- On MLX, qwen-vl-utils native video decoding is now also opt-in. RecallForge defaults to frame/caption fallbacks unless `RECALLFORGE_ENABLE_MLX_NATIVE_VIDEO_PROCESSING=1` is set.
+- If you do opt back into native MLX video decoding, prefer `FORCE_QWENVL_VIDEO_READER=torchcodec` per Qwen's upstream guidance.
+- The raw-video path now has explicit frame and pixel budget knobs:
+  - `RECALLFORGE_MLX_VIDEO_SAMPLE_FPS`
+  - `RECALLFORGE_MLX_VIDEO_MAX_FRAMES`
+  - `RECALLFORGE_MLX_VIDEO_FALLBACK_MAX_FRAMES`
+  - `RECALLFORGE_MLX_MIN_PIXELS`
+  - `RECALLFORGE_MLX_MAX_PIXELS`
+
 ## 4. Tag and publish
 
 1. Commit the release changes.

diff --git a/docs/mcp-tools.md b/docs/mcp-tools.md
@@ -149,7 +149,7 @@ Example MCP client config (Claude Desktop):
 | profile | string | No | — | Profile namespace filter |
 | intent | string (`exact_lookup`\|`semantic`\|`broad`) | No | — | Intent steering for RRF weights |
 | rerank_top_k | integer | No | 20 | Max top RRF candidates to rerank (`0` disables reranking) |
-| expand | boolean | No | false | Enable VL-aware query expansion |
+| expand | boolean | No | false | Enable opt-in query expansion. Text queries use Qwen-backed variants when the backend supports `generate_text()`; otherwise they fall back to heuristic rewrites. Image/video queries expand the generated caption/transcript probe text under the same rules. |
 
 \* Exactly one of `query`, `image_path`, or `video_path` must be provided.
 
@@ -203,6 +203,7 @@ Example MCP client config (Claude Desktop):
 
 **Notes:**
 - Reuses the same retrieval pipeline as `search`, so explanations reflect the actual ranking path.
+- `expand=true` is still opt-in. It adds extra retrieval branches, so expect a latency/quality tradeoff rather than a free win.
 - `provenance.rrf.sources` maps each contributing RRF list to that result’s rank in the list.
 - `provenance.reranker.scoring_path` shows whether the reranker used text or VL scoring.
 - `media_compensation_applied` is `true` for image/video candidates that received RRF compensation because BM25 cannot surface them structurally.

diff --git a/src/recallforge/__init__.py b/src/recallforge/__init__.py
@@ -44,6 +44,13 @@ def _has_torch() -> bool:
     "RECALLFORGE_BACKEND": "Backend selector: auto | torch | mlx.",
     "RECALLFORGE_MODE": "Search mode: embed | hybrid.",
     "RECALLFORGE_MLX_QUANTIZE": "MLX quantization mode: bf16 | 4bit.",
+    "RECALLFORGE_MLX_HEAVY_OP_CONCURRENCY": "Concurrency ceiling for heavy MLX multimodal ops (default 1 for local safety).",
+    "RECALLFORGE_MLX_VIDEO_SAMPLE_FPS": "Sampling rate for MLX raw-video processing (lower is safer).",
+    "RECALLFORGE_MLX_VIDEO_MAX_FRAMES": "Frame cap for MLX raw-video processing (default tuned for local safety).",
+    "RECALLFORGE_MLX_VIDEO_FALLBACK_MAX_FRAMES": "Frame cap for ffmpeg frame-averaging fallback when native video embedding is unavailable.",
+    "RECALLFORGE_MLX_MIN_PIXELS": "Lower bound for MLX processor visual resolution budgeting.",
+    "RECALLFORGE_MLX_MAX_PIXELS": "Upper bound for MLX processor visual resolution budgeting.",
+    "RECALLFORGE_ENABLE_MLX_NATIVE_VIDEO_PROCESSING": "Enable qwen-vl-utils native video decoding for MLX; disabled by default for local safety.",
     "RECALLFORGE_STORAGE": "Storage backend selector (currently lancedb).",
     "RECALLFORGE_STORE_PATH": "Path to RecallForge data store.",
     "RECALLFORGE_TRACE": "Enable verbose MCP server trace logging (1=true).",
@@ -52,6 +59,7 @@ def _has_torch() -> bool:
     "RECALLFORGE_MAX_CANDIDATES": "Hard cap for candidate pool before reranking.",
     "RECALLFORGE_RERANK_TOP_K": "Number of top RRF candidates sent to reranker.",
     "RECALLFORGE_ENABLE_MEDIA_RERANKING": "Enable multimodal reranking for image/video-involved searches (disabled by default).",
+    "RECALLFORGE_ENABLE_RAW_VIDEO_QUERY_EMBEDDING": "Enable raw video query embedding; MLX defaults to safer caption/transcript-first retrieval unless explicitly enabled.",
     "RECALLFORGE_MEDIA_QUERY_RERANK_TOP_K": "Rerank cap for query-side image/video searches.",
     "RECALLFORGE_MEDIA_RESULT_RERANK_TOP_K": "Rerank cap when text queries retrieve image/video candidates.",
     "RECALLFORGE_DISABLE_MLX": "Force-disable MLX backend detection (1=true).",