From 74b7125784677e90ce2549773942cf7f620b26bc Mon Sep 17 00:00:00 2001
From: edknv <edwardk@nvidia.com>
Date: Thu, 19 Mar 2026 09:56:01 -0700
Subject: [PATCH 01/20] (retriever) Add VLM image captioning via vLLM

---
 nemo_retriever/README.md                      |  37 +++
 nemo_retriever/pyproject.toml                 |  11 +-
 .../src/nemo_retriever/caption/__init__.py    |   3 +
 .../src/nemo_retriever/caption/caption.py     | 177 ++++++++++++++
 .../examples/inprocess_pipeline.py            |  33 +++
 .../nemo_retriever/ingest_modes/gpu_pool.py   |  37 ++-
 .../nemo_retriever/ingest_modes/inprocess.py  |  68 +++++-
 nemo_retriever/src/nemo_retriever/ingestor.py |   4 +-
 .../nemo_retriever/model/local/__init__.py    |   5 +
 .../model/local/nemotron_vlm_captioner.py     | 217 ++++++++++++++++++
 .../src/nemo_retriever/params/__init__.py     |   2 +
 .../src/nemo_retriever/params/models.py       |  14 ++
 .../src/nemo_retriever/pdf/extract.py         |  27 ++-
 13 files changed, 626 insertions(+), 9 deletions(-)
 create mode 100644 nemo_retriever/src/nemo_retriever/caption/__init__.py
 create mode 100644 nemo_retriever/src/nemo_retriever/caption/caption.py
 create mode 100644 nemo_retriever/src/nemo_retriever/model/local/nemotron_vlm_captioner.py

diff --git a/nemo_retriever/README.md b/nemo_retriever/README.md
index fbff5b00f..b47928c56 100644
--- a/nemo_retriever/README.md
+++ b/nemo_retriever/README.md
@@ -52,6 +52,43 @@ uv pip install torch==2.9.1 torchvision -i https://download.pytorch.org/whl/cu13
 ```
 This ensures the OCR and GPU‑accelerated components in NeMo Retriever Library run against the right CUDA runtime.
 
+## Image Captioning (optional)
+
+NeMo Retriever Library can caption extracted images using a local VLM
+([Nemotron Nano 12B v2 VL](https://huggingface.co/nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16)).
+This requires [vLLM](https://github.com/vllm-project/vllm) and
+[mamba-ssm](https://github.com/state-spaces/mamba), which must be installed
+separately because they contain CUDA kernels that must match your torch build.
+
+```bash
+# 1. Install vLLM (--no-deps avoids overwriting the torch+cu130 already installed)
+uv pip install --no-deps "vllm>=0.16.0"
+
+# 2. Build mamba-ssm from source against your torch (takes a few minutes)
+uv pip install --no-deps --no-build-isolation "mamba-ssm>=2.3.1"
+```
+
+After installing, add `--caption` and `--caption-device` to your pipeline command:
+
+```bash
+python -m nemo_retriever.examples.inprocess_pipeline \
+  data/multimodal_test.pdf \
+  --caption \
+  --caption-device cuda:1
+```
+
+`--caption-device` places the VLM on a separate GPU so it does not compete with
+the page-elements, OCR, and embedding models. If omitted, a warning is printed
+and the VLM defaults to `cuda:0`.
+
+Supported `--caption-model-name` values:
+
+| Model | Precision | Notes |
+|---|---|---|
+| `nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16` (default) | BFloat16 | Works on SM80+ (A100, A10, RTX 3090, ...) |
+| `nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-FP8` | FP8 | Works on SM80+ |
+| `nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-NVFP4-QAD` | NVFP4 | Requires SM89+ (Ada Lovelace / Hopper) |
+
 ## Run the pipeline
 
 The [test PDF](../data/multimodal_test.pdf) contains text, tables, charts, and images. Additional test data resides [here](../data/).
diff --git a/nemo_retriever/pyproject.toml b/nemo_retriever/pyproject.toml
index 2e7d53df7..c7c0e3409 100644
--- a/nemo_retriever/pyproject.toml
+++ b/nemo_retriever/pyproject.toml
@@ -56,8 +56,8 @@ dependencies = [
   "transformers>=5.0.0",
   "tokenizers>=0.20.3",
   "accelerate>=1.1.0",
-  "torch~=2.9.1",
-  "torchvision>=0.24,<0.25",
+  "torch>=2.5.0",
+  "torchvision",
   "einops",
   "easydict",
   "addict",
@@ -82,6 +82,13 @@ dependencies = [
 svg = [
   "cairosvg>=2.7.0",
 ]
+# Install with: pip install ".[vlm-caption]"
+# mamba-ssm must be built from source against the installed torch:
+#   uv pip install --no-deps --no-build-isolation mamba-ssm
+vlm-caption = [
+  "vllm>=0.16.0",
+  "mamba-ssm>=2.3.1",
+]
 dev = [
   "build>=1.2.2",
   "pytest>=8.0.2",
diff --git a/nemo_retriever/src/nemo_retriever/caption/__init__.py b/nemo_retriever/src/nemo_retriever/caption/__init__.py
new file mode 100644
index 000000000..6aa2e3d5b
--- /dev/null
+++ b/nemo_retriever/src/nemo_retriever/caption/__init__.py
@@ -0,0 +1,3 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
diff --git a/nemo_retriever/src/nemo_retriever/caption/caption.py b/nemo_retriever/src/nemo_retriever/caption/caption.py
new file mode 100644
index 000000000..0de69614d
--- /dev/null
+++ b/nemo_retriever/src/nemo_retriever/caption/caption.py
@@ -0,0 +1,177 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+from __future__ import annotations
+
+from typing import Any, Dict, List, Tuple
+
+import pandas as pd
+
+
+def _caption_batch_remote(
+    base64_images: List[str],
+    *,
+    endpoint_url: str,
+    model_name: str,
+    api_key: str | None,
+    prompt: str,
+    system_prompt: str | None,
+    temperature: float,
+) -> List[str]:
+    """Caption ``base64_images`` through a remote VLM HTTP endpoint."""
+    from nv_ingest_api.internal.primitives.nim.model_interface.vlm import VLMModelInterface
+    from nv_ingest_api.util.nim import create_inference_client
+    from nv_ingest_api.util.image_processing.transforms import scale_image_to_encoding_size
+
+    # Every image goes through the scaling helper; element 0 of its result is what gets sent.
+    payload: Dict[str, Any] = {
+        "base64_images": [scale_image_to_encoding_size(img)[0] for img in base64_images],
+        "prompt": prompt,
+    }
+    if system_prompt:
+        payload["system_prompt"] = system_prompt
+
+    # HTTP protocol only: the first slot of the endpoints pair is left as None.
+    client = create_inference_client(
+        model_interface=VLMModelInterface(),
+        endpoints=(None, endpoint_url),
+        auth_token=api_key,
+        infer_protocol="http",
+    )
+    return client.infer(payload, model_name=model_name, temperature=temperature)
+
+
+def _caption_batch_local(
+    base64_images: List[str],
+    *,
+    model: Any,
+    prompt: str,
+    system_prompt: str | None,
+    temperature: float,
+) -> List[str]:
+    """Delegate captioning of ``base64_images`` to ``model.caption_batch``."""
+    generation_options: Dict[str, Any] = {
+        "prompt": prompt,
+        "system_prompt": system_prompt,
+        "temperature": temperature,
+    }
+    return model.caption_batch(base64_images, **generation_options)
+
+
+def caption_images(
+    batch_df: pd.DataFrame,
+    *,
+    model: Any = None,
+    endpoint_url: str | None = None,
+    model_name: str = "nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16",
+    api_key: str | None = None,
+    prompt: str = "Caption the content of this image:",
+    system_prompt: str | None = "/no_think",
+    temperature: float = 1.0,
+    batch_size: int = 8,
+    **kwargs: Any,
+) -> pd.DataFrame:
+    """Caption images in the ``images`` column using a VLM.
+
+    Supports two modes:
+
+    * **Remote** (``endpoint_url`` is set): sends images to an HTTP VLM
+      endpoint via ``create_inference_client`` / ``VLMModelInterface``.
+    * **Local** (``model`` is set, or neither ``model`` nor ``endpoint_url``
+      is given): runs inference through a ``NemotronVLMCaptioner``; in the
+      latter case one is created inside this call from ``model_name``.
+
+    For each row, any item in the ``images`` list whose ``text`` field is
+    empty will be captioned.  The returned caption is written back into
+    ``images[i]["text"]``, mutating ``batch_df`` in place.
+
+    Parameters
+    ----------
+    batch_df : pd.DataFrame
+        DataFrame with an ``images`` column containing lists of dicts with
+        keys ``image_b64``, ``text``, and ``bbox_xyxy_norm``.
+    model : NemotronVLMCaptioner | None
+        Pre-loaded local VLM model.  When provided, ``endpoint_url`` is
+        ignored and inference runs in-process.
+    endpoint_url : str | None
+        URL of a remote VLM HTTP endpoint.
+    model_name : str
+        Model identifier passed to the remote VLM endpoint, and the model
+        loaded when the local captioner is created inside this call.
+    api_key : str | None
+        Bearer token for the remote VLM endpoint.
+    prompt : str
+        Text prompt sent alongside each image.
+    system_prompt : str | None
+        Optional system prompt for the VLM.
+    temperature : float
+        Sampling temperature.
+    batch_size : int
+        Number of images passed to the VLM per call, in both modes.
+    **kwargs : Any
+        ``device``, ``hf_cache_dir``, ``tensor_parallel_size`` and
+        ``gpu_memory_utilization`` are forwarded to a captioner created here.
+    """
+    if not isinstance(batch_df, pd.DataFrame) or batch_df.empty or "images" not in batch_df.columns:
+        return batch_df
+
+    if model is None and not endpoint_url:
+        # Lazy creation for the no-GPU-pool fallback; ``model_name`` is a named arg, never in ``kwargs``.
+        from nemo_retriever.model.local import NemotronVLMCaptioner
+
+        model = NemotronVLMCaptioner(
+            model_path=model_name,
+            device=kwargs.get("device"),
+            hf_cache_dir=kwargs.get("hf_cache_dir"),
+            tensor_parallel_size=kwargs.get("tensor_parallel_size", 1),
+            gpu_memory_utilization=kwargs.get("gpu_memory_utilization", 0.9),
+        )
+
+    # Collect all (row_idx, item_idx, image_b64) needing captions.
+    pending: List[Tuple[int, int, str]] = []
+    for row_idx, row in batch_df.iterrows():
+        images = row.get("images")
+        if not isinstance(images, list):
+            continue
+        for item_idx, item in enumerate(images):
+            if not isinstance(item, dict):
+                continue
+            if item.get("text"):
+                continue  # already captioned
+            b64 = item.get("image_b64")
+            if b64:
+                pending.append((row_idx, item_idx, b64))
+
+    if not pending:
+        return batch_df
+
+    # Generate captions.
+    all_captions: List[str] = []
+    for start in range(0, len(pending), batch_size):
+        chunk_b64 = [b64 for _, _, b64 in pending[start : start + batch_size]]
+        if model is not None:
+            captions = _caption_batch_local(
+                chunk_b64,
+                model=model,
+                prompt=prompt,
+                system_prompt=system_prompt,
+                temperature=temperature,
+            )
+        else:
+            captions = _caption_batch_remote(
+                chunk_b64,
+                endpoint_url=endpoint_url,  # type: ignore[arg-type]
+                model_name=model_name,
+                api_key=api_key,
+                prompt=prompt,
+                system_prompt=system_prompt,
+                temperature=temperature,
+            )
+        all_captions.extend(captions)
+
+    # Write captions back into the DataFrame.
+    for (row_idx, item_idx, _), caption in zip(pending, all_captions):
+        batch_df.at[row_idx, "images"][item_idx]["text"] = caption
+
+    return batch_df
diff --git a/nemo_retriever/src/nemo_retriever/examples/inprocess_pipeline.py b/nemo_retriever/src/nemo_retriever/examples/inprocess_pipeline.py
index b4bdb34ef..7fefa7fe0 100644
--- a/nemo_retriever/src/nemo_retriever/examples/inprocess_pipeline.py
+++ b/nemo_retriever/src/nemo_retriever/examples/inprocess_pipeline.py
@@ -15,6 +15,7 @@
 import typer
 from nemo_retriever import create_ingestor
 from nemo_retriever.examples.common import estimate_processed_pages, print_pages_per_second
+from nemo_retriever.params import CaptionParams
 from nemo_retriever.params import EmbedParams
 from nemo_retriever.params import ExtractParams
 from nemo_retriever.params import IngestExecuteParams
@@ -150,6 +151,28 @@ def main(
         "--graphic-elements-invoke-url",
         help="Optional remote endpoint URL for graphic-elements model inference.",
     ),
+    caption: bool = typer.Option(
+        False,
+        "--caption/--no-caption",
+        help="Enable image captioning. Uses a local model by default, "
+        "or a remote endpoint if --caption-invoke-url is set.",
+    ),
+    caption_invoke_url: Optional[str] = typer.Option(
+        None,
+        "--caption-invoke-url",
+        help="Optional VLM endpoint URL for image captioning (e.g. http://vlm:8000/v1/chat/completions). "
+        "Implies --caption. When omitted, a local HF model is loaded instead.",
+    ),
+    caption_model_name: str = typer.Option(
+        "nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16",
+        "--caption-model-name",
+        help="VLM model name / HF model ID for image captioning.",
+    ),
+    caption_device: Optional[str] = typer.Option(
+        None,
+        "--caption-device",
+        help="GPU device for the local VLM captioner (e.g. 'cuda:1'). Defaults to the first --gpu-devices entry.",
+    ),
     hybrid: bool = typer.Option(
         False,
         "--hybrid/--no-hybrid",
@@ -274,6 +297,16 @@ def main(
             )
         )
 
+    enable_caption = caption or caption_invoke_url is not None
+    if enable_caption:
+        ingestor = ingestor.caption(
+            CaptionParams(
+                endpoint_url=caption_invoke_url,
+                model_name=caption_model_name,
+                device=caption_device,
+            )
+        )
+
     ingestor = ingestor.embed(
         EmbedParams(
             model_name=str(embed_model_name),
diff --git a/nemo_retriever/src/nemo_retriever/ingest_modes/gpu_pool.py b/nemo_retriever/src/nemo_retriever/ingest_modes/gpu_pool.py
index cb1aa019a..11f3c36d6 100644
--- a/nemo_retriever/src/nemo_retriever/ingest_modes/gpu_pool.py
+++ b/nemo_retriever/src/nemo_retriever/ingest_modes/gpu_pool.py
@@ -66,6 +66,28 @@ def create(self) -> Any:
         return NemotronParseV12(task_prompt=self.task_prompt)
 
 
+@dataclass
+class CaptionModelConfig:
+    """Constructor arguments needed to rebuild a ``NemotronVLMCaptioner`` via ``create()``."""
+
+    model_path: str = "nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16"  # default caption model ID
+    device: Optional[str] = None  # passed through unchanged to the captioner
+    hf_cache_dir: Optional[str] = None  # passed through unchanged to the captioner
+    tensor_parallel_size: int = 1  # passed through unchanged to the captioner
+    gpu_memory_utilization: float = 0.9  # passed through unchanged to the captioner
+
+    def create(self) -> Any:
+        from nemo_retriever.model.local import NemotronVLMCaptioner
+        # Build a new captioner; every field above maps to the same-named keyword argument.
+        return NemotronVLMCaptioner(
+            model_path=self.model_path,
+            device=self.device,
+            hf_cache_dir=self.hf_cache_dir,
+            tensor_parallel_size=self.tensor_parallel_size,
+            gpu_memory_utilization=self.gpu_memory_utilization,
+        )
+
+
 @dataclass
 class EmbeddingModelConfig:
     """Config to recreate an embedding model (VL or non-VL)."""
@@ -167,6 +189,19 @@ def _extract_model_config(func: Callable, kwargs: dict[str, Any]) -> Any:
     if func is collapse_content_to_page_rows:
         return None  # CPU-only, no model
 
+    from nemo_retriever.caption.caption import caption_images
+
+    if func is caption_images:
+        if kwargs.get("endpoint_url"):
+            return None  # Remote endpoint, no local model
+        return CaptionModelConfig(
+            model_path=kwargs.get("model_name", CaptionModelConfig.model_path),
+            device=kwargs.get("device"),
+            hf_cache_dir=kwargs.get("hf_cache_dir"),
+            tensor_parallel_size=kwargs.get("tensor_parallel_size", 1),
+            gpu_memory_utilization=kwargs.get("gpu_memory_utilization", 0.9),
+        )
+
     return None
 
 
@@ -292,7 +327,7 @@ def start(self) -> None:
             p = self._ctx.Process(
                 target=_gpu_worker_entry,
                 args=(idx, device_id, self._task_descriptors, iq, self._output_queue, evt),
-                daemon=True,
+                daemon=False,
             )
             p.start()
             self._workers.append(p)
diff --git a/nemo_retriever/src/nemo_retriever/ingest_modes/inprocess.py b/nemo_retriever/src/nemo_retriever/ingest_modes/inprocess.py
index 1f1d229a2..144d2b599 100644
--- a/nemo_retriever/src/nemo_retriever/ingest_modes/inprocess.py
+++ b/nemo_retriever/src/nemo_retriever/ingest_modes/inprocess.py
@@ -25,6 +25,8 @@
 from collections.abc import Callable, Iterator
 from typing import Any, Dict, List, Literal, Optional, Sequence, Tuple, Union
 
+from nemo_retriever.params import CaptionParams
+
 
 import pandas as pd
 from nemo_retriever.model.local import NemotronOCRV1, NemotronPageElementsV3, NemotronParseV12
@@ -958,6 +960,7 @@ def __init__(self, documents: Optional[List[str]] = None) -> None:
         self._pipeline_type: Literal["pdf", "txt", "html", "image"] = "pdf"
         self._extract_txt_kwargs: Dict[str, Any] = {}
         self._extract_html_kwargs: Dict[str, Any] = {}
+        self._caption_enabled: bool = False
 
     def files(self, documents: Union[str, List[str]]) -> "InProcessIngestor":
         """
@@ -1332,6 +1335,45 @@ def extract_audio(
         self._tasks.append((apply_asr_to_df, {"asr_params": self._extract_audio_asr_kwargs}))
         return self
 
+    def caption(self, params: "CaptionParams | None" = None, **kwargs: Any) -> "InProcessIngestor":
+        """
+        Add an image-captioning stage backed by a remote endpoint or a local VLM.
+
+        The stage runs ``caption_images`` over the ``images`` column (filled by
+        ``extract(extract_images=True)``) and stores each caption in
+        ``images[i]["text"]``.
+
+        With ``endpoint_url`` set the remote endpoint is called; without it a
+        local ``NemotronVLMCaptioner`` is created later, when the stage runs.
+        """
+        from nemo_retriever.caption.caption import caption_images
+        from nemo_retriever.params import CaptionParams
+
+        resolved = _coerce_params(params, CaptionParams, kwargs)
+        task_kwargs = resolved.model_dump(mode="python")
+        use_remote = bool(resolved.endpoint_url)
+
+        if use_remote and not resolved.api_key:
+            # Remote mode without an explicit key: use the resolved remote API key.
+            task_kwargs["api_key"] = resolve_remote_api_key()
+        if not use_remote:
+            # Local mode: defer model creation so the VLM is loaded lazily
+            # on the device specified by CaptionParams.device.
+            if not resolved.device:
+                import warnings
+
+                warnings.warn(
+                    "No caption device specified. The VLM will load on cuda:0, which "
+                    "may conflict with other models. Use --caption-device (e.g. "
+                    "'cuda:1') to place the captioner on a separate GPU.",
+                    stacklevel=2,
+                )
+            task_kwargs["model"] = None
+
+        self._caption_enabled = True
+        self._tasks.append((caption_images, task_kwargs))
+        return self
+
     def embed(self, params: EmbedParams | None = None, **kwargs: Any) -> "InProcessIngestor":
         """
         Configure embedding for in-process execution.
@@ -1349,12 +1391,14 @@ def embed(self, params: EmbedParams | None = None, **kwargs: Any) -> "InProcessI
         embed_modality = resolved.embed_modality
         embed_granularity = resolved.embed_granularity
 
+        content_columns = (_CONTENT_COLUMNS + ("images",)) if self._caption_enabled else _CONTENT_COLUMNS
+
         if embed_granularity == "page":
             # Page-level: one row per page with concatenated text and full page image.
             self._tasks.append(
                 (
                     collapse_content_to_page_rows,
-                    {"modality": embed_modality},
+                    {"modality": embed_modality, "content_columns": content_columns},
                 )
             )
         else:
@@ -1368,6 +1412,7 @@ def embed(self, params: EmbedParams | None = None, **kwargs: Any) -> "InProcessI
                         "modality": embed_modality,
                         "text_elements_modality": text_elements_modality,
                         "structured_elements_modality": structured_elements_modality,
+                        "content_columns": content_columns,
                     },
                 )
             )
@@ -1487,12 +1532,21 @@ def ingest(self, params: IngestExecuteParams | None = None, **kwargs: Any) -> li
 
         _start = time.perf_counter()
 
-        # -- Three-way task classification --------------------------------
+        # -- Task classification -------------------------------------------
+        from nemo_retriever.caption.caption import caption_images as _caption_images_fn
+
         _post_task_fns = (upload_embeddings_to_lancedb_inprocess, save_dataframe_to_disk_json)
         _cpu_task_fns = (pdf_extraction,)
+        # Caption runs on its own device (--caption-device), not in the GPU pool.
+        _own_device_fns = (_caption_images_fn,)
 
         cpu_tasks = [(f, k) for f, k in self._tasks if f in _cpu_task_fns]
-        gpu_tasks = [(f, k) for f, k in self._tasks if f not in _cpu_task_fns and f not in _post_task_fns]
+        gpu_tasks = [
+            (f, k)
+            for f, k in self._tasks
+            if f not in _cpu_task_fns and f not in _post_task_fns and f not in _own_device_fns
+        ]
+        own_device_tasks = [(f, k) for f, k in self._tasks if f in _own_device_fns]
         post_tasks = [(f, k) for f, k in self._tasks if f in _post_task_fns]
 
         docs = list(self._documents)
@@ -1545,6 +1599,8 @@ def _check_file_done(doc_path: str) -> None:
                             try:
                                 result = future.result()
                                 if isinstance(result, pd.DataFrame) and not result.empty:
+                                    for func, kw in own_device_tasks:
+                                        result = func(result, **kw)
                                     shard_to_doc[shard_id] = doc
                                     gpu_pool.submit(shard_id, result)
                                     shard_id += 1
@@ -1639,6 +1695,8 @@ def _on_gpu_done(sid: int) -> None:
                     return results
 
                 combined = pd.concat(cpu_results, ignore_index=True)
+                for func, kwargs in own_device_tasks:
+                    combined = func(combined, **kwargs)
                 for func, kwargs in gpu_tasks:
                     combined = func(combined, **kwargs)
 
@@ -1678,6 +1736,8 @@ def _on_gpu_done(sid: int) -> None:
                             else:
                                 current = func(current, **kwargs)
                         if isinstance(current, pd.DataFrame) and not current.empty:
+                            for func, kw in own_device_tasks:
+                                current = func(current, **kw)
                             shard_to_doc[shard_id] = doc_path
                             gpu_pool.submit(shard_id, current)
                             shard_id += 1
@@ -1777,7 +1837,7 @@ def _loader(p: str) -> pd.DataFrame:
             results.append(current)
 
         # Run upload/save once on combined results so overwrite=True keeps full corpus.
-        if post_tasks and results and all(isinstance(r, pd.DataFrame) for r in results):
+        if results and all(isinstance(r, pd.DataFrame) for r in results):
             combined = pd.concat(results, ignore_index=True)
             for func, kwargs in post_tasks:
                 combined = func(combined, **kwargs)
diff --git a/nemo_retriever/src/nemo_retriever/ingestor.py b/nemo_retriever/src/nemo_retriever/ingestor.py
index 7bbc19486..74b6612e6 100644
--- a/nemo_retriever/src/nemo_retriever/ingestor.py
+++ b/nemo_retriever/src/nemo_retriever/ingestor.py
@@ -20,6 +20,7 @@
 from typing import Any, Dict, List, Optional, Tuple, Union
 
 from nemo_retriever.application.modes.factory import create_runmode_ingestor
+from nemo_retriever.params import CaptionParams
 from nemo_retriever.params import EmbedParams
 from nemo_retriever.params import ExtractParams
 from nemo_retriever.params import TextChunkParams
@@ -176,8 +177,9 @@ def save_to_disk(
         """Record result persistence configuration (execution TBD)."""
         self._not_implemented("save_to_disk")
 
-    def caption(self) -> "ingestor":
+    def caption(self, params: "CaptionParams | None" = None, **kwargs: Any) -> "ingestor":
         """Record a caption task configuration."""
+        _ = _merge_params(params, kwargs)
         self._not_implemented("caption")
 
     def pdf_split_config(self, pages_per_chunk: int = 32) -> "ingestor":
diff --git a/nemo_retriever/src/nemo_retriever/model/local/__init__.py b/nemo_retriever/src/nemo_retriever/model/local/__init__.py
index 791df4daa..af068fa7d 100644
--- a/nemo_retriever/src/nemo_retriever/model/local/__init__.py
+++ b/nemo_retriever/src/nemo_retriever/model/local/__init__.py
@@ -18,6 +18,7 @@
     "NemotronGraphicElementsV1",
     "NemotronParseV12",
     "NemotronRerankV2",
+    "NemotronVLMCaptioner",
     "ParakeetCTC1B1ASR",
 ]
 
@@ -47,6 +48,10 @@ def __getattr__(name: str):
         from .nemotron_rerank_v2 import NemotronRerankV2
 
         return NemotronRerankV2
+    if name == "NemotronVLMCaptioner":
+        from .nemotron_vlm_captioner import NemotronVLMCaptioner
+
+        return NemotronVLMCaptioner
     if name == "ParakeetCTC1B1ASR":
         from .parakeet_ctc_1_1b_asr import ParakeetCTC1B1ASR
 
diff --git a/nemo_retriever/src/nemo_retriever/model/local/nemotron_vlm_captioner.py b/nemo_retriever/src/nemo_retriever/model/local/nemotron_vlm_captioner.py
new file mode 100644
index 000000000..c881f5c5f
--- /dev/null
+++ b/nemo_retriever/src/nemo_retriever/model/local/nemotron_vlm_captioner.py
@@ -0,0 +1,217 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+from __future__ import annotations
+
+import base64
+from io import BytesIO
+from typing import Any, List, Optional
+
+from PIL import Image
+
+from nemo_retriever.utils.hf_cache import configure_global_hf_cache_base
+from ..model import BaseModel, RunMode
+
+
+def _b64_to_pil(b64: str) -> Image.Image:
+    """Base64-decode ``b64``, open the bytes with PIL and return the image converted to RGB."""
+    return Image.open(BytesIO(base64.b64decode(b64))).convert("RGB")
+
+
+class NemotronVLMCaptioner(BaseModel):
+    """
+    Local VLM captioner wrapping Nemotron Nano 12B v2 VL variants.
+
+    Supported models:
+
+    * ``nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16`` (default, BFloat16)
+    * ``nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-FP8``  (FP8 quantised)
+    * ``nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-NVFP4-QAD`` (NVFP4 quantised,
+      requires GPU compute capability >= 8.9, e.g. Ada Lovelace / Hopper)
+
+    Uses vLLM for inference with batched scheduling.
+
+    Usage::
+
+        captioner = NemotronVLMCaptioner()
+        captions = captioner.caption_batch(
+            ["<base64-png>", "<base64-png>"],
+            prompt="Caption the content of this image:",
+        )
+    """
+
+    SUPPORTED_MODELS: dict[str, str] = {
+        "BF16": "nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16",
+        "FP8": "nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-FP8",
+        "NVFP4-QAD": "nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-NVFP4-QAD",
+    }
+
+    # Pinned HF revision (commit SHA) per model to ensure reproducibility.
+    _MODEL_REVISIONS: dict[str, str] = {
+        "nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16": "5d250e2e111dc5e1434131bdf3d590c27a878ade",
+        "nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-FP8": "7394488badb786e1decc0e00e308de1cab9560e6",
+        "nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-NVFP4-QAD": "b8d3c170d9ee3a078917ef9bfd508eff988d6de7",
+    }
+
+    # Map model-name suffixes to vLLM engine kwargs.
+    # The FP8 HF config ships with quant_method="modelopt" which triggers
+    # vLLM's ModelOptFp8Config (SM89+).  Override to quant_method="fp8" in
+    # the HF config so vLLM uses its plain FP8 handler (SM80+).
+    _QUANTIZATION_PROFILES: dict[str, dict[str, Any]] = {
+        "BF16": {"dtype": "bfloat16"},
+        "FP8": {
+            "dtype": "auto",
+            "quantization": "fp8",
+            "hf_overrides": {"quantization_config": {"quant_method": "fp8", "activation_scheme": "static"}},
+        },
+        "NVFP4-QAD": {"dtype": "auto", "quantization": "modelopt"},
+    }
+
+    def __init__(
+        self,
+        model_path: str = "nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16",
+        device: Optional[str] = None,  # e.g. "cuda:1" or "1"; None leaves CUDA_VISIBLE_DEVICES untouched
+        hf_cache_dir: Optional[str] = None,
+        max_new_tokens: int = 1024,  # used as ``max_tokens`` of every SamplingParams
+        tensor_parallel_size: int = 1,
+        gpu_memory_utilization: float = 0.9,
+    ) -> None:
+        super().__init__()
+
+        valid_models = list(self.SUPPORTED_MODELS.values())
+        if model_path not in valid_models:
+            raise ValueError(
+                f"Unknown caption model: {model_path!r}\n"
+                f"Supported models:\n" + "\n".join(f"  - {m}" for m in valid_models)
+            )
+
+        try:
+            from vllm import LLM, SamplingParams  # noqa: F401
+        except ImportError as e:
+            raise ImportError(
+                "Local VLM captioning requires vLLM. " 'Install with: pip install "nemo-retriever[vlm-caption]"'
+            ) from e
+
+        self._model_path = model_path
+        self._max_new_tokens = max_new_tokens
+
+        if device is not None:
+            # vLLM uses CUDA_VISIBLE_DEVICES rather than a torch device string.
+            # Translate e.g. "cuda:1" → "1"; a bare "cuda" or an empty index names
+            # no specific GPU, so the variable is left untouched in that case.
+            import os
+
+            dev_id = device.rpartition(":")[2].strip()
+            if dev_id and dev_id.lower() != "cuda":
+                os.environ["CUDA_VISIBLE_DEVICES"] = dev_id
+
+        configure_global_hf_cache_base(hf_cache_dir)
+
+        revision = self._MODEL_REVISIONS.get(model_path)
+
+        # Pick vLLM engine kwargs based on the model variant.
+        engine_kwargs: dict[str, Any] = {"dtype": "bfloat16"}  # fallback
+        model_upper = model_path.upper()
+        for suffix, profile in self._QUANTIZATION_PROFILES.items():
+            if model_upper.endswith(suffix):
+                engine_kwargs = profile
+                break
+
+        self._llm = LLM(
+            model=model_path,
+            revision=revision,
+            trust_remote_code=True,
+            tensor_parallel_size=tensor_parallel_size,
+            gpu_memory_utilization=gpu_memory_utilization,
+            **engine_kwargs,
+        )
+
+    def _build_messages(
+        self,
+        base64_image: str,
+        *,
+        prompt: str,
+        system_prompt: Optional[str],
+    ) -> list[dict[str, Any]]:
+        """Build chat messages in OpenAI format for vLLM."""
+        messages: list[dict[str, Any]] = []
+        if system_prompt:
+            messages.append({"role": "system", "content": system_prompt})
+        messages.append(
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{base64_image}"}},
+                    {"type": "text", "text": prompt},
+                ],
+            }
+        )
+        return messages
+
+    def caption(
+        self,
+        base64_image: str,
+        *,
+        prompt: str = "Caption the content of this image:",
+        system_prompt: Optional[str] = "/no_think",
+        temperature: float = 1.0,
+    ) -> str:
+        """Generate a caption for a single base64-encoded image via ``caption_batch``."""
+        captions = self.caption_batch(
+            [base64_image], prompt=prompt, system_prompt=system_prompt, temperature=temperature
+        )
+        return captions[0]
+
+    def caption_batch(
+        self,
+        base64_images: List[str],
+        *,
+        prompt: str = "Caption the content of this image:",
+        system_prompt: Optional[str] = "/no_think",
+        temperature: float = 1.0,
+    ) -> List[str]:
+        """Generate captions for a list of base64-encoded images.
+
+        vLLM batches internally and handles scheduling across images.
+        """
+        from vllm import SamplingParams
+
+        conversations = [self._build_messages(b64, prompt=prompt, system_prompt=system_prompt) for b64 in base64_images]
+        sampling_params = SamplingParams(temperature=temperature, max_tokens=self._max_new_tokens)
+        outputs = self._llm.chat(conversations, sampling_params=sampling_params)
+        return [out.outputs[0].text.strip() for out in outputs]
+
+    # ---- BaseModel abstract interface ----
+
+    @property
+    def model_name(self) -> str:
+        return "NVIDIA-Nemotron-Nano-12B-v2-VL"
+
+    @property
+    def model_type(self) -> str:
+        return "vlm-captioner"
+
+    @property
+    def model_runmode(self) -> RunMode:
+        return "local"
+
+    @property
+    def input(self) -> Any:
+        return {
+            "type": "image",
+            "format": "base64",
+            "description": "Base64-encoded image for captioning.",
+        }
+
+    @property
+    def output(self) -> Any:
+        return {
+            "type": "text",
+            "format": "string",
+            "description": "Generated caption for the input image.",
+        }
+
+    @property
+    def input_batch_size(self) -> int:
+        return 1
diff --git a/nemo_retriever/src/nemo_retriever/params/__init__.py b/nemo_retriever/src/nemo_retriever/params/__init__.py
index 5f4eef723..bfc65b50c 100644
--- a/nemo_retriever/src/nemo_retriever/params/__init__.py
+++ b/nemo_retriever/src/nemo_retriever/params/__init__.py
@@ -5,6 +5,7 @@
 from .models import ASRParams
 from .models import AudioChunkParams
 from .models import BatchTuningParams
+from .models import CaptionParams
 from .models import ChartParams
 from .models import EmbedParams
 from .models import ExtractParams
@@ -30,6 +31,7 @@
     "ASRParams",
     "AudioChunkParams",
     "BatchTuningParams",
+    "CaptionParams",
     "ChartParams",
     "EmbedParams",
     "ExtractParams",
diff --git a/nemo_retriever/src/nemo_retriever/params/models.py b/nemo_retriever/src/nemo_retriever/params/models.py
index 1f81e38e0..1dd735a6c 100644
--- a/nemo_retriever/src/nemo_retriever/params/models.py
+++ b/nemo_retriever/src/nemo_retriever/params/models.py
@@ -296,6 +296,20 @@ class ChartParams(_ParamsModel):
     inference_batch_size: int = 8
 
 
+class CaptionParams(_ParamsModel):
+    endpoint_url: Optional[str] = None
+    model_name: str = "nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16"
+    api_key: Optional[str] = None
+    prompt: str = "Caption the content of this image:"
+    system_prompt: Optional[str] = "/no_think"
+    temperature: float = 1.0
+    batch_size: int = 8
+    device: Optional[str] = None
+    hf_cache_dir: Optional[str] = None
+    tensor_parallel_size: int = 1
+    gpu_memory_utilization: float = 0.9
+
+
 class InfographicParams(_ParamsModel):
     remote: RemoteInvokeParams = Field(default_factory=RemoteInvokeParams)
     remote_retry: RemoteRetryParams = Field(default_factory=RemoteRetryParams)
diff --git a/nemo_retriever/src/nemo_retriever/pdf/extract.py b/nemo_retriever/src/nemo_retriever/pdf/extract.py
index 992c18ebe..a25502f97 100644
--- a/nemo_retriever/src/nemo_retriever/pdf/extract.py
+++ b/nemo_retriever/src/nemo_retriever/pdf/extract.py
@@ -15,6 +15,7 @@
 
 from nv_ingest_api.util.pdf.pdfium import (
     convert_bitmap_to_corrected_numpy,
+    extract_image_like_objects_from_pdfium_page,
     is_scanned_page as _is_scanned_page,
 )
 
@@ -296,13 +297,37 @@ def pdf_extraction(
                             render_mode=render_mode,
                         )
 
+                    # Extract cropped images from pdfium page objects.
+                    detected_images: List[Dict[str, Any]] = []
+                    if extract_images:
+                        try:
+                            base64_images = extract_image_like_objects_from_pdfium_page(page)
+                            for img in base64_images:
+                                max_w = float(img.max_width) if img.max_width else 1.0
+                                max_h = float(img.max_height) if img.max_height else 1.0
+                                x0, y0, x1, y1 = img.bbox
+                                detected_images.append(
+                                    {
+                                        "bbox_xyxy_norm": [
+                                            x0 / max_w,
+                                            y0 / max_h,
+                                            x1 / max_w,
+                                            y1 / max_h,
+                                        ],
+                                        "text": "",
+                                        "image_b64": img.image,
+                                    }
+                                )
+                        except Exception:
+                            pass  # Image extraction failure should not crash the pipeline.
+
                     page_record: Dict[str, Any] = {
                         "path": pdf_path,
                         "page_number": page_number,
                         "source_id": source_id,
                         "text": text if extract_text else "",
                         "page_image": None,
-                        "images": [],
+                        "images": detected_images,
                         "tables": [],
                         "charts": [],
                         "infographics": [],

From 4c99cca1b1f8eac68bf3d731ee8da54e2396acc9 Mon Sep 17 00:00:00 2001
From: edknv <edwardk@nvidia.com>
Date: Thu, 19 Mar 2026 10:03:39 -0700
Subject: [PATCH 02/20] revert fix pyproject.toml

---
 nemo_retriever/pyproject.toml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/nemo_retriever/pyproject.toml b/nemo_retriever/pyproject.toml
index f293a79cb..fea03d44f 100644
--- a/nemo_retriever/pyproject.toml
+++ b/nemo_retriever/pyproject.toml
@@ -56,8 +56,8 @@ dependencies = [
   "transformers>=5.0.0",
   "tokenizers>=0.20.3",
   "accelerate>=1.1.0",
-  "torch>=2.5.0",
-  "torchvision",
+  "torch~=2.9.1",
+  "torchvision>=0.24,<0.25",
   "einops",
   "easydict",
   "addict",

From c601e7f5cc92cd9fe944326e450229be898a782a Mon Sep 17 00:00:00 2001
From: edknv <edwardk@nvidia.com>
Date: Thu, 19 Mar 2026 17:39:43 -0700
Subject: [PATCH 03/20] add batch mode

---
 nemo_retriever/README.md                      | 13 ++++++-
 .../src/nemo_retriever/caption/caption.py     | 30 ++++++++++++++
 .../nemo_retriever/examples/batch_pipeline.py | 31 +++++++++++++++
 .../src/nemo_retriever/ingest_modes/batch.py  | 39 +++++++++++++++++++
 .../model/local/nemotron_vlm_captioner.py     |  2 +-
 nemo_retriever/src/nemo_retriever/version.py  | 39 ++++++++++++-------
 6 files changed, 136 insertions(+), 18 deletions(-)

diff --git a/nemo_retriever/README.md b/nemo_retriever/README.md
index e26fb443c..62b7af995 100644
--- a/nemo_retriever/README.md
+++ b/nemo_retriever/README.md
@@ -52,6 +52,12 @@ uv pip install torch==2.9.1 torchvision -i https://download.pytorch.org/whl/cu13
 ```
 This ensures the OCR and GPU‑accelerated components in NeMo Retriever Library run against the right CUDA runtime.
 
+Alternatively, if you have uv 0.7+ you can set `UV_TORCH_BACKEND` to select the correct PyTorch CUDA index automatically:
+
+```bash
+UV_TORCH_BACKEND=cu130 uv pip install torch torchvision
+```
+
 ## Image Captioning (optional)
 
 NeMo Retriever Library can caption extracted images using a local VLM
@@ -61,13 +67,16 @@ This requires [vLLM](https://github.com/vllm-project/vllm) and
 separately because they contain CUDA kernels that must match your torch build.
 
 ```bash
-# 1. Install vLLM (--no-deps avoids overwriting the torch+cu130 already installed)
+# Install vLLM (--no-deps prevents overwriting torch, transformers, etc.)
 uv pip install --no-deps vllm>=0.16.0
 
-# 2. Build mamba-ssm from source against your torch (takes a few minutes)
+# Build mamba-ssm from source against your torch (takes a few minutes)
 uv pip install --no-deps --no-build-isolation mamba-ssm>=2.3.1
 ```
 
+> **Note:** `--no-deps` is required because vLLM's dependency solver would
+> downgrade `transformers` and `huggingface-hub` to incompatible versions.
+
 After installing, add `--caption` and `--caption-device` to your pipeline command:
 
 ```bash
diff --git a/nemo_retriever/src/nemo_retriever/caption/caption.py b/nemo_retriever/src/nemo_retriever/caption/caption.py
index 0de69614d..738d2ee48 100644
--- a/nemo_retriever/src/nemo_retriever/caption/caption.py
+++ b/nemo_retriever/src/nemo_retriever/caption/caption.py
@@ -8,6 +8,36 @@
 
 import pandas as pd
 
+from nemo_retriever.params import CaptionParams
+
+
+class CaptionActor:
+    """Ray Data actor that holds a local VLM captioner on a single GPU.
+
+    When ``endpoint_url`` is provided, the actor delegates to a remote VLM
+    endpoint and no local model is loaded.
+    """
+
+    def __init__(self, params: CaptionParams) -> None:
+        self._params = params
+        self._kwargs = params.model_dump(mode="python")
+        endpoint = (self._kwargs.get("endpoint_url") or "").strip()
+        if endpoint:
+            self._model = None
+        else:
+            from nemo_retriever.model.local import NemotronVLMCaptioner
+
+            self._model = NemotronVLMCaptioner(
+                model_path=self._kwargs.get("model_name", "nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16"),
+                device=self._kwargs.get("device"),
+                hf_cache_dir=self._kwargs.get("hf_cache_dir"),
+                tensor_parallel_size=self._kwargs.get("tensor_parallel_size", 1),
+                gpu_memory_utilization=self._kwargs.get("gpu_memory_utilization", 0.9),
+            )
+
+    def __call__(self, batch_df: Any) -> Any:
+        return caption_images(batch_df, model=self._model, **self._kwargs)
+
 
 def _caption_batch_remote(
     base64_images: List[str],
diff --git a/nemo_retriever/src/nemo_retriever/examples/batch_pipeline.py b/nemo_retriever/src/nemo_retriever/examples/batch_pipeline.py
index 6098f3731..784a8c8aa 100644
--- a/nemo_retriever/src/nemo_retriever/examples/batch_pipeline.py
+++ b/nemo_retriever/src/nemo_retriever/examples/batch_pipeline.py
@@ -23,6 +23,7 @@
 from nemo_retriever.ingest_modes.batch import BatchIngestor
 from nemo_retriever.ingest_modes.lancedb_utils import lancedb_schema
 from nemo_retriever.model import resolve_embed_model
+from nemo_retriever.params import CaptionParams
 from nemo_retriever.params import EmbedParams
 from nemo_retriever.params import ExtractParams
 from nemo_retriever.params import IngestExecuteParams
@@ -521,6 +522,26 @@ def main(
         "--extract-page-as-image/--no-extract-page-as-image",
         help="Render and retain full page images for downstream multimodal stages.",
     ),
+    caption: bool = typer.Option(
+        False,
+        "--caption/--no-caption",
+        help="Enable image captioning via a local VLM or remote endpoint.",
+    ),
+    caption_invoke_url: Optional[str] = typer.Option(
+        None,
+        "--caption-invoke-url",
+        help="Optional VLM endpoint URL for image captioning. Implies --caption.",
+    ),
+    caption_model_name: str = typer.Option(
+        "nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16",
+        "--caption-model-name",
+        help="VLM model name / HF model ID for image captioning.",
+    ),
+    caption_device: Optional[str] = typer.Option(
+        None,
+        "--caption-device",
+        help="GPU device for the local VLM captioner (e.g. 'cuda:1').",
+    ),
     text_chunk: bool = typer.Option(
         False,
         "--text-chunk",
@@ -747,6 +768,16 @@ def _extract_params(batch_tuning: dict, **overrides: Any) -> ExtractParams:
         if enable_text_chunk:
             ingestor = ingestor.split(_text_chunk_params)
 
+        enable_caption = caption or caption_invoke_url is not None
+        if enable_caption:
+            ingestor = ingestor.caption(
+                CaptionParams(
+                    endpoint_url=caption_invoke_url,
+                    model_name=caption_model_name,
+                    device=caption_device,
+                )
+            )
+
         ingestor = ingestor.embed(embed_params)
 
         logger.info("Running extraction...")
diff --git a/nemo_retriever/src/nemo_retriever/ingest_modes/batch.py b/nemo_retriever/src/nemo_retriever/ingest_modes/batch.py
index e00037285..0d1ac3488 100644
--- a/nemo_retriever/src/nemo_retriever/ingest_modes/batch.py
+++ b/nemo_retriever/src/nemo_retriever/ingest_modes/batch.py
@@ -48,6 +48,7 @@
 from ..params import IngestExecuteParams
 from ..params import PdfSplitParams
 from ..params import TextChunkParams
+from ..params import CaptionParams
 from ..params import VdbUploadParams
 
 logger = logging.getLogger(__name__)
@@ -868,10 +869,17 @@ def embed(
             target_num_rows_per_block=self._requested_plan.get_embed_batch_size()
         )
 
+        from nemo_retriever.ingest_modes.inprocess import _CONTENT_COLUMNS
+
+        content_columns = (
+            (_CONTENT_COLUMNS + ("images",)) if getattr(self, "_caption_enabled", False) else _CONTENT_COLUMNS
+        )
+
         if embed_granularity == "page":
             _row_fn = partial(
                 collapse_content_to_page_rows,
                 modality=embed_modality,
+                content_columns=content_columns,
             )
         else:
             text_elements_modality = resolved.text_elements_modality or embed_modality
@@ -881,6 +889,7 @@ def embed(
                 modality=embed_modality,
                 text_elements_modality=text_elements_modality,
                 structured_elements_modality=structured_elements_modality,
+                content_columns=content_columns,
             )
         self._rd_dataset = self._rd_dataset.map_batches(
             _row_fn,
@@ -911,6 +920,36 @@ def embed(
 
         return self
 
+    def caption(self, params: CaptionParams | None = None, **kwargs: Any) -> "BatchIngestor":
+        """
+        Add an image-captioning stage to the batch pipeline.
+
+        Local vLLM mode reserves one GPU per tensor-parallel rank for the actor;
+        remote mode (non-blank ``endpoint_url``) reserves none.
+        """
+        if self._rd_dataset is None:
+            raise RuntimeError("No Ray Dataset to caption. Run .files(...) / .extract(...) first.")
+
+        resolved = _coerce_params(params, CaptionParams, kwargs)
+        use_remote = bool((resolved.endpoint_url or "").strip())  # Same local/remote test as CaptionActor.
+        if use_remote and not resolved.api_key:
+            resolved = resolved.model_copy(update={"api_key": resolve_remote_api_key()})
+
+        from nemo_retriever.caption.caption import CaptionActor
+
+        caption_num_gpus = 0.0 if use_remote else float(max(1, resolved.tensor_parallel_size))
+        self._rd_dataset = self._rd_dataset.map_batches(
+            CaptionActor,
+            batch_size=resolved.batch_size or 8,
+            batch_format="pandas",
+            num_gpus=caption_num_gpus,
+            concurrency=1,
+            fn_constructor_kwargs={"params": resolved},
+        )
+
+        self._caption_enabled = True
+        return self
+
     def vdb_upload(self, params: VdbUploadParams | None = None, **kwargs: Any) -> "BatchIngestor":
         """
         Add a streaming LanceDB upload stage to the batch pipeline.
diff --git a/nemo_retriever/src/nemo_retriever/model/local/nemotron_vlm_captioner.py b/nemo_retriever/src/nemo_retriever/model/local/nemotron_vlm_captioner.py
index c881f5c5f..8264a82f5 100644
--- a/nemo_retriever/src/nemo_retriever/model/local/nemotron_vlm_captioner.py
+++ b/nemo_retriever/src/nemo_retriever/model/local/nemotron_vlm_captioner.py
@@ -90,7 +90,7 @@ def __init__(
             from vllm import LLM, SamplingParams  # noqa: F401
         except ImportError as e:
             raise ImportError(
-                "Local VLM captioning requires vLLM. " 'Install with: pip install "nemo-retriever[vlm-caption]"'
+                'Local VLM captioning requires vLLM. Install with: pip install "nemo-retriever[vlm-caption]"'
             ) from e
 
         self._model_path = model_path
diff --git a/nemo_retriever/src/nemo_retriever/version.py b/nemo_retriever/src/nemo_retriever/version.py
index 9999c919c..13ec2bd3b 100644
--- a/nemo_retriever/src/nemo_retriever/version.py
+++ b/nemo_retriever/src/nemo_retriever/version.py
@@ -12,6 +12,7 @@
 from pathlib import Path
 import os
 import subprocess
+import tempfile
 
 try:
     from ._build_info import BUILD_DATE as _PACKAGE_BUILD_DATE
@@ -23,6 +24,7 @@
 
 _PKG_NAME = "nemo-retriever"
 _UNKNOWN = "unknown"
+_BUILD_STAMP = Path(tempfile.gettempdir()) / ".nemo_retriever_build_stamp"
 
 
 def _utc_now() -> datetime:
@@ -57,7 +59,28 @@ def _build_datetime() -> datetime:
         except ValueError:
             pass
 
-    return _utc_now()
+    # Stamp file in the system temp dir makes the timestamp deterministic
+    # across the two separate subprocesses pip spawns during a PEP 517 build
+    # (metadata + wheel).  We use tempdir rather than the source tree because
+    # pip may copy the source to different locations for each step.
+    if _BUILD_STAMP.exists():
+        try:
+            cached = _BUILD_STAMP.read_text().strip()
+            if cached:
+                ts = float(cached)
+                # Only reuse if less than 60 s old to avoid stale stamps.
+                if abs(_utc_now().timestamp() - ts) < 60:
+                    return datetime.fromtimestamp(ts, tz=timezone.utc)
+            _BUILD_STAMP.unlink(missing_ok=True)
+        except (OSError, ValueError):
+            pass
+
+    now = _utc_now()
+    try:
+        _BUILD_STAMP.write_text(str(now.timestamp()))
+    except OSError:
+        pass
+    return now
 
 
 @lru_cache(maxsize=1)
@@ -108,18 +131,6 @@ def _base_version() -> str:
     return os.getenv("RETRIEVER_VERSION") or os.getenv("NV_INGEST_VERSION") or _build_datetime().strftime("%Y.%m.%d")
 
 
-def _has_prerelease(version_str: str) -> bool:
-    """Return True if *version_str* already contains a PEP 440 pre-release segment."""
-    from packaging.version import Version
-
-    try:
-        return Version(version_str).pre is not None
-    except Exception:
-        import re
-
-        return bool(re.search(r"(a|alpha|b|beta|rc|c|dev|pre)[-_.]?\d*", version_str, re.I))
-
-
 def get_build_version() -> str:
     """Return a PEP 440 compliant version string for packaging."""
     release_type = (os.getenv("RETRIEVER_RELEASE_TYPE") or os.getenv("NV_INGEST_RELEASE_TYPE") or "dev").lower()
@@ -128,8 +139,6 @@ def get_build_version() -> str:
     build_number = _build_number()
 
     if release_type == "release":
-        if _has_prerelease(base_version):
-            return base_version
         return f"{base_version}.post{build_number}" if int(build_number) > 0 else base_version
     if release_type == "dev":
         return f"{base_version}.dev{build_number}"

From cca500148d5d26c7ec40ae957042352998efbedd Mon Sep 17 00:00:00 2001
From: edknv <edwardk@nvidia.com>
Date: Thu, 19 Mar 2026 18:51:24 -0700
Subject: [PATCH 04/20] build endpoint working

---
 nemo_retriever/README.md                      | 62 +++++++++++++------
 nemo_retriever/pyproject.toml                 |  4 --
 .../nemo_retriever/examples/batch_pipeline.py | 21 ++++++-
 .../examples/inprocess_pipeline.py            | 44 +++++++++----
 4 files changed, 93 insertions(+), 38 deletions(-)

diff --git a/nemo_retriever/README.md b/nemo_retriever/README.md
index 62b7af995..a2300cd41 100644
--- a/nemo_retriever/README.md
+++ b/nemo_retriever/README.md
@@ -61,43 +61,67 @@ UV_TORCH_BACKEND=cu130 uv pip install torch torchvision
 ## Image Captioning (optional)
 
 NeMo Retriever Library can caption extracted images using a local VLM
-([Nemotron Nano 12B v2 VL](https://huggingface.co/nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16)).
-This requires [vLLM](https://github.com/vllm-project/vllm) and
-[mamba-ssm](https://github.com/state-spaces/mamba), which must be installed
-separately because they contain CUDA kernels that must match your torch build.
+([Nemotron Nano 12B v2 VL](https://huggingface.co/nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16))
+powered by [vLLM](https://github.com/vllm-project/vllm), or by calling a
+remote VLM endpoint.
 
-```bash
-# Install vLLM (--no-deps prevents overwriting torch, transformers, etc.)
-uv pip install --no-deps vllm>=0.16.0
+### Install vLLM
 
-# Build mamba-ssm from source against your torch (takes a few minutes)
-uv pip install --no-deps --no-build-isolation mamba-ssm>=2.3.1
+```bash
+uv pip install vllm --extra-index-url https://pypi.ngc.nvidia.com
 ```
 
-> **Note:** `--no-deps` is required because vLLM's dependency solver would
-> downgrade `transformers` and `huggingface-hub` to incompatible versions.
+The NGC index provides cu130 wheels that match the torch build installed above.
 
-After installing, add `--caption` and `--caption-device` to your pipeline command:
+### Local captioning
+
+Add `--caption` to your pipeline command. The VLM model is downloaded from
+Hugging Face on first use and loaded via vLLM for inference.
 
 ```bash
 python -m nemo_retriever.examples.inprocess_pipeline \
   data/multimodal_test.pdf \
-  --caption \
-  --caption-device cuda:1
+  --caption
 ```
 
-`--caption-device` places the VLM on a separate GPU so it does not compete with
-the page-elements, OCR, and embedding models. If omitted, a warning is printed
-and the VLM defaults to `cuda:0`.
+### Remote captioning
+
+If you have a VLM endpoint running (e.g. via `vllm serve`), pass the URL
+instead:
 
-Supported `--caption-model-name` values:
+```bash
+python -m nemo_retriever.examples.inprocess_pipeline \
+  data/multimodal_test.pdf \
+  --caption-invoke-url http://vlm:8000/v1/chat/completions
+```
 
-| Model | Precision | Notes |
+### Supported models
+
+| `--caption-model-name` | Precision | Notes |
 |---|---|---|
 | `nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16` (default) | BFloat16 | Works on SM80+ (A100, A10, RTX 3090, ...) |
 | `nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-FP8` | FP8 | Works on SM80+ |
 | `nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-NVFP4-QAD` | NVFP4 | Requires SM89+ (Ada Lovelace / Hopper) |
 
+### Python API
+
+```python
+from nemo_retriever import create_ingestor
+from nemo_retriever.params import CaptionParams, ExtractParams
+
+ingestor = create_ingestor(run_mode="inprocess")
+results = (
+    ingestor
+    .files("doc.pdf")
+    .extract(ExtractParams(extract_images=True))
+    .caption(CaptionParams())                    # local vLLM
+    # or: .caption(CaptionParams(endpoint_url="http://vlm:8000/v1/chat/completions"))
+    .embed()
+    .vdb_upload()
+    .ingest()
+)
+```
+
 ## Run the pipeline
 
 The [test PDF](../data/multimodal_test.pdf) contains text, tables, charts, and images. Additional test data resides [here](../data/).
diff --git a/nemo_retriever/pyproject.toml b/nemo_retriever/pyproject.toml
index fea03d44f..19c522057 100644
--- a/nemo_retriever/pyproject.toml
+++ b/nemo_retriever/pyproject.toml
@@ -83,12 +83,8 @@ dependencies = [
 svg = [
   "cairosvg>=2.7.0",
 ]
-# Install with: pip install ".[vlm-caption]"
-# mamba-ssm must be built from source against the installed torch:
-#   uv pip install --no-deps --no-build-isolation mamba-ssm
 vlm-caption = [
   "vllm>=0.16.0",
-  "mamba-ssm>=2.3.1",
 ]
 dev = [
   "build>=1.2.2",
diff --git a/nemo_retriever/src/nemo_retriever/examples/batch_pipeline.py b/nemo_retriever/src/nemo_retriever/examples/batch_pipeline.py
index 784a8c8aa..bed85299a 100644
--- a/nemo_retriever/src/nemo_retriever/examples/batch_pipeline.py
+++ b/nemo_retriever/src/nemo_retriever/examples/batch_pipeline.py
@@ -512,6 +512,21 @@ def main(
             "(used when --table-output-format=markdown)."
         ),
     ),
+    extract_text: bool = typer.Option(
+        True,
+        "--extract-text/--no-extract-text",
+        help="Extract text from PDF pages.",
+    ),
+    extract_tables: bool = typer.Option(
+        True,
+        "--extract-tables/--no-extract-tables",
+        help="Extract tables from PDF pages.",
+    ),
+    extract_charts: bool = typer.Option(
+        True,
+        "--extract-charts/--no-extract-charts",
+        help="Extract charts from PDF pages.",
+    ),
     extract_infographics: bool = typer.Option(
         False,
         "--extract-infographics/--no-extract-infographics",
@@ -729,9 +744,9 @@ def _extract_params(batch_tuning: dict, **overrides: Any) -> ExtractParams:
             return ExtractParams(
                 method=method,
                 dpi=int(dpi),
-                extract_text=True,
-                extract_tables=True,
-                extract_charts=True,
+                extract_text=extract_text,
+                extract_tables=extract_tables,
+                extract_charts=extract_charts,
                 extract_infographics=extract_infographics,
                 extract_page_as_image=extract_page_as_image,
                 api_key=extract_remote_api_key,
diff --git a/nemo_retriever/src/nemo_retriever/examples/inprocess_pipeline.py b/nemo_retriever/src/nemo_retriever/examples/inprocess_pipeline.py
index 71802f879..629c82f7e 100644
--- a/nemo_retriever/src/nemo_retriever/examples/inprocess_pipeline.py
+++ b/nemo_retriever/src/nemo_retriever/examples/inprocess_pipeline.py
@@ -93,6 +93,26 @@ def main(
         "--embed-model-name",
         help="Embedding model name passed to .embed().",
     ),
+    extract_text: bool = typer.Option(
+        True,
+        "--extract-text/--no-extract-text",
+        help="Extract text from PDF pages.",
+    ),
+    extract_tables: bool = typer.Option(
+        True,
+        "--extract-tables/--no-extract-tables",
+        help="Extract tables from PDF pages.",
+    ),
+    extract_charts: bool = typer.Option(
+        True,
+        "--extract-charts/--no-extract-charts",
+        help="Extract charts from PDF pages.",
+    ),
+    extract_infographics: bool = typer.Option(
+        False,
+        "--extract-infographics/--no-extract-infographics",
+        help="Extract infographics from PDF pages.",
+    ),
     method: str = typer.Option(
         "pdfium",
         "--method",
@@ -239,10 +259,10 @@ def main(
         ingestor = ingestor.files(file_patterns).extract_image_files(
             ExtractParams(
                 method=method,
-                extract_text=True,
-                extract_tables=True,
-                extract_charts=True,
-                extract_infographics=False,
+                extract_text=extract_text,
+                extract_tables=extract_tables,
+                extract_charts=extract_charts,
+                extract_infographics=extract_infographics,
                 use_graphic_elements=use_graphic_elements,
                 graphic_elements_invoke_url=graphic_elements_invoke_url,
                 use_table_structure=use_table_structure,
@@ -256,10 +276,10 @@ def main(
         ingestor = ingestor.files(file_patterns).extract(
             ExtractParams(
                 method=method,
-                extract_text=True,
-                extract_tables=True,
-                extract_charts=True,
-                extract_infographics=False,
+                extract_text=extract_text,
+                extract_tables=extract_tables,
+                extract_charts=extract_charts,
+                extract_infographics=extract_infographics,
                 use_graphic_elements=use_graphic_elements,
                 graphic_elements_invoke_url=graphic_elements_invoke_url,
                 use_table_structure=use_table_structure,
@@ -273,10 +293,10 @@ def main(
         ingestor = ingestor.files(file_patterns).extract(
             ExtractParams(
                 method=method,
-                extract_text=True,
-                extract_tables=True,
-                extract_charts=True,
-                extract_infographics=False,
+                extract_text=extract_text,
+                extract_tables=extract_tables,
+                extract_charts=extract_charts,
+                extract_infographics=extract_infographics,
                 use_graphic_elements=use_graphic_elements,
                 graphic_elements_invoke_url=graphic_elements_invoke_url,
                 use_table_structure=use_table_structure,

From 1384c6f3a2c0a29f39127a93416069774b60a2dc Mon Sep 17 00:00:00 2001
From: edknv <edwardk@nvidia.com>
Date: Thu, 19 Mar 2026 21:36:39 -0700
Subject: [PATCH 05/20] add context window

---
 nemo_retriever/README.md                      |  70 ----------
 .../src/nemo_retriever/caption/caption.py     | 130 +++++++++++-------
 .../nemo_retriever/examples/batch_pipeline.py |   6 +
 .../examples/inprocess_pipeline.py            |   6 +
 .../src/nemo_retriever/params/models.py       |   1 +
 nemo_retriever/tests/test_caption.py          | 107 ++++++++++++++
 6 files changed, 199 insertions(+), 121 deletions(-)
 create mode 100644 nemo_retriever/tests/test_caption.py

diff --git a/nemo_retriever/README.md b/nemo_retriever/README.md
index a2300cd41..6a0ac50db 100644
--- a/nemo_retriever/README.md
+++ b/nemo_retriever/README.md
@@ -52,76 +52,6 @@ uv pip install torch==2.9.1 torchvision -i https://download.pytorch.org/whl/cu13
 ```
 This ensures the OCR and GPU‑accelerated components in NeMo Retriever Library run against the right CUDA runtime.
 
-Alternatively, if you have uv 0.7+ you can set `UV_TORCH_BACKEND` to select the correct PyTorch CUDA index automatically:
-
-```bash
-UV_TORCH_BACKEND=cu130 uv pip install torch torchvision
-```
-
-## Image Captioning (optional)
-
-NeMo Retriever Library can caption extracted images using a local VLM
-([Nemotron Nano 12B v2 VL](https://huggingface.co/nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16))
-powered by [vLLM](https://github.com/vllm-project/vllm), or by calling a
-remote VLM endpoint.
-
-### Install vLLM
-
-```bash
-uv pip install vllm --extra-index-url https://pypi.ngc.nvidia.com
-```
-
-The NGC index provides cu130 wheels that match the torch build installed above.
-
-### Local captioning
-
-Add `--caption` to your pipeline command. The VLM model is downloaded from
-Hugging Face on first use and loaded via vLLM for inference.
-
-```bash
-python -m nemo_retriever.examples.inprocess_pipeline \
-  data/multimodal_test.pdf \
-  --caption
-```
-
-### Remote captioning
-
-If you have a VLM endpoint running (e.g. via `vllm serve`), pass the URL
-instead:
-
-```bash
-python -m nemo_retriever.examples.inprocess_pipeline \
-  data/multimodal_test.pdf \
-  --caption-invoke-url http://vlm:8000/v1/chat/completions
-```
-
-### Supported models
-
-| `--caption-model-name` | Precision | Notes |
-|---|---|---|
-| `nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16` (default) | BFloat16 | Works on SM80+ (A100, A10, RTX 3090, ...) |
-| `nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-FP8` | FP8 | Works on SM80+ |
-| `nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-NVFP4-QAD` | NVFP4 | Requires SM89+ (Ada Lovelace / Hopper) |
-
-### Python API
-
-```python
-from nemo_retriever import create_ingestor
-from nemo_retriever.params import CaptionParams, ExtractParams
-
-ingestor = create_ingestor(run_mode="inprocess")
-results = (
-    ingestor
-    .files("doc.pdf")
-    .extract(ExtractParams(extract_images=True))
-    .caption(CaptionParams())                    # local vLLM
-    # or: .caption(CaptionParams(endpoint_url="http://vlm:8000/v1/chat/completions"))
-    .embed()
-    .vdb_upload()
-    .ingest()
-)
-```
-
 ## Run the pipeline
 
 The [test PDF](../data/multimodal_test.pdf) contains text, tables, charts, and images. Additional test data resides [here](../data/).
diff --git a/nemo_retriever/src/nemo_retriever/caption/caption.py b/nemo_retriever/src/nemo_retriever/caption/caption.py
index 738d2ee48..5cad882e8 100644
--- a/nemo_retriever/src/nemo_retriever/caption/caption.py
+++ b/nemo_retriever/src/nemo_retriever/caption/caption.py
@@ -10,6 +10,8 @@
 
 from nemo_retriever.params import CaptionParams
 
+_MAX_CONTEXT_TEXT_CHARS = 4096
+
 
 class CaptionActor:
     """Ray Data actor that holds a local VLM captioner on a single GPU.
@@ -39,6 +41,16 @@ def __call__(self, batch_df: Any) -> Any:
         return caption_images(batch_df, model=self._model, **self._kwargs)
 
 
+def _build_prompt_with_context(base_prompt: str, context_text: str) -> str:
+    """Return *base_prompt*, preceded by a delimited block of nearby page text.
+
+    An empty *context_text* yields *base_prompt* unchanged.
+    """
+    if context_text:
+        return f"Text near this image:\n---\n{context_text}\n---\n\n{base_prompt}"
+    return base_prompt
+
+
 def _caption_batch_remote(
     base64_images: List[str],
     *,
@@ -89,6 +101,32 @@ def _caption_batch_local(
     )
 
 
+def _caption_one(
+    b64: str,
+    *,
+    model: Any,
+    endpoint_url: str | None,
+    model_name: str,
+    api_key: str | None,
+    prompt: str,
+    system_prompt: str | None,
+    temperature: float,
+) -> str:
+    """Caption one image with its own prompt: local *model* if given, else the remote endpoint."""
+    if model is not None:
+        local = _caption_batch_local(
+            [b64], model=model, prompt=prompt,
+            system_prompt=system_prompt, temperature=temperature,
+        )
+        return local[0] if local else ""
+    remote = _caption_batch_remote(
+        [b64], endpoint_url=endpoint_url,  # type: ignore[arg-type]
+        model_name=model_name, api_key=api_key, prompt=prompt,
+        system_prompt=system_prompt, temperature=temperature,
+    )
+    return remote[0] if remote else ""
+
+
 def caption_images(
     batch_df: pd.DataFrame,
     *,
@@ -100,6 +138,7 @@ def caption_images(
     system_prompt: str | None = "/no_think",
     temperature: float = 1.0,
     batch_size: int = 8,
+    context_text_max_chars: int = 0,
     **kwargs: Any,
 ) -> pd.DataFrame:
     """Caption images in the ``images`` column using a VLM.
@@ -111,34 +150,14 @@ def caption_images(
     * **Local** (``model`` is set): runs inference through a local
       ``NemotronVLMCaptioner`` instance loaded from Hugging Face.
 
+    When ``context_text_max_chars`` is greater than zero, the page's ``text``
+    column is prepended to the prompt for each image so the VLM can use
+    surrounding OCR text as context.  In this mode images are captioned
+    one at a time (each gets its own enriched prompt).
+
     For each row, any item in the ``images`` list whose ``text`` field is
     empty will be captioned.  The returned caption is written back into
     ``images[i]["text"]``.
-
-    Parameters
-    ----------
-    batch_df : pd.DataFrame
-        DataFrame with an ``images`` column containing lists of dicts with
-        keys ``image_b64``, ``text``, and ``bbox_xyxy_norm``.
-    model : NemotronVLMCaptioner | None
-        Pre-loaded local VLM model.  When provided, ``endpoint_url`` is
-        ignored and inference runs in-process.
-    endpoint_url : str | None
-        URL of a remote VLM HTTP endpoint.
-    model_name : str
-        Model identifier passed to the remote VLM endpoint (ignored for
-        local mode).
-    api_key : str | None
-        Bearer token for the remote VLM endpoint.
-    prompt : str
-        Text prompt sent alongside each image.
-    system_prompt : str | None
-        Optional system prompt for the VLM.
-    temperature : float
-        Sampling temperature.
-    batch_size : int
-        Number of images per remote VLM request (local mode processes
-        images one at a time).
     """
     if not isinstance(batch_df, pd.DataFrame) or batch_df.empty:
         return batch_df
@@ -157,6 +176,9 @@ def caption_images(
             gpu_memory_utilization=kwargs.get("gpu_memory_utilization", 0.9),
         )
 
+    use_context = context_text_max_chars > 0
+    effective_max = min(context_text_max_chars, _MAX_CONTEXT_TEXT_CHARS) if use_context else 0
+
     # Collect all (row_idx, item_idx, image_b64) needing captions.
     pending: List[Tuple[int, int, str]] = []
     for row_idx, row in batch_df.iterrows():
@@ -175,33 +197,39 @@ def caption_images(
     if not pending:
         return batch_df
 
-    # Generate captions.
-    all_captions: List[str] = []
-    for start in range(0, len(pending), batch_size):
-        chunk_b64 = [b64 for _, _, b64 in pending[start : start + batch_size]]
-
-        if model is not None:
-            captions = _caption_batch_local(
-                chunk_b64,
-                model=model,
-                prompt=prompt,
-                system_prompt=system_prompt,
+    if use_context:
+        # Each image gets a per-page enriched prompt, so caption one at a time.
+        for row_idx, item_idx, b64 in pending:
+            page_text = batch_df.at[row_idx, "text"] if "text" in batch_df.columns else ""
+            context = (page_text or "")[:effective_max]
+            enriched_prompt = _build_prompt_with_context(prompt, context)
+            caption = _caption_one(
+                b64, model=model, endpoint_url=endpoint_url,
+                model_name=model_name, api_key=api_key,
+                prompt=enriched_prompt, system_prompt=system_prompt,
                 temperature=temperature,
             )
-        else:
-            captions = _caption_batch_remote(
-                chunk_b64,
-                endpoint_url=endpoint_url,  # type: ignore[arg-type]
-                model_name=model_name,
-                api_key=api_key,
-                prompt=prompt,
-                system_prompt=system_prompt,
-                temperature=temperature,
-            )
-        all_captions.extend(captions)
-
-    # Write captions back into the DataFrame.
-    for (row_idx, item_idx, _), caption in zip(pending, all_captions):
-        batch_df.at[row_idx, "images"][item_idx]["text"] = caption
+            batch_df.at[row_idx, "images"][item_idx]["text"] = caption
+    else:
+        # Batch mode: all images share the same prompt.
+        all_captions: List[str] = []
+        for start in range(0, len(pending), batch_size):
+            chunk_b64 = [b64 for _, _, b64 in pending[start : start + batch_size]]
+
+            if model is not None:
+                captions = _caption_batch_local(
+                    chunk_b64, model=model, prompt=prompt,
+                    system_prompt=system_prompt, temperature=temperature,
+                )
+            else:
+                captions = _caption_batch_remote(
+                    chunk_b64, endpoint_url=endpoint_url,  # type: ignore[arg-type]
+                    model_name=model_name, api_key=api_key, prompt=prompt,
+                    system_prompt=system_prompt, temperature=temperature,
+                )
+            all_captions.extend(captions)
+
+        for (row_idx, item_idx, _), caption in zip(pending, all_captions):
+            batch_df.at[row_idx, "images"][item_idx]["text"] = caption
 
     return batch_df
diff --git a/nemo_retriever/src/nemo_retriever/examples/batch_pipeline.py b/nemo_retriever/src/nemo_retriever/examples/batch_pipeline.py
index bed85299a..4090618fa 100644
--- a/nemo_retriever/src/nemo_retriever/examples/batch_pipeline.py
+++ b/nemo_retriever/src/nemo_retriever/examples/batch_pipeline.py
@@ -557,6 +557,11 @@ def main(
         "--caption-device",
         help="GPU device for the local VLM captioner (e.g. 'cuda:1').",
     ),
+    caption_context_text_max_chars: int = typer.Option(
+        0,
+        "--caption-context-text-max-chars",
+        help="Max characters of surrounding page text to include in the VLM prompt. 0 disables context.",
+    ),
     text_chunk: bool = typer.Option(
         False,
         "--text-chunk",
@@ -790,6 +795,7 @@ def _extract_params(batch_tuning: dict, **overrides: Any) -> ExtractParams:
                     endpoint_url=caption_invoke_url,
                     model_name=caption_model_name,
                     device=caption_device,
+                    context_text_max_chars=caption_context_text_max_chars,
                 )
             )
 
diff --git a/nemo_retriever/src/nemo_retriever/examples/inprocess_pipeline.py b/nemo_retriever/src/nemo_retriever/examples/inprocess_pipeline.py
index 629c82f7e..c8fda38b9 100644
--- a/nemo_retriever/src/nemo_retriever/examples/inprocess_pipeline.py
+++ b/nemo_retriever/src/nemo_retriever/examples/inprocess_pipeline.py
@@ -192,6 +192,11 @@ def main(
         "--caption-device",
         help="GPU device for the local VLM captioner (e.g. 'cuda:1'). Defaults to the first --gpu-devices entry.",
     ),
+    caption_context_text_max_chars: int = typer.Option(
+        0,
+        "--caption-context-text-max-chars",
+        help="Max characters of surrounding page text to include in the VLM prompt. 0 disables context.",
+    ),
     hybrid: bool = typer.Option(
         False,
         "--hybrid/--no-hybrid",
@@ -323,6 +328,7 @@ def main(
                 endpoint_url=caption_invoke_url,
                 model_name=caption_model_name,
                 device=caption_device,
+                context_text_max_chars=caption_context_text_max_chars,
             )
         )
 
diff --git a/nemo_retriever/src/nemo_retriever/params/models.py b/nemo_retriever/src/nemo_retriever/params/models.py
index 6fd246966..8f48e125e 100644
--- a/nemo_retriever/src/nemo_retriever/params/models.py
+++ b/nemo_retriever/src/nemo_retriever/params/models.py
@@ -312,6 +312,7 @@ class CaptionParams(_ParamsModel):
     batch_size: int = 8
     device: Optional[str] = None
     hf_cache_dir: Optional[str] = None
+    context_text_max_chars: int = 0
     tensor_parallel_size: int = 1
     gpu_memory_utilization: float = 0.9
 
diff --git a/nemo_retriever/tests/test_caption.py b/nemo_retriever/tests/test_caption.py
new file mode 100644
index 000000000..3bf6844be
--- /dev/null
+++ b/nemo_retriever/tests/test_caption.py
@@ -0,0 +1,107 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Tests for image captioning pipeline stage."""
+
+import base64
+import io
+from unittest.mock import MagicMock, patch
+
+import pandas as pd
+import pytest
+
+PIL = pytest.importorskip("PIL")
+from PIL import Image  # noqa: E402
+
+
+def _make_1x1_png_b64() -> str:
+    img = Image.new("RGB", (1, 1), color=(255, 0, 0))
+    buf = io.BytesIO()
+    img.save(buf, format="PNG")
+    return base64.b64encode(buf.getvalue()).decode("ascii")
+
+
+def _make_page_df(num_images=2, captioned=False):
+    b64 = _make_1x1_png_b64()
+    images = [
+        {"bbox_xyxy_norm": [0.1, 0.2, 0.5, 0.8], "text": "done" if captioned else "", "image_b64": b64}
+        for _ in range(num_images)
+    ]
+    return pd.DataFrame([{"text": "page", "images": images, "tables": [], "charts": [], "infographics": []}])
+
+
+def test_caption_images_writes_back():
+    from nemo_retriever.caption.caption import caption_images
+
+    mock_model = MagicMock()
+    mock_model.caption_batch.return_value = ["cap1", "cap2"]
+    result = caption_images(_make_page_df(), model=mock_model)
+    assert result.iloc[0]["images"][0]["text"] == "cap1"
+    assert result.iloc[0]["images"][1]["text"] == "cap2"
+
+
+def test_caption_images_skips_already_captioned():
+    from nemo_retriever.caption.caption import caption_images
+
+    mock_model = MagicMock()
+    result = caption_images(_make_page_df(captioned=True), model=mock_model)
+    mock_model.caption_batch.assert_not_called()
+    assert result.iloc[0]["images"][0]["text"] == "done"
+
+
+@patch("nemo_retriever.pdf.extract.extract_image_like_objects_from_pdfium_page")
+def test_pdf_extraction_populates_images(mock_extract):
+    _ext = pytest.importorskip("nemo_retriever.pdf.extract")
+    pdfium = pytest.importorskip("pypdfium2")
+
+    mock_img = MagicMock(image=_make_1x1_png_b64(), bbox=(10, 20, 100, 200), max_width=612, max_height=792)
+    mock_extract.return_value = [mock_img]
+
+    doc = pdfium.PdfDocument.new()
+    doc.new_page(612, 792)
+    buf = io.BytesIO()
+    doc.save(buf)
+    doc.close()
+
+    result = _ext.pdf_extraction(pd.DataFrame([{"bytes": buf.getvalue(), "path": "t.pdf", "page_number": 1}]), extract_images=True)
+    images = result.iloc[0]["images"]
+    assert len(images) == 1
+    assert images[0]["text"] == ""
+    assert abs(images[0]["bbox_xyxy_norm"][0] - 10 / 612) < 1e-6
+
+
+def test_explode_includes_captioned_images():
+    from nemo_retriever.ingest_modes.inprocess import explode_content_to_rows
+
+    b64 = _make_1x1_png_b64()
+    df = pd.DataFrame([{
+        "text": "page",
+        "page_image": {"image_b64": b64},
+        "images": [{"text": "a dog", "bbox_xyxy_norm": [0.1, 0.2, 0.5, 0.8], "image_b64": b64}],
+        "tables": [], "charts": [], "infographics": [],
+    }])
+    result = explode_content_to_rows(df, content_columns=("table", "chart", "infographic", "images"))
+    assert len(result) == 2  # page text + image caption
+
+    # Default columns exclude images
+    result2 = explode_content_to_rows(df)
+    assert len(result2) == 1
+
+
+def test_context_text_prepended_to_prompt():
+    from nemo_retriever.caption.caption import caption_images
+
+    mock_model = MagicMock()
+    mock_model.caption_batch.return_value = ["captioned with context"]
+
+    df = _make_page_df(num_images=1)
+    df.at[0, "text"] = "The quick brown fox jumps over the lazy dog."
+
+    result = caption_images(df, model=mock_model, context_text_max_chars=100)
+
+    assert result.iloc[0]["images"][0]["text"] == "captioned with context"
+    # The prompt passed to caption_batch should contain the page text.
+    call_kwargs = mock_model.caption_batch.call_args[1]
+    assert "quick brown fox" in call_kwargs["prompt"]
+    assert "Text near this image:" in call_kwargs["prompt"]

From 8ba2c81e9379748ba157bf1ceca3184447c69b87 Mon Sep 17 00:00:00 2001
From: edknv <edwardk@nvidia.com>
Date: Thu, 19 Mar 2026 21:44:32 -0700
Subject: [PATCH 06/20] update readme

---
 nemo_retriever/README.md | 34 +++++++++++++++++++++++++++++++++-
 1 file changed, 33 insertions(+), 1 deletion(-)

diff --git a/nemo_retriever/README.md b/nemo_retriever/README.md
index 6a0ac50db..b454bbcf1 100644
--- a/nemo_retriever/README.md
+++ b/nemo_retriever/README.md
@@ -48,10 +48,18 @@ Use the CUDA 13.0 wheels from the dedicated index by running the following comma
 
 ```bash
 uv pip uninstall torch torchvision
-uv pip install torch==2.9.1 torchvision -i https://download.pytorch.org/whl/cu130
+uv pip install torch==2.9.1 torchvision --torch-backend=cu130
 ```
 This ensures the OCR and GPU‑accelerated components in NeMo Retriever Library run against the right CUDA runtime.
 
+3. (Optional) Install vLLM for image captioning
+
+If you want to generate captions for extracted images, install [vLLM](https://docs.vllm.ai/).
+
+```bash
+uv pip install vllm --torch-backend=cu130
+```
+
 ## Run the pipeline
 
 The [test PDF](../data/multimodal_test.pdf) contains text, tables, charts, and images. Additional test data resides [here](../data/).
@@ -242,6 +250,30 @@ ingestor = create_ingestor(run_mode="batch")
 ingestor = ingestor.files([str(INPUT_AUDIO)]).extract_audio()
 ```
 
+### Caption extracted images
+
+Use `.caption()` to generate text descriptions for extracted images using a local VLM. Requires vLLM (see step 3 above).
+
+```python
+ingestor = (
+  ingestor.files(documents)
+  .extract()
+  .caption()
+  .embed()
+  .vdb_upload()
+)
+```
+
+By default this uses [Nemotron Nano 12B v2 VL](https://huggingface.co/nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16). You can customize the model and prompt:
+
+```python
+.caption(
+  model_name="nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16",
+  prompt="Describe this image in detail:",
+  context_text_max_chars=1024,  # include surrounding page text as context
+)
+```
+
 ### Explore Different Pipeline Options:
 
 You can use the [Nemotron RAG VL Embedder](https://huggingface.co/nvidia/llama-nemotron-embed-vl-1b-v2)

From 06e5d8ea64a5050c92efbe92546bc02486008d83 Mon Sep 17 00:00:00 2001
From: edknv <edwardk@nvidia.com>
Date: Fri, 20 Mar 2026 13:25:23 -0700
Subject: [PATCH 07/20] install vllm wheels for cu130 support

---
 nemo_retriever/README.md      | 4 ++--
 nemo_retriever/pyproject.toml | 4 ++++
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/nemo_retriever/README.md b/nemo_retriever/README.md
index b454bbcf1..566d50980 100644
--- a/nemo_retriever/README.md
+++ b/nemo_retriever/README.md
@@ -54,10 +54,10 @@ This ensures the OCR and GPU‑accelerated components in NeMo Retriever Library
 
 3. (Optional) Install vLLM for image captioning
 
-If you want to generate captions for extracted images, install [vLLM](https://docs.vllm.ai/).
+If you want to generate captions for extracted images, install the `vlm-caption` extra, which includes [vLLM](https://docs.vllm.ai/) built for CUDA 13.
 
 ```bash
-uv pip install vllm --torch-backend=cu130
+uv pip install "nemo-retriever[vlm-caption]"
 ```
 
 ## Run the pipeline
diff --git a/nemo_retriever/pyproject.toml b/nemo_retriever/pyproject.toml
index 3c43286f1..4daf510e4 100644
--- a/nemo_retriever/pyproject.toml
+++ b/nemo_retriever/pyproject.toml
@@ -106,6 +106,10 @@ nemotron-table-structure-v1 = { index = "test-pypi" }
 nemotron-ocr = { index = "test-pypi" }
 torch = { index = "torch-cuda"}
 torchvision = { index ="torch-cuda"}
+vllm = [
+  { url = "https://github.com/vllm-project/vllm/releases/download/v0.16.0/vllm-0.16.0+cu130-cp38-abi3-manylinux_2_35_x86_64.whl", marker = "platform_machine == 'x86_64'" },
+  { url = "https://github.com/vllm-project/vllm/releases/download/v0.16.0/vllm-0.16.0+cu130-cp38-abi3-manylinux_2_35_aarch64.whl", marker = "platform_machine == 'aarch64'" },
+]
 
 [[tool.uv.index]]
 name = "test-pypi"

From 58fe3811cc4aa5f7e1c7a4292f6d2306120294ce Mon Sep 17 00:00:00 2001
From: edknv <edwardk@nvidia.com>
Date: Fri, 20 Mar 2026 13:34:41 -0700
Subject: [PATCH 08/20] pin vllm to exact match

---
 nemo_retriever/pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nemo_retriever/pyproject.toml b/nemo_retriever/pyproject.toml
index 4daf510e4..6c6d2d7a7 100644
--- a/nemo_retriever/pyproject.toml
+++ b/nemo_retriever/pyproject.toml
@@ -82,7 +82,7 @@ svg = [
   "cairosvg>=2.7.0",
 ]
 vlm-caption = [
-  "vllm>=0.16.0",
+  "vllm==0.16.0",
 ]
 dev = [
   "build>=1.2.2",

From f90de97d5cd17a769dcad6f80f87f791255b69e4 Mon Sep 17 00:00:00 2001
From: edknv <edwardk@nvidia.com>
Date: Sun, 22 Mar 2026 10:54:05 -0700
Subject: [PATCH 09/20] cache model globally

---
 .../src/nemo_retriever/caption/caption.py     | 71 +++++++++++++------
 1 file changed, 49 insertions(+), 22 deletions(-)

diff --git a/nemo_retriever/src/nemo_retriever/caption/caption.py b/nemo_retriever/src/nemo_retriever/caption/caption.py
index 5cad882e8..6bc0c9b53 100644
--- a/nemo_retriever/src/nemo_retriever/caption/caption.py
+++ b/nemo_retriever/src/nemo_retriever/caption/caption.py
@@ -11,6 +11,22 @@
 from nemo_retriever.params import CaptionParams
 
 _MAX_CONTEXT_TEXT_CHARS = 4096
+_cached_local_model = None
+
+
+def _get_cached_local_model(kwargs: dict) -> "Any":
+    global _cached_local_model
+    if _cached_local_model is None:
+        from nemo_retriever.model.local import NemotronVLMCaptioner
+
+        _cached_local_model = NemotronVLMCaptioner(
+            model_path=kwargs.get("model_name", "nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16"),
+            device=kwargs.get("device"),
+            hf_cache_dir=kwargs.get("hf_cache_dir"),
+            tensor_parallel_size=kwargs.get("tensor_parallel_size", 1),
+            gpu_memory_utilization=kwargs.get("gpu_memory_utilization", 0.9),
+        )
+    return _cached_local_model
 
 
 class CaptionActor:
@@ -115,14 +131,21 @@ def _caption_one(
     """Caption a single image (used when each image gets a unique prompt)."""
     if model is not None:
         captions = _caption_batch_local(
-            [b64], model=model, prompt=prompt,
-            system_prompt=system_prompt, temperature=temperature,
+            [b64],
+            model=model,
+            prompt=prompt,
+            system_prompt=system_prompt,
+            temperature=temperature,
         )
     else:
         captions = _caption_batch_remote(
-            [b64], endpoint_url=endpoint_url,  # type: ignore[arg-type]
-            model_name=model_name, api_key=api_key, prompt=prompt,
-            system_prompt=system_prompt, temperature=temperature,
+            [b64],
+            endpoint_url=endpoint_url,  # type: ignore[arg-type]
+            model_name=model_name,
+            api_key=api_key,
+            prompt=prompt,
+            system_prompt=system_prompt,
+            temperature=temperature,
         )
     return captions[0] if captions else ""
 
@@ -166,15 +189,8 @@ def caption_images(
 
     if model is None and not endpoint_url:
         # Lazy model creation for the sequential (no GPU pool) fallback.
-        from nemo_retriever.model.local import NemotronVLMCaptioner
-
-        model = NemotronVLMCaptioner(
-            model_path=kwargs.get("model_name", "nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16"),
-            device=kwargs.get("device"),
-            hf_cache_dir=kwargs.get("hf_cache_dir"),
-            tensor_parallel_size=kwargs.get("tensor_parallel_size", 1),
-            gpu_memory_utilization=kwargs.get("gpu_memory_utilization", 0.9),
-        )
+        # Cache the model so it is not re-created on every call.
+        model = _get_cached_local_model(kwargs)
 
     use_context = context_text_max_chars > 0
     effective_max = min(context_text_max_chars, _MAX_CONTEXT_TEXT_CHARS) if use_context else 0
@@ -204,9 +220,13 @@ def caption_images(
             context = (page_text or "")[:effective_max]
             enriched_prompt = _build_prompt_with_context(prompt, context)
             caption = _caption_one(
-                b64, model=model, endpoint_url=endpoint_url,
-                model_name=model_name, api_key=api_key,
-                prompt=enriched_prompt, system_prompt=system_prompt,
+                b64,
+                model=model,
+                endpoint_url=endpoint_url,
+                model_name=model_name,
+                api_key=api_key,
+                prompt=enriched_prompt,
+                system_prompt=system_prompt,
                 temperature=temperature,
             )
             batch_df.at[row_idx, "images"][item_idx]["text"] = caption
@@ -218,14 +238,21 @@ def caption_images(
 
             if model is not None:
                 captions = _caption_batch_local(
-                    chunk_b64, model=model, prompt=prompt,
-                    system_prompt=system_prompt, temperature=temperature,
+                    chunk_b64,
+                    model=model,
+                    prompt=prompt,
+                    system_prompt=system_prompt,
+                    temperature=temperature,
                 )
             else:
                 captions = _caption_batch_remote(
-                    chunk_b64, endpoint_url=endpoint_url,  # type: ignore[arg-type]
-                    model_name=model_name, api_key=api_key, prompt=prompt,
-                    system_prompt=system_prompt, temperature=temperature,
+                    chunk_b64,
+                    endpoint_url=endpoint_url,  # type: ignore[arg-type]
+                    model_name=model_name,
+                    api_key=api_key,
+                    prompt=prompt,
+                    system_prompt=system_prompt,
+                    temperature=temperature,
                 )
             all_captions.extend(captions)
 

From 2a3df58f21464669baa6c6b382ff97439daaab24 Mon Sep 17 00:00:00 2001
From: edknv <edwardk@nvidia.com>
Date: Sun, 22 Mar 2026 13:46:12 -0700
Subject: [PATCH 10/20] set gpu memory utilization

---
 .../src/nemo_retriever/examples/batch_pipeline.py           | 6 ++++++
 .../src/nemo_retriever/examples/inprocess_pipeline.py       | 6 ++++++
 2 files changed, 12 insertions(+)

diff --git a/nemo_retriever/src/nemo_retriever/examples/batch_pipeline.py b/nemo_retriever/src/nemo_retriever/examples/batch_pipeline.py
index 4090618fa..00c80ca74 100644
--- a/nemo_retriever/src/nemo_retriever/examples/batch_pipeline.py
+++ b/nemo_retriever/src/nemo_retriever/examples/batch_pipeline.py
@@ -562,6 +562,11 @@ def main(
         "--caption-context-text-max-chars",
         help="Max characters of surrounding page text to include in the VLM prompt. 0 disables context.",
     ),
+    caption_gpu_memory_utilization: float = typer.Option(
+        0.25,
+        "--caption-gpu-memory-utilization",
+        help="Fraction of GPU memory vLLM may use for the caption model (0.0–1.0).",
+    ),
     text_chunk: bool = typer.Option(
         False,
         "--text-chunk",
@@ -796,6 +801,7 @@ def _extract_params(batch_tuning: dict, **overrides: Any) -> ExtractParams:
                     model_name=caption_model_name,
                     device=caption_device,
                     context_text_max_chars=caption_context_text_max_chars,
+                    gpu_memory_utilization=caption_gpu_memory_utilization,
                 )
             )
 
diff --git a/nemo_retriever/src/nemo_retriever/examples/inprocess_pipeline.py b/nemo_retriever/src/nemo_retriever/examples/inprocess_pipeline.py
index c8fda38b9..37324c554 100644
--- a/nemo_retriever/src/nemo_retriever/examples/inprocess_pipeline.py
+++ b/nemo_retriever/src/nemo_retriever/examples/inprocess_pipeline.py
@@ -197,6 +197,11 @@ def main(
         "--caption-context-text-max-chars",
         help="Max characters of surrounding page text to include in the VLM prompt. 0 disables context.",
     ),
+    caption_gpu_memory_utilization: float = typer.Option(
+        0.25,
+        "--caption-gpu-memory-utilization",
+        help="Fraction of GPU memory vLLM may use for the caption model (0.0–1.0).",
+    ),
     hybrid: bool = typer.Option(
         False,
         "--hybrid/--no-hybrid",
@@ -329,6 +334,7 @@ def main(
                 model_name=caption_model_name,
                 device=caption_device,
                 context_text_max_chars=caption_context_text_max_chars,
+                gpu_memory_utilization=caption_gpu_memory_utilization,
             )
         )
 

From 1306f2be819bf3c17880774845f71d069a7f255a Mon Sep 17 00:00:00 2001
From: edknv <edwardk@nvidia.com>
Date: Sun, 22 Mar 2026 14:09:09 -0700
Subject: [PATCH 11/20] set caption batch size

---
 .../src/nemo_retriever/examples/batch_pipeline.py         | 8 +++++++-
 .../src/nemo_retriever/examples/inprocess_pipeline.py     | 8 +++++++-
 2 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/nemo_retriever/src/nemo_retriever/examples/batch_pipeline.py b/nemo_retriever/src/nemo_retriever/examples/batch_pipeline.py
index 00c80ca74..491cbf8b6 100644
--- a/nemo_retriever/src/nemo_retriever/examples/batch_pipeline.py
+++ b/nemo_retriever/src/nemo_retriever/examples/batch_pipeline.py
@@ -562,8 +562,13 @@ def main(
         "--caption-context-text-max-chars",
         help="Max characters of surrounding page text to include in the VLM prompt. 0 disables context.",
     ),
+    caption_batch_size: int = typer.Option(
+        4,
+        "--caption-batch-size",
+        help="Number of images to caption per batch.",
+    ),
     caption_gpu_memory_utilization: float = typer.Option(
-        0.25,
+        0.5,
         "--caption-gpu-memory-utilization",
         help="Fraction of GPU memory vLLM may use for the caption model (0.0–1.0).",
     ),
@@ -801,6 +806,7 @@ def _extract_params(batch_tuning: dict, **overrides: Any) -> ExtractParams:
                     model_name=caption_model_name,
                     device=caption_device,
                     context_text_max_chars=caption_context_text_max_chars,
+                    batch_size=caption_batch_size,
                     gpu_memory_utilization=caption_gpu_memory_utilization,
                 )
             )
diff --git a/nemo_retriever/src/nemo_retriever/examples/inprocess_pipeline.py b/nemo_retriever/src/nemo_retriever/examples/inprocess_pipeline.py
index 37324c554..ebcea181e 100644
--- a/nemo_retriever/src/nemo_retriever/examples/inprocess_pipeline.py
+++ b/nemo_retriever/src/nemo_retriever/examples/inprocess_pipeline.py
@@ -197,8 +197,13 @@ def main(
         "--caption-context-text-max-chars",
         help="Max characters of surrounding page text to include in the VLM prompt. 0 disables context.",
     ),
+    caption_batch_size: int = typer.Option(
+        4,
+        "--caption-batch-size",
+        help="Number of images to caption per batch.",
+    ),
     caption_gpu_memory_utilization: float = typer.Option(
-        0.25,
+        0.5,
         "--caption-gpu-memory-utilization",
         help="Fraction of GPU memory vLLM may use for the caption model (0.0–1.0).",
     ),
@@ -334,6 +339,7 @@ def main(
                 model_name=caption_model_name,
                 device=caption_device,
                 context_text_max_chars=caption_context_text_max_chars,
+                batch_size=caption_batch_size,
                 gpu_memory_utilization=caption_gpu_memory_utilization,
             )
         )

From 858f7ca4a5b6b585c03533d1fc95a413f375b804 Mon Sep 17 00:00:00 2001
From: edknv <edwardk@nvidia.com>
Date: Sun, 22 Mar 2026 14:21:38 -0700
Subject: [PATCH 12/20] remove batch size arg

---
 .../src/nemo_retriever/caption/caption.py     | 33 ++++++++++---------
 .../nemo_retriever/examples/batch_pipeline.py |  8 +----
 .../examples/inprocess_pipeline.py            |  8 +----
 3 files changed, 20 insertions(+), 29 deletions(-)

diff --git a/nemo_retriever/src/nemo_retriever/caption/caption.py b/nemo_retriever/src/nemo_retriever/caption/caption.py
index 6bc0c9b53..92429930c 100644
--- a/nemo_retriever/src/nemo_retriever/caption/caption.py
+++ b/nemo_retriever/src/nemo_retriever/caption/caption.py
@@ -232,21 +232,24 @@ def caption_images(
             batch_df.at[row_idx, "images"][item_idx]["text"] = caption
     else:
         # Batch mode: all images share the same prompt.
-        all_captions: List[str] = []
-        for start in range(0, len(pending), batch_size):
-            chunk_b64 = [b64 for _, _, b64 in pending[start : start + batch_size]]
-
-            if model is not None:
-                captions = _caption_batch_local(
-                    chunk_b64,
-                    model=model,
-                    prompt=prompt,
-                    system_prompt=system_prompt,
-                    temperature=temperature,
-                )
-            else:
+        all_b64 = [b64 for _, _, b64 in pending]
+
+        if model is not None:
+            # Submit all at once — vLLM schedules internally based on
+            # available GPU memory.
+            all_captions = _caption_batch_local(
+                all_b64,
+                model=model,
+                prompt=prompt,
+                system_prompt=system_prompt,
+                temperature=temperature,
+            )
+        else:
+            # Remote endpoints may have request-size limits; chunk.
+            all_captions: List[str] = []
+            for start in range(0, len(all_b64), batch_size):
                 captions = _caption_batch_remote(
-                    chunk_b64,
+                    all_b64[start : start + batch_size],
                     endpoint_url=endpoint_url,  # type: ignore[arg-type]
                     model_name=model_name,
                     api_key=api_key,
@@ -254,7 +257,7 @@ def caption_images(
                     system_prompt=system_prompt,
                     temperature=temperature,
                 )
-            all_captions.extend(captions)
+                all_captions.extend(captions)
 
         for (row_idx, item_idx, _), caption in zip(pending, all_captions):
             batch_df.at[row_idx, "images"][item_idx]["text"] = caption
diff --git a/nemo_retriever/src/nemo_retriever/examples/batch_pipeline.py b/nemo_retriever/src/nemo_retriever/examples/batch_pipeline.py
index 491cbf8b6..b0113efe4 100644
--- a/nemo_retriever/src/nemo_retriever/examples/batch_pipeline.py
+++ b/nemo_retriever/src/nemo_retriever/examples/batch_pipeline.py
@@ -562,13 +562,8 @@ def main(
         "--caption-context-text-max-chars",
         help="Max characters of surrounding page text to include in the VLM prompt. 0 disables context.",
     ),
-    caption_batch_size: int = typer.Option(
-        4,
-        "--caption-batch-size",
-        help="Number of images to caption per batch.",
-    ),
     caption_gpu_memory_utilization: float = typer.Option(
-        0.5,
+        0.4,
         "--caption-gpu-memory-utilization",
         help="Fraction of GPU memory vLLM may use for the caption model (0.0–1.0).",
     ),
@@ -806,7 +801,6 @@ def _extract_params(batch_tuning: dict, **overrides: Any) -> ExtractParams:
                     model_name=caption_model_name,
                     device=caption_device,
                     context_text_max_chars=caption_context_text_max_chars,
-                    batch_size=caption_batch_size,
                     gpu_memory_utilization=caption_gpu_memory_utilization,
                 )
             )
diff --git a/nemo_retriever/src/nemo_retriever/examples/inprocess_pipeline.py b/nemo_retriever/src/nemo_retriever/examples/inprocess_pipeline.py
index ebcea181e..5a89fc433 100644
--- a/nemo_retriever/src/nemo_retriever/examples/inprocess_pipeline.py
+++ b/nemo_retriever/src/nemo_retriever/examples/inprocess_pipeline.py
@@ -197,13 +197,8 @@ def main(
         "--caption-context-text-max-chars",
         help="Max characters of surrounding page text to include in the VLM prompt. 0 disables context.",
     ),
-    caption_batch_size: int = typer.Option(
-        4,
-        "--caption-batch-size",
-        help="Number of images to caption per batch.",
-    ),
     caption_gpu_memory_utilization: float = typer.Option(
-        0.5,
+        0.4,
         "--caption-gpu-memory-utilization",
         help="Fraction of GPU memory vLLM may use for the caption model (0.0–1.0).",
     ),
@@ -339,7 +334,6 @@ def main(
                 model_name=caption_model_name,
                 device=caption_device,
                 context_text_max_chars=caption_context_text_max_chars,
-                batch_size=caption_batch_size,
                 gpu_memory_utilization=caption_gpu_memory_utilization,
             )
         )

From 5a2e0fde8f048f84efd34fd45c72689f2e5ce8d1 Mon Sep 17 00:00:00 2001
From: edknv <edwardk@nvidia.com>
Date: Sun, 22 Mar 2026 16:12:55 -0700
Subject: [PATCH 13/20] skip the OCR stage when no content type needs it

---
 .../src/nemo_retriever/examples/batch_pipeline.py          | 2 +-
 .../src/nemo_retriever/examples/inprocess_pipeline.py      | 2 +-
 nemo_retriever/src/nemo_retriever/ingest_modes/batch.py    | 7 ++++++-
 3 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/nemo_retriever/src/nemo_retriever/examples/batch_pipeline.py b/nemo_retriever/src/nemo_retriever/examples/batch_pipeline.py
index b0113efe4..c53ff0db5 100644
--- a/nemo_retriever/src/nemo_retriever/examples/batch_pipeline.py
+++ b/nemo_retriever/src/nemo_retriever/examples/batch_pipeline.py
@@ -563,7 +563,7 @@ def main(
         help="Max characters of surrounding page text to include in the VLM prompt. 0 disables context.",
     ),
     caption_gpu_memory_utilization: float = typer.Option(
-        0.4,
+        0.5,
         "--caption-gpu-memory-utilization",
         help="Fraction of GPU memory vLLM may use for the caption model (0.0–1.0).",
     ),
diff --git a/nemo_retriever/src/nemo_retriever/examples/inprocess_pipeline.py b/nemo_retriever/src/nemo_retriever/examples/inprocess_pipeline.py
index 5a89fc433..d8c77ff1f 100644
--- a/nemo_retriever/src/nemo_retriever/examples/inprocess_pipeline.py
+++ b/nemo_retriever/src/nemo_retriever/examples/inprocess_pipeline.py
@@ -198,7 +198,7 @@ def main(
         help="Max characters of surrounding page text to include in the VLM prompt. 0 disables context.",
     ),
     caption_gpu_memory_utilization: float = typer.Option(
-        0.4,
+        0.5,
         "--caption-gpu-memory-utilization",
         help="Fraction of GPU memory vLLM may use for the caption model (0.0–1.0).",
     ),
diff --git a/nemo_retriever/src/nemo_retriever/ingest_modes/batch.py b/nemo_retriever/src/nemo_retriever/ingest_modes/batch.py
index 0d1ac3488..e93ef7ed6 100644
--- a/nemo_retriever/src/nemo_retriever/ingest_modes/batch.py
+++ b/nemo_retriever/src/nemo_retriever/ingest_modes/batch.py
@@ -644,7 +644,12 @@ def _append_detection_stages(self, kwargs: dict[str, Any]) -> None:
 
             ocr_flags["inference_batch_size"] = self._requested_plan.get_ocr_batch_size()
 
-            if ocr_flags:
+            # Only append OCR stage if at least one content type needs it.
+            needs_ocr = any(
+                ocr_flags.get(k)
+                for k in ("extract_text", "extract_tables", "extract_charts", "extract_infographics")
+            )
+            if needs_ocr:
                 self._rd_dataset = self._rd_dataset.map_batches(
                     OCRActor,
                     batch_size=self._requested_plan.get_ocr_batch_size(),

From 83092073fb23fc7bf916dad1938db09a6b31bdfc Mon Sep 17 00:00:00 2001
From: edknv <edwardk@nvidia.com>
Date: Sun, 22 Mar 2026 16:58:46 -0700
Subject: [PATCH 14/20] use fractional gpu

---
 nemo_retriever/src/nemo_retriever/ingest_modes/batch.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nemo_retriever/src/nemo_retriever/ingest_modes/batch.py b/nemo_retriever/src/nemo_retriever/ingest_modes/batch.py
index e93ef7ed6..506d8a321 100644
--- a/nemo_retriever/src/nemo_retriever/ingest_modes/batch.py
+++ b/nemo_retriever/src/nemo_retriever/ingest_modes/batch.py
@@ -941,7 +941,7 @@ def caption(self, params: CaptionParams | None = None, **kwargs: Any) -> "BatchI
 
         from nemo_retriever.caption.caption import CaptionActor
 
-        caption_num_gpus = 0.0 if resolved.endpoint_url else 1.0
+        caption_num_gpus = 0.0 if resolved.endpoint_url else resolved.gpu_memory_utilization
 
         self._rd_dataset = self._rd_dataset.map_batches(
             CaptionActor,

From 564d72cb81eac7ef87a7f0efbcf07a6f8e8144a1 Mon Sep 17 00:00:00 2001
From: edknv <edwardk@nvidia.com>
Date: Sun, 22 Mar 2026 18:14:47 -0700
Subject: [PATCH 15/20] filter out small images

---
 .../src/nemo_retriever/caption/caption.py       | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/nemo_retriever/src/nemo_retriever/caption/caption.py b/nemo_retriever/src/nemo_retriever/caption/caption.py
index 92429930c..64d1a1e40 100644
--- a/nemo_retriever/src/nemo_retriever/caption/caption.py
+++ b/nemo_retriever/src/nemo_retriever/caption/caption.py
@@ -11,6 +11,21 @@
 from nemo_retriever.params import CaptionParams
 
 _MAX_CONTEXT_TEXT_CHARS = 4096
+_MIN_IMAGE_DIMENSION = 32
+
+
+def _image_meets_min_size(b64: str) -> bool:
+    """Return True if the base64 image is at least _MIN_IMAGE_DIMENSION on both sides."""
+    import base64
+    from io import BytesIO
+    from PIL import Image
+
+    try:
+        img = Image.open(BytesIO(base64.b64decode(b64)))
+        w, h = img.size
+        return w >= _MIN_IMAGE_DIMENSION and h >= _MIN_IMAGE_DIMENSION
+    except Exception:
+        return False
 _cached_local_model = None
 
 
@@ -207,7 +222,7 @@ def caption_images(
             if item.get("text"):
                 continue  # already captioned
             b64 = item.get("image_b64")
-            if b64:
+            if b64 and _image_meets_min_size(b64):
                 pending.append((row_idx, item_idx, b64))
 
     if not pending:

From a921382b573b8ff9b98c67f95eecac638cff945c Mon Sep 17 00:00:00 2001
From: edknv <edwardk@nvidia.com>
Date: Mon, 23 Mar 2026 11:05:53 -0700
Subject: [PATCH 16/20] make vllm a core dependency; tidy caption defaults and version.py

---
 nemo_retriever/README.md                      | 18 ++++-----
 nemo_retriever/pyproject.toml                 |  4 +-
 .../src/nemo_retriever/caption/caption.py     | 16 ++++----
 .../model/local/nemotron_vlm_captioner.py     |  2 +-
 .../src/nemo_retriever/params/models.py       |  2 +-
 nemo_retriever/src/nemo_retriever/version.py  | 39 +++++++------------
 6 files changed, 34 insertions(+), 47 deletions(-)

diff --git a/nemo_retriever/README.md b/nemo_retriever/README.md
index 566d50980..479721819 100644
--- a/nemo_retriever/README.md
+++ b/nemo_retriever/README.md
@@ -48,18 +48,10 @@ Use the CUDA 13.0 wheels from the dedicated index by running the following comma
 
 ```bash
 uv pip uninstall torch torchvision
-uv pip install torch==2.9.1 torchvision --torch-backend=cu130
+uv pip install torch==2.9.1 torchvision -i https://download.pytorch.org/whl/cu130
 ```
 This ensures the OCR and GPU‑accelerated components in NeMo Retriever Library run against the right CUDA runtime.
 
-3. (Optional) Install vLLM for image captioning
-
-If you want to generate captions for extracted images, install the `vlm-caption` extra which includes [vLLM](https://docs.vllm.ai/) built for CUDA 13.
-
-```bash
-uv pip install "nemo-retriever[vlm-caption]"
-```
-
 ## Run the pipeline
 
 The [test PDF](../data/multimodal_test.pdf) contains text, tables, charts, and images. Additional test data resides [here](../data/).
@@ -257,7 +249,13 @@ Use `.caption()` to generate text descriptions for extracted images using a loca
 ```python
 ingestor = (
   ingestor.files(documents)
-  .extract()
+  .extract(
+      extract_text=True,
+      extract_tables=False,
+      extract_charts=False,
+      extract_infographics=False,
+      extract_images=True,
+  )
   .caption()
   .embed()
   .vdb_upload()
diff --git a/nemo_retriever/pyproject.toml b/nemo_retriever/pyproject.toml
index 6c6d2d7a7..7de00f251 100644
--- a/nemo_retriever/pyproject.toml
+++ b/nemo_retriever/pyproject.toml
@@ -75,15 +75,13 @@ dependencies = [
   "soundfile>=0.12.0",
   "scipy>=1.11.0",
   "nvidia-ml-py",
+  "vllm==0.16.0",
 ]
 
 [project.optional-dependencies]
 svg = [
   "cairosvg>=2.7.0",
 ]
-vlm-caption = [
-  "vllm==0.16.0",
-]
 dev = [
   "build>=1.2.2",
   "pytest>=8.0.2",
diff --git a/nemo_retriever/src/nemo_retriever/caption/caption.py b/nemo_retriever/src/nemo_retriever/caption/caption.py
index 64d1a1e40..2b73eddc5 100644
--- a/nemo_retriever/src/nemo_retriever/caption/caption.py
+++ b/nemo_retriever/src/nemo_retriever/caption/caption.py
@@ -4,29 +4,29 @@
 
 from __future__ import annotations
 
+import base64
+from io import BytesIO
 from typing import Any, Dict, List, Tuple
 
 import pandas as pd
+from PIL import Image
 
 from nemo_retriever.params import CaptionParams
 
+_DEFAULT_MODEL_NAME = "nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16"
 _MAX_CONTEXT_TEXT_CHARS = 4096
 _MIN_IMAGE_DIMENSION = 32
+_cached_local_model = None
 
 
 def _image_meets_min_size(b64: str) -> bool:
     """Return True if the base64 image is at least _MIN_IMAGE_DIMENSION on both sides."""
-    import base64
-    from io import BytesIO
-    from PIL import Image
-
     try:
         img = Image.open(BytesIO(base64.b64decode(b64)))
         w, h = img.size
         return w >= _MIN_IMAGE_DIMENSION and h >= _MIN_IMAGE_DIMENSION
     except Exception:
         return False
-_cached_local_model = None
 
 
 def _get_cached_local_model(kwargs: dict) -> "Any":
@@ -35,7 +35,7 @@ def _get_cached_local_model(kwargs: dict) -> "Any":
         from nemo_retriever.model.local import NemotronVLMCaptioner
 
         _cached_local_model = NemotronVLMCaptioner(
-            model_path=kwargs.get("model_name", "nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16"),
+            model_path=kwargs.get("model_name", _DEFAULT_MODEL_NAME),
             device=kwargs.get("device"),
             hf_cache_dir=kwargs.get("hf_cache_dir"),
             tensor_parallel_size=kwargs.get("tensor_parallel_size", 1),
@@ -61,7 +61,7 @@ def __init__(self, params: CaptionParams) -> None:
             from nemo_retriever.model.local import NemotronVLMCaptioner
 
             self._model = NemotronVLMCaptioner(
-                model_path=self._kwargs.get("model_name", "nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16"),
+                model_path=self._kwargs.get("model_name", _DEFAULT_MODEL_NAME),
                 device=self._kwargs.get("device"),
                 hf_cache_dir=self._kwargs.get("hf_cache_dir"),
                 tensor_parallel_size=self._kwargs.get("tensor_parallel_size", 1),
@@ -170,7 +170,7 @@ def caption_images(
     *,
     model: Any = None,
     endpoint_url: str | None = None,
-    model_name: str = "nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16",
+    model_name: str = _DEFAULT_MODEL_NAME,
     api_key: str | None = None,
     prompt: str = "Caption the content of this image:",
     system_prompt: str | None = "/no_think",
diff --git a/nemo_retriever/src/nemo_retriever/model/local/nemotron_vlm_captioner.py b/nemo_retriever/src/nemo_retriever/model/local/nemotron_vlm_captioner.py
index 8264a82f5..9279ccd6e 100644
--- a/nemo_retriever/src/nemo_retriever/model/local/nemotron_vlm_captioner.py
+++ b/nemo_retriever/src/nemo_retriever/model/local/nemotron_vlm_captioner.py
@@ -75,7 +75,7 @@ def __init__(
         hf_cache_dir: Optional[str] = None,
         max_new_tokens: int = 1024,
         tensor_parallel_size: int = 1,
-        gpu_memory_utilization: float = 0.9,
+        gpu_memory_utilization: float = 0.8,
     ) -> None:
         super().__init__()
 
diff --git a/nemo_retriever/src/nemo_retriever/params/models.py b/nemo_retriever/src/nemo_retriever/params/models.py
index 8f48e125e..8c16b388e 100644
--- a/nemo_retriever/src/nemo_retriever/params/models.py
+++ b/nemo_retriever/src/nemo_retriever/params/models.py
@@ -314,7 +314,7 @@ class CaptionParams(_ParamsModel):
     hf_cache_dir: Optional[str] = None
     context_text_max_chars: int = 0
     tensor_parallel_size: int = 1
-    gpu_memory_utilization: float = 0.9
+    gpu_memory_utilization: float = 0.8
 
 
 class InfographicParams(_ParamsModel):
diff --git a/nemo_retriever/src/nemo_retriever/version.py b/nemo_retriever/src/nemo_retriever/version.py
index 13ec2bd3b..9999c919c 100644
--- a/nemo_retriever/src/nemo_retriever/version.py
+++ b/nemo_retriever/src/nemo_retriever/version.py
@@ -12,7 +12,6 @@
 from pathlib import Path
 import os
 import subprocess
-import tempfile
 
 try:
     from ._build_info import BUILD_DATE as _PACKAGE_BUILD_DATE
@@ -24,7 +23,6 @@
 
 _PKG_NAME = "nemo-retriever"
 _UNKNOWN = "unknown"
-_BUILD_STAMP = Path(tempfile.gettempdir()) / ".nemo_retriever_build_stamp"
 
 
 def _utc_now() -> datetime:
@@ -59,28 +57,7 @@ def _build_datetime() -> datetime:
         except ValueError:
             pass
 
-    # Stamp file in the system temp dir makes the timestamp deterministic
-    # across the two separate subprocesses pip spawns during a PEP 517 build
-    # (metadata + wheel).  We use tempdir rather than the source tree because
-    # pip may copy the source to different locations for each step.
-    if _BUILD_STAMP.exists():
-        try:
-            cached = _BUILD_STAMP.read_text().strip()
-            if cached:
-                ts = float(cached)
-                # Only reuse if less than 60 s old to avoid stale stamps.
-                if abs(_utc_now().timestamp() - ts) < 60:
-                    return datetime.fromtimestamp(ts, tz=timezone.utc)
-            _BUILD_STAMP.unlink(missing_ok=True)
-        except (OSError, ValueError):
-            pass
-
-    now = _utc_now()
-    try:
-        _BUILD_STAMP.write_text(str(now.timestamp()))
-    except OSError:
-        pass
-    return now
+    return _utc_now()
 
 
 @lru_cache(maxsize=1)
@@ -131,6 +108,18 @@ def _base_version() -> str:
     return os.getenv("RETRIEVER_VERSION") or os.getenv("NV_INGEST_VERSION") or _build_datetime().strftime("%Y.%m.%d")
 
 
+def _has_prerelease(version_str: str) -> bool:
+    """Return True if *version_str* already contains a PEP 440 pre-release segment."""
+    from packaging.version import Version
+
+    try:
+        return Version(version_str).pre is not None
+    except Exception:
+        import re
+
+        return bool(re.search(r"(a|alpha|b|beta|rc|c|dev|pre)[-_.]?\d*", version_str, re.I))
+
+
 def get_build_version() -> str:
     """Return a PEP 440 compliant version string for packaging."""
     release_type = (os.getenv("RETRIEVER_RELEASE_TYPE") or os.getenv("NV_INGEST_RELEASE_TYPE") or "dev").lower()
@@ -139,6 +128,8 @@ def get_build_version() -> str:
     build_number = _build_number()
 
     if release_type == "release":
+        if _has_prerelease(base_version):
+            return base_version
         return f"{base_version}.post{build_number}" if int(build_number) > 0 else base_version
     if release_type == "dev":
         return f"{base_version}.dev{build_number}"

From b0b447548d0b0c7b3d0f5518f89e828df5bd3f4b Mon Sep 17 00:00:00 2001
From: edknv <edwardk@nvidia.com>
Date: Mon, 23 Mar 2026 12:04:49 -0700
Subject: [PATCH 17/20] drop caption model from GPU pool; daemonize workers; reword device warning

---
 .../nemo_retriever/ingest_modes/gpu_pool.py   | 37 +------------------
 .../nemo_retriever/ingest_modes/inprocess.py  |  5 ++-
 2 files changed, 4 insertions(+), 38 deletions(-)

diff --git a/nemo_retriever/src/nemo_retriever/ingest_modes/gpu_pool.py b/nemo_retriever/src/nemo_retriever/ingest_modes/gpu_pool.py
index 11f3c36d6..cb1aa019a 100644
--- a/nemo_retriever/src/nemo_retriever/ingest_modes/gpu_pool.py
+++ b/nemo_retriever/src/nemo_retriever/ingest_modes/gpu_pool.py
@@ -66,28 +66,6 @@ def create(self) -> Any:
         return NemotronParseV12(task_prompt=self.task_prompt)
 
 
-@dataclass
-class CaptionModelConfig:
-    """Config to recreate a NemotronVLMCaptioner model."""
-
-    model_path: str = "nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16"
-    device: Optional[str] = None
-    hf_cache_dir: Optional[str] = None
-    tensor_parallel_size: int = 1
-    gpu_memory_utilization: float = 0.9
-
-    def create(self) -> Any:
-        from nemo_retriever.model.local import NemotronVLMCaptioner
-
-        return NemotronVLMCaptioner(
-            model_path=self.model_path,
-            device=self.device,
-            hf_cache_dir=self.hf_cache_dir,
-            tensor_parallel_size=self.tensor_parallel_size,
-            gpu_memory_utilization=self.gpu_memory_utilization,
-        )
-
-
 @dataclass
 class EmbeddingModelConfig:
     """Config to recreate an embedding model (VL or non-VL)."""
@@ -189,19 +167,6 @@ def _extract_model_config(func: Callable, kwargs: dict[str, Any]) -> Any:
     if func is collapse_content_to_page_rows:
         return None  # CPU-only, no model
 
-    from nemo_retriever.caption.caption import caption_images
-
-    if func is caption_images:
-        if kwargs.get("endpoint_url"):
-            return None  # Remote endpoint, no local model
-        return CaptionModelConfig(
-            model_path=kwargs.get("model_name", CaptionModelConfig.model_path),
-            device=kwargs.get("device"),
-            hf_cache_dir=kwargs.get("hf_cache_dir"),
-            tensor_parallel_size=kwargs.get("tensor_parallel_size", 1),
-            gpu_memory_utilization=kwargs.get("gpu_memory_utilization", 0.9),
-        )
-
     return None
 
 
@@ -327,7 +292,7 @@ def start(self) -> None:
             p = self._ctx.Process(
                 target=_gpu_worker_entry,
                 args=(idx, device_id, self._task_descriptors, iq, self._output_queue, evt),
-                daemon=False,
+                daemon=True,
             )
             p.start()
             self._workers.append(p)
diff --git a/nemo_retriever/src/nemo_retriever/ingest_modes/inprocess.py b/nemo_retriever/src/nemo_retriever/ingest_modes/inprocess.py
index 144d2b599..d47af16ea 100644
--- a/nemo_retriever/src/nemo_retriever/ingest_modes/inprocess.py
+++ b/nemo_retriever/src/nemo_retriever/ingest_modes/inprocess.py
@@ -1364,8 +1364,9 @@ def caption(self, params: "CaptionParams | None" = None, **kwargs: Any) -> "InPr
 
                 warnings.warn(
                     "No caption device specified. The VLM will load on cuda:0, which "
-                    "may conflict with other models. Use --caption-device (e.g. "
-                    "'cuda:1') to place the captioner on a separate GPU.",
+                    "may conflict with other models. Use device='cuda:1' (or "
+                    "--caption-device from the CLI) to place the captioner on a "
+                    "separate GPU.",
                     stacklevel=2,
                 )
             caption_kwargs["model"] = None

From e6cb852c9839dd7115495f7e7eaf9e9d0255a9fc Mon Sep 17 00:00:00 2001
From: edknv <edwardk@nvidia.com>
Date: Mon, 23 Mar 2026 12:11:29 -0700
Subject: [PATCH 18/20] fix tests

---
 nemo_retriever/tests/test_caption.py | 23 ++++++++++++++++++-----
 1 file changed, 18 insertions(+), 5 deletions(-)

diff --git a/nemo_retriever/tests/test_caption.py b/nemo_retriever/tests/test_caption.py
index 3bf6844be..f216ac831 100644
--- a/nemo_retriever/tests/test_caption.py
+++ b/nemo_retriever/tests/test_caption.py
@@ -15,15 +15,15 @@
 from PIL import Image  # noqa: E402
 
 
-def _make_1x1_png_b64() -> str:
-    img = Image.new("RGB", (1, 1), color=(255, 0, 0))
+def _make_test_png_b64(size: tuple[int, int] = (64, 64)) -> str:
+    img = Image.new("RGB", size, color=(255, 0, 0))
     buf = io.BytesIO()
     img.save(buf, format="PNG")
     return base64.b64encode(buf.getvalue()).decode("ascii")
 
 
 def _make_page_df(num_images=2, captioned=False):
-    b64 = _make_1x1_png_b64()
+    b64 = _make_test_png_b64()
     images = [
         {"bbox_xyxy_norm": [0.1, 0.2, 0.5, 0.8], "text": "done" if captioned else "", "image_b64": b64}
         for _ in range(num_images)
@@ -55,7 +55,7 @@ def test_pdf_extraction_populates_images(mock_extract):
     _ext = pytest.importorskip("nemo_retriever.pdf.extract")
     pdfium = pytest.importorskip("pypdfium2")
 
-    mock_img = MagicMock(image=_make_1x1_png_b64(), bbox=(10, 20, 100, 200), max_width=612, max_height=792)
+    mock_img = MagicMock(image=_make_test_png_b64(), bbox=(10, 20, 100, 200), max_width=612, max_height=792)
     mock_extract.return_value = [mock_img]
 
     doc = pdfium.PdfDocument.new()
@@ -74,7 +74,7 @@ def test_pdf_extraction_populates_images(mock_extract):
 def test_explode_includes_captioned_images():
     from nemo_retriever.ingest_modes.inprocess import explode_content_to_rows
 
-    b64 = _make_1x1_png_b64()
+    b64 = _make_test_png_b64()
     df = pd.DataFrame([{
         "text": "page",
         "page_image": {"image_b64": b64},
@@ -105,3 +105,16 @@ def test_context_text_prepended_to_prompt():
     call_kwargs = mock_model.caption_batch.call_args[1]
     assert "quick brown fox" in call_kwargs["prompt"]
     assert "Text near this image:" in call_kwargs["prompt"]
+
+
+def test_caption_images_skips_small_images():
+    from nemo_retriever.caption.caption import caption_images
+
+    tiny_b64 = _make_test_png_b64(size=(1, 1))
+    images = [{"bbox_xyxy_norm": [0.1, 0.2, 0.5, 0.8], "text": "", "image_b64": tiny_b64}]
+    df = pd.DataFrame([{"text": "page", "images": images, "tables": [], "charts": [], "infographics": []}])
+
+    mock_model = MagicMock()
+    result = caption_images(df, model=mock_model)
+    mock_model.caption_batch.assert_not_called()
+    assert result.iloc[0]["images"][0]["text"] == ""

From ae086798db1523654d8145d8cb699ed813bd0335 Mon Sep 17 00:00:00 2001
From: edknv <edwardk@nvidia.com>
Date: Mon, 23 Mar 2026 19:15:08 -0700
Subject: [PATCH 19/20] simplify

---
 .../src/nemo_retriever/caption/caption.py     | 78 ++++++++-----------
 .../model/local/nemotron_vlm_captioner.py     | 11 +--
 2 files changed, 38 insertions(+), 51 deletions(-)

diff --git a/nemo_retriever/src/nemo_retriever/caption/caption.py b/nemo_retriever/src/nemo_retriever/caption/caption.py
index 2b73eddc5..ee0c41efb 100644
--- a/nemo_retriever/src/nemo_retriever/caption/caption.py
+++ b/nemo_retriever/src/nemo_retriever/caption/caption.py
@@ -29,18 +29,22 @@ def _image_meets_min_size(b64: str) -> bool:
         return False
 
 
+def _create_local_model(kwargs: dict) -> "Any":
+    from nemo_retriever.model.local import NemotronVLMCaptioner
+
+    return NemotronVLMCaptioner(
+        model_path=kwargs.get("model_name", _DEFAULT_MODEL_NAME),
+        device=kwargs.get("device"),
+        hf_cache_dir=kwargs.get("hf_cache_dir"),
+        tensor_parallel_size=kwargs.get("tensor_parallel_size", 1),
+        gpu_memory_utilization=kwargs.get("gpu_memory_utilization", 0.8),
+    )
+
+
 def _get_cached_local_model(kwargs: dict) -> "Any":
     global _cached_local_model
     if _cached_local_model is None:
-        from nemo_retriever.model.local import NemotronVLMCaptioner
-
-        _cached_local_model = NemotronVLMCaptioner(
-            model_path=kwargs.get("model_name", _DEFAULT_MODEL_NAME),
-            device=kwargs.get("device"),
-            hf_cache_dir=kwargs.get("hf_cache_dir"),
-            tensor_parallel_size=kwargs.get("tensor_parallel_size", 1),
-            gpu_memory_utilization=kwargs.get("gpu_memory_utilization", 0.9),
-        )
+        _cached_local_model = _create_local_model(kwargs)
     return _cached_local_model
 
 
@@ -58,15 +62,7 @@ def __init__(self, params: CaptionParams) -> None:
         if endpoint:
             self._model = None
         else:
-            from nemo_retriever.model.local import NemotronVLMCaptioner
-
-            self._model = NemotronVLMCaptioner(
-                model_path=self._kwargs.get("model_name", _DEFAULT_MODEL_NAME),
-                device=self._kwargs.get("device"),
-                hf_cache_dir=self._kwargs.get("hf_cache_dir"),
-                tensor_parallel_size=self._kwargs.get("tensor_parallel_size", 1),
-                gpu_memory_utilization=self._kwargs.get("gpu_memory_utilization", 0.9),
-            )
+            self._model = _create_local_model(self._kwargs)
 
     def __call__(self, batch_df: Any) -> Any:
         return caption_images(batch_df, model=self._model, **self._kwargs)
@@ -82,19 +78,29 @@ def _build_prompt_with_context(base_prompt: str, context_text: str) -> str:
     return f"Text near this image:\n---\n{context_text}\n---\n\n{base_prompt}"
 
 
+def _create_remote_client(endpoint_url: str, api_key: str | None) -> Any:
+    """Create a reusable NIM inference client for a remote VLM endpoint."""
+    from nv_ingest_api.internal.primitives.nim.model_interface.vlm import VLMModelInterface
+    from nv_ingest_api.util.nim import create_inference_client
+
+    return create_inference_client(
+        model_interface=VLMModelInterface(),
+        endpoints=(None, endpoint_url),
+        auth_token=api_key,
+        infer_protocol="http",
+    )
+
+
 def _caption_batch_remote(
     base64_images: List[str],
     *,
-    endpoint_url: str,
+    nim_client: Any,
     model_name: str,
-    api_key: str | None,
     prompt: str,
     system_prompt: str | None,
     temperature: float,
 ) -> List[str]:
     """Send a batch of images to a remote VLM endpoint and return captions."""
-    from nv_ingest_api.internal.primitives.nim.model_interface.vlm import VLMModelInterface
-    from nv_ingest_api.util.nim import create_inference_client
     from nv_ingest_api.util.image_processing.transforms import scale_image_to_encoding_size
 
     scaled = [scale_image_to_encoding_size(b64)[0] for b64 in base64_images]
@@ -106,12 +112,6 @@ def _caption_batch_remote(
     if system_prompt:
         data["system_prompt"] = system_prompt
 
-    nim_client = create_inference_client(
-        model_interface=VLMModelInterface(),
-        endpoints=(None, endpoint_url),
-        auth_token=api_key,
-        infer_protocol="http",
-    )
     return nim_client.infer(data, model_name=model_name, temperature=temperature)
 
 
@@ -136,9 +136,8 @@ def _caption_one(
     b64: str,
     *,
     model: Any,
-    endpoint_url: str | None,
+    nim_client: Any | None,
     model_name: str,
-    api_key: str | None,
     prompt: str,
     system_prompt: str | None,
     temperature: float,
@@ -155,9 +154,8 @@ def _caption_one(
     else:
         captions = _caption_batch_remote(
             [b64],
-            endpoint_url=endpoint_url,  # type: ignore[arg-type]
+            nim_client=nim_client,
             model_name=model_name,
-            api_key=api_key,
             prompt=prompt,
             system_prompt=system_prompt,
             temperature=temperature,
@@ -203,14 +201,13 @@ def caption_images(
         return batch_df
 
     if model is None and not endpoint_url:
-        # Lazy model creation for the sequential (no GPU pool) fallback.
-        # Cache the model so it is not re-created on every call.
         model = _get_cached_local_model(kwargs)
 
+    nim_client = _create_remote_client(endpoint_url, api_key) if endpoint_url and model is None else None
+
     use_context = context_text_max_chars > 0
     effective_max = min(context_text_max_chars, _MAX_CONTEXT_TEXT_CHARS) if use_context else 0
 
-    # Collect all (row_idx, item_idx, image_b64) needing captions.
     pending: List[Tuple[int, int, str]] = []
     for row_idx, row in batch_df.iterrows():
         images = row.get("images")
@@ -229,7 +226,6 @@ def caption_images(
         return batch_df
 
     if use_context:
-        # Each image gets a per-page enriched prompt, so caption one at a time.
         for row_idx, item_idx, b64 in pending:
             page_text = batch_df.at[row_idx, "text"] if "text" in batch_df.columns else ""
             context = (page_text or "")[:effective_max]
@@ -237,21 +233,17 @@ def caption_images(
             caption = _caption_one(
                 b64,
                 model=model,
-                endpoint_url=endpoint_url,
+                nim_client=nim_client,
                 model_name=model_name,
-                api_key=api_key,
                 prompt=enriched_prompt,
                 system_prompt=system_prompt,
                 temperature=temperature,
             )
             batch_df.at[row_idx, "images"][item_idx]["text"] = caption
     else:
-        # Batch mode: all images share the same prompt.
         all_b64 = [b64 for _, _, b64 in pending]
 
         if model is not None:
-            # Submit all at once — vLLM schedules internally based on
-            # available GPU memory.
             all_captions = _caption_batch_local(
                 all_b64,
                 model=model,
@@ -260,14 +252,12 @@ def caption_images(
                 temperature=temperature,
             )
         else:
-            # Remote endpoints may have request-size limits; chunk.
             all_captions: List[str] = []
             for start in range(0, len(all_b64), batch_size):
                 captions = _caption_batch_remote(
                     all_b64[start : start + batch_size],
-                    endpoint_url=endpoint_url,  # type: ignore[arg-type]
+                    nim_client=nim_client,
                     model_name=model_name,
-                    api_key=api_key,
                     prompt=prompt,
                     system_prompt=system_prompt,
                     temperature=temperature,
diff --git a/nemo_retriever/src/nemo_retriever/model/local/nemotron_vlm_captioner.py b/nemo_retriever/src/nemo_retriever/model/local/nemotron_vlm_captioner.py
index 9279ccd6e..7d329054e 100644
--- a/nemo_retriever/src/nemo_retriever/model/local/nemotron_vlm_captioner.py
+++ b/nemo_retriever/src/nemo_retriever/model/local/nemotron_vlm_captioner.py
@@ -156,12 +156,9 @@ def caption(
         temperature: float = 1.0,
     ) -> str:
         """Generate a caption for a single base64-encoded image."""
-        from vllm import SamplingParams
-
-        messages = self._build_messages(base64_image, prompt=prompt, system_prompt=system_prompt)
-        sampling_params = SamplingParams(temperature=temperature, max_tokens=self._max_new_tokens)
-        outputs = self._llm.chat([messages], sampling_params=sampling_params)
-        return outputs[0].outputs[0].text.strip()
+        return self.caption_batch([base64_image], prompt=prompt, system_prompt=system_prompt, temperature=temperature)[
+            0
+        ]
 
     def caption_batch(
         self,
@@ -186,7 +183,7 @@ def caption_batch(
 
     @property
     def model_name(self) -> str:
-        return "NVIDIA-Nemotron-Nano-12B-v2-VL"
+        return self._model_path
 
     @property
     def model_type(self) -> str:

From 92779f8b00a9e5c6fc0d9780572a694ba199c140 Mon Sep 17 00:00:00 2001
From: edknv <edwardk@nvidia.com>
Date: Mon, 23 Mar 2026 21:55:01 -0700
Subject: [PATCH 20/20] consistent default gpu mem util

---
 nemo_retriever/src/nemo_retriever/caption/caption.py            | 2 +-
 .../src/nemo_retriever/model/local/nemotron_vlm_captioner.py    | 2 +-
 nemo_retriever/src/nemo_retriever/params/models.py              | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/nemo_retriever/src/nemo_retriever/caption/caption.py b/nemo_retriever/src/nemo_retriever/caption/caption.py
index ee0c41efb..b55dea563 100644
--- a/nemo_retriever/src/nemo_retriever/caption/caption.py
+++ b/nemo_retriever/src/nemo_retriever/caption/caption.py
@@ -37,7 +37,7 @@ def _create_local_model(kwargs: dict) -> "Any":
         device=kwargs.get("device"),
         hf_cache_dir=kwargs.get("hf_cache_dir"),
         tensor_parallel_size=kwargs.get("tensor_parallel_size", 1),
-        gpu_memory_utilization=kwargs.get("gpu_memory_utilization", 0.8),
+        gpu_memory_utilization=kwargs.get("gpu_memory_utilization", 0.5),
     )
 
 
diff --git a/nemo_retriever/src/nemo_retriever/model/local/nemotron_vlm_captioner.py b/nemo_retriever/src/nemo_retriever/model/local/nemotron_vlm_captioner.py
index 7d329054e..14b814381 100644
--- a/nemo_retriever/src/nemo_retriever/model/local/nemotron_vlm_captioner.py
+++ b/nemo_retriever/src/nemo_retriever/model/local/nemotron_vlm_captioner.py
@@ -75,7 +75,7 @@ def __init__(
         hf_cache_dir: Optional[str] = None,
         max_new_tokens: int = 1024,
         tensor_parallel_size: int = 1,
-        gpu_memory_utilization: float = 0.8,
+        gpu_memory_utilization: float = 0.5,
     ) -> None:
         super().__init__()
 
diff --git a/nemo_retriever/src/nemo_retriever/params/models.py b/nemo_retriever/src/nemo_retriever/params/models.py
index 8c16b388e..8b92975db 100644
--- a/nemo_retriever/src/nemo_retriever/params/models.py
+++ b/nemo_retriever/src/nemo_retriever/params/models.py
@@ -314,7 +314,7 @@ class CaptionParams(_ParamsModel):
     hf_cache_dir: Optional[str] = None
     context_text_max_chars: int = 0
     tensor_parallel_size: int = 1
-    gpu_memory_utilization: float = 0.8
+    gpu_memory_utilization: float = 0.5
 
 
 class InfographicParams(_ParamsModel):