From 74b7125784677e90ce2549773942cf7f620b26bc Mon Sep 17 00:00:00 2001 From: edknv Date: Thu, 19 Mar 2026 09:56:01 -0700 Subject: [PATCH 01/20] (retriever) Add VLM image captioning via vLLM --- nemo_retriever/README.md | 37 +++ nemo_retriever/pyproject.toml | 11 +- .../src/nemo_retriever/caption/__init__.py | 3 + .../src/nemo_retriever/caption/caption.py | 177 ++++++++++++++ .../examples/inprocess_pipeline.py | 33 +++ .../nemo_retriever/ingest_modes/gpu_pool.py | 37 ++- .../nemo_retriever/ingest_modes/inprocess.py | 68 +++++- nemo_retriever/src/nemo_retriever/ingestor.py | 4 +- .../nemo_retriever/model/local/__init__.py | 5 + .../model/local/nemotron_vlm_captioner.py | 217 ++++++++++++++++++ .../src/nemo_retriever/params/__init__.py | 2 + .../src/nemo_retriever/params/models.py | 14 ++ .../src/nemo_retriever/pdf/extract.py | 27 ++- 13 files changed, 626 insertions(+), 9 deletions(-) create mode 100644 nemo_retriever/src/nemo_retriever/caption/__init__.py create mode 100644 nemo_retriever/src/nemo_retriever/caption/caption.py create mode 100644 nemo_retriever/src/nemo_retriever/model/local/nemotron_vlm_captioner.py diff --git a/nemo_retriever/README.md b/nemo_retriever/README.md index fbff5b00f..b47928c56 100644 --- a/nemo_retriever/README.md +++ b/nemo_retriever/README.md @@ -52,6 +52,43 @@ uv pip install torch==2.9.1 torchvision -i https://download.pytorch.org/whl/cu13 ``` This ensures the OCR and GPU‑accelerated components in NeMo Retriever Library run against the right CUDA runtime. +## Image Captioning (optional) + +NeMo Retriever Library can caption extracted images using a local VLM +([Nemotron Nano 12B v2 VL](https://huggingface.co/nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16)). +This requires [vLLM](https://github.com/vllm-project/vllm) and +[mamba-ssm](https://github.com/state-spaces/mamba), which must be installed +separately because they contain CUDA kernels that must match your torch build. + +```bash +# 1. 
Install vLLM (--no-deps avoids overwriting the torch+cu130 already installed) +uv pip install --no-deps "vllm>=0.16.0" + +# 2. Build mamba-ssm from source against your torch (takes a few minutes) +uv pip install --no-deps --no-build-isolation "mamba-ssm>=2.3.1" +``` + +After installing, add `--caption` and `--caption-device` to your pipeline command: + +```bash +python -m nemo_retriever.examples.inprocess_pipeline \ + data/multimodal_test.pdf \ + --caption \ + --caption-device cuda:1 +``` + +`--caption-device` places the VLM on a separate GPU so it does not compete with +the page-elements, OCR, and embedding models. If omitted, a warning is printed +and the VLM defaults to `cuda:0`. + +Supported `--caption-model-name` values: + +| Model | Precision | Notes | +|---|---|---| +| `nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16` (default) | BFloat16 | Works on SM80+ (A100, A10, RTX 3090, ...) | +| `nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-FP8` | FP8 | Works on SM80+ | +| `nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-NVFP4-QAD` | NVFP4 | Requires SM89+ (Ada Lovelace / Hopper) | + ## Run the pipeline The [test PDF](../data/multimodal_test.pdf) contains text, tables, charts, and images. Additional test data resides [here](../data/). 
diff --git a/nemo_retriever/pyproject.toml b/nemo_retriever/pyproject.toml index 2e7d53df7..c7c0e3409 100644 --- a/nemo_retriever/pyproject.toml +++ b/nemo_retriever/pyproject.toml @@ -56,8 +56,8 @@ dependencies = [ "transformers>=5.0.0", "tokenizers>=0.20.3", "accelerate>=1.1.0", - "torch~=2.9.1", - "torchvision>=0.24,<0.25", + "torch>=2.5.0", + "torchvision", "einops", "easydict", "addict", @@ -82,6 +82,13 @@ dependencies = [ svg = [ "cairosvg>=2.7.0", ] +# Install with: pip install ".[vlm-caption]" +# mamba-ssm must be built from source against the installed torch: +# uv pip install --no-deps --no-build-isolation mamba-ssm +vlm-caption = [ + "vllm>=0.16.0", + "mamba-ssm>=2.3.1", +] dev = [ "build>=1.2.2", "pytest>=8.0.2", diff --git a/nemo_retriever/src/nemo_retriever/caption/__init__.py b/nemo_retriever/src/nemo_retriever/caption/__init__.py new file mode 100644 index 000000000..6aa2e3d5b --- /dev/null +++ b/nemo_retriever/src/nemo_retriever/caption/__init__.py @@ -0,0 +1,3 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES. +# All rights reserved. +# SPDX-License-Identifier: Apache-2.0 diff --git a/nemo_retriever/src/nemo_retriever/caption/caption.py b/nemo_retriever/src/nemo_retriever/caption/caption.py new file mode 100644 index 000000000..0de69614d --- /dev/null +++ b/nemo_retriever/src/nemo_retriever/caption/caption.py @@ -0,0 +1,177 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES. +# All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +from typing import Any, Dict, List, Tuple + +import pandas as pd + + +def _caption_batch_remote( + base64_images: List[str], + *, + endpoint_url: str, + model_name: str, + api_key: str | None, + prompt: str, + system_prompt: str | None, + temperature: float, +) -> List[str]: + """Send a batch of images to a remote VLM endpoint and return captions.""" + from nv_ingest_api.internal.primitives.nim.model_interface.vlm import VLMModelInterface + from nv_ingest_api.util.nim import create_inference_client + from nv_ingest_api.util.image_processing.transforms import scale_image_to_encoding_size + + scaled = [scale_image_to_encoding_size(b64)[0] for b64 in base64_images] + + data: Dict[str, Any] = { + "base64_images": scaled, + "prompt": prompt, + } + if system_prompt: + data["system_prompt"] = system_prompt + + nim_client = create_inference_client( + model_interface=VLMModelInterface(), + endpoints=(None, endpoint_url), + auth_token=api_key, + infer_protocol="http", + ) + return nim_client.infer(data, model_name=model_name, temperature=temperature) + + +def _caption_batch_local( + base64_images: List[str], + *, + model: Any, + prompt: str, + system_prompt: str | None, + temperature: float, +) -> List[str]: + """Generate captions using a local ``NemotronVLMCaptioner`` model.""" + return model.caption_batch( + base64_images, + prompt=prompt, + system_prompt=system_prompt, + temperature=temperature, + ) + + +def caption_images( + batch_df: pd.DataFrame, + *, + model: Any = None, + endpoint_url: str | None = None, + model_name: str = "nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16", + api_key: str | None = None, + prompt: str = "Caption the content of this image:", + system_prompt: str | None = "/no_think", + temperature: float = 1.0, + batch_size: int = 8, + **kwargs: Any, +) -> pd.DataFrame: + """Caption images in the ``images`` column using a VLM. 
+ + Supports two modes: + + * **Remote** (``endpoint_url`` is set): sends images to an HTTP VLM + endpoint via ``create_inference_client`` / ``VLMModelInterface``. + * **Local** (``model`` is set): runs inference through a local + ``NemotronVLMCaptioner`` instance loaded from Hugging Face. + + For each row, any item in the ``images`` list whose ``text`` field is + empty will be captioned. The returned caption is written back into + ``images[i]["text"]``. + + Parameters + ---------- + batch_df : pd.DataFrame + DataFrame with an ``images`` column containing lists of dicts with + keys ``image_b64``, ``text``, and ``bbox_xyxy_norm``. + model : NemotronVLMCaptioner | None + Pre-loaded local VLM model. When provided, ``endpoint_url`` is + ignored and inference runs in-process. + endpoint_url : str | None + URL of a remote VLM HTTP endpoint. + model_name : str + Model identifier passed to the remote VLM endpoint (ignored for + local mode). + api_key : str | None + Bearer token for the remote VLM endpoint. + prompt : str + Text prompt sent alongside each image. + system_prompt : str | None + Optional system prompt for the VLM. + temperature : float + Sampling temperature. + batch_size : int + Number of images sent per VLM request; both remote and local + modes process the pending images in chunks of this size. + """ + if not isinstance(batch_df, pd.DataFrame) or batch_df.empty: + return batch_df + if "images" not in batch_df.columns: + return batch_df + + if model is None and not endpoint_url: + # Lazy model creation for the sequential (no GPU pool) fallback. 
+ from nemo_retriever.model.local import NemotronVLMCaptioner + + model = NemotronVLMCaptioner( + model_path=kwargs.get("model_name", "nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16"), + device=kwargs.get("device"), + hf_cache_dir=kwargs.get("hf_cache_dir"), + tensor_parallel_size=kwargs.get("tensor_parallel_size", 1), + gpu_memory_utilization=kwargs.get("gpu_memory_utilization", 0.9), + ) + + # Collect all (row_idx, item_idx, image_b64) needing captions. + pending: List[Tuple[int, int, str]] = [] + for row_idx, row in batch_df.iterrows(): + images = row.get("images") + if not isinstance(images, list): + continue + for item_idx, item in enumerate(images): + if not isinstance(item, dict): + continue + if item.get("text"): + continue # already captioned + b64 = item.get("image_b64") + if b64: + pending.append((row_idx, item_idx, b64)) + + if not pending: + return batch_df + + # Generate captions. + all_captions: List[str] = [] + for start in range(0, len(pending), batch_size): + chunk_b64 = [b64 for _, _, b64 in pending[start : start + batch_size]] + + if model is not None: + captions = _caption_batch_local( + chunk_b64, + model=model, + prompt=prompt, + system_prompt=system_prompt, + temperature=temperature, + ) + else: + captions = _caption_batch_remote( + chunk_b64, + endpoint_url=endpoint_url, # type: ignore[arg-type] + model_name=model_name, + api_key=api_key, + prompt=prompt, + system_prompt=system_prompt, + temperature=temperature, + ) + all_captions.extend(captions) + + # Write captions back into the DataFrame. 
+ for (row_idx, item_idx, _), caption in zip(pending, all_captions): + batch_df.at[row_idx, "images"][item_idx]["text"] = caption + + return batch_df diff --git a/nemo_retriever/src/nemo_retriever/examples/inprocess_pipeline.py b/nemo_retriever/src/nemo_retriever/examples/inprocess_pipeline.py index b4bdb34ef..7fefa7fe0 100644 --- a/nemo_retriever/src/nemo_retriever/examples/inprocess_pipeline.py +++ b/nemo_retriever/src/nemo_retriever/examples/inprocess_pipeline.py @@ -15,6 +15,7 @@ import typer from nemo_retriever import create_ingestor from nemo_retriever.examples.common import estimate_processed_pages, print_pages_per_second +from nemo_retriever.params import CaptionParams from nemo_retriever.params import EmbedParams from nemo_retriever.params import ExtractParams from nemo_retriever.params import IngestExecuteParams @@ -150,6 +151,28 @@ def main( "--graphic-elements-invoke-url", help="Optional remote endpoint URL for graphic-elements model inference.", ), + caption: bool = typer.Option( + False, + "--caption/--no-caption", + help="Enable image captioning. Uses a local model by default, " + "or a remote endpoint if --caption-invoke-url is set.", + ), + caption_invoke_url: Optional[str] = typer.Option( + None, + "--caption-invoke-url", + help="Optional VLM endpoint URL for image captioning (e.g. http://vlm:8000/v1/chat/completions). " + "Implies --caption. When omitted, a local HF model is loaded instead.", + ), + caption_model_name: str = typer.Option( + "nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16", + "--caption-model-name", + help="VLM model name / HF model ID for image captioning.", + ), + caption_device: Optional[str] = typer.Option( + None, + "--caption-device", + help="GPU device for the local VLM captioner (e.g. 'cuda:1'). 
Defaults to the first --gpu-devices entry.", + ), hybrid: bool = typer.Option( False, "--hybrid/--no-hybrid", @@ -274,6 +297,16 @@ def main( ) ) + enable_caption = caption or caption_invoke_url is not None + if enable_caption: + ingestor = ingestor.caption( + CaptionParams( + endpoint_url=caption_invoke_url, + model_name=caption_model_name, + device=caption_device, + ) + ) + ingestor = ingestor.embed( EmbedParams( model_name=str(embed_model_name), diff --git a/nemo_retriever/src/nemo_retriever/ingest_modes/gpu_pool.py b/nemo_retriever/src/nemo_retriever/ingest_modes/gpu_pool.py index cb1aa019a..11f3c36d6 100644 --- a/nemo_retriever/src/nemo_retriever/ingest_modes/gpu_pool.py +++ b/nemo_retriever/src/nemo_retriever/ingest_modes/gpu_pool.py @@ -66,6 +66,28 @@ def create(self) -> Any: return NemotronParseV12(task_prompt=self.task_prompt) +@dataclass +class CaptionModelConfig: + """Config to recreate a NemotronVLMCaptioner model.""" + + model_path: str = "nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16" + device: Optional[str] = None + hf_cache_dir: Optional[str] = None + tensor_parallel_size: int = 1 + gpu_memory_utilization: float = 0.9 + + def create(self) -> Any: + from nemo_retriever.model.local import NemotronVLMCaptioner + + return NemotronVLMCaptioner( + model_path=self.model_path, + device=self.device, + hf_cache_dir=self.hf_cache_dir, + tensor_parallel_size=self.tensor_parallel_size, + gpu_memory_utilization=self.gpu_memory_utilization, + ) + + @dataclass class EmbeddingModelConfig: """Config to recreate an embedding model (VL or non-VL).""" @@ -167,6 +189,19 @@ def _extract_model_config(func: Callable, kwargs: dict[str, Any]) -> Any: if func is collapse_content_to_page_rows: return None # CPU-only, no model + from nemo_retriever.caption.caption import caption_images + + if func is caption_images: + if kwargs.get("endpoint_url"): + return None # Remote endpoint, no local model + return CaptionModelConfig( + model_path=kwargs.get("model_name", 
CaptionModelConfig.model_path), + device=kwargs.get("device"), + hf_cache_dir=kwargs.get("hf_cache_dir"), + tensor_parallel_size=kwargs.get("tensor_parallel_size", 1), + gpu_memory_utilization=kwargs.get("gpu_memory_utilization", 0.9), + ) + return None @@ -292,7 +327,7 @@ def start(self) -> None: p = self._ctx.Process( target=_gpu_worker_entry, args=(idx, device_id, self._task_descriptors, iq, self._output_queue, evt), - daemon=True, + daemon=False, ) p.start() self._workers.append(p) diff --git a/nemo_retriever/src/nemo_retriever/ingest_modes/inprocess.py b/nemo_retriever/src/nemo_retriever/ingest_modes/inprocess.py index 1f1d229a2..144d2b599 100644 --- a/nemo_retriever/src/nemo_retriever/ingest_modes/inprocess.py +++ b/nemo_retriever/src/nemo_retriever/ingest_modes/inprocess.py @@ -25,6 +25,8 @@ from collections.abc import Callable, Iterator from typing import Any, Dict, List, Literal, Optional, Sequence, Tuple, Union +from nemo_retriever.params import CaptionParams + import pandas as pd from nemo_retriever.model.local import NemotronOCRV1, NemotronPageElementsV3, NemotronParseV12 @@ -958,6 +960,7 @@ def __init__(self, documents: Optional[List[str]] = None) -> None: self._pipeline_type: Literal["pdf", "txt", "html", "image"] = "pdf" self._extract_txt_kwargs: Dict[str, Any] = {} self._extract_html_kwargs: Dict[str, Any] = {} + self._caption_enabled: bool = False def files(self, documents: Union[str, List[str]]) -> "InProcessIngestor": """ @@ -1332,6 +1335,45 @@ def extract_audio( self._tasks.append((apply_asr_to_df, {"asr_params": self._extract_audio_asr_kwargs})) return self + def caption(self, params: "CaptionParams | None" = None, **kwargs: Any) -> "InProcessIngestor": + """ + Configure image captioning via a local VLM model or remote endpoint. + + Sends cropped images (from the ``images`` column populated by + ``extract(extract_images=True)``) to a VLM and writes the returned + captions back as ``images[i]["text"]``. 
+ + When ``endpoint_url`` is set, a remote NIM endpoint is used. + Otherwise a local ``NemotronVLMCaptioner`` is loaded from HF. + """ + from nemo_retriever.caption.caption import caption_images + from nemo_retriever.params import CaptionParams + + resolved = _coerce_params(params, CaptionParams, kwargs) + caption_kwargs = resolved.model_dump(mode="python") + + if resolved.endpoint_url: + # Remote mode. + if not resolved.api_key: + caption_kwargs["api_key"] = resolve_remote_api_key() + else: + # Local mode: defer model creation so the VLM is loaded lazily + # on the device specified by CaptionParams.device. + if not resolved.device: + import warnings + + warnings.warn( + "No caption device specified. The VLM will load on cuda:0, which " + "may conflict with other models. Use --caption-device (e.g. " + "'cuda:1') to place the captioner on a separate GPU.", + stacklevel=2, + ) + caption_kwargs["model"] = None + + self._caption_enabled = True + self._tasks.append((caption_images, caption_kwargs)) + return self + def embed(self, params: EmbedParams | None = None, **kwargs: Any) -> "InProcessIngestor": """ Configure embedding for in-process execution. @@ -1349,12 +1391,14 @@ def embed(self, params: EmbedParams | None = None, **kwargs: Any) -> "InProcessI embed_modality = resolved.embed_modality embed_granularity = resolved.embed_granularity + content_columns = (_CONTENT_COLUMNS + ("images",)) if self._caption_enabled else _CONTENT_COLUMNS + if embed_granularity == "page": # Page-level: one row per page with concatenated text and full page image. 
self._tasks.append( ( collapse_content_to_page_rows, - {"modality": embed_modality}, + {"modality": embed_modality, "content_columns": content_columns}, ) ) else: @@ -1368,6 +1412,7 @@ def embed(self, params: EmbedParams | None = None, **kwargs: Any) -> "InProcessI "modality": embed_modality, "text_elements_modality": text_elements_modality, "structured_elements_modality": structured_elements_modality, + "content_columns": content_columns, }, ) ) @@ -1487,12 +1532,21 @@ def ingest(self, params: IngestExecuteParams | None = None, **kwargs: Any) -> li _start = time.perf_counter() - # -- Three-way task classification -------------------------------- + # -- Task classification ------------------------------------------- + from nemo_retriever.caption.caption import caption_images as _caption_images_fn + _post_task_fns = (upload_embeddings_to_lancedb_inprocess, save_dataframe_to_disk_json) _cpu_task_fns = (pdf_extraction,) + # Caption runs on its own device (--caption-device), not in the GPU pool. 
+ _own_device_fns = (_caption_images_fn,) cpu_tasks = [(f, k) for f, k in self._tasks if f in _cpu_task_fns] - gpu_tasks = [(f, k) for f, k in self._tasks if f not in _cpu_task_fns and f not in _post_task_fns] + gpu_tasks = [ + (f, k) + for f, k in self._tasks + if f not in _cpu_task_fns and f not in _post_task_fns and f not in _own_device_fns + ] + own_device_tasks = [(f, k) for f, k in self._tasks if f in _own_device_fns] post_tasks = [(f, k) for f, k in self._tasks if f in _post_task_fns] docs = list(self._documents) @@ -1545,6 +1599,8 @@ def _check_file_done(doc_path: str) -> None: try: result = future.result() if isinstance(result, pd.DataFrame) and not result.empty: + for func, kw in own_device_tasks: + result = func(result, **kw) shard_to_doc[shard_id] = doc gpu_pool.submit(shard_id, result) shard_id += 1 @@ -1639,6 +1695,8 @@ def _on_gpu_done(sid: int) -> None: return results combined = pd.concat(cpu_results, ignore_index=True) + for func, kwargs in own_device_tasks: + combined = func(combined, **kwargs) for func, kwargs in gpu_tasks: combined = func(combined, **kwargs) @@ -1678,6 +1736,8 @@ def _on_gpu_done(sid: int) -> None: else: current = func(current, **kwargs) if isinstance(current, pd.DataFrame) and not current.empty: + for func, kw in own_device_tasks: + current = func(current, **kw) shard_to_doc[shard_id] = doc_path gpu_pool.submit(shard_id, current) shard_id += 1 @@ -1777,7 +1837,7 @@ def _loader(p: str) -> pd.DataFrame: results.append(current) # Run upload/save once on combined results so overwrite=True keeps full corpus. 
- if post_tasks and results and all(isinstance(r, pd.DataFrame) for r in results): + if results and all(isinstance(r, pd.DataFrame) for r in results): combined = pd.concat(results, ignore_index=True) for func, kwargs in post_tasks: combined = func(combined, **kwargs) diff --git a/nemo_retriever/src/nemo_retriever/ingestor.py b/nemo_retriever/src/nemo_retriever/ingestor.py index 7bbc19486..74b6612e6 100644 --- a/nemo_retriever/src/nemo_retriever/ingestor.py +++ b/nemo_retriever/src/nemo_retriever/ingestor.py @@ -20,6 +20,7 @@ from typing import Any, Dict, List, Optional, Tuple, Union from nemo_retriever.application.modes.factory import create_runmode_ingestor +from nemo_retriever.params import CaptionParams from nemo_retriever.params import EmbedParams from nemo_retriever.params import ExtractParams from nemo_retriever.params import TextChunkParams @@ -176,8 +177,9 @@ def save_to_disk( """Record result persistence configuration (execution TBD).""" self._not_implemented("save_to_disk") - def caption(self) -> "ingestor": + def caption(self, params: "CaptionParams | None" = None, **kwargs: Any) -> "ingestor": """Record a caption task configuration.""" + _ = _merge_params(params, kwargs) self._not_implemented("caption") def pdf_split_config(self, pages_per_chunk: int = 32) -> "ingestor": diff --git a/nemo_retriever/src/nemo_retriever/model/local/__init__.py b/nemo_retriever/src/nemo_retriever/model/local/__init__.py index 791df4daa..af068fa7d 100644 --- a/nemo_retriever/src/nemo_retriever/model/local/__init__.py +++ b/nemo_retriever/src/nemo_retriever/model/local/__init__.py @@ -18,6 +18,7 @@ "NemotronGraphicElementsV1", "NemotronParseV12", "NemotronRerankV2", + "NemotronVLMCaptioner", "ParakeetCTC1B1ASR", ] @@ -47,6 +48,10 @@ def __getattr__(name: str): from .nemotron_rerank_v2 import NemotronRerankV2 return NemotronRerankV2 + if name == "NemotronVLMCaptioner": + from .nemotron_vlm_captioner import NemotronVLMCaptioner + + return NemotronVLMCaptioner if name == 
"ParakeetCTC1B1ASR": from .parakeet_ctc_1_1b_asr import ParakeetCTC1B1ASR diff --git a/nemo_retriever/src/nemo_retriever/model/local/nemotron_vlm_captioner.py b/nemo_retriever/src/nemo_retriever/model/local/nemotron_vlm_captioner.py new file mode 100644 index 000000000..c881f5c5f --- /dev/null +++ b/nemo_retriever/src/nemo_retriever/model/local/nemotron_vlm_captioner.py @@ -0,0 +1,217 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES. +# All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +import base64 +from io import BytesIO +from typing import Any, List, Optional + +from PIL import Image + +from nemo_retriever.utils.hf_cache import configure_global_hf_cache_base +from ..model import BaseModel, RunMode + + +def _b64_to_pil(b64: str) -> Image.Image: + """Decode a base64-encoded image string to a PIL Image.""" + return Image.open(BytesIO(base64.b64decode(b64))).convert("RGB") + + +class NemotronVLMCaptioner(BaseModel): + """ + Local VLM captioner wrapping Nemotron Nano 12B v2 VL variants. + + Supported models: + + * ``nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16`` (default, BFloat16) + * ``nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-FP8`` (FP8 quantised) + * ``nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-NVFP4-QAD`` (NVFP4 quantised, + requires GPU compute capability >= 8.9, e.g. Ada Lovelace / Hopper) + + Uses vLLM for inference with batched scheduling. + + Usage:: + + captioner = NemotronVLMCaptioner() + captions = captioner.caption_batch( + ["", ""], + prompt="Caption the content of this image:", + ) + """ + + SUPPORTED_MODELS: dict[str, str] = { + "BF16": "nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16", + "FP8": "nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-FP8", + "NVFP4-QAD": "nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-NVFP4-QAD", + } + + # Pinned HF revision (commit SHA) per model to ensure reproducibility. 
+ _MODEL_REVISIONS: dict[str, str] = { + "nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16": "5d250e2e111dc5e1434131bdf3d590c27a878ade", + "nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-FP8": "7394488badb786e1decc0e00e308de1cab9560e6", + "nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-NVFP4-QAD": "b8d3c170d9ee3a078917ef9bfd508eff988d6de7", + } + + # Map model-name suffixes to vLLM engine kwargs. + # The FP8 HF config ships with quant_method="modelopt" which triggers + # vLLM's ModelOptFp8Config (SM89+). Override to quant_method="fp8" in + # the HF config so vLLM uses its plain FP8 handler (SM80+). + _QUANTIZATION_PROFILES: dict[str, dict[str, Any]] = { + "BF16": {"dtype": "bfloat16"}, + "FP8": { + "dtype": "auto", + "quantization": "fp8", + "hf_overrides": {"quantization_config": {"quant_method": "fp8", "activation_scheme": "static"}}, + }, + "NVFP4-QAD": {"dtype": "auto", "quantization": "modelopt"}, + } + + def __init__( + self, + model_path: str = "nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16", + device: Optional[str] = None, + hf_cache_dir: Optional[str] = None, + max_new_tokens: int = 1024, + tensor_parallel_size: int = 1, + gpu_memory_utilization: float = 0.9, + ) -> None: + super().__init__() + + valid_models = list(self.SUPPORTED_MODELS.values()) + if model_path not in valid_models: + raise ValueError( + f"Unknown caption model: {model_path!r}\n" + f"Supported models:\n" + "\n".join(f" - {m}" for m in valid_models) + ) + + try: + from vllm import LLM, SamplingParams # noqa: F401 + except ImportError as e: + raise ImportError( + "Local VLM captioning requires vLLM. " 'Install with: pip install "nemo-retriever[vlm-caption]"' + ) from e + + self._model_path = model_path + self._max_new_tokens = max_new_tokens + + if device is not None: + # vLLM uses CUDA_VISIBLE_DEVICES rather than a torch device string. + # Translate e.g. "cuda:1" → "1" so vLLM sees only the requested GPU. 
+ import os + + dev_id = device.split(":")[-1] if ":" in device else device + os.environ["CUDA_VISIBLE_DEVICES"] = dev_id + + configure_global_hf_cache_base(hf_cache_dir) + + revision = self._MODEL_REVISIONS.get(model_path) + + # Pick vLLM engine kwargs based on the model variant. + engine_kwargs: dict[str, Any] = {"dtype": "bfloat16"} # fallback + model_upper = model_path.upper() + for suffix, profile in self._QUANTIZATION_PROFILES.items(): + if model_upper.endswith(suffix): + engine_kwargs = profile + break + + self._llm = LLM( + model=model_path, + revision=revision, + trust_remote_code=True, + tensor_parallel_size=tensor_parallel_size, + gpu_memory_utilization=gpu_memory_utilization, + **engine_kwargs, + ) + + def _build_messages( + self, + base64_image: str, + *, + prompt: str, + system_prompt: Optional[str], + ) -> list[dict[str, Any]]: + """Build chat messages in OpenAI format for vLLM.""" + messages: list[dict[str, Any]] = [] + if system_prompt: + messages.append({"role": "system", "content": system_prompt}) + messages.append( + { + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{base64_image}"}}, + {"type": "text", "text": prompt}, + ], + } + ) + return messages + + def caption( + self, + base64_image: str, + *, + prompt: str = "Caption the content of this image:", + system_prompt: Optional[str] = "/no_think", + temperature: float = 1.0, + ) -> str: + """Generate a caption for a single base64-encoded image.""" + from vllm import SamplingParams + + messages = self._build_messages(base64_image, prompt=prompt, system_prompt=system_prompt) + sampling_params = SamplingParams(temperature=temperature, max_tokens=self._max_new_tokens) + outputs = self._llm.chat([messages], sampling_params=sampling_params) + return outputs[0].outputs[0].text.strip() + + def caption_batch( + self, + base64_images: List[str], + *, + prompt: str = "Caption the content of this image:", + system_prompt: Optional[str] = "/no_think", + 
temperature: float = 1.0, + ) -> List[str]: + """Generate captions for a list of base64-encoded images. + + vLLM batches internally and handles scheduling across images. + """ + from vllm import SamplingParams + + conversations = [self._build_messages(b64, prompt=prompt, system_prompt=system_prompt) for b64 in base64_images] + sampling_params = SamplingParams(temperature=temperature, max_tokens=self._max_new_tokens) + outputs = self._llm.chat(conversations, sampling_params=sampling_params) + return [out.outputs[0].text.strip() for out in outputs] + + # ---- BaseModel abstract interface ---- + + @property + def model_name(self) -> str: + return "NVIDIA-Nemotron-Nano-12B-v2-VL" + + @property + def model_type(self) -> str: + return "vlm-captioner" + + @property + def model_runmode(self) -> RunMode: + return "local" + + @property + def input(self) -> Any: + return { + "type": "image", + "format": "base64", + "description": "Base64-encoded image for captioning.", + } + + @property + def output(self) -> Any: + return { + "type": "text", + "format": "string", + "description": "Generated caption for the input image.", + } + + @property + def input_batch_size(self) -> int: + return 1 diff --git a/nemo_retriever/src/nemo_retriever/params/__init__.py b/nemo_retriever/src/nemo_retriever/params/__init__.py index 5f4eef723..bfc65b50c 100644 --- a/nemo_retriever/src/nemo_retriever/params/__init__.py +++ b/nemo_retriever/src/nemo_retriever/params/__init__.py @@ -5,6 +5,7 @@ from .models import ASRParams from .models import AudioChunkParams from .models import BatchTuningParams +from .models import CaptionParams from .models import ChartParams from .models import EmbedParams from .models import ExtractParams @@ -30,6 +31,7 @@ "ASRParams", "AudioChunkParams", "BatchTuningParams", + "CaptionParams", "ChartParams", "EmbedParams", "ExtractParams", diff --git a/nemo_retriever/src/nemo_retriever/params/models.py b/nemo_retriever/src/nemo_retriever/params/models.py index 
1f81e38e0..1dd735a6c 100644 --- a/nemo_retriever/src/nemo_retriever/params/models.py +++ b/nemo_retriever/src/nemo_retriever/params/models.py @@ -296,6 +296,20 @@ class ChartParams(_ParamsModel): inference_batch_size: int = 8 +class CaptionParams(_ParamsModel): + endpoint_url: Optional[str] = None + model_name: str = "nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16" + api_key: Optional[str] = None + prompt: str = "Caption the content of this image:" + system_prompt: Optional[str] = "/no_think" + temperature: float = 1.0 + batch_size: int = 8 + device: Optional[str] = None + hf_cache_dir: Optional[str] = None + tensor_parallel_size: int = 1 + gpu_memory_utilization: float = 0.9 + + class InfographicParams(_ParamsModel): remote: RemoteInvokeParams = Field(default_factory=RemoteInvokeParams) remote_retry: RemoteRetryParams = Field(default_factory=RemoteRetryParams) diff --git a/nemo_retriever/src/nemo_retriever/pdf/extract.py b/nemo_retriever/src/nemo_retriever/pdf/extract.py index 992c18ebe..a25502f97 100644 --- a/nemo_retriever/src/nemo_retriever/pdf/extract.py +++ b/nemo_retriever/src/nemo_retriever/pdf/extract.py @@ -15,6 +15,7 @@ from nv_ingest_api.util.pdf.pdfium import ( convert_bitmap_to_corrected_numpy, + extract_image_like_objects_from_pdfium_page, is_scanned_page as _is_scanned_page, ) @@ -296,13 +297,37 @@ def pdf_extraction( render_mode=render_mode, ) + # Extract cropped images from pdfium page objects. + detected_images: List[Dict[str, Any]] = [] + if extract_images: + try: + base64_images = extract_image_like_objects_from_pdfium_page(page) + for img in base64_images: + max_w = float(img.max_width) if img.max_width else 1.0 + max_h = float(img.max_height) if img.max_height else 1.0 + x0, y0, x1, y1 = img.bbox + detected_images.append( + { + "bbox_xyxy_norm": [ + x0 / max_w, + y0 / max_h, + x1 / max_w, + y1 / max_h, + ], + "text": "", + "image_b64": img.image, + } + ) + except Exception: + pass # Image extraction failure should not crash the pipeline. 
+ page_record: Dict[str, Any] = { "path": pdf_path, "page_number": page_number, "source_id": source_id, "text": text if extract_text else "", "page_image": None, - "images": [], + "images": detected_images, "tables": [], "charts": [], "infographics": [], From 4c99cca1b1f8eac68bf3d731ee8da54e2396acc9 Mon Sep 17 00:00:00 2001 From: edknv Date: Thu, 19 Mar 2026 10:03:39 -0700 Subject: [PATCH 02/20] revert fix pyproject.toml --- nemo_retriever/pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nemo_retriever/pyproject.toml b/nemo_retriever/pyproject.toml index f293a79cb..fea03d44f 100644 --- a/nemo_retriever/pyproject.toml +++ b/nemo_retriever/pyproject.toml @@ -56,8 +56,8 @@ dependencies = [ "transformers>=5.0.0", "tokenizers>=0.20.3", "accelerate>=1.1.0", - "torch>=2.5.0", - "torchvision", + "torch~=2.9.1", + "torchvision>=0.24,<0.25", "einops", "easydict", "addict", From c601e7f5cc92cd9fe944326e450229be898a782a Mon Sep 17 00:00:00 2001 From: edknv Date: Thu, 19 Mar 2026 17:39:43 -0700 Subject: [PATCH 03/20] add batch mode --- nemo_retriever/README.md | 13 ++++++- .../src/nemo_retriever/caption/caption.py | 30 ++++++++++++++ .../nemo_retriever/examples/batch_pipeline.py | 31 +++++++++++++++ .../src/nemo_retriever/ingest_modes/batch.py | 39 +++++++++++++++++++ .../model/local/nemotron_vlm_captioner.py | 2 +- nemo_retriever/src/nemo_retriever/version.py | 39 ++++++++++++------- 6 files changed, 136 insertions(+), 18 deletions(-) diff --git a/nemo_retriever/README.md b/nemo_retriever/README.md index e26fb443c..62b7af995 100644 --- a/nemo_retriever/README.md +++ b/nemo_retriever/README.md @@ -52,6 +52,12 @@ uv pip install torch==2.9.1 torchvision -i https://download.pytorch.org/whl/cu13 ``` This ensures the OCR and GPU‑accelerated components in NeMo Retriever Library run against the right CUDA runtime. 
+Alternatively, if you have uv 0.7+ you can set `UV_TORCH_BACKEND` to select the correct PyTorch CUDA index automatically: + +```bash +UV_TORCH_BACKEND=cu130 uv pip install torch torchvision +``` + ## Image Captioning (optional) NeMo Retriever Library can caption extracted images using a local VLM @@ -61,13 +67,16 @@ This requires [vLLM](https://github.com/vllm-project/vllm) and separately because they contain CUDA kernels that must match your torch build. ```bash -# 1. Install vLLM (--no-deps avoids overwriting the torch+cu130 already installed) +# Install vLLM (--no-deps prevents overwriting torch, transformers, etc.) uv pip install --no-deps vllm>=0.16.0 -# 2. Build mamba-ssm from source against your torch (takes a few minutes) +# Build mamba-ssm from source against your torch (takes a few minutes) uv pip install --no-deps --no-build-isolation mamba-ssm>=2.3.1 ``` +> **Note:** `--no-deps` is required because vLLM's dependency solver would +> downgrade `transformers` and `huggingface-hub` to incompatible versions. + After installing, add `--caption` and `--caption-device` to your pipeline command: ```bash diff --git a/nemo_retriever/src/nemo_retriever/caption/caption.py b/nemo_retriever/src/nemo_retriever/caption/caption.py index 0de69614d..738d2ee48 100644 --- a/nemo_retriever/src/nemo_retriever/caption/caption.py +++ b/nemo_retriever/src/nemo_retriever/caption/caption.py @@ -8,6 +8,36 @@ import pandas as pd +from nemo_retriever.params import CaptionParams + + +class CaptionActor: + """Ray Data actor that holds a local VLM captioner on a single GPU. + + When ``endpoint_url`` is provided, the actor delegates to a remote VLM + endpoint and no local model is loaded. 
+ """ + + def __init__(self, params: CaptionParams) -> None: + self._params = params + self._kwargs = params.model_dump(mode="python") + endpoint = (self._kwargs.get("endpoint_url") or "").strip() + if endpoint: + self._model = None + else: + from nemo_retriever.model.local import NemotronVLMCaptioner + + self._model = NemotronVLMCaptioner( + model_path=self._kwargs.get("model_name", "nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16"), + device=self._kwargs.get("device"), + hf_cache_dir=self._kwargs.get("hf_cache_dir"), + tensor_parallel_size=self._kwargs.get("tensor_parallel_size", 1), + gpu_memory_utilization=self._kwargs.get("gpu_memory_utilization", 0.9), + ) + + def __call__(self, batch_df: Any) -> Any: + return caption_images(batch_df, model=self._model, **self._kwargs) + def _caption_batch_remote( base64_images: List[str], diff --git a/nemo_retriever/src/nemo_retriever/examples/batch_pipeline.py b/nemo_retriever/src/nemo_retriever/examples/batch_pipeline.py index 6098f3731..784a8c8aa 100644 --- a/nemo_retriever/src/nemo_retriever/examples/batch_pipeline.py +++ b/nemo_retriever/src/nemo_retriever/examples/batch_pipeline.py @@ -23,6 +23,7 @@ from nemo_retriever.ingest_modes.batch import BatchIngestor from nemo_retriever.ingest_modes.lancedb_utils import lancedb_schema from nemo_retriever.model import resolve_embed_model +from nemo_retriever.params import CaptionParams from nemo_retriever.params import EmbedParams from nemo_retriever.params import ExtractParams from nemo_retriever.params import IngestExecuteParams @@ -521,6 +522,26 @@ def main( "--extract-page-as-image/--no-extract-page-as-image", help="Render and retain full page images for downstream multimodal stages.", ), + caption: bool = typer.Option( + False, + "--caption/--no-caption", + help="Enable image captioning via a local VLM or remote endpoint.", + ), + caption_invoke_url: Optional[str] = typer.Option( + None, + "--caption-invoke-url", + help="Optional VLM endpoint URL for image captioning. 
Implies --caption.", + ), + caption_model_name: str = typer.Option( + "nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16", + "--caption-model-name", + help="VLM model name / HF model ID for image captioning.", + ), + caption_device: Optional[str] = typer.Option( + None, + "--caption-device", + help="GPU device for the local VLM captioner (e.g. 'cuda:1').", + ), text_chunk: bool = typer.Option( False, "--text-chunk", @@ -747,6 +768,16 @@ def _extract_params(batch_tuning: dict, **overrides: Any) -> ExtractParams: if enable_text_chunk: ingestor = ingestor.split(_text_chunk_params) + enable_caption = caption or caption_invoke_url is not None + if enable_caption: + ingestor = ingestor.caption( + CaptionParams( + endpoint_url=caption_invoke_url, + model_name=caption_model_name, + device=caption_device, + ) + ) + ingestor = ingestor.embed(embed_params) logger.info("Running extraction...") diff --git a/nemo_retriever/src/nemo_retriever/ingest_modes/batch.py b/nemo_retriever/src/nemo_retriever/ingest_modes/batch.py index e00037285..0d1ac3488 100644 --- a/nemo_retriever/src/nemo_retriever/ingest_modes/batch.py +++ b/nemo_retriever/src/nemo_retriever/ingest_modes/batch.py @@ -48,6 +48,7 @@ from ..params import IngestExecuteParams from ..params import PdfSplitParams from ..params import TextChunkParams +from ..params import CaptionParams from ..params import VdbUploadParams logger = logging.getLogger(__name__) @@ -868,10 +869,17 @@ def embed( target_num_rows_per_block=self._requested_plan.get_embed_batch_size() ) + from nemo_retriever.ingest_modes.inprocess import _CONTENT_COLUMNS + + content_columns = ( + (_CONTENT_COLUMNS + ("images",)) if getattr(self, "_caption_enabled", False) else _CONTENT_COLUMNS + ) + if embed_granularity == "page": _row_fn = partial( collapse_content_to_page_rows, modality=embed_modality, + content_columns=content_columns, ) else: text_elements_modality = resolved.text_elements_modality or embed_modality @@ -881,6 +889,7 @@ def embed( 
modality=embed_modality, text_elements_modality=text_elements_modality, structured_elements_modality=structured_elements_modality, + content_columns=content_columns, ) self._rd_dataset = self._rd_dataset.map_batches( _row_fn, @@ -911,6 +920,36 @@ def embed( return self + def caption(self, params: CaptionParams | None = None, **kwargs: Any) -> "BatchIngestor": + """ + Add an image-captioning stage to the batch pipeline. + + Uses a GPU actor pool with a local VLM (vLLM) or delegates to a + remote VLM endpoint when ``endpoint_url`` is set. + """ + if self._rd_dataset is None: + raise RuntimeError("No Ray Dataset to caption. Run .files(...) / .extract(...) first.") + + resolved = _coerce_params(params, CaptionParams, kwargs) + if resolved.endpoint_url and not resolved.api_key: + resolved = resolved.model_copy(update={"api_key": resolve_remote_api_key()}) + + from nemo_retriever.caption.caption import CaptionActor + + caption_num_gpus = 0.0 if resolved.endpoint_url else 1.0 + + self._rd_dataset = self._rd_dataset.map_batches( + CaptionActor, + batch_size=resolved.batch_size or 8, + batch_format="pandas", + num_gpus=caption_num_gpus, + concurrency=1, + fn_constructor_kwargs={"params": resolved}, + ) + + self._caption_enabled = True + return self + def vdb_upload(self, params: VdbUploadParams | None = None, **kwargs: Any) -> "BatchIngestor": """ Add a streaming LanceDB upload stage to the batch pipeline. diff --git a/nemo_retriever/src/nemo_retriever/model/local/nemotron_vlm_captioner.py b/nemo_retriever/src/nemo_retriever/model/local/nemotron_vlm_captioner.py index c881f5c5f..8264a82f5 100644 --- a/nemo_retriever/src/nemo_retriever/model/local/nemotron_vlm_captioner.py +++ b/nemo_retriever/src/nemo_retriever/model/local/nemotron_vlm_captioner.py @@ -90,7 +90,7 @@ def __init__( from vllm import LLM, SamplingParams # noqa: F401 except ImportError as e: raise ImportError( - "Local VLM captioning requires vLLM. 
" 'Install with: pip install "nemo-retriever[vlm-caption]"' + 'Local VLM captioning requires vLLM. Install with: pip install "nemo-retriever[vlm-caption]"' ) from e self._model_path = model_path diff --git a/nemo_retriever/src/nemo_retriever/version.py b/nemo_retriever/src/nemo_retriever/version.py index 9999c919c..13ec2bd3b 100644 --- a/nemo_retriever/src/nemo_retriever/version.py +++ b/nemo_retriever/src/nemo_retriever/version.py @@ -12,6 +12,7 @@ from pathlib import Path import os import subprocess +import tempfile try: from ._build_info import BUILD_DATE as _PACKAGE_BUILD_DATE @@ -23,6 +24,7 @@ _PKG_NAME = "nemo-retriever" _UNKNOWN = "unknown" +_BUILD_STAMP = Path(tempfile.gettempdir()) / ".nemo_retriever_build_stamp" def _utc_now() -> datetime: @@ -57,7 +59,28 @@ def _build_datetime() -> datetime: except ValueError: pass - return _utc_now() + # Stamp file in the system temp dir makes the timestamp deterministic + # across the two separate subprocesses pip spawns during a PEP 517 build + # (metadata + wheel). We use tempdir rather than the source tree because + # pip may copy the source to different locations for each step. + if _BUILD_STAMP.exists(): + try: + cached = _BUILD_STAMP.read_text().strip() + if cached: + ts = float(cached) + # Only reuse if less than 60 s old to avoid stale stamps. 
+ if abs(_utc_now().timestamp() - ts) < 60: + return datetime.fromtimestamp(ts, tz=timezone.utc) + _BUILD_STAMP.unlink(missing_ok=True) + except (OSError, ValueError): + pass + + now = _utc_now() + try: + _BUILD_STAMP.write_text(str(now.timestamp())) + except OSError: + pass + return now @lru_cache(maxsize=1) @@ -108,18 +131,6 @@ def _base_version() -> str: return os.getenv("RETRIEVER_VERSION") or os.getenv("NV_INGEST_VERSION") or _build_datetime().strftime("%Y.%m.%d") -def _has_prerelease(version_str: str) -> bool: - """Return True if *version_str* already contains a PEP 440 pre-release segment.""" - from packaging.version import Version - - try: - return Version(version_str).pre is not None - except Exception: - import re - - return bool(re.search(r"(a|alpha|b|beta|rc|c|dev|pre)[-_.]?\d*", version_str, re.I)) - - def get_build_version() -> str: """Return a PEP 440 compliant version string for packaging.""" release_type = (os.getenv("RETRIEVER_RELEASE_TYPE") or os.getenv("NV_INGEST_RELEASE_TYPE") or "dev").lower() @@ -128,8 +139,6 @@ def get_build_version() -> str: build_number = _build_number() if release_type == "release": - if _has_prerelease(base_version): - return base_version return f"{base_version}.post{build_number}" if int(build_number) > 0 else base_version if release_type == "dev": return f"{base_version}.dev{build_number}" From cca500148d5d26c7ec40ae957042352998efbedd Mon Sep 17 00:00:00 2001 From: edknv Date: Thu, 19 Mar 2026 18:51:24 -0700 Subject: [PATCH 04/20] build endpoint working --- nemo_retriever/README.md | 62 +++++++++++++------ nemo_retriever/pyproject.toml | 4 -- .../nemo_retriever/examples/batch_pipeline.py | 21 ++++++- .../examples/inprocess_pipeline.py | 44 +++++++++---- 4 files changed, 93 insertions(+), 38 deletions(-) diff --git a/nemo_retriever/README.md b/nemo_retriever/README.md index 62b7af995..a2300cd41 100644 --- a/nemo_retriever/README.md +++ b/nemo_retriever/README.md @@ -61,43 +61,67 @@ UV_TORCH_BACKEND=cu130 uv pip install 
torch torchvision ## Image Captioning (optional) NeMo Retriever Library can caption extracted images using a local VLM -([Nemotron Nano 12B v2 VL](https://huggingface.co/nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16)). -This requires [vLLM](https://github.com/vllm-project/vllm) and -[mamba-ssm](https://github.com/state-spaces/mamba), which must be installed -separately because they contain CUDA kernels that must match your torch build. +([Nemotron Nano 12B v2 VL](https://huggingface.co/nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16)) +powered by [vLLM](https://github.com/vllm-project/vllm), or by calling a +remote VLM endpoint. -```bash -# Install vLLM (--no-deps prevents overwriting torch, transformers, etc.) -uv pip install --no-deps vllm>=0.16.0 +### Install vLLM -# Build mamba-ssm from source against your torch (takes a few minutes) -uv pip install --no-deps --no-build-isolation mamba-ssm>=2.3.1 +```bash +uv pip install vllm --extra-index-url https://pypi.ngc.nvidia.com ``` -> **Note:** `--no-deps` is required because vLLM's dependency solver would -> downgrade `transformers` and `huggingface-hub` to incompatible versions. +The NGC index provides cu130 wheels that match the torch build installed above. -After installing, add `--caption` and `--caption-device` to your pipeline command: +### Local captioning + +Add `--caption` to your pipeline command. The VLM model is downloaded from +Hugging Face on first use and loaded via vLLM for inference. ```bash python -m nemo_retriever.examples.inprocess_pipeline \ data/multimodal_test.pdf \ - --caption \ - --caption-device cuda:1 + --caption ``` -`--caption-device` places the VLM on a separate GPU so it does not compete with -the page-elements, OCR, and embedding models. If omitted, a warning is printed -and the VLM defaults to `cuda:0`. +### Remote captioning + +If you have a VLM endpoint running (e.g. 
via `vllm serve`), pass the URL +instead: -Supported `--caption-model-name` values: +```bash +python -m nemo_retriever.examples.inprocess_pipeline \ + data/multimodal_test.pdf \ + --caption-invoke-url http://vlm:8000/v1/chat/completions +``` -| Model | Precision | Notes | +### Supported models + +| `--caption-model-name` | Precision | Notes | |---|---|---| | `nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16` (default) | BFloat16 | Works on SM80+ (A100, A10, RTX 3090, ...) | | `nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-FP8` | FP8 | Works on SM80+ | | `nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-NVFP4-QAD` | NVFP4 | Requires SM89+ (Ada Lovelace / Hopper) | +### Python API + +```python +from nemo_retriever import create_ingestor +from nemo_retriever.params import CaptionParams, ExtractParams + +ingestor = create_ingestor(run_mode="inprocess") +results = ( + ingestor + .files("doc.pdf") + .extract(ExtractParams(extract_images=True)) + .caption(CaptionParams()) # local vLLM + # or: .caption(CaptionParams(endpoint_url="http://vlm:8000/v1/chat/completions")) + .embed() + .vdb_upload() + .ingest() +) +``` + ## Run the pipeline The [test PDF](../data/multimodal_test.pdf) contains text, tables, charts, and images. Additional test data resides [here](../data/). 
diff --git a/nemo_retriever/pyproject.toml b/nemo_retriever/pyproject.toml index fea03d44f..19c522057 100644 --- a/nemo_retriever/pyproject.toml +++ b/nemo_retriever/pyproject.toml @@ -83,12 +83,8 @@ dependencies = [ svg = [ "cairosvg>=2.7.0", ] -# Install with: pip install ".[vlm-caption]" -# mamba-ssm must be built from source against the installed torch: -# uv pip install --no-deps --no-build-isolation mamba-ssm vlm-caption = [ "vllm>=0.16.0", - "mamba-ssm>=2.3.1", ] dev = [ "build>=1.2.2", diff --git a/nemo_retriever/src/nemo_retriever/examples/batch_pipeline.py b/nemo_retriever/src/nemo_retriever/examples/batch_pipeline.py index 784a8c8aa..bed85299a 100644 --- a/nemo_retriever/src/nemo_retriever/examples/batch_pipeline.py +++ b/nemo_retriever/src/nemo_retriever/examples/batch_pipeline.py @@ -512,6 +512,21 @@ def main( "(used when --table-output-format=markdown)." ), ), + extract_text: bool = typer.Option( + True, + "--extract-text/--no-extract-text", + help="Extract text from PDF pages.", + ), + extract_tables: bool = typer.Option( + True, + "--extract-tables/--no-extract-tables", + help="Extract tables from PDF pages.", + ), + extract_charts: bool = typer.Option( + True, + "--extract-charts/--no-extract-charts", + help="Extract charts from PDF pages.", + ), extract_infographics: bool = typer.Option( False, "--extract-infographics/--no-extract-infographics", @@ -729,9 +744,9 @@ def _extract_params(batch_tuning: dict, **overrides: Any) -> ExtractParams: return ExtractParams( method=method, dpi=int(dpi), - extract_text=True, - extract_tables=True, - extract_charts=True, + extract_text=extract_text, + extract_tables=extract_tables, + extract_charts=extract_charts, extract_infographics=extract_infographics, extract_page_as_image=extract_page_as_image, api_key=extract_remote_api_key, diff --git a/nemo_retriever/src/nemo_retriever/examples/inprocess_pipeline.py b/nemo_retriever/src/nemo_retriever/examples/inprocess_pipeline.py index 71802f879..629c82f7e 100644 --- 
a/nemo_retriever/src/nemo_retriever/examples/inprocess_pipeline.py +++ b/nemo_retriever/src/nemo_retriever/examples/inprocess_pipeline.py @@ -93,6 +93,26 @@ def main( "--embed-model-name", help="Embedding model name passed to .embed().", ), + extract_text: bool = typer.Option( + True, + "--extract-text/--no-extract-text", + help="Extract text from PDF pages.", + ), + extract_tables: bool = typer.Option( + True, + "--extract-tables/--no-extract-tables", + help="Extract tables from PDF pages.", + ), + extract_charts: bool = typer.Option( + True, + "--extract-charts/--no-extract-charts", + help="Extract charts from PDF pages.", + ), + extract_infographics: bool = typer.Option( + False, + "--extract-infographics/--no-extract-infographics", + help="Extract infographics from PDF pages.", + ), method: str = typer.Option( "pdfium", "--method", @@ -239,10 +259,10 @@ def main( ingestor = ingestor.files(file_patterns).extract_image_files( ExtractParams( method=method, - extract_text=True, - extract_tables=True, - extract_charts=True, - extract_infographics=False, + extract_text=extract_text, + extract_tables=extract_tables, + extract_charts=extract_charts, + extract_infographics=extract_infographics, use_graphic_elements=use_graphic_elements, graphic_elements_invoke_url=graphic_elements_invoke_url, use_table_structure=use_table_structure, @@ -256,10 +276,10 @@ def main( ingestor = ingestor.files(file_patterns).extract( ExtractParams( method=method, - extract_text=True, - extract_tables=True, - extract_charts=True, - extract_infographics=False, + extract_text=extract_text, + extract_tables=extract_tables, + extract_charts=extract_charts, + extract_infographics=extract_infographics, use_graphic_elements=use_graphic_elements, graphic_elements_invoke_url=graphic_elements_invoke_url, use_table_structure=use_table_structure, @@ -273,10 +293,10 @@ def main( ingestor = ingestor.files(file_patterns).extract( ExtractParams( method=method, - extract_text=True, - extract_tables=True, - 
extract_charts=True, - extract_infographics=False, + extract_text=extract_text, + extract_tables=extract_tables, + extract_charts=extract_charts, + extract_infographics=extract_infographics, use_graphic_elements=use_graphic_elements, graphic_elements_invoke_url=graphic_elements_invoke_url, use_table_structure=use_table_structure, From 1384c6f3a2c0a29f39127a93416069774b60a2dc Mon Sep 17 00:00:00 2001 From: edknv Date: Thu, 19 Mar 2026 21:36:39 -0700 Subject: [PATCH 05/20] add context window --- nemo_retriever/README.md | 70 ---------- .../src/nemo_retriever/caption/caption.py | 130 +++++++++++------- .../nemo_retriever/examples/batch_pipeline.py | 6 + .../examples/inprocess_pipeline.py | 6 + .../src/nemo_retriever/params/models.py | 1 + nemo_retriever/tests/test_caption.py | 107 ++++++++++++++ 6 files changed, 199 insertions(+), 121 deletions(-) create mode 100644 nemo_retriever/tests/test_caption.py diff --git a/nemo_retriever/README.md b/nemo_retriever/README.md index a2300cd41..6a0ac50db 100644 --- a/nemo_retriever/README.md +++ b/nemo_retriever/README.md @@ -52,76 +52,6 @@ uv pip install torch==2.9.1 torchvision -i https://download.pytorch.org/whl/cu13 ``` This ensures the OCR and GPU‑accelerated components in NeMo Retriever Library run against the right CUDA runtime. -Alternatively, if you have uv 0.7+ you can set `UV_TORCH_BACKEND` to select the correct PyTorch CUDA index automatically: - -```bash -UV_TORCH_BACKEND=cu130 uv pip install torch torchvision -``` - -## Image Captioning (optional) - -NeMo Retriever Library can caption extracted images using a local VLM -([Nemotron Nano 12B v2 VL](https://huggingface.co/nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16)) -powered by [vLLM](https://github.com/vllm-project/vllm), or by calling a -remote VLM endpoint. - -### Install vLLM - -```bash -uv pip install vllm --extra-index-url https://pypi.ngc.nvidia.com -``` - -The NGC index provides cu130 wheels that match the torch build installed above. 
- -### Local captioning - -Add `--caption` to your pipeline command. The VLM model is downloaded from -Hugging Face on first use and loaded via vLLM for inference. - -```bash -python -m nemo_retriever.examples.inprocess_pipeline \ - data/multimodal_test.pdf \ - --caption -``` - -### Remote captioning - -If you have a VLM endpoint running (e.g. via `vllm serve`), pass the URL -instead: - -```bash -python -m nemo_retriever.examples.inprocess_pipeline \ - data/multimodal_test.pdf \ - --caption-invoke-url http://vlm:8000/v1/chat/completions -``` - -### Supported models - -| `--caption-model-name` | Precision | Notes | -|---|---|---| -| `nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16` (default) | BFloat16 | Works on SM80+ (A100, A10, RTX 3090, ...) | -| `nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-FP8` | FP8 | Works on SM80+ | -| `nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-NVFP4-QAD` | NVFP4 | Requires SM89+ (Ada Lovelace / Hopper) | - -### Python API - -```python -from nemo_retriever import create_ingestor -from nemo_retriever.params import CaptionParams, ExtractParams - -ingestor = create_ingestor(run_mode="inprocess") -results = ( - ingestor - .files("doc.pdf") - .extract(ExtractParams(extract_images=True)) - .caption(CaptionParams()) # local vLLM - # or: .caption(CaptionParams(endpoint_url="http://vlm:8000/v1/chat/completions")) - .embed() - .vdb_upload() - .ingest() -) -``` - ## Run the pipeline The [test PDF](../data/multimodal_test.pdf) contains text, tables, charts, and images. Additional test data resides [here](../data/). diff --git a/nemo_retriever/src/nemo_retriever/caption/caption.py b/nemo_retriever/src/nemo_retriever/caption/caption.py index 738d2ee48..5cad882e8 100644 --- a/nemo_retriever/src/nemo_retriever/caption/caption.py +++ b/nemo_retriever/src/nemo_retriever/caption/caption.py @@ -10,6 +10,8 @@ from nemo_retriever.params import CaptionParams +_MAX_CONTEXT_TEXT_CHARS = 4096 + class CaptionActor: """Ray Data actor that holds a local VLM captioner on a single GPU. 
@@ -39,6 +41,16 @@ def __call__(self, batch_df: Any) -> Any: return caption_images(batch_df, model=self._model, **self._kwargs) +def _build_prompt_with_context(base_prompt: str, context_text: str) -> str: + """Prepend surrounding page text to the base VLM prompt. + + If *context_text* is empty the *base_prompt* is returned unchanged. + """ + if not context_text: + return base_prompt + return f"Text near this image:\n---\n{context_text}\n---\n\n{base_prompt}" + + def _caption_batch_remote( base64_images: List[str], *, @@ -89,6 +101,32 @@ def _caption_batch_local( ) +def _caption_one( + b64: str, + *, + model: Any, + endpoint_url: str | None, + model_name: str, + api_key: str | None, + prompt: str, + system_prompt: str | None, + temperature: float, +) -> str: + """Caption a single image (used when each image gets a unique prompt).""" + if model is not None: + captions = _caption_batch_local( + [b64], model=model, prompt=prompt, + system_prompt=system_prompt, temperature=temperature, + ) + else: + captions = _caption_batch_remote( + [b64], endpoint_url=endpoint_url, # type: ignore[arg-type] + model_name=model_name, api_key=api_key, prompt=prompt, + system_prompt=system_prompt, temperature=temperature, + ) + return captions[0] if captions else "" + + def caption_images( batch_df: pd.DataFrame, *, @@ -100,6 +138,7 @@ def caption_images( system_prompt: str | None = "/no_think", temperature: float = 1.0, batch_size: int = 8, + context_text_max_chars: int = 0, **kwargs: Any, ) -> pd.DataFrame: """Caption images in the ``images`` column using a VLM. @@ -111,34 +150,14 @@ def caption_images( * **Local** (``model`` is set): runs inference through a local ``NemotronVLMCaptioner`` instance loaded from Hugging Face. + When ``context_text_max_chars`` is greater than zero, the page's ``text`` + column is prepended to the prompt for each image so the VLM can use + surrounding OCR text as context. In this mode images are captioned + one at a time (each gets its own enriched prompt). 
+ For each row, any item in the ``images`` list whose ``text`` field is empty will be captioned. The returned caption is written back into ``images[i]["text"]``. - - Parameters - ---------- - batch_df : pd.DataFrame - DataFrame with an ``images`` column containing lists of dicts with - keys ``image_b64``, ``text``, and ``bbox_xyxy_norm``. - model : NemotronVLMCaptioner | None - Pre-loaded local VLM model. When provided, ``endpoint_url`` is - ignored and inference runs in-process. - endpoint_url : str | None - URL of a remote VLM HTTP endpoint. - model_name : str - Model identifier passed to the remote VLM endpoint (ignored for - local mode). - api_key : str | None - Bearer token for the remote VLM endpoint. - prompt : str - Text prompt sent alongside each image. - system_prompt : str | None - Optional system prompt for the VLM. - temperature : float - Sampling temperature. - batch_size : int - Number of images per remote VLM request (local mode processes - images one at a time). """ if not isinstance(batch_df, pd.DataFrame) or batch_df.empty: return batch_df @@ -157,6 +176,9 @@ def caption_images( gpu_memory_utilization=kwargs.get("gpu_memory_utilization", 0.9), ) + use_context = context_text_max_chars > 0 + effective_max = min(context_text_max_chars, _MAX_CONTEXT_TEXT_CHARS) if use_context else 0 + # Collect all (row_idx, item_idx, image_b64) needing captions. pending: List[Tuple[int, int, str]] = [] for row_idx, row in batch_df.iterrows(): @@ -175,33 +197,39 @@ def caption_images( if not pending: return batch_df - # Generate captions. - all_captions: List[str] = [] - for start in range(0, len(pending), batch_size): - chunk_b64 = [b64 for _, _, b64 in pending[start : start + batch_size]] - - if model is not None: - captions = _caption_batch_local( - chunk_b64, - model=model, - prompt=prompt, - system_prompt=system_prompt, + if use_context: + # Each image gets a per-page enriched prompt, so caption one at a time. 
+ for row_idx, item_idx, b64 in pending: + page_text = batch_df.at[row_idx, "text"] if "text" in batch_df.columns else "" + context = (page_text or "")[:effective_max] + enriched_prompt = _build_prompt_with_context(prompt, context) + caption = _caption_one( + b64, model=model, endpoint_url=endpoint_url, + model_name=model_name, api_key=api_key, + prompt=enriched_prompt, system_prompt=system_prompt, temperature=temperature, ) - else: - captions = _caption_batch_remote( - chunk_b64, - endpoint_url=endpoint_url, # type: ignore[arg-type] - model_name=model_name, - api_key=api_key, - prompt=prompt, - system_prompt=system_prompt, - temperature=temperature, - ) - all_captions.extend(captions) - - # Write captions back into the DataFrame. - for (row_idx, item_idx, _), caption in zip(pending, all_captions): - batch_df.at[row_idx, "images"][item_idx]["text"] = caption + batch_df.at[row_idx, "images"][item_idx]["text"] = caption + else: + # Batch mode: all images share the same prompt. + all_captions: List[str] = [] + for start in range(0, len(pending), batch_size): + chunk_b64 = [b64 for _, _, b64 in pending[start : start + batch_size]] + + if model is not None: + captions = _caption_batch_local( + chunk_b64, model=model, prompt=prompt, + system_prompt=system_prompt, temperature=temperature, + ) + else: + captions = _caption_batch_remote( + chunk_b64, endpoint_url=endpoint_url, # type: ignore[arg-type] + model_name=model_name, api_key=api_key, prompt=prompt, + system_prompt=system_prompt, temperature=temperature, + ) + all_captions.extend(captions) + + for (row_idx, item_idx, _), caption in zip(pending, all_captions): + batch_df.at[row_idx, "images"][item_idx]["text"] = caption return batch_df diff --git a/nemo_retriever/src/nemo_retriever/examples/batch_pipeline.py b/nemo_retriever/src/nemo_retriever/examples/batch_pipeline.py index bed85299a..4090618fa 100644 --- a/nemo_retriever/src/nemo_retriever/examples/batch_pipeline.py +++ 
b/nemo_retriever/src/nemo_retriever/examples/batch_pipeline.py @@ -557,6 +557,11 @@ def main( "--caption-device", help="GPU device for the local VLM captioner (e.g. 'cuda:1').", ), + caption_context_text_max_chars: int = typer.Option( + 0, + "--caption-context-text-max-chars", + help="Max characters of surrounding page text to include in the VLM prompt. 0 disables context.", + ), text_chunk: bool = typer.Option( False, "--text-chunk", @@ -790,6 +795,7 @@ def _extract_params(batch_tuning: dict, **overrides: Any) -> ExtractParams: endpoint_url=caption_invoke_url, model_name=caption_model_name, device=caption_device, + context_text_max_chars=caption_context_text_max_chars, ) ) diff --git a/nemo_retriever/src/nemo_retriever/examples/inprocess_pipeline.py b/nemo_retriever/src/nemo_retriever/examples/inprocess_pipeline.py index 629c82f7e..c8fda38b9 100644 --- a/nemo_retriever/src/nemo_retriever/examples/inprocess_pipeline.py +++ b/nemo_retriever/src/nemo_retriever/examples/inprocess_pipeline.py @@ -192,6 +192,11 @@ def main( "--caption-device", help="GPU device for the local VLM captioner (e.g. 'cuda:1'). Defaults to the first --gpu-devices entry.", ), + caption_context_text_max_chars: int = typer.Option( + 0, + "--caption-context-text-max-chars", + help="Max characters of surrounding page text to include in the VLM prompt. 
0 disables context.", + ), hybrid: bool = typer.Option( False, "--hybrid/--no-hybrid", @@ -323,6 +328,7 @@ def main( endpoint_url=caption_invoke_url, model_name=caption_model_name, device=caption_device, + context_text_max_chars=caption_context_text_max_chars, ) ) diff --git a/nemo_retriever/src/nemo_retriever/params/models.py b/nemo_retriever/src/nemo_retriever/params/models.py index 6fd246966..8f48e125e 100644 --- a/nemo_retriever/src/nemo_retriever/params/models.py +++ b/nemo_retriever/src/nemo_retriever/params/models.py @@ -312,6 +312,7 @@ class CaptionParams(_ParamsModel): batch_size: int = 8 device: Optional[str] = None hf_cache_dir: Optional[str] = None + context_text_max_chars: int = 0 tensor_parallel_size: int = 1 gpu_memory_utilization: float = 0.9 diff --git a/nemo_retriever/tests/test_caption.py b/nemo_retriever/tests/test_caption.py new file mode 100644 index 000000000..3bf6844be --- /dev/null +++ b/nemo_retriever/tests/test_caption.py @@ -0,0 +1,107 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES. +# All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +"""Tests for image captioning pipeline stage.""" + +import base64 +import io +from unittest.mock import MagicMock, patch + +import pandas as pd +import pytest + +PIL = pytest.importorskip("PIL") +from PIL import Image # noqa: E402 + + +def _make_1x1_png_b64() -> str: + img = Image.new("RGB", (1, 1), color=(255, 0, 0)) + buf = io.BytesIO() + img.save(buf, format="PNG") + return base64.b64encode(buf.getvalue()).decode("ascii") + + +def _make_page_df(num_images=2, captioned=False): + b64 = _make_1x1_png_b64() + images = [ + {"bbox_xyxy_norm": [0.1, 0.2, 0.5, 0.8], "text": "done" if captioned else "", "image_b64": b64} + for _ in range(num_images) + ] + return pd.DataFrame([{"text": "page", "images": images, "tables": [], "charts": [], "infographics": []}]) + + +def test_caption_images_writes_back(): + from nemo_retriever.caption.caption import caption_images + + mock_model = MagicMock() + mock_model.caption_batch.return_value = ["cap1", "cap2"] + result = caption_images(_make_page_df(), model=mock_model) + assert result.iloc[0]["images"][0]["text"] == "cap1" + assert result.iloc[0]["images"][1]["text"] == "cap2" + + +def test_caption_images_skips_already_captioned(): + from nemo_retriever.caption.caption import caption_images + + mock_model = MagicMock() + result = caption_images(_make_page_df(captioned=True), model=mock_model) + mock_model.caption_batch.assert_not_called() + assert result.iloc[0]["images"][0]["text"] == "done" + + +@patch("nemo_retriever.pdf.extract.extract_image_like_objects_from_pdfium_page") +def test_pdf_extraction_populates_images(mock_extract): + _ext = pytest.importorskip("nemo_retriever.pdf.extract") + pdfium = pytest.importorskip("pypdfium2") + + mock_img = MagicMock(image=_make_1x1_png_b64(), bbox=(10, 20, 100, 200), max_width=612, max_height=792) + mock_extract.return_value = [mock_img] + + doc = pdfium.PdfDocument.new() + doc.new_page(612, 792) + buf = io.BytesIO() + doc.save(buf) + doc.close() + 
+ result = _ext.pdf_extraction(pd.DataFrame([{"bytes": buf.getvalue(), "path": "t.pdf", "page_number": 1}]), extract_images=True) + images = result.iloc[0]["images"] + assert len(images) == 1 + assert images[0]["text"] == "" + assert abs(images[0]["bbox_xyxy_norm"][0] - 10 / 612) < 1e-6 + + +def test_explode_includes_captioned_images(): + from nemo_retriever.ingest_modes.inprocess import explode_content_to_rows + + b64 = _make_1x1_png_b64() + df = pd.DataFrame([{ + "text": "page", + "page_image": {"image_b64": b64}, + "images": [{"text": "a dog", "bbox_xyxy_norm": [0.1, 0.2, 0.5, 0.8], "image_b64": b64}], + "tables": [], "charts": [], "infographics": [], + }]) + result = explode_content_to_rows(df, content_columns=("table", "chart", "infographic", "images")) + assert len(result) == 2 # page text + image caption + + # Default columns exclude images + result2 = explode_content_to_rows(df) + assert len(result2) == 1 + + +def test_context_text_prepended_to_prompt(): + from nemo_retriever.caption.caption import caption_images + + mock_model = MagicMock() + mock_model.caption_batch.return_value = ["captioned with context"] + + df = _make_page_df(num_images=1) + df.at[0, "text"] = "The quick brown fox jumps over the lazy dog." + + result = caption_images(df, model=mock_model, context_text_max_chars=100) + + assert result.iloc[0]["images"][0]["text"] == "captioned with context" + # The prompt passed to caption_batch should contain the page text. 
+ call_kwargs = mock_model.caption_batch.call_args[1] + assert "quick brown fox" in call_kwargs["prompt"] + assert "Text near this image:" in call_kwargs["prompt"] From 8ba2c81e9379748ba157bf1ceca3184447c69b87 Mon Sep 17 00:00:00 2001 From: edknv Date: Thu, 19 Mar 2026 21:44:32 -0700 Subject: [PATCH 06/20] update readme --- nemo_retriever/README.md | 34 +++++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/nemo_retriever/README.md b/nemo_retriever/README.md index 6a0ac50db..b454bbcf1 100644 --- a/nemo_retriever/README.md +++ b/nemo_retriever/README.md @@ -48,10 +48,18 @@ Use the CUDA 13.0 wheels from the dedicated index by running the following comma ```bash uv pip uninstall torch torchvision -uv pip install torch==2.9.1 torchvision -i https://download.pytorch.org/whl/cu130 +uv pip install torch==2.9.1 torchvision --torch-backend=cu130 ``` This ensures the OCR and GPU‑accelerated components in NeMo Retriever Library run against the right CUDA runtime. +3. (Optional) Install vLLM for image captioning + +If you want to generate captions for extracted images, install [vLLM](https://docs.vllm.ai/). + +```bash +uv pip install vllm --torch-backend=cu130 +``` + ## Run the pipeline The [test PDF](../data/multimodal_test.pdf) contains text, tables, charts, and images. Additional test data resides [here](../data/). @@ -242,6 +250,30 @@ ingestor = create_ingestor(run_mode="batch") ingestor = ingestor.files([str(INPUT_AUDIO)]).extract_audio() ``` +### Caption extracted images + +Use `.caption()` to generate text descriptions for extracted images using a local VLM. Requires vLLM (see step 3 above). + +```python +ingestor = ( + ingestor.files(documents) + .extract() + .caption() + .embed() + .vdb_upload() +) +``` + +By default this uses [Nemotron-Nano-12B-VL](https://huggingface.co/nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16). 
You can customize the model and prompt: + +```python +.caption( + model_name="nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16", + prompt="Describe this image in detail:", + context_text_max_chars=1024, # include surrounding page text as context +) +``` + ### Explore Different Pipeline Options: You can use the [Nemotron RAG VL Embedder](https://huggingface.co/nvidia/llama-nemotron-embed-vl-1b-v2) From 06e5d8ea64a5050c92efbe92546bc02486008d83 Mon Sep 17 00:00:00 2001 From: edknv Date: Fri, 20 Mar 2026 13:25:23 -0700 Subject: [PATCH 07/20] install vllm wheels for cu130 support --- nemo_retriever/README.md | 4 ++-- nemo_retriever/pyproject.toml | 4 ++++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/nemo_retriever/README.md b/nemo_retriever/README.md index b454bbcf1..566d50980 100644 --- a/nemo_retriever/README.md +++ b/nemo_retriever/README.md @@ -54,10 +54,10 @@ This ensures the OCR and GPU‑accelerated components in NeMo Retriever Library 3. (Optional) Install vLLM for image captioning -If you want to generate captions for extracted images, install [vLLM](https://docs.vllm.ai/). +If you want to generate captions for extracted images, install the `vlm-caption` extra which includes [vLLM](https://docs.vllm.ai/) built for CUDA 13. 
```bash -uv pip install vllm --torch-backend=cu130 +uv pip install "nemo-retriever[vlm-caption]" ``` ## Run the pipeline diff --git a/nemo_retriever/pyproject.toml b/nemo_retriever/pyproject.toml index 3c43286f1..4daf510e4 100644 --- a/nemo_retriever/pyproject.toml +++ b/nemo_retriever/pyproject.toml @@ -106,6 +106,10 @@ nemotron-table-structure-v1 = { index = "test-pypi" } nemotron-ocr = { index = "test-pypi" } torch = { index = "torch-cuda"} torchvision = { index ="torch-cuda"} +vllm = [ + { url = "https://github.com/vllm-project/vllm/releases/download/v0.16.0/vllm-0.16.0+cu130-cp38-abi3-manylinux_2_35_x86_64.whl", marker = "platform_machine == 'x86_64'" }, + { url = "https://github.com/vllm-project/vllm/releases/download/v0.16.0/vllm-0.16.0+cu130-cp38-abi3-manylinux_2_35_aarch64.whl", marker = "platform_machine == 'aarch64'" }, +] [[tool.uv.index]] name = "test-pypi" From 58fe3811cc4aa5f7e1c7a4292f6d2306120294ce Mon Sep 17 00:00:00 2001 From: edknv Date: Fri, 20 Mar 2026 13:34:41 -0700 Subject: [PATCH 08/20] pin vllm to exact match --- nemo_retriever/pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo_retriever/pyproject.toml b/nemo_retriever/pyproject.toml index 4daf510e4..6c6d2d7a7 100644 --- a/nemo_retriever/pyproject.toml +++ b/nemo_retriever/pyproject.toml @@ -82,7 +82,7 @@ svg = [ "cairosvg>=2.7.0", ] vlm-caption = [ - "vllm>=0.16.0", + "vllm==0.16.0", ] dev = [ "build>=1.2.2", From f90de97d5cd17a769dcad6f80f87f791255b69e4 Mon Sep 17 00:00:00 2001 From: edknv Date: Sun, 22 Mar 2026 10:54:05 -0700 Subject: [PATCH 09/20] cache model globally --- .../src/nemo_retriever/caption/caption.py | 71 +++++++++++++------ 1 file changed, 49 insertions(+), 22 deletions(-) diff --git a/nemo_retriever/src/nemo_retriever/caption/caption.py b/nemo_retriever/src/nemo_retriever/caption/caption.py index 5cad882e8..6bc0c9b53 100644 --- a/nemo_retriever/src/nemo_retriever/caption/caption.py +++ 
b/nemo_retriever/src/nemo_retriever/caption/caption.py @@ -11,6 +11,22 @@ from nemo_retriever.params import CaptionParams _MAX_CONTEXT_TEXT_CHARS = 4096 +_cached_local_model = None + + +def _get_cached_local_model(kwargs: dict) -> "Any": + global _cached_local_model + if _cached_local_model is None: + from nemo_retriever.model.local import NemotronVLMCaptioner + + _cached_local_model = NemotronVLMCaptioner( + model_path=kwargs.get("model_name", "nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16"), + device=kwargs.get("device"), + hf_cache_dir=kwargs.get("hf_cache_dir"), + tensor_parallel_size=kwargs.get("tensor_parallel_size", 1), + gpu_memory_utilization=kwargs.get("gpu_memory_utilization", 0.9), + ) + return _cached_local_model class CaptionActor: @@ -115,14 +131,21 @@ def _caption_one( """Caption a single image (used when each image gets a unique prompt).""" if model is not None: captions = _caption_batch_local( - [b64], model=model, prompt=prompt, - system_prompt=system_prompt, temperature=temperature, + [b64], + model=model, + prompt=prompt, + system_prompt=system_prompt, + temperature=temperature, ) else: captions = _caption_batch_remote( - [b64], endpoint_url=endpoint_url, # type: ignore[arg-type] - model_name=model_name, api_key=api_key, prompt=prompt, - system_prompt=system_prompt, temperature=temperature, + [b64], + endpoint_url=endpoint_url, # type: ignore[arg-type] + model_name=model_name, + api_key=api_key, + prompt=prompt, + system_prompt=system_prompt, + temperature=temperature, ) return captions[0] if captions else "" @@ -166,15 +189,8 @@ def caption_images( if model is None and not endpoint_url: # Lazy model creation for the sequential (no GPU pool) fallback. 
- from nemo_retriever.model.local import NemotronVLMCaptioner - - model = NemotronVLMCaptioner( - model_path=kwargs.get("model_name", "nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16"), - device=kwargs.get("device"), - hf_cache_dir=kwargs.get("hf_cache_dir"), - tensor_parallel_size=kwargs.get("tensor_parallel_size", 1), - gpu_memory_utilization=kwargs.get("gpu_memory_utilization", 0.9), - ) + # Cache the model so it is not re-created on every call. + model = _get_cached_local_model(kwargs) use_context = context_text_max_chars > 0 effective_max = min(context_text_max_chars, _MAX_CONTEXT_TEXT_CHARS) if use_context else 0 @@ -204,9 +220,13 @@ def caption_images( context = (page_text or "")[:effective_max] enriched_prompt = _build_prompt_with_context(prompt, context) caption = _caption_one( - b64, model=model, endpoint_url=endpoint_url, - model_name=model_name, api_key=api_key, - prompt=enriched_prompt, system_prompt=system_prompt, + b64, + model=model, + endpoint_url=endpoint_url, + model_name=model_name, + api_key=api_key, + prompt=enriched_prompt, + system_prompt=system_prompt, temperature=temperature, ) batch_df.at[row_idx, "images"][item_idx]["text"] = caption @@ -218,14 +238,21 @@ def caption_images( if model is not None: captions = _caption_batch_local( - chunk_b64, model=model, prompt=prompt, - system_prompt=system_prompt, temperature=temperature, + chunk_b64, + model=model, + prompt=prompt, + system_prompt=system_prompt, + temperature=temperature, ) else: captions = _caption_batch_remote( - chunk_b64, endpoint_url=endpoint_url, # type: ignore[arg-type] - model_name=model_name, api_key=api_key, prompt=prompt, - system_prompt=system_prompt, temperature=temperature, + chunk_b64, + endpoint_url=endpoint_url, # type: ignore[arg-type] + model_name=model_name, + api_key=api_key, + prompt=prompt, + system_prompt=system_prompt, + temperature=temperature, ) all_captions.extend(captions) From 2a3df58f21464669baa6c6b382ff97439daaab24 Mon Sep 17 00:00:00 2001 From: edknv Date: 
Sun, 22 Mar 2026 13:46:12 -0700 Subject: [PATCH 10/20] set gpu memory utilization --- .../src/nemo_retriever/examples/batch_pipeline.py | 6 ++++++ .../src/nemo_retriever/examples/inprocess_pipeline.py | 6 ++++++ 2 files changed, 12 insertions(+) diff --git a/nemo_retriever/src/nemo_retriever/examples/batch_pipeline.py b/nemo_retriever/src/nemo_retriever/examples/batch_pipeline.py index 4090618fa..00c80ca74 100644 --- a/nemo_retriever/src/nemo_retriever/examples/batch_pipeline.py +++ b/nemo_retriever/src/nemo_retriever/examples/batch_pipeline.py @@ -562,6 +562,11 @@ def main( "--caption-context-text-max-chars", help="Max characters of surrounding page text to include in the VLM prompt. 0 disables context.", ), + caption_gpu_memory_utilization: float = typer.Option( + 0.25, + "--caption-gpu-memory-utilization", + help="Fraction of GPU memory vLLM may use for the caption model (0.0–1.0).", + ), text_chunk: bool = typer.Option( False, "--text-chunk", @@ -796,6 +801,7 @@ def _extract_params(batch_tuning: dict, **overrides: Any) -> ExtractParams: model_name=caption_model_name, device=caption_device, context_text_max_chars=caption_context_text_max_chars, + gpu_memory_utilization=caption_gpu_memory_utilization, ) ) diff --git a/nemo_retriever/src/nemo_retriever/examples/inprocess_pipeline.py b/nemo_retriever/src/nemo_retriever/examples/inprocess_pipeline.py index c8fda38b9..37324c554 100644 --- a/nemo_retriever/src/nemo_retriever/examples/inprocess_pipeline.py +++ b/nemo_retriever/src/nemo_retriever/examples/inprocess_pipeline.py @@ -197,6 +197,11 @@ def main( "--caption-context-text-max-chars", help="Max characters of surrounding page text to include in the VLM prompt. 
0 disables context.", ), + caption_gpu_memory_utilization: float = typer.Option( + 0.25, + "--caption-gpu-memory-utilization", + help="Fraction of GPU memory vLLM may use for the caption model (0.0–1.0).", + ), hybrid: bool = typer.Option( False, "--hybrid/--no-hybrid", @@ -329,6 +334,7 @@ def main( model_name=caption_model_name, device=caption_device, context_text_max_chars=caption_context_text_max_chars, + gpu_memory_utilization=caption_gpu_memory_utilization, ) ) From 1306f2be819bf3c17880774845f71d069a7f255a Mon Sep 17 00:00:00 2001 From: edknv Date: Sun, 22 Mar 2026 14:09:09 -0700 Subject: [PATCH 11/20] set caption batch size --- .../src/nemo_retriever/examples/batch_pipeline.py | 8 +++++++- .../src/nemo_retriever/examples/inprocess_pipeline.py | 8 +++++++- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/nemo_retriever/src/nemo_retriever/examples/batch_pipeline.py b/nemo_retriever/src/nemo_retriever/examples/batch_pipeline.py index 00c80ca74..491cbf8b6 100644 --- a/nemo_retriever/src/nemo_retriever/examples/batch_pipeline.py +++ b/nemo_retriever/src/nemo_retriever/examples/batch_pipeline.py @@ -562,8 +562,13 @@ def main( "--caption-context-text-max-chars", help="Max characters of surrounding page text to include in the VLM prompt. 
0 disables context.", ), + caption_batch_size: int = typer.Option( + 4, + "--caption-batch-size", + help="Number of images to caption per batch.", + ), caption_gpu_memory_utilization: float = typer.Option( - 0.25, + 0.5, "--caption-gpu-memory-utilization", help="Fraction of GPU memory vLLM may use for the caption model (0.0–1.0).", ), @@ -801,6 +806,7 @@ def _extract_params(batch_tuning: dict, **overrides: Any) -> ExtractParams: model_name=caption_model_name, device=caption_device, context_text_max_chars=caption_context_text_max_chars, + batch_size=caption_batch_size, gpu_memory_utilization=caption_gpu_memory_utilization, ) ) diff --git a/nemo_retriever/src/nemo_retriever/examples/inprocess_pipeline.py b/nemo_retriever/src/nemo_retriever/examples/inprocess_pipeline.py index 37324c554..ebcea181e 100644 --- a/nemo_retriever/src/nemo_retriever/examples/inprocess_pipeline.py +++ b/nemo_retriever/src/nemo_retriever/examples/inprocess_pipeline.py @@ -197,8 +197,13 @@ def main( "--caption-context-text-max-chars", help="Max characters of surrounding page text to include in the VLM prompt. 
0 disables context.", ), + caption_batch_size: int = typer.Option( + 4, + "--caption-batch-size", + help="Number of images to caption per batch.", + ), caption_gpu_memory_utilization: float = typer.Option( - 0.25, + 0.5, "--caption-gpu-memory-utilization", help="Fraction of GPU memory vLLM may use for the caption model (0.0–1.0).", ), @@ -334,6 +339,7 @@ def main( model_name=caption_model_name, device=caption_device, context_text_max_chars=caption_context_text_max_chars, + batch_size=caption_batch_size, gpu_memory_utilization=caption_gpu_memory_utilization, ) ) From 858f7ca4a5b6b585c03533d1fc95a413f375b804 Mon Sep 17 00:00:00 2001 From: edknv Date: Sun, 22 Mar 2026 14:21:38 -0700 Subject: [PATCH 12/20] remove batch size arg --- .../src/nemo_retriever/caption/caption.py | 33 ++++++++++--------- .../nemo_retriever/examples/batch_pipeline.py | 8 +---- .../examples/inprocess_pipeline.py | 8 +---- 3 files changed, 20 insertions(+), 29 deletions(-) diff --git a/nemo_retriever/src/nemo_retriever/caption/caption.py b/nemo_retriever/src/nemo_retriever/caption/caption.py index 6bc0c9b53..92429930c 100644 --- a/nemo_retriever/src/nemo_retriever/caption/caption.py +++ b/nemo_retriever/src/nemo_retriever/caption/caption.py @@ -232,21 +232,24 @@ def caption_images( batch_df.at[row_idx, "images"][item_idx]["text"] = caption else: # Batch mode: all images share the same prompt. - all_captions: List[str] = [] - for start in range(0, len(pending), batch_size): - chunk_b64 = [b64 for _, _, b64 in pending[start : start + batch_size]] - - if model is not None: - captions = _caption_batch_local( - chunk_b64, - model=model, - prompt=prompt, - system_prompt=system_prompt, - temperature=temperature, - ) - else: + all_b64 = [b64 for _, _, b64 in pending] + + if model is not None: + # Submit all at once — vLLM schedules internally based on + # available GPU memory. 
+ all_captions = _caption_batch_local( + all_b64, + model=model, + prompt=prompt, + system_prompt=system_prompt, + temperature=temperature, + ) + else: + # Remote endpoints may have request-size limits; chunk. + all_captions: List[str] = [] + for start in range(0, len(all_b64), batch_size): captions = _caption_batch_remote( - chunk_b64, + all_b64[start : start + batch_size], endpoint_url=endpoint_url, # type: ignore[arg-type] model_name=model_name, api_key=api_key, @@ -254,7 +257,7 @@ def caption_images( system_prompt=system_prompt, temperature=temperature, ) - all_captions.extend(captions) + all_captions.extend(captions) for (row_idx, item_idx, _), caption in zip(pending, all_captions): batch_df.at[row_idx, "images"][item_idx]["text"] = caption diff --git a/nemo_retriever/src/nemo_retriever/examples/batch_pipeline.py b/nemo_retriever/src/nemo_retriever/examples/batch_pipeline.py index 491cbf8b6..b0113efe4 100644 --- a/nemo_retriever/src/nemo_retriever/examples/batch_pipeline.py +++ b/nemo_retriever/src/nemo_retriever/examples/batch_pipeline.py @@ -562,13 +562,8 @@ def main( "--caption-context-text-max-chars", help="Max characters of surrounding page text to include in the VLM prompt. 
0 disables context.", ), - caption_batch_size: int = typer.Option( - 4, - "--caption-batch-size", - help="Number of images to caption per batch.", - ), caption_gpu_memory_utilization: float = typer.Option( - 0.5, + 0.4, "--caption-gpu-memory-utilization", help="Fraction of GPU memory vLLM may use for the caption model (0.0–1.0).", ), @@ -806,7 +801,6 @@ def _extract_params(batch_tuning: dict, **overrides: Any) -> ExtractParams: model_name=caption_model_name, device=caption_device, context_text_max_chars=caption_context_text_max_chars, - batch_size=caption_batch_size, gpu_memory_utilization=caption_gpu_memory_utilization, ) ) diff --git a/nemo_retriever/src/nemo_retriever/examples/inprocess_pipeline.py b/nemo_retriever/src/nemo_retriever/examples/inprocess_pipeline.py index ebcea181e..5a89fc433 100644 --- a/nemo_retriever/src/nemo_retriever/examples/inprocess_pipeline.py +++ b/nemo_retriever/src/nemo_retriever/examples/inprocess_pipeline.py @@ -197,13 +197,8 @@ def main( "--caption-context-text-max-chars", help="Max characters of surrounding page text to include in the VLM prompt. 
0 disables context.", ), - caption_batch_size: int = typer.Option( - 4, - "--caption-batch-size", - help="Number of images to caption per batch.", - ), caption_gpu_memory_utilization: float = typer.Option( - 0.5, + 0.4, "--caption-gpu-memory-utilization", help="Fraction of GPU memory vLLM may use for the caption model (0.0–1.0).", ), @@ -339,7 +334,6 @@ def main( model_name=caption_model_name, device=caption_device, context_text_max_chars=caption_context_text_max_chars, - batch_size=caption_batch_size, gpu_memory_utilization=caption_gpu_memory_utilization, ) ) From 5a2e0fde8f048f84efd34fd45c72689f2e5ce8d1 Mon Sep 17 00:00:00 2001 From: edknv Date: Sun, 22 Mar 2026 16:12:55 -0700 Subject: [PATCH 13/20] skip loading ocr --- .../src/nemo_retriever/examples/batch_pipeline.py | 2 +- .../src/nemo_retriever/examples/inprocess_pipeline.py | 2 +- nemo_retriever/src/nemo_retriever/ingest_modes/batch.py | 7 ++++++- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/nemo_retriever/src/nemo_retriever/examples/batch_pipeline.py b/nemo_retriever/src/nemo_retriever/examples/batch_pipeline.py index b0113efe4..c53ff0db5 100644 --- a/nemo_retriever/src/nemo_retriever/examples/batch_pipeline.py +++ b/nemo_retriever/src/nemo_retriever/examples/batch_pipeline.py @@ -563,7 +563,7 @@ def main( help="Max characters of surrounding page text to include in the VLM prompt. 
0 disables context.", ), caption_gpu_memory_utilization: float = typer.Option( - 0.4, + 0.5, "--caption-gpu-memory-utilization", help="Fraction of GPU memory vLLM may use for the caption model (0.0–1.0).", ), diff --git a/nemo_retriever/src/nemo_retriever/examples/inprocess_pipeline.py b/nemo_retriever/src/nemo_retriever/examples/inprocess_pipeline.py index 5a89fc433..d8c77ff1f 100644 --- a/nemo_retriever/src/nemo_retriever/examples/inprocess_pipeline.py +++ b/nemo_retriever/src/nemo_retriever/examples/inprocess_pipeline.py @@ -198,7 +198,7 @@ def main( help="Max characters of surrounding page text to include in the VLM prompt. 0 disables context.", ), caption_gpu_memory_utilization: float = typer.Option( - 0.4, + 0.5, "--caption-gpu-memory-utilization", help="Fraction of GPU memory vLLM may use for the caption model (0.0–1.0).", ), diff --git a/nemo_retriever/src/nemo_retriever/ingest_modes/batch.py b/nemo_retriever/src/nemo_retriever/ingest_modes/batch.py index 0d1ac3488..e93ef7ed6 100644 --- a/nemo_retriever/src/nemo_retriever/ingest_modes/batch.py +++ b/nemo_retriever/src/nemo_retriever/ingest_modes/batch.py @@ -644,7 +644,12 @@ def _append_detection_stages(self, kwargs: dict[str, Any]) -> None: ocr_flags["inference_batch_size"] = self._requested_plan.get_ocr_batch_size() - if ocr_flags: + # Only append OCR stage if at least one content type needs it. 
+ needs_ocr = any( + ocr_flags.get(k) + for k in ("extract_text", "extract_tables", "extract_charts", "extract_infographics") + ) + if needs_ocr: self._rd_dataset = self._rd_dataset.map_batches( OCRActor, batch_size=self._requested_plan.get_ocr_batch_size(), From 83092073fb23fc7bf916dad1938db09a6b31bdfc Mon Sep 17 00:00:00 2001 From: edknv Date: Sun, 22 Mar 2026 16:58:46 -0700 Subject: [PATCH 14/20] use fractional gpu --- nemo_retriever/src/nemo_retriever/ingest_modes/batch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo_retriever/src/nemo_retriever/ingest_modes/batch.py b/nemo_retriever/src/nemo_retriever/ingest_modes/batch.py index e93ef7ed6..506d8a321 100644 --- a/nemo_retriever/src/nemo_retriever/ingest_modes/batch.py +++ b/nemo_retriever/src/nemo_retriever/ingest_modes/batch.py @@ -941,7 +941,7 @@ def caption(self, params: CaptionParams | None = None, **kwargs: Any) -> "BatchI from nemo_retriever.caption.caption import CaptionActor - caption_num_gpus = 0.0 if resolved.endpoint_url else 1.0 + caption_num_gpus = 0.0 if resolved.endpoint_url else resolved.gpu_memory_utilization self._rd_dataset = self._rd_dataset.map_batches( CaptionActor, From 564d72cb81eac7ef87a7f0efbcf07a6f8e8144a1 Mon Sep 17 00:00:00 2001 From: edknv Date: Sun, 22 Mar 2026 18:14:47 -0700 Subject: [PATCH 15/20] filter out small images --- .../src/nemo_retriever/caption/caption.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/nemo_retriever/src/nemo_retriever/caption/caption.py b/nemo_retriever/src/nemo_retriever/caption/caption.py index 92429930c..64d1a1e40 100644 --- a/nemo_retriever/src/nemo_retriever/caption/caption.py +++ b/nemo_retriever/src/nemo_retriever/caption/caption.py @@ -11,6 +11,21 @@ from nemo_retriever.params import CaptionParams _MAX_CONTEXT_TEXT_CHARS = 4096 +_MIN_IMAGE_DIMENSION = 32 + + +def _image_meets_min_size(b64: str) -> bool: + """Return True if the base64 image is at least _MIN_IMAGE_DIMENSION on both 
sides.""" + import base64 + from io import BytesIO + from PIL import Image + + try: + img = Image.open(BytesIO(base64.b64decode(b64))) + w, h = img.size + return w >= _MIN_IMAGE_DIMENSION and h >= _MIN_IMAGE_DIMENSION + except Exception: + return False _cached_local_model = None @@ -207,7 +222,7 @@ def caption_images( if item.get("text"): continue # already captioned b64 = item.get("image_b64") - if b64: + if b64 and _image_meets_min_size(b64): pending.append((row_idx, item_idx, b64)) if not pending: From a921382b573b8ff9b98c67f95eecac638cff945c Mon Sep 17 00:00:00 2001 From: edknv Date: Mon, 23 Mar 2026 11:05:53 -0700 Subject: [PATCH 16/20] updates --- nemo_retriever/README.md | 18 ++++----- nemo_retriever/pyproject.toml | 4 +- .../src/nemo_retriever/caption/caption.py | 16 ++++---- .../model/local/nemotron_vlm_captioner.py | 2 +- .../src/nemo_retriever/params/models.py | 2 +- nemo_retriever/src/nemo_retriever/version.py | 39 +++++++------------ 6 files changed, 34 insertions(+), 47 deletions(-) diff --git a/nemo_retriever/README.md b/nemo_retriever/README.md index 566d50980..479721819 100644 --- a/nemo_retriever/README.md +++ b/nemo_retriever/README.md @@ -48,18 +48,10 @@ Use the CUDA 13.0 wheels from the dedicated index by running the following comma ```bash uv pip uninstall torch torchvision -uv pip install torch==2.9.1 torchvision --torch-backend=cu130 +uv pip install torch==2.9.1 torchvision -i https://download.pytorch.org/whl/cu130 ``` This ensures the OCR and GPU‑accelerated components in NeMo Retriever Library run against the right CUDA runtime. -3. (Optional) Install vLLM for image captioning - -If you want to generate captions for extracted images, install the `vlm-caption` extra which includes [vLLM](https://docs.vllm.ai/) built for CUDA 13. - -```bash -uv pip install "nemo-retriever[vlm-caption]" -``` - ## Run the pipeline The [test PDF](../data/multimodal_test.pdf) contains text, tables, charts, and images. 
Additional test data resides [here](../data/). @@ -257,7 +249,13 @@ Use `.caption()` to generate text descriptions for extracted images using a loca ```python ingestor = ( ingestor.files(documents) - .extract() + .extract( + extract_text=True, + extract_tables=False, + extract_charts=False, + extract_infographics=False, + extract_images=True, + ) .caption() .embed() .vdb_upload() diff --git a/nemo_retriever/pyproject.toml b/nemo_retriever/pyproject.toml index 6c6d2d7a7..7de00f251 100644 --- a/nemo_retriever/pyproject.toml +++ b/nemo_retriever/pyproject.toml @@ -75,15 +75,13 @@ dependencies = [ "soundfile>=0.12.0", "scipy>=1.11.0", "nvidia-ml-py", + "vllm==0.16.0", ] [project.optional-dependencies] svg = [ "cairosvg>=2.7.0", ] -vlm-caption = [ - "vllm==0.16.0", -] dev = [ "build>=1.2.2", "pytest>=8.0.2", diff --git a/nemo_retriever/src/nemo_retriever/caption/caption.py b/nemo_retriever/src/nemo_retriever/caption/caption.py index 64d1a1e40..2b73eddc5 100644 --- a/nemo_retriever/src/nemo_retriever/caption/caption.py +++ b/nemo_retriever/src/nemo_retriever/caption/caption.py @@ -4,29 +4,29 @@ from __future__ import annotations +import base64 +from io import BytesIO from typing import Any, Dict, List, Tuple import pandas as pd +from PIL import Image from nemo_retriever.params import CaptionParams +_DEFAULT_MODEL_NAME = "nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16" _MAX_CONTEXT_TEXT_CHARS = 4096 _MIN_IMAGE_DIMENSION = 32 +_cached_local_model = None def _image_meets_min_size(b64: str) -> bool: """Return True if the base64 image is at least _MIN_IMAGE_DIMENSION on both sides.""" - import base64 - from io import BytesIO - from PIL import Image - try: img = Image.open(BytesIO(base64.b64decode(b64))) w, h = img.size return w >= _MIN_IMAGE_DIMENSION and h >= _MIN_IMAGE_DIMENSION except Exception: return False -_cached_local_model = None def _get_cached_local_model(kwargs: dict) -> "Any": @@ -35,7 +35,7 @@ def _get_cached_local_model(kwargs: dict) -> "Any": from 
nemo_retriever.model.local import NemotronVLMCaptioner _cached_local_model = NemotronVLMCaptioner( - model_path=kwargs.get("model_name", "nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16"), + model_path=kwargs.get("model_name", _DEFAULT_MODEL_NAME), device=kwargs.get("device"), hf_cache_dir=kwargs.get("hf_cache_dir"), tensor_parallel_size=kwargs.get("tensor_parallel_size", 1), @@ -61,7 +61,7 @@ def __init__(self, params: CaptionParams) -> None: from nemo_retriever.model.local import NemotronVLMCaptioner self._model = NemotronVLMCaptioner( - model_path=self._kwargs.get("model_name", "nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16"), + model_path=self._kwargs.get("model_name", _DEFAULT_MODEL_NAME), device=self._kwargs.get("device"), hf_cache_dir=self._kwargs.get("hf_cache_dir"), tensor_parallel_size=self._kwargs.get("tensor_parallel_size", 1), @@ -170,7 +170,7 @@ def caption_images( *, model: Any = None, endpoint_url: str | None = None, - model_name: str = "nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16", + model_name: str = _DEFAULT_MODEL_NAME, api_key: str | None = None, prompt: str = "Caption the content of this image:", system_prompt: str | None = "/no_think", diff --git a/nemo_retriever/src/nemo_retriever/model/local/nemotron_vlm_captioner.py b/nemo_retriever/src/nemo_retriever/model/local/nemotron_vlm_captioner.py index 8264a82f5..9279ccd6e 100644 --- a/nemo_retriever/src/nemo_retriever/model/local/nemotron_vlm_captioner.py +++ b/nemo_retriever/src/nemo_retriever/model/local/nemotron_vlm_captioner.py @@ -75,7 +75,7 @@ def __init__( hf_cache_dir: Optional[str] = None, max_new_tokens: int = 1024, tensor_parallel_size: int = 1, - gpu_memory_utilization: float = 0.9, + gpu_memory_utilization: float = 0.8, ) -> None: super().__init__() diff --git a/nemo_retriever/src/nemo_retriever/params/models.py b/nemo_retriever/src/nemo_retriever/params/models.py index 8f48e125e..8c16b388e 100644 --- a/nemo_retriever/src/nemo_retriever/params/models.py +++ 
b/nemo_retriever/src/nemo_retriever/params/models.py @@ -314,7 +314,7 @@ class CaptionParams(_ParamsModel): hf_cache_dir: Optional[str] = None context_text_max_chars: int = 0 tensor_parallel_size: int = 1 - gpu_memory_utilization: float = 0.9 + gpu_memory_utilization: float = 0.8 class InfographicParams(_ParamsModel): diff --git a/nemo_retriever/src/nemo_retriever/version.py b/nemo_retriever/src/nemo_retriever/version.py index 13ec2bd3b..9999c919c 100644 --- a/nemo_retriever/src/nemo_retriever/version.py +++ b/nemo_retriever/src/nemo_retriever/version.py @@ -12,7 +12,6 @@ from pathlib import Path import os import subprocess -import tempfile try: from ._build_info import BUILD_DATE as _PACKAGE_BUILD_DATE @@ -24,7 +23,6 @@ _PKG_NAME = "nemo-retriever" _UNKNOWN = "unknown" -_BUILD_STAMP = Path(tempfile.gettempdir()) / ".nemo_retriever_build_stamp" def _utc_now() -> datetime: @@ -59,28 +57,7 @@ def _build_datetime() -> datetime: except ValueError: pass - # Stamp file in the system temp dir makes the timestamp deterministic - # across the two separate subprocesses pip spawns during a PEP 517 build - # (metadata + wheel). We use tempdir rather than the source tree because - # pip may copy the source to different locations for each step. - if _BUILD_STAMP.exists(): - try: - cached = _BUILD_STAMP.read_text().strip() - if cached: - ts = float(cached) - # Only reuse if less than 60 s old to avoid stale stamps. 
- if abs(_utc_now().timestamp() - ts) < 60: - return datetime.fromtimestamp(ts, tz=timezone.utc) - _BUILD_STAMP.unlink(missing_ok=True) - except (OSError, ValueError): - pass - - now = _utc_now() - try: - _BUILD_STAMP.write_text(str(now.timestamp())) - except OSError: - pass - return now + return _utc_now() @lru_cache(maxsize=1) @@ -131,6 +108,18 @@ def _base_version() -> str: return os.getenv("RETRIEVER_VERSION") or os.getenv("NV_INGEST_VERSION") or _build_datetime().strftime("%Y.%m.%d") +def _has_prerelease(version_str: str) -> bool: + """Return True if *version_str* already contains a PEP 440 pre-release segment.""" + from packaging.version import Version + + try: + return Version(version_str).pre is not None + except Exception: + import re + + return bool(re.search(r"(a|alpha|b|beta|rc|c|dev|pre)[-_.]?\d*", version_str, re.I)) + + def get_build_version() -> str: """Return a PEP 440 compliant version string for packaging.""" release_type = (os.getenv("RETRIEVER_RELEASE_TYPE") or os.getenv("NV_INGEST_RELEASE_TYPE") or "dev").lower() @@ -139,6 +128,8 @@ def get_build_version() -> str: build_number = _build_number() if release_type == "release": + if _has_prerelease(base_version): + return base_version return f"{base_version}.post{build_number}" if int(build_number) > 0 else base_version if release_type == "dev": return f"{base_version}.dev{build_number}" From b0b447548d0b0c7b3d0f5518f89e828df5bd3f4b Mon Sep 17 00:00:00 2001 From: edknv Date: Mon, 23 Mar 2026 12:04:49 -0700 Subject: [PATCH 17/20] updates --- .../nemo_retriever/ingest_modes/gpu_pool.py | 37 +------------------ .../nemo_retriever/ingest_modes/inprocess.py | 5 ++- 2 files changed, 4 insertions(+), 38 deletions(-) diff --git a/nemo_retriever/src/nemo_retriever/ingest_modes/gpu_pool.py b/nemo_retriever/src/nemo_retriever/ingest_modes/gpu_pool.py index 11f3c36d6..cb1aa019a 100644 --- a/nemo_retriever/src/nemo_retriever/ingest_modes/gpu_pool.py +++ 
b/nemo_retriever/src/nemo_retriever/ingest_modes/gpu_pool.py @@ -66,28 +66,6 @@ def create(self) -> Any: return NemotronParseV12(task_prompt=self.task_prompt) -@dataclass -class CaptionModelConfig: - """Config to recreate a NemotronVLMCaptioner model.""" - - model_path: str = "nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16" - device: Optional[str] = None - hf_cache_dir: Optional[str] = None - tensor_parallel_size: int = 1 - gpu_memory_utilization: float = 0.9 - - def create(self) -> Any: - from nemo_retriever.model.local import NemotronVLMCaptioner - - return NemotronVLMCaptioner( - model_path=self.model_path, - device=self.device, - hf_cache_dir=self.hf_cache_dir, - tensor_parallel_size=self.tensor_parallel_size, - gpu_memory_utilization=self.gpu_memory_utilization, - ) - - @dataclass class EmbeddingModelConfig: """Config to recreate an embedding model (VL or non-VL).""" @@ -189,19 +167,6 @@ def _extract_model_config(func: Callable, kwargs: dict[str, Any]) -> Any: if func is collapse_content_to_page_rows: return None # CPU-only, no model - from nemo_retriever.caption.caption import caption_images - - if func is caption_images: - if kwargs.get("endpoint_url"): - return None # Remote endpoint, no local model - return CaptionModelConfig( - model_path=kwargs.get("model_name", CaptionModelConfig.model_path), - device=kwargs.get("device"), - hf_cache_dir=kwargs.get("hf_cache_dir"), - tensor_parallel_size=kwargs.get("tensor_parallel_size", 1), - gpu_memory_utilization=kwargs.get("gpu_memory_utilization", 0.9), - ) - return None @@ -327,7 +292,7 @@ def start(self) -> None: p = self._ctx.Process( target=_gpu_worker_entry, args=(idx, device_id, self._task_descriptors, iq, self._output_queue, evt), - daemon=False, + daemon=True, ) p.start() self._workers.append(p) diff --git a/nemo_retriever/src/nemo_retriever/ingest_modes/inprocess.py b/nemo_retriever/src/nemo_retriever/ingest_modes/inprocess.py index 144d2b599..d47af16ea 100644 --- 
a/nemo_retriever/src/nemo_retriever/ingest_modes/inprocess.py +++ b/nemo_retriever/src/nemo_retriever/ingest_modes/inprocess.py @@ -1364,8 +1364,9 @@ def caption(self, params: "CaptionParams | None" = None, **kwargs: Any) -> "InPr warnings.warn( "No caption device specified. The VLM will load on cuda:0, which " - "may conflict with other models. Use --caption-device (e.g. " - "'cuda:1') to place the captioner on a separate GPU.", + "may conflict with other models. Use device='cuda:1' (or " + "--caption-device from the CLI) to place the captioner on a " + "separate GPU.", stacklevel=2, ) caption_kwargs["model"] = None From e6cb852c9839dd7115495f7e7eaf9e9d0255a9fc Mon Sep 17 00:00:00 2001 From: edknv Date: Mon, 23 Mar 2026 12:11:29 -0700 Subject: [PATCH 18/20] fix tests --- nemo_retriever/tests/test_caption.py | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/nemo_retriever/tests/test_caption.py b/nemo_retriever/tests/test_caption.py index 3bf6844be..f216ac831 100644 --- a/nemo_retriever/tests/test_caption.py +++ b/nemo_retriever/tests/test_caption.py @@ -15,15 +15,15 @@ from PIL import Image # noqa: E402 -def _make_1x1_png_b64() -> str: - img = Image.new("RGB", (1, 1), color=(255, 0, 0)) +def _make_test_png_b64(size: tuple[int, int] = (64, 64)) -> str: + img = Image.new("RGB", size, color=(255, 0, 0)) buf = io.BytesIO() img.save(buf, format="PNG") return base64.b64encode(buf.getvalue()).decode("ascii") def _make_page_df(num_images=2, captioned=False): - b64 = _make_1x1_png_b64() + b64 = _make_test_png_b64() images = [ {"bbox_xyxy_norm": [0.1, 0.2, 0.5, 0.8], "text": "done" if captioned else "", "image_b64": b64} for _ in range(num_images) @@ -55,7 +55,7 @@ def test_pdf_extraction_populates_images(mock_extract): _ext = pytest.importorskip("nemo_retriever.pdf.extract") pdfium = pytest.importorskip("pypdfium2") - mock_img = MagicMock(image=_make_1x1_png_b64(), bbox=(10, 20, 100, 200), max_width=612, max_height=792) + mock_img = 
MagicMock(image=_make_test_png_b64(), bbox=(10, 20, 100, 200), max_width=612, max_height=792) mock_extract.return_value = [mock_img] doc = pdfium.PdfDocument.new() @@ -74,7 +74,7 @@ def test_pdf_extraction_populates_images(mock_extract): def test_explode_includes_captioned_images(): from nemo_retriever.ingest_modes.inprocess import explode_content_to_rows - b64 = _make_1x1_png_b64() + b64 = _make_test_png_b64() df = pd.DataFrame([{ "text": "page", "page_image": {"image_b64": b64}, @@ -105,3 +105,16 @@ def test_context_text_prepended_to_prompt(): call_kwargs = mock_model.caption_batch.call_args[1] assert "quick brown fox" in call_kwargs["prompt"] assert "Text near this image:" in call_kwargs["prompt"] + + +def test_caption_images_skips_small_images(): + from nemo_retriever.caption.caption import caption_images + + tiny_b64 = _make_test_png_b64(size=(1, 1)) + images = [{"bbox_xyxy_norm": [0.1, 0.2, 0.5, 0.8], "text": "", "image_b64": tiny_b64}] + df = pd.DataFrame([{"text": "page", "images": images, "tables": [], "charts": [], "infographics": []}]) + + mock_model = MagicMock() + result = caption_images(df, model=mock_model) + mock_model.caption_batch.assert_not_called() + assert result.iloc[0]["images"][0]["text"] == "" From ae086798db1523654d8145d8cb699ed813bd0335 Mon Sep 17 00:00:00 2001 From: edknv Date: Mon, 23 Mar 2026 19:15:08 -0700 Subject: [PATCH 19/20] simplify --- .../src/nemo_retriever/caption/caption.py | 78 ++++++++----------- .../model/local/nemotron_vlm_captioner.py | 11 +-- 2 files changed, 38 insertions(+), 51 deletions(-) diff --git a/nemo_retriever/src/nemo_retriever/caption/caption.py b/nemo_retriever/src/nemo_retriever/caption/caption.py index 2b73eddc5..ee0c41efb 100644 --- a/nemo_retriever/src/nemo_retriever/caption/caption.py +++ b/nemo_retriever/src/nemo_retriever/caption/caption.py @@ -29,18 +29,22 @@ def _image_meets_min_size(b64: str) -> bool: return False +def _create_local_model(kwargs: dict) -> "Any": + from nemo_retriever.model.local 
import NemotronVLMCaptioner + + return NemotronVLMCaptioner( + model_path=kwargs.get("model_name", _DEFAULT_MODEL_NAME), + device=kwargs.get("device"), + hf_cache_dir=kwargs.get("hf_cache_dir"), + tensor_parallel_size=kwargs.get("tensor_parallel_size", 1), + gpu_memory_utilization=kwargs.get("gpu_memory_utilization", 0.8), + ) + + def _get_cached_local_model(kwargs: dict) -> "Any": global _cached_local_model if _cached_local_model is None: - from nemo_retriever.model.local import NemotronVLMCaptioner - - _cached_local_model = NemotronVLMCaptioner( - model_path=kwargs.get("model_name", _DEFAULT_MODEL_NAME), - device=kwargs.get("device"), - hf_cache_dir=kwargs.get("hf_cache_dir"), - tensor_parallel_size=kwargs.get("tensor_parallel_size", 1), - gpu_memory_utilization=kwargs.get("gpu_memory_utilization", 0.9), - ) + _cached_local_model = _create_local_model(kwargs) return _cached_local_model @@ -58,15 +62,7 @@ def __init__(self, params: CaptionParams) -> None: if endpoint: self._model = None else: - from nemo_retriever.model.local import NemotronVLMCaptioner - - self._model = NemotronVLMCaptioner( - model_path=self._kwargs.get("model_name", _DEFAULT_MODEL_NAME), - device=self._kwargs.get("device"), - hf_cache_dir=self._kwargs.get("hf_cache_dir"), - tensor_parallel_size=self._kwargs.get("tensor_parallel_size", 1), - gpu_memory_utilization=self._kwargs.get("gpu_memory_utilization", 0.9), - ) + self._model = _create_local_model(self._kwargs) def __call__(self, batch_df: Any) -> Any: return caption_images(batch_df, model=self._model, **self._kwargs) @@ -82,19 +78,29 @@ def _build_prompt_with_context(base_prompt: str, context_text: str) -> str: return f"Text near this image:\n---\n{context_text}\n---\n\n{base_prompt}" +def _create_remote_client(endpoint_url: str, api_key: str | None) -> Any: + """Create a reusable NIM inference client for a remote VLM endpoint.""" + from nv_ingest_api.internal.primitives.nim.model_interface.vlm import VLMModelInterface + from 
nv_ingest_api.util.nim import create_inference_client + + return create_inference_client( + model_interface=VLMModelInterface(), + endpoints=(None, endpoint_url), + auth_token=api_key, + infer_protocol="http", + ) + + def _caption_batch_remote( base64_images: List[str], *, - endpoint_url: str, + nim_client: Any, model_name: str, - api_key: str | None, prompt: str, system_prompt: str | None, temperature: float, ) -> List[str]: """Send a batch of images to a remote VLM endpoint and return captions.""" - from nv_ingest_api.internal.primitives.nim.model_interface.vlm import VLMModelInterface - from nv_ingest_api.util.nim import create_inference_client from nv_ingest_api.util.image_processing.transforms import scale_image_to_encoding_size scaled = [scale_image_to_encoding_size(b64)[0] for b64 in base64_images] @@ -106,12 +112,6 @@ def _caption_batch_remote( if system_prompt: data["system_prompt"] = system_prompt - nim_client = create_inference_client( - model_interface=VLMModelInterface(), - endpoints=(None, endpoint_url), - auth_token=api_key, - infer_protocol="http", - ) return nim_client.infer(data, model_name=model_name, temperature=temperature) @@ -136,9 +136,8 @@ def _caption_one( b64: str, *, model: Any, - endpoint_url: str | None, + nim_client: Any | None, model_name: str, - api_key: str | None, prompt: str, system_prompt: str | None, temperature: float, @@ -155,9 +154,8 @@ def _caption_one( else: captions = _caption_batch_remote( [b64], - endpoint_url=endpoint_url, # type: ignore[arg-type] + nim_client=nim_client, model_name=model_name, - api_key=api_key, prompt=prompt, system_prompt=system_prompt, temperature=temperature, @@ -203,14 +201,13 @@ def caption_images( return batch_df if model is None and not endpoint_url: - # Lazy model creation for the sequential (no GPU pool) fallback. - # Cache the model so it is not re-created on every call. 
model = _get_cached_local_model(kwargs) + nim_client = _create_remote_client(endpoint_url, api_key) if endpoint_url and model is None else None + use_context = context_text_max_chars > 0 effective_max = min(context_text_max_chars, _MAX_CONTEXT_TEXT_CHARS) if use_context else 0 - # Collect all (row_idx, item_idx, image_b64) needing captions. pending: List[Tuple[int, int, str]] = [] for row_idx, row in batch_df.iterrows(): images = row.get("images") @@ -229,7 +226,6 @@ def caption_images( return batch_df if use_context: - # Each image gets a per-page enriched prompt, so caption one at a time. for row_idx, item_idx, b64 in pending: page_text = batch_df.at[row_idx, "text"] if "text" in batch_df.columns else "" context = (page_text or "")[:effective_max] @@ -237,21 +233,17 @@ def caption_images( caption = _caption_one( b64, model=model, - endpoint_url=endpoint_url, + nim_client=nim_client, model_name=model_name, - api_key=api_key, prompt=enriched_prompt, system_prompt=system_prompt, temperature=temperature, ) batch_df.at[row_idx, "images"][item_idx]["text"] = caption else: - # Batch mode: all images share the same prompt. all_b64 = [b64 for _, _, b64 in pending] if model is not None: - # Submit all at once — vLLM schedules internally based on - # available GPU memory. all_captions = _caption_batch_local( all_b64, model=model, @@ -260,14 +252,12 @@ def caption_images( temperature=temperature, ) else: - # Remote endpoints may have request-size limits; chunk. 
all_captions: List[str] = [] for start in range(0, len(all_b64), batch_size): captions = _caption_batch_remote( all_b64[start : start + batch_size], - endpoint_url=endpoint_url, # type: ignore[arg-type] + nim_client=nim_client, model_name=model_name, - api_key=api_key, prompt=prompt, system_prompt=system_prompt, temperature=temperature, diff --git a/nemo_retriever/src/nemo_retriever/model/local/nemotron_vlm_captioner.py b/nemo_retriever/src/nemo_retriever/model/local/nemotron_vlm_captioner.py index 9279ccd6e..7d329054e 100644 --- a/nemo_retriever/src/nemo_retriever/model/local/nemotron_vlm_captioner.py +++ b/nemo_retriever/src/nemo_retriever/model/local/nemotron_vlm_captioner.py @@ -156,12 +156,9 @@ def caption( temperature: float = 1.0, ) -> str: """Generate a caption for a single base64-encoded image.""" - from vllm import SamplingParams - - messages = self._build_messages(base64_image, prompt=prompt, system_prompt=system_prompt) - sampling_params = SamplingParams(temperature=temperature, max_tokens=self._max_new_tokens) - outputs = self._llm.chat([messages], sampling_params=sampling_params) - return outputs[0].outputs[0].text.strip() + return self.caption_batch([base64_image], prompt=prompt, system_prompt=system_prompt, temperature=temperature)[ + 0 + ] def caption_batch( self, @@ -186,7 +183,7 @@ def caption_batch( @property def model_name(self) -> str: - return "NVIDIA-Nemotron-Nano-12B-v2-VL" + return self._model_path @property def model_type(self) -> str: From 92779f8b00a9e5c6fc0d9780572a694ba199c140 Mon Sep 17 00:00:00 2001 From: edknv Date: Mon, 23 Mar 2026 21:55:01 -0700 Subject: [PATCH 20/20] consistent default gpu mem util --- nemo_retriever/src/nemo_retriever/caption/caption.py | 2 +- .../src/nemo_retriever/model/local/nemotron_vlm_captioner.py | 2 +- nemo_retriever/src/nemo_retriever/params/models.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/nemo_retriever/src/nemo_retriever/caption/caption.py 
b/nemo_retriever/src/nemo_retriever/caption/caption.py index ee0c41efb..b55dea563 100644 --- a/nemo_retriever/src/nemo_retriever/caption/caption.py +++ b/nemo_retriever/src/nemo_retriever/caption/caption.py @@ -37,7 +37,7 @@ def _create_local_model(kwargs: dict) -> "Any": device=kwargs.get("device"), hf_cache_dir=kwargs.get("hf_cache_dir"), tensor_parallel_size=kwargs.get("tensor_parallel_size", 1), - gpu_memory_utilization=kwargs.get("gpu_memory_utilization", 0.8), + gpu_memory_utilization=kwargs.get("gpu_memory_utilization", 0.5), ) diff --git a/nemo_retriever/src/nemo_retriever/model/local/nemotron_vlm_captioner.py b/nemo_retriever/src/nemo_retriever/model/local/nemotron_vlm_captioner.py index 7d329054e..14b814381 100644 --- a/nemo_retriever/src/nemo_retriever/model/local/nemotron_vlm_captioner.py +++ b/nemo_retriever/src/nemo_retriever/model/local/nemotron_vlm_captioner.py @@ -75,7 +75,7 @@ def __init__( hf_cache_dir: Optional[str] = None, max_new_tokens: int = 1024, tensor_parallel_size: int = 1, - gpu_memory_utilization: float = 0.8, + gpu_memory_utilization: float = 0.5, ) -> None: super().__init__() diff --git a/nemo_retriever/src/nemo_retriever/params/models.py b/nemo_retriever/src/nemo_retriever/params/models.py index 8c16b388e..8b92975db 100644 --- a/nemo_retriever/src/nemo_retriever/params/models.py +++ b/nemo_retriever/src/nemo_retriever/params/models.py @@ -314,7 +314,7 @@ class CaptionParams(_ParamsModel): hf_cache_dir: Optional[str] = None context_text_max_chars: int = 0 tensor_parallel_size: int = 1 - gpu_memory_utilization: float = 0.8 + gpu_memory_utilization: float = 0.5 class InfographicParams(_ParamsModel):