NVIDIA · splint-disk-8i · Mar 15, 2026 · Mar 15, 2026
@@ -245,7 +245,27 @@ ensures that no further processing is attempted on a failed message, maintaining
 
 ### Adding a New Stage or Module
 
-#### TODO(Devin): Add details about adding a new stage or module once we have router node functionality in place.
+New stages and modules follow the project's pipeline architecture. At a high level, each stage is a self-contained
+processing unit that receives a `ControlMessage`, performs its work, and returns the modified message. When adding
+a new stage:
+
+1. **Create the module** under `src/nv_ingest/stages/` following the existing directory structure. Your module should
+   mirror the pattern used by existing stages (e.g., `pdf_extractor`, `chart_extractor`).
+
+2. **Use the standard decorators** to integrate with the pipeline:
+   - `@filter_by_task` to ensure the stage processes only messages that contain the relevant task type
+   - `@nv_ingest_node_failure_context_manager` to handle errors consistently
+   - `@traceable` to add tracing and timing instrumentation
+
+3. **Register the stage** in the pipeline configuration so it can be discovered and wired into the processing graph.
+
+4. **Add unit tests** under `tests/` in a path that mirrors the module location (see
+   [Common Practices for Writing Unit Tests](#common-practices-for-writing-unit-tests) below).
+
+5. **Update documentation** to describe the new stage's purpose, inputs, outputs, and any configuration parameters.
+
+> **Note:** The router node functionality is still being finalized. This section will be expanded with more details
+> on dynamic routing and stage registration once that work is complete. See [#16](https://github.com/NVIDIA/nv-ingest/issues/16) for tracking.
 
 ### Common Practices for Writing Unit Tests
 

@@ -41,7 +41,7 @@ The following diagram shows the NeMo Retriever Library pipeline.
 
 ## What is NeMo Retriever Library?
 
-The NeMo Retriever Library is a library and microservice framework designed to perform the following functions::
+The NeMo Retriever Library is a library and microservice framework designed to perform the following functions:
 
 - Accept a job specification that contains a document payload and a set of ingestion tasks to perform on that payload.
 - Store the result of each job to retrieve later. The result is a dictionary that contains a list of metadata that describes the objects extracted from the base document, and processing annotations and timing/trace data.

@@ -1,81 +1,85 @@
 #!/usr/bin/env bash
 # Copyright (c) 2024, NVIDIA CORPORATION.
 # Reports relevant environment information useful for diagnosing and
-# debugging NVIDIA Ingest issues.
+# debugging NeMo Retriever issues.
 # Usage:
 # "./print_env.sh" - prints to stdout
 # "./print_env.sh > env.txt" - prints to file "env.txt"
 
 print_env() {
-echo "**git***"
-if [ "$(git rev-parse --is-inside-work-tree 2>/dev/null)" == "true" ]; then
-git log --decorate -n 1
-echo "**git submodules***"
-git submodule status --recursive
-else
-echo "Not inside a git repository"
-fi
-echo
+    echo "***git***"
+    if [ "$(git rev-parse --is-inside-work-tree 2>/dev/null)" == "true" ]; then
+        git log --decorate -n 1
+        echo "***git submodules***"
+        git submodule status --recursive
+    else
+        echo "Not inside a git repository"
+    fi
+    echo
 
-echo "***OS Information***"
-cat /etc/*-release
-uname -a
-echo
+    echo "***OS Information***"
+    cat /etc/*-release
+    uname -a
+    echo
 
-echo "***GPU Information***"
-nvidia-smi
-echo
+    echo "***GPU Information***"
+    nvidia-smi
+    echo
 
-echo "***CPU***"
-lscpu
-echo
+    echo "***CPU***"
+    lscpu
+    echo
 
-echo "***CMake***"
-which cmake && cmake --version
-echo
+    echo "***Docker***"
+    which docker && docker --version
+    echo
 
-echo "***g++***"
-which g++ && g++ --version
-echo
+    echo "***CMake***"
+    which cmake && cmake --version
+    echo
 
-echo "***nvcc***"
-which nvcc && nvcc --version
-echo
+    echo "***g++***"
+    which g++ && g++ --version
+    echo
 
-echo "***Python***"
-which python && python -c "import sys; print('Python {0}.{1}.{2}'.format(sys.version_info[0], sys.version_info[1], sys.version_info[2]))"
-echo
+    echo "***nvcc***"
+    which nvcc && nvcc --version
+    echo
 
-echo "***Environment Variables***"
+    echo "***Python***"
+    which python && python -c "import sys; print('Python {0}.{1}.{2}'.format(sys.version_info[0], sys.version_info[1], sys.version_info[2]))"
+    echo
 
-printf '%-32s: %s\n' PATH $PATH
+    echo "***Environment Variables***"
 
-printf '%-32s: %s\n' LD_LIBRARY_PATH $LD_LIBRARY_PATH
+    printf '%-32s: %s\n' PATH "$PATH"
 
-printf '%-32s: %s\n' NUMBAPRO_NVVM $NUMBAPRO_NVVM
+    printf '%-32s: %s\n' LD_LIBRARY_PATH "$LD_LIBRARY_PATH"
 
-printf '%-32s: %s\n' NUMBAPRO_LIBDEVICE $NUMBAPRO_LIBDEVICE
+    printf '%-32s: %s\n' NUMBAPRO_NVVM "$NUMBAPRO_NVVM"
 
-printf '%-32s: %s\n' VIRTUAL_ENV $VIRTUAL_ENV
+    printf '%-32s: %s\n' NUMBAPRO_LIBDEVICE "$NUMBAPRO_LIBDEVICE"
 
-printf '%-32s: %s\n' PYTHON_PATH $PYTHON_PATH
+    printf '%-32s: %s\n' VIRTUAL_ENV "$VIRTUAL_ENV"
 
-echo
+    printf '%-32s: %s\n' PYTHON_PATH "$PYTHON_PATH"
 
+    echo
 
-# Print pip packages if pip exists
-if type "pip" &> /dev/null; then
-echo "***pip packages***"
-which pip && pip list
-echo
-else
-echo "pip not found"
-fi
+
+    # Print pip packages if pip exists
+    if type "pip" &> /dev/null; then
+        echo "***pip packages***"
+        which pip && pip list
+        echo
+    else
+        echo "pip not found"
+    fi
 }
 
 echo "<details><summary>Click here to see environment details</summary><pre>"
 echo "     "
 print_env | while read -r line; do
     echo "     $line"
 done
-echo "</pre></details>"
+echo "</pre></details>"