From b31eb3539f0d60a8fda5a9c1887f7149ad8d621e Mon Sep 17 00:00:00 2001 From: splint-disk-8i <259054981+splint-disk-8i@users.noreply.github.com> Date: Sun, 15 Mar 2026 11:38:15 +0200 Subject: [PATCH 1/2] docs: fix double colon typo in README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index f769225f8..e2837fcd1 100644 --- a/README.md +++ b/README.md @@ -41,7 +41,7 @@ The following diagram shows the NeMo Retriever Library pipeline. ## What is NeMo Retriever Library? -The NeMo Retriever Library is a library and microservice framework designed to perform the following functions:: +The NeMo Retriever Library is a library and microservice framework designed to perform the following functions: - Accept a job specification that contains a document payload and a set of ingestion tasks to perform on that payload. - Store the result of each job to retrieve later. The result is a dictionary that contains a list of metadata that describes the objects extracted from the base document, and processing annotations and timing/trace data. 
From 9ac01b751253e85babc56ccee0c20b2428ca44a4 Mon Sep 17 00:00:00 2001 From: splint-disk-8i <259054981+splint-disk-8i@users.noreply.github.com> Date: Sun, 15 Mar 2026 11:59:23 +0200 Subject: [PATCH 2/2] docs: fill in CONTRIBUTING.md TODO, clean up print_env.sh - Replace TODO(Devin) placeholder with guide on adding new stages (refs #16) - Fix inconsistent headers and indentation in print_env.sh - Quote variables in printf to handle paths with spaces - Add Docker version to environment report --- CONTRIBUTING.md | 22 +++++++- scripts/support/print_env.sh | 102 ++++++++++++++++++----------------- 2 files changed, 74 insertions(+), 50 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 1f549469e..7709c93b9 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -245,7 +245,27 @@ ensures that no further processing is attempted on a failed message, maintaining ### Adding a New Stage or Module -#### TODO(Devin): Add details about adding a new stage or module once we have router node functionality in place. +New stages and modules follow the project's pipeline architecture. At a high level, each stage is a self-contained +processing unit that receives a `ControlMessage`, performs its work, and returns the modified message. When adding +a new stage: + +1. **Create the module** under `src/nv_ingest/stages/` following the existing directory structure. Your module should + mirror the pattern used by existing stages (e.g., `pdf_extractor`, `chart_extractor`). + +2. **Use the standard decorators** to integrate with the pipeline: + - `@filter_by_task` to ensure the stage only processes messages containing the relevant task type + - `@nv_ingest_node_failure_context_manager` to handle errors consistently + - `@traceable` for tracing and timing instrumentation + +3. **Register the stage** in the pipeline configuration so it can be discovered and wired into the processing graph. + +4. 
**Add unit tests** under `tests/` in a path that mirrors the module location (see + [Common Practices for Writing Unit Tests](#common-practices-for-writing-unit-tests) below). + +5. **Update documentation** to describe the new stage's purpose, inputs, outputs, and any configuration parameters. + +> **Note:** The router node functionality is still being finalized. This section will be expanded with more details +> on dynamic routing and stage registration once that work is complete. See [#16](https://github.com/NVIDIA/nv-ingest/issues/16) for tracking. ### Common Practices for Writing Unit Tests diff --git a/scripts/support/print_env.sh b/scripts/support/print_env.sh index 9e669cefe..59109ef09 100644 --- a/scripts/support/print_env.sh +++ b/scripts/support/print_env.sh @@ -1,76 +1,80 @@ #!/usr/bin/env bash # Copyright (c) 2024, NVIDIA CORPORATION. # Reports relevant environment information useful for diagnosing and -# debugging NVIDIA Ingest issues. +# debugging NeMo Retriever issues. # Usage: # "./print_env.sh" - prints to stdout # "./print_env.sh > env.txt" - prints to file "env.txt" print_env() { -echo "**git***" -if [ "$(git rev-parse --is-inside-work-tree 2>/dev/null)" == "true" ]; then -git log --decorate -n 1 -echo "**git submodules***" -git submodule status --recursive -else -echo "Not inside a git repository" -fi -echo + echo "***git***" + if [ "$(git rev-parse --is-inside-work-tree 2>/dev/null)" == "true" ]; then + git log --decorate -n 1 + echo "***git submodules***" + git submodule status --recursive + else + echo "Not inside a git repository" + fi + echo -echo "***OS Information***" -cat /etc/*-release -uname -a -echo + echo "***OS Information***" + cat /etc/*-release + uname -a + echo -echo "***GPU Information***" -nvidia-smi -echo + echo "***GPU Information***" + nvidia-smi + echo -echo "***CPU***" -lscpu -echo + echo "***CPU***" + lscpu + echo -echo "***CMake***" -which cmake && cmake --version -echo + echo "***Docker***" + which docker && docker 
--version + echo -echo "***g++***" -which g++ && g++ --version -echo + echo "***CMake***" + which cmake && cmake --version + echo -echo "***nvcc***" -which nvcc && nvcc --version -echo + echo "***g++***" + which g++ && g++ --version + echo -echo "***Python***" -which python && python -c "import sys; print('Python {0}.{1}.{2}'.format(sys.version_info[0], sys.version_info[1], sys.version_info[2]))" -echo + echo "***nvcc***" + which nvcc && nvcc --version + echo -echo "***Environment Variables***" + echo "***Python***" + which python && python -c "import sys; print('Python {0}.{1}.{2}'.format(sys.version_info[0], sys.version_info[1], sys.version_info[2]))" + echo -printf '%-32s: %s\n' PATH $PATH + echo "***Environment Variables***" -printf '%-32s: %s\n' LD_LIBRARY_PATH $LD_LIBRARY_PATH + printf '%-32s: %s\n' PATH "$PATH" -printf '%-32s: %s\n' NUMBAPRO_NVVM $NUMBAPRO_NVVM + printf '%-32s: %s\n' LD_LIBRARY_PATH "$LD_LIBRARY_PATH" -printf '%-32s: %s\n' NUMBAPRO_LIBDEVICE $NUMBAPRO_LIBDEVICE + printf '%-32s: %s\n' NUMBAPRO_NVVM "$NUMBAPRO_NVVM" -printf '%-32s: %s\n' VIRTUAL_ENV $VIRTUAL_ENV + printf '%-32s: %s\n' NUMBAPRO_LIBDEVICE "$NUMBAPRO_LIBDEVICE" -printf '%-32s: %s\n' PYTHON_PATH $PYTHON_PATH + printf '%-32s: %s\n' VIRTUAL_ENV "$VIRTUAL_ENV" -echo + printf '%-32s: %s\n' PYTHON_PATH "$PYTHON_PATH" + echo -# Print pip packages if pip exists -if type "pip" &> /dev/null; then -echo "***pip packages***" -which pip && pip list -echo -else -echo "pip not found" -fi + + # Print pip packages if pip exists + if type "pip" &> /dev/null; then + echo "***pip packages***" + which pip && pip list + echo + else + echo "pip not found" + fi } echo "
<details><summary>Click here to see environment details</summary><pre>"
@@ -78,4 +82,4 @@ echo "     "
 print_env | while read -r line; do
     echo "     $line"
 done
-echo "</pre></details>"
+echo "</pre></details>"
\ No newline at end of file