NVIDIA · cliffburdick · Jun 2, 2026 · Jun 1, 2026 · Jun 1, 2026 · Jun 1, 2026
diff --git a/AGENTS.md b/AGENTS.md
@@ -18,6 +18,7 @@ CMake options (full table in `docs/getting-started.md`):
 - `DAQIRI_MGR` — space-separated backend list. Valid values: `dpdk`, `socket`, `rdma`. Default in `src/CMakeLists.txt:137` is `"dpdk socket"` (which, due to the rule below, effectively builds all three).
 - `DAQIRI_BUILD_PYTHON` — builds `pybind11` bindings from `python/`.
 - `DAQIRI_BUILD_EXAMPLES` — builds the benchmark executables (default `ON`).
+- `DAQIRI_ENABLE_OTEL_METRICS` — enables OpenTelemetry metrics instrumentation (default `OFF`).
 - `DAQIRI_REORDER_GPU_PROFILE` — enable CUDA event timing in the DPDK reorder kernels (off by default).
 
 CUDA architectures are hardcoded to `80;90;121` (A100, H100, GB10) in `src/CMakeLists.txt:25`. Change this when targeting other GPUs.

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -21,6 +21,7 @@ find_package(CUDAToolkit REQUIRED)
 
 option(DAQIRI_BUILD_PYTHON "Build Python bindings" OFF)
 option(DAQIRI_BUILD_EXAMPLES "Build standalone examples" ON)
+option(DAQIRI_ENABLE_OTEL_METRICS "Enable OpenTelemetry metrics instrumentation" OFF)
 set(DAQIRI_MGR "dpdk socket" CACHE STRING "Manager backend list")
 
 add_subdirectory(src)
@@ -83,6 +84,9 @@ endif()
 set(DAQIRI_PC_REQUIRES "")
 set(DAQIRI_PC_REQUIRES_PRIVATE "libdpdk")
 set(DAQIRI_PC_LIBS "-lyaml-cpp")
+if(DAQIRI_ENABLE_OTEL_METRICS)  # the OpenTelemetry C++ API is header-only, so there is no libopentelemetry_api for -l
+  string(APPEND DAQIRI_PC_REQUIRES " opentelemetry_api")  # pull flags from its pkg-config module instead
+endif()
 separate_arguments(DAQIRI_PC_MGR_LIST UNIX_COMMAND "${DAQIRI_MGR}")
 list(FIND DAQIRI_PC_MGR_LIST "socket" DAQIRI_PC_HAS_SOCKET_IDX)
 list(FIND DAQIRI_PC_MGR_LIST "rdma" DAQIRI_PC_HAS_RDMA_IDX)

diff --git a/Dockerfile b/Dockerfile
@@ -19,6 +19,7 @@ ARG DAQIRI_BASE_TARGET=dpdk
 ARG DAQIRI_MGR="dpdk socket"
 ARG DAQIRI_BUILD_PYTHON=OFF
 ARG BUILD_SHARED_LIBS=ON
+ARG DAQIRI_ENABLE_OTEL_METRICS=OFF
 ARG DAQIRI_OS_BASE_IMAGE=nvcr.io/nvidia/cuda:13.1.0-devel-ubuntu24.04
 
 # ============================================================
@@ -172,36 +173,32 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
     mft \
     && rm -rf /var/lib/apt/lists/*
 
+# Build and install opentelemetry-cpp (with the Prometheus exporter) for metrics-enabled images only.
+# The same ARG is forwarded verbatim to the DAQIRI CMake configure step, so accept the common
+# CMake truthy spellings here; otherwise e.g. "1" would enable metrics without installing this dependency.
+ARG DAQIRI_ENABLE_OTEL_METRICS
+ARG OPENTELEMETRY_CPP_VERSION=v1.27.0
+RUN case "${DAQIRI_ENABLE_OTEL_METRICS}" in ON|on|On|TRUE|true|True|YES|yes|Yes|Y|y|1) \
+      git clone --depth 1 --branch "${OPENTELEMETRY_CPP_VERSION}" \
+        https://github.com/open-telemetry/opentelemetry-cpp.git /tmp/opentelemetry-cpp \
+      && cmake -S /tmp/opentelemetry-cpp -B /tmp/opentelemetry-cpp-build \
+        -DCMAKE_BUILD_TYPE=Release \
+        -DCMAKE_INSTALL_PREFIX=/usr/local \
+        -DBUILD_TESTING=OFF -DWITH_EXAMPLES=OFF \
+        -DWITH_OTLP=OFF -DWITH_ZIPKIN=OFF -DWITH_ABSEIL=OFF \
+        -DWITH_PROMETHEUS=ON \
+        -DWITH_STL=CXX17 \
+      && cmake --build /tmp/opentelemetry-cpp-build --target install -j "$(nproc)" \
+      && ldconfig \
+      && rm -rf /tmp/opentelemetry-cpp /tmp/opentelemetry-cpp-build ;; \
+    esac
+
 # ==============================================================
 # rdma: Named target for consistent per-manager container builds.
 # Identical to dpdk (which already includes RDMA/ibverbs deps).
 # ==============================================================
 FROM dpdk AS rdma
 
-# ==============================================================
-# gpunetio: Add DOCA SDK packages for GPUNetIO support
-# ==============================================================
-FROM dpdk AS gpunetio
-
-# Install DOCA SDK packages required for GPUNetIO
-# (DOCA repo is already configured in dpdk stage)
-# - libdoca-sdk-gpunetio-dev: for gpunetio backend (doca-gpunetio module)
-# - libdoca-sdk-eth-dev: for gpunetio backend (doca-eth module)
-# - libdoca-sdk-flow-dev: for gpunetio backend (doca-flow module)
-RUN apt-get update && apt-get install -y --no-install-recommends \
-        mlnx-dpdk-dev \
-        libdoca-sdk-gpunetio-dev \
-        libdoca-sdk-eth-dev \
-        libdoca-sdk-flow-dev \
-        mlnx-ofed-kernel-utils \
-    && rm -rf /var/lib/apt/lists/*
-
-RUN git clone https://github.com/NVIDIA/gdrcopy.git /opt/mellanox/gdrcopy \
-    && cd /opt/mellanox/gdrcopy \
-    && make lib
-
-ENV GDRCOPY_PATH_L=/opt/mellanox/gdrcopy/src
-
 # ==============================
 # Rivermax Target
 # This stage is only built when --target rivermax is specified. It installs and configures Rivermax SDK.
@@ -281,6 +278,7 @@ FROM ${DAQIRI_BASE_TARGET} AS daqiri-build
 ARG DAQIRI_MGR
 ARG DAQIRI_BUILD_PYTHON
 ARG BUILD_SHARED_LIBS
+ARG DAQIRI_ENABLE_OTEL_METRICS
 
 WORKDIR /workspace/daqiri
 COPY . .
@@ -292,6 +290,7 @@ RUN cmake -S . -B build \
       -DCMAKE_CUDA_ARCHITECTURES=all-major \
       -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBS} \
       -DDAQIRI_BUILD_PYTHON=${DAQIRI_BUILD_PYTHON} \
+      -DDAQIRI_ENABLE_OTEL_METRICS=${DAQIRI_ENABLE_OTEL_METRICS} \
       -DDAQIRI_MGR="${DAQIRI_MGR}" \
     && cmake --build build -j "$(nproc)" \
     && cmake --install build
@@ -304,4 +303,5 @@ FROM ${DAQIRI_BASE_TARGET} AS runtime
 COPY --from=daqiri-build /opt/daqiri /opt/daqiri
 ENV CMAKE_PREFIX_PATH=/opt/daqiri
 ENV LD_LIBRARY_PATH=/opt/daqiri/lib
+EXPOSE 9464
 WORKDIR /opt/daqiri
diff --git a/README.md b/README.md
@@ -26,6 +26,8 @@ DAQIRI provides direct NIC hardware access in userspace, bypassing the Linux ker
 - **Flow Steering** — Configure the NIC's hardware flow engine to route packets by UDP
   source/destination port.
 - **RDMA** — RDMA verbs (READ, WRITE, SEND) over RoCE on Ethernet NICs or InfiniBand.
+- **Optional OpenTelemetry metrics** — Expose per-interface or per-queue packet,
+  byte, and drop counters when built with `DAQIRI_ENABLE_OTEL_METRICS=ON`.
 
 ### Backends
 
@@ -71,6 +73,11 @@ Container build:
 BASE_TARGET=dpdk DAQIRI_MGR="dpdk rdma" scripts/build-container.sh
 ```
 
+OpenTelemetry metrics are opt-in. Pass `-DDAQIRI_ENABLE_OTEL_METRICS=ON` to CMake,
+or set the `DAQIRI_ENABLE_OTEL_METRICS=ON` environment variable when running
+`scripts/build-container.sh`. DAQIRI only registers the instruments; applications
+must configure the OpenTelemetry SDK and exporters themselves.
+
 See [Getting Started](https://nvidia.github.io/daqiri/getting-started/) for requirements, CMake options, and
 running the benchmarks.
 

diff --git a/cmake/daqiriConfig.cmake.in b/cmake/daqiriConfig.cmake.in
@@ -2,5 +2,9 @@
 
 include(CMakeFindDependencyMacro)
 find_dependency(CUDAToolkit)
+set(DAQIRI_ENABLE_OTEL_METRICS @DAQIRI_ENABLE_OTEL_METRICS@)  # placeholder is replaced with the build-time option value when this template is configured
+if(DAQIRI_ENABLE_OTEL_METRICS)
+  find_dependency(opentelemetry-cpp CONFIG COMPONENTS api)  # metrics-enabled installs also require the OpenTelemetry API component
+endif()
 
 include("${CMAKE_CURRENT_LIST_DIR}/daqiriTargets.cmake")
diff --git a/docs/api-reference/configuration.md b/docs/api-reference/configuration.md
@@ -7,6 +7,11 @@ want to interoperate with existing configuration code.
 
 See `examples/daqiri_bench_*.yaml` for complete working examples.
 
+OpenTelemetry metrics do not add YAML fields. Metrics-enabled builds use the
+same interface, queue, and flow names from the active configuration as metric
+labels, and applications are still responsible for configuring the OpenTelemetry
+SDK/exporter before running DAQIRI.
+
 ## Common Configuration
 
 These settings apply globally to both TX and RX:

diff --git a/docs/api-reference/cpp.md b/docs/api-reference/cpp.md
@@ -348,6 +348,26 @@ daqiri::print_stats();
 daqiri::shutdown();
 ```
 
+## OpenTelemetry Metrics
+
+OpenTelemetry metrics are disabled by default and add no runtime instrumentation when
+DAQIRI is built without `DAQIRI_ENABLE_OTEL_METRICS=ON`. Metrics-enabled builds register
+observable counters through the OpenTelemetry C++ API:
+
+| Metric | Unit |
+| --- | --- |
+| `daqiri.rx.packets` | `{packet}` |
+| `daqiri.tx.packets` | `{packet}` |
+| `daqiri.rx.bytes` | `By` |
+| `daqiri.tx.bytes` | `By` |
+| `daqiri.dropped.packets` | `{packet}` |
+
+All metrics include `daqiri.backend`, `daqiri.interface.name`, `daqiri.port.id`, and
+`daqiri.queue.id`. Drop metrics also include `daqiri.drop.reason`.
+
+DAQIRI only owns library instrumentation. Applications remain responsible for
+configuring the OpenTelemetry C++ SDK, metric readers, and exporters.
+
 ## Function Reference
 
 This section summarizes the C++ functions available through `daqiri/daqiri.h`. The

diff --git a/docs/getting-started.md b/docs/getting-started.md
@@ -87,6 +87,12 @@ Then build the DAQIRI library:
     BASE_IMAGE=torch BASE_TARGET=dpdk DAQIRI_MGR="dpdk socket rdma" scripts/build-container.sh
     ```
 
+    OpenTelemetry metrics are optional. Enable them with:
+
+    ```bash
+    DAQIRI_ENABLE_OTEL_METRICS=ON BASE_TARGET=dpdk DAQIRI_MGR="dpdk socket rdma" scripts/build-container.sh
+    ```
+
 === "CMake build (bare-metal)"
 
     ```bash
@@ -128,6 +134,7 @@ Both methods use the same public C++ include:
 | `DAQIRI_BUILD_PYTHON` | `OFF` | Build pybind11 Python bindings. |
 | `DAQIRI_BUILD_EXAMPLES` | `ON` | Build benchmark executables. |
 | `DAQIRI_ENABLE_GDS` | `OFF` | Enable cuFile-backed burst file writes from CUDA device memory. Host-memory writes use POSIX APIs without GDS. |
+| `DAQIRI_ENABLE_OTEL_METRICS` | `OFF` | Enable OpenTelemetry C++ metrics instrumentation. When enabled, the `opentelemetry-cpp` CMake package (at least its `api` component) must be discoverable by `find_package`. |
 | `BUILD_SHARED_LIBS` | — | Build as shared library. |
 
 CUDA architectures are hardcoded to `80;90;121` (A100, H100, GB10) in `src/CMakeLists.txt`.
@@ -146,6 +153,11 @@ GDS-supported filesystem such as XFS. If `nvidia-fs` is not loaded, or the desti
 storage is not supported, DAQIRI returns `NOT_SUPPORTED` for CUDA device-backed burst
 writes. Host-backed burst writes continue to use POSIX APIs and do not require GDS.
 
+OpenTelemetry metrics builds register observable counters for received packets,
+transmitted packets, received bytes, transmitted bytes, and dropped packets. DAQIRI
+does not configure an SDK reader or exporter; applications that want exported data
+must configure the OpenTelemetry C++ SDK before or during DAQIRI initialization.
+
 ## Next Steps
 
 Once DAQIRI is built, follow the tutorials to configure your system and run your first benchmark:

diff --git a/docs/tutorials/benchmarking_examples.md b/docs/tutorials/benchmarking_examples.md
@@ -169,6 +169,35 @@ After having modified the configuration file, ensure you have connected an SFP c
 
 By default the application runs for 10 seconds and then exits. You can change the duration by passing `--seconds <N>` after the YAML path, or stop it gracefully at any time with `Ctrl-C`.
 
+## Watch live OpenTelemetry metrics in Grafana
+
+DAQIRI can expose the raw benchmark counters through OpenTelemetry when metrics
+support is enabled at build time. The Grafana example uses the same benchmark
+binary and YAML files as the loopback test above, then starts Prometheus and
+Grafana beside the benchmark process.
+
+Build the container with metrics enabled:
+
+```bash
+DAQIRI_ENABLE_OTEL_METRICS=ON DAQIRI_MGR="dpdk socket rdma" scripts/build-container.sh
+```
+
+Before starting the stack, fill in the required `<placeholders>` in the benchmark
+YAML you plan to run. You can also pass a machine-local copy through
+`DAQIRI_CONFIG` so the tracked example YAML keeps its placeholder syntax.
+
+```bash
+cd examples/grafana
+DAQIRI_CONFIG=/workspace/daqiri/examples/daqiri_bench_raw_tx_rx.yaml \
+DAQIRI_SECONDS=60 \
+docker compose up
+```
+
+Prometheus scrapes `http://localhost:9464/metrics`, and Grafana serves the
+`DAQIRI OpenTelemetry Metrics` dashboard at `http://localhost:3000`. The
+throughput panel reports payload counter rates in `Gb/s` for each active
+interface and queue.
+
 ??? abstract "See an example output"
 
     ```log

diff --git a/docs/tutorials/configuration-walkthrough.md b/docs/tutorials/configuration-walkthrough.md
@@ -24,6 +24,10 @@ With a backend in mind, read down the questions below and stop at the first one
     - **DGX Spark / GB10** (prefilled) — [`daqiri_bench_raw_tx_rx_spark.yaml`](https://github.com/nvidia/daqiri/blob/main/examples/daqiri_bench_raw_tx_rx_spark.yaml). `kind: host_pinned` for the integrated GPU; cores, PCIe addresses, and IPs are prefilled. See the [Spark profile callout](benchmarking_examples.md#update-the-loopback-configuration) for run details.
     - **No physical NIC available** — [`daqiri_bench_raw_sw_loopback.yaml`](https://github.com/nvidia/daqiri/blob/main/examples/daqiri_bench_raw_sw_loopback.yaml). `loopback: "sw"`, no NIC required. Useful for first-time build verification, not representative of production performance.
 
+    To watch the same raw loopback benchmark with live Prometheus and Grafana
+    counters, use the Grafana compose stack described in
+    [Watch live OpenTelemetry metrics in Grafana](benchmarking_examples.md#watch-live-opentelemetry-metrics-in-grafana).
+
     **RDMA / RoCE** — runs on `daqiri_bench_rdma` (use `--mode {tx,rx,both}`). Configs use `kind: host_pinned` regardless of platform.
 
     - **Generic** (template — replace IPs) — [`daqiri_bench_rdma_tx_rx.yaml`](https://github.com/nvidia/daqiri/blob/main/examples/daqiri_bench_rdma_tx_rx.yaml).

diff --git a/docs/tutorials/system_configuration.md b/docs/tutorials/system_configuration.md
@@ -812,10 +812,6 @@ DAQIRI requires an [**NVIDIA SmartNIC**](https://www.nvidia.com/en-us/networking
 
     ### Step 5: Isolate CPU cores
 
-    !!! note
-
-        This optimization is less impactful when using the `gpunetio` backend since the GPU polls the NIC.
-
     The CPU interacting with the NIC to route packets is sensitive to perturbations, especially with smaller packet/batch sizes requiring more frequent work. Isolating a CPU in Linux prevents unwanted user or kernel threads from running on it, reducing context switching and latency spikes from noisy neighbors.
 
     We recommend isolating the CPU cores you will select to interact with the NIC (defined in the `daqiri` configuration [described in the configuration reference](configuration-walkthrough.md) in this tutorial). This is done by setting additional flags on the kernel bootline.

diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
@@ -67,6 +67,19 @@ endfunction()
 
 add_daqiri_raw_bench(daqiri_bench_raw_hds raw_hds_bench.cpp)
 add_daqiri_raw_bench(daqiri_bench_raw_gpudirect raw_gpudirect_bench.cpp)
+# Optional Prometheus/Grafana live-metrics wiring for daqiri_bench_raw_gpudirect.
+if(DAQIRI_ENABLE_OTEL_METRICS)
+  find_package(opentelemetry-cpp CONFIG QUIET COMPONENTS sdk exporters_prometheus)
+  if(NOT TARGET opentelemetry-cpp::prometheus_exporter)
+    message(STATUS "OpenTelemetry Prometheus exporter not found; Grafana live metrics example disabled")
+  else()
+    # Exporter target is available: add the extra source, set the feature define, link SDK + exporter.
+    target_compile_definitions(daqiri_bench_raw_gpudirect PRIVATE DAQIRI_GRAFANA_PROMETHEUS=1)
+    target_sources(daqiri_bench_raw_gpudirect PRIVATE grafana/otel_prometheus.cpp)
+    target_link_libraries(daqiri_bench_raw_gpudirect
+      PRIVATE opentelemetry-cpp::sdk opentelemetry-cpp::prometheus_exporter)
+  endif()
+endif()
 add_daqiri_raw_bench(daqiri_bench_raw_reorder_seq raw_reorder_seq_bench.cpp)
 add_daqiri_raw_bench(daqiri_bench_raw_reorder_quantize raw_reorder_quantize_bench.cpp)
 add_daqiri_raw_bench(daqiri_example_gds_write gds_write_example.cpp)

diff --git a/examples/grafana/README.md b/examples/grafana/README.md
@@ -0,0 +1,62 @@
+# DAQIRI Grafana Metrics Example
+
+This example runs `daqiri_bench_raw_gpudirect` for 60 seconds with DAQIRI's
+OpenTelemetry metrics exposed through a Prometheus pull endpoint, scraped by
+Prometheus, and visualized in a local Grafana dashboard.
+
+## Ports
+
+| Service | URL |
+| --- | --- |
+| Grafana | <http://localhost:3000> |
+| Prometheus | <http://localhost:9090> |
+| DAQIRI metrics | <http://localhost:9464/metrics> |
+
+Grafana is provisioned with the `DAQIRI OpenTelemetry Metrics` dashboard and a
+Prometheus datasource. The default Grafana login is `admin` / `daqiri`; anonymous
+viewer access is also enabled for the local example.
+
+## Build
+
+Build `daqiri:local` with DAQIRI metrics and the OpenTelemetry Prometheus exporter:
+
+```bash
+DAQIRI_ENABLE_OTEL_METRICS=ON DAQIRI_MGR="dpdk socket rdma" scripts/build-container.sh
+```
+
+## Run
+
+Update `examples/daqiri_bench_raw_tx_rx.yaml` for your NIC, GPU, MAC, IP, and CPU
+core values. Then start the stack:
+
+```bash
+cd examples/grafana
+docker compose up
+```
+
+The DAQIRI benchmark container follows the repository run requirements from
+`AGENTS.md`: it runs as root with `privileged: true`, host networking, all NVIDIA
+GPUs exposed through the NVIDIA runtime, and `/dev/hugepages` mounted from the
+host.
+
+The DAQIRI service runs:
+
+```bash
+/opt/daqiri/bin/daqiri_bench_raw_gpudirect /workspace/daqiri/examples/daqiri_bench_raw_tx_rx.yaml --seconds 60
+```
+
+To use a different config or binary, override the corresponding environment variables (for example `DAQIRI_CONFIG`, or `DAQIRI_SECONDS` for the run duration):
+
+```bash
+DAQIRI_CONFIG=/workspace/daqiri/examples/daqiri_bench_raw_tx_rx.yaml docker compose up
+```
+
+Stop the stack with:
+
+```bash
+docker compose down
+```
+
+The DAQIRI and Prometheus services use host networking so DPDK and the Prometheus
+scrape path can use the same host-visible network namespace. Grafana exposes port
+`3000` through Docker port mapping.