diff --git a/.gitignore b/.gitignore
index e221700..472185d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,9 @@
 build*/
 site/
+bench-results/
+
+# tune_system.py default output
+pcie_schematic.png
 
 # macOS
 .DS_Store
diff --git a/examples/rdma_bench.cpp b/examples/rdma_bench.cpp
index 0a1c58f..7ff2446 100644
--- a/examples/rdma_bench.cpp
+++ b/examples/rdma_bench.cpp
@@ -67,7 +67,12 @@ RdmaBenchConfig parse_rdma_cfg(const YAML::Node& node) {
 
 void rdma_worker(const RdmaBenchConfig& cfg, daqiri::bench::TokenBucketPacer& pacer,
                  std::atomic<bool>& stop, RdmaWorkerStats& stats) {
-  static constexpr int kMaxOutstanding = 5;
+  // Matches the per-MR num_bufs in the YAML configs. Higher values deadlock
+  // the bench: post_req blocks in get_tx_packet_burst when the pool is empty,
+  // but free_tx_burst (which refills it) only runs later in the same loop
+  // iteration via get_rx_burst. Until the loop is refactored to interleave
+  // drain with post, this constant must stay <= num_bufs.
+  static constexpr int kMaxOutstanding = 20;
   int outstanding_send = 0;
   int outstanding_recv = 0;
   uint64_t send_wr_id = 0x1234;
diff --git a/examples/run_spark_bench.sh b/examples/run_spark_bench.sh
new file mode 100755
index 0000000..5b6478d
--- /dev/null
+++ b/examples/run_spark_bench.sh
@@ -0,0 +1,373 @@
+#!/usr/bin/env bash
+#
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Sweep wrapper for DAQIRI benchmarks on DGX Spark. Runs the bench across a
+# matrix of (payload/message size, batch size, target-gbps), captures per-run
+# CPU/GPU/NIC counters, and emits one CSV row per cell into bench-results/.
+#
+# Drop sources per backend (per the report methodology):
+#   DPDK    : grep imissed/ierrors/rx_nombuf from bench log (DAQIRI_LOG_INFO).
+#   RDMA    : grep "CQ error" lines from bench log (DAQIRI_LOG_ERROR).
+#   socket  : diff /proc/net/udp drops column (UDP); nstat -a (TCP retransmits).
+#
+# Usage:
+#   ./run_spark_bench.sh <backend> [mode]
+#     backend ∈ {dpdk, rdma, socket-udp, socket-tcp}
+#     mode    ∈ {smoke, sweep, drop-curve, drop-curve-matrix}  (default: smoke)
+#
+# Required environment in current shell:
+#   DAQIRI_BUILD_DIR — path to the cmake build dir (defaults to ../build).
+#   ETH_DST_ADDR     — required for dpdk backend (the RX iface MAC).
+#   RX_IFACE         — kernel name of the RX interface (e.g. enP2p1s0f1np1).
+#                       NOTE: not read by this script; socket-udp drops sum every row of /proc/net/udp.
+#
+# Run inside the project container as root (per AGENTS.md).
+
+set -u           # referencing an unset variable is an error
+set -o pipefail  # a pipeline fails when any stage fails; errexit is not enabled, so failures are checked explicitly
+
+# --------------------------------------------------------------------------
+# Configuration
+# --------------------------------------------------------------------------
+
+BACKEND="${1:-}"
+MODE="${2:-smoke}"
+if [[ -z "$BACKEND" ]]; then
+  echo "Usage: $0 <dpdk|rdma|socket-udp|socket-tcp> [smoke|sweep|drop-curve|drop-curve-matrix]" >&2
+  exit 1
+fi
+
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+BUILD_DIR="${DAQIRI_BUILD_DIR:-$SCRIPT_DIR/../build}"
+TS="$(date -u +%Y%m%dT%H%M%SZ)"
+OUT_DIR="$SCRIPT_DIR/../bench-results/$TS-$BACKEND-$MODE"
+mkdir -p "$OUT_DIR"
+# NOTE(review): OUT_DIR and the CSV header are created before BACKEND and MODE are validated further down.
+CSV="$OUT_DIR/runs.csv"
+echo "lang,backend,post_process,payload,batch,target_gbps,seconds,packets,bytes,pps,gbps,drops,drops_kind,cpu_master_pct,cpu_tx_pct,cpu_rx_pct,gpu_sm_pct,gpu_mem_pct" > "$CSV"
+
+# Capture slow-moving environment state once per result set (its exit status is not checked).
+"$SCRIPT_DIR/bench_capture_environment.sh" "$OUT_DIR"
+
+RUN_SECONDS=30                        # per-cell bench duration; also the nvidia-smi dmon sample count
+DRIVER_LOG="$OUT_DIR/last_run.stderr" # overwritten with the stderr of the most recent cell
+FAILURES=0                            # number of cells for which run_cell returned non-zero
+
+# Per-backend sweep matrices (see docs/performance-dgx-spark.md methodology).
+# Native-shape sizes are the leftmost entry; "matched 8K" cell is also included.
+case "$BACKEND" in
+  dpdk)
+    PAYLOADS_SWEEP=(8000 4096 1024 256 64)
+    BATCHES_SWEEP=(10240 4096 1024 256)
+    PAYLOADS_HEADLINE=(8000)
+    BATCHES_HEADLINE=(10240)
+    BASE_YAML="$SCRIPT_DIR/daqiri_bench_raw_tx_rx_spark.yaml"
+    BENCH_BIN="$BUILD_DIR/examples/daqiri_bench_raw_gpudirect"
+    CPU_MASTER=8; CPU_TX=17; CPU_RX=18  # cores whose busy% is reported in the cpu_*_pct columns
+    : "${ETH_DST_ADDR:?ETH_DST_ADDR must be set for dpdk backend (cat /sys/class/net/<rx-iface>/address)}"
+    ;;
+  rdma)
+    PAYLOADS_SWEEP=(8000000 1048576 65536 8192 4096)
+    BATCHES_SWEEP=(1)
+    PAYLOADS_HEADLINE=(8000000)
+    BATCHES_HEADLINE=(1)
+    BASE_YAML="$SCRIPT_DIR/daqiri_bench_rdma_tx_rx_spark.yaml"
+    BENCH_BIN="$BUILD_DIR/examples/daqiri_bench_rdma"
+    CPU_MASTER=8; CPU_TX=17; CPU_RX=18
+    ;;
+  socket-udp)
+    PAYLOADS_SWEEP=(1472 1024 256 64)
+    BATCHES_SWEEP=(1)
+    PAYLOADS_HEADLINE=(1472)
+    BATCHES_HEADLINE=(1)
+    BASE_YAML="$SCRIPT_DIR/daqiri_bench_socket_udp_tx_rx.yaml"
+    BENCH_BIN="$BUILD_DIR/examples/daqiri_bench_socket"
+    CPU_MASTER=8; CPU_TX=17; CPU_RX=18
+    ;;
+  socket-tcp)
+    PAYLOADS_SWEEP=(1048576 65536 1024)
+    BATCHES_SWEEP=(1)
+    PAYLOADS_HEADLINE=(65536)
+    BATCHES_HEADLINE=(1)
+    BASE_YAML="$SCRIPT_DIR/daqiri_bench_socket_tcp_tx_rx.yaml"
+    BENCH_BIN="$BUILD_DIR/examples/daqiri_bench_socket"
+    CPU_MASTER=8; CPU_TX=17; CPU_RX=18
+    ;;
+  *) echo "Unknown backend: $BACKEND" >&2; exit 1 ;;
+esac
+
+DROP_CURVE_TARGETS=(1 5 10 25 50 75 100 0)  # 0 means unpaced (line rate)
+
+# --------------------------------------------------------------------------
+# Helpers
+# --------------------------------------------------------------------------
+
+# Print the value of `<field-name>=` taken from the LAST line of <file> that
+# starts with <pattern-prefix>. usage: extract_field <pattern-prefix> <field-name> <file>
+extract_field() {
+  local line_re="^$1" token_re=" $2=[^ ]+" name="$2" src="$3"
+  grep -E "$line_re" "$src" | tail -n1 | grep -oE "$token_re" | sed -n -E "1s/.*$name=//p"
+}
+
+# Sum the imissed / ierrors / rx_nombuf counters found in <log>; for each counter only its last `<key>=<n>` match counts.
+parse_dpdk_drops() {
+  local log="$1" sum=0 key v
+  # `key` is declared local so the loop variable does not leak into the caller's scope.
+  for key in imissed ierrors rx_nombuf; do
+    v="$(grep -oE "$key=[0-9]+" "$log" 2>/dev/null | tail -n1 | sed -E "s/.*=//" || true)"
+    [[ -n "${v:-}" ]] && sum=$((sum + v))
+  done
+  echo "$sum"
+}
+
+# Count 'CQ error' lines in <log>. grep -c prints "0" yet exits 1 on no match, so `|| echo 0` would print a second line.
+parse_rdma_drops() {
+  local log="$1"
+  echo "$(( $(grep -c 'CQ error' "$log" 2>/dev/null) + 0 ))"  # empty grep output (unreadable log) evaluates to 0
+}
+
+# Snapshot socket drops on the kernel side. /proc/net/udp column 13 ("drops")
+# is decimal (%lu in net/ipv4/udp.c); the address columns are hex. The file is
+# fed on stdin so an unreadable file yields exactly one "0" (awk never runs).
+snapshot_proc_net_udp() {
+  awk 'NR>1 { sum += $13 } END { print sum+0 }' 2>/dev/null </proc/net/udp || echo 0
+}
+snapshot_nstat() {  # awk's END always prints one number, so a failing nstat (pipefail) must not append a second "0"
+  nstat -a 2>/dev/null | awk '/TcpExtTCPLostRetransmit|TcpRetransSegs|TcpInErrs/ { s += $2 } END { print s+0 }' || true
+}
+
+# Write one "<cpuN> <total> <busy>" line per core from /proc/stat into the file
+# named by $1 (mpstat is often not installed in the bench container).
+snapshot_cpu_stat() {
+  local dest="$1"
+  awk '/^cpu[0-9]+/ {
+    all = 0; for (col = 2; col <= 8; col++) all += $col
+    print $1, all, all - $5 - $6
+  }' /proc/stat > "$dest"
+}
+
+# Print busy% (one decimal) for core <cpu_idx> between two snapshot_cpu_stat files; "0.0" when no time elapsed.
+cpu_busy_pct() {
+  local snap_old="$1" snap_new="$2" core="cpu$3"
+  awk -v cpu="$core" '
+    $1 != cpu  { next }
+    NR == FNR  { total0 = $2; busy0 = $3; next }
+               { total1 = $2; busy1 = $3 }
+    END {
+      span = total1 - total0
+      if (span > 0) printf "%.1f", ((busy1 - busy0) * 100.0) / span
+      else          printf "0.0"
+    }
+  ' "$snap_old" "$snap_new"
+}
+
+# Render the per-cell config: stream BASE_YAML through sed into <out>.
+generate_yaml() {
+  local out="$1" payload="$2" batch="$3"
+  local -a edits
+  case "$BACKEND" in
+    dpdk)
+      # Size knobs plus the destination-MAC placeholder.
+      edits=(-e "s|^( *payload_size: ).*|\1$payload|"
+             -e "s|^( *batch_size: ).*|\1$batch|"
+             -e "s|<00:00:00:00:00:00>|$ETH_DST_ADDR|g")
+      ;;
+    rdma|socket-udp|socket-tcp)
+      # Only the message size is rewritten; <batch> is unused for these backends.
+      edits=(-e "s|^( *message_size: ).*|\1$payload|g")
+      ;;
+    *) return 0 ;;
+  esac
+  sed -E "${edits[@]}" "$BASE_YAML" > "$out"
+}
+
+# Run one cell: render the config, run the bench, parse its stats, append one CSV row (also echoed). Returns 1 with no row on failure.
+run_cell() {
+  local lang="$1" payload="$2" batch="$3" target_gbps="$4"  # target_gbps=0 -> no --target-gbps flag (unpaced)
+  local cell="$lang-$BACKEND-p$payload-b$batch-g$target_gbps"
+  local cell_dir="$OUT_DIR/$cell"
+  mkdir -p "$cell_dir"
+
+  local yaml="$cell_dir/config.yaml"
+  generate_yaml "$yaml" "$payload" "$batch"
+
+  # Snapshot kernel-side drop counters (taken for every backend; only the socket-* branches below use them).
+  local udp_before tcp_before
+  udp_before="$(snapshot_proc_net_udp)"
+  tcp_before="$(snapshot_nstat)"
+
+  # Snapshot per-cpu stats just before the bench starts.
+  snapshot_cpu_stat "$cell_dir/cpu_stat.before"
+
+  # Background GPU dmon (1-sec sample, RUN_SECONDS samples).
+  ( nvidia-smi dmon -s pucvmet -c "$RUN_SECONDS" > "$cell_dir/nvidia_smi_dmon.txt" 2>&1 ) &
+  local dmon_pid=$!
+
+  # Run the bench. Stderr captures DAQIRI_LOG_* output (DPDK/RDMA drop sources).
+  local stdout="$cell_dir/stdout.txt"
+  local stderr="$cell_dir/stderr.txt"
+  local args=("$yaml" --seconds "$RUN_SECONDS")
+  [[ "$target_gbps" != "0" ]] && args+=(--target-gbps "$target_gbps")
+  [[ "$BACKEND" == "rdma" || "$BACKEND" =~ ^socket- ]] && args+=(--mode both)
+
+  local bench_rc=0
+  "$BENCH_BIN" "${args[@]}" > "$stdout" 2> "$stderr" || bench_rc=$?
+  cp "$stderr" "$DRIVER_LOG"
+
+  # Snapshot per-cpu stats right after the bench exits (before background
+  # captures finish reaping, to bound the window).
+  snapshot_cpu_stat "$cell_dir/cpu_stat.after"
+
+  # Reap the dmon capture. It is not killed: this blocks until its -c <N> samples finish, even if the bench exited early.
+  wait "$dmon_pid"  2>/dev/null || true
+
+  # Parse bench stdout. For RX-bearing benches "RX complete" is authoritative;
+  # for TX-only configs fall back to "TX complete".
+  local pkts bytes secs
+  pkts="$(extract_field 'RX complete' packets "$stdout")"
+  bytes="$(extract_field 'RX complete' bytes   "$stdout")"
+  secs="$(extract_field 'RX complete' seconds  "$stdout")"
+  if [[ -z "$pkts" ]]; then
+    pkts="$(extract_field 'TX complete' packets "$stdout")"
+    bytes="$(extract_field 'TX complete' bytes   "$stdout")"
+    secs="$(extract_field 'TX complete' seconds  "$stdout")"
+  fi
+  if [[ -z "$pkts" ]]; then
+    case "$BACKEND" in
+      rdma)
+        # RDMA prints "Client/Server complete: ... send_completions=N send_bytes=N seconds=S"
+        pkts="$(extract_field 'Client complete' send_completions "$stdout")"
+        bytes="$(extract_field 'Client complete' send_bytes "$stdout")"
+        secs="$(extract_field 'Client complete' seconds "$stdout")"
+        ;;
+      socket-udp|socket-tcp)
+        # socket_bench prints "Client/Server complete: ... sent_packets=N sent_bytes=N
+        # recv_packets=N recv_bytes=N seconds=S". Use the TX-side counters for parity
+        # with how the RDMA fallback reports throughput.
+        pkts="$(extract_field 'Client complete' sent_packets "$stdout")"
+        bytes="$(extract_field 'Client complete' sent_bytes "$stdout")"
+        secs="$(extract_field 'Client complete' seconds "$stdout")"
+        ;;
+    esac
+  fi
+  local stats_missing=0
+  if [[ -z "${pkts:-}" || -z "${bytes:-}" || -z "${secs:-}" ]]; then
+    stats_missing=1
+  fi
+  if [[ "$bench_rc" -ne 0 || "$stats_missing" -ne 0 ]]; then
+    if [[ "$bench_rc" -ne 0 ]]; then
+      echo "ERROR: $cell bench exited with status $bench_rc" >&2
+    fi
+    if [[ "$stats_missing" -ne 0 ]]; then
+      echo "ERROR: $cell produced no parseable completion stats" >&2
+    fi
+    echo "       stdout: $stdout" >&2
+    echo "       stderr: $stderr" >&2
+    return 1  # a failed cell contributes no CSV row
+  fi
+
+  local pps gbps
+  pps="$(awk -v p="$pkts" -v s="$secs" 'BEGIN { if (s+0>0) printf "%.0f", p/s; else print 0 }')"
+  gbps="$(awk -v b="$bytes" -v s="$secs" 'BEGIN { if (s+0>0) printf "%.3f", (b*8.0)/s/1e9; else print 0 }')"
+
+  # Drops per backend: log-derived for dpdk/rdma, before/after counter delta for the socket backends.
+  local drops drops_kind
+  case "$BACKEND" in
+    dpdk)
+      drops="$(parse_dpdk_drops "$stderr")"
+      drops_kind="dpdk-imissed+ierrors+nombuf"
+      ;;
+    rdma)
+      drops="$(parse_rdma_drops "$stderr")"
+      drops_kind="rdma-cqe-error"
+      ;;
+    socket-udp)
+      local udp_after; udp_after="$(snapshot_proc_net_udp)"
+      drops="$((udp_after - udp_before))"
+      drops_kind="udp-proc-net-udp-drops"
+      ;;
+    socket-tcp)
+      local tcp_after; tcp_after="$(snapshot_nstat)"
+      drops="$((tcp_after - tcp_before))"
+      drops_kind="tcp-nstat-retrans+inerrs"
+      ;;
+  esac
+
+  # Per-core CPU busy% over the bench window. Cores defined per-backend
+  # (master/TX/RX) match the YAML so we measure the threads we actually pin.
+  local cpu_master_pct cpu_tx_pct cpu_rx_pct
+  cpu_master_pct="$(cpu_busy_pct "$cell_dir/cpu_stat.before" "$cell_dir/cpu_stat.after" "$CPU_MASTER")"
+  cpu_tx_pct="$(cpu_busy_pct     "$cell_dir/cpu_stat.before" "$cell_dir/cpu_stat.after" "$CPU_TX")"
+  cpu_rx_pct="$(cpu_busy_pct     "$cell_dir/cpu_stat.before" "$cell_dir/cpu_stat.after" "$CPU_RX")"
+
+  # GPU SM% (column 5) and memory-controller % (column 6) from nvidia-smi
+  # dmon -s pucvmet. These are near zero for GPUDirect workloads (GPU is a
+  # DMA target, not a compute engine).
+  local gpu_sm gpu_mem
+  gpu_sm="$(awk '/^ *[0-9]/ { count++; sum += $5 } END { if (count) printf "%.1f", sum/count; else print 0 }' \
+               "$cell_dir/nvidia_smi_dmon.txt" 2>/dev/null || echo 0)"
+  gpu_mem="$(awk '/^ *[0-9]/ { count++; sum += $6 } END { if (count) printf "%.1f", sum/count; else print 0 }' \
+                "$cell_dir/nvidia_smi_dmon.txt" 2>/dev/null || echo 0)"
+
+  echo "$lang,$BACKEND,none,$payload,$batch,$target_gbps,$secs,$pkts,$bytes,$pps,$gbps,$drops,$drops_kind,$cpu_master_pct,$cpu_tx_pct,$cpu_rx_pct,$gpu_sm,$gpu_mem" \
+    | tee -a "$CSV"
+}
+
+run_cell_or_record_failure() {
+  if ! run_cell "$@"; then FAILURES=$((FAILURES + 1)); fi  # count the failure and keep sweeping
+}
+
+# --------------------------------------------------------------------------
+# Driver
+# --------------------------------------------------------------------------
+
+# Pick the payload / batch / target-gbps lists for the requested mode; the
+# shared loop below then runs one cell per combination.
+case "$MODE" in
+  smoke)
+    # One cell, native-shape, unpaced.
+    cell_payloads=("${PAYLOADS_HEADLINE[@]}")
+    cell_batches=("${BATCHES_HEADLINE[@]}")
+    cell_targets=(0)
+    ;;
+  sweep)
+    # Full payload x batch matrix, unpaced.
+    cell_payloads=("${PAYLOADS_SWEEP[@]}")
+    cell_batches=("${BATCHES_SWEEP[@]}")
+    cell_targets=(0)
+    ;;
+  drop-curve)
+    # Native shape held constant while target_gbps is swept.
+    cell_payloads=("${PAYLOADS_HEADLINE[@]}")
+    cell_batches=("${BATCHES_HEADLINE[@]}")
+    cell_targets=("${DROP_CURVE_TARGETS[@]}")
+    ;;
+  drop-curve-matrix)
+    # 2D drop curve: payload x target_gbps at the headline batch.
+    cell_payloads=("${PAYLOADS_SWEEP[@]}")
+    cell_batches=("${BATCHES_HEADLINE[@]}")
+    cell_targets=("${DROP_CURVE_TARGETS[@]}")
+    ;;
+  *) echo "Unknown mode: $MODE" >&2; exit 1 ;;
+esac
+
+# Cells run in payload -> batch -> target order; failures are counted, not fatal.
+for p in "${cell_payloads[@]}"; do
+  for b in "${cell_batches[@]}"; do
+    for g in "${cell_targets[@]}"; do
+      run_cell_or_record_failure cpp "$p" "$b" "$g"
+    done
+  done
+done
+
+printf '\n'
+printf '%s\n' "Results in: $OUT_DIR"
+printf '%s\n' "CSV:        $CSV"
+
+# A non-zero exit status reports that at least one cell failed.
+if (( FAILURES != 0 )); then
+  printf '%s\n' "Failed cells: $FAILURES" >&2
+  exit 1
+fi
diff --git a/scripts/setup_spark_rdma_loopback.sh b/scripts/setup_spark_rdma_loopback.sh
new file mode 100755
index 0000000..4c9061a
--- /dev/null
+++ b/scripts/setup_spark_rdma_loopback.sh
@@ -0,0 +1,65 @@
+#!/bin/bash
+# Host network config for the DGX Spark RDMA loopback bench.
+# Adapted from a colleague's single-adapter script for this host's
+# inter-port loopback: Adapter1:port0 (1.1.1.1) <-> Adapter2:port1 (2.2.2.2).
+#
+# Matches examples/daqiri_bench_rdma_tx_rx_spark.yaml (1.1.1.1 / 2.2.2.2).
+# Re-running is safe: replaces addresses, flushes per-port tables, and
+# deletes any matching rules before re-adding.
+#
+# Env overrides: SPARK_TX_IFACE, SPARK_RX_IFACE, SPARK_TX_MAC, SPARK_RX_MAC.
+
+set -euo pipefail
+# Interface names default to this host's loopback pair; override with SPARK_TX_IFACE / SPARK_RX_IFACE.
+p0="${SPARK_TX_IFACE:-enp1s0f0np0}"     # Adapter1 port 0, PCI 0000:01:00.0
+p1="${SPARK_RX_IFACE:-enP2p1s0f1np1}"   # Adapter2 port 1, PCI 0002:01:00.1
+
+read_mac() {
+  local sysfs_addr="/sys/class/net/$1/address"
+  # Print the MAC of interface $1 from sysfs; on an unreadable file, report and exit with status 1.
+  [[ -r "$sysfs_addr" ]] || {
+    echo "cannot read $sysfs_addr; set SPARK_TX_MAC/SPARK_RX_MAC explicitly" >&2
+    exit 1
+  }
+  cat "$sysfs_addr"
+}
+
+p0_mac="${SPARK_TX_MAC:-$(read_mac "$p0")}"
+p1_mac="${SPARK_RX_MAC:-$(read_mac "$p1")}"
+
+p0_ip="1.1.1.1"
+p1_ip="2.2.2.2"
+
+# Numeric table IDs avoid having to edit /etc/iproute2/rt_tables.
+table_p0=100
+table_p1=101
+# Bring both ports up.
+ip link set "${p0}" up
+ip link set "${p1}" up
+# `replace` rather than `add`, so re-running with the address already present does not fail.
+ip addr replace "${p0_ip}/24" dev "${p0}"
+ip addr replace "${p1_ip}/24" dev "${p1}"
+# One routing table per port, emptied first, whose only route is a default out of that port.
+ip route flush table "${table_p0}" 2>/dev/null || true
+ip route flush table "${table_p1}" 2>/dev/null || true
+ip route add table "${table_p0}" default dev "${p0}"
+ip route add table "${table_p1}" default dev "${p1}"
+# Remove rules left by a previous run; a failed delete (rule absent) is ignored.
+ip rule del from "${p0_ip}/32" table "${table_p0}" 2>/dev/null || true
+ip rule del to   "${p0_ip}/32" table "${table_p0}" 2>/dev/null || true
+ip rule del from "${p1_ip}/32" table "${table_p1}" 2>/dev/null || true
+ip rule del to   "${p1_ip}/32" table "${table_p1}" 2>/dev/null || true
+# Send traffic from/to each address through the table of the port that owns it.
+ip rule add from "${p0_ip}/32" table "${table_p0}"
+ip rule add to   "${p0_ip}/32" table "${table_p0}"
+ip rule add from "${p1_ip}/32" table "${table_p1}"
+ip rule add to   "${p1_ip}/32" table "${table_p1}"
+# Static ARP entries: both addresses are pinned to their MACs on both ports.
+arp -i "${p0}" -s "${p0_ip}" "${p0_mac}"
+arp -i "${p0}" -s "${p1_ip}" "${p1_mac}"
+arp -i "${p1}" -s "${p0_ip}" "${p0_mac}"
+arp -i "${p1}" -s "${p1_ip}" "${p1_mac}"
+
+echo "RDMA loopback config applied."
+echo "  ${p0} (${p0_mac}) -> ${p0_ip}/24, table ${table_p0}"
+echo "  ${p1} (${p1_mac}) -> ${p1_ip}/24, table ${table_p1}"
diff --git a/scripts/spark_data_fill.sh b/scripts/spark_data_fill.sh
new file mode 100755
index 0000000..22349ed
--- /dev/null
+++ b/scripts/spark_data_fill.sh
@@ -0,0 +1,153 @@
+#!/usr/bin/env bash
+# Drives the PR 1 data-fill bench runs for the DGX Spark performance report.
+#
+# Runs DPDK GPUDirect, socket-UDP, and socket-TCP through their sweep and
+# drop-curve modes via examples/run_spark_bench.sh, with pre-flight checks
+# and orphan-hugepage cleanup. RDMA is deferred from PR 1 (single-host
+# loopback over the cable needs a netns+two-process refactor; tracked
+# separately).
+#
+# Run inside the project container (privileged, --gpus all, /dev/hugepages
+# and /mnt/huge mounted, repo at /workspace).
+#
+# Usage:
+#   ./scripts/spark_data_fill.sh                 # all three backends
+#   ./scripts/spark_data_fill.sh dpdk            # just DPDK
+#   ./scripts/spark_data_fill.sh socket-udp socket-tcp
+#
+# Env overrides:
+#   ETH_DST_ADDR  — RX-side MAC (dpdk only). Auto-detected from
+#                   /sys/class/net/$RX_IFACE/address if unset.
+#   RX_IFACE      — RX netdev name (default enP2p1s0f1np1).
+#   DAQIRI_BUILD_DIR — defaults to <repo root>/build.
+
+set -euo pipefail
+# All paths are derived from this script's own location ($0), not from the current directory.
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+REPO_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
+WRAPPER="$REPO_ROOT/examples/run_spark_bench.sh"
+BUILD_DIR="${DAQIRI_BUILD_DIR:-$REPO_ROOT/build}"
+RX_IFACE="${RX_IFACE:-enP2p1s0f1np1}"
+# Backends come from the command line; with no arguments all three defaults are run.
+BACKENDS=("$@")
+[[ ${#BACKENDS[@]} -eq 0 ]] && BACKENDS=(dpdk socket-udp socket-tcp)
+
+# --- pre-flight ------------------------------------------------------------
+
+preflight_fail() { printf 'PREFLIGHT FAIL: %s\n' "$*" >&2; exit 1; }
+note() { printf '[%s] %s\n' "$(date -u +%H:%M:%SZ)" "$*"; }
+
+[[ -x "$WRAPPER" ]] || preflight_fail "wrapper missing or not executable: $WRAPPER"
+# Every requested backend must be a known one and have its bench binary built.
+for be in "${BACKENDS[@]}"; do
+  case "$be" in
+    dpdk)       bin="$BUILD_DIR/examples/daqiri_bench_raw_gpudirect" ;;
+    socket-udp|socket-tcp) bin="$BUILD_DIR/examples/daqiri_bench_socket" ;;
+    rdma)       preflight_fail "RDMA is deferred from PR 1; see follow-up issue" ;;
+    *)          preflight_fail "unknown backend: $be" ;;
+  esac
+  [[ -x "$bin" ]] || preflight_fail "missing bench binary: $bin (run cmake --build first)"
+done
+
+# DPDK-only checks: free hugepages, destination MAC, and link carrier on the RX interface.
+if [[ " ${BACKENDS[*]} " == *" dpdk "* ]]; then
+  free_hp="$(awk '/^HugePages_Free:/ { print $2 }' /proc/meminfo)"
+  [[ "${free_hp:-0}" -ge 4 ]] || preflight_fail "HugePages_Free=$free_hp (need >=4); clean /mnt/huge and /dev/hugepages from prior runs"
+  # Default the destination MAC to the RX interface's own address and export it for the wrapper.
+  if [[ -z "${ETH_DST_ADDR:-}" ]]; then
+    mac_path="/sys/class/net/$RX_IFACE/address"
+    [[ -r "$mac_path" ]] || preflight_fail "cannot read $mac_path; set ETH_DST_ADDR explicitly"
+    ETH_DST_ADDR="$(cat "$mac_path")"
+    export ETH_DST_ADDR
+    note "ETH_DST_ADDR auto-detected from $RX_IFACE: $ETH_DST_ADDR"
+  fi
+  # An unreadable carrier file is treated the same as carrier=0.
+  carrier="$(cat "/sys/class/net/$RX_IFACE/carrier" 2>/dev/null || echo 0)"
+  [[ "$carrier" == "1" ]] || preflight_fail "RX iface $RX_IFACE has no carrier (cable unplugged or link down)"
+fi
+
+note "Pre-flight OK. Backends: ${BACKENDS[*]}"
+note "Build dir: $BUILD_DIR"
+note "Repo root: $REPO_ROOT"
+
+# --- hugepage cleanup helper ----------------------------------------------
+
+# DPDK leaves orphan rtemap_* files when a bench aborts. Clean between runs so
+# we don't run out of hugepages mid-sweep.
+clean_orphan_hugepages() {
+  local pre post freed f  # f is local so the loop variable does not leak into the script's globals
+  pre="$(awk '/^HugePages_Free:/ { print $2 }' /proc/meminfo)"
+  : "${pre:=0}"
+  shopt -s nullglob
+  # DPDK's hugepage file prefix varies per process (override with --file-prefix),
+  # so match any `*map_<digit>*` name. Files that fuser reports as in use are kept.
+  # NOTE(review): if fuser itself is missing, `! fuser` is also true and no file is kept.
+  for f in /dev/hugepages/*map_[0-9]* /mnt/huge/*map_[0-9]*; do
+    if ! fuser -- "$f" >/dev/null 2>&1; then
+      rm -f -- "$f" 2>/dev/null || true
+    fi
+  done
+  shopt -u nullglob
+  post="$(awk '/^HugePages_Free:/ { print $2 }' /proc/meminfo)"
+  : "${post:=0}"
+  freed=$((post - pre))
+  if [[ "$freed" -gt 0 ]]; then
+    note "Freed $freed orphan hugepages (now ${post} free)"
+  fi
+  return 0
+}
+
+# --- driver loop -----------------------------------------------------------
+
+declare -a RESULT_DIRS   # "<backend>/<mode> -> <dir>" entries printed in the final summary
+FAILURES=0               # backend/mode runs whose wrapper exited non-zero
+
+# Run the wrapper once for <backend> <mode>: record its result dir on success,
+# count a failure otherwise. Orphan hugepages are cleaned before and after.
+run_backend_mode() {
+  local backend="$1" mode="$2"
+  local log="/tmp/spark_data_fill.$backend.$mode.log"
+  local rc result_dir
+  note "=== Running: $backend $mode ==="
+  clean_orphan_hugepages
+
+  # tee streams the wrapper output live and keeps a copy for the "Results in:"
+  # lookup below. errexit is suspended so a failing wrapper reaches the if below.
+  set +e
+  "$WRAPPER" "$backend" "$mode" 2>&1 | tee "$log"
+  rc="${PIPESTATUS[0]}"
+  set -e
+
+  if [[ "$rc" -ne 0 ]]; then
+    FAILURES=$((FAILURES + 1))
+    note "$backend $mode FAILED (exit $rc); continuing"
+    tail -n 40 "$log" >&2
+  else
+    result_dir="$(awk '/^Results in:/ { print $3 }' "$log" | tail -n1)"
+    [[ -n "$result_dir" ]] && RESULT_DIRS+=("$backend/$mode -> $result_dir")
+    note "$backend $mode complete"
+  fi
+  clean_orphan_hugepages
+}
+
+for be in "${BACKENDS[@]}"; do
+  run_backend_mode "$be" sweep
+  run_backend_mode "$be" drop-curve
+done
+
+# --- summary ---------------------------------------------------------------
+# RESULT_DIRS is empty when every run failed; ${arr[@]+...} keeps that from tripping `set -u` on bash < 4.4.
+echo
+echo "=========================================="
+echo "Data-fill complete. Result directories:"
+echo "=========================================="
+for r in ${RESULT_DIRS[@]+"${RESULT_DIRS[@]}"}; do
+  echo "  $r"
+done
+echo
+echo "Next: aggregate CSVs and fill docs/performance-dgx-spark.md."
+
+if [[ "$FAILURES" -ne 0 ]]; then
+  echo "Failed backend/mode runs: $FAILURES" >&2
+  exit 1
+fi