diff --git a/.gitignore b/.gitignore index e221700..472185d 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,9 @@ build*/ site/ +bench-results/ + +# tune_system.py default output +pcie_schematic.png # macOS .DS_Store diff --git a/examples/rdma_bench.cpp b/examples/rdma_bench.cpp index 0a1c58f..7ff2446 100644 --- a/examples/rdma_bench.cpp +++ b/examples/rdma_bench.cpp @@ -67,7 +67,12 @@ RdmaBenchConfig parse_rdma_cfg(const YAML::Node& node) { void rdma_worker(const RdmaBenchConfig& cfg, daqiri::bench::TokenBucketPacer& pacer, std::atomic& stop, RdmaWorkerStats& stats) { - static constexpr int kMaxOutstanding = 5; + // Matches the per-MR num_bufs in the YAML configs. Higher values deadlock + // the bench: post_req blocks in get_tx_packet_burst when the pool is empty, + // but free_tx_burst (which refills it) only runs later in the same loop + // iteration via get_rx_burst. Until the loop is refactored to interleave + // drain with post, this constant must stay <= num_bufs. + static constexpr int kMaxOutstanding = 20; int outstanding_send = 0; int outstanding_recv = 0; uint64_t send_wr_id = 0x1234; diff --git a/examples/run_spark_bench.sh b/examples/run_spark_bench.sh new file mode 100755 index 0000000..5b6478d --- /dev/null +++ b/examples/run_spark_bench.sh @@ -0,0 +1,373 @@ +#!/usr/bin/env bash +# +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Sweep wrapper for DAQIRI benchmarks on DGX Spark. Runs the bench across a +# matrix of (payload/message size, batch size, target-gbps), captures per-run +# CPU/GPU/NIC counters, and emits one CSV row per cell into bench-results/. +# +# Drop sources per backend (per the report methodology): +# DPDK : grep imissed/ierrors/rx_nombuf from bench log (DAQIRI_LOG_INFO). +# RDMA : grep "CQ error" lines from bench log (DAQIRI_LOG_ERROR). +# socket : diff /proc/net/udp drops column (UDP); nstat -a (TCP retransmits). 
+#
+# Usage:
+#   ./run_spark_bench.sh <backend> [mode]
+#     backend ∈ {dpdk, rdma, socket-udp, socket-tcp}
+#     mode    ∈ {smoke, sweep, drop-curve, drop-curve-matrix} (default: smoke)
+#
+# Environment read from the current shell:
+#   DAQIRI_BUILD_DIR — path to the cmake build dir (defaults to ../build
+#                      relative to this script).
+#   ETH_DST_ADDR     — required for dpdk backend (the RX iface MAC).
+#   RX_IFACE         — kernel name of the RX interface (e.g. enP2p1s0f1np1).
+#                      Not read by this script: the socket-udp drop delta is
+#                      summed over every socket listed in /proc/net/udp.
+#
+# Run inside the project container as root (per AGENTS.md).
+
+# No `set -e`: a failing cell must not abort the remaining cells of a sweep.
+set -u
+set -o pipefail
+
+# --------------------------------------------------------------------------
+# Configuration
+# --------------------------------------------------------------------------
+
+BACKEND="${1:-}"
+MODE="${2:-smoke}"
+if [[ -z "$BACKEND" ]]; then
+  echo "Usage: $0 <backend> [smoke|sweep|drop-curve|drop-curve-matrix]" >&2
+  exit 1
+fi
+
+# Reject typos before any side effect (result directory, CSV header,
+# environment capture) so a bad invocation leaves nothing behind. Messages and
+# exit status match the later per-backend / per-mode dispatch.
+case "$BACKEND" in
+  dpdk|rdma|socket-udp|socket-tcp) ;;
+  *) echo "Unknown backend: $BACKEND" >&2; exit 1 ;;
+esac
+case "$MODE" in
+  smoke|sweep|drop-curve|drop-curve-matrix) ;;
+  *) echo "Unknown mode: $MODE" >&2; exit 1 ;;
+esac
+
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+BUILD_DIR="${DAQIRI_BUILD_DIR:-$SCRIPT_DIR/../build}"
+# UTC timestamp keeps result directories unique and sortable.
+TS="$(date -u +%Y%m%dT%H%M%SZ)"
+OUT_DIR="$SCRIPT_DIR/../bench-results/$TS-$BACKEND-$MODE"
+mkdir -p "$OUT_DIR"
+
+# One row per cell is appended by run_cell; the header is written once here.
+CSV="$OUT_DIR/runs.csv"
+echo "lang,backend,post_process,payload,batch,target_gbps,seconds,packets,bytes,pps,gbps,drops,drops_kind,cpu_master_pct,cpu_tx_pct,cpu_rx_pct,gpu_sm_pct,gpu_mem_pct" > "$CSV"
+
+# Capture slow-moving environment state once per result set.
+"$SCRIPT_DIR/bench_capture_environment.sh" "$OUT_DIR"
+
+RUN_SECONDS=30
+DRIVER_LOG="$OUT_DIR/last_run.stderr"
+FAILURES=0
+
+# Per-backend sweep matrices (see docs/performance-dgx-spark.md methodology).
+# Native-shape sizes are the leftmost entry; "matched 8K" cell is also included.
+case "$BACKEND" in
+  dpdk)
+    PAYLOADS_SWEEP=(8000 4096 1024 256 64)
+    BATCHES_SWEEP=(10240 4096 1024 256)
+    PAYLOADS_HEADLINE=(8000)
+    BATCHES_HEADLINE=(10240)
+    BASE_YAML="$SCRIPT_DIR/daqiri_bench_raw_tx_rx_spark.yaml"
+    BENCH_BIN="$BUILD_DIR/examples/daqiri_bench_raw_gpudirect"
+    CPU_MASTER=8; CPU_TX=17; CPU_RX=18
+    # Abort with a hint when the RX MAC is missing; generate_yaml substitutes
+    # it into the template.
+    : "${ETH_DST_ADDR:?ETH_DST_ADDR must be set for dpdk backend (cat /sys/class/net/<rx-iface>/address)}"
+    ;;
+  rdma)
+    PAYLOADS_SWEEP=(8000000 1048576 65536 8192 4096)
+    BATCHES_SWEEP=(1)
+    PAYLOADS_HEADLINE=(8000000)
+    BATCHES_HEADLINE=(1)
+    BASE_YAML="$SCRIPT_DIR/daqiri_bench_rdma_tx_rx_spark.yaml"
+    BENCH_BIN="$BUILD_DIR/examples/daqiri_bench_rdma"
+    CPU_MASTER=8; CPU_TX=17; CPU_RX=18
+    ;;
+  socket-udp)
+    PAYLOADS_SWEEP=(1472 1024 256 64)
+    BATCHES_SWEEP=(1)
+    PAYLOADS_HEADLINE=(1472)
+    BATCHES_HEADLINE=(1)
+    BASE_YAML="$SCRIPT_DIR/daqiri_bench_socket_udp_tx_rx.yaml"
+    BENCH_BIN="$BUILD_DIR/examples/daqiri_bench_socket"
+    CPU_MASTER=8; CPU_TX=17; CPU_RX=18
+    ;;
+  socket-tcp)
+    PAYLOADS_SWEEP=(1048576 65536 1024)
+    BATCHES_SWEEP=(1)
+    PAYLOADS_HEADLINE=(65536)
+    BATCHES_HEADLINE=(1)
+    BASE_YAML="$SCRIPT_DIR/daqiri_bench_socket_tcp_tx_rx.yaml"
+    BENCH_BIN="$BUILD_DIR/examples/daqiri_bench_socket"
+    CPU_MASTER=8; CPU_TX=17; CPU_RX=18
+    ;;
+  *) echo "Unknown backend: $BACKEND" >&2; exit 1 ;;
+esac
+
+# Fail fast on a missing template or binary instead of rediscovering the same
+# problem once per cell of the sweep.
+if [[ ! -r "$BASE_YAML" ]]; then
+  echo "Base YAML not readable: $BASE_YAML" >&2
+  exit 1
+fi
+if [[ ! -x "$BENCH_BIN" ]]; then
+  echo "Bench binary not executable: $BENCH_BIN" >&2
+  exit 1
+fi
+
+DROP_CURVE_TARGETS=(1 5 10 25 50 75 100 0) # 0 means unpaced (line rate)
+
+# --------------------------------------------------------------------------
+# Helpers
+# --------------------------------------------------------------------------
+
+# Read a scalar field from a `key=value` style stdout line.
+# usage: extract_field <prefix> <field> <file>
+#   prefix — extended regex matched at the start of a line.
+#   field  — key name; it must be preceded by a space on the line, so
+#            `packets` does not match inside `sent_packets`.
+#   file   — captured bench stdout.
+# Prints the value from the LAST line matching the prefix, or nothing when the
+# prefix or the field is absent.
+extract_field() {
+  local prefix="$1" field="$2" file="$3"
+  grep -E "^$prefix" "$file" | tail -n1 | grep -oE " $field=[^ ]+" | head -n1 | sed -E "s/.*$field=//"
+}
+
+# Sum DPDK drop counters from the manager log emitted via DAQIRI_LOG_INFO.
+parse_dpdk_drops() {
+  # $1: bench log. For each counter the LAST `key=N` occurrence in the log is
+  # used. Prints the sum of the three counters; a counter that never appears
+  # (or an unreadable log) contributes 0.
+  local log="$1"
+  local sum=0 v key
+  for key in imissed ierrors rx_nombuf; do
+    v="$(grep -oE "$key=[0-9]+" "$log" 2>/dev/null | tail -n1 | sed -E "s/.*=//" || true)"
+    [[ -n "${v:-}" ]] && sum=$((sum + v))
+  done
+  echo "$sum"
+}
+
+# Count RDMA CQ errors in the manager log.
+# Always prints exactly one integer. `grep -c` already prints 0 (and exits 1)
+# when nothing matches, so an `|| echo 0` fallback would emit a second line
+# and corrupt the CSV drops column; capture the output and default it instead.
+parse_rdma_drops() {
+  local log="$1" count
+  count="$(grep -c 'CQ error' "$log" 2>/dev/null || true)"
+  echo "${count:-0}"
+}
+
+# Snapshot socket drops on the kernel side.
+# /proc/net/udp column 13 ("drops") is printed in decimal (%lu in
+# net/ipv4/udp.c). The local_address / rem_address columns are hex.
+# Sums the column over every listed socket; prints 0 when the file cannot be
+# read. Always prints exactly one integer so callers can do arithmetic on it.
+snapshot_proc_net_udp() {
+  local total=""
+  if [[ -r /proc/net/udp ]]; then
+    total="$(awk 'NR>1 { sum += $13 } END { print sum+0 }' /proc/net/udp 2>/dev/null || true)"
+  fi
+  echo "${total:-0}"
+}
+# Sum of the TCP retransmit / input-error counters reported by `nstat -a`.
+# nstat output is captured first so that a missing nstat binary (a pipeline
+# failure under pipefail) cannot make the function print two numbers.
+snapshot_nstat() {
+  local raw
+  raw="$(nstat -a 2>/dev/null || true)"
+  awk '/TcpExtTCPLostRetransmit|TcpRetransSegs|TcpInErrs/ { s += $2 } END { print s+0 }' <<<"$raw"
+}
+
+# Snapshot /proc/stat per-cpu counters to a file. Mpstat is often not installed
+# in the bench container; /proc/stat is always available.
+# $1: output file. One line per cpuN: "<name> <total> <busy>", where total is
+# the sum of /proc/stat fields 2-8 and busy excludes fields 5 and 6 (idle and
+# iowait).
+snapshot_cpu_stat() {
+  awk '/^cpu[0-9]+/ {
+    total = $2+$3+$4+$5+$6+$7+$8
+    busy = total - $5 - $6
+    print $1, total, busy
+  }' /proc/stat > "$1"
+}
+
+# Compute busy% for a single cpu index between two /proc/stat snapshots.
+# $1/$2: files written by snapshot_cpu_stat (before, after); $3: cpu index.
+# Prints one decimal place, or 0.0 when the total did not advance.
+cpu_busy_pct() {
+  local before="$1" after="$2" cpu_idx="$3"
+  awk -v cpu="cpu$cpu_idx" '
+    NR == FNR { b_total[$1] = $2; b_busy[$1] = $3; next }
+    { a_total[$1] = $2; a_busy[$1] = $3 }
+    END {
+      dt = a_total[cpu] - b_total[cpu]
+      db = a_busy[cpu] - b_busy[cpu]
+      if (dt > 0) printf "%.1f", (db * 100.0) / dt
+      else printf "0.0"
+    }
+  ' "$before" "$after"
+}
+
+# Substitute payload / batch into the base YAML and write a temp config.
+generate_yaml() {
+  # $1: output path; $2: payload (dpdk) or message size (rdma / socket);
+  # $3: batch size (only substituted for dpdk).
+  # Returns sed's exit status so the caller can detect an unreadable template.
+  local out="$1" payload="$2" batch="$3"
+  case "$BACKEND" in
+    dpdk)
+      sed -E \
+        -e "s|^( *payload_size: ).*|\1$payload|" \
+        -e "s|^( *batch_size: ).*|\1$batch|" \
+        -e "s|<00:00:00:00:00:00>|$ETH_DST_ADDR|g" \
+        "$BASE_YAML" > "$out"
+      ;;
+    rdma|socket-udp|socket-tcp)
+      # These templates only carry a message_size knob.
+      sed -E "s|^( *message_size: ).*|\1$payload|" "$BASE_YAML" > "$out"
+      ;;
+  esac
+}
+
+# Run one cell. Echoes the CSV row to stdout.
+# usage: run_cell <lang> <payload> <batch> <target_gbps>   (target 0 = unpaced)
+# Returns 1, without appending a CSV row, when the config cannot be rendered,
+# the bench exits non-zero, or no completion stats can be parsed.
+run_cell() {
+  local lang="$1" payload="$2" batch="$3" target_gbps="$4"
+  local cell="$lang-$BACKEND-p$payload-b$batch-g$target_gbps"
+  local cell_dir="$OUT_DIR/$cell"
+  mkdir -p "$cell_dir"
+
+  local yaml="$cell_dir/config.yaml"
+  if ! generate_yaml "$yaml" "$payload" "$batch"; then
+    echo "ERROR: $cell could not render $yaml from $BASE_YAML" >&2
+    return 1
+  fi
+
+  # Snapshot kernel-side drop counters.
+  local udp_before tcp_before
+  udp_before="$(snapshot_proc_net_udp)"
+  tcp_before="$(snapshot_nstat)"
+
+  # Snapshot per-cpu stats just before the bench starts.
+  snapshot_cpu_stat "$cell_dir/cpu_stat.before"
+
+  # Background GPU dmon (1-sec sample, RUN_SECONDS samples). Launched without
+  # a wrapping subshell so $! is the sampler itself and the kill below
+  # reaches it.
+  nvidia-smi dmon -s pucvmet -c "$RUN_SECONDS" > "$cell_dir/nvidia_smi_dmon.txt" 2>&1 &
+  local dmon_pid=$!
+
+  # Run the bench. Stderr captures DAQIRI_LOG_* output (DPDK/RDMA drop sources).
+  local stdout="$cell_dir/stdout.txt"
+  local stderr="$cell_dir/stderr.txt"
+  local args=("$yaml" --seconds "$RUN_SECONDS")
+  [[ "$target_gbps" != "0" ]] && args+=(--target-gbps "$target_gbps")
+  [[ "$BACKEND" == "rdma" || "$BACKEND" =~ ^socket- ]] && args+=(--mode both)
+
+  local bench_rc=0
+  "$BENCH_BIN" "${args[@]}" > "$stdout" 2> "$stderr" || bench_rc=$?
+  cp "$stderr" "$DRIVER_LOG"
+
+  # Snapshot per-cpu stats right after the bench exits (before background
+  # captures finish reaping, to bound the window).
+  snapshot_cpu_stat "$cell_dir/cpu_stat.after"
+
+  # Stop background captures. The sampler self-terminates after -c <N>
+  # samples; on a failed bench its output is never used, so kill it rather
+  # than stalling the sweep for up to RUN_SECONDS per broken cell.
+  if [[ "$bench_rc" -ne 0 ]]; then
+    kill "$dmon_pid" 2>/dev/null || true
+  fi
+  wait "$dmon_pid" 2>/dev/null || true
+
+  # Parse bench stdout. For RX-bearing benches "RX complete" is authoritative;
+  # for TX-only configs fall back to "TX complete".
+  local pkts bytes secs
+  pkts="$(extract_field 'RX complete' packets "$stdout")"
+  bytes="$(extract_field 'RX complete' bytes "$stdout")"
+  secs="$(extract_field 'RX complete' seconds "$stdout")"
+  if [[ -z "$pkts" ]]; then
+    pkts="$(extract_field 'TX complete' packets "$stdout")"
+    bytes="$(extract_field 'TX complete' bytes "$stdout")"
+    secs="$(extract_field 'TX complete' seconds "$stdout")"
+  fi
+  if [[ -z "$pkts" ]]; then
+    case "$BACKEND" in
+      rdma)
+        # RDMA prints "Client/Server complete: ... send_completions=N send_bytes=N seconds=S"
+        pkts="$(extract_field 'Client complete' send_completions "$stdout")"
+        bytes="$(extract_field 'Client complete' send_bytes "$stdout")"
+        secs="$(extract_field 'Client complete' seconds "$stdout")"
+        ;;
+      socket-udp|socket-tcp)
+        # socket_bench prints "Client/Server complete: ... sent_packets=N sent_bytes=N
+        # recv_packets=N recv_bytes=N seconds=S". Use the TX-side counters for parity
+        # with how the RDMA fallback reports throughput.
+        pkts="$(extract_field 'Client complete' sent_packets "$stdout")"
+        bytes="$(extract_field 'Client complete' sent_bytes "$stdout")"
+        secs="$(extract_field 'Client complete' seconds "$stdout")"
+        ;;
+    esac
+  fi
+  local stats_missing=0
+  if [[ -z "${pkts:-}" || -z "${bytes:-}" || -z "${secs:-}" ]]; then
+    stats_missing=1
+  fi
+  if [[ "$bench_rc" -ne 0 || "$stats_missing" -ne 0 ]]; then
+    if [[ "$bench_rc" -ne 0 ]]; then
+      echo "ERROR: $cell bench exited with status $bench_rc" >&2
+    fi
+    if [[ "$stats_missing" -ne 0 ]]; then
+      echo "ERROR: $cell produced no parseable completion stats" >&2
+    fi
+    echo " stdout: $stdout" >&2
+    echo " stderr: $stderr" >&2
+    return 1
+  fi
+
+  # Rates are derived from the bench-reported duration, not RUN_SECONDS.
+  local pps gbps
+  pps="$(awk -v p="$pkts" -v s="$secs" 'BEGIN { if (s+0>0) printf "%.0f", p/s; else print 0 }')"
+  gbps="$(awk -v b="$bytes" -v s="$secs" 'BEGIN { if (s+0>0) printf "%.3f", (b*8.0)/s/1e9; else print 0 }')"
+
+  # Drops per backend.
+  local drops drops_kind
+  case "$BACKEND" in
+    dpdk)
+      drops="$(parse_dpdk_drops "$stderr")"
+      drops_kind="dpdk-imissed+ierrors+nombuf"
+      ;;
+    rdma)
+      drops="$(parse_rdma_drops "$stderr")"
+      drops_kind="rdma-cqe-error"
+      ;;
+    socket-udp)
+      local udp_after; udp_after="$(snapshot_proc_net_udp)"
+      drops="$((udp_after - udp_before))"
+      drops_kind="udp-proc-net-udp-drops"
+      ;;
+    socket-tcp)
+      local tcp_after; tcp_after="$(snapshot_nstat)"
+      drops="$((tcp_after - tcp_before))"
+      drops_kind="tcp-nstat-retrans+inerrs"
+      ;;
+  esac
+
+  # Per-core CPU busy% over the bench window. Cores defined per-backend
+  # (master/TX/RX) match the YAML so we measure the threads we actually pin.
+  local cpu_master_pct cpu_tx_pct cpu_rx_pct
+  cpu_master_pct="$(cpu_busy_pct "$cell_dir/cpu_stat.before" "$cell_dir/cpu_stat.after" "$CPU_MASTER")"
+  cpu_tx_pct="$(cpu_busy_pct "$cell_dir/cpu_stat.before" "$cell_dir/cpu_stat.after" "$CPU_TX")"
+  cpu_rx_pct="$(cpu_busy_pct "$cell_dir/cpu_stat.before" "$cell_dir/cpu_stat.after" "$CPU_RX")"
+
+  # GPU SM% (column 5) and memory-controller % (column 6) from nvidia-smi
+  # dmon -s pucvmet. These are near zero for GPUDirect workloads (GPU is a
+  # DMA target, not a compute engine).
+  local gpu_sm gpu_mem
+  gpu_sm="$(awk '/^ *[0-9]/ { count++; sum += $5 } END { if (count) printf "%.1f", sum/count; else print 0 }' \
+    "$cell_dir/nvidia_smi_dmon.txt" 2>/dev/null || echo 0)"
+  gpu_mem="$(awk '/^ *[0-9]/ { count++; sum += $6 } END { if (count) printf "%.1f", sum/count; else print 0 }' \
+    "$cell_dir/nvidia_smi_dmon.txt" 2>/dev/null || echo 0)"
+
+  echo "$lang,$BACKEND,none,$payload,$batch,$target_gbps,$secs,$pkts,$bytes,$pps,$gbps,$drops,$drops_kind,$cpu_master_pct,$cpu_tx_pct,$cpu_rx_pct,$gpu_sm,$gpu_mem" \
+    | tee -a "$CSV"
+}
+
+# Run one cell and count a failure instead of aborting the sweep.
+run_cell_or_record_failure() {
+  run_cell "$@" || FAILURES=$((FAILURES + 1))
+}
+
+# --------------------------------------------------------------------------
+# Driver
+# --------------------------------------------------------------------------
+
+case "$MODE" in
+  smoke)
+    # One cell, native-shape, unpaced.
+    for p in "${PAYLOADS_HEADLINE[@]}"; do
+      for b in "${BATCHES_HEADLINE[@]}"; do
+        run_cell_or_record_failure cpp "$p" "$b" 0
+      done
+    done
+    ;;
+  sweep)
+    # Full payload × batch matrix at line rate.
+    for p in "${PAYLOADS_SWEEP[@]}"; do
+      for b in "${BATCHES_SWEEP[@]}"; do
+        run_cell_or_record_failure cpp "$p" "$b" 0
+      done
+    done
+    ;;
+  drop-curve)
+    # Hold native-shape constant, sweep target_gbps.
+    for p in "${PAYLOADS_HEADLINE[@]}"; do
+      for b in "${BATCHES_HEADLINE[@]}"; do
+        for g in "${DROP_CURVE_TARGETS[@]}"; do
+          run_cell_or_record_failure cpp "$p" "$b" "$g"
+        done
+      done
+    done
+    ;;
+  drop-curve-matrix)
+    # 2D drop curve: sweep payload × target_gbps at the headline batch.
+    for p in "${PAYLOADS_SWEEP[@]}"; do
+      for b in "${BATCHES_HEADLINE[@]}"; do
+        for g in "${DROP_CURVE_TARGETS[@]}"; do
+          run_cell_or_record_failure cpp "$p" "$b" "$g"
+        done
+      done
+    done
+    ;;
+  *) echo "Unknown mode: $MODE" >&2; exit 1 ;;
+esac
+
+echo
+echo "Results in: $OUT_DIR"
+echo "CSV: $CSV"
+
+# Non-zero exit when any cell failed, so callers can detect partial results.
+if [[ "$FAILURES" -ne 0 ]]; then
+  echo "Failed cells: $FAILURES" >&2
+  exit 1
+fi
diff --git a/scripts/setup_spark_rdma_loopback.sh b/scripts/setup_spark_rdma_loopback.sh
new file mode 100755
index 0000000..4c9061a
--- /dev/null
+++ b/scripts/setup_spark_rdma_loopback.sh
@@ -0,0 +1,65 @@
+#!/bin/bash
+# Host network config for the DGX Spark RDMA loopback bench.
+# Adapted from a colleague's single-adapter script for this host's
+# inter-port loopback: Adapter1:port0 (1.1.1.1) <-> Adapter2:port1 (2.2.2.2).
+#
+# Matches examples/daqiri_bench_rdma_tx_rx_spark.yaml (1.1.1.1 / 2.2.2.2).
+# Re-running is safe: replaces addresses, flushes per-port tables, and
+# deletes any matching rules before re-adding.
+#
+# Env overrides: SPARK_TX_IFACE, SPARK_RX_IFACE, SPARK_TX_MAC, SPARK_RX_MAC.
+
+set -euo pipefail
+
+p0="${SPARK_TX_IFACE:-enp1s0f0np0}" # Adapter1 port 0, PCI 0000:01:00.0
+p1="${SPARK_RX_IFACE:-enP2p1s0f1np1}" # Adapter2 port 1, PCI 0002:01:00.1
+
+# Print the MAC address of interface $1 as read from sysfs; exits 1 with a
+# hint when the sysfs entry cannot be read.
+read_mac() {
+  local iface="$1"
+  local path="/sys/class/net/${iface}/address"
+  if [[ ! -r "$path" ]]; then
+    echo "cannot read $path; set SPARK_TX_MAC/SPARK_RX_MAC explicitly" >&2
+    exit 1
+  fi
+  cat "$path"
+}
+
+# read_mac only runs when the corresponding override is unset or empty.
+p0_mac="${SPARK_TX_MAC:-$(read_mac "$p0")}"
+p1_mac="${SPARK_RX_MAC:-$(read_mac "$p1")}"
+
+p0_ip="1.1.1.1"
+p1_ip="2.2.2.2"
+
+# Numeric table IDs avoid having to edit /etc/iproute2/rt_tables.
+table_p0=100
+table_p1=101
+
+# Bring both ports up, then (re)assign their /24 addresses.
+for port in "${p0}" "${p1}"; do
+  ip link set "${port}" up
+done
+
+ip addr replace "${p0_ip}/24" dev "${p0}"
+ip addr replace "${p1_ip}/24" dev "${p1}"
+
+# Empty each per-port table (a flush failure is ignored), then give it a
+# single default route out of its own port.
+for table in "${table_p0}" "${table_p1}"; do
+  ip route flush table "${table}" 2>/dev/null || true
+done
+ip route add table "${table_p0}" default dev "${p0}"
+ip route add table "${table_p1}" default dev "${p1}"
+
+# Policy rules as "<selector> <prefix> <table>", listed in install order.
+policy_rules=(
+  "from ${p0_ip}/32 ${table_p0}"
+  "to ${p0_ip}/32 ${table_p0}"
+  "from ${p1_ip}/32 ${table_p1}"
+  "to ${p1_ip}/32 ${table_p1}"
+)
+
+# Delete any existing copy of every rule first so a re-run does not stack
+# duplicates; a failed delete is ignored.
+for rule in "${policy_rules[@]}"; do
+  read -r selector prefix table <<<"${rule}"
+  ip rule del "${selector}" "${prefix}" table "${table}" 2>/dev/null || true
+done
+
+for rule in "${policy_rules[@]}"; do
+  read -r selector prefix table <<<"${rule}"
+  ip rule add "${selector}" "${prefix}" table "${table}"
+done
+
+# Static ARP entries: each port gets an entry for both addresses.
+for port in "${p0}" "${p1}"; do
+  arp -i "${port}" -s "${p0_ip}" "${p0_mac}"
+  arp -i "${port}" -s "${p1_ip}" "${p1_mac}"
+done
+
+echo "RDMA loopback config applied."
+echo " ${p0} (${p0_mac}) -> ${p0_ip}/24, table ${table_p0}"
+echo " ${p1} (${p1_mac}) -> ${p1_ip}/24, table ${table_p1}"
diff --git a/scripts/spark_data_fill.sh b/scripts/spark_data_fill.sh
new file mode 100755
index 0000000..22349ed
--- /dev/null
+++ b/scripts/spark_data_fill.sh
@@ -0,0 +1,153 @@
+#!/usr/bin/env bash
+# Drives the PR 1 data-fill bench runs for the DGX Spark performance report.
+#
+# Runs DPDK GPUDirect, socket-UDP, and socket-TCP through their sweep and
+# drop-curve modes via examples/run_spark_bench.sh, with pre-flight checks
+# and orphan-hugepage cleanup. RDMA is deferred from PR 1 (single-host
+# loopback over the cable needs a netns+two-process refactor; tracked
+# separately).
+#
+# Run inside the project container (privileged, --gpus all, /dev/hugepages
+# and /mnt/huge mounted, repo at /workspace).
+#
+# Usage:
+#   ./scripts/spark_data_fill.sh                        # every default backend
+#   ./scripts/spark_data_fill.sh dpdk                   # DPDK only
+#   ./scripts/spark_data_fill.sh socket-udp socket-tcp  # both socket backends
+#
+# Environment overrides:
+#   ETH_DST_ADDR     — RX-side MAC; when unset it is read from
+#                      /sys/class/net/$RX_IFACE/address.
+#   RX_IFACE         — RX netdev name (default enP2p1s0f1np1).
+#   DAQIRI_BUILD_DIR — build tree to use (default: <repo root>/build).
+
+set -euo pipefail
+
+# Paths are resolved from this script's location, not the caller's cwd.
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+REPO_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
+WRAPPER="$REPO_ROOT/examples/run_spark_bench.sh"
+BUILD_DIR="${DAQIRI_BUILD_DIR:-$REPO_ROOT/build}"
+RX_IFACE="${RX_IFACE:-enP2p1s0f1np1}"
+
+# Positional arguments select the backends; with none, run the default set.
+BACKENDS=("$@")
+if [[ ${#BACKENDS[@]} -eq 0 ]]; then
+  BACKENDS=(dpdk socket-udp socket-tcp)
+fi
+
+# --- pre-flight ------------------------------------------------------------
+
+# Print a pre-flight failure to stderr and abort the script.
+preflight_fail() {
+  echo "PREFLIGHT FAIL: $*" >&2
+  exit 1
+}
+
+# Progress line prefixed with the current UTC time.
+note() {
+  echo "[$(date -u +%H:%M:%SZ)] $*"
+}
+
+if [[ ! -x "$WRAPPER" ]]; then
+  preflight_fail "wrapper missing or not executable: $WRAPPER"
+fi
+
+# Every requested backend must be known and must have its bench binary built.
+for be in "${BACKENDS[@]}"; do
+  case "$be" in
+    dpdk)
+      bin="$BUILD_DIR/examples/daqiri_bench_raw_gpudirect"
+      ;;
+    socket-udp|socket-tcp)
+      bin="$BUILD_DIR/examples/daqiri_bench_socket"
+      ;;
+    rdma)
+      preflight_fail "RDMA is deferred from PR 1; see follow-up issue"
+      ;;
+    *)
+      preflight_fail "unknown backend: $be"
+      ;;
+  esac
+  if [[ ! -x "$bin" ]]; then
+    preflight_fail "missing bench binary: $bin (run cmake --build first)"
+  fi
+done
+
+# DPDK-only checks.
+if [[ " ${BACKENDS[*]} " == *" dpdk "* ]]; then
+  # At least 4 free hugepages are required before starting a DPDK run.
+  free_hp="$(awk '/^HugePages_Free:/ { print $2 }' /proc/meminfo)"
+  [[ "${free_hp:-0}" -ge 4 ]] || preflight_fail "HugePages_Free=$free_hp (need >=4); clean /mnt/huge and /dev/hugepages from prior runs"
+
+  # The wrapper requires ETH_DST_ADDR for dpdk; default it to the RX port MAC.
+  if [[ -z "${ETH_DST_ADDR:-}" ]]; then
+    mac_path="/sys/class/net/$RX_IFACE/address"
+    [[ -r "$mac_path" ]] || preflight_fail "cannot read $mac_path; set ETH_DST_ADDR explicitly"
+    ETH_DST_ADDR="$(cat "$mac_path")"
+    export ETH_DST_ADDR
+    note "ETH_DST_ADDR auto-detected from $RX_IFACE: $ETH_DST_ADDR"
+  fi
+
+  # An unreadable carrier file is treated the same as "no carrier".
+  carrier="$(cat "/sys/class/net/$RX_IFACE/carrier" 2>/dev/null || echo 0)"
+  [[ "$carrier" == "1" ]] || preflight_fail "RX iface $RX_IFACE has no carrier (cable unplugged or link down)"
+fi
+
+note "Pre-flight OK. Backends: ${BACKENDS[*]}"
+note "Build dir: $BUILD_DIR"
+note "Repo root: $REPO_ROOT"
+
+# --- hugepage cleanup helper ----------------------------------------------
+
+# DPDK leaves orphan rtemap_* files when a bench aborts. Clean between runs so
+# we don't run out of hugepages mid-sweep.
+# Best effort: always returns 0 and only logs when the free count went up.
+clean_orphan_hugepages() {
+  local pre post freed f
+  pre="$(awk '/^HugePages_Free:/ { print $2 }' /proc/meminfo)"
+  : "${pre:=0}"
+  # nullglob: a pattern with no match expands to nothing instead of itself.
+  shopt -s nullglob
+  # DPDK uses a random per-process file prefix (override with --file-prefix);
+  # match anything ending in `map_<digits>` to catch the common shape without
+  # nuking unrelated files. Skip any that are still held by a live process.
+  # NOTE(review): a missing `fuser` binary also makes the test below succeed,
+  # so files would be removed without the in-use check — confirm fuser is
+  # installed in the bench container.
+  for f in /dev/hugepages/*map_[0-9]* /mnt/huge/*map_[0-9]*; do
+    if ! fuser -- "$f" >/dev/null 2>&1; then
+      rm -f -- "$f" 2>/dev/null || true
+    fi
+  done
+  shopt -u nullglob
+  post="$(awk '/^HugePages_Free:/ { print $2 }' /proc/meminfo)"
+  : "${post:=0}"
+  freed=$((post - pre))
+  if [[ "$freed" -gt 0 ]]; then
+    note "Freed $freed orphan hugepages (now ${post} free)"
+  fi
+  return 0
+}
+
+# --- driver loop -----------------------------------------------------------
+
+# Assigned (not just declared) so length checks and expansion are safe under
+# `set -u` on every bash version.
+declare -a RESULT_DIRS=()
+FAILURES=0
+
+# Run the wrapper once for <backend> <mode>, cleaning hugepages before and
+# after. A failure is counted and logged; it does not abort remaining runs.
+run_backend_mode() {
+  local backend="$1" mode="$2"
+  note "=== Running: $backend $mode ==="
+  clean_orphan_hugepages
+
+  # Stream wrapper output live (per-cell CSV rows appear as they finish) while
+  # also keeping a log for post-run parsing of the "Results in:" line.
+  local log="/tmp/spark_data_fill.$backend.$mode.log"
+  local rc=0
+  # errexit is suspended so a failing wrapper reaches the bookkeeping below;
+  # PIPESTATUS[0] is the wrapper's own status, not tee's.
+  set +e
+  "$WRAPPER" "$backend" "$mode" 2>&1 | tee "$log"
+  local -a pipe_status=("${PIPESTATUS[@]}")
+  set -e
+  rc="${pipe_status[0]}"
+
+  if [[ "$rc" -eq 0 ]]; then
+    local result_dir
+    # Strip the label instead of taking field 3 so a result path containing
+    # spaces is kept whole.
+    result_dir="$(awk '/^Results in: / { sub(/^Results in: /, ""); print }' "$log" | tail -n1)"
+    if [[ -n "$result_dir" ]]; then
+      RESULT_DIRS+=("$backend/$mode -> $result_dir")
+    fi
+    note "$backend $mode complete"
+  else
+    FAILURES=$((FAILURES + 1))
+    note "$backend $mode FAILED (exit $rc); continuing"
+    tail -n 40 "$log" >&2
+  fi
+  clean_orphan_hugepages
+}
+
+for be in "${BACKENDS[@]}"; do
+  run_backend_mode "$be" sweep
+  run_backend_mode "$be" drop-curve
+done
+
+# --- summary ---------------------------------------------------------------
+
+echo
+echo "=========================================="
+echo "Data-fill complete. Result directories:"
+echo "=========================================="
+# Guard the expansion: when every run failed the array is empty, and bash
+# older than 4.4 treats "${arr[@]}" of an empty array as unbound under
+# `set -u`, aborting before the failure count below is printed.
+if [[ ${#RESULT_DIRS[@]} -eq 0 ]]; then
+  echo " (none)"
+else
+  for r in "${RESULT_DIRS[@]}"; do
+    echo " $r"
+  done
+fi
+echo
+echo "Next: aggregate CSVs and fill docs/performance-dgx-spark.md."
+
+if [[ "$FAILURES" -ne 0 ]]; then
+  echo "Failed backend/mode runs: $FAILURES" >&2
+  exit 1
+fi