From 633dc6b99c70d6fb239c887a91fa27797afbb9c1 Mon Sep 17 00:00:00 2001 From: Tommy Reilly Date: Fri, 26 Jun 2026 09:15:52 -0400 Subject: [PATCH 1/2] Add callbacks for driver cuGraphLaunch parcagpu subscribed CUPTI callbacks for eager launches (cuLaunchKernel) and runtime graph launches (cudaGraphLaunch), but not the driver-API cuGraphLaunch / cuGraphLaunch_ptsz. C++ runtimes like TensorRT-LLM replay CUDA graphs through the driver API, so the cuda_correlation USDT never fired in graph mode and no GPU samples were produced, while eager launches kept working (confirmed on a 4xB200 TRT-LLM node). Fix: enable the two driver graph-launch cbids in CuptiProfiler::initialize (with symmetric teardown). Not setGraphCallbacks(), which also subscribes capture cbids that would emit correlation events for non-executing capture calls. Tests: the mock harness dispatched callbacks unconditionally, hiding the missing subscription. mock_cupti now records the subscribed (domain,cbid) set and the harness only dispatches subscribed callbacks. test-pc-mock-graph (new make target) asserts driver cuGraphLaunch correlation events fire; graph_repro.cu / graph-repro-real.sh add a real-GPU reproducer. Verified the guard goes red without the fix and green with it. --- Makefile | 6 +- src/cupti.cpp | 13 ++++ test/graph-repro-real.sh | 55 +++++++++++++++++ test/graph_repro.cu | 94 +++++++++++++++++++++++++++++ test/mock_cupti.c | 34 +++++++++++ test/test-pc-mock-graph.sh | 119 +++++++++++++++++++++++++++++++++++++ test/test_cupti_prof.c | 34 +++++++---- 7 files changed, 344 insertions(+), 11 deletions(-) create mode 100755 test/graph-repro-real.sh create mode 100644 test/graph_repro.cu create mode 100755 test/test-pc-mock-graph.sh diff --git a/Makefile b/Makefile index 3317417..ffe56ab 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -.PHONY: all clean test build-amd64 build-arm64 build-all cross docker-push docker-test-build docker-test-run format local debug generate bpf-test microbenchmarks test-multi test-pc-real test-pc-mock +.PHONY: all clean test build-amd64 build-arm64 build-all cross docker-push docker-test-build docker-test-run format local debug generate bpf-test microbenchmarks test-multi test-pc-real test-pc-mock test-pc-mock-graph LIB_NAME = libparcagpucupti.so @@ -163,6 +163,10 @@ test-pc-real: local bpf-test microbenchmarks test-pc-mock: local bpf-test sudo -E test/test-pc-mock.sh +# Mock graph-launch test — guards driver cuGraphLaunch CBID subscription. No GPU. +test-pc-mock-graph: local bpf-test + sudo -E test/test-pc-mock-graph.sh + format: @echo "=== Formatting source files ===" @clang-format -i -style=file src/*.cpp src/*.h test/*.c diff --git a/src/cupti.cpp b/src/cupti.cpp index 1e1dd8d..234158e 100644 --- a/src/cupti.cpp +++ b/src/cupti.cpp @@ -260,6 +260,14 @@ class CuptiProfiler : public proton::Singleton { proton::setRuntimeCallbacks(subscriber, /*enable=*/true); proton::setLaunchCallbacks(subscriber, /*enable=*/true); + // Driver-API graph launches (cuGraphLaunch/_ptsz); not covered by setLaunch/ + // setRuntimeCallbacks. Only these two cbids -- not setGraphCallbacks(), which + // also subscribes capture cbids that emit uncorrelated events. + cuptiEnableCallback(1, subscriber, CUPTI_CB_DOMAIN_DRIVER_API, + CUPTI_DRIVER_TRACE_CBID_cuGraphLaunch); + cuptiEnableCallback(1, subscriber, CUPTI_CB_DOMAIN_DRIVER_API, + CUPTI_DRIVER_TRACE_CBID_cuGraphLaunch_ptsz); + // Enable resource callbacks only if PC sampling is enabled if (pcSamplingEnabled) { proton::setResourceCallbacks(subscriber, /*enable=*/true); @@ -305,6 +313,11 @@ class CuptiProfiler : public proton::Singleton { if (subscriber) { proton::setRuntimeCallbacks(subscriber, /*enable=*/false); proton::setLaunchCallbacks(subscriber, /*enable=*/false); + // Mirror the graph-launch callbacks enabled in init(). + cuptiEnableCallback(0, subscriber, CUPTI_CB_DOMAIN_DRIVER_API, + CUPTI_DRIVER_TRACE_CBID_cuGraphLaunch); + cuptiEnableCallback(0, subscriber, CUPTI_CB_DOMAIN_DRIVER_API, + CUPTI_DRIVER_TRACE_CBID_cuGraphLaunch_ptsz); if (pcSamplingEnabled) { proton::setResourceCallbacks(subscriber, /*enable=*/false); } diff --git a/test/graph-repro-real.sh b/test/graph-repro-real.sh new file mode 100755 index 0000000..84c8ae3 --- /dev/null +++ b/test/graph-repro-real.sh @@ -0,0 +1,55 @@ +#!/usr/bin/env bash +# graph-repro-real.sh — real-GPU guard: runs graph_repro under the shim and +# checks the cuda_correlation USDT fires for driver cuGraphLaunch (cbid -514). +# Zero => FAIL (cbid not subscribed). Needs an NVIDIA GPU, nvcc, bpftrace, root. +set -euo pipefail + +ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +LIB="$ROOT/build-local/lib/libparcagpucupti.so" +SECS="${1:-15}" +BIN="$(mktemp -d)/graph_repro" +COUNT_LOG="$(mktemp)" + +[ -f "$LIB" ] || { echo "error: $LIB not found — run 'make local' first" >&2; exit 1; } +command -v nvcc >/dev/null || { echo "error: nvcc not found" >&2; exit 1; } +command -v bpftrace >/dev/null || { echo "error: bpftrace not found" >&2; exit 1; } + +echo "=== Building reproducer ===" +nvcc -O2 -o "$BIN" "$ROOT/test/graph_repro.cu" -lcuda + +echo "=== Launching reproducer under parcagpu shim ===" +LD_PRELOAD="$LIB" PARCAGPU_DEBUG=1 "$BIN" "$SECS" 2>"$(mktemp)" & +APP_PID=$! + +# Wait for the shim to be mapped. +for _ in $(seq 1 50); do + grep -q libparcagpucupti "/proc/$APP_PID/maps" 2>/dev/null && break + sleep 0.1 +done + +echo "=== Attaching bpftrace to cuda_correlation USDT (driver cbid 514 => -514) ===" +# @driver: driver cuGraphLaunch (path under test); @runtime: positive cbid +bpftrace -p "$APP_PID" \ + -e "usdt:$LIB:parcagpu:cuda_correlation /arg1 == -514 || arg1 == -515/ { @driver = count(); } + usdt:$LIB:parcagpu:cuda_correlation /arg1 >= 0/ { @runtime = count(); }" \ + >"$COUNT_LOG" 2>/dev/null & +BT_PID=$! + +wait "$APP_PID" || true +sleep 1 +kill "$BT_PID" 2>/dev/null || true +wait "$BT_PID" 2>/dev/null || true + +echo "=== bpftrace results ===" +cat "$COUNT_LOG" + +DRIVER=$(grep -oE "@driver: [0-9]+" "$COUNT_LOG" | grep -oE "[0-9]+" || echo 0) +echo +if [ "${DRIVER:-0}" -gt 0 ]; then + echo "PASS: driver cuGraphLaunch produced $DRIVER correlation events" + exit 0 +else + echo "FAIL: zero driver cuGraphLaunch correlation events — driver graph-launch" >&2 + echo " cbid is not subscribed (CuptiProfiler::initialize)." >&2 + exit 1 +fi diff --git a/test/graph_repro.cu b/test/graph_repro.cu new file mode 100644 index 0000000..99cf2ac --- /dev/null +++ b/test/graph_repro.cu @@ -0,0 +1,94 @@ +// graph_repro.cu — replays a captured graph via the DRIVER cuGraphLaunch (the +// path parcagpu missed). Build: nvcc -o graph_repro test/graph_repro.cu -lcuda +// Run: ./graph_repro [seconds]. See test/graph-repro-real.sh for the guard. + +#include +#include +#include +#include +#include + +#define CUDA_CHECK(x) \ + do { \ + cudaError_t e = (x); \ + if (e != cudaSuccess) { \ + fprintf(stderr, "cuda error %s at %s:%d\n", cudaGetErrorString(e), \ + __FILE__, __LINE__); \ + exit(1); \ + } \ + } while (0) + +#define CU_CHECK(x) \ + do { \ + CUresult e = (x); \ + if (e != CUDA_SUCCESS) { \ + const char *s = nullptr; \ + cuGetErrorString(e, &s); \ + fprintf(stderr, "driver error %s at %s:%d\n", s ? s : "?", __FILE__, \ + __LINE__); \ + exit(1); \ + } \ + } while (0) + +__global__ void add_one(float *x, int n) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < n) + x[i] += 1.0f; +} + +int main(int argc, char **argv) { + int seconds = (argc > 1) ? atoi(argv[1]) : 20; + + // Force driver + runtime init. + CUDA_CHECK(cudaFree(0)); + CU_CHECK(cuInit(0)); + + const int n = 1 << 16; + float *d = nullptr; + CUDA_CHECK(cudaMalloc(&d, n * sizeof(float))); + + cudaStream_t stream; + CUDA_CHECK(cudaStreamCreate(&stream)); + + // Build a graph by capturing a few kernel launches into the stream. + CUDA_CHECK(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal)); + for (int k = 0; k < 8; k++) { + add_one<<<(n + 255) / 256, 256, 0, stream>>>(d, n); + } + cudaGraph_t graph; + CUDA_CHECK(cudaStreamEndCapture(stream, &graph)); + + cudaGraphExec_t exec; + CUDA_CHECK(cudaGraphInstantiate(&exec, graph, nullptr, nullptr, 0)); + + // cudaGraphExec_t is the same handle as CUgraphExec; replay via driver API. + CUgraphExec drvExec = reinterpret_cast(exec); + CUstream drvStream = reinterpret_cast(stream); + + fprintf(stderr, + "graph_repro: replaying graph via driver cuGraphLaunch for %ds\n", + seconds); + + struct timespec start; + clock_gettime(CLOCK_MONOTONIC, &start); + unsigned long long launches = 0; + for (;;) { + CU_CHECK(cuGraphLaunch(drvExec, drvStream)); + launches++; + if ((launches & 0x3ff) == 0) { + CUDA_CHECK(cudaStreamSynchronize(stream)); + struct timespec now; + clock_gettime(CLOCK_MONOTONIC, &now); + if (now.tv_sec - start.tv_sec >= seconds) + break; + } + } + CUDA_CHECK(cudaStreamSynchronize(stream)); + fprintf(stderr, "graph_repro: done, %llu driver graph launches\n", launches); + + cudaGraphExecDestroy(exec); + cudaGraphDestroy(graph); + cudaStreamDestroy(stream); + cudaFree(d); + return 0; +} diff --git a/test/mock_cupti.c b/test/mock_cupti.c index 6862ba1..ab3b3f4 100644 --- a/test/mock_cupti.c +++ b/test/mock_cupti.c @@ -45,6 +45,24 @@ CUptiResult cuptiSubscribe(CUpti_SubscriberHandle *subscriber, return CUPTI_SUCCESS; } +// Subscribed (domain, cbid) pairs, so the harness can mirror real CUPTI dispatch. +#define MOCK_MAX_ENABLED 512 +static struct { + CUpti_CallbackDomain domain; + CUpti_CallbackId cbid; +} __enabled_cbs[MOCK_MAX_ENABLED]; +static size_t __enabled_cb_count = 0; + +int __mock_cupti_is_callback_enabled(CUpti_CallbackDomain domain, + CUpti_CallbackId cbid) { + for (size_t i = 0; i < __enabled_cb_count; i++) { + if (__enabled_cbs[i].domain == domain && __enabled_cbs[i].cbid == cbid) { + return 1; + } + } + return 0; +} + CUptiResult cuptiEnableCallback(uint32_t enable, CUpti_SubscriberHandle subscriber, CUpti_CallbackDomain domain, @@ -52,6 +70,22 @@ CUptiResult cuptiEnableCallback(uint32_t enable, (void)subscriber; // Mark as intentionally unused fprintf(stderr, "[MOCK_CUPTI] cuptiEnableCallback(enable=%u, domain=%u, cbid=%u)\n", enable, domain, cbid); + if (enable) { + if (!__mock_cupti_is_callback_enabled(domain, cbid) && + __enabled_cb_count < MOCK_MAX_ENABLED) { + __enabled_cbs[__enabled_cb_count].domain = domain; + __enabled_cbs[__enabled_cb_count].cbid = cbid; + __enabled_cb_count++; + } + } else { + for (size_t i = 0; i < __enabled_cb_count; i++) { + if (__enabled_cbs[i].domain == domain && + __enabled_cbs[i].cbid == cbid) { + __enabled_cbs[i] = __enabled_cbs[--__enabled_cb_count]; + break; + } + } + } return CUPTI_SUCCESS; } diff --git a/test/test-pc-mock-graph.sh b/test/test-pc-mock-graph.sh new file mode 100755 index 0000000..5c9820e --- /dev/null +++ b/test/test-pc-mock-graph.sh @@ -0,0 +1,119 @@ +#!/bin/bash +# Graph-launch variant of test-pc-mock.sh: drives test_cupti_prof with +# --graph-rate so half the launches are CUDA graph launches (one correlation +# ID fanning out into many kernel activity records with a nonzero graphId). +# Verifies graph kernels are profiled (emitted) rather than filtered, i.e. the +# PC-sampling rewrite did not break graph-launch profiling. +# +# Prereqs: make local bpf-test ; run with: sudo -E test/test-pc-mock-graph.sh +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" + +LIB="$ROOT/build-local/lib/libparcagpucupti.so" +TEST_BIN="$ROOT/build-local/bin/test_cupti_prof" +BPF="$ROOT/test/bpf/activity_parser" +CUBIN="$ROOT/test/pc_sample_toy.cubin" +BPF_LOG="/tmp/parcagpu-graph-bpf.log" +TEST_LOG="/tmp/parcagpu-graph-test.log" + +for f in "$LIB" "$TEST_BIN" "$BPF" "$CUBIN"; do + [ -e "$f" ] || { echo "error: $f not found" >&2; exit 1; } +done + +cleanup() { + [ -n "${TEST_PID:-}" ] && kill "$TEST_PID" 2>/dev/null || true + [ -n "${BPF_PID:-}" ] && kill "$BPF_PID" 2>/dev/null || true + [ -n "${SEM_PID:-}" ] && kill "$SEM_PID" 2>/dev/null || true + wait 2>/dev/null || true +} +trap cleanup EXIT + +echo "=== Starting test_cupti_prof (mock, graph-rate=25 of launch-rate=50) ===" +LD_LIBRARY_PATH="$ROOT/build-local/lib:${LD_LIBRARY_PATH:-}" \ + PARCAGPU_DEBUG=1 \ + PARCAGPU_PC_SAMPLING_RATE=10000 \ + MOCK_CUBIN_PATH="$CUBIN" \ + "$TEST_BIN" "$LIB" --launch-rate=50 --graph-rate=25 --duration=12 > "$TEST_LOG" 2>&1 & +TEST_PID=$! +echo "test_cupti_prof PID: $TEST_PID" + +while kill -0 "$TEST_PID" 2>/dev/null && + ! grep -q libparcagpucupti "/proc/$TEST_PID/maps" 2>/dev/null; do + sleep 0.1 +done +if ! kill -0 "$TEST_PID" 2>/dev/null; then + echo "error: test_cupti_prof exited before library loaded" >&2 + cat "$TEST_LOG" >&2; exit 1 +fi + +# The activity_parser does NOT attach the cuda_correlation probe, but +# parcagpu's allocBuffer + callback path are gated on its USDT semaphore +# (PARCAGPU_CUDA_CORRELATION_ENABLED). In production parca-agent attaches it; +# here we bump that semaphore with bpftrace so the activity path is live. +echo "=== Bumping cuda_correlation semaphore via bpftrace ===" +bpftrace -p "$TEST_PID" -e "usdt:$LIB:parcagpu:cuda_correlation { @n = count(); } + usdt:$LIB:parcagpu:cuda_correlation /(int32)arg1 == -514 || (int32)arg1 == -515/ { @driver_graph = count(); }" \ + > /tmp/parcagpu-graph-sem.log 2>&1 & +SEM_PID=$! +sleep 2 # let bpftrace attach + set the semaphore before launches ramp + +echo "=== Starting BPF activity parser ===" +"$BPF" -pid "$TEST_PID" -lib "$LIB" > "$BPF_LOG" 2>&1 & +BPF_PID=$! +echo "activity_parser PID: $BPF_PID" + +wait "$TEST_PID" 2>/dev/null || true +TEST_PID="" +sleep 2 +kill "$BPF_PID" 2>/dev/null || true +wait "$BPF_PID" 2>/dev/null || true +BPF_PID="" +kill "$SEM_PID" 2>/dev/null || true +wait "$SEM_PID" 2>/dev/null || true +SEM_PID="" +echo "=== bpftrace cuda_correlation hit count ===" +grep -E "@n:" /tmp/parcagpu-graph-sem.log || cat /tmp/parcagpu-graph-sem.log | tail -3 || true + +echo +echo "=== graph map inserts (parcagpu debug, first 5) ===" +grep "into graph map" "$TEST_LOG" | head -5 || true +echo "=== graph map insert count ===" +grep -c "into graph map" "$TEST_LOG" || true +echo "=== filtered graph activities (graphId nonzero) count ===" +grep "Filtered kernel activity" "$TEST_LOG" | grep -vE "graphId=0\b" | grep -cE "graphId=[1-9]" || true +echo +echo "=== BPF: sample graph kernel events (graph= nonzero) ===" +grep -E "^kernel: " "$BPF_LOG" | grep -vE "graph=0\b" | grep -E "graph=[1-9]" | head -8 || true +echo "=== BPF: graph kernel-event count (graph= nonzero) ===" +grep -E "^kernel: " "$BPF_LOG" | grep -vE "graph=0\b" | grep -cE "graph=[1-9]" || true +echo "=== BPF: regular kernel-event count (graph=0) ===" +grep -cE "^kernel: .*graph=0\b" "$BPF_LOG" || true +echo "=== BPF summary line ===" +grep -E "kernels_found|events_received" "$BPF_LOG" | tail -2 || true +echo + +# --- Checks --- +PASS=true +check() { + if eval "$2"; then echo "PASS: $1"; else echo "FAIL: $1" >&2; PASS=false; fi +} +GRAPH_INSERTS=$(grep -c "into graph map" "$TEST_LOG" || true) +# Authoritative signal: graph kernel activities that PASSED the correlation +# filter and were emitted (PARCAGPU_KERNEL_EXECUTED + activity_batch). The BPF +# per-kernel "kernel:" line is verbosity-gated (-v), so we count emits on the +# parcagpu side instead, and confirm the consumer received events. +GRAPH_EMITTED=$(grep "Kernel activity:" "$TEST_LOG" | grep -cE "graphId=[1-9]" || true) +BPF_RECEIVED=$(grep -oE "events_received=[0-9]+" "$BPF_LOG" | grep -oE "[0-9]+" | sort -n | tail -1 || echo 0) +# Driver cuGraphLaunch correlation events (signed cbid -514/-515). Zero means the +# driver graph-launch cbid wasn't subscribed; the runtime half emits regardless, +# so GRAPH_EMITTED>0 alone wouldn't catch it. +DRIVER_GRAPH=$(grep -oE "@driver_graph: [0-9]+" /tmp/parcagpu-graph-sem.log | grep -oE "[0-9]+" || echo 0) + +check "graph launches inserted into graph map" "[ \"${GRAPH_INSERTS:-0}\" -gt 0 ]" +check "graph kernel activities emitted (not filtered)" "[ \"${GRAPH_EMITTED:-0}\" -gt 0 ]" +check "BPF consumer received kernel events" "[ \"${BPF_RECEIVED:-0}\" -gt 0 ]" +check "driver cuGraphLaunch correlated (cbid subscribed)" "[ \"${DRIVER_GRAPH:-0}\" -gt 0 ]" + +if $PASS; then echo; echo "=== GRAPH PROFILING VERIFIED ==="; else echo; echo "=== GRAPH PROFILING FAILED ===" >&2; exit 1; fi diff --git a/test/test_cupti_prof.c b/test/test_cupti_prof.c index 9d08a3b..c1a6589 100644 --- a/test/test_cupti_prof.c +++ b/test/test_cupti_prof.c @@ -294,6 +294,20 @@ static void (*parcagpuCuptiCallback)(void *userdata, CUpti_CallbackDomain domain // Defined in mock_cupti.c — feeds correlation IDs to cuptiPCSamplingGetData. extern void __mock_pc_enqueue_correlation(uint32_t correlation_id); +// Defined in mock_cupti.c — whether a (domain, cbid) was subscribed. +extern int __mock_cupti_is_callback_enabled(CUpti_CallbackDomain domain, + CUpti_CallbackId cbid); + +// Dispatch to the shim only if subscribed, mirroring real CUPTI gating. +static void fire_cupti_callback(CUpti_CallbackDomain domain, + CUpti_CallbackId cbid, + const CUpti_CallbackData *cbdata) { + if (!__mock_cupti_is_callback_enabled(domain, cbid)) { + return; + } + parcagpuCuptiCallback(NULL, domain, cbid, cbdata); +} + //============================================================================= // Launched Kernels Queue // Track which correlation IDs have had their callbacks executed @@ -385,12 +399,12 @@ NOINLINE CUresult cuLaunchKernel(CUfunction f, unsigned int gridDimX, unsigned i // DRIVER ENTER callback tls_cbdata.callbackSite = CUPTI_API_ENTER; tls_cbdata.correlationId = tls_correlation_id; - parcagpuCuptiCallback(NULL, CUPTI_CB_DOMAIN_DRIVER_API, + fire_cupti_callback(CUPTI_CB_DOMAIN_DRIVER_API, CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel, &tls_cbdata); // DRIVER EXIT callback tls_cbdata.callbackSite = CUPTI_API_EXIT; - parcagpuCuptiCallback(NULL, CUPTI_CB_DOMAIN_DRIVER_API, + fire_cupti_callback(CUPTI_CB_DOMAIN_DRIVER_API, CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel, &tls_cbdata); return CUDA_SUCCESS; @@ -403,12 +417,12 @@ NOINLINE CUresult cuGraphLaunch(CUgraphExec hGraphExec, CUstream hStream) { // DRIVER ENTER callback tls_cbdata.callbackSite = CUPTI_API_ENTER; tls_cbdata.correlationId = tls_correlation_id; - parcagpuCuptiCallback(NULL, CUPTI_CB_DOMAIN_DRIVER_API, + fire_cupti_callback(CUPTI_CB_DOMAIN_DRIVER_API, CUPTI_DRIVER_TRACE_CBID_cuGraphLaunch, &tls_cbdata); // DRIVER EXIT callback tls_cbdata.callbackSite = CUPTI_API_EXIT; - parcagpuCuptiCallback(NULL, CUPTI_CB_DOMAIN_DRIVER_API, + fire_cupti_callback(CUPTI_CB_DOMAIN_DRIVER_API, CUPTI_DRIVER_TRACE_CBID_cuGraphLaunch, &tls_cbdata); // Enqueue for activity generation @@ -427,7 +441,7 @@ NOINLINE cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 block // RUNTIME ENTER callback tls_cbdata.callbackSite = CUPTI_API_ENTER; tls_cbdata.correlationId = tls_correlation_id; - parcagpuCuptiCallback(NULL, CUPTI_CB_DOMAIN_RUNTIME_API, + fire_cupti_callback(CUPTI_CB_DOMAIN_RUNTIME_API, CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_v7000, &tls_cbdata); // Runtime internally calls driver - call through cuLaunchKernel so it appears in stack @@ -435,7 +449,7 @@ NOINLINE cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 block // RUNTIME EXIT callback tls_cbdata.callbackSite = CUPTI_API_EXIT; - parcagpuCuptiCallback(NULL, CUPTI_CB_DOMAIN_RUNTIME_API, + fire_cupti_callback(CUPTI_CB_DOMAIN_RUNTIME_API, CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_v7000, &tls_cbdata); return cudaSuccess; @@ -448,22 +462,22 @@ NOINLINE cudaError_t cudaGraphLaunch(cudaGraphExec_t graphExec, cudaStream_t str // RUNTIME ENTER callback tls_cbdata.callbackSite = CUPTI_API_ENTER; tls_cbdata.correlationId = tls_correlation_id; - parcagpuCuptiCallback(NULL, CUPTI_CB_DOMAIN_RUNTIME_API, + fire_cupti_callback(CUPTI_CB_DOMAIN_RUNTIME_API, CUPTI_RUNTIME_TRACE_CBID_cudaGraphLaunch_v10000, &tls_cbdata); // Runtime internally calls driver - use inline driver callback with cuGraphLaunch cbid // (We don't call cuGraphLaunch here to avoid double-queueing) tls_cbdata.callbackSite = CUPTI_API_ENTER; - parcagpuCuptiCallback(NULL, CUPTI_CB_DOMAIN_DRIVER_API, + fire_cupti_callback(CUPTI_CB_DOMAIN_DRIVER_API, CUPTI_DRIVER_TRACE_CBID_cuGraphLaunch, &tls_cbdata); tls_cbdata.callbackSite = CUPTI_API_EXIT; - parcagpuCuptiCallback(NULL, CUPTI_CB_DOMAIN_DRIVER_API, + fire_cupti_callback(CUPTI_CB_DOMAIN_DRIVER_API, CUPTI_DRIVER_TRACE_CBID_cuGraphLaunch, &tls_cbdata); // RUNTIME EXIT callback tls_cbdata.callbackSite = CUPTI_API_EXIT; - parcagpuCuptiCallback(NULL, CUPTI_CB_DOMAIN_RUNTIME_API, + fire_cupti_callback(CUPTI_CB_DOMAIN_RUNTIME_API, CUPTI_RUNTIME_TRACE_CBID_cudaGraphLaunch_v10000, &tls_cbdata); // Enqueue for activity generation From f0fbd49245b7c68c6efe413124e8deafc4a4d471 Mon Sep 17 00:00:00 2001 From: Tommy Reilly Date: Fri, 26 Jun 2026 09:45:32 -0500 Subject: [PATCH 2/2] More fixes/tests from spark testing --- Makefile | 9 +-- src/cupti.cpp | 18 +++--- src/env_config.cpp | 11 ++++ src/pc_sampling.cpp | 39 ++++++++++-- src/pc_sampling.h | 1 + test/test-pc-mock-graph.sh | 119 ------------------------------------- test/test-pc-mock.sh | 62 ++++++++++++++++++- 7 files changed, 118 insertions(+), 141 deletions(-) delete mode 100755 test/test-pc-mock-graph.sh diff --git a/Makefile b/Makefile index ffe56ab..22a2db3 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -.PHONY: all clean test build-amd64 build-arm64 build-all cross docker-push docker-test-build docker-test-run format local debug generate bpf-test microbenchmarks test-multi test-pc-real test-pc-mock test-pc-mock-graph +.PHONY: all clean test build-amd64 build-arm64 build-all cross docker-push docker-test-build docker-test-run format local debug generate bpf-test microbenchmarks test-multi test-pc-real test-pc-mock LIB_NAME = libparcagpucupti.so @@ -159,14 +159,11 @@ test-multi: local bpf-test test-pc-real: local bpf-test microbenchmarks sudo -E test/test-pc-real.sh -# Mock PC sampling test — no GPU required, uses mock CUPTI/CUDA. +# Mock PC sampling test — no GPU required, uses mock CUPTI/CUDA. Also covers +# graph launches (guards driver cuGraphLaunch CBID subscription). test-pc-mock: local bpf-test sudo -E test/test-pc-mock.sh -# Mock graph-launch test — guards driver cuGraphLaunch CBID subscription. No GPU. -test-pc-mock-graph: local bpf-test - sudo -E test/test-pc-mock-graph.sh - format: @echo "=== Formatting source files ===" @clang-format -i -style=file src/*.cpp src/*.h test/*.c diff --git a/src/cupti.cpp b/src/cupti.cpp index 234158e..2e4121f 100644 --- a/src/cupti.cpp +++ b/src/cupti.cpp @@ -263,10 +263,10 @@ class CuptiProfiler : public proton::Singleton { // Driver-API graph launches (cuGraphLaunch/_ptsz); not covered by setLaunch/ // setRuntimeCallbacks. Only these two cbids -- not setGraphCallbacks(), which // also subscribes capture cbids that emit uncorrelated events. - cuptiEnableCallback(1, subscriber, CUPTI_CB_DOMAIN_DRIVER_API, - CUPTI_DRIVER_TRACE_CBID_cuGraphLaunch); - cuptiEnableCallback(1, subscriber, CUPTI_CB_DOMAIN_DRIVER_API, - CUPTI_DRIVER_TRACE_CBID_cuGraphLaunch_ptsz); + proton::cupti::enableCallback(1, subscriber, CUPTI_CB_DOMAIN_DRIVER_API, + CUPTI_DRIVER_TRACE_CBID_cuGraphLaunch); + proton::cupti::enableCallback(1, subscriber, CUPTI_CB_DOMAIN_DRIVER_API, + CUPTI_DRIVER_TRACE_CBID_cuGraphLaunch_ptsz); // Enable resource callbacks only if PC sampling is enabled if (pcSamplingEnabled) { @@ -314,10 +314,12 @@ class CuptiProfiler : public proton::Singleton { proton::setRuntimeCallbacks(subscriber, /*enable=*/false); proton::setLaunchCallbacks(subscriber, /*enable=*/false); // Mirror the graph-launch callbacks enabled in init(). - cuptiEnableCallback(0, subscriber, CUPTI_CB_DOMAIN_DRIVER_API, - CUPTI_DRIVER_TRACE_CBID_cuGraphLaunch); - cuptiEnableCallback(0, subscriber, CUPTI_CB_DOMAIN_DRIVER_API, - CUPTI_DRIVER_TRACE_CBID_cuGraphLaunch_ptsz); + proton::cupti::enableCallback(0, subscriber, + CUPTI_CB_DOMAIN_DRIVER_API, + CUPTI_DRIVER_TRACE_CBID_cuGraphLaunch); + proton::cupti::enableCallback( + 0, subscriber, CUPTI_CB_DOMAIN_DRIVER_API, + CUPTI_DRIVER_TRACE_CBID_cuGraphLaunch_ptsz); if (pcSamplingEnabled) { proton::setResourceCallbacks(subscriber, /*enable=*/false); } diff --git a/src/env_config.cpp b/src/env_config.cpp index 93ca87b..acc9af7 100644 --- a/src/env_config.cpp +++ b/src/env_config.cpp @@ -18,6 +18,7 @@ static const char *knownVars[] = { "PARCAGPU_RATE_LIMIT", "PARCAGPU_SAMPLING_FACTOR", "PARCAGPU_PC_SAMPLING_RATE", + "PARCAGPU_PC_HW_BUFFER_MB", }; static constexpr size_t numKnownVars = sizeof(knownVars) / sizeof(knownVars[0]); @@ -88,6 +89,16 @@ void validateEnvVars() { "env_config: PARCAGPU_PC_SAMPLING_RATE invalid"); } } + + val = std::getenv("PARCAGPU_PC_HW_BUFFER_MB"); + if (val) { + int mb = std::atoi(val); + if (mb < 0) { + DEBUG_PRINTF("[PARCAGPU] Warning: PARCAGPU_PC_HW_BUFFER_MB=%s invalid " + "(must be >= 0), ignoring\n", val); + fireError(0, val, "env_config: PARCAGPU_PC_HW_BUFFER_MB invalid"); + } + } } } // namespace parcagpu diff --git a/src/pc_sampling.cpp b/src/pc_sampling.cpp index cf37b18..85485a6 100644 --- a/src/pc_sampling.cpp +++ b/src/pc_sampling.cpp @@ -347,12 +347,23 @@ CUpti_PCSamplingConfigurationInfo ConfigureData::configureScratchBuffer() { return scratchBufferInfo; } +// MB to force for the PC-sampling hardware buffer, from PARCAGPU_PC_HW_BUFFER_MB. +// 0 (unset/invalid/negative) means "leave at CUPTI default" — see the note in +// ConfigureData::initialize() for why we don't force it by default. +int pcHardwareBufferMB() { + const char *env = getenv("PARCAGPU_PC_HW_BUFFER_MB"); + if (!env) + return 0; + int mb = atoi(env); + return mb > 0 ? mb : 0; +} + CUpti_PCSamplingConfigurationInfo ConfigureData::configureHardwareBufferSize() { CUpti_PCSamplingConfigurationInfo hardwareBufferInfo{}; hardwareBufferInfo.attributeType = CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_HARDWARE_BUFFER_SIZE; hardwareBufferInfo.attributeData.hardwareBufferSizeData.hardwareBufferSize = - HardwareBufferSize; + (size_t)hardwareBufferMB * 1024 * 1024; return hardwareBufferInfo; } @@ -381,15 +392,33 @@ void ConfigureData::initialize(CUcontext context) { DEBUG_PRINTF("Initializing PC sampling for context %p (id %u)\n", context, contextId); + hardwareBufferMB = pcHardwareBufferMB(); + configurationInfos.emplace_back(configureStallReasons()); configurationInfos.emplace_back(configureCollectionMode()); configurationInfos.emplace_back(configureStartStopControl()); configurationInfos.emplace_back(configureSamplingBuffer()); - // Bigger scratch + hardware buffers so a busy workload (PyTorch-class) - // doesn't overflow CUPTI's defaults within minutes and start returning - // CUPTI_ERROR_OUT_OF_MEMORY from cuptiPCSamplingGetData. + // Bigger scratch buffer so a busy workload (PyTorch-class) doesn't overflow + // CUPTI's default within minutes and start returning CUPTI_ERROR_OUT_OF_MEMORY + // from cuptiPCSamplingGetData. configurationInfos.emplace_back(configureScratchBuffer()); - configurationInfos.emplace_back(configureHardwareBufferSize()); + // By default do NOT force the hardware buffer size. On Blackwell (sm_120/ + // sm_121, e.g. GB10) under live injection + activity recording, setting + // CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_HARDWARE_BUFFER_SIZE makes every + // cuptiPCSamplingGetData return CUPTI_ERROR_OUT_OF_MEMORY (0 PCs drained) — + // confirmed independent of the value (even 16 MB fails) and not reproducible + // in a standalone CUPTI sample with the identical attribute set. CUPTI's + // default hardware buffer works; if it overflows it reports droppedSamples + // rather than wedging GetData. + // + // Escape hatch: set PARCAGPU_PC_HW_BUFFER_MB= to force an n-MB hardware + // buffer (0 = leave at CUPTI default, same as unset). Lets ops re-enable / + // tune it on GPUs where the larger buffer helps busy workloads. + if (hardwareBufferMB > 0) { + DEBUG_PRINTF("[PARCAGPU] Forcing PC hardware buffer = %d MB " + "(PARCAGPU_PC_HW_BUFFER_MB)\n", hardwareBufferMB); + configurationInfos.emplace_back(configureHardwareBufferSize()); + } // Don't set sampling period — let CUPTI use its default. // Explicit period values silently break sampling on some GPUs (e.g. // Blackwell). diff --git a/src/pc_sampling.h b/src/pc_sampling.h index dbb98c3..9dba0e7 100644 --- a/src/pc_sampling.h +++ b/src/pc_sampling.h @@ -87,6 +87,7 @@ struct ConfigureData { uint32_t contextId; uint32_t numStallReasons{}; uint32_t numValidStallReasons{}; + int hardwareBufferMB{}; // PARCAGPU_PC_HW_BUFFER_MB; 0 = CUPTI default char **stallReasonNames{}; uint32_t *stallReasonIndices{}; std::map stallReasonIndexToMetricIndex{}; diff --git a/test/test-pc-mock-graph.sh b/test/test-pc-mock-graph.sh deleted file mode 100755 index 5c9820e..0000000 --- a/test/test-pc-mock-graph.sh +++ /dev/null @@ -1,119 +0,0 @@ -#!/bin/bash -# Graph-launch variant of test-pc-mock.sh: drives test_cupti_prof with -# --graph-rate so half the launches are CUDA graph launches (one correlation -# ID fanning out into many kernel activity records with a nonzero graphId). -# Verifies graph kernels are profiled (emitted) rather than filtered, i.e. the -# PC-sampling rewrite did not break graph-launch profiling. -# -# Prereqs: make local bpf-test ; run with: sudo -E test/test-pc-mock-graph.sh -set -euo pipefail - -SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" -ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" - -LIB="$ROOT/build-local/lib/libparcagpucupti.so" -TEST_BIN="$ROOT/build-local/bin/test_cupti_prof" -BPF="$ROOT/test/bpf/activity_parser" -CUBIN="$ROOT/test/pc_sample_toy.cubin" -BPF_LOG="/tmp/parcagpu-graph-bpf.log" -TEST_LOG="/tmp/parcagpu-graph-test.log" - -for f in "$LIB" "$TEST_BIN" "$BPF" "$CUBIN"; do - [ -e "$f" ] || { echo "error: $f not found" >&2; exit 1; } -done - -cleanup() { - [ -n "${TEST_PID:-}" ] && kill "$TEST_PID" 2>/dev/null || true - [ -n "${BPF_PID:-}" ] && kill "$BPF_PID" 2>/dev/null || true - [ -n "${SEM_PID:-}" ] && kill "$SEM_PID" 2>/dev/null || true - wait 2>/dev/null || true -} -trap cleanup EXIT - -echo "=== Starting test_cupti_prof (mock, graph-rate=25 of launch-rate=50) ===" -LD_LIBRARY_PATH="$ROOT/build-local/lib:${LD_LIBRARY_PATH:-}" \ - PARCAGPU_DEBUG=1 \ - PARCAGPU_PC_SAMPLING_RATE=10000 \ - MOCK_CUBIN_PATH="$CUBIN" \ - "$TEST_BIN" "$LIB" --launch-rate=50 --graph-rate=25 --duration=12 > "$TEST_LOG" 2>&1 & -TEST_PID=$! -echo "test_cupti_prof PID: $TEST_PID" - -while kill -0 "$TEST_PID" 2>/dev/null && - ! grep -q libparcagpucupti "/proc/$TEST_PID/maps" 2>/dev/null; do - sleep 0.1 -done -if ! kill -0 "$TEST_PID" 2>/dev/null; then - echo "error: test_cupti_prof exited before library loaded" >&2 - cat "$TEST_LOG" >&2; exit 1 -fi - -# The activity_parser does NOT attach the cuda_correlation probe, but -# parcagpu's allocBuffer + callback path are gated on its USDT semaphore -# (PARCAGPU_CUDA_CORRELATION_ENABLED). In production parca-agent attaches it; -# here we bump that semaphore with bpftrace so the activity path is live. -echo "=== Bumping cuda_correlation semaphore via bpftrace ===" -bpftrace -p "$TEST_PID" -e "usdt:$LIB:parcagpu:cuda_correlation { @n = count(); } - usdt:$LIB:parcagpu:cuda_correlation /(int32)arg1 == -514 || (int32)arg1 == -515/ { @driver_graph = count(); }" \ - > /tmp/parcagpu-graph-sem.log 2>&1 & -SEM_PID=$! -sleep 2 # let bpftrace attach + set the semaphore before launches ramp - -echo "=== Starting BPF activity parser ===" -"$BPF" -pid "$TEST_PID" -lib "$LIB" > "$BPF_LOG" 2>&1 & -BPF_PID=$! -echo "activity_parser PID: $BPF_PID" - -wait "$TEST_PID" 2>/dev/null || true -TEST_PID="" -sleep 2 -kill "$BPF_PID" 2>/dev/null || true -wait "$BPF_PID" 2>/dev/null || true -BPF_PID="" -kill "$SEM_PID" 2>/dev/null || true -wait "$SEM_PID" 2>/dev/null || true -SEM_PID="" -echo "=== bpftrace cuda_correlation hit count ===" -grep -E "@n:" /tmp/parcagpu-graph-sem.log || cat /tmp/parcagpu-graph-sem.log | tail -3 || true - -echo -echo "=== graph map inserts (parcagpu debug, first 5) ===" -grep "into graph map" "$TEST_LOG" | head -5 || true -echo "=== graph map insert count ===" -grep -c "into graph map" "$TEST_LOG" || true -echo "=== filtered graph activities (graphId nonzero) count ===" -grep "Filtered kernel activity" "$TEST_LOG" | grep -vE "graphId=0\b" | grep -cE "graphId=[1-9]" || true -echo -echo "=== BPF: sample graph kernel events (graph= nonzero) ===" -grep -E "^kernel: " "$BPF_LOG" | grep -vE "graph=0\b" | grep -E "graph=[1-9]" | head -8 || true -echo "=== BPF: graph kernel-event count (graph= nonzero) ===" -grep -E "^kernel: " "$BPF_LOG" | grep -vE "graph=0\b" | grep -cE "graph=[1-9]" || true -echo "=== BPF: regular kernel-event count (graph=0) ===" -grep -cE "^kernel: .*graph=0\b" "$BPF_LOG" || true -echo "=== BPF summary line ===" -grep -E "kernels_found|events_received" "$BPF_LOG" | tail -2 || true -echo - -# --- Checks --- -PASS=true -check() { - if eval "$2"; then echo "PASS: $1"; else echo "FAIL: $1" >&2; PASS=false; fi -} -GRAPH_INSERTS=$(grep -c "into graph map" "$TEST_LOG" || true) -# Authoritative signal: graph kernel activities that PASSED the correlation -# filter and were emitted (PARCAGPU_KERNEL_EXECUTED + activity_batch). The BPF -# per-kernel "kernel:" line is verbosity-gated (-v), so we count emits on the -# parcagpu side instead, and confirm the consumer received events. -GRAPH_EMITTED=$(grep "Kernel activity:" "$TEST_LOG" | grep -cE "graphId=[1-9]" || true) -BPF_RECEIVED=$(grep -oE "events_received=[0-9]+" "$BPF_LOG" | grep -oE "[0-9]+" | sort -n | tail -1 || echo 0) -# Driver cuGraphLaunch correlation events (signed cbid -514/-515). Zero means the -# driver graph-launch cbid wasn't subscribed; the runtime half emits regardless, -# so GRAPH_EMITTED>0 alone wouldn't catch it. -DRIVER_GRAPH=$(grep -oE "@driver_graph: [0-9]+" /tmp/parcagpu-graph-sem.log | grep -oE "[0-9]+" || echo 0) - -check "graph launches inserted into graph map" "[ \"${GRAPH_INSERTS:-0}\" -gt 0 ]" -check "graph kernel activities emitted (not filtered)" "[ \"${GRAPH_EMITTED:-0}\" -gt 0 ]" -check "BPF consumer received kernel events" "[ \"${BPF_RECEIVED:-0}\" -gt 0 ]" -check "driver cuGraphLaunch correlated (cbid subscribed)" "[ \"${DRIVER_GRAPH:-0}\" -gt 0 ]" - -if $PASS; then echo; echo "=== GRAPH PROFILING VERIFIED ==="; else echo; echo "=== GRAPH PROFILING FAILED ===" >&2; exit 1; fi diff --git a/test/test-pc-mock.sh b/test/test-pc-mock.sh index 8d4b888..1e14bff 100755 --- a/test/test-pc-mock.sh +++ b/test/test-pc-mock.sh @@ -3,8 +3,15 @@ # with BPF activity parser. Verifies stall reason map, PC samples, and # cubin loading WITHOUT requiring a real GPU. # +# Also drives a mix of CUDA graph launches (--graph-rate): half the launches +# fan one correlation ID out into many kernel activity records with a nonzero +# graphId. We verify those graph kernels are profiled (emitted) rather than +# filtered, and that the driver cuGraphLaunch cbid is subscribed — i.e. the +# PC-sampling rewrite did not break graph-launch profiling. +# # Prerequisites: # make local bpf-test +# bpftrace (used to bump the cuda_correlation USDT semaphore; see below) # # Usage: # sudo -E test/test-pc-mock.sh # default @@ -21,6 +28,7 @@ BPF="$ROOT/test/bpf/activity_parser" CUBIN="$ROOT/test/pc_sample_toy.cubin" BPF_LOG="/tmp/parcagpu-pc-mock-bpf.log" TEST_LOG="/tmp/parcagpu-pc-mock-test.log" +SEM_LOG="/tmp/parcagpu-pc-mock-sem.log" VERBOSE="" for arg in "$@"; do @@ -36,10 +44,12 @@ for f in "$LIB" "$TEST_BIN" "$BPF" "$CUBIN"; do exit 1 fi done +command -v bpftrace >/dev/null || { echo "error: bpftrace not found" >&2; exit 1; } cleanup() { [ -n "${TEST_PID:-}" ] && kill "$TEST_PID" 2>/dev/null || true [ -n "${BPF_PID:-}" ] && kill "$BPF_PID" 2>/dev/null || true + [ -n "${SEM_PID:-}" ] && kill "$SEM_PID" 2>/dev/null || true wait 2>/dev/null || true } trap cleanup EXIT @@ -49,12 +59,14 @@ trap cleanup EXIT # instead of real libcupti.so / libcuda.so. # High target rate keeps the controller's probability near 1 throughout the # short mock run (otherwise it would converge below 1 once samples flow). -echo "=== Starting test_cupti_prof (mock) ===" +# --graph-rate=25 of --launch-rate=50: half the launches are CUDA graph +# launches (one correlation ID fanning out into many kernel records). +echo "=== Starting test_cupti_prof (mock, graph-rate=25 of launch-rate=50) ===" LD_LIBRARY_PATH="$ROOT/build-local/lib:${LD_LIBRARY_PATH:-}" \ PARCAGPU_DEBUG=1 \ PARCAGPU_PC_SAMPLING_RATE=10000 \ MOCK_CUBIN_PATH="$CUBIN" \ - "$TEST_BIN" "$LIB" --launch-rate=50 --duration=15 > "$TEST_LOG" 2>&1 & + "$TEST_BIN" "$LIB" --launch-rate=50 --graph-rate=25 --duration=15 > "$TEST_LOG" 2>&1 & TEST_PID=$! echo "test_cupti_prof PID: $TEST_PID" @@ -70,6 +82,17 @@ if ! kill -0 "$TEST_PID" 2>/dev/null; then exit 1 fi +# The activity_parser does NOT attach the cuda_correlation probe, but +# parcagpu's allocBuffer + callback path are gated on its USDT semaphore +# (PARCAGPU_CUDA_CORRELATION_ENABLED). In production parca-agent attaches it; +# here we bump that semaphore with bpftrace so the activity path is live. +echo "=== Bumping cuda_correlation semaphore via bpftrace ===" +bpftrace -p "$TEST_PID" -e "usdt:$LIB:parcagpu:cuda_correlation { @n = count(); } + usdt:$LIB:parcagpu:cuda_correlation /(int32)arg1 == -514 || (int32)arg1 == -515/ { @driver_graph = count(); }" \ + > "$SEM_LOG" 2>&1 & +SEM_PID=$! +sleep 2 # let bpftrace attach + set the semaphore before launches ramp + # --- Attach BPF parser --- echo "=== Starting BPF activity parser ===" "$BPF" -pid "$TEST_PID" -lib "$LIB" $VERBOSE > "$BPF_LOG" 2>&1 & @@ -81,10 +104,13 @@ wait "$TEST_PID" 2>/dev/null || true TEST_PID="" sleep 2 -# --- Stop BPF parser --- +# --- Stop BPF parser + semaphore bumper --- kill "$BPF_PID" 2>/dev/null || true wait "$BPF_PID" 2>/dev/null || true BPF_PID="" +kill "$SEM_PID" 2>/dev/null || true +wait "$SEM_PID" 2>/dev/null || true +SEM_PID="" # --- Results --- echo @@ -94,10 +120,16 @@ echo echo "=== BPF parser output ===" cat "$BPF_LOG" echo +echo "=== bpftrace cuda_correlation hit count ===" +grep -E "@n:" "$SEM_LOG" || tail -3 "$SEM_LOG" || true +echo "=== graph map insert count (parcagpu debug) ===" +grep -c "into graph map" "$TEST_LOG" || true +echo # --- Checks --- PASS=true +# grep-based check: PASS if PATTERN is present in FILE. check() { local label="$1" pattern="$2" file="$3" if grep -q "$pattern" "$file"; then @@ -108,6 +140,12 @@ check() { fi } +# expression-based check: PASS if the shell test EXPR succeeds. +check_expr() { + if eval "$2"; then echo "PASS: $1"; else echo "FAIL: $1" >&2; PASS=false; fi +} + +# PC sampling + cubin + source correlation (no GPU). check "PC sampling initialized" "PC sampling initialized" "$TEST_LOG" check "real cubin loaded (mock)" "Loaded cubin.*pc_sample_toy" "$TEST_LOG" check "modules loaded (parcagpu)" "Module 0x.*loaded" "$TEST_LOG" @@ -119,6 +157,24 @@ check "PC samples contain stall reasons" "smsp__pcsamp" "$BPF_LOG" check "cubins loaded (bpf)" "\[CUBIN\].*loaded" "$BPF_LOG" check "PC sample events received" "pc_samples=[1-9]" "$BPF_LOG" +# Graph-launch profiling. +GRAPH_INSERTS=$(grep -c "into graph map" "$TEST_LOG" || true) +# Authoritative signal: graph kernel activities that PASSED the correlation +# filter and were emitted (PARCAGPU_KERNEL_EXECUTED + activity_batch). The BPF +# per-kernel "kernel:" line is verbosity-gated (-v), so we count emits on the +# parcagpu side instead, and confirm the consumer received events. +GRAPH_EMITTED=$(grep "Kernel activity:" "$TEST_LOG" | grep -cE "graphId=[1-9]" || true) +BPF_RECEIVED=$(grep -oE "events_received=[0-9]+" "$BPF_LOG" | grep -oE "[0-9]+" | sort -n | tail -1 || echo 0) +# Driver cuGraphLaunch correlation events (signed cbid -514/-515). Zero means the +# driver graph-launch cbid wasn't subscribed; the runtime half emits regardless, +# so GRAPH_EMITTED>0 alone wouldn't catch it. +DRIVER_GRAPH=$(grep -oE "@driver_graph: [0-9]+" "$SEM_LOG" | grep -oE "[0-9]+" || echo 0) + +check_expr "graph launches inserted into graph map" "[ \"${GRAPH_INSERTS:-0}\" -gt 0 ]" +check_expr "graph kernel activities emitted (not filtered)" "[ \"${GRAPH_EMITTED:-0}\" -gt 0 ]" +check_expr "BPF consumer received kernel events" "[ \"${BPF_RECEIVED:-0}\" -gt 0 ]" +check_expr "driver cuGraphLaunch correlated (cbid subscribed)" "[ \"${DRIVER_GRAPH:-0}\" -gt 0 ]" + if $PASS; then echo echo "=== ALL CHECKS PASSED ==="