From 633dc6b99c70d6fb239c887a91fa27797afbb9c1 Mon Sep 17 00:00:00 2001
From: Tommy Reilly <gnurizen@gmail.com>
Date: Fri, 26 Jun 2026 09:15:52 -0400
Subject: [PATCH 1/2] Add callbacks for driver cuGraphLaunch

parcagpu subscribed CUPTI callbacks for eager launches (cuLaunchKernel) and
runtime graph launches (cudaGraphLaunch), but not the driver-API cuGraphLaunch /
cuGraphLaunch_ptsz. C++ runtimes like TensorRT-LLM replay CUDA graphs through the
driver API, so the cuda_correlation USDT never fired in graph mode and no GPU
samples were produced, while eager launches kept working (confirmed on a 4xB200
TRT-LLM node).

Fix: enable the two driver graph-launch cbids in CuptiProfiler::initialize (with
symmetric teardown). Not setGraphCallbacks(), which also subscribes capture cbids
that would emit correlation events for non-executing capture calls.

Tests: the mock harness dispatched callbacks unconditionally, hiding the missing
subscription. mock_cupti now records the subscribed (domain,cbid) set and the
harness only dispatches subscribed callbacks. test-pc-mock-graph (new make
target) asserts driver cuGraphLaunch correlation events fire; graph_repro.cu /
graph-repro-real.sh add a real-GPU reproducer. Verified the guard goes red
without the fix and green with it.
---
 Makefile                   |   6 +-
 src/cupti.cpp              |  13 ++++
 test/graph-repro-real.sh   |  55 +++++++++++++++++
 test/graph_repro.cu        |  94 +++++++++++++++++++++++++++++
 test/mock_cupti.c          |  34 +++++++++++
 test/test-pc-mock-graph.sh | 119 +++++++++++++++++++++++++++++++++++++
 test/test_cupti_prof.c     |  34 +++++++----
 7 files changed, 344 insertions(+), 11 deletions(-)
 create mode 100755 test/graph-repro-real.sh
 create mode 100644 test/graph_repro.cu
 create mode 100755 test/test-pc-mock-graph.sh
diff --git a/Makefile b/Makefile
index 3317417..ffe56ab 100644
--- a/Makefile
+++ b/Makefile
@@ -1,4 +1,4 @@
-.PHONY: all clean test build-amd64 build-arm64 build-all cross docker-push docker-test-build docker-test-run format local debug generate bpf-test microbenchmarks test-multi test-pc-real test-pc-mock
+.PHONY: all clean test build-amd64 build-arm64 build-all cross docker-push docker-test-build docker-test-run format local debug generate bpf-test microbenchmarks test-multi test-pc-real test-pc-mock test-pc-mock-graph
 
 LIB_NAME = libparcagpucupti.so
 
@@ -163,6 +163,10 @@ test-pc-real: local bpf-test microbenchmarks
 test-pc-mock: local bpf-test
 	sudo -E test/test-pc-mock.sh
 
+# Mock graph-launch test — guards driver cuGraphLaunch CBID subscription. No GPU.
+test-pc-mock-graph: local bpf-test
+	sudo -E test/test-pc-mock-graph.sh
+
 format:
 	@echo "=== Formatting source files ==="
 	@clang-format -i -style=file src/*.cpp src/*.h test/*.c
diff --git a/src/cupti.cpp b/src/cupti.cpp
index 1e1dd8d..234158e 100644
--- a/src/cupti.cpp
+++ b/src/cupti.cpp
@@ -260,6 +260,14 @@ class CuptiProfiler : public proton::Singleton<CuptiProfiler> {
     proton::setRuntimeCallbacks(subscriber, /*enable=*/true);
     proton::setLaunchCallbacks(subscriber, /*enable=*/true);
 
+    // Driver-API graph launches (cuGraphLaunch/_ptsz); not covered by setLaunch/
+    // setRuntimeCallbacks. Only these two cbids -- not setGraphCallbacks(), which
+    // also subscribes capture cbids that emit uncorrelated events.
+    cuptiEnableCallback(1, subscriber, CUPTI_CB_DOMAIN_DRIVER_API,
+                        CUPTI_DRIVER_TRACE_CBID_cuGraphLaunch);
+    cuptiEnableCallback(1, subscriber, CUPTI_CB_DOMAIN_DRIVER_API,
+                        CUPTI_DRIVER_TRACE_CBID_cuGraphLaunch_ptsz);
+
     // Enable resource callbacks only if PC sampling is enabled
     if (pcSamplingEnabled) {
       proton::setResourceCallbacks(subscriber, /*enable=*/true);
@@ -305,6 +313,11 @@ class CuptiProfiler : public proton::Singleton<CuptiProfiler> {
     if (subscriber) {
       proton::setRuntimeCallbacks(subscriber, /*enable=*/false);
       proton::setLaunchCallbacks(subscriber, /*enable=*/false);
+      // Mirror the graph-launch callbacks enabled in init().
+      cuptiEnableCallback(0, subscriber, CUPTI_CB_DOMAIN_DRIVER_API,
+                          CUPTI_DRIVER_TRACE_CBID_cuGraphLaunch);
+      cuptiEnableCallback(0, subscriber, CUPTI_CB_DOMAIN_DRIVER_API,
+                          CUPTI_DRIVER_TRACE_CBID_cuGraphLaunch_ptsz);
       if (pcSamplingEnabled) {
         proton::setResourceCallbacks(subscriber, /*enable=*/false);
       }
diff --git a/test/graph-repro-real.sh b/test/graph-repro-real.sh
new file mode 100755
index 0000000..84c8ae3
--- /dev/null
+++ b/test/graph-repro-real.sh
@@ -0,0 +1,55 @@
+#!/usr/bin/env bash
+# graph-repro-real.sh — real-GPU guard: runs graph_repro under the shim and
+# checks the cuda_correlation USDT fires for driver cuGraphLaunch (cbid -514).
+# Zero => FAIL (cbid not subscribed). Needs an NVIDIA GPU, nvcc, bpftrace, root.
+set -euo pipefail
+
+ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+LIB="$ROOT/build-local/lib/libparcagpucupti.so"
+SECS="${1:-15}"
+BIN="$(mktemp -d)/graph_repro"
+COUNT_LOG="$(mktemp)"
+
+[ -f "$LIB" ] || { echo "error: $LIB not found — run 'make local' first" >&2; exit 1; }
+command -v nvcc >/dev/null || { echo "error: nvcc not found" >&2; exit 1; }
+command -v bpftrace >/dev/null || { echo "error: bpftrace not found" >&2; exit 1; }
+
+echo "=== Building reproducer ==="
+nvcc -O2 -o "$BIN" "$ROOT/test/graph_repro.cu" -lcuda
+
+echo "=== Launching reproducer under parcagpu shim ==="
+LD_PRELOAD="$LIB" PARCAGPU_DEBUG=1 "$BIN" "$SECS" 2>"$(mktemp)" &
+APP_PID=$!
+
+# Wait for the shim to be mapped.
+for _ in $(seq 1 50); do
+  grep -q libparcagpucupti "/proc/$APP_PID/maps" 2>/dev/null && break
+  sleep 0.1
+done
+
+echo "=== Attaching bpftrace to cuda_correlation USDT (driver cbid 514 => -514) ==="
+# @driver: driver cuGraphLaunch (path under test); @runtime: positive cbid
+bpftrace -p "$APP_PID" \
+  -e "usdt:$LIB:parcagpu:cuda_correlation /arg1 == -514 || arg1 == -515/ { @driver = count(); }
+      usdt:$LIB:parcagpu:cuda_correlation /arg1 >= 0/ { @runtime = count(); }" \
+  >"$COUNT_LOG" 2>/dev/null &
+BT_PID=$!
+
+wait "$APP_PID" || true
+sleep 1
+kill "$BT_PID" 2>/dev/null || true
+wait "$BT_PID" 2>/dev/null || true
+
+echo "=== bpftrace results ==="
+cat "$COUNT_LOG"
+
+DRIVER=$(grep -oE "@driver: [0-9]+" "$COUNT_LOG" | grep -oE "[0-9]+" || echo 0)
+echo
+if [ "${DRIVER:-0}" -gt 0 ]; then
+  echo "PASS: driver cuGraphLaunch produced $DRIVER correlation events"
+  exit 0
+else
+  echo "FAIL: zero driver cuGraphLaunch correlation events — driver graph-launch" >&2
+  echo "      cbid is not subscribed (CuptiProfiler::initialize)." >&2
+  exit 1
+fi
diff --git a/test/graph_repro.cu b/test/graph_repro.cu
new file mode 100644
index 0000000..99cf2ac
--- /dev/null
+++ b/test/graph_repro.cu
@@ -0,0 +1,94 @@
+// graph_repro.cu — replays a captured graph via the DRIVER cuGraphLaunch (the
+// path parcagpu missed). Build: nvcc -o graph_repro test/graph_repro.cu -lcuda
+// Run: ./graph_repro [seconds]. See test/graph-repro-real.sh for the guard.
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <cstdio>
+#include <cstdlib>
+#include <ctime>
+
+#define CUDA_CHECK(x)                                                          \
+  do {                                                                         \
+    cudaError_t e = (x);                                                       \
+    if (e != cudaSuccess) {                                                    \
+      fprintf(stderr, "cuda error %s at %s:%d\n", cudaGetErrorString(e),       \
+              __FILE__, __LINE__);                                             \
+      exit(1);                                                                 \
+    }                                                                          \
+  } while (0)
+
+#define CU_CHECK(x)                                                            \
+  do {                                                                         \
+    CUresult e = (x);                                                          \
+    if (e != CUDA_SUCCESS) {                                                   \
+      const char *s = nullptr;                                                 \
+      cuGetErrorString(e, &s);                                                 \
+      fprintf(stderr, "driver error %s at %s:%d\n", s ? s : "?", __FILE__,     \
+              __LINE__);                                                       \
+      exit(1);                                                                 \
+    }                                                                          \
+  } while (0)
+
+__global__ void add_one(float *x, int n) {
+  int i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < n)
+    x[i] += 1.0f;
+}
+
+int main(int argc, char **argv) {
+  int seconds = (argc > 1) ? atoi(argv[1]) : 20;
+
+  // Force driver + runtime init.
+  CUDA_CHECK(cudaFree(0));
+  CU_CHECK(cuInit(0));
+
+  const int n = 1 << 16;
+  float *d = nullptr;
+  CUDA_CHECK(cudaMalloc(&d, n * sizeof(float)));
+
+  cudaStream_t stream;
+  CUDA_CHECK(cudaStreamCreate(&stream));
+
+  // Build a graph by capturing a few kernel launches into the stream.
+  CUDA_CHECK(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal));
+  for (int k = 0; k < 8; k++) {
+    add_one<<<(n + 255) / 256, 256, 0, stream>>>(d, n);
+  }
+  cudaGraph_t graph;
+  CUDA_CHECK(cudaStreamEndCapture(stream, &graph));
+
+  cudaGraphExec_t exec;
+  CUDA_CHECK(cudaGraphInstantiate(&exec, graph, nullptr, nullptr, 0));
+
+  // cudaGraphExec_t is the same handle as CUgraphExec; replay via driver API.
+  CUgraphExec drvExec = reinterpret_cast<CUgraphExec>(exec);
+  CUstream drvStream = reinterpret_cast<CUstream>(stream);
+
+  fprintf(stderr,
+          "graph_repro: replaying graph via driver cuGraphLaunch for %ds\n",
+          seconds);
+
+  struct timespec start;
+  clock_gettime(CLOCK_MONOTONIC, &start);
+  unsigned long long launches = 0;
+  for (;;) {
+    CU_CHECK(cuGraphLaunch(drvExec, drvStream));
+    launches++;
+    if ((launches & 0x3ff) == 0) {
+      CUDA_CHECK(cudaStreamSynchronize(stream));
+      struct timespec now;
+      clock_gettime(CLOCK_MONOTONIC, &now);
+      if (now.tv_sec - start.tv_sec >= seconds)
+        break;
+    }
+  }
+  CUDA_CHECK(cudaStreamSynchronize(stream));
+  fprintf(stderr, "graph_repro: done, %llu driver graph launches\n", launches);
+
+  cudaGraphExecDestroy(exec);
+  cudaGraphDestroy(graph);
+  cudaStreamDestroy(stream);
+  cudaFree(d);
+  return 0;
+}
diff --git a/test/mock_cupti.c b/test/mock_cupti.c
index 6862ba1..ab3b3f4 100644
--- a/test/mock_cupti.c
+++ b/test/mock_cupti.c
@@ -45,6 +45,24 @@ CUptiResult cuptiSubscribe(CUpti_SubscriberHandle *subscriber,
     return CUPTI_SUCCESS;
 }
 
+// Subscribed (domain, cbid) pairs, so the harness can mirror real CUPTI dispatch.
+#define MOCK_MAX_ENABLED 512
+static struct {
+    CUpti_CallbackDomain domain;
+    CUpti_CallbackId cbid;
+} __enabled_cbs[MOCK_MAX_ENABLED];
+static size_t __enabled_cb_count = 0;
+
+int __mock_cupti_is_callback_enabled(CUpti_CallbackDomain domain,
+                                     CUpti_CallbackId cbid) {
+    for (size_t i = 0; i < __enabled_cb_count; i++) {
+        if (__enabled_cbs[i].domain == domain && __enabled_cbs[i].cbid == cbid) {
+            return 1;
+        }
+    }
+    return 0;
+}
+
 CUptiResult cuptiEnableCallback(uint32_t enable,
                                 CUpti_SubscriberHandle subscriber,
                                 CUpti_CallbackDomain domain,
@@ -52,6 +70,22 @@ CUptiResult cuptiEnableCallback(uint32_t enable,
     (void)subscriber;  // Mark as intentionally unused
     fprintf(stderr, "[MOCK_CUPTI] cuptiEnableCallback(enable=%u, domain=%u, cbid=%u)\n",
             enable, domain, cbid);
+    if (enable) {
+        if (!__mock_cupti_is_callback_enabled(domain, cbid) &&
+            __enabled_cb_count < MOCK_MAX_ENABLED) {
+            __enabled_cbs[__enabled_cb_count].domain = domain;
+            __enabled_cbs[__enabled_cb_count].cbid = cbid;
+            __enabled_cb_count++;
+        }
+    } else {
+        for (size_t i = 0; i < __enabled_cb_count; i++) {
+            if (__enabled_cbs[i].domain == domain &&
+                __enabled_cbs[i].cbid == cbid) {
+                __enabled_cbs[i] = __enabled_cbs[--__enabled_cb_count];
+                break;
+            }
+        }
+    }
     return CUPTI_SUCCESS;
 }
 
diff --git a/test/test-pc-mock-graph.sh b/test/test-pc-mock-graph.sh
new file mode 100755
index 0000000..5c9820e
--- /dev/null
+++ b/test/test-pc-mock-graph.sh
@@ -0,0 +1,119 @@
+#!/bin/bash
+# Graph-launch variant of test-pc-mock.sh: drives test_cupti_prof with
+# --graph-rate so half the launches are CUDA graph launches (one correlation
+# ID fanning out into many kernel activity records with a nonzero graphId).
+# Verifies graph kernels are profiled (emitted) rather than filtered, i.e. the
+# PC-sampling rewrite did not break graph-launch profiling.
+#
+# Prereqs: make local bpf-test ; run with: sudo -E test/test-pc-mock-graph.sh
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
+
+LIB="$ROOT/build-local/lib/libparcagpucupti.so"
+TEST_BIN="$ROOT/build-local/bin/test_cupti_prof"
+BPF="$ROOT/test/bpf/activity_parser"
+CUBIN="$ROOT/test/pc_sample_toy.cubin"
+BPF_LOG="/tmp/parcagpu-graph-bpf.log"
+TEST_LOG="/tmp/parcagpu-graph-test.log"
+
+for f in "$LIB" "$TEST_BIN" "$BPF" "$CUBIN"; do
+  [ -e "$f" ] || { echo "error: $f not found" >&2; exit 1; }
+done
+
+cleanup() {
+  [ -n "${TEST_PID:-}" ] && kill "$TEST_PID" 2>/dev/null || true
+  [ -n "${BPF_PID:-}" ] && kill "$BPF_PID" 2>/dev/null || true
+  [ -n "${SEM_PID:-}" ] && kill "$SEM_PID" 2>/dev/null || true
+  wait 2>/dev/null || true
+}
+trap cleanup EXIT
+
+echo "=== Starting test_cupti_prof (mock, graph-rate=25 of launch-rate=50) ==="
+LD_LIBRARY_PATH="$ROOT/build-local/lib:${LD_LIBRARY_PATH:-}" \
+  PARCAGPU_DEBUG=1 \
+  PARCAGPU_PC_SAMPLING_RATE=10000 \
+  MOCK_CUBIN_PATH="$CUBIN" \
+  "$TEST_BIN" "$LIB" --launch-rate=50 --graph-rate=25 --duration=12 > "$TEST_LOG" 2>&1 &
+TEST_PID=$!
+echo "test_cupti_prof PID: $TEST_PID"
+
+while kill -0 "$TEST_PID" 2>/dev/null &&
+      ! grep -q libparcagpucupti "/proc/$TEST_PID/maps" 2>/dev/null; do
+  sleep 0.1
+done
+if ! kill -0 "$TEST_PID" 2>/dev/null; then
+  echo "error: test_cupti_prof exited before library loaded" >&2
+  cat "$TEST_LOG" >&2; exit 1
+fi
+
+# The activity_parser does NOT attach the cuda_correlation probe, but
+# parcagpu's allocBuffer + callback path are gated on its USDT semaphore
+# (PARCAGPU_CUDA_CORRELATION_ENABLED). In production parca-agent attaches it;
+# here we bump that semaphore with bpftrace so the activity path is live.
+echo "=== Bumping cuda_correlation semaphore via bpftrace ==="
+bpftrace -p "$TEST_PID" -e "usdt:$LIB:parcagpu:cuda_correlation { @n = count(); }
+  usdt:$LIB:parcagpu:cuda_correlation /(int32)arg1 == -514 || (int32)arg1 == -515/ { @driver_graph = count(); }" \
+  > /tmp/parcagpu-graph-sem.log 2>&1 &
+SEM_PID=$!
+sleep 2  # let bpftrace attach + set the semaphore before launches ramp
+
+echo "=== Starting BPF activity parser ==="
+"$BPF" -pid "$TEST_PID" -lib "$LIB" > "$BPF_LOG" 2>&1 &
+BPF_PID=$!
+echo "activity_parser PID: $BPF_PID"
+
+wait "$TEST_PID" 2>/dev/null || true
+TEST_PID=""
+sleep 2
+kill "$BPF_PID" 2>/dev/null || true
+wait "$BPF_PID" 2>/dev/null || true
+BPF_PID=""
+kill "$SEM_PID" 2>/dev/null || true
+wait "$SEM_PID" 2>/dev/null || true
+SEM_PID=""
+echo "=== bpftrace cuda_correlation hit count ==="
+grep -E "@n:" /tmp/parcagpu-graph-sem.log || cat /tmp/parcagpu-graph-sem.log | tail -3 || true
+
+echo
+echo "=== graph map inserts (parcagpu debug, first 5) ==="
+grep "into graph map" "$TEST_LOG" | head -5 || true
+echo "=== graph map insert count ==="
+grep -c "into graph map" "$TEST_LOG" || true
+echo "=== filtered graph activities (graphId nonzero) count ==="
+grep "Filtered kernel activity" "$TEST_LOG" | grep -vE "graphId=0\b" | grep -cE "graphId=[1-9]" || true
+echo
+echo "=== BPF: sample graph kernel events (graph= nonzero) ==="
+grep -E "^kernel: " "$BPF_LOG" | grep -vE "graph=0\b" | grep -E "graph=[1-9]" | head -8 || true
+echo "=== BPF: graph kernel-event count (graph= nonzero) ==="
+grep -E "^kernel: " "$BPF_LOG" | grep -vE "graph=0\b" | grep -cE "graph=[1-9]" || true
+echo "=== BPF: regular kernel-event count (graph=0) ==="
+grep -cE "^kernel: .*graph=0\b" "$BPF_LOG" || true
+echo "=== BPF summary line ==="
+grep -E "kernels_found|events_received" "$BPF_LOG" | tail -2 || true
+echo
+
+# --- Checks ---
+PASS=true
+check() {
+  if eval "$2"; then echo "PASS: $1"; else echo "FAIL: $1" >&2; PASS=false; fi
+}
+GRAPH_INSERTS=$(grep -c "into graph map" "$TEST_LOG" || true)
+# Authoritative signal: graph kernel activities that PASSED the correlation
+# filter and were emitted (PARCAGPU_KERNEL_EXECUTED + activity_batch). The BPF
+# per-kernel "kernel:" line is verbosity-gated (-v), so we count emits on the
+# parcagpu side instead, and confirm the consumer received events.
+GRAPH_EMITTED=$(grep "Kernel activity:" "$TEST_LOG" | grep -cE "graphId=[1-9]" || true)
+BPF_RECEIVED=$(grep -oE "events_received=[0-9]+" "$BPF_LOG" | grep -oE "[0-9]+" | sort -n | tail -1 || echo 0)
+# Driver cuGraphLaunch correlation events (signed cbid -514/-515). Zero means the
+# driver graph-launch cbid wasn't subscribed; the runtime half emits regardless,
+# so GRAPH_EMITTED>0 alone wouldn't catch it.
+DRIVER_GRAPH=$(grep -oE "@driver_graph: [0-9]+" /tmp/parcagpu-graph-sem.log | grep -oE "[0-9]+" || echo 0)
+
+check "graph launches inserted into graph map"         "[ \"${GRAPH_INSERTS:-0}\" -gt 0 ]"
+check "graph kernel activities emitted (not filtered)"  "[ \"${GRAPH_EMITTED:-0}\" -gt 0 ]"
+check "BPF consumer received kernel events"             "[ \"${BPF_RECEIVED:-0}\" -gt 0 ]"
+check "driver cuGraphLaunch correlated (cbid subscribed)" "[ \"${DRIVER_GRAPH:-0}\" -gt 0 ]"
+
+if $PASS; then echo; echo "=== GRAPH PROFILING VERIFIED ==="; else echo; echo "=== GRAPH PROFILING FAILED ===" >&2; exit 1; fi
diff --git a/test/test_cupti_prof.c b/test/test_cupti_prof.c
index 9d08a3b..c1a6589 100644
--- a/test/test_cupti_prof.c
+++ b/test/test_cupti_prof.c
@@ -294,6 +294,20 @@ static void (*parcagpuCuptiCallback)(void *userdata, CUpti_CallbackDomain domain
 // Defined in mock_cupti.c — feeds correlation IDs to cuptiPCSamplingGetData.
 extern void __mock_pc_enqueue_correlation(uint32_t correlation_id);
 
+// Defined in mock_cupti.c — whether a (domain, cbid) was subscribed.
+extern int __mock_cupti_is_callback_enabled(CUpti_CallbackDomain domain,
+                                            CUpti_CallbackId cbid);
+
+// Dispatch to the shim only if subscribed, mirroring real CUPTI gating.
+static void fire_cupti_callback(CUpti_CallbackDomain domain,
+                                CUpti_CallbackId cbid,
+                                const CUpti_CallbackData *cbdata) {
+    if (!__mock_cupti_is_callback_enabled(domain, cbid)) {
+        return;
+    }
+    parcagpuCuptiCallback(NULL, domain, cbid, cbdata);
+}
+
 //=============================================================================
 // Launched Kernels Queue
 // Track which correlation IDs have had their callbacks executed
@@ -385,12 +399,12 @@ NOINLINE CUresult cuLaunchKernel(CUfunction f, unsigned int gridDimX, unsigned i
     // DRIVER ENTER callback
     tls_cbdata.callbackSite = CUPTI_API_ENTER;
     tls_cbdata.correlationId = tls_correlation_id;
-    parcagpuCuptiCallback(NULL, CUPTI_CB_DOMAIN_DRIVER_API,
+    fire_cupti_callback(CUPTI_CB_DOMAIN_DRIVER_API,
                          CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel, &tls_cbdata);
 
     // DRIVER EXIT callback
     tls_cbdata.callbackSite = CUPTI_API_EXIT;
-    parcagpuCuptiCallback(NULL, CUPTI_CB_DOMAIN_DRIVER_API,
+    fire_cupti_callback(CUPTI_CB_DOMAIN_DRIVER_API,
                          CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel, &tls_cbdata);
 
     return CUDA_SUCCESS;
@@ -403,12 +417,12 @@ NOINLINE CUresult cuGraphLaunch(CUgraphExec hGraphExec, CUstream hStream) {
     // DRIVER ENTER callback
     tls_cbdata.callbackSite = CUPTI_API_ENTER;
     tls_cbdata.correlationId = tls_correlation_id;
-    parcagpuCuptiCallback(NULL, CUPTI_CB_DOMAIN_DRIVER_API,
+    fire_cupti_callback(CUPTI_CB_DOMAIN_DRIVER_API,
                          CUPTI_DRIVER_TRACE_CBID_cuGraphLaunch, &tls_cbdata);
 
     // DRIVER EXIT callback
     tls_cbdata.callbackSite = CUPTI_API_EXIT;
-    parcagpuCuptiCallback(NULL, CUPTI_CB_DOMAIN_DRIVER_API,
+    fire_cupti_callback(CUPTI_CB_DOMAIN_DRIVER_API,
                          CUPTI_DRIVER_TRACE_CBID_cuGraphLaunch, &tls_cbdata);
 
     // Enqueue for activity generation
@@ -427,7 +441,7 @@ NOINLINE cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 block
     // RUNTIME ENTER callback
     tls_cbdata.callbackSite = CUPTI_API_ENTER;
     tls_cbdata.correlationId = tls_correlation_id;
-    parcagpuCuptiCallback(NULL, CUPTI_CB_DOMAIN_RUNTIME_API,
+    fire_cupti_callback(CUPTI_CB_DOMAIN_RUNTIME_API,
                          CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_v7000, &tls_cbdata);
 
     // Runtime internally calls driver - call through cuLaunchKernel so it appears in stack
@@ -435,7 +449,7 @@ NOINLINE cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 block
 
     // RUNTIME EXIT callback
     tls_cbdata.callbackSite = CUPTI_API_EXIT;
-    parcagpuCuptiCallback(NULL, CUPTI_CB_DOMAIN_RUNTIME_API,
+    fire_cupti_callback(CUPTI_CB_DOMAIN_RUNTIME_API,
                          CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_v7000, &tls_cbdata);
 
     return cudaSuccess;
@@ -448,22 +462,22 @@ NOINLINE cudaError_t cudaGraphLaunch(cudaGraphExec_t graphExec, cudaStream_t str
     // RUNTIME ENTER callback
     tls_cbdata.callbackSite = CUPTI_API_ENTER;
     tls_cbdata.correlationId = tls_correlation_id;
-    parcagpuCuptiCallback(NULL, CUPTI_CB_DOMAIN_RUNTIME_API,
+    fire_cupti_callback(CUPTI_CB_DOMAIN_RUNTIME_API,
                          CUPTI_RUNTIME_TRACE_CBID_cudaGraphLaunch_v10000, &tls_cbdata);
 
     // Runtime internally calls driver - use inline driver callback with cuGraphLaunch cbid
     // (We don't call cuGraphLaunch here to avoid double-queueing)
     tls_cbdata.callbackSite = CUPTI_API_ENTER;
-    parcagpuCuptiCallback(NULL, CUPTI_CB_DOMAIN_DRIVER_API,
+    fire_cupti_callback(CUPTI_CB_DOMAIN_DRIVER_API,
                          CUPTI_DRIVER_TRACE_CBID_cuGraphLaunch, &tls_cbdata);
 
     tls_cbdata.callbackSite = CUPTI_API_EXIT;
-    parcagpuCuptiCallback(NULL, CUPTI_CB_DOMAIN_DRIVER_API,
+    fire_cupti_callback(CUPTI_CB_DOMAIN_DRIVER_API,
                          CUPTI_DRIVER_TRACE_CBID_cuGraphLaunch, &tls_cbdata);
 
     // RUNTIME EXIT callback
     tls_cbdata.callbackSite = CUPTI_API_EXIT;
-    parcagpuCuptiCallback(NULL, CUPTI_CB_DOMAIN_RUNTIME_API,
+    fire_cupti_callback(CUPTI_CB_DOMAIN_RUNTIME_API,
                          CUPTI_RUNTIME_TRACE_CBID_cudaGraphLaunch_v10000, &tls_cbdata);
 
     // Enqueue for activity generation

From f0fbd49245b7c68c6efe413124e8deafc4a4d471 Mon Sep 17 00:00:00 2001
From: Tommy Reilly <gnurizen@gmail.com>
Date: Fri, 26 Jun 2026 09:45:32 -0500
Subject: [PATCH 2/2] More fixes/tests from spark testing

---
 Makefile                   |   9 +--
 src/cupti.cpp              |  18 +++---
 src/env_config.cpp         |  11 ++++
 src/pc_sampling.cpp        |  39 ++++++++++--
 src/pc_sampling.h          |   1 +
 test/test-pc-mock-graph.sh | 119 -------------------------------------
 test/test-pc-mock.sh       |  62 ++++++++++++++++++-
 7 files changed, 118 insertions(+), 141 deletions(-)
 delete mode 100755 test/test-pc-mock-graph.sh

diff --git a/Makefile b/Makefile
index ffe56ab..22a2db3 100644
--- a/Makefile
+++ b/Makefile
@@ -1,4 +1,4 @@
-.PHONY: all clean test build-amd64 build-arm64 build-all cross docker-push docker-test-build docker-test-run format local debug generate bpf-test microbenchmarks test-multi test-pc-real test-pc-mock test-pc-mock-graph
+.PHONY: all clean test build-amd64 build-arm64 build-all cross docker-push docker-test-build docker-test-run format local debug generate bpf-test microbenchmarks test-multi test-pc-real test-pc-mock
 
 LIB_NAME = libparcagpucupti.so
 
@@ -159,14 +159,11 @@ test-multi: local bpf-test
 test-pc-real: local bpf-test microbenchmarks
 	sudo -E test/test-pc-real.sh
 
-# Mock PC sampling test — no GPU required, uses mock CUPTI/CUDA.
+# Mock PC sampling test — no GPU required, uses mock CUPTI/CUDA. Also covers
+# graph launches (guards driver cuGraphLaunch CBID subscription).
 test-pc-mock: local bpf-test
 	sudo -E test/test-pc-mock.sh
 
-# Mock graph-launch test — guards driver cuGraphLaunch CBID subscription. No GPU.
-test-pc-mock-graph: local bpf-test
-	sudo -E test/test-pc-mock-graph.sh
-
 format:
 	@echo "=== Formatting source files ==="
 	@clang-format -i -style=file src/*.cpp src/*.h test/*.c
diff --git a/src/cupti.cpp b/src/cupti.cpp
index 234158e..2e4121f 100644
--- a/src/cupti.cpp
+++ b/src/cupti.cpp
@@ -263,10 +263,10 @@ class CuptiProfiler : public proton::Singleton<CuptiProfiler> {
     // Driver-API graph launches (cuGraphLaunch/_ptsz); not covered by setLaunch/
     // setRuntimeCallbacks. Only these two cbids -- not setGraphCallbacks(), which
     // also subscribes capture cbids that emit uncorrelated events.
-    cuptiEnableCallback(1, subscriber, CUPTI_CB_DOMAIN_DRIVER_API,
-                        CUPTI_DRIVER_TRACE_CBID_cuGraphLaunch);
-    cuptiEnableCallback(1, subscriber, CUPTI_CB_DOMAIN_DRIVER_API,
-                        CUPTI_DRIVER_TRACE_CBID_cuGraphLaunch_ptsz);
+    proton::cupti::enableCallback<true>(1, subscriber, CUPTI_CB_DOMAIN_DRIVER_API,
+                                        CUPTI_DRIVER_TRACE_CBID_cuGraphLaunch);
+    proton::cupti::enableCallback<true>(1, subscriber, CUPTI_CB_DOMAIN_DRIVER_API,
+                                        CUPTI_DRIVER_TRACE_CBID_cuGraphLaunch_ptsz);
 
     // Enable resource callbacks only if PC sampling is enabled
     if (pcSamplingEnabled) {
@@ -314,10 +314,12 @@ class CuptiProfiler : public proton::Singleton<CuptiProfiler> {
       proton::setRuntimeCallbacks(subscriber, /*enable=*/false);
       proton::setLaunchCallbacks(subscriber, /*enable=*/false);
       // Mirror the graph-launch callbacks enabled in init().
-      cuptiEnableCallback(0, subscriber, CUPTI_CB_DOMAIN_DRIVER_API,
-                          CUPTI_DRIVER_TRACE_CBID_cuGraphLaunch);
-      cuptiEnableCallback(0, subscriber, CUPTI_CB_DOMAIN_DRIVER_API,
-                          CUPTI_DRIVER_TRACE_CBID_cuGraphLaunch_ptsz);
+      proton::cupti::enableCallback<false>(0, subscriber,
+                                           CUPTI_CB_DOMAIN_DRIVER_API,
+                                           CUPTI_DRIVER_TRACE_CBID_cuGraphLaunch);
+      proton::cupti::enableCallback<false>(
+          0, subscriber, CUPTI_CB_DOMAIN_DRIVER_API,
+          CUPTI_DRIVER_TRACE_CBID_cuGraphLaunch_ptsz);
       if (pcSamplingEnabled) {
         proton::setResourceCallbacks(subscriber, /*enable=*/false);
       }
diff --git a/src/env_config.cpp b/src/env_config.cpp
index 93ca87b..acc9af7 100644
--- a/src/env_config.cpp
+++ b/src/env_config.cpp
@@ -18,6 +18,7 @@ static const char *knownVars[] = {
     "PARCAGPU_RATE_LIMIT",
     "PARCAGPU_SAMPLING_FACTOR",
     "PARCAGPU_PC_SAMPLING_RATE",
+    "PARCAGPU_PC_HW_BUFFER_MB",
 };
 static constexpr size_t numKnownVars =
     sizeof(knownVars) / sizeof(knownVars[0]);
@@ -88,6 +89,16 @@ void validateEnvVars() {
                 "env_config: PARCAGPU_PC_SAMPLING_RATE invalid");
     }
   }
+
+  val = std::getenv("PARCAGPU_PC_HW_BUFFER_MB");
+  if (val) {
+    int mb = std::atoi(val);
+    if (mb < 0) {
+      DEBUG_PRINTF("[PARCAGPU] Warning: PARCAGPU_PC_HW_BUFFER_MB=%s invalid "
+                   "(must be >= 0), ignoring\n", val);
+      fireError(0, val, "env_config: PARCAGPU_PC_HW_BUFFER_MB invalid");
+    }
+  }
 }
 
 } // namespace parcagpu
diff --git a/src/pc_sampling.cpp b/src/pc_sampling.cpp
index cf37b18..85485a6 100644
--- a/src/pc_sampling.cpp
+++ b/src/pc_sampling.cpp
@@ -347,12 +347,23 @@ CUpti_PCSamplingConfigurationInfo ConfigureData::configureScratchBuffer() {
   return scratchBufferInfo;
 }
 
+// MB to force for the PC-sampling hardware buffer, from PARCAGPU_PC_HW_BUFFER_MB.
+// 0 (unset/invalid/negative) means "leave at CUPTI default" — see the note in
+// ConfigureData::initialize() for why we don't force it by default.
+int pcHardwareBufferMB() {
+  const char *env = getenv("PARCAGPU_PC_HW_BUFFER_MB");
+  if (!env)
+    return 0;
+  int mb = atoi(env);
+  return mb > 0 ? mb : 0;
+}
+
 CUpti_PCSamplingConfigurationInfo ConfigureData::configureHardwareBufferSize() {
   CUpti_PCSamplingConfigurationInfo hardwareBufferInfo{};
   hardwareBufferInfo.attributeType =
       CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_HARDWARE_BUFFER_SIZE;
   hardwareBufferInfo.attributeData.hardwareBufferSizeData.hardwareBufferSize =
-      HardwareBufferSize;
+      (size_t)hardwareBufferMB * 1024 * 1024;
   return hardwareBufferInfo;
 }
 
@@ -381,15 +392,33 @@ void ConfigureData::initialize(CUcontext context) {
   DEBUG_PRINTF("Initializing PC sampling for context %p (id %u)\n", context,
                contextId);
 
+  hardwareBufferMB = pcHardwareBufferMB();
+
   configurationInfos.emplace_back(configureStallReasons());
   configurationInfos.emplace_back(configureCollectionMode());
   configurationInfos.emplace_back(configureStartStopControl());
   configurationInfos.emplace_back(configureSamplingBuffer());
-  // Bigger scratch + hardware buffers so a busy workload (PyTorch-class)
-  // doesn't overflow CUPTI's defaults within minutes and start returning
-  // CUPTI_ERROR_OUT_OF_MEMORY from cuptiPCSamplingGetData.
+  // Bigger scratch buffer so a busy workload (PyTorch-class) doesn't overflow
+  // CUPTI's default within minutes and start returning CUPTI_ERROR_OUT_OF_MEMORY
+  // from cuptiPCSamplingGetData.
   configurationInfos.emplace_back(configureScratchBuffer());
-  configurationInfos.emplace_back(configureHardwareBufferSize());
+  // By default do NOT force the hardware buffer size. On Blackwell (sm_120/
+  // sm_121, e.g. GB10) under live injection + activity recording, setting
+  // CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_HARDWARE_BUFFER_SIZE makes every
+  // cuptiPCSamplingGetData return CUPTI_ERROR_OUT_OF_MEMORY (0 PCs drained) —
+  // confirmed independent of the value (even 16 MB fails) and not reproducible
+  // in a standalone CUPTI sample with the identical attribute set. CUPTI's
+  // default hardware buffer works; if it overflows it reports droppedSamples
+  // rather than wedging GetData.
+  //
+  // Escape hatch: set PARCAGPU_PC_HW_BUFFER_MB=<n> to force an n-MB hardware
+  // buffer (0 = leave at CUPTI default, same as unset). Lets ops re-enable /
+  // tune it on GPUs where the larger buffer helps busy workloads.
+  if (hardwareBufferMB > 0) {
+    DEBUG_PRINTF("[PARCAGPU] Forcing PC hardware buffer = %d MB "
+                 "(PARCAGPU_PC_HW_BUFFER_MB)\n", hardwareBufferMB);
+    configurationInfos.emplace_back(configureHardwareBufferSize());
+  }
   // Don't set sampling period — let CUPTI use its default.
   // Explicit period values silently break sampling on some GPUs (e.g.
   // Blackwell).
diff --git a/src/pc_sampling.h b/src/pc_sampling.h
index dbb98c3..9dba0e7 100644
--- a/src/pc_sampling.h
+++ b/src/pc_sampling.h
@@ -87,6 +87,7 @@ struct ConfigureData {
   uint32_t contextId;
   uint32_t numStallReasons{};
   uint32_t numValidStallReasons{};
+  int hardwareBufferMB{}; // PARCAGPU_PC_HW_BUFFER_MB; 0 = CUPTI default
   char **stallReasonNames{};
   uint32_t *stallReasonIndices{};
   std::map<size_t, size_t> stallReasonIndexToMetricIndex{};
diff --git a/test/test-pc-mock-graph.sh b/test/test-pc-mock-graph.sh
deleted file mode 100755
index 5c9820e..0000000
--- a/test/test-pc-mock-graph.sh
+++ /dev/null
@@ -1,119 +0,0 @@
-#!/bin/bash
-# Graph-launch variant of test-pc-mock.sh: drives test_cupti_prof with
-# --graph-rate so half the launches are CUDA graph launches (one correlation
-# ID fanning out into many kernel activity records with a nonzero graphId).
-# Verifies graph kernels are profiled (emitted) rather than filtered, i.e. the
-# PC-sampling rewrite did not break graph-launch profiling.
-#
-# Prereqs: make local bpf-test ; run with: sudo -E test/test-pc-mock-graph.sh
-set -euo pipefail
-
-SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
-ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
-
-LIB="$ROOT/build-local/lib/libparcagpucupti.so"
-TEST_BIN="$ROOT/build-local/bin/test_cupti_prof"
-BPF="$ROOT/test/bpf/activity_parser"
-CUBIN="$ROOT/test/pc_sample_toy.cubin"
-BPF_LOG="/tmp/parcagpu-graph-bpf.log"
-TEST_LOG="/tmp/parcagpu-graph-test.log"
-
-for f in "$LIB" "$TEST_BIN" "$BPF" "$CUBIN"; do
-  [ -e "$f" ] || { echo "error: $f not found" >&2; exit 1; }
-done
-
-cleanup() {
-  [ -n "${TEST_PID:-}" ] && kill "$TEST_PID" 2>/dev/null || true
-  [ -n "${BPF_PID:-}" ] && kill "$BPF_PID" 2>/dev/null || true
-  [ -n "${SEM_PID:-}" ] && kill "$SEM_PID" 2>/dev/null || true
-  wait 2>/dev/null || true
-}
-trap cleanup EXIT
-
-echo "=== Starting test_cupti_prof (mock, graph-rate=25 of launch-rate=50) ==="
-LD_LIBRARY_PATH="$ROOT/build-local/lib:${LD_LIBRARY_PATH:-}" \
-  PARCAGPU_DEBUG=1 \
-  PARCAGPU_PC_SAMPLING_RATE=10000 \
-  MOCK_CUBIN_PATH="$CUBIN" \
-  "$TEST_BIN" "$LIB" --launch-rate=50 --graph-rate=25 --duration=12 > "$TEST_LOG" 2>&1 &
-TEST_PID=$!
-echo "test_cupti_prof PID: $TEST_PID"
-
-while kill -0 "$TEST_PID" 2>/dev/null &&
-      ! grep -q libparcagpucupti "/proc/$TEST_PID/maps" 2>/dev/null; do
-  sleep 0.1
-done
-if ! kill -0 "$TEST_PID" 2>/dev/null; then
-  echo "error: test_cupti_prof exited before library loaded" >&2
-  cat "$TEST_LOG" >&2; exit 1
-fi
-
-# The activity_parser does NOT attach the cuda_correlation probe, but
-# parcagpu's allocBuffer + callback path are gated on its USDT semaphore
-# (PARCAGPU_CUDA_CORRELATION_ENABLED). In production parca-agent attaches it;
-# here we bump that semaphore with bpftrace so the activity path is live.
-echo "=== Bumping cuda_correlation semaphore via bpftrace ==="
-bpftrace -p "$TEST_PID" -e "usdt:$LIB:parcagpu:cuda_correlation { @n = count(); }
-  usdt:$LIB:parcagpu:cuda_correlation /(int32)arg1 == -514 || (int32)arg1 == -515/ { @driver_graph = count(); }" \
-  > /tmp/parcagpu-graph-sem.log 2>&1 &
-SEM_PID=$!
-sleep 2  # let bpftrace attach + set the semaphore before launches ramp
-
-echo "=== Starting BPF activity parser ==="
-"$BPF" -pid "$TEST_PID" -lib "$LIB" > "$BPF_LOG" 2>&1 &
-BPF_PID=$!
-echo "activity_parser PID: $BPF_PID"
-
-wait "$TEST_PID" 2>/dev/null || true
-TEST_PID=""
-sleep 2
-kill "$BPF_PID" 2>/dev/null || true
-wait "$BPF_PID" 2>/dev/null || true
-BPF_PID=""
-kill "$SEM_PID" 2>/dev/null || true
-wait "$SEM_PID" 2>/dev/null || true
-SEM_PID=""
-echo "=== bpftrace cuda_correlation hit count ==="
-grep -E "@n:" /tmp/parcagpu-graph-sem.log || cat /tmp/parcagpu-graph-sem.log | tail -3 || true
-
-echo
-echo "=== graph map inserts (parcagpu debug, first 5) ==="
-grep "into graph map" "$TEST_LOG" | head -5 || true
-echo "=== graph map insert count ==="
-grep -c "into graph map" "$TEST_LOG" || true
-echo "=== filtered graph activities (graphId nonzero) count ==="
-grep "Filtered kernel activity" "$TEST_LOG" | grep -vE "graphId=0\b" | grep -cE "graphId=[1-9]" || true
-echo
-echo "=== BPF: sample graph kernel events (graph= nonzero) ==="
-grep -E "^kernel: " "$BPF_LOG" | grep -vE "graph=0\b" | grep -E "graph=[1-9]" | head -8 || true
-echo "=== BPF: graph kernel-event count (graph= nonzero) ==="
-grep -E "^kernel: " "$BPF_LOG" | grep -vE "graph=0\b" | grep -cE "graph=[1-9]" || true
-echo "=== BPF: regular kernel-event count (graph=0) ==="
-grep -cE "^kernel: .*graph=0\b" "$BPF_LOG" || true
-echo "=== BPF summary line ==="
-grep -E "kernels_found|events_received" "$BPF_LOG" | tail -2 || true
-echo
-
-# --- Checks ---
-PASS=true
-check() {
-  if eval "$2"; then echo "PASS: $1"; else echo "FAIL: $1" >&2; PASS=false; fi
-}
-GRAPH_INSERTS=$(grep -c "into graph map" "$TEST_LOG" || true)
-# Authoritative signal: graph kernel activities that PASSED the correlation
-# filter and were emitted (PARCAGPU_KERNEL_EXECUTED + activity_batch). The BPF
-# per-kernel "kernel:" line is verbosity-gated (-v), so we count emits on the
-# parcagpu side instead, and confirm the consumer received events.
-GRAPH_EMITTED=$(grep "Kernel activity:" "$TEST_LOG" | grep -cE "graphId=[1-9]" || true)
-BPF_RECEIVED=$(grep -oE "events_received=[0-9]+" "$BPF_LOG" | grep -oE "[0-9]+" | sort -n | tail -1 || echo 0)
-# Driver cuGraphLaunch correlation events (signed cbid -514/-515). Zero means the
-# driver graph-launch cbid wasn't subscribed; the runtime half emits regardless,
-# so GRAPH_EMITTED>0 alone wouldn't catch it.
-DRIVER_GRAPH=$(grep -oE "@driver_graph: [0-9]+" /tmp/parcagpu-graph-sem.log | grep -oE "[0-9]+" || echo 0)
-
-check "graph launches inserted into graph map"         "[ \"${GRAPH_INSERTS:-0}\" -gt 0 ]"
-check "graph kernel activities emitted (not filtered)"  "[ \"${GRAPH_EMITTED:-0}\" -gt 0 ]"
-check "BPF consumer received kernel events"             "[ \"${BPF_RECEIVED:-0}\" -gt 0 ]"
-check "driver cuGraphLaunch correlated (cbid subscribed)" "[ \"${DRIVER_GRAPH:-0}\" -gt 0 ]"
-
-if $PASS; then echo; echo "=== GRAPH PROFILING VERIFIED ==="; else echo; echo "=== GRAPH PROFILING FAILED ===" >&2; exit 1; fi
diff --git a/test/test-pc-mock.sh b/test/test-pc-mock.sh
index 8d4b888..1e14bff 100755
--- a/test/test-pc-mock.sh
+++ b/test/test-pc-mock.sh
@@ -3,8 +3,15 @@
 # with BPF activity parser. Verifies stall reason map, PC samples, and
 # cubin loading WITHOUT requiring a real GPU.
 #
+# Also drives a mix of CUDA graph launches (--graph-rate): half the launches
+# fan one correlation ID out into many kernel activity records with a nonzero
+# graphId. We verify those graph kernels are profiled (emitted) rather than
+# filtered, and that the driver cuGraphLaunch cbid is subscribed — i.e. the
+# PC-sampling rewrite did not break graph-launch profiling.
+#
 # Prerequisites:
 #   make local bpf-test
+#   bpftrace (used to bump the cuda_correlation USDT semaphore; see below)
 #
 # Usage:
 #   sudo -E test/test-pc-mock.sh          # default
@@ -21,6 +28,7 @@ BPF="$ROOT/test/bpf/activity_parser"
 CUBIN="$ROOT/test/pc_sample_toy.cubin"
 BPF_LOG="/tmp/parcagpu-pc-mock-bpf.log"
 TEST_LOG="/tmp/parcagpu-pc-mock-test.log"
+SEM_LOG="/tmp/parcagpu-pc-mock-sem.log"
 VERBOSE=""
 
 for arg in "$@"; do
@@ -36,10 +44,12 @@ for f in "$LIB" "$TEST_BIN" "$BPF" "$CUBIN"; do
     exit 1
   fi
 done
+command -v bpftrace >/dev/null || { echo "error: bpftrace not found" >&2; exit 1; }
 
 cleanup() {
   [ -n "${TEST_PID:-}" ] && kill "$TEST_PID" 2>/dev/null || true
   [ -n "${BPF_PID:-}" ] && kill "$BPF_PID" 2>/dev/null || true
+  [ -n "${SEM_PID:-}" ] && kill "$SEM_PID" 2>/dev/null || true
   wait 2>/dev/null || true
 }
 trap cleanup EXIT
@@ -49,12 +59,14 @@ trap cleanup EXIT
 # instead of real libcupti.so / libcuda.so.
 # High target rate keeps the controller's probability near 1 throughout the
 # short mock run (otherwise it would converge below 1 once samples flow).
-echo "=== Starting test_cupti_prof (mock) ==="
+# --graph-rate=25 of --launch-rate=50: half the launches are CUDA graph
+# launches (one correlation ID fanning out into many kernel records).
+echo "=== Starting test_cupti_prof (mock, graph-rate=25 of launch-rate=50) ==="
 LD_LIBRARY_PATH="$ROOT/build-local/lib:${LD_LIBRARY_PATH:-}" \
   PARCAGPU_DEBUG=1 \
   PARCAGPU_PC_SAMPLING_RATE=10000 \
   MOCK_CUBIN_PATH="$CUBIN" \
-  "$TEST_BIN" "$LIB" --launch-rate=50 --duration=15 > "$TEST_LOG" 2>&1 &
+  "$TEST_BIN" "$LIB" --launch-rate=50 --graph-rate=25 --duration=15 > "$TEST_LOG" 2>&1 &
 TEST_PID=$!
 echo "test_cupti_prof PID: $TEST_PID"
 
@@ -70,6 +82,17 @@ if ! kill -0 "$TEST_PID" 2>/dev/null; then
   exit 1
 fi
 
+# The activity_parser does NOT attach the cuda_correlation probe, but
+# parcagpu's allocBuffer + callback path are gated on its USDT semaphore
+# (PARCAGPU_CUDA_CORRELATION_ENABLED). In production parca-agent attaches it;
+# here we bump that semaphore with bpftrace so the activity path is live.
+echo "=== Bumping cuda_correlation semaphore via bpftrace ==="
+bpftrace -p "$TEST_PID" -e "usdt:$LIB:parcagpu:cuda_correlation { @n = count(); }
+  usdt:$LIB:parcagpu:cuda_correlation /(int32)arg1 == -514 || (int32)arg1 == -515/ { @driver_graph = count(); }" \
+  > "$SEM_LOG" 2>&1 &
+SEM_PID=$!
+sleep 2  # let bpftrace attach + set the semaphore before launches ramp
+
 # --- Attach BPF parser ---
 echo "=== Starting BPF activity parser ==="
 "$BPF" -pid "$TEST_PID" -lib "$LIB" $VERBOSE > "$BPF_LOG" 2>&1 &
@@ -81,10 +104,13 @@ wait "$TEST_PID" 2>/dev/null || true
 TEST_PID=""
 sleep 2
 
-# --- Stop BPF parser ---
+# --- Stop BPF parser + semaphore bumper ---
 kill "$BPF_PID" 2>/dev/null || true
 wait "$BPF_PID" 2>/dev/null || true
 BPF_PID=""
+kill "$SEM_PID" 2>/dev/null || true
+wait "$SEM_PID" 2>/dev/null || true
+SEM_PID=""
 
 # --- Results ---
 echo
@@ -94,10 +120,16 @@ echo
 echo "=== BPF parser output ==="
 cat "$BPF_LOG"
 echo
+echo "=== bpftrace cuda_correlation hit count ==="
+grep -E "@n:" "$SEM_LOG" || tail -3 "$SEM_LOG" || true
+echo "=== graph map insert count (parcagpu debug) ==="
+grep -c "into graph map" "$TEST_LOG" || true
+echo
 
 # --- Checks ---
 PASS=true
 
+# grep-based check: PASS if PATTERN is present in FILE.
 check() {
   local label="$1" pattern="$2" file="$3"
   if grep -q "$pattern" "$file"; then
@@ -108,6 +140,12 @@ check() {
   fi
 }
 
+# expression-based check: PASS if the shell test EXPR succeeds.
+check_expr() {
+  if eval "$2"; then echo "PASS: $1"; else echo "FAIL: $1" >&2; PASS=false; fi
+}
+
+# PC sampling + cubin + source correlation (no GPU).
 check "PC sampling initialized" "PC sampling initialized" "$TEST_LOG"
 check "real cubin loaded (mock)" "Loaded cubin.*pc_sample_toy" "$TEST_LOG"
 check "modules loaded (parcagpu)" "Module 0x.*loaded" "$TEST_LOG"
@@ -119,6 +157,24 @@ check "PC samples contain stall reasons" "smsp__pcsamp" "$BPF_LOG"
 check "cubins loaded (bpf)" "\[CUBIN\].*loaded" "$BPF_LOG"
 check "PC sample events received" "pc_samples=[1-9]" "$BPF_LOG"
 
+# Graph-launch profiling.
+GRAPH_INSERTS=$(grep -c "into graph map" "$TEST_LOG" || true)
+# Authoritative signal: graph kernel activities that PASSED the correlation
+# filter and were emitted (PARCAGPU_KERNEL_EXECUTED + activity_batch). The BPF
+# per-kernel "kernel:" line is verbosity-gated (-v), so we count emits on the
+# parcagpu side instead, and confirm the consumer received events.
+GRAPH_EMITTED=$(grep "Kernel activity:" "$TEST_LOG" | grep -cE "graphId=[1-9]" || true)
+BPF_RECEIVED=$(grep -oE "events_received=[0-9]+" "$BPF_LOG" | grep -oE "[0-9]+" | sort -n | tail -1 || echo 0)
+# Driver cuGraphLaunch correlation events (signed cbid -514/-515). Zero means the
+# driver graph-launch cbid wasn't subscribed; the runtime half emits regardless,
+# so GRAPH_EMITTED>0 alone wouldn't catch it.
+DRIVER_GRAPH=$(grep -oE "@driver_graph: [0-9]+" "$SEM_LOG" | grep -oE "[0-9]+" || echo 0)
+
+check_expr "graph launches inserted into graph map"          "[ \"${GRAPH_INSERTS:-0}\" -gt 0 ]"
+check_expr "graph kernel activities emitted (not filtered)"  "[ \"${GRAPH_EMITTED:-0}\" -gt 0 ]"
+check_expr "BPF consumer received kernel events"             "[ \"${BPF_RECEIVED:-0}\" -gt 0 ]"
+check_expr "driver cuGraphLaunch correlated (cbid subscribed)" "[ \"${DRIVER_GRAPH:-0}\" -gt 0 ]"
+
 if $PASS; then
   echo
   echo "=== ALL CHECKS PASSED ==="