parca-dev · gnurizen · Jun 26, 2026 · Jun 26, 2026 · Jun 26, 2026
diff --git a/Makefile b/Makefile
@@ -159,7 +159,8 @@ test-multi: local bpf-test
 test-pc-real: local bpf-test microbenchmarks
 	sudo -E test/test-pc-real.sh
 
-# Mock PC sampling test — no GPU required, uses mock CUPTI/CUDA.
+# Mock PC sampling test — no GPU required, uses mock CUPTI/CUDA. Also covers
+# graph launches (guards driver cuGraphLaunch CBID subscription).
 test-pc-mock: local bpf-test
 	sudo -E test/test-pc-mock.sh
 

diff --git a/src/cupti.cpp b/src/cupti.cpp
@@ -260,6 +260,14 @@ class CuptiProfiler : public proton::Singleton<CuptiProfiler> {
     proton::setRuntimeCallbacks(subscriber, /*enable=*/true);
     proton::setLaunchCallbacks(subscriber, /*enable=*/true);
 
+    // Driver-API graph launches (cuGraphLaunch/_ptsz); not covered by setLaunch/
+    // setRuntimeCallbacks. Only these two cbids -- not setGraphCallbacks(), which
+    // also subscribes capture cbids that emit uncorrelated events.
+    proton::cupti::enableCallback<true>(1, subscriber, CUPTI_CB_DOMAIN_DRIVER_API,
+                                        CUPTI_DRIVER_TRACE_CBID_cuGraphLaunch);
+    proton::cupti::enableCallback<true>(1, subscriber, CUPTI_CB_DOMAIN_DRIVER_API,
+                                        CUPTI_DRIVER_TRACE_CBID_cuGraphLaunch_ptsz);
+
     // Enable resource callbacks only if PC sampling is enabled
     if (pcSamplingEnabled) {
       proton::setResourceCallbacks(subscriber, /*enable=*/true);
@@ -305,6 +313,13 @@ class CuptiProfiler : public proton::Singleton<CuptiProfiler> {
     if (subscriber) {
       proton::setRuntimeCallbacks(subscriber, /*enable=*/false);
       proton::setLaunchCallbacks(subscriber, /*enable=*/false);
+      // Mirror the graph-launch callbacks enabled in init().
+      proton::cupti::enableCallback<false>(0, subscriber,
+                                           CUPTI_CB_DOMAIN_DRIVER_API,
+                                           CUPTI_DRIVER_TRACE_CBID_cuGraphLaunch);
+      proton::cupti::enableCallback<false>(
+          0, subscriber, CUPTI_CB_DOMAIN_DRIVER_API,
+          CUPTI_DRIVER_TRACE_CBID_cuGraphLaunch_ptsz);
       if (pcSamplingEnabled) {
         proton::setResourceCallbacks(subscriber, /*enable=*/false);
       }

diff --git a/src/env_config.cpp b/src/env_config.cpp
@@ -18,6 +18,7 @@ static const char *knownVars[] = {
     "PARCAGPU_RATE_LIMIT",
     "PARCAGPU_SAMPLING_FACTOR",
     "PARCAGPU_PC_SAMPLING_RATE",
+    "PARCAGPU_PC_HW_BUFFER_MB",
 };
 static constexpr size_t numKnownVars =
     sizeof(knownVars) / sizeof(knownVars[0]);
@@ -88,6 +89,16 @@ void validateEnvVars() {
                 "env_config: PARCAGPU_PC_SAMPLING_RATE invalid");
     }
   }
+
+  val = std::getenv("PARCAGPU_PC_HW_BUFFER_MB");
+  if (val) {
+    int mb = std::atoi(val);
+    if (mb < 0) {
+      DEBUG_PRINTF("[PARCAGPU] Warning: PARCAGPU_PC_HW_BUFFER_MB=%s invalid "
+                   "(must be >= 0), ignoring\n", val);
+      fireError(0, val, "env_config: PARCAGPU_PC_HW_BUFFER_MB invalid");
+    }
+  }
 }
 
 } // namespace parcagpu
diff --git a/src/pc_sampling.cpp b/src/pc_sampling.cpp
@@ -347,12 +347,23 @@ CUpti_PCSamplingConfigurationInfo ConfigureData::configureScratchBuffer() {
   return scratchBufferInfo;
 }
 
+// MB to force for the PC-sampling hardware buffer, from PARCAGPU_PC_HW_BUFFER_MB.
+// 0 (unset/invalid/negative) means "leave at CUPTI default" — see the note in
+// ConfigureData::initialize() for why we don't force it by default.
+int pcHardwareBufferMB() {
+  const char *env = getenv("PARCAGPU_PC_HW_BUFFER_MB");
+  if (!env)
+    return 0;
+  int mb = atoi(env);
+  return mb > 0 ? mb : 0;
+}
+
 CUpti_PCSamplingConfigurationInfo ConfigureData::configureHardwareBufferSize() {
   CUpti_PCSamplingConfigurationInfo hardwareBufferInfo{};
   hardwareBufferInfo.attributeType =
       CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_HARDWARE_BUFFER_SIZE;
   hardwareBufferInfo.attributeData.hardwareBufferSizeData.hardwareBufferSize =
-      HardwareBufferSize;
+      (size_t)hardwareBufferMB * 1024 * 1024;
   return hardwareBufferInfo;
 }
 
@@ -381,15 +392,33 @@ void ConfigureData::initialize(CUcontext context) {
   DEBUG_PRINTF("Initializing PC sampling for context %p (id %u)\n", context,
                contextId);
 
+  hardwareBufferMB = pcHardwareBufferMB();
+
   configurationInfos.emplace_back(configureStallReasons());
   configurationInfos.emplace_back(configureCollectionMode());
   configurationInfos.emplace_back(configureStartStopControl());
   configurationInfos.emplace_back(configureSamplingBuffer());
-  // Bigger scratch + hardware buffers so a busy workload (PyTorch-class)
-  // doesn't overflow CUPTI's defaults within minutes and start returning
-  // CUPTI_ERROR_OUT_OF_MEMORY from cuptiPCSamplingGetData.
+  // Bigger scratch buffer so a busy workload (PyTorch-class) doesn't overflow
+  // CUPTI's default within minutes and start returning CUPTI_ERROR_OUT_OF_MEMORY
+  // from cuptiPCSamplingGetData.
   configurationInfos.emplace_back(configureScratchBuffer());
-  configurationInfos.emplace_back(configureHardwareBufferSize());
+  // By default do NOT force the hardware buffer size. On Blackwell (sm_120/
+  // sm_121, e.g. GB10) under live injection + activity recording, setting
+  // CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_HARDWARE_BUFFER_SIZE makes every
+  // cuptiPCSamplingGetData return CUPTI_ERROR_OUT_OF_MEMORY (0 PCs drained) —
+  // confirmed independent of the value (even 16 MB fails) and not reproducible
+  // in a standalone CUPTI sample with the identical attribute set. CUPTI's
+  // default hardware buffer works; if it overflows it reports droppedSamples
+  // rather than wedging GetData.
+  //
+  // Escape hatch: set PARCAGPU_PC_HW_BUFFER_MB=<n> to force an n-MB hardware
+  // buffer (0 = leave at CUPTI default, same as unset). Lets ops re-enable /
+  // tune it on GPUs where the larger buffer helps busy workloads.
+  if (hardwareBufferMB > 0) {
+    DEBUG_PRINTF("[PARCAGPU] Forcing PC hardware buffer = %d MB "
+                 "(PARCAGPU_PC_HW_BUFFER_MB)\n", hardwareBufferMB);
+    configurationInfos.emplace_back(configureHardwareBufferSize());
+  }
   // Don't set sampling period — let CUPTI use its default.
   // Explicit period values silently break sampling on some GPUs (e.g.
   // Blackwell).

diff --git a/src/pc_sampling.h b/src/pc_sampling.h
@@ -87,6 +87,7 @@ struct ConfigureData {
   uint32_t contextId;
   uint32_t numStallReasons{};
   uint32_t numValidStallReasons{};
+  int hardwareBufferMB{}; // PARCAGPU_PC_HW_BUFFER_MB; 0 = CUPTI default
   char **stallReasonNames{};
   uint32_t *stallReasonIndices{};
   std::map<size_t, size_t> stallReasonIndexToMetricIndex{};

diff --git a/test/graph-repro-real.sh b/test/graph-repro-real.sh
@@ -0,0 +1,55 @@
+#!/usr/bin/env bash
+# graph-repro-real.sh — real-GPU guard: runs graph_repro under the shim and
+# checks the cuda_correlation USDT fires for driver cuGraphLaunch (cbid -514).
+# Zero => FAIL (cbid not subscribed). Needs an NVIDIA GPU, nvcc, bpftrace, root.
+set -euo pipefail
+
+ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+LIB="$ROOT/build-local/lib/libparcagpucupti.so"
+SECS="${1:-15}"
+BIN="$(mktemp -d)/graph_repro"
+COUNT_LOG="$(mktemp)"
+
+[ -f "$LIB" ] || { echo "error: $LIB not found — run 'make local' first" >&2; exit 1; }
+command -v nvcc >/dev/null || { echo "error: nvcc not found" >&2; exit 1; }
+command -v bpftrace >/dev/null || { echo "error: bpftrace not found" >&2; exit 1; }
+
+echo "=== Building reproducer ==="
+nvcc -O2 -o "$BIN" "$ROOT/test/graph_repro.cu" -lcuda
+
+echo "=== Launching reproducer under parcagpu shim ==="
+LD_PRELOAD="$LIB" PARCAGPU_DEBUG=1 "$BIN" "$SECS" 2>"$(mktemp)" &
+APP_PID=$!
+
+# Wait for the shim to be mapped.
+for _ in $(seq 1 50); do
+  grep -q libparcagpucupti "/proc/$APP_PID/maps" 2>/dev/null && break
+  sleep 0.1
+done
+
+echo "=== Attaching bpftrace to cuda_correlation USDT (driver cbid 514 => -514) ==="
+# @driver: driver cuGraphLaunch (path under test); @runtime: positive cbid
+bpftrace -p "$APP_PID" \
+  -e "usdt:$LIB:parcagpu:cuda_correlation /arg1 == -514 || arg1 == -515/ { @driver = count(); }
+      usdt:$LIB:parcagpu:cuda_correlation /arg1 >= 0/ { @runtime = count(); }" \
+  >"$COUNT_LOG" 2>/dev/null &
+BT_PID=$!
+
+wait "$APP_PID" || true
+sleep 1
+kill "$BT_PID" 2>/dev/null || true
+wait "$BT_PID" 2>/dev/null || true
+
+echo "=== bpftrace results ==="
+cat "$COUNT_LOG"
+
+DRIVER=$(grep -oE "@driver: [0-9]+" "$COUNT_LOG" | grep -oE "[0-9]+" || echo 0)
+echo
+if [ "${DRIVER:-0}" -gt 0 ]; then
+  echo "PASS: driver cuGraphLaunch produced $DRIVER correlation events"
+  exit 0
+else
+  echo "FAIL: zero driver cuGraphLaunch correlation events — driver graph-launch" >&2
+  echo "      cbid is not subscribed (CuptiProfiler::initialize)." >&2
+  exit 1
+fi
diff --git a/test/graph_repro.cu b/test/graph_repro.cu
@@ -0,0 +1,94 @@
+// graph_repro.cu — replays a captured graph via the DRIVER cuGraphLaunch (the
+// path parcagpu missed). Build: nvcc -o graph_repro test/graph_repro.cu -lcuda
+// Run: ./graph_repro [seconds]. See test/graph-repro-real.sh for the guard.
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <cstdio>
+#include <cstdlib>
+#include <ctime>
+
+#define CUDA_CHECK(x)                                                          \
+  do {                                                                         \
+    cudaError_t e = (x);                                                       \
+    if (e != cudaSuccess) {                                                    \
+      fprintf(stderr, "cuda error %s at %s:%d\n", cudaGetErrorString(e),       \
+              __FILE__, __LINE__);                                             \
+      exit(1);                                                                 \
+    }                                                                          \
+  } while (0)
+
+#define CU_CHECK(x)                                                            \
+  do {                                                                         \
+    CUresult e = (x);                                                          \
+    if (e != CUDA_SUCCESS) {                                                   \
+      const char *s = nullptr;                                                 \
+      cuGetErrorString(e, &s);                                                 \
+      fprintf(stderr, "driver error %s at %s:%d\n", s ? s : "?", __FILE__,     \
+              __LINE__);                                                       \
+      exit(1);                                                                 \
+    }                                                                          \
+  } while (0)
+
+__global__ void add_one(float *x, int n) {
+  int i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < n)
+    x[i] += 1.0f;
+}
+
+int main(int argc, char **argv) {
+  int seconds = (argc > 1) ? atoi(argv[1]) : 20;
+
+  // Force driver + runtime init.
+  CUDA_CHECK(cudaFree(0));
+  CU_CHECK(cuInit(0));
+
+  const int n = 1 << 16;
+  float *d = nullptr;
+  CUDA_CHECK(cudaMalloc(&d, n * sizeof(float)));
+
+  cudaStream_t stream;
+  CUDA_CHECK(cudaStreamCreate(&stream));
+
+  // Build a graph by capturing a few kernel launches into the stream.
+  CUDA_CHECK(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal));
+  for (int k = 0; k < 8; k++) {
+    add_one<<<(n + 255) / 256, 256, 0, stream>>>(d, n);
+  }
+  cudaGraph_t graph;
+  CUDA_CHECK(cudaStreamEndCapture(stream, &graph));
+
+  cudaGraphExec_t exec;
+  CUDA_CHECK(cudaGraphInstantiate(&exec, graph, nullptr, nullptr, 0));
+
+  // cudaGraphExec_t is the same handle as CUgraphExec; replay via driver API.
+  CUgraphExec drvExec = reinterpret_cast<CUgraphExec>(exec);
+  CUstream drvStream = reinterpret_cast<CUstream>(stream);
+
+  fprintf(stderr,
+          "graph_repro: replaying graph via driver cuGraphLaunch for %ds\n",
+          seconds);
+
+  struct timespec start;
+  clock_gettime(CLOCK_MONOTONIC, &start);
+  unsigned long long launches = 0;
+  for (;;) {
+    CU_CHECK(cuGraphLaunch(drvExec, drvStream));
+    launches++;
+    if ((launches & 0x3ff) == 0) {
+      CUDA_CHECK(cudaStreamSynchronize(stream));
+      struct timespec now;
+      clock_gettime(CLOCK_MONOTONIC, &now);
+      if (now.tv_sec - start.tv_sec >= seconds)
+        break;
+    }
+  }
+  CUDA_CHECK(cudaStreamSynchronize(stream));
+  fprintf(stderr, "graph_repro: done, %llu driver graph launches\n", launches);
+
+  cudaGraphExecDestroy(exec);
+  cudaGraphDestroy(graph);
+  cudaStreamDestroy(stream);
+  cudaFree(d);
+  return 0;
+}
diff --git a/test/mock_cupti.c b/test/mock_cupti.c
@@ -45,13 +45,47 @@ CUptiResult cuptiSubscribe(CUpti_SubscriberHandle *subscriber,
     return CUPTI_SUCCESS;
 }
 
+// Subscribed (domain, cbid) pairs, so the harness can mirror real CUPTI dispatch.
+#define MOCK_MAX_ENABLED 512
+static struct {
+    CUpti_CallbackDomain domain;
+    CUpti_CallbackId cbid;
+} __enabled_cbs[MOCK_MAX_ENABLED];
+static size_t __enabled_cb_count = 0;
+
+int __mock_cupti_is_callback_enabled(CUpti_CallbackDomain domain,
+                                     CUpti_CallbackId cbid) {
+    for (size_t i = 0; i < __enabled_cb_count; i++) {
+        if (__enabled_cbs[i].domain == domain && __enabled_cbs[i].cbid == cbid) {
+            return 1;
+        }
+    }
+    return 0;
+}
+
 CUptiResult cuptiEnableCallback(uint32_t enable,
                                 CUpti_SubscriberHandle subscriber,
                                 CUpti_CallbackDomain domain,
                                 CUpti_CallbackId cbid) {
     (void)subscriber;  // Mark as intentionally unused
     fprintf(stderr, "[MOCK_CUPTI] cuptiEnableCallback(enable=%u, domain=%u, cbid=%u)\n",
             enable, domain, cbid);
+    if (enable) {
+        if (!__mock_cupti_is_callback_enabled(domain, cbid) &&
+            __enabled_cb_count < MOCK_MAX_ENABLED) {
+            __enabled_cbs[__enabled_cb_count].domain = domain;
+            __enabled_cbs[__enabled_cb_count].cbid = cbid;
+            __enabled_cb_count++;
+        }
+    } else {
+        for (size_t i = 0; i < __enabled_cb_count; i++) {
+            if (__enabled_cbs[i].domain == domain &&
+                __enabled_cbs[i].cbid == cbid) {
+                __enabled_cbs[i] = __enabled_cbs[--__enabled_cb_count];
+                break;
+            }
+        }
+    }
     return CUPTI_SUCCESS;
 }