Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,8 @@ test-multi: local bpf-test
test-pc-real: local bpf-test microbenchmarks
sudo -E test/test-pc-real.sh

# Mock PC sampling test — no GPU required, uses mock CUPTI/CUDA.
# Mock PC sampling test — no GPU required, uses mock CUPTI/CUDA. Also covers
# graph launches (guards driver cuGraphLaunch CBID subscription).
test-pc-mock: local bpf-test
sudo -E test/test-pc-mock.sh

Expand Down
15 changes: 15 additions & 0 deletions src/cupti.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -260,6 +260,14 @@ class CuptiProfiler : public proton::Singleton<CuptiProfiler> {
proton::setRuntimeCallbacks(subscriber, /*enable=*/true);
proton::setLaunchCallbacks(subscriber, /*enable=*/true);

// Driver-API graph launches (cuGraphLaunch/_ptsz); not covered by setLaunch/
// setRuntimeCallbacks. Only these two cbids -- not setGraphCallbacks(), which
// also subscribes capture cbids that emit uncorrelated events.
proton::cupti::enableCallback<true>(1, subscriber, CUPTI_CB_DOMAIN_DRIVER_API,
CUPTI_DRIVER_TRACE_CBID_cuGraphLaunch);
proton::cupti::enableCallback<true>(1, subscriber, CUPTI_CB_DOMAIN_DRIVER_API,
CUPTI_DRIVER_TRACE_CBID_cuGraphLaunch_ptsz);

// Enable resource callbacks only if PC sampling is enabled
if (pcSamplingEnabled) {
proton::setResourceCallbacks(subscriber, /*enable=*/true);
Expand Down Expand Up @@ -305,6 +313,13 @@ class CuptiProfiler : public proton::Singleton<CuptiProfiler> {
if (subscriber) {
proton::setRuntimeCallbacks(subscriber, /*enable=*/false);
proton::setLaunchCallbacks(subscriber, /*enable=*/false);
// Mirror the graph-launch callbacks enabled in init().
proton::cupti::enableCallback<false>(0, subscriber,
CUPTI_CB_DOMAIN_DRIVER_API,
CUPTI_DRIVER_TRACE_CBID_cuGraphLaunch);
proton::cupti::enableCallback<false>(
0, subscriber, CUPTI_CB_DOMAIN_DRIVER_API,
CUPTI_DRIVER_TRACE_CBID_cuGraphLaunch_ptsz);
if (pcSamplingEnabled) {
proton::setResourceCallbacks(subscriber, /*enable=*/false);
}
Expand Down
11 changes: 11 additions & 0 deletions src/env_config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ static const char *knownVars[] = {
"PARCAGPU_RATE_LIMIT",
"PARCAGPU_SAMPLING_FACTOR",
"PARCAGPU_PC_SAMPLING_RATE",
"PARCAGPU_PC_HW_BUFFER_MB",
};
static constexpr size_t numKnownVars =
sizeof(knownVars) / sizeof(knownVars[0]);
Expand Down Expand Up @@ -88,6 +89,16 @@ void validateEnvVars() {
"env_config: PARCAGPU_PC_SAMPLING_RATE invalid");
}
}

val = std::getenv("PARCAGPU_PC_HW_BUFFER_MB");
if (val) {
int mb = std::atoi(val);
if (mb < 0) {
DEBUG_PRINTF("[PARCAGPU] Warning: PARCAGPU_PC_HW_BUFFER_MB=%s invalid "
"(must be >= 0), ignoring\n", val);
fireError(0, val, "env_config: PARCAGPU_PC_HW_BUFFER_MB invalid");
}
}
}

} // namespace parcagpu
39 changes: 34 additions & 5 deletions src/pc_sampling.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -347,12 +347,23 @@ CUpti_PCSamplingConfigurationInfo ConfigureData::configureScratchBuffer() {
return scratchBufferInfo;
}

// MB to force for the PC-sampling hardware buffer, from PARCAGPU_PC_HW_BUFFER_MB.
// 0 (unset/invalid/negative) means "leave at CUPTI default" — see the note in
// ConfigureData::initialize() for why we don't force it by default.
int pcHardwareBufferMB() {
const char *env = getenv("PARCAGPU_PC_HW_BUFFER_MB");
if (!env)
return 0;
int mb = atoi(env);
return mb > 0 ? mb : 0;
}

CUpti_PCSamplingConfigurationInfo ConfigureData::configureHardwareBufferSize() {
CUpti_PCSamplingConfigurationInfo hardwareBufferInfo{};
hardwareBufferInfo.attributeType =
CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_HARDWARE_BUFFER_SIZE;
hardwareBufferInfo.attributeData.hardwareBufferSizeData.hardwareBufferSize =
HardwareBufferSize;
(size_t)hardwareBufferMB * 1024 * 1024;
return hardwareBufferInfo;
}

Expand Down Expand Up @@ -381,15 +392,33 @@ void ConfigureData::initialize(CUcontext context) {
DEBUG_PRINTF("Initializing PC sampling for context %p (id %u)\n", context,
contextId);

hardwareBufferMB = pcHardwareBufferMB();

configurationInfos.emplace_back(configureStallReasons());
configurationInfos.emplace_back(configureCollectionMode());
configurationInfos.emplace_back(configureStartStopControl());
configurationInfos.emplace_back(configureSamplingBuffer());
// Bigger scratch + hardware buffers so a busy workload (PyTorch-class)
// doesn't overflow CUPTI's defaults within minutes and start returning
// CUPTI_ERROR_OUT_OF_MEMORY from cuptiPCSamplingGetData.
// Bigger scratch buffer so a busy workload (PyTorch-class) doesn't overflow
// CUPTI's default within minutes and start returning CUPTI_ERROR_OUT_OF_MEMORY
// from cuptiPCSamplingGetData.
configurationInfos.emplace_back(configureScratchBuffer());
configurationInfos.emplace_back(configureHardwareBufferSize());
// By default do NOT force the hardware buffer size. On Blackwell (sm_120/
// sm_121, e.g. GB10) under live injection + activity recording, setting
// CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_HARDWARE_BUFFER_SIZE makes every
// cuptiPCSamplingGetData return CUPTI_ERROR_OUT_OF_MEMORY (0 PCs drained) —
// confirmed independent of the value (even 16 MB fails) and not reproducible
// in a standalone CUPTI sample with the identical attribute set. CUPTI's
// default hardware buffer works; if it overflows it reports droppedSamples
// rather than wedging GetData.
//
// Escape hatch: set PARCAGPU_PC_HW_BUFFER_MB=<n> to force an n-MB hardware
// buffer (0 = leave at CUPTI default, same as unset). Lets ops re-enable /
// tune it on GPUs where the larger buffer helps busy workloads.
if (hardwareBufferMB > 0) {
DEBUG_PRINTF("[PARCAGPU] Forcing PC hardware buffer = %d MB "
"(PARCAGPU_PC_HW_BUFFER_MB)\n", hardwareBufferMB);
configurationInfos.emplace_back(configureHardwareBufferSize());
}
// Don't set sampling period — let CUPTI use its default.
// Explicit period values silently break sampling on some GPUs (e.g.
// Blackwell).
Expand Down
1 change: 1 addition & 0 deletions src/pc_sampling.h
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ struct ConfigureData {
uint32_t contextId;
uint32_t numStallReasons{};
uint32_t numValidStallReasons{};
int hardwareBufferMB{}; // PARCAGPU_PC_HW_BUFFER_MB; 0 = CUPTI default
char **stallReasonNames{};
uint32_t *stallReasonIndices{};
std::map<size_t, size_t> stallReasonIndexToMetricIndex{};
Expand Down
55 changes: 55 additions & 0 deletions test/graph-repro-real.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
#!/usr/bin/env bash
# graph-repro-real.sh — real-GPU guard: runs graph_repro under the shim and
# checks the cuda_correlation USDT fires for driver cuGraphLaunch (cbid -514).
# Zero => FAIL (cbid not subscribed). Needs an NVIDIA GPU, nvcc, bpftrace, root.
set -euo pipefail

ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
LIB="$ROOT/build-local/lib/libparcagpucupti.so"
SECS="${1:-15}"
BIN="$(mktemp -d)/graph_repro"
COUNT_LOG="$(mktemp)"

[ -f "$LIB" ] || { echo "error: $LIB not found — run 'make local' first" >&2; exit 1; }
command -v nvcc >/dev/null || { echo "error: nvcc not found" >&2; exit 1; }
command -v bpftrace >/dev/null || { echo "error: bpftrace not found" >&2; exit 1; }

echo "=== Building reproducer ==="
nvcc -O2 -o "$BIN" "$ROOT/test/graph_repro.cu" -lcuda

echo "=== Launching reproducer under parcagpu shim ==="
LD_PRELOAD="$LIB" PARCAGPU_DEBUG=1 "$BIN" "$SECS" 2>"$(mktemp)" &
APP_PID=$!

# Wait for the shim to be mapped.
for _ in $(seq 1 50); do
grep -q libparcagpucupti "/proc/$APP_PID/maps" 2>/dev/null && break
sleep 0.1
done

echo "=== Attaching bpftrace to cuda_correlation USDT (driver cbid 514 => -514) ==="
# @driver: driver cuGraphLaunch (path under test); @runtime: positive cbid
bpftrace -p "$APP_PID" \
-e "usdt:$LIB:parcagpu:cuda_correlation /arg1 == -514 || arg1 == -515/ { @driver = count(); }
usdt:$LIB:parcagpu:cuda_correlation /arg1 >= 0/ { @runtime = count(); }" \
>"$COUNT_LOG" 2>/dev/null &
BT_PID=$!

wait "$APP_PID" || true
sleep 1
kill "$BT_PID" 2>/dev/null || true
wait "$BT_PID" 2>/dev/null || true

echo "=== bpftrace results ==="
cat "$COUNT_LOG"

DRIVER=$(grep -oE "@driver: [0-9]+" "$COUNT_LOG" | grep -oE "[0-9]+" || echo 0)
echo
if [ "${DRIVER:-0}" -gt 0 ]; then
echo "PASS: driver cuGraphLaunch produced $DRIVER correlation events"
exit 0
else
echo "FAIL: zero driver cuGraphLaunch correlation events — driver graph-launch" >&2
echo " cbid is not subscribed (CuptiProfiler::initialize)." >&2
exit 1
fi
94 changes: 94 additions & 0 deletions test/graph_repro.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
// graph_repro.cu — replays a captured graph via the DRIVER cuGraphLaunch (the
// path parcagpu missed). Build: nvcc -o graph_repro test/graph_repro.cu -lcuda
// Run: ./graph_repro [seconds]. See test/graph-repro-real.sh for the guard.

#include <cuda.h>
#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>
#include <ctime>

#define CUDA_CHECK(x) \
do { \
cudaError_t e = (x); \
if (e != cudaSuccess) { \
fprintf(stderr, "cuda error %s at %s:%d\n", cudaGetErrorString(e), \
__FILE__, __LINE__); \
exit(1); \
} \
} while (0)

#define CU_CHECK(x) \
do { \
CUresult e = (x); \
if (e != CUDA_SUCCESS) { \
const char *s = nullptr; \
cuGetErrorString(e, &s); \
fprintf(stderr, "driver error %s at %s:%d\n", s ? s : "?", __FILE__, \
__LINE__); \
exit(1); \
} \
} while (0)

__global__ void add_one(float *x, int n) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < n)
x[i] += 1.0f;
}

int main(int argc, char **argv) {
int seconds = (argc > 1) ? atoi(argv[1]) : 20;

// Force driver + runtime init.
CUDA_CHECK(cudaFree(0));
CU_CHECK(cuInit(0));

const int n = 1 << 16;
float *d = nullptr;
CUDA_CHECK(cudaMalloc(&d, n * sizeof(float)));

cudaStream_t stream;
CUDA_CHECK(cudaStreamCreate(&stream));

// Build a graph by capturing a few kernel launches into the stream.
CUDA_CHECK(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal));
for (int k = 0; k < 8; k++) {
add_one<<<(n + 255) / 256, 256, 0, stream>>>(d, n);
}
cudaGraph_t graph;
CUDA_CHECK(cudaStreamEndCapture(stream, &graph));

cudaGraphExec_t exec;
CUDA_CHECK(cudaGraphInstantiate(&exec, graph, nullptr, nullptr, 0));

// cudaGraphExec_t is the same handle as CUgraphExec; replay via driver API.
CUgraphExec drvExec = reinterpret_cast<CUgraphExec>(exec);
CUstream drvStream = reinterpret_cast<CUstream>(stream);

fprintf(stderr,
"graph_repro: replaying graph via driver cuGraphLaunch for %ds\n",
seconds);

struct timespec start;
clock_gettime(CLOCK_MONOTONIC, &start);
unsigned long long launches = 0;
for (;;) {
CU_CHECK(cuGraphLaunch(drvExec, drvStream));
launches++;
if ((launches & 0x3ff) == 0) {
CUDA_CHECK(cudaStreamSynchronize(stream));
struct timespec now;
clock_gettime(CLOCK_MONOTONIC, &now);
if (now.tv_sec - start.tv_sec >= seconds)
break;
}
}
CUDA_CHECK(cudaStreamSynchronize(stream));
fprintf(stderr, "graph_repro: done, %llu driver graph launches\n", launches);

cudaGraphExecDestroy(exec);
cudaGraphDestroy(graph);
cudaStreamDestroy(stream);
cudaFree(d);
return 0;
}
34 changes: 34 additions & 0 deletions test/mock_cupti.c
Original file line number Diff line number Diff line change
Expand Up @@ -45,13 +45,47 @@ CUptiResult cuptiSubscribe(CUpti_SubscriberHandle *subscriber,
return CUPTI_SUCCESS;
}

// Subscribed (domain, cbid) pairs, so the harness can mirror real CUPTI dispatch.
#define MOCK_MAX_ENABLED 512
static struct {
CUpti_CallbackDomain domain;
CUpti_CallbackId cbid;
} __enabled_cbs[MOCK_MAX_ENABLED];
static size_t __enabled_cb_count = 0;

int __mock_cupti_is_callback_enabled(CUpti_CallbackDomain domain,
CUpti_CallbackId cbid) {
for (size_t i = 0; i < __enabled_cb_count; i++) {
if (__enabled_cbs[i].domain == domain && __enabled_cbs[i].cbid == cbid) {
return 1;
}
}
return 0;
}

CUptiResult cuptiEnableCallback(uint32_t enable,
CUpti_SubscriberHandle subscriber,
CUpti_CallbackDomain domain,
CUpti_CallbackId cbid) {
(void)subscriber; // Mark as intentionally unused
fprintf(stderr, "[MOCK_CUPTI] cuptiEnableCallback(enable=%u, domain=%u, cbid=%u)\n",
enable, domain, cbid);
if (enable) {
if (!__mock_cupti_is_callback_enabled(domain, cbid) &&
__enabled_cb_count < MOCK_MAX_ENABLED) {
__enabled_cbs[__enabled_cb_count].domain = domain;
__enabled_cbs[__enabled_cb_count].cbid = cbid;
__enabled_cb_count++;
}
} else {
for (size_t i = 0; i < __enabled_cb_count; i++) {
if (__enabled_cbs[i].domain == domain &&
__enabled_cbs[i].cbid == cbid) {
__enabled_cbs[i] = __enabled_cbs[--__enabled_cb_count];
break;
}
}
}
return CUPTI_SUCCESS;
}

Expand Down
Loading
Loading