From f6321ad11ea10ef53f4204d7868db72adec4b0e4 Mon Sep 17 00:00:00 2001
From: Divyansh Khanna
Date: Mon, 26 Jan 2026 17:05:02 -0800
Subject: [PATCH] callback registry .h/.cpp

---
NOTE(review): this copy of the patch had lost its line breaks and every
angle-bracketed token (author e-mail, include targets, template arguments).
The include targets and template arguments below were reconstructed from how
the code uses them. The CUPTI header is assumed to be cupti.h, as the source
of the CUPTI_*_TRACE_CBID_* constants -- confirm against the original commit.
The author e-mail could not be recovered, and the blob hashes on the "index"
lines are those of the original commit, not of this edited content.

 libkineto/libkineto_defs.bzl            |   1 +
 libkineto/src/CuptiCallbackRegistry.cpp | 223 ++++++++++++++++++++++++
 libkineto/src/CuptiCallbackRegistry.h   | 102 +++++++++++
 3 files changed, 326 insertions(+)
 create mode 100644 libkineto/src/CuptiCallbackRegistry.cpp
 create mode 100644 libkineto/src/CuptiCallbackRegistry.h

diff --git a/libkineto/libkineto_defs.bzl b/libkineto/libkineto_defs.bzl
index a9924b727..ebb417516 100644
--- a/libkineto/libkineto_defs.bzl
+++ b/libkineto/libkineto_defs.bzl
@@ -13,6 +13,7 @@ def get_libkineto_cupti_srcs(with_api = True):
     return [
         "src/CuptiActivityApi.cpp",
         "src/CuptiCallbackApi.cpp",
+        "src/CuptiCallbackRegistry.cpp",
         "src/CuptiEventApi.cpp",
         "src/CuptiMetricApi.cpp",
         "src/CuptiRangeProfiler.cpp",
diff --git a/libkineto/src/CuptiCallbackRegistry.cpp b/libkineto/src/CuptiCallbackRegistry.cpp
new file mode 100644
index 000000000..c99116e76
--- /dev/null
+++ b/libkineto/src/CuptiCallbackRegistry.cpp
@@ -0,0 +1,223 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include "CuptiCallbackRegistry.h"
+
+namespace KINETO_NAMESPACE {
+
+CuptiCallbackRegistry& CuptiCallbackRegistry::instance() {
+  static CuptiCallbackRegistry instance;
+  return instance;
+}
+
+// Returns the table for `domain`, or nullptr if the domain has no table.
+const std::unordered_map<uint32_t, CuptiCallbackRegistry::CallbackProps>*
+CuptiCallbackRegistry::getMapForDomain(CallbackDomain domain) const {
+  switch (domain) {
+    case CallbackDomain::RUNTIME:
+      return &runtimeCallbacks_;
+    case CallbackDomain::DRIVER:
+      return &driverCallbacks_;
+    default:
+      return nullptr;
+  }
+}
+
+// Registering the same (domain, cbid) twice overwrites the earlier entry.
+// Domains without a table are ignored.
+void CuptiCallbackRegistry::registerCallback(
+    CallbackDomain domain,
+    uint32_t cbid,
+    bool requiresFlowCorrelation,
+    bool isBlocklisted) {
+  CallbackProps props{requiresFlowCorrelation, isBlocklisted};
+  switch (domain) {
+    case CallbackDomain::RUNTIME:
+      runtimeCallbacks_[cbid] = props;
+      break;
+    case CallbackDomain::DRIVER:
+      driverCallbacks_[cbid] = props;
+      break;
+    default:
+      break;
+  }
+}
+
+// Appends the inclusive range [startCbid, endCbid] as given; the bounds are
+// not validated or merged with existing ranges.
+void CuptiCallbackRegistry::registerCallbackRange(
+    CallbackDomain domain,
+    uint32_t startCbid,
+    uint32_t endCbid,
+    bool requiresFlowCorrelation) {
+  callbackRanges_.push_back(
+      {domain, CallbackRange{startCbid, endCbid, requiresFlowCorrelation}});
+}
+
+CuptiCallbackRegistry::CuptiCallbackRegistry() {
+  // =========================================================================
+  // RUNTIME API - Kernel Launches
+  // =========================================================================
+  registerCallback(
+      /*domain=*/CallbackDomain::RUNTIME,
+      /*cbid=*/CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_v7000,
+      /*requiresFlowCorrelation=*/true,
+      /*isBlocklisted=*/false);
+
+  registerCallback(
+      /*domain=*/CallbackDomain::RUNTIME,
+      /*cbid=*/CUPTI_RUNTIME_TRACE_CBID_cudaLaunchCooperativeKernel_v9000,
+      /*requiresFlowCorrelation=*/true,
+      /*isBlocklisted=*/false);
+
+  registerCallback(
+      /*domain=*/CallbackDomain::RUNTIME,
+      /*cbid=*/
+      CUPTI_RUNTIME_TRACE_CBID_cudaLaunchCooperativeKernelMultiDevice_v9000,
+      /*requiresFlowCorrelation=*/true,
+      /*isBlocklisted=*/false);
+
+#if defined(CUPTI_API_VERSION) && CUPTI_API_VERSION >= 18
+  registerCallback(
+      /*domain=*/CallbackDomain::RUNTIME,
+      /*cbid=*/CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernelExC_v11060,
+      /*requiresFlowCorrelation=*/true,
+      /*isBlocklisted=*/false);
+#endif
+
+  // =========================================================================
+  // RUNTIME API - CUDA Graph Operations
+  // =========================================================================
+  registerCallback(
+      /*domain=*/CallbackDomain::RUNTIME,
+      /*cbid=*/CUPTI_RUNTIME_TRACE_CBID_cudaGraphLaunch_v10000,
+      /*requiresFlowCorrelation=*/true,
+      /*isBlocklisted=*/false);
+
+  // =========================================================================
+  // RUNTIME API - Synchronization Operations
+  // =========================================================================
+  registerCallback(
+      /*domain=*/CallbackDomain::RUNTIME,
+      /*cbid=*/CUPTI_RUNTIME_TRACE_CBID_cudaStreamSynchronize_v3020,
+      /*requiresFlowCorrelation=*/true,
+      /*isBlocklisted=*/false);
+
+  registerCallback(
+      /*domain=*/CallbackDomain::RUNTIME,
+      /*cbid=*/CUPTI_RUNTIME_TRACE_CBID_cudaDeviceSynchronize_v3020,
+      /*requiresFlowCorrelation=*/true,
+      /*isBlocklisted=*/false);
+
+  registerCallback(
+      /*domain=*/CallbackDomain::RUNTIME,
+      /*cbid=*/CUPTI_RUNTIME_TRACE_CBID_cudaStreamWaitEvent_v3020,
+      /*requiresFlowCorrelation=*/true,
+      /*isBlocklisted=*/false);
+
+  // =========================================================================
+  // RUNTIME API - Memory Operations (range-based)
+  // =========================================================================
+  registerCallbackRange(
+      /*domain=*/CallbackDomain::RUNTIME,
+      /*startCbid=*/CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy_v3020,
+      /*endCbid=*/CUPTI_RUNTIME_TRACE_CBID_cudaMemset2DAsync_v3020,
+      /*requiresFlowCorrelation=*/true);
+
+  // =========================================================================
+  // RUNTIME API - Blocklisted (noisy) Callbacks
+  // =========================================================================
+  registerCallback(
+      /*domain=*/CallbackDomain::RUNTIME,
+      /*cbid=*/CUPTI_RUNTIME_TRACE_CBID_cudaGetDevice_v3020,
+      /*requiresFlowCorrelation=*/false,
+      /*isBlocklisted=*/true);
+
+  registerCallback(
+      /*domain=*/CallbackDomain::RUNTIME,
+      /*cbid=*/CUPTI_RUNTIME_TRACE_CBID_cudaSetDevice_v3020,
+      /*requiresFlowCorrelation=*/false,
+      /*isBlocklisted=*/true);
+
+  registerCallback(
+      /*domain=*/CallbackDomain::RUNTIME,
+      /*cbid=*/CUPTI_RUNTIME_TRACE_CBID_cudaGetLastError_v3020,
+      /*requiresFlowCorrelation=*/false,
+      /*isBlocklisted=*/true);
+
+  registerCallback(
+      /*domain=*/CallbackDomain::RUNTIME,
+      /*cbid=*/CUPTI_RUNTIME_TRACE_CBID_cudaEventCreate_v3020,
+      /*requiresFlowCorrelation=*/false,
+      /*isBlocklisted=*/true);
+
+  registerCallback(
+      /*domain=*/CallbackDomain::RUNTIME,
+      /*cbid=*/CUPTI_RUNTIME_TRACE_CBID_cudaEventCreateWithFlags_v3020,
+      /*requiresFlowCorrelation=*/false,
+      /*isBlocklisted=*/true);
+
+  registerCallback(
+      /*domain=*/CallbackDomain::RUNTIME,
+      /*cbid=*/CUPTI_RUNTIME_TRACE_CBID_cudaEventDestroy_v3020,
+      /*requiresFlowCorrelation=*/false,
+      /*isBlocklisted=*/true);
+
+  // =========================================================================
+  // DRIVER API - Kernel Launches
+  // =========================================================================
+  registerCallback(
+      /*domain=*/CallbackDomain::DRIVER,
+      /*cbid=*/CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel,
+      /*requiresFlowCorrelation=*/true,
+      /*isBlocklisted=*/false);
+
+#if defined(CUDA_VERSION) && CUDA_VERSION >= 11060
+  registerCallback(
+      /*domain=*/CallbackDomain::DRIVER,
+      /*cbid=*/CUPTI_DRIVER_TRACE_CBID_cuLaunchKernelEx,
+      /*requiresFlowCorrelation=*/true,
+      /*isBlocklisted=*/false);
+#endif
+}
+
+bool CuptiCallbackRegistry::requiresFlowCorrelation(
+    CallbackDomain domain,
+    uint32_t cbid) const {
+  // Check explicit callbacks first; an explicit entry overrides any range.
+  const auto* map = getMapForDomain(domain);
+  if (map != nullptr) {
+    const auto it = map->find(cbid);
+    if (it != map->end()) {
+      return it->second.requiresFlowCorrelation;
+    }
+  }
+  // Check ranges (for memory operations)
+  for (const auto& [rangeDomain, range] : callbackRanges_) {
+    if (rangeDomain == domain && cbid >= range.startCbid &&
+        cbid <= range.endCbid) {
+      return range.requiresFlowCorrelation;
+    }
+  }
+  return false;
+}
+
+bool CuptiCallbackRegistry::isBlocklisted(CallbackDomain domain, uint32_t cbid)
+    const {
+  // Only explicit entries carry a blocklist flag; ranges are not consulted.
+  const auto* map = getMapForDomain(domain);
+  if (map != nullptr) {
+    const auto it = map->find(cbid);
+    if (it != map->end()) {
+      return it->second.isBlocklisted;
+    }
+  }
+  return false;
+}
+
+} // namespace KINETO_NAMESPACE
diff --git a/libkineto/src/CuptiCallbackRegistry.h b/libkineto/src/CuptiCallbackRegistry.h
new file mode 100644
index 000000000..cf2debfde
--- /dev/null
+++ b/libkineto/src/CuptiCallbackRegistry.h
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <cstdint>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include <cupti.h>
+
+namespace KINETO_NAMESPACE {
+
+// Domain of the CUPTI callback
+enum class CallbackDomain : uint8_t {
+  RUNTIME, // CUDA Runtime API (cudaXxx functions)
+  DRIVER, // CUDA Driver API (cuXxx functions)
+};
+
+// CuptiCallbackRegistry: Central registry for CUPTI callback metadata
+//
+// This class provides a single source of truth for all CUPTI callback
+// properties, replacing scattered hardcoded checks throughout the codebase.
+//
+// The registry is initialized lazily on first access via instance().
+// All callbacks are registered in the constructor with their properties.
+//
+// Usage:
+//   auto& registry = CuptiCallbackRegistry::instance();
+//   if (registry.requiresFlowCorrelation(CallbackDomain::RUNTIME, cbid)) {
+//     // Create flow correlation
+//   }
+//
+class CuptiCallbackRegistry {
+ public:
+  // Get the singleton instance (lazy initialization on first call)
+  static CuptiCallbackRegistry& instance();
+
+  // Disable copy/move
+  CuptiCallbackRegistry(const CuptiCallbackRegistry&) = delete;
+  CuptiCallbackRegistry& operator=(const CuptiCallbackRegistry&) = delete;
+  CuptiCallbackRegistry(CuptiCallbackRegistry&&) = delete;
+  CuptiCallbackRegistry& operator=(CuptiCallbackRegistry&&) = delete;
+
+  // Check if a callback requires flow correlation (CPU->GPU arrows in trace).
+  // Returns false for callbacks that are not registered.
+  bool requiresFlowCorrelation(CallbackDomain domain, uint32_t cbid) const;
+
+  // Check if a callback is blocklisted (should be filtered from traces).
+  // Returns false for callbacks that are not registered.
+  bool isBlocklisted(CallbackDomain domain, uint32_t cbid) const;
+
+ private:
+  CuptiCallbackRegistry();
+  ~CuptiCallbackRegistry() = default;
+
+  // Properties stored per callback
+  struct CallbackProps {
+    bool requiresFlowCorrelation;
+    bool isBlocklisted;
+  };
+
+  // Range of callbacks (for memory operations)
+  struct CallbackRange {
+    uint32_t startCbid;
+    uint32_t endCbid; // inclusive
+    bool requiresFlowCorrelation;
+  };
+
+  // Register a callback
+  void registerCallback(
+      CallbackDomain domain,
+      uint32_t cbid,
+      bool requiresFlowCorrelation,
+      bool isBlocklisted);
+
+  // Register a range of callbacks
+  void registerCallbackRange(
+      CallbackDomain domain,
+      uint32_t startCbid,
+      uint32_t endCbid,
+      bool requiresFlowCorrelation);
+
+  // Storage per domain
+  std::unordered_map<uint32_t, CallbackProps> runtimeCallbacks_;
+  std::unordered_map<uint32_t, CallbackProps> driverCallbacks_;
+
+  // Ranges for callbacks that use range-based matching
+  std::vector<std::pair<CallbackDomain, CallbackRange>> callbackRanges_;
+
+  // Helper to get the appropriate map
+  const std::unordered_map<uint32_t, CallbackProps>* getMapForDomain(
+      CallbackDomain domain) const;
+};
+
+} // namespace KINETO_NAMESPACE