Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
# ignore common items
.idea
.vscode
build/
build*
35 changes: 32 additions & 3 deletions libkineto/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,31 @@ endfunction()

project(kineto VERSION 0.1 LANGUAGES CXX C)

# Compile every target declared in this directory as position-independent code.
add_compile_options(-fPIC)

# Choose the libstdc++ dual-ABI setting (_GLIBCXX_USE_CXX11_ABI).
#
# KINETO_COMPILED_WITH_CXX11_ABI may be supplied by the caller (e.g. with
# -DKINETO_COMPILED_WITH_CXX11_ABI=1); a supplied value is used verbatim.
# When it is not defined, the `python` found on PATH is asked whether its
# torch package was built with the C++11 ABI, and the answer is normalised
# to exactly 0 or 1.
if(NOT DEFINED KINETO_COMPILED_WITH_CXX11_ABI)
  # Run the interpreter directly instead of going through `sh -x -c`: no POSIX
  # shell is required and no shell trace is written to stderr on each
  # configure run.
  execute_process(
    COMMAND
      python -c
      "import torch;print(1 if torch.compiled_with_cxx11_abi() else 0)"
    RESULT_VARIABLE kineto_abi_probe_status
    OUTPUT_VARIABLE KINETO_COMPILED_WITH_CXX11_ABI
    ERROR_VARIABLE kineto_abi_probe_stderr
    OUTPUT_STRIP_TRAILING_WHITESPACE
    ERROR_STRIP_TRAILING_WHITESPACE)

  # A non-zero exit code, or a non-numeric status string when the process
  # could not be started at all, means the probe produced no usable answer.
  # The historical fallback (ABI 0) is kept, but it is no longer silent.
  if(NOT kineto_abi_probe_status EQUAL 0)
    message(WARNING
      "Kineto: could not query torch.compiled_with_cxx11_abi() "
      "(status: ${kineto_abi_probe_status}); falling back to "
      "_GLIBCXX_USE_CXX11_ABI=0. Set KINETO_COMPILED_WITH_CXX11_ABI "
      "explicitly to override.\n${kineto_abi_probe_stderr}")
  endif()

  if(kineto_abi_probe_status EQUAL 0
     AND KINETO_COMPILED_WITH_CXX11_ABI GREATER 0)
    set(KINETO_COMPILED_WITH_CXX11_ABI 1)
  else()
    set(KINETO_COMPILED_WITH_CXX11_ABI 0)
  endif()

  # Do not leave probe bookkeeping behind in the directory scope.
  unset(kineto_abi_probe_status)
  unset(kineto_abi_probe_stderr)
endif()

# Add the definition once. It used to be appended to CMAKE_CXX_FLAGS as well,
# which put the same -D on every C++ command line twice.
add_compile_options(-D_GLIBCXX_USE_CXX11_ABI=${KINETO_COMPILED_WITH_CXX11_ABI})
message(STATUS "KINETO_COMPILED_WITH_CXX11_ABI:" ${KINETO_COMPILED_WITH_CXX11_ABI})

# Cache entry selecting the kind of library to build; defaults to "default".
# NOTE(review): the help text mentions "static", but the STRINGS property
# below only offers "default" and "shared" - confirm whether "static" should
# be listed as well.
set(KINETO_LIBRARY_TYPE "default" CACHE STRING
"Type of library (default, static or shared) to build")
set_property(CACHE KINETO_LIBRARY_TYPE PROPERTY STRINGS default shared)
# Boolean switch for the unit tests; defaults to ON.
option(KINETO_BUILD_TESTS "Build kineto unit tests" ON)
# Boolean switch for the DeviceActivityInterface backend; defaults to OFF.
option(KINETO_USE_DEVICE_ACTIVITY "using DeviceActivityInterface to collect device activity" OFF)

# Source and include directories, both resolved relative to the directory
# containing this CMakeLists.txt.
set(LIBKINETO_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/src")
set(LIBKINETO_INCLUDE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/include")
Expand All @@ -53,21 +74,24 @@ endif()

# Set LIBKINETO_NOCUPTI to explicitly disable CUPTI
# Otherwise, CUPTI is disabled if not found
IF (NOT CUDA_SOURCE_DIR OR NOT CUPTI_INCLUDE_DIR OR NOT CUDA_cupti_LIBRARY)
IF (NOT CUDA_SOURCE_DIR OR NOT CUPTI_INCLUDE_DIR OR NOT CUDA_cupti_LIBRARY OR KINETO_USE_DEVICE_ACTIVITY)
set(LIBKINETO_NOCUPTI ON CACHE BOOL "" FORCE)
endif()

IF (NOT ROCM_SOURCE_DIR AND NOT ROCTRACER_INCLUDE_DIR)
IF (NOT ROCM_SOURCE_DIR AND NOT ROCTRACER_INCLUDE_DIR OR KINETO_USE_DEVICE_ACTIVITY)
set(LIBKINETO_NOROCTRACER ON CACHE BOOL "" FORCE)
endif()

# Define file lists
if (LIBKINETO_NOCUPTI AND LIBKINETO_NOROCTRACER)
if (LIBKINETO_NOCUPTI AND LIBKINETO_NOROCTRACER AND NOT KINETO_USE_DEVICE_ACTIVITY)
get_filelist("get_libkineto_cpu_only_srcs(with_api=False)" LIBKINETO_SRCS)
message(INFO " CUPTI unavailable or disabled - not building GPU profilers")
elseif(NOT LIBKINETO_NOROCTRACER)
get_filelist("get_libkineto_roctracer_srcs(with_api=False)" LIBKINETO_SRCS)
message(INFO " Building with roctracer")
elseif(KINETO_USE_DEVICE_ACTIVITY)
get_filelist("get_libkineto_cpu_only_srcs(with_api=False)" LIBKINETO_SRCS)
message(INFO " Building with device activity")
else()
get_filelist("get_libkineto_cupti_srcs(with_api=False)" LIBKINETO_SRCS)
endif()
Expand Down Expand Up @@ -105,6 +129,11 @@ if (NOT LIBKINETO_NOROCTRACER)
target_compile_options(kineto_base PRIVATE "-D__HIP_PLATFORM_AMD__")
endif()

# When the device-activity backend is requested, define HAS_DEVICE_ACTIVITY
# for the compiler; the profiler sources test this macro in their
# #ifdef / #elif guards.
if(KINETO_USE_DEVICE_ACTIVITY)
  list(APPEND KINETO_COMPILE_OPTIONS -DHAS_DEVICE_ACTIVITY)
endif()

# Report the final option list so it is visible in the configure log.
message(STATUS "Kineto: KINETO_COMPILE_OPTIONS = ${KINETO_COMPILE_OPTIONS}")

# kineto_base and kineto_api are compiled with the same private option list.
target_compile_options(kineto_base PRIVATE "${KINETO_COMPILE_OPTIONS}")
target_compile_options(kineto_api PRIVATE "${KINETO_COMPILE_OPTIONS}")

Expand Down
44 changes: 44 additions & 0 deletions libkineto/include/DeviceActivityInterface.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
#pragma once

#include <atomic>
#include <functional>
#include <set>
#include <stdint.h>

#include "ActivityType.h"
#include "ITraceActivity.h"

namespace libkineto {

class ActivityLogger;

// Abstract interface through which the activity profiler drives a device
// tracing backend. When HAS_DEVICE_ACTIVITY is defined, the profiler holds a
// reference to an implementation of this class in place of the CUPTI or
// roctracer API objects.
class DeviceActivityInterface {
 public:
  // Kind of correlation flow a pushed or popped id belongs to. The profiler
  // proxy uses Default for regular correlation ids and User for user
  // correlation ids. End has no visible call site; presumably it is an
  // end-of-range marker - TODO confirm.
  enum CorrelationFlowType {
    Default,
    User,
    End,
  };

  // Set by a backend and polled by the profiler run loop, which finishes
  // trace collection once this becomes true.
  std::atomic<bool> stopCollection{false};

  virtual ~DeviceActivityInterface() = default;

  // Pushes correlation id `id` for the given flow type.
  virtual void pushCorrelationID(uint64_t id, CorrelationFlowType type) = 0;

  // Pops the most recent correlation id of the given flow type.
  virtual void popCorrelationID(CorrelationFlowType type) = 0;

  // Turns collection on for the given set of activity types.
  virtual void enableActivities(
      const std::set<ActivityType>& selectedActivities) = 0;

  // Turns collection off for the given set of activity types.
  virtual void disableActivities(
      const std::set<ActivityType>& selectedActivities) = 0;

  // Called by the profiler when it resets its trace data, immediately before
  // teardownContext().
  virtual void clearActivities() = 0;

  // Called by the profiler when it resets its trace data, immediately after
  // clearActivities().
  virtual void teardownContext() = 0;

  // Sets the maximum buffer size.
  // NOTE(review): the unit (bytes, records, ...) is not stated here - confirm
  // with the implementing backend.
  virtual void setMaxBufferSize(int32_t size) = 0;

  // Hands the collected device activities to `logger` and returns a count
  // that the profiler logs as the number of GPU records processed.
  //   linkedActivity - callback mapping an int32_t key to an ITraceActivity;
  //                    presumably a correlation-id lookup - TODO confirm.
  //   startTime, endTime - presumably the bounds of the capture window -
  //                    TODO confirm units and inclusiveness.
  virtual int32_t processActivities(
      ActivityLogger& logger,
      std::function<const ITraceActivity*(int32_t)> linkedActivity,
      int64_t startTime,
      int64_t endTime) = 0;
};

// Global pointer to the DeviceActivityInterface implementation in use. This is
// a declaration only; the definition is not part of this header. Code built
// with HAS_DEVICE_ACTIVITY dereferences the pointer without a null check, so
// presumably it must be assigned before the profiler is constructed - confirm
// against the backend that defines it.
extern DeviceActivityInterface* device_activity_singleton;

} // namespace libkineto
6 changes: 6 additions & 0 deletions libkineto/src/ActivityProfilerController.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@
#ifdef HAS_ROCTRACER
#include "RoctracerActivityApi.h"
#endif
#ifdef HAS_DEVICE_ACTIVITY
#include "DeviceActivityInterface.h"
#endif
#include "ThreadUtil.h"
#include "output_json.h"
#include "output_membuf.h"
Expand Down Expand Up @@ -48,6 +51,9 @@ ActivityProfilerController::ActivityProfilerController(
#ifdef HAS_ROCTRACER
profiler_ = std::make_unique<CuptiActivityProfiler>(
RoctracerActivityApi::singleton(), cpuOnly);
#elif HAS_DEVICE_ACTIVITY
profiler_ = std::make_unique<CuptiActivityProfiler>(
*device_activity_singleton, cpuOnly);
#else
profiler_ = std::make_unique<CuptiActivityProfiler>(
CuptiActivityApi::singleton(), cpuOnly);
Expand Down
19 changes: 19 additions & 0 deletions libkineto/src/ActivityProfilerProxy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@
#ifdef HAS_ROCTRACER
#include "RoctracerActivityApi.h"
#endif
#ifdef HAS_DEVICE_ACTIVITY
#include "DeviceActivityInterface.h"
#endif

namespace KINETO_NAMESPACE {

Expand Down Expand Up @@ -99,6 +102,10 @@ void ActivityProfilerProxy::pushCorrelationId(uint64_t id) {
RoctracerActivityApi::pushCorrelationID(id,
RoctracerActivityApi::CorrelationFlowType::Default);
#endif
#ifdef HAS_DEVICE_ACTIVITY
device_activity_singleton->pushCorrelationID(id,
DeviceActivityInterface::CorrelationFlowType::Default);
#endif
}

void ActivityProfilerProxy::popCorrelationId() {
Expand All @@ -108,16 +115,28 @@ void ActivityProfilerProxy::popCorrelationId() {
RoctracerActivityApi::popCorrelationID(
RoctracerActivityApi::CorrelationFlowType::Default);
#endif
#ifdef HAS_DEVICE_ACTIVITY
device_activity_singleton->popCorrelationID(
DeviceActivityInterface::CorrelationFlowType::Default);
#endif
}

void ActivityProfilerProxy::pushUserCorrelationId(uint64_t id) {
CuptiActivityApi::pushCorrelationID(id,
CuptiActivityApi::CorrelationFlowType::User);
#ifdef HAS_DEVICE_ACTIVITY
device_activity_singleton->pushCorrelationID(id,
DeviceActivityInterface::CorrelationFlowType::User);
#endif
}

void ActivityProfilerProxy::popUserCorrelationId() {
CuptiActivityApi::popCorrelationID(
CuptiActivityApi::CorrelationFlowType::User);
#ifdef HAS_DEVICE_ACTIVITY
device_activity_singleton->popCorrelationID(
DeviceActivityInterface::CorrelationFlowType::User);
#endif
}

void ActivityProfilerProxy::transferCpuTrace(
Expand Down
31 changes: 19 additions & 12 deletions libkineto/src/CuptiActivityProfiler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,9 @@
#ifdef HAS_ROCTRACER
#include "RoctracerActivityApi.h"
#endif
#ifdef HAS_DEVICE_ACTIVITY
#include "DeviceActivityInterface.h"
#endif
#include "output_base.h"

#include "Logger.h"
Expand Down Expand Up @@ -137,6 +140,10 @@ void CuptiActivityProfiler::transferCpuTrace(
CuptiActivityProfiler::CuptiActivityProfiler(
RoctracerActivityApi& cupti,
bool cpuOnly)
#elif HAS_DEVICE_ACTIVITY
CuptiActivityProfiler::CuptiActivityProfiler(
DeviceActivityInterface& cupti,
bool cpuOnly)
#else
CuptiActivityProfiler::CuptiActivityProfiler(
CuptiActivityApi& cupti,
Expand Down Expand Up @@ -215,7 +222,7 @@ void CuptiActivityProfiler::processTraceInternal(ActivityLogger& logger) {
}
}
#endif // HAS_CUPTI
#ifdef HAS_ROCTRACER
#if defined(HAS_ROCTRACER) || defined(HAS_DEVICE_ACTIVITY)
if (!cpuOnly_) {
VLOG(0) << "Retrieving GPU activity buffers";
const int count = cupti_.processActivities(
Expand All @@ -226,7 +233,7 @@ void CuptiActivityProfiler::processTraceInternal(ActivityLogger& logger) {
LOG(INFO) << "Processed " << count << " GPU records";
LOGGER_OBSERVER_ADD_EVENT_COUNT(count);
}
#endif // HAS_ROCTRACER
#endif // HAS_ROCTRACER || HAS_DEVICE_ACTIVITY

for (const auto& session : sessions_) {
LOG(INFO) << "Processing child profiler trace";
Expand Down Expand Up @@ -629,7 +636,7 @@ void CuptiActivityProfiler::configure(
LOGGER_OBSERVER_SET_GROUP_TRACE_ID(config_->requestGroupTraceID());
}

#if defined(HAS_CUPTI) || defined(HAS_ROCTRACER)
#if defined(HAS_CUPTI) || defined(HAS_ROCTRACER) || defined(HAS_DEVICE_ACTIVITY)
if (!cpuOnly_) {
// Enabling CUPTI activity tracing incurs a larger perf hit at first,
// presumably because structures are allocated and initialized, callbacks
Expand All @@ -653,7 +660,7 @@ void CuptiActivityProfiler::configure(
setupOverhead_, duration_cast<microseconds>(t2 - timestamp).count());
}
}
#endif // HAS_CUPTI || HAS_ROCTRACER
#endif // HAS_CUPTI || HAS_ROCTRACER || HAS_DEVICE_ACTIVITY

if (profilers_.size() > 0) {
configureChildProfilers();
Expand Down Expand Up @@ -699,7 +706,7 @@ void CuptiActivityProfiler::startTraceInternal(
void CuptiActivityProfiler::stopTraceInternal(
const time_point<system_clock>& now) {
captureWindowEndTime_ = libkineto::timeSinceEpoch(now);
#if defined(HAS_CUPTI) || defined(HAS_ROCTRACER)
#if defined(HAS_CUPTI) || defined(HAS_ROCTRACER) || defined(HAS_DEVICE_ACTIVITY)
if (!cpuOnly_) {
time_point<system_clock> timestamp;
if (VLOG_IS_ON(1)) {
Expand All @@ -716,7 +723,7 @@ void CuptiActivityProfiler::stopTraceInternal(
setupOverhead_, duration_cast<microseconds>(t2 - timestamp).count());
}
}
#endif // HAS_CUPTI || HAS_ROCTRACER
#endif // HAS_CUPTI || HAS_ROCTRACER || HAS_DEVICE_ACTIVITY

if (currentRunloopState_ == RunloopState::CollectTrace) {
VLOG(0) << "CollectTrace -> ProcessTrace";
Expand Down Expand Up @@ -756,7 +763,7 @@ const time_point<system_clock> CuptiActivityProfiler::performRunLoopStep(
case RunloopState::Warmup:
VLOG(1) << "State: Warmup";
warmup_done = derivedConfig_->isWarmupDone(now, currentIter);
#if defined(HAS_CUPTI) || defined(HAS_ROCTRACER)
#if defined(HAS_CUPTI) || defined(HAS_ROCTRACER) || defined(HAS_DEVICE_ACTIVITY)
// Flushing can take a while so avoid doing it close to the start time
if (!cpuOnly_ && currentIter < 0 &&
(derivedConfig_->isProfilingByIteration() ||
Expand All @@ -772,7 +779,7 @@ const time_point<system_clock> CuptiActivityProfiler::performRunLoopStep(
VLOG(0) << "Warmup -> WaitForRequest";
break;
}
#endif // HAS_CUPTI || HAS_ROCTRACER
#endif // HAS_CUPTI || HAS_ROCTRACER || HAS_DEVICE_ACTIVITY

if (warmup_done) {
UST_LOGGER_MARK_COMPLETED(kWarmUpStage);
Expand Down Expand Up @@ -805,9 +812,9 @@ const time_point<system_clock> CuptiActivityProfiler::performRunLoopStep(
collection_done = derivedConfig_->isCollectionDone(now, currentIter);

if (collection_done
#if defined(HAS_CUPTI) || defined(HAS_ROCTRACER)
#if defined(HAS_CUPTI) || defined(HAS_ROCTRACER) || defined(HAS_DEVICE_ACTIVITY)
|| cupti_.stopCollection
#endif // HAS_CUPTI || HAS_ROCTRACER
#endif // HAS_CUPTI || HAS_ROCTRACER || HAS_DEVICE_ACTIVITY
){
// Update runloop state first to prevent further updates to shared state
LOG(INFO) << "Tracing complete.";
Expand Down Expand Up @@ -931,12 +938,12 @@ void CuptiActivityProfiler::finalizeTrace(const Config& config, ActivityLogger&
}

void CuptiActivityProfiler::resetTraceData() {
#if defined(HAS_CUPTI) || defined(HAS_ROCTRACER)
#if defined(HAS_CUPTI) || defined(HAS_ROCTRACER) || defined(HAS_DEVICE_ACTIVITY)
if (!cpuOnly_) {
cupti_.clearActivities();
cupti_.teardownContext();
}
#endif // HAS_CUPTI || HAS_ROCTRACER
#endif // HAS_CUPTI || HAS_ROCTRACER || HAS_DEVICE_ACTIVITY
activityMap_.clear();
cpuCorrelationMap_.clear();
correlatedCudaActivities_.clear();
Expand Down
4 changes: 4 additions & 0 deletions libkineto/src/CuptiActivityProfiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ namespace KINETO_NAMESPACE {
class Config;
class CuptiActivityApi;
class RoctracerActivityApi;
class DeviceActivityInterface;

// This struct is a derived snapshot of the Config. And should not
// be mutable after construction.
Expand Down Expand Up @@ -107,6 +108,7 @@ class CuptiActivityProfiler {
public:
CuptiActivityProfiler(CuptiActivityApi& cupti, bool cpuOnly);
CuptiActivityProfiler(RoctracerActivityApi& rai, bool cpuOnly);
CuptiActivityProfiler(DeviceActivityInterface& deviceActivityApi, bool cpuOnly);
CuptiActivityProfiler(const CuptiActivityProfiler&) = delete;
CuptiActivityProfiler& operator=(const CuptiActivityProfiler&) = delete;

Expand Down Expand Up @@ -345,6 +347,8 @@ class CuptiActivityProfiler {
// Calls to CUPTI is encapsulated behind this interface
#ifdef HAS_ROCTRACER
RoctracerActivityApi& cupti_; // Design failure here
#elif HAS_DEVICE_ACTIVITY
DeviceActivityInterface& cupti_;
#else
CuptiActivityApi& cupti_;
#endif
Expand Down