Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,9 @@ static inline void perf_aicore_record_task(
record->start_time = start_time;
record->end_time = end_time;
record->kernel_ready_time = kernel_ready_time;
record->task_id = task_id;
record->mixed_task_id = task_id;
record->fanout_count = 0;
record->fanout_filled = 0;

perf_buf->count = idx + 1;

Expand Down
8 changes: 4 additions & 4 deletions src/a2a3/platform/include/aicpu/performance_collector_aicpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -101,12 +101,12 @@ void perf_aicpu_init_phase_profiling(Runtime* runtime, int num_sched_threads, in
* @param start_time Phase start timestamp
* @param end_time Phase end timestamp
* @param loop_iter Current loop iteration number
* @param tasks_processed Number of tasks processed in this phase
* @param tasks_processed Number of tasks processed (scheduler) or mixed_task_id raw (orchestrator)
*/
void perf_aicpu_record_phase(int thread_idx,
AicpuPhaseId phase_id,
uint64_t start_time, uint64_t end_time,
uint32_t loop_iter, uint32_t tasks_processed);
uint32_t loop_iter, uint64_t tasks_processed);

/**
* Write orchestrator cumulative summary
Expand Down Expand Up @@ -138,11 +138,11 @@ void perf_aicpu_set_orch_thread_idx(int thread_idx);
* @param start_time Phase start timestamp
* @param end_time Phase end timestamp
* @param submit_idx Task submission index (acts as loop_iter)
* @param task_id Task ID (stored in tasks_processed field for task tracking)
* @param mixed_task_id Mixed task id raw value (pto2_task_id_raw) for cross-view correlation
*/
void perf_aicpu_record_orch_phase(AicpuPhaseId phase_id,
uint64_t start_time, uint64_t end_time,
uint32_t submit_idx, uint32_t task_id);
uint32_t submit_idx, uint64_t mixed_task_id);

/**
* Write core-to-thread assignment mapping to shared memory
Expand Down
15 changes: 9 additions & 6 deletions src/a2a3/platform/include/common/perf_profiling.h
Original file line number Diff line number Diff line change
Expand Up @@ -75,15 +75,16 @@ struct PerfRecord {
uint64_t finish_time; // AICPU timestamp: when AICPU observed task completion (task_status back to 0)

// Task identification
uint32_t task_id; // Register dispatch id (per-core monotonic counter, NOT mixed_task_id).
// May collide across cores; use (ring_id, task_id, core_id) as unique key.
uint64_t mixed_task_id; // pto2_task_id_raw (ring_id<<32 | local_id) for cross-view correlation.
// Written by AICore as dispatch counter; overwritten by AICPU executor.
uint32_t func_id; // Kernel function identifier
CoreType core_type; // Core type (AIC/AIV)
uint8_t ring_id; // Ring layer (0 for single-ring / legacy)

// Dependency relationship (fanout only)
int32_t fanout[RUNTIME_MAX_FANOUT]; // Successor task ID array
int32_t fanout_count; // Number of successor tasks
uint64_t fanout[RUNTIME_MAX_FANOUT]; // Successor task mixed_task_id array
int32_t fanout_count; // Number of successor tasks
uint8_t fanout_filled; // 1: fanout has been populated by AICPU or fallback
} __attribute__((aligned(64)));

static_assert(sizeof(PerfRecord) % 64 == 0,
Expand Down Expand Up @@ -262,8 +263,10 @@ struct AicpuPhaseRecord {
uint64_t end_time; // Phase end timestamp
uint32_t loop_iter; // Loop iteration number
AicpuPhaseId phase_id; // Phase type
uint32_t tasks_processed; // Tasks processed in this phase
uint32_t padding; // Alignment padding
union {
uint64_t mixed_task_id; // Orchestrator phases: pto2_task_id_raw for cross-view correlation
uint64_t tasks_processed; // Scheduler phases: number of tasks processed in this batch
};
};

/**
Expand Down
9 changes: 4 additions & 5 deletions src/a2a3/platform/src/aicpu/performance_collector_aicpu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -389,7 +389,7 @@ static void switch_phase_buffer(int thread_idx) {
void perf_aicpu_record_phase(int thread_idx,
AicpuPhaseId phase_id,
uint64_t start_time, uint64_t end_time,
uint32_t loop_iter, uint32_t tasks_processed) {
uint32_t loop_iter, uint64_t tasks_processed) {
if (s_phase_header == nullptr) {
return;
}
Expand Down Expand Up @@ -440,8 +440,7 @@ void perf_aicpu_record_phase(int thread_idx,
record->end_time = end_time;
record->loop_iter = loop_iter;
record->phase_id = phase_id;
record->tasks_processed = tasks_processed;
record->padding = 0;
record->mixed_task_id = tasks_processed;

buf->count = idx + 1;
}
Expand Down Expand Up @@ -470,9 +469,9 @@ void perf_aicpu_set_orch_thread_idx(int thread_idx) {

void perf_aicpu_record_orch_phase(AicpuPhaseId phase_id,
uint64_t start_time, uint64_t end_time,
uint32_t submit_idx, uint32_t task_id) {
uint32_t submit_idx, uint64_t mixed_task_id) {
if (s_orch_thread_idx < 0 || s_phase_header == nullptr) return;
perf_aicpu_record_phase(s_orch_thread_idx, phase_id, start_time, end_time, submit_idx, task_id);
perf_aicpu_record_phase(s_orch_thread_idx, phase_id, start_time, end_time, submit_idx, mixed_task_id);
}

void perf_aicpu_flush_phase_buffers(int thread_idx) {
Expand Down
10 changes: 5 additions & 5 deletions src/a2a3/platform/src/host/performance_collector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -919,7 +919,7 @@ int PerformanceCollector::export_swimlane_json(const std::string& output_path) {
// Sort by mixed_task_id (pto2_task_id_raw: ring_id<<32 | local_id)
std::sort(tagged_records.begin(), tagged_records.end(),
[](const TaggedRecord& a, const TaggedRecord& b) {
return a.record->task_id < b.record->task_id;
return a.record->mixed_task_id < b.record->mixed_task_id;
});

// Step 4: Calculate base time (minimum kernel_ready_time, including phase timestamps)
Expand All @@ -930,8 +930,8 @@ int PerformanceCollector::export_swimlane_json(const std::string& output_path) {
}
if (tagged.record->dispatch_time < base_time_cycles && tagged.record->dispatch_time > 0) {
base_time_cycles = tagged.record->dispatch_time;
LOG_WARN("Timestamp violation: dispatch_time (%lu) < base_time (%lu) for task %u, using dispatch_time as new base_time",
tagged.record->dispatch_time, base_time_cycles, tagged.record->task_id);
LOG_WARN("Timestamp violation: dispatch_time (%lu) < base_time (%lu) for task %llu, using dispatch_time as new base_time",
tagged.record->dispatch_time, base_time_cycles, (unsigned long long)tagged.record->mixed_task_id);
}
}

Expand Down Expand Up @@ -987,7 +987,7 @@ int PerformanceCollector::export_swimlane_json(const std::string& output_path) {
const char* core_type_str = (record.core_type == CoreType::AIC) ? "aic" : "aiv";

outfile << " {\n";
outfile << " \"task_id\": " << record.task_id << ",\n";
outfile << " \"task_id\": " << record.mixed_task_id << ",\n";
outfile << " \"func_id\": " << record.func_id << ",\n";
outfile << " \"core_id\": " << tagged.core_id << ",\n";
outfile << " \"core_type\": \"" << core_type_str << "\",\n";
Expand Down Expand Up @@ -1113,7 +1113,7 @@ int PerformanceCollector::export_swimlane_json(const std::string& output_path) {
<< ", \"start_time_us\": " << std::fixed << std::setprecision(3) << start_us
<< ", \"end_time_us\": " << std::fixed << std::setprecision(3) << end_us
<< ", \"submit_idx\": " << pr.loop_iter
<< ", \"task_id\": " << static_cast<int32_t>(pr.tasks_processed)
<< ", \"task_id\": " << static_cast<int64_t>(pr.mixed_task_id)
<< "}";
first = false;
}
Expand Down
4 changes: 2 additions & 2 deletions src/a2a3/runtime/host_build_graph/aicpu/aicpu_executor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -632,7 +632,7 @@ int AicpuExecutor::resolve_and_dispatch(Runtime& runtime, int thread_idx, const
uint32_t count = perf_buf->count;
if (count > 0) {
PerfRecord* record = &perf_buf->records[count - 1];
if (record->task_id == static_cast<uint32_t>(completed_task_id)) {
if (record->mixed_task_id == static_cast<uint64_t>(completed_task_id)) {
record->func_id = runtime.tasks[completed_task_id].func_id;
record->core_type = h->core_type;
perf_aicpu_record_dispatch_and_finish_time(
Expand Down Expand Up @@ -769,7 +769,7 @@ int AicpuExecutor::resolve_and_dispatch(Runtime& runtime, int thread_idx, const
uint32_t count = perf_buf->count;
if (count > 0) {
PerfRecord* record = &perf_buf->records[count - 1];
if (record->task_id == static_cast<uint32_t>(completed_task_id)) {
if (record->mixed_task_id == static_cast<uint64_t>(completed_task_id)) {
record->func_id = runtime.tasks[completed_task_id].func_id;
record->core_type = h->core_type;
perf_aicpu_record_dispatch_and_finish_time(
Expand Down
10 changes: 6 additions & 4 deletions src/a2a3/runtime/host_build_graph/runtime/runtime.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -228,18 +228,20 @@ void Runtime::complete_perf_records(PerfBuffer* perf_buf) {

for (uint32_t i = 0; i < count; i++) {
PerfRecord* record = &perf_buf->records[i];
uint32_t task_id = record->task_id;
// In host_build_graph, AICore writes a plain uint32_t dispatch counter into
// mixed_task_id (upper 32 bits are always 0), so truncating to uint32_t is safe.
uint32_t task_id = static_cast<uint32_t>(record->mixed_task_id);

// Query Task by task_id (O(1) array indexing)
Task* task = get_task(task_id);
record->fanout_count = 0;
if (task != nullptr) {
record->fanout_count = task->fanout_count;

for (int32_t j = 0; j < task->fanout_count; j++) {
record->fanout[j] = task->fanout[j];
record->fanout[j] = static_cast<uint64_t>(task->fanout[j]);
}
} else {
record->fanout_count = 0;
}
record->fanout_filled = 1;
}
}
16 changes: 10 additions & 6 deletions src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -367,9 +367,11 @@ struct AicpuExecutor {
uint32_t count = perf_buf->count;
if (count > 0) {
PerfRecord* record = &perf_buf->records[count - 1];
if (record->task_id == static_cast<uint32_t>(expected_reg_task_id)) {
if (record->mixed_task_id == static_cast<uint64_t>(expected_reg_task_id)) {
// Fill metadata that AICore doesn't know
int32_t perf_slot_idx = static_cast<int32_t>(executing_subslot_by_core_[core_id]);
// Overwrite with full mixed_task_id for cross-view correlation.
record->mixed_task_id = pto2_task_id_raw(slot_state.task->mixed_task_id);
record->func_id = slot_state.task->kernel_id[perf_slot_idx];
record->core_type = CT;
perf_aicpu_record_dispatch_and_finish_time(
Expand All @@ -384,10 +386,11 @@ struct AicpuExecutor {
record->fanout_count = 0;
PTO2DepListEntry* cur = slot_state.fanout_head;
while (cur != nullptr && record->fanout_count < RUNTIME_MAX_FANOUT) {
record->fanout[record->fanout_count++] = static_cast<int32_t>(
pto2_task_id_local(cur->slot_state->task->mixed_task_id));
record->fanout[record->fanout_count++] =
pto2_task_id_raw(cur->slot_state->task->mixed_task_id);
cur = cur->next;
}
record->fanout_filled = 1;
}
}
#if PTO2_SCHED_PROFILING
Expand Down Expand Up @@ -1716,9 +1719,10 @@ int32_t AicpuExecutor::run(Runtime* runtime) {
}
#endif

// With multi-ring, slot_states are per-ring inside the scheduler.
// Fanout fill-in in complete_perf_records is disabled (slot_states_ptr = nullptr).
runtime->set_pto2_slot_states_ptr(nullptr);
// Register per-ring slot states for complete_perf_records fallback.
for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
runtime->set_pto2_ring_slot_states_ptr(r, rt->scheduler.ring_sched_states[r].slot_states);
}

// Store shared state for other orchestrator threads
orch_func_ = orch_func;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ __attribute__((weak, visibility("hidden"))) uint64_t get_sys_cnt_aicpu() { retur
// The strong symbol from the AICPU build wins when profiling is available.
// Also hidden to prevent HOST .so from polluting the global symbol table.
__attribute__((weak, visibility("hidden"))) void perf_aicpu_record_orch_phase(
AicpuPhaseId, uint64_t, uint64_t, uint32_t, uint32_t) {}
AicpuPhaseId, uint64_t, uint64_t, uint32_t, uint64_t) {}
// Accumulated cycles per sub-step (only needed for ORCH_PROFILING export)
static uint64_t g_orch_sync_cycle = 0; // tensormap sync
static uint64_t g_orch_alloc_cycle = 0; // task ring alloc
Expand Down Expand Up @@ -78,7 +78,7 @@ uint64_t g_orch_scope_end_atomic_count = 0;
#include "aicpu/performance_collector_aicpu.h"
// Weak no-op fallbacks: keep this TU linking in builds where the AICPU
// profiling collector is absent; the strong symbols from the AICPU build
// override these at link time when profiling is available.
// NOTE(review): the next two parameter-list lines are the old/new pair of
// one diff line (last parameter widened uint32_t -> uint64_t to carry the
// full mixed_task_id raw value); only the uint64_t form should survive.
__attribute__((weak, visibility("hidden"))) uint64_t get_sys_cnt_aicpu() { return 0; }
__attribute__((weak, visibility("hidden"))) void perf_aicpu_record_orch_phase(
AicpuPhaseId, uint64_t, uint64_t, uint32_t, uint32_t) {}
AicpuPhaseId, uint64_t, uint64_t, uint32_t, uint64_t) {}
// submit_idx needed for swimlane task_id tagging (no cycle accumulation at this level)
static uint32_t g_orch_submit_idx = 0;
#define CYCLE_COUNT_START() \
Expand Down Expand Up @@ -387,7 +387,7 @@ void pto2_submit_mixed_task(
PTO2TaskSlotState* fanin_states[PTO2_MAX_INPUTS];
int32_t fanin_count = 0;

CYCLE_COUNT_LAP_RECORD(g_orch_alloc_cycle, AicpuPhaseId::ORCH_ALLOC, local_id);
CYCLE_COUNT_LAP_RECORD(g_orch_alloc_cycle, AicpuPhaseId::ORCH_ALLOC, static_cast<uint64_t>(mixed_task_id));

// === STEP 2: Calculate output size + heap alloc (read from params only, no GM access) ===
int32_t total_output_size = 0;
Expand All @@ -405,7 +405,7 @@ void pto2_submit_mixed_task(
if (!local_packed_base) { orch->fatal = true; return; }
local_packed_end = (char*)local_packed_base + total_output_size;
}
CYCLE_COUNT_LAP_RECORD(g_orch_heap_cycle, AicpuPhaseId::ORCH_HEAP, local_id);
CYCLE_COUNT_LAP_RECORD(g_orch_heap_cycle, AicpuPhaseId::ORCH_HEAP, static_cast<uint64_t>(mixed_task_id));
#if PTO2_ORCH_PROFILING
if (total_output_size > 0) {
g_orch_heap_atomic_count += 1; // heap_top.store in pto2_alloc_packed_buffer
Expand Down Expand Up @@ -484,7 +484,7 @@ void pto2_submit_mixed_task(
}
}

CYCLE_COUNT_LAP_RECORD(g_orch_lookup_cycle, AicpuPhaseId::ORCH_LOOKUP, local_id);
CYCLE_COUNT_LAP_RECORD(g_orch_lookup_cycle, AicpuPhaseId::ORCH_LOOKUP, static_cast<uint64_t>(mixed_task_id));

// === STEP 5: Register outputs/inouts in TensorMap (must be separate from lookup) ===
for (int i = 0; i < params.tensor_count; i++) {
Expand All @@ -496,7 +496,7 @@ void pto2_submit_mixed_task(
}
}

CYCLE_COUNT_LAP_RECORD(g_orch_insert_cycle, AicpuPhaseId::ORCH_INSERT, local_id);
CYCLE_COUNT_LAP_RECORD(g_orch_insert_cycle, AicpuPhaseId::ORCH_INSERT, static_cast<uint64_t>(mixed_task_id));

// === STEP 6: Batch-write to GM (single cache line burst) ===
// Deferred from allocation phase to avoid scattered GM writes that get
Expand All @@ -521,7 +521,7 @@ void pto2_submit_mixed_task(

payload->init(params);

CYCLE_COUNT_LAP_RECORD(g_orch_params_cycle, AicpuPhaseId::ORCH_PARAMS, local_id);
CYCLE_COUNT_LAP_RECORD(g_orch_params_cycle, AicpuPhaseId::ORCH_PARAMS, static_cast<uint64_t>(mixed_task_id));
#if PTO2_ORCH_PROFILING
g_orch_params_atomic_count += 2; // fanout_lock.store + fanout_count.store
#endif
Expand Down Expand Up @@ -586,7 +586,7 @@ void pto2_submit_mixed_task(
#endif
}

CYCLE_COUNT_LAP_RECORD(g_orch_fanin_cycle, AicpuPhaseId::ORCH_FANIN, local_id);
CYCLE_COUNT_LAP_RECORD(g_orch_fanin_cycle, AicpuPhaseId::ORCH_FANIN, static_cast<uint64_t>(mixed_task_id));

#if PTO2_PROFILING
orch->tasks_submitted++;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,7 @@ struct PTO2TaskId {

constexpr uint8_t ring() const { return static_cast<uint8_t>(raw >> 32); }
constexpr uint32_t local() const { return static_cast<uint32_t>(raw & 0xFFFFFFFFu); }
constexpr explicit operator uint64_t() const { return raw; }

constexpr bool operator==(const PTO2TaskId& other) const { return raw == other.raw; }
constexpr bool operator!=(const PTO2TaskId& other) const { return raw != other.raw; }
Expand Down
Loading
Loading