Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/a2a3/platform/onboard/host/host_regs.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ static void get_aicore_regs(std::vector<int64_t>& regs, uint64_t device_id) {
if (rt != 0) {
LOG_ERROR("get_aicore_reg_info failed, using placeholder addresses");
// Fallback: generate placeholder addresses
for (int i = 0; i < DAV_2201::PLATFORM_MAX_PHYSICAL_CORES; i++) {
for (uint32_t i = 0; i < DAV_2201::PLATFORM_MAX_PHYSICAL_CORES; i++) {
aic.push_back(0xDEADBEEF00000000ULL + (i * 0x800000)); // 8M stride
aiv.push_back(0xDEADBEEF00000000ULL + (i * 0x800000) + 0x100000);
aiv.push_back(0xDEADBEEF00000000ULL + (i * 0x800000) + 0x200000);
Expand Down
67 changes: 43 additions & 24 deletions src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1777,34 +1777,53 @@ int32_t AicpuExecutor::run(Runtime* runtime) {
// Print orchestrator profiling data
#if PTO2_ORCH_PROFILING
PTO2OrchProfilingData p = pto2_orchestrator_get_profiling();
uint64_t total = p.sync_cycle + p.alloc_cycle + p.params_cycle +
p.lookup_cycle + p.heap_cycle + p.insert_cycle +
p.fanin_cycle;
uint64_t total = p.sync_cycle + p.alloc_cycle + p.params_cycle + p.lookup_cycle + p.heap_cycle +
p.insert_cycle + p.fanin_cycle;
if (total == 0) total = 1; // avoid div-by-zero
DEV_ALWAYS("Thread %d: === Orchestrator Profiling: %lld tasks, total=%.3fus ===", thread_idx,
(long long)p.submit_count, cycles_to_us(total));
DEV_ALWAYS("Thread %d: sync_tensormap : %.3fus (%.1f%%)", thread_idx, cycles_to_us(p.sync_cycle), p.sync_cycle * 100.0 / total);
DEV_ALWAYS("Thread %d: task_ring_alloc: %.3fus (%.1f%%) work=%.3fus wait=%.3fus atomics=%llu", thread_idx,
cycles_to_us(p.alloc_cycle), p.alloc_cycle * 100.0 / total,
cycles_to_us(p.alloc_cycle - p.alloc_wait_cycle), cycles_to_us(p.alloc_wait_cycle),
DEV_ALWAYS("Thread %d: === Orchestrator Profiling: %lld tasks, total=%.3fus ===",
thread_idx,
(long long)p.submit_count,
cycles_to_us(total));
DEV_ALWAYS("Thread %d: task_ring_alloc: %.3fus (%.1f%%) work=%.3fus wait=%.3fus atomics=%llu",
thread_idx,
cycles_to_us(p.alloc_cycle),
p.alloc_cycle * 100.0 / total,
cycles_to_us(p.alloc_cycle - p.alloc_wait_cycle),
cycles_to_us(p.alloc_wait_cycle),
(unsigned long long)p.alloc_atomic_count);
DEV_ALWAYS("Thread %d: param_copy : %.3fus (%.1f%%) atomics=%llu", thread_idx,
cycles_to_us(p.params_cycle), p.params_cycle * 100.0 / total,
(unsigned long long)p.params_atomic_count);
DEV_ALWAYS("Thread %d: lookup+dep : %.3fus (%.1f%%)", thread_idx, cycles_to_us(p.lookup_cycle), p.lookup_cycle * 100.0 / total);
DEV_ALWAYS("Thread %d: heap_alloc : %.3fus (%.1f%%) work=%.3fus wait=%.3fus atomics=%llu", thread_idx,
cycles_to_us(p.heap_cycle), p.heap_cycle * 100.0 / total,
cycles_to_us(p.heap_cycle - p.heap_wait_cycle), cycles_to_us(p.heap_wait_cycle),
DEV_ALWAYS("Thread %d: heap_alloc : %.3fus (%.1f%%) work=%.3fus wait=%.3fus atomics=%llu",
thread_idx,
cycles_to_us(p.heap_cycle),
p.heap_cycle * 100.0 / total,
cycles_to_us(p.heap_cycle - p.heap_wait_cycle),
cycles_to_us(p.heap_wait_cycle),
(unsigned long long)p.heap_atomic_count);
DEV_ALWAYS("Thread %d: tensormap_ins : %.3fus (%.1f%%)", thread_idx, cycles_to_us(p.insert_cycle), p.insert_cycle * 100.0 / total);
DEV_ALWAYS("Thread %d: fanin+ready : %.3fus (%.1f%%) work=%.3fus wait=%.3fus atomics=%llu", thread_idx,
cycles_to_us(p.fanin_cycle), p.fanin_cycle * 100.0 / total,
cycles_to_us(p.fanin_cycle - p.fanin_wait_cycle), cycles_to_us(p.fanin_wait_cycle),
DEV_ALWAYS("Thread %d: sync_tensormap : %.3fus (%.1f%%)",
thread_idx,
cycles_to_us(p.sync_cycle),
p.sync_cycle * 100.0 / total);
DEV_ALWAYS("Thread %d: lookup+dep : %.3fus (%.1f%%)",
thread_idx,
cycles_to_us(p.lookup_cycle),
p.lookup_cycle * 100.0 / total);
DEV_ALWAYS("Thread %d: tensormap_ins : %.3fus (%.1f%%)",
thread_idx,
cycles_to_us(p.insert_cycle),
p.insert_cycle * 100.0 / total);
DEV_ALWAYS("Thread %d: param_copy : %.3fus (%.1f%%) atomics=%llu",
thread_idx,
cycles_to_us(p.params_cycle),
p.params_cycle * 100.0 / total,
(unsigned long long)p.params_atomic_count);
DEV_ALWAYS("Thread %d: fanin+ready : %.3fus (%.1f%%) work=%.3fus wait=%.3fus atomics=%llu",
thread_idx,
cycles_to_us(p.fanin_cycle),
p.fanin_cycle * 100.0 / total,
cycles_to_us(p.fanin_cycle - p.fanin_wait_cycle),
cycles_to_us(p.fanin_wait_cycle),
(unsigned long long)p.fanin_atomic_count);
DEV_ALWAYS("Thread %d: scope_end : %.3fus atomics=%llu", thread_idx,
cycles_to_us(p.scope_end_cycle),
(unsigned long long)p.scope_end_atomic_count);
DEV_ALWAYS("Thread %d: avg/task : %.3fus", thread_idx,
DEV_ALWAYS("Thread %d: avg/task : %.3fus",
thread_idx,
p.submit_count > 0 ? cycles_to_us(total) / p.submit_count : 0.0);

#if PTO2_TENSORMAP_PROFILING
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,6 @@ PTO2DepListPool dep_pool;

// After: per-ring array
PTO2RingSet rings[PTO2_MAX_RING_DEPTH];
PTO2DepListEntry* dep_pool_cur_entries[PTO2_MAX_RING_DEPTH];
int32_t dep_pool_last_reclaimed[PTO2_MAX_RING_DEPTH];
```

Expand Down
Loading
Loading