Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 32 additions & 10 deletions src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -294,6 +294,9 @@ struct AicpuExecutor {
uint64_t& sched_complete_perf_cycle
#endif
) {
#if !PTO2_PROFILING
(void)hank;
#endif
for (int32_t i = ct.running_count - 1; i >= 0; i--) {
int32_t core_id = ct.running[i];
uint64_t reg_addr = core_id_to_reg_addr_[core_id];
Expand Down Expand Up @@ -504,6 +507,9 @@ struct AicpuExecutor {
int32_t thread_idx
#endif
) {
#if !PTO2_PROFILING
(void)runtime;
#endif
PTO2DispatchPayload& payload = s_pto2_payload_per_core[core_id];
PTO2TaskDescriptor& task = *slot_state.task;
int32_t slot_idx = static_cast<int32_t>(subslot);
Expand Down Expand Up @@ -986,6 +992,11 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime* runtime, int32_t threa

bool cores_released = false;

#if PTO2_PROFILING
// Benchmark: scheduler lifetime start timestamp (independent of enable_profiling)
DEV_ALWAYS("Thread=%d sched_start=%llu", thread_idx, (unsigned long long)get_sys_cnt_aicpu());
#endif

while (true) {
bool made_progress = false;
#if PTO2_PROFILING
Expand Down Expand Up @@ -1270,8 +1281,13 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime* runtime, int32_t threa
_t0_phase = _t1;
phase_dispatch_count = 0;
}
#endif
}
#endif

#if !PTO2_PROFILING
(void)try_completed;
(void)try_pushed;
#endif

if (made_progress) {
idle_iterations = 0;
Expand Down Expand Up @@ -1392,6 +1408,11 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime* runtime, int32_t threa
}
if (idle_iterations > MAX_IDLE_ITERATIONS) {
DEV_ERROR("Thread %d: PTO2 timeout after %d idle iterations", thread_idx, idle_iterations);
#if PTO2_PROFILING
// Benchmark: scheduler lifetime end timestamp on timeout path
DEV_ALWAYS("Thread=%d sched_end(timeout)=%llu",
thread_idx, (unsigned long long)get_sys_cnt_aicpu());
#endif
return -1;
} else {
SPIN_WAIT_HINT();
Expand Down Expand Up @@ -1754,13 +1775,16 @@ int32_t AicpuExecutor::run(Runtime* runtime) {
DEV_ALWAYS("Thread %d: Calling aicpu_orchestration_entry from SO (orch_idx=%d/(0~%d))",
thread_idx, orch_idx, orch_thread_num_ - 1);
#if PTO2_PROFILING
DEV_ALWAYS("Thread=%d orch_start=%llu", thread_idx, (unsigned long long)get_sys_cnt_aicpu());
uint64_t orch_cycle_start = get_sys_cnt_aicpu();
DEV_ALWAYS("Thread=%d orch_start=%llu", thread_idx, (unsigned long long)orch_cycle_start);
#endif
PTO2_SCOPE(rt) { orch_func_(rt, orch_args_cached_, orch_arg_count_cached_, orch_thread_num_, orch_idx); }
#if PTO2_PROFILING
uint64_t orch_cycle_end = get_sys_cnt_aicpu();
DEV_ALWAYS("Thread %d: aicpu_orchestration_entry returned, cost %.3fus (orch_idx=%d)",
// Function-level timing: measures only orch_func_ execution time.
// This differs from `orch_end`, which marks stage-level end right before
// requesting core transition (includes additional post-orchestration work).
DEV_ALWAYS("Thread %d: aicpu_orchestration_entry returned, orch_func_cost=%.3fus (orch_idx=%d)",
thread_idx, cycles_to_us(orch_cycle_end - orch_cycle_start), orch_idx);
#endif

Expand Down Expand Up @@ -1911,8 +1935,9 @@ int32_t AicpuExecutor::run(Runtime* runtime) {
// Compute new core assignments for all threads and initialize donated slots
DEV_INFO("Thread %d: Set orchestrator_done=true, requesting core transition", thread_idx);
#if PTO2_PROFILING
// Benchmark: record orchestrator end timestamp before waiting for schedulers
DEV_ALWAYS("BENCHMARK: thread=%d end=%llu", thread_idx, (unsigned long long)get_sys_cnt_aicpu());
// Stage-level end: orchestrator phase finishes before requesting transition.
DEV_ALWAYS("Thread=%d orch_stage_end=%llu", thread_idx,
(unsigned long long)get_sys_cnt_aicpu());
#endif
transition_requested_.store(true, std::memory_order_release);

Expand Down Expand Up @@ -1978,14 +2003,11 @@ int32_t AicpuExecutor::run(Runtime* runtime) {
{
const int32_t* shutdown_cores = core_assignments_[thread_idx];
int32_t shutdown_count = core_count_per_thread_[thread_idx];
#if PTO2_PROFILING
// Benchmark: record scheduler end timestamp before shutdown cleanup
if (shutdown_count > 0) {
DEV_ALWAYS("Thread=%d end=%llu",
#if PTO2_PROFILING
DEV_ALWAYS("Thread=%d sched_end=%llu",
thread_idx, (unsigned long long)get_sys_cnt_aicpu());
}
#endif
if (shutdown_count > 0) {
auto rc = shutdown_aicore(runtime, thread_idx, shutdown_cores, shutdown_count);
if (rc != 0) {
return rc;
Expand Down
70 changes: 52 additions & 18 deletions src/a2a3/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,38 +46,70 @@ Each sub-level macro requires `PTO2_PROFILING=1`:
- Deadlock/livelock detection

**What's NOT compiled:**
- All profiling counters
- All profiling logs
- Performance data collection

**Log output:** 11 DEV_ALWAYS logs (debug/diagnostic only)
- All `CYCLE_COUNT_*` timing counters (`sched_*_cycle`, orchestrator cost counters)
- Scheduler/Orchestrator profiling summary logs guarded by `#if PTO2_PROFILING`
- Performance data collection paths (`enable_profiling` runtime flag becomes ineffective because profiling code is not compiled)

**Log output (timing-related):**
- No `sched_start/sched_end` timestamps
- No `orch_start/orch_stage_end` timestamps
- No `Scheduler summary: total_time=...`
- No orchestration function cost log (`aicpu_orchestration_entry returned, ...us`)

**Observed output pattern (from `device-2007835_20260319163914744.log`):**
- Keeps lifecycle/debug diagnostics such as:
- `Thread X: Start`
- `Thread X: Starting PTO2 dispatch`
- `Thread X: Calling aicpu_orchestration_entry from SO (...)`
- `Thread X: Starting PTO2 dispatch` (orch threads re-enter as sched after orchestration)
- `Thread X: Last thread, marking executor finished`
- Does not print Level-1 timing lines (`sched_start`, `orch_start`, `orch_func_cost`, `orch_stage_end`, `Scheduler summary`, `sched_end`)

---

### Level 1: Basic Profiling (PTO2_PROFILING=1)

**What's compiled:**
- All profiling counters (cycles, task counts, loop counts)
- Basic profiling summaries
- Scheduler summary output
- Orchestration completion time
- Base timing counters for scheduler loop (`sched_complete/dispatch/idle/scan`)
- Per-thread orchestration timing (`orch_start`, `orch_func_cost`)
- Stage-level orchestration end timestamp (`orch_stage_end`, printed by last orch thread only, marks the moment all orch threads have finished and core transition is about to be requested)
- Scheduler summary output (`total_time`, `loops`, `tasks_scheduled`)

**What's NOT compiled:**
- Detailed phase breakdowns
- TensorMap statistics

**Log output:** 13 DEV_ALWAYS logs
- 11 debug/diagnostic logs (always present)
- 2 basic profiling summaries:
- Orchestration completion time
- Total submitted tasks
**Log output:** adds timing/profiling logs on top of Level 0.

**Scheduler output:**
**Log output (additional lines vs Level 0, per round):**
- `Thread=%d sched_start=%llu` — each sched thread, at scheduler loop start
- `Thread=%d orch_start=%llu` — each orch thread, before `orch_func_` call
- `Thread %d: aicpu_orchestration_entry returned, orch_func_cost=%.3fus (orch_idx=%d)` — each orch thread, after `orch_func_` returns
- `PTO2 total submitted tasks = %d, already executed %d tasks` — last orch thread only
- `Thread=%d orch_stage_end=%llu` — last orch thread only, before core transition request
- `Thread %d: Scheduler summary: total_time=%.3fus, loops=%llu, tasks_scheduled=%d` — each sched thread
- `Thread=%d sched_end=%llu` — each sched thread, before `shutdown_aicore` (normal path)
- `Thread=%d sched_end(timeout)=%llu` — timeout path only

**Example log output:**
```
Thread X: Scheduler summary: total_time=XXXus, loops=XXX, tasks_scheduled=XXX
Thread=1 sched_start=46614688541727
Thread=0 sched_start=46614688541742
Thread=3 orch_start=46614688544502
Thread=2 orch_start=46614688546216
Thread 2: aicpu_orchestration_entry returned, orch_func_cost=59.600us (orch_idx=0)
Thread 3: aicpu_orchestration_entry returned, orch_func_cost=116.020us (orch_idx=1)
PTO2 total submitted tasks = 13, already executed 13 tasks
Thread=3 orch_stage_end=46614688551970
Thread 0: Scheduler summary: total_time=96.040us, loops=2239, tasks_scheduled=4
Thread=0 sched_end=46614688552812
Thread 1: Scheduler summary: total_time=157.840us, loops=3724, tasks_scheduled=9
Thread=1 sched_end=46614688553944
```

**Note:** Scheduler summary always prints when `PTO2_PROFILING=1`, regardless of `enable_profiling` flag.
**Note:**
- All logs above are controlled by compile-time macro `PTO2_PROFILING`, not by `enable_profiling`.
- `enable_profiling` only controls shared-memory data collection / swimlane export.

---

Expand Down Expand Up @@ -257,10 +289,12 @@ add_definitions(-DPTO2_ORCH_PROFILING=1)

## Log Output Summary

> Counts based on `examples/a2a3/tensormap_and_ringbuffer/paged_attention` (2 sched + 2 orch threads).

| Level | Macro Settings | DEV_ALWAYS Count | Description |
|-------|---------------|------------------|-------------|
| 0 | `PTO2_PROFILING=0` | 11 | Debug/diagnostic only |
| 1 | `PTO2_PROFILING=1` | 21 | Basic summaries |
| 2 | `+PTO2_SCHED_PROFILING=1` | 18 | Scheduler detailed |
| 3 | `+PTO2_ORCH_PROFILING=1` | 30 | Orchestrator detailed |
| 4 | `+PTO2_TENSORMAP_PROFILING=1` | 34 | TensorMap stats |

> NOTE(review): the Level-1 count was re-measured as 21 in this change, but the
> Level 2–4 counts (18 / 30 / 34) still appear to be pre-change values — each
> level is additive, so Level 2 cannot print fewer lines than Level 1. Re-count
> Levels 2–4 against the same `paged_attention` run before merging.
Expand Down
1 change: 1 addition & 0 deletions tools/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -275,6 +275,7 @@ flowchart TD
当前预配置的 examples:
- `alternating_matmul_add`
- `benchmark_bgemm`
- `paged_attention_unroll`
- `batch_paged_attention`
- `paged_attention`

Expand Down
2 changes: 1 addition & 1 deletion tools/perf_to_mermaid.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,7 +208,7 @@ def main():
parser.add_argument('--style', choices=['detailed', 'compact'], default='detailed',
help='节点信息密度:detailed(详细,包含核心和时间)或 compact(紧凑,仅函数名)')
parser.add_argument('--direction', choices=['TD', 'LR'], default='TD',
help='流程图方向:TD(从上到下)或 LR(从左到右,默认)')
help='流程图方向:TD(从上到下,默认)或 LR(从左到右)')
parser.add_argument('-v', '--verbose', action='store_true', help='详细输出')

args = parser.parse_args()
Expand Down
Loading