From d3d3384d2abcb01abc6a705c8f7049f67915e6a7 Mon Sep 17 00:00:00 2001 From: wangzihao122 Date: Thu, 19 Mar 2026 16:59:54 +0800 Subject: [PATCH] Fix profiling timestamps, suppress unused-var warnings, and update docs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Move sched_start inside resolve_and_dispatch_pto2 after pto2_init_complete_ sync, so it accurately marks the start of the scheduler loop - Add sched_end(timeout) on the timeout exit path - Rename orch_end to orch_stage_end for clearer stage-boundary semantics; rename orch_func_cost log field to match - Reuse orch_cycle_start for orch_start timestamp to avoid a redundant get_sys_cnt_aicpu() call - Add (void) casts for hank/runtime/try_completed/try_pushed when PTO2_PROFILING=0 to suppress unused-variable warnings - Update profiling_levels.md: rewrite Level 0/1 sections with observed log patterns, real example output, and corrected DEV_ALWAYS counts (Level 1: 13 → 21, based on paged_attention 2-sched+2-orch run) - Fix perf_to_mermaid.py --direction help text (TD is default, not LR) - Add paged_attention_unroll to tools/README.md example list --- .../aicpu/aicpu_executor.cpp | 42 ++++++++--- .../docs/profiling_levels.md | 70 ++++++++++++++----- tools/README.md | 1 + tools/perf_to_mermaid.py | 2 +- 4 files changed, 86 insertions(+), 29 deletions(-) diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp index dfb5f960..3a93ab85 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp @@ -294,6 +294,9 @@ struct AicpuExecutor { uint64_t& sched_complete_perf_cycle #endif ) { +#if !PTO2_PROFILING + (void)hank; +#endif for (int32_t i = ct.running_count - 1; i >= 0; i--) { int32_t core_id = ct.running[i]; uint64_t reg_addr = core_id_to_reg_addr_[core_id]; @@ -504,6 +507,9 @@ struct 
AicpuExecutor { int32_t thread_idx #endif ) { +#if !PTO2_PROFILING + (void)runtime; +#endif PTO2DispatchPayload& payload = s_pto2_payload_per_core[core_id]; PTO2TaskDescriptor& task = *slot_state.task; int32_t slot_idx = static_cast(subslot); @@ -986,6 +992,11 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime* runtime, int32_t threa bool cores_released = false; +#if PTO2_PROFILING + // Benchmark: scheduler lifetime start timestamp (independent of enable_profiling) + DEV_ALWAYS("Thread=%d sched_start=%llu", thread_idx, (unsigned long long)get_sys_cnt_aicpu()); +#endif + while (true) { bool made_progress = false; #if PTO2_PROFILING @@ -1270,8 +1281,13 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime* runtime, int32_t threa _t0_phase = _t1; phase_dispatch_count = 0; } -#endif } +#endif + +#if !PTO2_PROFILING + (void)try_completed; + (void)try_pushed; +#endif if (made_progress) { idle_iterations = 0; @@ -1392,6 +1408,11 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime* runtime, int32_t threa } if (idle_iterations > MAX_IDLE_ITERATIONS) { DEV_ERROR("Thread %d: PTO2 timeout after %d idle iterations", thread_idx, idle_iterations); +#if PTO2_PROFILING + // Benchmark: scheduler lifetime end timestamp on timeout path + DEV_ALWAYS("Thread=%d sched_end(timeout)=%llu", + thread_idx, (unsigned long long)get_sys_cnt_aicpu()); +#endif return -1; } else { SPIN_WAIT_HINT(); @@ -1754,13 +1775,16 @@ int32_t AicpuExecutor::run(Runtime* runtime) { DEV_ALWAYS("Thread %d: Calling aicpu_orchestration_entry from SO (orch_idx=%d/(0~%d))", thread_idx, orch_idx, orch_thread_num_ - 1); #if PTO2_PROFILING - DEV_ALWAYS("Thread=%d orch_start=%llu", thread_idx, (unsigned long long)get_sys_cnt_aicpu()); uint64_t orch_cycle_start = get_sys_cnt_aicpu(); + DEV_ALWAYS("Thread=%d orch_start=%llu", thread_idx, (unsigned long long)orch_cycle_start); #endif PTO2_SCOPE(rt) { orch_func_(rt, orch_args_cached_, orch_arg_count_cached_, orch_thread_num_, orch_idx); } #if 
PTO2_PROFILING uint64_t orch_cycle_end = get_sys_cnt_aicpu(); - DEV_ALWAYS("Thread %d: aicpu_orchestration_entry returned, cost %.3fus (orch_idx=%d)", + // Function-level timing: measures only orch_func_ execution time. + // This differs from `orch_stage_end`, which marks stage-level end right before + // requesting core transition (includes additional post-orchestration work). + DEV_ALWAYS("Thread %d: aicpu_orchestration_entry returned, orch_func_cost=%.3fus (orch_idx=%d)", thread_idx, cycles_to_us(orch_cycle_end - orch_cycle_start), orch_idx); #endif @@ -1911,8 +1935,9 @@ int32_t AicpuExecutor::run(Runtime* runtime) { // Compute new core assignments for all threads and initialize donated slots DEV_INFO("Thread %d: Set orchestrator_done=true, requesting core transition", thread_idx); #if PTO2_PROFILING - // Benchmark: record orchestrator end timestamp before waiting for schedulers - DEV_ALWAYS("BENCHMARK: thread=%d end=%llu", thread_idx, (unsigned long long)get_sys_cnt_aicpu()); + // Stage-level end: orchestrator phase finishes before requesting transition. 
+ DEV_ALWAYS("Thread=%d orch_stage_end=%llu", thread_idx, + (unsigned long long)get_sys_cnt_aicpu()); #endif transition_requested_.store(true, std::memory_order_release); @@ -1978,14 +2003,11 @@ int32_t AicpuExecutor::run(Runtime* runtime) { { const int32_t* shutdown_cores = core_assignments_[thread_idx]; int32_t shutdown_count = core_count_per_thread_[thread_idx]; -#if PTO2_PROFILING - // Benchmark: record scheduler end timestamp before shutdown cleanup if (shutdown_count > 0) { - DEV_ALWAYS("Thread=%d end=%llu", +#if PTO2_PROFILING + DEV_ALWAYS("Thread=%d sched_end=%llu", thread_idx, (unsigned long long)get_sys_cnt_aicpu()); - } #endif - if (shutdown_count > 0) { auto rc = shutdown_aicore(runtime, thread_idx, shutdown_cores, shutdown_count); if (rc != 0) { return rc; diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md b/src/a2a3/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md index 47a65ac7..26734b0e 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md @@ -46,38 +46,70 @@ Each sub-level macro requires `PTO2_PROFILING=1`: - Deadlock/livelock detection **What's NOT compiled:** -- All profiling counters -- All profiling logs -- Performance data collection - -**Log output:** 11 DEV_ALWAYS logs (debug/diagnostic only) +- All `CYCLE_COUNT_*` timing counters (`sched_*_cycle`, orchestrator cost counters) +- Scheduler/Orchestrator profiling summary logs guarded by `#if PTO2_PROFILING` +- Performance data collection paths (`enable_profiling` runtime flag becomes ineffective because profiling code is not compiled) + +**Log output (timing-related):** +- No `sched_start/sched_end` timestamps +- No `orch_start/orch_stage_end` timestamps +- No `Scheduler summary: total_time=...` +- No orchestration function cost log (`aicpu_orchestration_entry returned, ...us`) + +**Observed output pattern (from `device-2007835_20260319163914744.log`):** +- 
Keeps lifecycle/debug diagnostics such as: + - `Thread X: Start` + - `Thread X: Starting PTO2 dispatch` + - `Thread X: Calling aicpu_orchestration_entry from SO (...)` + - `Thread X: Starting PTO2 dispatch` (orch threads re-enter as sched after orchestration) + - `Thread X: Last thread, marking executor finished` +- Does not print Level-1 timing lines (`sched_start`, `orch_start`, `orch_func_cost`, `orch_stage_end`, `Scheduler summary`, `sched_end`) --- ### Level 1: Basic Profiling (PTO2_PROFILING=1) **What's compiled:** -- All profiling counters (cycles, task counts, loop counts) -- Basic profiling summaries -- Scheduler summary output -- Orchestration completion time +- Base timing counters for scheduler loop (`sched_complete/dispatch/idle/scan`) +- Per-thread orchestration timing (`orch_start`, `orch_func_cost`) +- Stage-level orchestration end timestamp (`orch_stage_end`, printed by last orch thread only, marks the moment all orch threads have finished and core transition is about to be requested) +- Scheduler summary output (`total_time`, `loops`, `tasks_scheduled`) **What's NOT compiled:** - Detailed phase breakdowns - TensorMap statistics -**Log output:** 13 DEV_ALWAYS logs -- 11 debug/diagnostic logs (always present) -- 2 basic profiling summaries: - - Orchestration completion time - - Total submitted tasks +**Log output:** adds timing/profiling logs on top of Level 0. 
-**Scheduler output:** +**Log output (additional lines vs Level 0, per round):** +- `Thread=%d sched_start=%llu` — each sched thread, at scheduler loop start +- `Thread=%d orch_start=%llu` — each orch thread, before `orch_func_` call +- `Thread %d: aicpu_orchestration_entry returned, orch_func_cost=%.3fus (orch_idx=%d)` — each orch thread, after `orch_func_` returns +- `PTO2 total submitted tasks = %d, already executed %d tasks` — last orch thread only +- `Thread=%d orch_stage_end=%llu` — last orch thread only, before core transition request +- `Thread %d: Scheduler summary: total_time=%.3fus, loops=%llu, tasks_scheduled=%d` — each sched thread +- `Thread=%d sched_end=%llu` — each sched thread, before `shutdown_aicore` (normal path) +- `Thread=%d sched_end(timeout)=%llu` — timeout path only + +**Example log output:** ``` -Thread X: Scheduler summary: total_time=XXXus, loops=XXX, tasks_scheduled=XXX +Thread=1 sched_start=46614688541727 +Thread=0 sched_start=46614688541742 +Thread=3 orch_start=46614688544502 +Thread=2 orch_start=46614688546216 +Thread 2: aicpu_orchestration_entry returned, orch_func_cost=59.600us (orch_idx=0) +Thread 3: aicpu_orchestration_entry returned, orch_func_cost=116.020us (orch_idx=1) +PTO2 total submitted tasks = 13, already executed 13 tasks +Thread=3 orch_stage_end=46614688551970 +Thread 0: Scheduler summary: total_time=96.040us, loops=2239, tasks_scheduled=4 +Thread=0 sched_end=46614688552812 +Thread 1: Scheduler summary: total_time=157.840us, loops=3724, tasks_scheduled=9 +Thread=1 sched_end=46614688553944 ``` -**Note:** Scheduler summary always prints when `PTO2_PROFILING=1`, regardless of `enable_profiling` flag. +**Note:** +- All logs above are controlled by compile-time macro `PTO2_PROFILING`, not by `enable_profiling`. +- `enable_profiling` only controls shared-memory data collection / swimlane export. 
--- @@ -257,10 +289,12 @@ add_definitions(-DPTO2_ORCH_PROFILING=1) ## Log Output Summary +> Counts based on `examples/a2a3/tensormap_and_ringbuffer/paged_attention` (2 sched + 2 orch threads). + | Level | Macro Settings | DEV_ALWAYS Count | Description | |-------|---------------|------------------|-------------| | 0 | `PTO2_PROFILING=0` | 11 | Debug/diagnostic only | -| 1 | `PTO2_PROFILING=1` | 13 | Basic summaries | +| 1 | `PTO2_PROFILING=1` | 21 | Basic summaries | | 2 | `+PTO2_SCHED_PROFILING=1` | 18 | Scheduler detailed | | 3 | `+PTO2_ORCH_PROFILING=1` | 30 | Orchestrator detailed | | 4 | `+PTO2_TENSORMAP_PROFILING=1` | 34 | TensorMap stats | diff --git a/tools/README.md b/tools/README.md index bdede4f5..349b72a8 100644 --- a/tools/README.md +++ b/tools/README.md @@ -275,6 +275,7 @@ flowchart TD 当前预配置的 examples: - `alternating_matmul_add` - `benchmark_bgemm` +- `paged_attention_unroll` - `batch_paged_attention` - `paged_attention` diff --git a/tools/perf_to_mermaid.py b/tools/perf_to_mermaid.py index 5f2d920b..9d7134e9 100755 --- a/tools/perf_to_mermaid.py +++ b/tools/perf_to_mermaid.py @@ -208,7 +208,7 @@ def main(): parser.add_argument('--style', choices=['detailed', 'compact'], default='detailed', help='节点信息密度:detailed(详细,包含核心和时间)或 compact(紧凑,仅函数名)') parser.add_argument('--direction', choices=['TD', 'LR'], default='TD', - help='流程图方向:TD(从上到下)或 LR(从左到右,默认)') + help='流程图方向:TD(从上到下,默认)或 LR(从左到右)') parser.add_argument('-v', '--verbose', action='store_true', help='详细输出') args = parser.parse_args()