From d3d3384d2abcb01abc6a705c8f7049f67915e6a7 Mon Sep 17 00:00:00 2001 From: wangzihao122 Date: Thu, 19 Mar 2026 16:59:54 +0800 Subject: [PATCH] Fix profiling timestamps, suppress unused-var warnings, and update docs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Move sched_start inside resolve_and_dispatch_pto2 after pto2_init_complete_ sync, so it accurately marks the start of the scheduler loop - Add sched_end(timeout) on the timeout exit path - Rename orch_end to orch_stage_end for clearer stage-boundary semantics; rename orch_func_cost log field to match - Reuse orch_cycle_start for orch_start timestamp to avoid a redundant get_sys_cnt_aicpu() call - Add (void) casts for hank/runtime/try_completed/try_pushed when PTO2_PROFILING=0 to suppress unused-variable warnings - Update profiling_levels.md: rewrite Level 0/1 sections with observed log patterns, real example output, and corrected DEV_ALWAYS counts (Level 1: 13 → 21, based on paged_attention 2-sched+2-orch run) - Fix perf_to_mermaid.py --direction help text (TD is default, not LR) - Add paged_attention_unroll to tools/README.md example list --- .../aicpu/aicpu_executor.cpp | 42 ++++++++--- .../docs/profiling_levels.md | 70 ++++++++++++++----- tools/README.md | 1 + tools/perf_to_mermaid.py | 2 +- 4 files changed, 86 insertions(+), 29 deletions(-) diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp index dfb5f960..3a93ab85 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp @@ -294,6 +294,9 @@ struct AicpuExecutor { uint64_t& sched_complete_perf_cycle #endif ) { +#if !PTO2_PROFILING + (void)hank; +#endif for (int32_t i = ct.running_count - 1; i >= 0; i--) { int32_t core_id = ct.running[i]; uint64_t reg_addr = core_id_to_reg_addr_[core_id]; @@ -504,6 +507,9 @@ struct 
AicpuExecutor { int32_t thread_idx #endif ) { +#if !PTO2_PROFILING + (void)runtime; +#endif PTO2DispatchPayload& payload = s_pto2_payload_per_core[core_id]; PTO2TaskDescriptor& task = *slot_state.task; int32_t slot_idx = static_cast(subslot); @@ -986,6 +992,11 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime* runtime, int32_t threa bool cores_released = false; +#if PTO2_PROFILING + // Benchmark: scheduler lifetime start timestamp (independent of enable_profiling) + DEV_ALWAYS("Thread=%d sched_start=%llu", thread_idx, (unsigned long long)get_sys_cnt_aicpu()); +#endif + while (true) { bool made_progress = false; #if PTO2_PROFILING @@ -1270,8 +1281,13 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime* runtime, int32_t threa _t0_phase = _t1; phase_dispatch_count = 0; } -#endif } +#endif + +#if !PTO2_PROFILING + (void)try_completed; + (void)try_pushed; +#endif if (made_progress) { idle_iterations = 0; @@ -1392,6 +1408,11 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime* runtime, int32_t threa } if (idle_iterations > MAX_IDLE_ITERATIONS) { DEV_ERROR("Thread %d: PTO2 timeout after %d idle iterations", thread_idx, idle_iterations); +#if PTO2_PROFILING + // Benchmark: scheduler lifetime end timestamp on timeout path + DEV_ALWAYS("Thread=%d sched_end(timeout)=%llu", + thread_idx, (unsigned long long)get_sys_cnt_aicpu()); +#endif return -1; } else { SPIN_WAIT_HINT(); @@ -1754,13 +1775,16 @@ int32_t AicpuExecutor::run(Runtime* runtime) { DEV_ALWAYS("Thread %d: Calling aicpu_orchestration_entry from SO (orch_idx=%d/(0~%d))", thread_idx, orch_idx, orch_thread_num_ - 1); #if PTO2_PROFILING - DEV_ALWAYS("Thread=%d orch_start=%llu", thread_idx, (unsigned long long)get_sys_cnt_aicpu()); uint64_t orch_cycle_start = get_sys_cnt_aicpu(); + DEV_ALWAYS("Thread=%d orch_start=%llu", thread_idx, (unsigned long long)orch_cycle_start); #endif PTO2_SCOPE(rt) { orch_func_(rt, orch_args_cached_, orch_arg_count_cached_, orch_thread_num_, orch_idx); } #if 
PTO2_PROFILING uint64_t orch_cycle_end = get_sys_cnt_aicpu(); - DEV_ALWAYS("Thread %d: aicpu_orchestration_entry returned, cost %.3fus (orch_idx=%d)", + // Function-level timing: measures only orch_func_ execution time. + // This differs from `orch_stage_end`, which marks stage-level end right before + // requesting core transition (includes additional post-orchestration work). + DEV_ALWAYS("Thread %d: aicpu_orchestration_entry returned, orch_func_cost=%.3fus (orch_idx=%d)", thread_idx, cycles_to_us(orch_cycle_end - orch_cycle_start), orch_idx); #endif @@ -1911,8 +1935,9 @@ int32_t AicpuExecutor::run(Runtime* runtime) { // Compute new core assignments for all threads and initialize donated slots DEV_INFO("Thread %d: Set orchestrator_done=true, requesting core transition", thread_idx); #if PTO2_PROFILING - // Benchmark: record orchestrator end timestamp before waiting for schedulers - DEV_ALWAYS("BENCHMARK: thread=%d end=%llu", thread_idx, (unsigned long long)get_sys_cnt_aicpu()); + // Stage-level end: orchestrator phase finishes before requesting transition. 
+ DEV_ALWAYS("Thread=%d orch_stage_end=%llu", thread_idx, + (unsigned long long)get_sys_cnt_aicpu()); #endif transition_requested_.store(true, std::memory_order_release); @@ -1978,14 +2003,11 @@ int32_t AicpuExecutor::run(Runtime* runtime) { { const int32_t* shutdown_cores = core_assignments_[thread_idx]; int32_t shutdown_count = core_count_per_thread_[thread_idx]; -#if PTO2_PROFILING - // Benchmark: record scheduler end timestamp before shutdown cleanup if (shutdown_count > 0) { - DEV_ALWAYS("Thread=%d end=%llu", +#if PTO2_PROFILING + DEV_ALWAYS("Thread=%d sched_end=%llu", thread_idx, (unsigned long long)get_sys_cnt_aicpu()); - } #endif - if (shutdown_count > 0) { auto rc = shutdown_aicore(runtime, thread_idx, shutdown_cores, shutdown_count); if (rc != 0) { return rc; diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md b/src/a2a3/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md index 47a65ac7..26734b0e 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md @@ -46,38 +46,70 @@ Each sub-level macro requires `PTO2_PROFILING=1`: - Deadlock/livelock detection **What's NOT compiled:** -- All profiling counters -- All profiling logs -- Performance data collection - -**Log output:** 11 DEV_ALWAYS logs (debug/diagnostic only) +- All `CYCLE_COUNT_*` timing counters (`sched_*_cycle`, orchestrator cost counters) +- Scheduler/Orchestrator profiling summary logs guarded by `#if PTO2_PROFILING` +- Performance data collection paths (`enable_profiling` runtime flag becomes ineffective because profiling code is not compiled) + +**Log output (timing-related):** +- No `sched_start/sched_end` timestamps +- No `orch_start/orch_stage_end` timestamps +- No `Scheduler summary: total_time=...` +- No orchestration function cost log (`aicpu_orchestration_entry returned, ...us`) + +**Observed output pattern (from `device-2007835_20260319163914744.log`):** +- 
Keeps lifecycle/debug diagnostics such as: + - `Thread X: Start` + - `Thread X: Starting PTO2 dispatch` + - `Thread X: Calling aicpu_orchestration_entry from SO (...)` + - `Thread X: Starting PTO2 dispatch` (orch threads re-enter as sched after orchestration) + - `Thread X: Last thread, marking executor finished` +- Does not print Level-1 timing lines (`sched_start`, `orch_start`, `orch_func_cost`, `orch_stage_end`, `Scheduler summary`, `sched_end`) --- ### Level 1: Basic Profiling (PTO2_PROFILING=1) **What's compiled:** -- All profiling counters (cycles, task counts, loop counts) -- Basic profiling summaries -- Scheduler summary output -- Orchestration completion time +- Base timing counters for scheduler loop (`sched_complete/dispatch/idle/scan`) +- Per-thread orchestration timing (`orch_start`, `orch_func_cost`) +- Stage-level orchestration end timestamp (`orch_stage_end`, printed by last orch thread only, marks the moment all orch threads have finished and core transition is about to be requested) +- Scheduler summary output (`total_time`, `loops`, `tasks_scheduled`) **What's NOT compiled:** - Detailed phase breakdowns - TensorMap statistics -**Log output:** 13 DEV_ALWAYS logs -- 11 debug/diagnostic logs (always present) -- 2 basic profiling summaries: - - Orchestration completion time - - Total submitted tasks +**Log output:** adds timing/profiling logs on top of Level 0. 
-**Scheduler output:** +**Log output (additional lines vs Level 0, per round):** +- `Thread=%d sched_start=%llu` — each sched thread, at scheduler loop start +- `Thread=%d orch_start=%llu` — each orch thread, before `orch_func_` call +- `Thread %d: aicpu_orchestration_entry returned, orch_func_cost=%.3fus (orch_idx=%d)` — each orch thread, after `orch_func_` returns +- `PTO2 total submitted tasks = %d, already executed %d tasks` — last orch thread only +- `Thread=%d orch_stage_end=%llu` — last orch thread only, before core transition request +- `Thread %d: Scheduler summary: total_time=%.3fus, loops=%llu, tasks_scheduled=%d` — each sched thread +- `Thread=%d sched_end=%llu` — each sched thread, before `shutdown_aicore` (normal path) +- `Thread=%d sched_end(timeout)=%llu` — timeout path only + +**Example log output:** ``` -Thread X: Scheduler summary: total_time=XXXus, loops=XXX, tasks_scheduled=XXX +Thread=1 sched_start=46614688541727 +Thread=0 sched_start=46614688541742 +Thread=3 orch_start=46614688544502 +Thread=2 orch_start=46614688546216 +Thread 2: aicpu_orchestration_entry returned, orch_func_cost=59.600us (orch_idx=0) +Thread 3: aicpu_orchestration_entry returned, orch_func_cost=116.020us (orch_idx=1) +PTO2 total submitted tasks = 13, already executed 13 tasks +Thread=3 orch_stage_end=46614688551970 +Thread 0: Scheduler summary: total_time=96.040us, loops=2239, tasks_scheduled=4 +Thread=0 sched_end=46614688552812 +Thread 1: Scheduler summary: total_time=157.840us, loops=3724, tasks_scheduled=9 +Thread=1 sched_end=46614688553944 ``` -**Note:** Scheduler summary always prints when `PTO2_PROFILING=1`, regardless of `enable_profiling` flag. +**Note:** +- All logs above are controlled by compile-time macro `PTO2_PROFILING`, not by `enable_profiling`. +- `enable_profiling` only controls shared-memory data collection / swimlane export. 
--- @@ -257,10 +289,12 @@ add_definitions(-DPTO2_ORCH_PROFILING=1) ## Log Output Summary +> Counts based on `examples/a2a3/tensormap_and_ringbuffer/paged_attention` (2 sched + 2 orch threads). + | Level | Macro Settings | DEV_ALWAYS Count | Description | |-------|---------------|------------------|-------------| | 0 | `PTO2_PROFILING=0` | 11 | Debug/diagnostic only | -| 1 | `PTO2_PROFILING=1` | 13 | Basic summaries | +| 1 | `PTO2_PROFILING=1` | 21 | Basic summaries | | 2 | `+PTO2_SCHED_PROFILING=1` | 18 | Scheduler detailed | | 3 | `+PTO2_ORCH_PROFILING=1` | 30 | Orchestrator detailed | | 4 | `+PTO2_TENSORMAP_PROFILING=1` | 34 | TensorMap stats | diff --git a/tools/README.md b/tools/README.md index bdede4f5..349b72a8 100644 --- a/tools/README.md +++ b/tools/README.md @@ -275,6 +275,7 @@ flowchart TD 当前预配置的 examples: - `alternating_matmul_add` - `benchmark_bgemm` +- `paged_attention_unroll` - `batch_paged_attention` - `paged_attention` diff --git a/tools/perf_to_mermaid.py b/tools/perf_to_mermaid.py index 5f2d920b..9d7134e9 100755 --- a/tools/perf_to_mermaid.py +++ b/tools/perf_to_mermaid.py @@ -208,7 +208,7 @@ def main(): parser.add_argument('--style', choices=['detailed', 'compact'], default='detailed', help='节点信息密度:detailed(详细,包含核心和时间)或 compact(紧凑,仅函数名)') parser.add_argument('--direction', choices=['TD', 'LR'], default='TD', - help='流程图方向:TD(从上到下)或 LR(从左到右,默认)') + help='流程图方向:TD(从上到下,默认)或 LR(从左到右)') parser.add_argument('-v', '--verbose', action='store_true', help='详细输出') args = parser.parse_args()