Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,9 @@ static inline void perf_aicore_record_task(
record->start_time = start_time;
record->end_time = end_time;
record->kernel_ready_time = kernel_ready_time;
record->task_id = task_id;
record->mixed_task_id = task_id;
record->fanout_count = 0;
record->fanout_filled = 0;

perf_buf->count = idx + 1;

Expand Down
8 changes: 4 additions & 4 deletions src/a2a3/platform/include/aicpu/performance_collector_aicpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -101,12 +101,12 @@ void perf_aicpu_init_phase_profiling(Runtime* runtime, int num_sched_threads, in
* @param start_time Phase start timestamp
* @param end_time Phase end timestamp
* @param loop_iter Current loop iteration number
* @param tasks_processed Number of tasks processed in this phase
* @param tasks_processed Number of tasks processed (scheduler) or mixed_task_id raw (orchestrator)
*/
void perf_aicpu_record_phase(int thread_idx,
AicpuPhaseId phase_id,
uint64_t start_time, uint64_t end_time,
uint32_t loop_iter, uint32_t tasks_processed);
uint32_t loop_iter, uint64_t tasks_processed);

/**
* Write orchestrator cumulative summary
Expand Down Expand Up @@ -138,11 +138,11 @@ void perf_aicpu_set_orch_thread_idx(int thread_idx);
* @param start_time Phase start timestamp
* @param end_time Phase end timestamp
* @param submit_idx Task submission index (acts as loop_iter)
* @param task_id Task ID (stored in tasks_processed field for task tracking)
* @param mixed_task_id Mixed task id raw value (pto2_task_id_raw) for cross-view correlation
*/
void perf_aicpu_record_orch_phase(AicpuPhaseId phase_id,
uint64_t start_time, uint64_t end_time,
uint32_t submit_idx, uint32_t task_id);
uint32_t submit_idx, uint64_t mixed_task_id);

/**
* Write core-to-thread assignment mapping to shared memory
Expand Down
15 changes: 9 additions & 6 deletions src/a2a3/platform/include/common/perf_profiling.h
Original file line number Diff line number Diff line change
Expand Up @@ -75,15 +75,16 @@ struct PerfRecord {
uint64_t finish_time; // AICPU timestamp: when AICPU observed task completion (task_status back to 0)

// Task identification
uint32_t task_id; // Register dispatch id (per-core monotonic counter, NOT mixed_task_id).
// May collide across cores; use (ring_id, task_id, core_id) as unique key.
uint64_t mixed_task_id; // pto2_task_id_raw (ring_id<<32 | local_id) for cross-view correlation.
// Written by AICore as dispatch counter; overwritten by AICPU executor.
uint32_t func_id; // Kernel function identifier
CoreType core_type; // Core type (AIC/AIV)
uint8_t ring_id; // Ring layer (0 for single-ring / legacy)

// Dependency relationship (fanout only)
int32_t fanout[RUNTIME_MAX_FANOUT]; // Successor task ID array
int32_t fanout_count; // Number of successor tasks
uint64_t fanout[RUNTIME_MAX_FANOUT]; // Successor task mixed_task_id array
int32_t fanout_count; // Number of successor tasks
uint8_t fanout_filled; // 1: fanout has been populated by AICPU or fallback
} __attribute__((aligned(64)));

static_assert(sizeof(PerfRecord) % 64 == 0,
Expand Down Expand Up @@ -262,8 +263,10 @@ struct AicpuPhaseRecord {
uint64_t end_time; // Phase end timestamp
uint32_t loop_iter; // Loop iteration number
AicpuPhaseId phase_id; // Phase type
uint32_t tasks_processed; // Tasks processed in this phase
uint32_t padding; // Alignment padding
union {
uint64_t mixed_task_id; // Orchestrator phases: pto2_task_id_raw for cross-view correlation
uint64_t tasks_processed; // Scheduler phases: number of tasks processed in this batch
};
};

/**
Expand Down
9 changes: 4 additions & 5 deletions src/a2a3/platform/src/aicpu/performance_collector_aicpu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -389,7 +389,7 @@ static void switch_phase_buffer(int thread_idx) {
void perf_aicpu_record_phase(int thread_idx,
AicpuPhaseId phase_id,
uint64_t start_time, uint64_t end_time,
uint32_t loop_iter, uint32_t tasks_processed) {
uint32_t loop_iter, uint64_t tasks_processed) {
if (s_phase_header == nullptr) {
return;
}
Expand Down Expand Up @@ -440,8 +440,7 @@ void perf_aicpu_record_phase(int thread_idx,
record->end_time = end_time;
record->loop_iter = loop_iter;
record->phase_id = phase_id;
record->tasks_processed = tasks_processed;
record->padding = 0;
record->mixed_task_id = tasks_processed;

buf->count = idx + 1;
}
Expand Down Expand Up @@ -470,9 +469,9 @@ void perf_aicpu_set_orch_thread_idx(int thread_idx) {

void perf_aicpu_record_orch_phase(AicpuPhaseId phase_id,
uint64_t start_time, uint64_t end_time,
uint32_t submit_idx, uint32_t task_id) {
uint32_t submit_idx, uint64_t mixed_task_id) {
if (s_orch_thread_idx < 0 || s_phase_header == nullptr) return;
perf_aicpu_record_phase(s_orch_thread_idx, phase_id, start_time, end_time, submit_idx, task_id);
perf_aicpu_record_phase(s_orch_thread_idx, phase_id, start_time, end_time, submit_idx, mixed_task_id);
}

void perf_aicpu_flush_phase_buffers(int thread_idx) {
Expand Down
10 changes: 5 additions & 5 deletions src/a2a3/platform/src/host/performance_collector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -919,7 +919,7 @@ int PerformanceCollector::export_swimlane_json(const std::string& output_path) {
// Sort by mixed_task_id (pto2_task_id_raw: ring_id<<32 | local_id)
std::sort(tagged_records.begin(), tagged_records.end(),
[](const TaggedRecord& a, const TaggedRecord& b) {
return a.record->task_id < b.record->task_id;
return a.record->mixed_task_id < b.record->mixed_task_id;
});

// Step 4: Calculate base time (minimum kernel_ready_time, including phase timestamps)
Expand All @@ -930,8 +930,8 @@ int PerformanceCollector::export_swimlane_json(const std::string& output_path) {
}
if (tagged.record->dispatch_time < base_time_cycles && tagged.record->dispatch_time > 0) {
base_time_cycles = tagged.record->dispatch_time;
LOG_WARN("Timestamp violation: dispatch_time (%lu) < base_time (%lu) for task %u, using dispatch_time as new base_time",
tagged.record->dispatch_time, base_time_cycles, tagged.record->task_id);
LOG_WARN("Timestamp violation: dispatch_time (%lu) < base_time (%lu) for task %llu, using dispatch_time as new base_time",
tagged.record->dispatch_time, base_time_cycles, (unsigned long long)tagged.record->mixed_task_id);
}
}

Expand Down Expand Up @@ -987,7 +987,7 @@ int PerformanceCollector::export_swimlane_json(const std::string& output_path) {
const char* core_type_str = (record.core_type == CoreType::AIC) ? "aic" : "aiv";

outfile << " {\n";
outfile << " \"task_id\": " << record.task_id << ",\n";
outfile << " \"task_id\": " << record.mixed_task_id << ",\n";
outfile << " \"func_id\": " << record.func_id << ",\n";
outfile << " \"core_id\": " << tagged.core_id << ",\n";
outfile << " \"core_type\": \"" << core_type_str << "\",\n";
Expand Down Expand Up @@ -1113,7 +1113,7 @@ int PerformanceCollector::export_swimlane_json(const std::string& output_path) {
<< ", \"start_time_us\": " << std::fixed << std::setprecision(3) << start_us
<< ", \"end_time_us\": " << std::fixed << std::setprecision(3) << end_us
<< ", \"submit_idx\": " << pr.loop_iter
<< ", \"task_id\": " << static_cast<int32_t>(pr.tasks_processed)
<< ", \"task_id\": " << static_cast<int64_t>(pr.mixed_task_id)
<< "}";
first = false;
}
Expand Down
4 changes: 2 additions & 2 deletions src/a2a3/runtime/host_build_graph/aicpu/aicpu_executor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -632,7 +632,7 @@ int AicpuExecutor::resolve_and_dispatch(Runtime& runtime, int thread_idx, const
uint32_t count = perf_buf->count;
if (count > 0) {
PerfRecord* record = &perf_buf->records[count - 1];
if (record->task_id == static_cast<uint32_t>(completed_task_id)) {
if (record->mixed_task_id == static_cast<uint64_t>(completed_task_id)) {
record->func_id = runtime.tasks[completed_task_id].func_id;
record->core_type = h->core_type;
perf_aicpu_record_dispatch_and_finish_time(
Expand Down Expand Up @@ -769,7 +769,7 @@ int AicpuExecutor::resolve_and_dispatch(Runtime& runtime, int thread_idx, const
uint32_t count = perf_buf->count;
if (count > 0) {
PerfRecord* record = &perf_buf->records[count - 1];
if (record->task_id == static_cast<uint32_t>(completed_task_id)) {
if (record->mixed_task_id == static_cast<uint64_t>(completed_task_id)) {
record->func_id = runtime.tasks[completed_task_id].func_id;
record->core_type = h->core_type;
perf_aicpu_record_dispatch_and_finish_time(
Expand Down
10 changes: 6 additions & 4 deletions src/a2a3/runtime/host_build_graph/runtime/runtime.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -228,18 +228,20 @@ void Runtime::complete_perf_records(PerfBuffer* perf_buf) {

for (uint32_t i = 0; i < count; i++) {
PerfRecord* record = &perf_buf->records[i];
uint32_t task_id = record->task_id;
// In host_build_graph, AICore writes a plain uint32_t dispatch counter into
// mixed_task_id (upper 32 bits are always 0), so truncating to uint32_t is safe.
uint32_t task_id = static_cast<uint32_t>(record->mixed_task_id);

// Query Task by task_id (O(1) array indexing)
Task* task = get_task(task_id);
record->fanout_count = 0;
if (task != nullptr) {
record->fanout_count = task->fanout_count;

for (int32_t j = 0; j < task->fanout_count; j++) {
record->fanout[j] = task->fanout[j];
record->fanout[j] = static_cast<uint64_t>(task->fanout[j]);
}
} else {
record->fanout_count = 0;
}
record->fanout_filled = 1;
}
}
16 changes: 10 additions & 6 deletions src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -367,9 +367,11 @@ struct AicpuExecutor {
uint32_t count = perf_buf->count;
if (count > 0) {
PerfRecord* record = &perf_buf->records[count - 1];
if (record->task_id == static_cast<uint32_t>(expected_reg_task_id)) {
if (record->mixed_task_id == static_cast<uint64_t>(expected_reg_task_id)) {
// Fill metadata that AICore doesn't know
int32_t perf_slot_idx = static_cast<int32_t>(executing_subslot_by_core_[core_id]);
// Overwrite with full mixed_task_id for cross-view correlation.
record->mixed_task_id = pto2_task_id_raw(slot_state.task->mixed_task_id);
record->func_id = slot_state.task->kernel_id[perf_slot_idx];
record->core_type = CT;
perf_aicpu_record_dispatch_and_finish_time(
Expand All @@ -384,10 +386,11 @@ struct AicpuExecutor {
record->fanout_count = 0;
PTO2DepListEntry* cur = slot_state.fanout_head;
while (cur != nullptr && record->fanout_count < RUNTIME_MAX_FANOUT) {
record->fanout[record->fanout_count++] = static_cast<int32_t>(
pto2_task_id_local(cur->slot_state->task->mixed_task_id));
record->fanout[record->fanout_count++] =
pto2_task_id_raw(cur->slot_state->task->mixed_task_id);
cur = cur->next;
}
record->fanout_filled = 1;
}
}
#if PTO2_SCHED_PROFILING
Expand Down Expand Up @@ -1716,9 +1719,10 @@ int32_t AicpuExecutor::run(Runtime* runtime) {
}
#endif

// With multi-ring, slot_states are per-ring inside the scheduler.
// Fanout fill-in in complete_perf_records is disabled (slot_states_ptr = nullptr).
runtime->set_pto2_slot_states_ptr(nullptr);
// Register per-ring slot states for complete_perf_records fallback.
for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
runtime->set_pto2_ring_slot_states_ptr(r, rt->scheduler.ring_sched_states[r].slot_states);
}

// Store shared state for other orchestrator threads
orch_func_ = orch_func;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ __attribute__((weak, visibility("hidden"))) uint64_t get_sys_cnt_aicpu() { retur
// The strong symbol from the AICPU build wins when profiling is available.
// Also hidden to prevent HOST .so from polluting the global symbol table.
__attribute__((weak, visibility("hidden"))) void perf_aicpu_record_orch_phase(
AicpuPhaseId, uint64_t, uint64_t, uint32_t, uint32_t) {}
AicpuPhaseId, uint64_t, uint64_t, uint32_t, uint64_t) {}
// Accumulated cycles per sub-step (only needed for ORCH_PROFILING export)
static uint64_t g_orch_sync_cycle = 0; // tensormap sync
static uint64_t g_orch_alloc_cycle = 0; // task ring alloc
Expand Down Expand Up @@ -78,7 +78,7 @@ uint64_t g_orch_scope_end_atomic_count = 0;
#include "aicpu/performance_collector_aicpu.h"
// Weak no-op fallbacks: keep this TU linking in builds where the AICPU
// profiling collector is absent; the strong symbols from the AICPU build
// override these at link time when profiling is available.
// NOTE(review): the next two parameter-list lines are the old/new pair of
// one diff line (last parameter widened uint32_t -> uint64_t to carry the
// full mixed_task_id raw value); only the uint64_t form should survive.
__attribute__((weak, visibility("hidden"))) uint64_t get_sys_cnt_aicpu() { return 0; }
__attribute__((weak, visibility("hidden"))) void perf_aicpu_record_orch_phase(
AicpuPhaseId, uint64_t, uint64_t, uint32_t, uint32_t) {}
AicpuPhaseId, uint64_t, uint64_t, uint32_t, uint64_t) {}
// submit_idx needed for swimlane task_id tagging (no cycle accumulation at this level)
static uint32_t g_orch_submit_idx = 0;
#define CYCLE_COUNT_START() \
Expand Down Expand Up @@ -387,7 +387,7 @@ void pto2_submit_mixed_task(
PTO2TaskSlotState* fanin_states[PTO2_MAX_INPUTS];
int32_t fanin_count = 0;

CYCLE_COUNT_LAP_RECORD(g_orch_alloc_cycle, AicpuPhaseId::ORCH_ALLOC, local_id);
CYCLE_COUNT_LAP_RECORD(g_orch_alloc_cycle, AicpuPhaseId::ORCH_ALLOC, static_cast<uint64_t>(mixed_task_id));

// === STEP 2: Calculate output size + heap alloc (read from params only, no GM access) ===
int32_t total_output_size = 0;
Expand All @@ -405,7 +405,7 @@ void pto2_submit_mixed_task(
if (!local_packed_base) { orch->fatal = true; return; }
local_packed_end = (char*)local_packed_base + total_output_size;
}
CYCLE_COUNT_LAP_RECORD(g_orch_heap_cycle, AicpuPhaseId::ORCH_HEAP, local_id);
CYCLE_COUNT_LAP_RECORD(g_orch_heap_cycle, AicpuPhaseId::ORCH_HEAP, static_cast<uint64_t>(mixed_task_id));
#if PTO2_ORCH_PROFILING
if (total_output_size > 0) {
g_orch_heap_atomic_count += 1; // heap_top.store in pto2_alloc_packed_buffer
Expand Down Expand Up @@ -484,7 +484,7 @@ void pto2_submit_mixed_task(
}
}

CYCLE_COUNT_LAP_RECORD(g_orch_lookup_cycle, AicpuPhaseId::ORCH_LOOKUP, local_id);
CYCLE_COUNT_LAP_RECORD(g_orch_lookup_cycle, AicpuPhaseId::ORCH_LOOKUP, static_cast<uint64_t>(mixed_task_id));

// === STEP 5: Register outputs/inouts in TensorMap (must be separate from lookup) ===
for (int i = 0; i < params.tensor_count; i++) {
Expand All @@ -496,7 +496,7 @@ void pto2_submit_mixed_task(
}
}

CYCLE_COUNT_LAP_RECORD(g_orch_insert_cycle, AicpuPhaseId::ORCH_INSERT, local_id);
CYCLE_COUNT_LAP_RECORD(g_orch_insert_cycle, AicpuPhaseId::ORCH_INSERT, static_cast<uint64_t>(mixed_task_id));

// === STEP 6: Batch-write to GM (single cache line burst) ===
// Deferred from allocation phase to avoid scattered GM writes that get
Expand All @@ -521,7 +521,7 @@ void pto2_submit_mixed_task(

payload->init(params);

CYCLE_COUNT_LAP_RECORD(g_orch_params_cycle, AicpuPhaseId::ORCH_PARAMS, local_id);
CYCLE_COUNT_LAP_RECORD(g_orch_params_cycle, AicpuPhaseId::ORCH_PARAMS, static_cast<uint64_t>(mixed_task_id));
#if PTO2_ORCH_PROFILING
g_orch_params_atomic_count += 2; // fanout_lock.store + fanout_count.store
#endif
Expand Down Expand Up @@ -586,7 +586,7 @@ void pto2_submit_mixed_task(
#endif
}

CYCLE_COUNT_LAP_RECORD(g_orch_fanin_cycle, AicpuPhaseId::ORCH_FANIN, local_id);
CYCLE_COUNT_LAP_RECORD(g_orch_fanin_cycle, AicpuPhaseId::ORCH_FANIN, static_cast<uint64_t>(mixed_task_id));

#if PTO2_PROFILING
orch->tasks_submitted++;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,7 @@ struct PTO2TaskId {

constexpr uint8_t ring() const { return static_cast<uint8_t>(raw >> 32); }
constexpr uint32_t local() const { return static_cast<uint32_t>(raw & 0xFFFFFFFFu); }
constexpr explicit operator uint64_t() const { return raw; }

constexpr bool operator==(const PTO2TaskId& other) const { return raw == other.raw; }
constexpr bool operator!=(const PTO2TaskId& other) const { return raw != other.raw; }
Expand Down
Loading
Loading