From ca4151718c707ff5a05c805588156d11310ba8b9 Mon Sep 17 00:00:00 2001 From: zhusy54 Date: Wed, 18 Mar 2026 18:33:17 +0800 Subject: [PATCH 1/2] perf(rt2): precompute dispatch desc and use flat byte offset register encoding Replace structured (slot_in_ring, ring_id, slot_idx) register encoding with a flat byte offset scheme. The new layout uses 28 bits for offset_field, enabling ~2GB of addressable dispatch descriptors and eliminating independent bit-width constraints on ring depth and task window size. Add overflow safety check in dispatch_subtask_to_core that triggers emergency_shutdown with PTO2_ERROR_ENCODING_OVERFLOW on sentinel collision. --- .../aicore/aicore_executor.cpp | 85 ++++++--- .../aicpu/aicpu_executor.cpp | 172 ++++++++++-------- .../tensormap_and_ringbuffer/build_config.py | 2 +- .../runtime/pto2_dispatch_payload.h | 124 +++++++++++-- .../runtime/pto_orchestrator.cpp | 10 +- .../runtime/pto_orchestrator.h | 4 + .../runtime/pto_runtime2_types.h | 57 ++++-- .../runtime/pto_shared_memory.h | 5 + .../runtime/runtime.h | 10 +- 9 files changed, 339 insertions(+), 130 deletions(-) diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/aicore/aicore_executor.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/aicore/aicore_executor.cpp index 860c91c2..12c2e706 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/aicore/aicore_executor.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/aicore/aicore_executor.cpp @@ -14,21 +14,28 @@ typedef void (*UnifiedKernelFunc)(__gm__ int64_t*); /** - * Execute task from PTO2DispatchPayload. + * Execute task from PTO2DispatchDesc. * - * Reads function_bin_addr and args from the dispatch payload. + * Reads function_bin_addrs[slot_idx] and args from the dispatch descriptor. + * The descriptor is pre-built by the Orchestrator at submit time, so this + * function performs no address computation—just a function pointer call. 
* - * @param payload Pointer to PTO2DispatchPayload in global memory + * @param desc Pointer to PTO2DispatchDesc in global memory + * @param slot_idx Subtask slot index (0=AIC, 1=AIV0, 2=AIV1) */ __aicore__ __attribute__((always_inline)) static void execute_task( - __gm__ PTO2DispatchPayload* payload + __gm__ PTO2DispatchDesc* desc, uint32_t slot_idx ) { - if (payload == nullptr || payload->function_bin_addr == 0) { + if (desc == nullptr) { + return; + } + uint64_t func_addr = desc->function_bin_addrs[slot_idx]; + if (func_addr == 0) { return; } - UnifiedKernelFunc kernel = (UnifiedKernelFunc)payload->function_bin_addr; - kernel(reinterpret_cast<__gm__ int64_t*>(payload->args)); + UnifiedKernelFunc kernel = (UnifiedKernelFunc)func_addr; + kernel(reinterpret_cast<__gm__ int64_t*>(desc->args)); FULL_MEMORY_BARRIER(); } @@ -38,10 +45,15 @@ __aicore__ __attribute__((always_inline)) static void execute_task( * Implements the AICPU-AICore register-based dispatch protocol: * 1. Wait for AICPU ready signal via handshake buffer * 2. Report physical core ID and core type, signal AICore ready - * 3. Poll DATA_MAIN_BASE register for task dispatch until exit signal + * 3. Read PTO2DispatchInitInfo from hank->task (one-shot, wait for non-zero) + * 4. Poll DATA_MAIN_BASE register for task dispatch until exit signal * - * Task dispatch reads PTO2DispatchPayload address from Handshake.task. - * Task ID is derived from the register value (task_id + 1 encoding). 
+ * Register encoding (set by AICPU scheduler) — see pto2_dispatch_payload.h: + * bit [30] = toggle bit (alternates per core, ignored during decode) + * bits [29:2] = offset_field = (desc_byte_offset >> 3) + 1 (28 bits, 0 = idle) + * bits [1:0] = slot_idx (2 bits: 0=AIC, 1=AIV0, 2=AIV1) + * + * Dispatch desc address = dispatch_base + decoded byte offset * * @param runtime Pointer to Runtime in global memory * @param block_idx Block index (core ID) @@ -72,15 +84,32 @@ __aicore__ __attribute__((weak)) void aicore_execute(__gm__ Runtime* runtime, in dcci(my_hank, SINGLE_CACHE_LINE, CACHELINE_OUT); - // Cache payload address (set once by AICPU during initialization, never changes) - __gm__ PTO2DispatchPayload* payload = - reinterpret_cast<__gm__ PTO2DispatchPayload*>(my_hank->task); + // Phase 3.5: Cache dispatch init info from AICPU. + // + // Why this wait is necessary: + // hank->task is set to 0 during handshake (Phase 1) because + // PTO2DispatchInitInfo requires shared memory addresses that don't + // exist yet — PTO2Runtime is created AFTER handshake completes. + // AICPU writes &init_info to hank->task once PTO2Runtime is ready. + // dcci is needed because hank->task lives in GM; without cache + // invalidation, AICore would keep reading a stale cached zero. + while (my_hank->task == 0) { + dcci(my_hank, SINGLE_CACHE_LINE); + } + __gm__ PTO2DispatchInitInfo* init_info = + reinterpret_cast<__gm__ PTO2DispatchInitInfo*>(my_hank->task); + // init_info points to a separate GM object — invalidate its cache line + // so we read the values AICPU wrote, not stale data. 
+ dcci(init_info, SINGLE_CACHE_LINE); + + uint64_t dispatch_base = init_info->dispatch_base; + my_hank->task = 0; // Clear after reading (no longer needed) + dcci(my_hank, SINGLE_CACHE_LINE, CACHELINE_OUT); bool profiling_enabled = runtime->enable_profiling; uint64_t kernel_ready_time = get_sys_cnt_aicore(); // Phase 4: Main execution loop - poll register for tasks until exit signal - // Register encoding: AICPU_IDLE_TASK_ID=idle, task_id=task, AICORE_EXIT_SIGNAL=exit uint32_t reg_val = AICPU_IDLE_TASK_ID; uint32_t last_reg_val = AICPU_IDLE_TASK_ID; @@ -92,37 +121,45 @@ __aicore__ __attribute__((weak)) void aicore_execute(__gm__ Runtime* runtime, in break; } - // Execute task if new (reg_val encoding: AICPU_IDLE_TASK_ID=idle, task_id=task) - if (reg_val == AICPU_IDLE_TASK_ID || reg_val == last_reg_val) { + // Skip idle (0 or AICPU_IDLE_TASK_ID) or duplicate dispatch + if (reg_val == 0 || reg_val == AICPU_IDLE_TASK_ID || reg_val == last_reg_val) { SPIN_WAIT_HINT(); continue; } { - uint32_t task_id = reg_val; // Decode: register holds task_id directly + // Decode register value using named constants from pto2_dispatch_payload.h. + // Inline decode instead of calling pto2_reg_decode_*() because ccec does not + // allow [aicore] code to call [host]-annotated functions. 
+ uint32_t offset_field = (reg_val >> PTO2_REG_OFFSET_SHIFT) & PTO2_REG_OFFSET_MASK; + uint64_t desc_byte_offset = static_cast(offset_field - 1) << PTO2_REG_ALIGN_SHIFT; + uint32_t slot_idx = reg_val & PTO2_REG_SLOTIDX_MASK; + + // Compute dispatch descriptor address from cached base + decoded offset + __gm__ PTO2DispatchDesc* desc = reinterpret_cast<__gm__ PTO2DispatchDesc*>( + dispatch_base + desc_byte_offset); - // Invalidate payload buffer (AICPU updates its content each dispatch) - dcci(payload, ENTIRE_DATA_CACHE); + // Invalidate data cache to ensure fresh read of dispatch descriptor + dcci(desc, ENTIRE_DATA_CACHE); - write_reg(RegId::COND, MAKE_ACK_VALUE(task_id)); + write_reg(RegId::COND, MAKE_ACK_VALUE(reg_val)); // Performance profiling: record start time uint64_t start_time = get_sys_cnt_aicore(); // Execute the task - execute_task(payload); + execute_task(desc, slot_idx); // Performance profiling: record task execution - // (func_id and core_type are filled by AICPU at completion time) if (profiling_enabled) { uint64_t end_time = get_sys_cnt_aicore(); __gm__ PerfBuffer* perf_buf = (__gm__ PerfBuffer*)my_hank->perf_records_addr; - perf_aicore_record_task(perf_buf, task_id, + perf_aicore_record_task(perf_buf, reg_val, start_time, end_time, kernel_ready_time); } last_reg_val = reg_val; - write_reg(RegId::COND, MAKE_FIN_VALUE(task_id)); + write_reg(RegId::COND, MAKE_FIN_VALUE(reg_val)); } } diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp index ad2b098d..302e7787 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp @@ -1,5 +1,6 @@ #include #include +#include #include #include #include @@ -69,9 +70,6 @@ constexpr int32_t PROGRESS_LOG_INTERVAL = 250; // log every N completions a static PTO2Runtime *rt{nullptr}; -// Per-core dispatch payload storage (one per 
physical core) -static PTO2DispatchPayload s_pto2_payload_per_core[RUNTIME_MAX_WORKER]; - // Core information for discovery (with register address for fast dispatch) struct CoreInfo { int32_t worker_id; // Index in runtime.workers[] @@ -175,12 +173,6 @@ struct AicpuExecutor { uint64_t core_id_to_reg_addr_[MAX_CORES_PER_THREAD]; // Per-core monotonic dispatch counter for register protocol uniqueness. - // Multi-ring task_ids can collide in the lower 32 bits (e.g., ring 0 local 0 - // and ring 1 local 0 both truncate to 0), breaking the AICore's last_reg_val - // duplicate detection and causing false-positive COND completion. A per-core - // counter guarantees each dispatch writes a unique DATA_MAIN_BASE value. - uint32_t dispatch_seq_by_core_[RUNTIME_MAX_WORKER]{}; - // Per-core subtask slot tracking (which PTO2SubtaskSlot is running on each core) PTO2SubtaskSlot executing_subslot_by_core_[RUNTIME_MAX_WORKER]{}; @@ -190,10 +182,17 @@ struct AicpuExecutor { // Platform register base address array (set via get_platform_regs()) uint64_t regs_{0}; - // Track executing register task_id per core (AICPU_TASK_INVALID = idle). - // NOTE: this is NOT the mixed_task_id; it is the per-core dispatch id used by the - // register protocol (derived from dispatch_seq_by_core_ and masked by TASK_ID_MASK). + // Track executing register value per core (0 = idle). + // The register value encodes a flat byte offset + slot_idx + toggle bit. int32_t executing_reg_task_ids_[MAX_AICPU_THREADS][MAX_CORES_PER_THREAD]; + // Per-core toggle bit to guarantee consecutive dispatches differ in reg_val. + // Without this, ring buffer wrap-around could produce identical encodings. + uint32_t dispatch_toggle_by_core_[RUNTIME_MAX_WORKER]{}; + // Dispatch base address: address of ring 0's first PTO2DispatchDesc. + // Used to compute byte offsets for register encoding. + uint64_t dispatch_base_{0}; + // Shared memory header for error reporting during dispatch. 
+ PTO2SharedMemoryHeader* sm_header_{nullptr}; CoreStateTracker trackers_[MAX_AICPU_THREADS]; // ===== Task queue state (managed by scheduler ready queues) ===== @@ -247,24 +246,6 @@ struct AicpuExecutor { void diagnose_stuck_state( Runtime* runtime, int32_t thread_idx, const int32_t* cur_thread_cores, int32_t core_num, Handshake* hank); - // Build slim PTO2DispatchPayload: only function_bin_addr + args. - // Metadata (mixed_task_id, subslot, kernel_id, core_type) stays in TaskDescriptor. - // Dispatch order: tensor args first, then scalar args. - void build_pto2_payload(PTO2DispatchPayload& out, - int32_t kernel_id, - PTO2TaskPayload& task_pl) { - out.function_bin_addr = get_function_bin_addr(kernel_id); - int32_t n = 0; - for (int32_t i = 0; i < task_pl.tensor_count; i++) { - task_pl.tensors[i].update_start_offset(); - out.args[n++] = reinterpret_cast(&task_pl.tensors[i]); - } - for (int32_t i = 0; i < task_pl.scalar_count; i++) { - out.args[n++] = task_pl.scalars[i]; - } - } - - // Template methods for Phase 1 and Phase 2 template void check_running_cores_for_completion(int32_t thread_idx, CoreTypeTracker& ct, @@ -492,7 +473,11 @@ struct AicpuExecutor { return slot_state; } - void dispatch_subtask_to_core( + /** + * Dispatch a subtask to an AICore and encode the register value. + * Returns true on success, false on fatal encoding overflow (emergency shutdown initiated). 
+ */ + bool dispatch_subtask_to_core( Runtime* runtime, CoreStateTracker& tracker, int32_t* executing_reg_task_ids, int32_t core_id, CoreType core_type, PTO2TaskSlotState& slot_state, PTO2SubtaskSlot subslot @@ -500,10 +485,30 @@ struct AicpuExecutor { , bool profiling_enabled, int32_t thread_idx #endif ) { - PTO2DispatchPayload& payload = s_pto2_payload_per_core[core_id]; - PTO2TaskDescriptor& task = *slot_state.task; - int32_t slot_idx = static_cast(subslot); - build_pto2_payload(payload, task.kernel_id[slot_idx], *slot_state.payload); + // Compute flat byte offset of dispatch descriptor from global base. + uint64_t desc_addr = reinterpret_cast(&slot_state.payload->dispatch); + uint64_t desc_byte_offset = desc_addr - dispatch_base_; + uint32_t offset_field = static_cast((desc_byte_offset >> PTO2_REG_ALIGN_SHIFT) + 1); + + // Overflow check: offset_field must not exceed sentinel-safe upper bound. + // If it does, the encoded register value would collide with AICORE_EXIT_SIGNAL + // or AICORE_IDLE_TASK_ID, causing AICore to misinterpret the dispatch. + if (offset_field > PTO2_REG_MAX_OFFSET_FIELD) { + DEV_ERROR("PTO2 REG ENCODING OVERFLOW: offset_field=0x%x exceeds max 0x%x. " + "desc_byte_offset=0x%llx ring_id=%u slot_in_ring=%u core=%d subslot=%u. 
" + "Reduce task_window_size or shared memory size.", + offset_field, PTO2_REG_MAX_OFFSET_FIELD, + (unsigned long long)desc_byte_offset, + (unsigned)slot_state.ring_id, (unsigned)slot_state.slot_in_ring, + core_id, (unsigned)static_cast(subslot)); + if (sm_header_) { + sm_header_->sched_error_code.store(PTO2_ERROR_ENCODING_OVERFLOW, std::memory_order_release); + } + emergency_shutdown(runtime); + completed_.store(true, std::memory_order_release); + return false; + } + executing_subslot_by_core_[core_id] = subslot; executing_slot_state_by_core_[core_id] = &slot_state; #if PTO2_PROFILING @@ -516,28 +521,22 @@ struct AicpuExecutor { core_dispatch_counts_[core_id]++; } #endif - // Per-core monotonic counter for register protocol uniqueness. - // mixed_task_id encodes (ring_id << 32 | local_id); truncation to - // uint32 loses ring_id, so tasks from different rings with the same - // local_id would write identical DATA_MAIN_BASE values. The AICore - // uses last_reg_val to detect new dispatches and would skip the - // duplicate, while the stale COND register from the previous task - // (same local_id) would cause a false-positive completion. - dispatch_seq_by_core_[core_id]++; - uint32_t reg_task_id = dispatch_seq_by_core_[core_id] & TASK_ID_MASK; - // Skip reserved sentinel values - while (reg_task_id == AICORE_IDLE_TASK_ID || - (reg_task_id + 1) == AICORE_EXIT_SIGNAL) { - dispatch_seq_by_core_[core_id]++; - reg_task_id = dispatch_seq_by_core_[core_id] & TASK_ID_MASK; - } - write_reg(core_id_to_reg_addr_[core_id], RegId::DATA_MAIN_BASE, static_cast(reg_task_id)); + // Encode flat byte offset + slot_idx into register value. + // Toggle bit ensures consecutive dispatches to the same core always differ, + // preventing the AICore last_reg_val duplicate check from skipping a dispatch + // when the ring buffer wraps and the same slot is reused for the same core. 
+ uint32_t slot_idx = static_cast(subslot); + dispatch_toggle_by_core_[core_id] ^= (1u << PTO2_REG_TOGGLE_BIT); + uint32_t reg_val = pto2_reg_encode( + desc_byte_offset, slot_idx, dispatch_toggle_by_core_[core_id]); + write_reg(core_id_to_reg_addr_[core_id], RegId::DATA_MAIN_BASE, static_cast(reg_val)); + executing_reg_task_ids[core_id] = static_cast(reg_val); CoreTypeTracker& ct = tracker.by_type[static_cast(core_type)]; int32_t idle_idx = ct.find_idle_index(core_id); ct.move_idle_to_running(idle_idx); tracker.core_idle[core_id] = false; - executing_reg_task_ids[core_id] = reg_task_id; + return true; } }; @@ -564,10 +563,10 @@ int32_t AicpuExecutor::handshake_all_cores(Runtime* runtime) { DEV_INFO("Handshaking with %d cores", cores_total_num_); - // Step 1: Write per-core payload addresses and send handshake signal - // task must be written BEFORE aicpu_ready so AICore sees it after waking up + // Step 1: Send handshake signal (task=0 initially; dispatch init info is set later + // after PTO2Runtime is created, see pto2_runtime_create_from_sm path) for (int32_t i = 0; i < cores_total_num_; i++) { - all_handshakes[i].task = reinterpret_cast(&s_pto2_payload_per_core[i]); + all_handshakes[i].task = 0; all_handshakes[i].aicpu_ready = 1; } @@ -871,10 +870,11 @@ int32_t AicpuExecutor::init(Runtime* runtime) { } // Clear per-core dispatch payloads and subslot tracking - memset(s_pto2_payload_per_core, 0, sizeof(s_pto2_payload_per_core)); - memset(dispatch_seq_by_core_, 0, sizeof(dispatch_seq_by_core_)); memset(executing_subslot_by_core_, 0, sizeof(executing_subslot_by_core_)); memset(executing_slot_state_by_core_, 0, sizeof(executing_slot_state_by_core_)); + memset(dispatch_toggle_by_core_, 0, sizeof(dispatch_toggle_by_core_)); + dispatch_base_ = 0; + sm_header_ = nullptr; DEV_INFO("Init: PTO2 mode, task count from shared memory"); @@ -1154,29 +1154,29 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime* runtime, int32_t threa ResourceCount rc = 
shape_resource_count(shape); if (rc.aic) { - dispatch_subtask_to_core(runtime, tracker, executing_reg_task_ids, + if (!dispatch_subtask_to_core(runtime, tracker, executing_reg_task_ids, c.aic_core_id, CoreType::AIC, *slot_state, PTO2SubtaskSlot::AIC #if PTO2_PROFILING , profiling_enabled, thread_idx #endif - ); + )) break; } if (rc.aiv >= 1) { int32_t aiv0 = tracker.core_idle[c.aiv_core_ids[0]] ? c.aiv_core_ids[0] : c.aiv_core_ids[1]; - dispatch_subtask_to_core(runtime, tracker, executing_reg_task_ids, + if (!dispatch_subtask_to_core(runtime, tracker, executing_reg_task_ids, aiv0, CoreType::AIV, *slot_state, PTO2SubtaskSlot::AIV0 #if PTO2_PROFILING , profiling_enabled, thread_idx #endif - ); + )) break; } if (rc.aiv >= 2) { - dispatch_subtask_to_core(runtime, tracker, executing_reg_task_ids, + if (!dispatch_subtask_to_core(runtime, tracker, executing_reg_task_ids, c.aiv_core_ids[1], CoreType::AIV, *slot_state, PTO2SubtaskSlot::AIV1 #if PTO2_PROFILING , profiling_enabled, thread_idx #endif - ); + )) break; } #if PTO2_PROFILING phase_dispatch_count++; @@ -1236,30 +1236,30 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime* runtime, int32_t threa ResourceCount rc = shape_resource_count(shape); if (rc.aic) { - dispatch_subtask_to_core(runtime, tracker, executing_reg_task_ids, + if (!dispatch_subtask_to_core(runtime, tracker, executing_reg_task_ids, c.aic_core_id, CoreType::AIC, *slot_state, PTO2SubtaskSlot::AIC #if PTO2_PROFILING , profiling_enabled, thread_idx #endif - ); + )) break; } if (rc.aiv >= 1) { int32_t aiv_id = tracker.core_idle[c.aiv_core_ids[0]] ? 
c.aiv_core_ids[0] : c.aiv_core_ids[1]; - dispatch_subtask_to_core(runtime, tracker, executing_reg_task_ids, + if (!dispatch_subtask_to_core(runtime, tracker, executing_reg_task_ids, aiv_id, CoreType::AIV, *slot_state, PTO2SubtaskSlot::AIV0 #if PTO2_PROFILING , profiling_enabled, thread_idx #endif - ); + )) break; } if (rc.aiv >= 2) { - dispatch_subtask_to_core(runtime, tracker, executing_reg_task_ids, + if (!dispatch_subtask_to_core(runtime, tracker, executing_reg_task_ids, c.aiv_core_ids[1], CoreType::AIV, *slot_state, PTO2SubtaskSlot::AIV1 #if PTO2_PROFILING , profiling_enabled, thread_idx #endif - ); + )) break; } made_progress = true; #if PTO2_SCHED_PROFILING @@ -1716,6 +1716,33 @@ int32_t AicpuExecutor::run(Runtime* runtime) { } #endif + // Pass kernel address mapping to orchestrators for dispatch descriptor construction + for (int i = 0; i < orch_thread_num_; i++) { + rt->orchestrators[i].func_id_to_addr = func_id_to_addr_; + rt->orchestrators[i].func_id_count = RUNTIME_MAX_FUNC_ID; + } + + // Build dispatch init info in shared memory (GM) for AICore. + // IMPORTANT: dispatch_init_info MUST live in GM because AICore cannot + // access AICPU-local memory on hardware. The shared memory header is + // allocated in GM, so sm_handle->header->dispatch_init_info is accessible. + // AICore caches dispatch_base at startup, then computes dispatch desc + // address as: dispatch_base + decoded_byte_offset + PTO2DispatchInitInfo& init_info = sm_handle->header->dispatch_init_info; + dispatch_base_ = reinterpret_cast(sm_handle->task_payloads[0]) + + offsetof(PTO2TaskPayload, dispatch); + init_info.dispatch_base = dispatch_base_; + sm_header_ = sm_handle->header; + + // Publish dispatch init info to all AICore handshakes. + // AICore waits for hank->task != 0 before reading. 
+ { + Handshake* hank = static_cast(runtime->workers); + for (int32_t i = 0; i < cores_total_num_; i++) { + hank[i].task = reinterpret_cast(&init_info); + } + } + // With multi-ring, slot_states are per-ring inside the scheduler. // Fanout fill-in in complete_perf_records is disabled (slot_states_ptr = nullptr). runtime->set_pto2_slot_states_ptr(nullptr); @@ -2022,11 +2049,12 @@ void AicpuExecutor::deinit(Runtime* runtime) { core_dispatch_counts_[i] = 0; } - // Clear per-core dispatch payloads and subslot tracking - memset(s_pto2_payload_per_core, 0, sizeof(s_pto2_payload_per_core)); - memset(dispatch_seq_by_core_, 0, sizeof(dispatch_seq_by_core_)); + // Clear per-core subslot tracking memset(executing_subslot_by_core_, 0, sizeof(executing_subslot_by_core_)); memset(executing_slot_state_by_core_, 0, sizeof(executing_slot_state_by_core_)); + memset(dispatch_toggle_by_core_, 0, sizeof(dispatch_toggle_by_core_)); + dispatch_base_ = 0; + sm_header_ = nullptr; completed_tasks_.store(0, std::memory_order_release); total_tasks_ = 0; diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/build_config.py b/src/a2a3/runtime/tensormap_and_ringbuffer/build_config.py index 85a841ca..d5321cb3 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/build_config.py +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/build_config.py @@ -4,7 +4,7 @@ # This is a device-orchestration runtime where: # - AICPU thread 3 runs the orchestrator (builds task graph on device) # - AICPU threads 0/1/2 run schedulers (dispatch tasks to AICore) -# - AICore executes tasks via PTO2DispatchPayload +# - AICore executes tasks via PTO2DispatchDesc (precomputed by Orchestrator) # # The "orchestration" directory contains source files compiled into both # runtime targets AND the orchestration .so (e.g., tensor methods needed diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto2_dispatch_payload.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto2_dispatch_payload.h index 94f2da37..d8cd9f94 
100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto2_dispatch_payload.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto2_dispatch_payload.h @@ -1,13 +1,17 @@ /** * @file pto2_dispatch_payload.h - * @brief Minimal dispatch payload for AICore kernel execution + * @brief Dispatch descriptor for AICore kernel execution * - * Shared between AICPU (builds in-place) and AICore (reads to run kernel). - * Handshake.task points to PTO2DispatchPayload embedded in PTO2TaskPayload. + * PTO2DispatchDesc is embedded in PTO2TaskPayload and built by the Orchestrator + * at submit time. It contains per-slot function addresses and a unified args[] + * array (tensor pointers + scalar values). The Scheduler encodes a flat byte + * offset (from a single dispatch base address) into the DATA_MAIN_BASE register; + * AICore decodes the offset and adds it to a cached base to get the descriptor + * address, achieving zero GM access on the scheduler hot path. * - * Only contains fields AICore needs to execute: function address + arguments. - * Metadata (task_id, kernel_id, core_type) lives in PTO2TaskDescriptor and - * is accessed by AICPU when needed (profiling, diagnostics). + * PTO2DispatchInitInfo is a one-shot initialization struct passed to AICore + * via Handshake.task during startup. It provides the single dispatch base + * address needed for address computation. 
*/ #ifndef RT2_PTO2_DISPATCH_PAYLOAD_H_ @@ -15,19 +19,115 @@ #include +#include "pto_submit_types.h" + /** Max arguments per task; must match RUNTIME_MAX_ARGS and PTO2_MAX_OUTPUTS */ #ifndef PTO2_DISPATCH_MAX_ARGS #define PTO2_DISPATCH_MAX_ARGS 128 #endif +// ============================================================================= +// Register Encoding Constants (Flat Byte Offset Scheme) +// ============================================================================= + +/** + * DATA_MAIN_BASE register bit layout (AICPU → AICore): + * + * [31] unused (COND register uses bit 31 for ACK/FIN state) + * [30] toggle bit (alternates per core to guarantee uniqueness) + * [29:2] offset_field = (desc_byte_offset >> 3) + 1 (28 bits) + * +1 reserves 0 for idle; >>3 because PTO2DispatchDesc is 8-byte aligned + * [1:0] slot_idx (2 bits: 0=AIC, 1=AIV0, 2=AIV1) + * + * Max safe offset_field = 0x0FFFFFFB (avoid sentinel collision when toggle=1). + * Max encodable byte offset ≈ 2GB — well beyond any practical shared memory size. + * + * Sentinel values (must never be produced by encoding): + * AICORE_EXIT_SIGNAL = 0x7FFFFFF0 + * AICPU_IDLE_TASK_ID = 0x7FFFFFFD + * AICORE_IDLE_TASK_ID = 0x7FFFFFFF + */ +constexpr uint32_t PTO2_REG_SLOTIDX_MASK = 0x3; +constexpr uint32_t PTO2_REG_OFFSET_SHIFT = 2; +constexpr uint32_t PTO2_REG_OFFSET_BITS = 28; +constexpr uint32_t PTO2_REG_OFFSET_MASK = (1u << PTO2_REG_OFFSET_BITS) - 1; // 0x0FFFFFFF +constexpr uint32_t PTO2_REG_TOGGLE_BIT = 30; +constexpr uint32_t PTO2_REG_ALIGN_SHIFT = 3; // PTO2DispatchDesc is 8-byte aligned +constexpr uint32_t PTO2_REG_MAX_OFFSET_FIELD = 0x0FFFFFFBu; // sentinel-safe upper bound + /** - * Dispatch payload: minimal execution interface for AICore. - * Layout: function_bin_addr followed by args[]. - * AICore reads function_bin_addr, casts to UnifiedKernelFunc, calls with args. + * Encode dispatch info into a 32-bit register value. 
+ * + * @param desc_byte_offset Byte offset of PTO2DispatchDesc from dispatch_base (must be 8-byte aligned) + * @param slot_idx Subtask slot (0=AIC, 1=AIV0, 2=AIV1) + * @param toggle Toggle bit value (bit 30, alternated per dispatch per core) + */ +static inline uint32_t pto2_reg_encode(uint64_t desc_byte_offset, + uint32_t slot_idx, uint32_t toggle) { + uint32_t offset_field = static_cast((desc_byte_offset >> PTO2_REG_ALIGN_SHIFT) + 1); + return (offset_field << PTO2_REG_OFFSET_SHIFT) + | slot_idx + | toggle; +} + +/** + * Decode desc byte offset from register value. + * Returns the byte offset to add to dispatch_base to get the PTO2DispatchDesc address. + */ +static inline uint64_t pto2_reg_decode_offset(uint32_t reg_val) { + uint32_t offset_field = (reg_val >> PTO2_REG_OFFSET_SHIFT) & PTO2_REG_OFFSET_MASK; + return static_cast(offset_field - 1) << PTO2_REG_ALIGN_SHIFT; +} + +/** Decode slot_idx (subtask slot: 0=AIC, 1=AIV0, 2=AIV1) from register value. */ +static inline uint32_t pto2_reg_decode_slotidx(uint32_t reg_val) { + return reg_val & PTO2_REG_SLOTIDX_MASK; +} + +// ============================================================================= +// Dispatch Descriptor +// ============================================================================= + +/** + * Dispatch descriptor: execution interface for AICore. + * + * Layout: per-slot function_bin_addrs[] followed by unified args[]. + * AICore reads function_bin_addrs[slot_idx], casts to UnifiedKernelFunc, + * and calls with args (tensor GM pointers followed by scalar values). + * + * Built once by the Orchestrator during submit_mixed_task(); the Scheduler + * never touches it — it only writes a register-encoded value. 
+ */ +struct PTO2DispatchDesc { + /** Per-slot kernel entry addresses in GM (AIC, AIV0, AIV1); 0 = inactive */ + uint64_t function_bin_addrs[PTO2_SUBTASK_SLOT_COUNT]; + /** Kernel arguments: tensor GM pointers first, then scalar values */ + uint64_t args[PTO2_DISPATCH_MAX_ARGS]; +}; + +// ============================================================================= +// Dispatch Initialization Info +// ============================================================================= + +/** + * One-shot initialization info passed from AICPU to AICore. + * + * Lifecycle: + * 1. AICPU sets Handshake.task = 0 during handshake (AICore not yet ready) + * 2. AICPU creates PTO2Runtime, computes dispatch_base from shared memory + * 3. AICPU writes Handshake.task = &init_info for all cores + * 4. AICore waits for Handshake.task != 0, reads and caches dispatch_base + * 5. AICore clears Handshake.task = 0 (init_info no longer needed) + * + * During the main loop, AICore computes the dispatch desc address as: + * dispatch_base + pto2_reg_decode_offset(reg_val) + * + * This avoids any GM read on the scheduler hot path — the scheduler only + * writes a register-encoded byte offset value. 
*/ -struct PTO2DispatchPayload { - uint64_t function_bin_addr; /**< Kernel entry in GM: (UnifiedKernelFunc)function_bin_addr */ - uint64_t args[PTO2_DISPATCH_MAX_ARGS]; /**< Kernel arguments (GM pointers + scalars) */ +struct PTO2DispatchInitInfo { + /** Base address of the first PTO2DispatchDesc (ring 0, slot 0) */ + uint64_t dispatch_base; }; #endif // RT2_PTO2_DISPATCH_PAYLOAD_H_ diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp index d62e0f9a..c7bedfd9 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp @@ -353,8 +353,9 @@ void pto2_submit_mixed_task( __builtin_prefetch(&payload->tensors[i], 1, 3); __builtin_prefetch(reinterpret_cast(&payload->tensors[i]) + 64, 1, 3); } - for (int32_t i = 0; i < params.scalar_count; i += 8) { - __builtin_prefetch(&payload->scalars[i], 1, 3); + // Prefetch dispatch.args[] area (scalar values written directly here) + for (int32_t j = 0; j < params.scalar_count; j += 8) { + __builtin_prefetch(&payload->dispatch.args[params.tensor_count + j], 1, 3); } __builtin_prefetch(payload, 1, 3); __builtin_prefetch(reinterpret_cast(payload) + 64, 1, 3); @@ -375,7 +376,8 @@ void pto2_submit_mixed_task( slot_state.task = &task; slot_state.active_mask = active_mask; slot_state.subtask_done_mask.store(0, std::memory_order_relaxed); - slot_state.ring_id = ring_id; + slot_state.ring_id = static_cast(ring_id); + slot_state.slot_in_ring = static_cast(slot); scope_tasks_push(orch, &slot_state); } else { scope_tasks_push(orch, nullptr); @@ -519,7 +521,7 @@ void pto2_submit_mixed_task( } } - payload->init(params); + payload->init(params, orch->func_id_to_addr, orch->func_id_count, task.kernel_id); CYCLE_COUNT_LAP_RECORD(g_orch_params_cycle, AicpuPhaseId::ORCH_PARAMS, local_id); #if PTO2_ORCH_PROFILING diff --git 
a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h index a2d4898d..e778f7d1 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h @@ -69,6 +69,10 @@ struct PTO2OrchestratorState { void* gm_heap_base; // Base address of GM heap uint64_t gm_heap_size; // Total size of GM heap (all rings) + // === FUNCTION ADDRESS MAPPING (for dispatch descriptor construction) === + const uint64_t* func_id_to_addr; // Kernel ID → GM function address (points to Runtime::func_id_to_addr_[]) + int32_t func_id_count; // Number of entries in func_id_to_addr (for bounds check) + // === FATAL ERROR === // Fatal error flag (single-thread access by orchestrator, no atomic needed) // Cross-thread notification uses shared memory orch_error_code (atomic) diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h index 141be544..ce1c0b60 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h @@ -21,6 +21,7 @@ #include "pto_types.h" #include "pto_submit_types.h" +#include "pto2_dispatch_payload.h" // ============================================================================= // Profiling Configuration @@ -68,6 +69,7 @@ // Scheduler errors (100+): detected in scheduler threads #define PTO2_ERROR_SCHEDULER_TIMEOUT 100 +#define PTO2_ERROR_ENCODING_OVERFLOW 101 // ============================================================================= // Configuration Constants @@ -358,9 +360,14 @@ struct PTO2TaskDescriptor { * Task payload data (cold path - only accessed during orchestration and dispatch) * * Layout: metadata (counts, fanin pointers) packed in the first 3 cache lines, - * followed by bulk tensor and 
scalar data. This gives sequential write access - * during orchestration and groups scheduler-hot fields (fanin_actual_count + - * fanin_slot_states) together for on_task_release. + * followed by the dispatch descriptor and bulk tensor data. Scalar values are + * written directly into dispatch.args[] (after tensor pointers), eliminating the + * separate scalars[] array. + * + * The Scheduler never reads this struct — it only encodes a flat byte offset + * and slot_idx into the DATA_MAIN_BASE register. AICore computes the dispatch + * desc address from a cached base address and reads dispatch.function_bin_addrs[] + * + dispatch.args[] directly. */ struct PTO2TaskPayload { // === Cache line 0 (64B) — metadata === @@ -369,23 +376,48 @@ struct PTO2TaskPayload { int32_t fanin_actual_count{0}; // Actual fanin count (without the +1 redundance) int32_t _reserved{0}; // Reserved (dep_pool_mark moved to SlotState for local access) PTO2TaskSlotState* fanin_slot_states[PTO2_MAX_INPUTS]; // Producer slot states (used by on_task_release) - // === Cache lines 3-34 (2048B) — tensors (alignas(64) forces alignment) === + // === Dispatch descriptor (1048B) — built by Orchestrator, read by AICore === + PTO2DispatchDesc dispatch; + // === Tensors (2048B) — alignas(64) Tensor forces alignment === Tensor tensors[PTO2_MAX_TENSOR_PARAMS]; - // === Cache lines 35-50 (1024B) — scalars === - uint64_t scalars[PTO2_MAX_SCALAR_PARAMS]; - void init(const PTOParam& params) { + /** + * Initialize payload: copy tensors, build dispatch descriptor. 
+ * + * @param params Task parameters (tensors + scalars) + * @param func_id_to_addr Kernel ID → GM function address mapping + * @param func_id_count Number of entries in func_id_to_addr (for bounds check) + * @param kernel_ids Per-slot kernel IDs (AIC, AIV0, AIV1); <0 = inactive + */ + void init(const PTOParam& params, + const uint64_t* func_id_to_addr, + int32_t func_id_count, + const int32_t kernel_ids[PTO2_SUBTASK_SLOT_COUNT]) { tensor_count = params.tensor_count; scalar_count = params.scalar_count; + + // 1. Copy tensors from PTOParam auto src_tensors = params.tensors; for (int32_t i = 0; i < params.tensor_count; i++) { tensors[i].copy(*src_tensors[i]); } - static_assert(sizeof(scalars) == sizeof(params.scalars)); - // Round up to cache line boundary. Both arrays are 1024B so no overrun. - // Eliminates branches; extra bytes within the same CL have zero additional cost. - memcpy(scalars, params.scalars, - PTO2_ALIGN_UP(params.scalar_count * sizeof(uint64_t), 64)); + + // 2. Fill per-slot function addresses (0 for inactive or out-of-range slots) + for (int32_t s = 0; s < PTO2_SUBTASK_SLOT_COUNT; s++) { + int32_t kid = kernel_ids[s]; + dispatch.function_bin_addrs[s] = + (kid >= 0 && kid < func_id_count) ? func_id_to_addr[kid] : 0; + } + + // 3. 
Build dispatch.args[]: tensor pointers first, then scalar values + int32_t n = 0; + for (int32_t i = 0; i < params.tensor_count; i++) { + tensors[i].update_start_offset(); + dispatch.args[n++] = reinterpret_cast<uint64_t>(&tensors[i]); + } + for (int32_t i = 0; i < params.scalar_count; i++) { + dispatch.args[n++] = params.scalars[i]; + } } }; @@ -427,6 +459,7 @@ struct alignas(64) PTO2TaskSlotState { std::atomic<uint32_t> subtask_done_mask; // Each subtask sets its done bit on completion uint8_t ring_id; // Ring layer this task belongs to (for per-ring reclamation) int32_t dep_pool_mark{0}; // Dep pool top after this task's submission (orchestrator-only, local memory) + uint32_t slot_in_ring; // Index into task_payloads[ring_id][] (for register encoding) }; static_assert(sizeof(PTO2TaskSlotState) == 64); diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h index e3ada51f..3f44085a 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h @@ -96,6 +96,11 @@ struct alignas(PTO2_ALIGN_SIZE) PTO2SharedMemoryHeader { std::atomic<uint64_t> graph_output_ptr; // Address where final output was written (packed buffer) std::atomic<uint64_t> graph_output_size; // Size in bytes + // === DISPATCH INIT INFO (GM-resident, read by AICore at startup) === + // Must live in shared memory (GM) because AICore cannot access AICPU-local memory. + // Written by AICPU after PTO2Runtime creation; AICore caches the values once. 
+ PTO2DispatchInitInfo dispatch_init_info; + // === ERROR REPORTING === // Orchestrator fatal error code (Orchestrator → Scheduler, AICPU → Host) diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h index 62508b8b..0a6f83e3 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h @@ -10,7 +10,7 @@ * - Device orchestration state (pto2_gm_sm_ptr_, orch_args_) * - Function address mapping (func_id_to_addr_) * - * Task dispatch uses PTO2DispatchPayload from PTO2 shared memory. + * Task dispatch uses PTO2DispatchDesc from PTO2 shared memory (precomputed by Orchestrator). */ #ifndef RUNTIME_H @@ -71,7 +71,7 @@ constexpr int RUNTIME_DEFAULT_READY_QUEUE_SHARDS = PLATFORM_MAX_AICPU_THREADS - * Field Access Patterns: * - aicpu_ready: Written by AICPU, read by AICore * - aicore_done: Written by AICore, read by AICPU - * - task: Written by AICPU, read by AICore (0 = no task, non-zero = PTO2DispatchPayload*) + * - task: Written by AICPU, read by AICore (0 = not ready, non-zero = PTO2DispatchInitInfo*) * - task_status: Written by both (AICPU=1 on dispatch, AICore=0 on completion) * - control: Written by AICPU, read by AICore (0 = continue, 1 = quit) * - core_type: Written by AICPU, read by AICore (CoreType::AIC or CoreType::AIV) @@ -79,7 +79,7 @@ constexpr int RUNTIME_DEFAULT_READY_QUEUE_SHARDS = PLATFORM_MAX_AICPU_THREADS - struct Handshake { volatile uint32_t aicpu_ready; // AICPU ready signal: 0=not ready, 1=ready volatile uint32_t aicore_done; // AICore ready signal: 0=not ready, core_id+1=ready - volatile uint64_t task; // Task pointer: 0=no task, non-zero=PTO2DispatchPayload* + volatile uint64_t task; // Init: PTO2DispatchInitInfo* (0 until ready); runtime: unused volatile int32_t task_status; // Task execution status: 0=idle, 1=busy volatile int32_t control; // Control signal: 0=execute, 1=quit volatile 
CoreType core_type; // Core type: CoreType::AIC or CoreType::AIV @@ -116,7 +116,7 @@ struct HostApi { /** * Task structure - Compatibility stub for platform layer * - * RT2 uses PTO2DispatchPayload instead of Task for task dispatch. + * RT2 uses PTO2DispatchDesc instead of Task for task dispatch. * This stub exists only for API compatibility with device_runner.cpp. * Since get_task_count() returns 0, this struct is never actually used. */ @@ -262,7 +262,7 @@ class Runtime { /** @deprecated Task count is now in PTO2 shared memory */ int get_task_count() const { return 0; } - /** @deprecated RT2 uses PTO2DispatchPayload, not Task. Always returns nullptr. */ + /** @deprecated RT2 uses PTO2DispatchDesc, not Task. Always returns nullptr. */ Task* get_task(int) { return nullptr; } /** @deprecated Use PTO2 dispatch mode */ From eb92a017c12477ea6f0d42572b3153de63fb2f51 Mon Sep 17 00:00:00 2001 From: zhusy54 Date: Wed, 18 Mar 2026 19:45:50 +0800 Subject: [PATCH 2/2] fix(rt2): use uint64_t for offset_field overflow check before truncation Perform the overflow safety check on the full-width uint64_t value before narrowing to uint32_t, preventing silent truncation from bypassing the sentinel-collision guard. Co-Authored-By: Claude Opus 4.6 --- .../tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp | 8 ++++---- .../runtime/pto2_dispatch_payload.h | 3 ++- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp index 302e7787..3e7a8cbf 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp @@ -488,16 +488,16 @@ struct AicpuExecutor { // Compute flat byte offset of dispatch descriptor from global base. 
uint64_t desc_addr = reinterpret_cast<uint64_t>(&slot_state.payload->dispatch); uint64_t desc_byte_offset = desc_addr - dispatch_base_; - uint32_t offset_field = static_cast<uint32_t>((desc_byte_offset >> PTO2_REG_ALIGN_SHIFT) + 1); + uint64_t offset_field_64 = (desc_byte_offset >> PTO2_REG_ALIGN_SHIFT) + 1; // Overflow check: offset_field must not exceed sentinel-safe upper bound. // If it does, the encoded register value would collide with AICORE_EXIT_SIGNAL // or AICORE_IDLE_TASK_ID, causing AICore to misinterpret the dispatch. - if (offset_field > PTO2_REG_MAX_OFFSET_FIELD) { - DEV_ERROR("PTO2 REG ENCODING OVERFLOW: offset_field=0x%x exceeds max 0x%x. " + if (offset_field_64 > PTO2_REG_MAX_OFFSET_FIELD) { + DEV_ERROR("PTO2 REG ENCODING OVERFLOW: offset_field=0x%llx exceeds max 0x%x. " "desc_byte_offset=0x%llx ring_id=%u slot_in_ring=%u core=%d subslot=%u. " "Reduce task_window_size or shared memory size.", - offset_field, PTO2_REG_MAX_OFFSET_FIELD, + (unsigned long long)offset_field_64, PTO2_REG_MAX_OFFSET_FIELD, (unsigned long long)desc_byte_offset, (unsigned)slot_state.ring_id, (unsigned)slot_state.slot_in_ring, core_id, (unsigned)static_cast<uint32_t>(subslot)); diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto2_dispatch_payload.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto2_dispatch_payload.h index d8cd9f94..9fe51dfd 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto2_dispatch_payload.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto2_dispatch_payload.h @@ -64,7 +64,8 @@ constexpr uint32_t PTO2_REG_MAX_OFFSET_FIELD = 0x0FFFFFFBu; // sentinel-safe */ static inline uint32_t pto2_reg_encode(uint64_t desc_byte_offset, uint32_t slot_idx, uint32_t toggle) { - uint32_t offset_field = static_cast<uint32_t>((desc_byte_offset >> PTO2_REG_ALIGN_SHIFT) + 1); + uint64_t offset_field_64 = (desc_byte_offset >> PTO2_REG_ALIGN_SHIFT) + 1; + uint32_t offset_field = static_cast<uint32_t>(offset_field_64); return (offset_field << PTO2_REG_OFFSET_SHIFT) | slot_idx | 
toggle;