diff --git a/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_pv_matmul.cpp b/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_pv_matmul.cpp index 0f9b0ae5..4ba8b41b 100644 --- a/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_pv_matmul.cpp +++ b/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_pv_matmul.cpp @@ -78,6 +78,9 @@ static __aicore__ void pv_matmul_impl(__gm__ Tensor* pij, __gm__ Tensor* vj, __g wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0); TSTORE(oiGlobal, cTile); + + set_flag(PIPE_FIX, PIPE_S, EVENT_ID7); + wait_flag(PIPE_FIX, PIPE_S, EVENT_ID7); } extern "C" __aicore__ void kernel_entry(__gm__ int64_t* args) { diff --git a/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_qk_matmul.cpp b/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_qk_matmul.cpp index 3b9ef46f..338588e3 100644 --- a/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_qk_matmul.cpp +++ b/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_qk_matmul.cpp @@ -79,6 +79,9 @@ static __aicore__ void qk_matmul_impl(__gm__ Tensor* qi, __gm__ Tensor* kj, __gm wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0); TSTORE(sijGlobal, cTile); + + set_flag(PIPE_FIX, PIPE_S, EVENT_ID7); + wait_flag(PIPE_FIX, PIPE_S, EVENT_ID7); } extern "C" __aicore__ void kernel_entry(__gm__ int64_t* args) { diff --git a/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_online_update.cpp b/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_online_update.cpp index 7351f73f..8c3b6d73 100644 --- a/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_online_update.cpp +++ b/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_online_update.cpp @@ -207,6 +207,8 @@ static __aicore__ void online_update_impl(__gm__ Tensor* mij, TSTORE(oiGlobal, oiTile); } } + set_flag(PIPE_MTE3, PIPE_S, EVENT_ID7); + wait_flag(PIPE_MTE3, 
PIPE_S, EVENT_ID7); } extern "C" __aicore__ void kernel_entry(__gm__ int64_t* args) { diff --git a/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp b/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp index e3306f94..7baea813 100644 --- a/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp +++ b/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp @@ -83,6 +83,7 @@ static __aicore__ void softmax_prepare_impl(__gm__ Tensor* sij, set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + // manually fill invalid columns with -inf as a workaround. TFILLPAD_INPLACE(sijPadTile, sijDynTile); TMULS(sijTile, sijTile, scale_value); @@ -99,19 +100,22 @@ static __aicore__ void softmax_prepare_impl(__gm__ Tensor* sij, TSTORE(mijGlobal, maxTile); TSTORE(lijGlobal, sumTile); TSTORE(pijGlobal, pijF16Tile); + + set_flag(PIPE_MTE3, PIPE_S, EVENT_ID7); + wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID7); } extern "C" __aicore__ void kernel_entry(__gm__ int64_t* args) { __gm__ Tensor* sij = reinterpret_cast<__gm__ Tensor*>(args[0]); + __gm__ Tensor* pij = reinterpret_cast<__gm__ Tensor*>(args[1]); + __gm__ Tensor* mij = reinterpret_cast<__gm__ Tensor*>(args[2]); + __gm__ Tensor* lij = reinterpret_cast<__gm__ Tensor*>(args[3]); union { uint64_t u; float f; } scale_conv; - scale_conv.u = static_cast<uint64_t>(args[1]); + scale_conv.u = static_cast<uint64_t>(args[4]); float scale_value = scale_conv.f; - __gm__ Tensor* pij = reinterpret_cast<__gm__ Tensor*>(args[2]); - __gm__ Tensor* mij = reinterpret_cast<__gm__ Tensor*>(args[3]); - __gm__ Tensor* lij = reinterpret_cast<__gm__ Tensor*>(args[4]); softmax_prepare_impl<16, 16>(sij, scale_value, pij, mij, lij); } diff --git a/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp
b/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp index 9184031e..6eb8bb64 100644 --- a/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp +++ b/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp @@ -102,11 +102,11 @@ void aicpu_orchestration_entry(PTO2Runtime* rt, uint64_t* args, int arg_count, i (unsigned long)batch, (unsigned long)b_start, (unsigned long)b_end); // Compute actual tensor shapes from buffer sizes (not from max block_num) - uint64_t query_shapes[2] = {batch * num_heads, head_dim}; + uint32_t query_shapes[2] = {(uint32_t)(batch * num_heads), (uint32_t)head_dim}; uint64_t kv_total_rows = key_cache_size / (head_dim * elem_size); - uint64_t key_cache_shapes[2] = {kv_total_rows, head_dim}; - uint64_t value_cache_shapes[2] = {kv_total_rows, head_dim}; - uint64_t out_shapes[2] = {batch * num_heads, head_dim}; + uint32_t key_cache_shapes[2] = {(uint32_t)kv_total_rows, (uint32_t)head_dim}; + uint32_t value_cache_shapes[2] = {(uint32_t)kv_total_rows, (uint32_t)head_dim}; + uint32_t out_shapes[2] = {(uint32_t)(batch * num_heads), (uint32_t)head_dim}; Tensor query = make_tensor_external(host_query, query_shapes, 2, data_type); Tensor key_cache = make_tensor_external(host_key_cache, key_cache_shapes, 2, data_type); Tensor value_cache = make_tensor_external(host_value_cache, value_cache_shapes, 2, data_type); @@ -121,86 +121,81 @@ void aicpu_orchestration_entry(PTO2Runtime* rt, uint64_t* args, int arg_count, i uint64_t bn_this_batch = (cur_seq + block_size - 1) / block_size; for (uint64_t q_idx = 0; q_idx < q_loop; q_idx++) { PTO2_SCOPE(rt) { - uint64_t cur_offset = b_idx * q_head_num + q_idx * q_tile; - uint64_t oi_shapes[2] = {q_tile, head_dim}; - uint64_t li_shapes[1] = {q_tile}; - uint64_t mi_shapes[1] = {q_tile}; + uint32_t cur_offset = (uint32_t)(b_idx * q_head_num + q_idx * q_tile); + uint32_t oi_shapes[2] = 
{(uint32_t)q_tile, (uint32_t)head_dim}; + uint32_t li_shapes[1] = {(uint32_t)q_tile}; + uint32_t mi_shapes[1] = {(uint32_t)q_tile}; Tensor oi = make_tensor(oi_shapes, 2, DataType::FLOAT32); Tensor li_update = make_tensor(li_shapes, 1, DataType::FLOAT32); Tensor mi_update = make_tensor(mi_shapes, 1, DataType::FLOAT32); - uint64_t qi_shapes[2] = {q_tile, head_dim}; - uint64_t qi_offsets[2] = {cur_offset, 0}; + uint32_t qi_shapes[2] = {(uint32_t)q_tile, (uint32_t)head_dim}; + uint32_t qi_offsets[2] = {cur_offset, 0}; Tensor qi = query.view(qi_shapes, qi_offsets); - uint64_t out_view_shapes[2] = {q_tile, head_dim}; - uint64_t out_view_offsets[2] = {cur_offset, 0}; + uint32_t out_view_shapes[2] = {(uint32_t)q_tile, (uint32_t)head_dim}; + uint32_t out_view_offsets[2] = {cur_offset, 0}; Tensor out_view = out.view(out_view_shapes, out_view_offsets); - PTOParam params_inplace[] = { - make_output_param(oi), - make_output_param(li_update), - make_output_param(mi_update), - }; - pto2_rt_submit_task(rt, FUNC_AIV_HUB, PTO2_WORKER_VECTOR, params_inplace, 3); // create_inplace + PTOParam params_inplace; + params_inplace.add_output(oi); + params_inplace.add_output(li_update); + params_inplace.add_output(mi_update); + pto2_rt_submit_aiv_task(rt, FUNC_AIV_HUB, params_inplace); // create_inplace for (uint64_t bn = 0; bn < bn_this_batch; bn++) { uint64_t cur_block_idx = host_block_table[b_idx * block_num + bn]; uint64_t valid_len = block_size < (cur_seq - bn * block_size) ? 
block_size : (cur_seq - bn * block_size); - uint64_t kv_shapes[2] = {block_size, head_dim}; - uint64_t kv_offsets[2] = {cur_block_idx * block_size, 0}; + uint32_t kv_shapes[2] = {(uint32_t)block_size, (uint32_t)head_dim}; + uint32_t kv_offsets[2] = {(uint32_t)(cur_block_idx * block_size), 0}; Tensor kj = key_cache.view(kv_shapes, kv_offsets); Tensor vj = value_cache.view(kv_shapes, kv_offsets); - uint64_t sij_shapes[2] = {q_tile, block_size}; + uint32_t sij_shapes[2] = {(uint32_t)q_tile, (uint32_t)block_size}; Tensor sij = make_tensor(sij_shapes, 2, DataType::FLOAT32); Tensor pij_f16 = make_tensor(sij_shapes, 2, data_type); - PTOParam params_qk[] = { - make_input_param(qi), - make_input_param(kj), - make_output_param(sij), - }; - pto2_rt_submit_task(rt, FUNC_QK_MATMUL, PTO2_WORKER_CUBE, params_qk, 3); // c1 + PTOParam params_qk; + params_qk.add_input(qi); + params_qk.add_input(kj); + params_qk.add_output(sij); + pto2_rt_submit_aic_task(rt, FUNC_QK_MATMUL, params_qk); // c1 - uint64_t sij_valid_shapes[2] = {q_tile, valid_len}; - uint64_t sij_valid_offsets[2] = {0, 0}; + uint32_t sij_valid_shapes[2] = {(uint32_t)q_tile, (uint32_t)valid_len}; + uint32_t sij_valid_offsets[2] = {0, 0}; Tensor sij_valid = sij.view(sij_valid_shapes, sij_valid_offsets); Tensor li = make_tensor(li_shapes, 1, DataType::FLOAT32); Tensor mi = make_tensor(mi_shapes, 1, DataType::FLOAT32); - PTOParam params_sf[] = { - make_input_param(sij_valid), - make_scalar_param(float_to_u64(scale_value)), - make_output_param(pij_f16), - make_output_param(mi), - make_output_param(li), - }; - pto2_rt_submit_task(rt, FUNC_SOFTMAX_PREPARE, PTO2_WORKER_VECTOR, params_sf, 5); // v1 - - uint64_t oi_tmp_shapes[2] = {q_tile, head_dim}; + PTOParam params_sf; + params_sf.add_input(sij_valid); + params_sf.add_output(pij_f16); + params_sf.add_output(mi); + params_sf.add_output(li); + params_sf.add_scalar(float_to_u64(scale_value)); + pto2_rt_submit_aiv_task(rt, FUNC_SOFTMAX_PREPARE, params_sf); // v1 + + uint32_t 
oi_tmp_shapes[2] = {(uint32_t)q_tile, (uint32_t)head_dim}; Tensor oi_tmp = make_tensor(oi_tmp_shapes, 2, DataType::FLOAT32); - PTOParam params_pv[] = { - make_input_param(pij_f16), - make_input_param(vj), - make_output_param(oi_tmp), - }; - pto2_rt_submit_task(rt, FUNC_PV_MATMUL, PTO2_WORKER_CUBE, params_pv, 3); // c2 + PTOParam params_pv; + params_pv.add_input(pij_f16); + params_pv.add_input(vj); + params_pv.add_output(oi_tmp); + pto2_rt_submit_aic_task(rt, FUNC_PV_MATMUL, params_pv); // c2 uint64_t is_first = (bn == 0) ? 1 : 0; uint64_t is_last = (bn == bn_this_batch - 1) ? 1 : 0; - PTOParam params_up[] = { - make_input_param(mi), - make_input_param(li), - make_input_param(oi_tmp), - make_inout_param(mi_update), - make_inout_param(li_update), - make_inout_param(oi), - make_output_param(out_view), - make_scalar_param(is_first), - make_scalar_param(is_last), - }; - pto2_rt_submit_task(rt, FUNC_ONLINE_UPDATE, PTO2_WORKER_VECTOR, params_up, 9); // v2 + PTOParam params_up; + params_up.add_input(mi); + params_up.add_input(li); + params_up.add_input(oi_tmp); + params_up.add_inout(mi_update); + params_up.add_inout(li_update); + params_up.add_inout(oi); + params_up.add_output(out_view); + params_up.add_scalar(is_first); + params_up.add_scalar(is_last); + pto2_rt_submit_aiv_task(rt, FUNC_ONLINE_UPDATE, params_up); // v2 } } } diff --git a/src/a5/platform/include/aicore/performance_collector_aicore.h b/src/a5/platform/include/aicore/performance_collector_aicore.h index 34e89ad8..6d4f5baf 100644 --- a/src/a5/platform/include/aicore/performance_collector_aicore.h +++ b/src/a5/platform/include/aicore/performance_collector_aicore.h @@ -27,23 +27,22 @@ * Writes performance metrics to the provided buffer. Buffer management * and status tracking are handled by AICPU. * + * AICore records task_id and timestamps only. AICPU fills func_id and + * core_type at completion time from TaskDescriptor. 
+ * * @param perf_buf Performance buffer pointer * @param task_id Task ID - * @param func_id Function ID * @param start_time Start timestamp * @param end_time End timestamp * @param kernel_ready_time Kernel ready timestamp - * @param core_type Core type (AIC/AIV) */ __aicore__ __attribute__((always_inline)) static inline void perf_aicore_record_task( __gm__ PerfBuffer* perf_buf, uint32_t task_id, - uint32_t func_id, uint64_t start_time, uint64_t end_time, - uint64_t kernel_ready_time, - CoreType core_type) { + uint64_t kernel_ready_time) { // Read current buffer count dcci(&perf_buf->count, SINGLE_CACHE_LINE); @@ -55,13 +54,11 @@ static inline void perf_aicore_record_task( __gm__ PerfRecord* record = &perf_buf->records[idx]; - // Write record data + // Write record data (func_id and core_type filled by AICPU at completion) record->start_time = start_time; record->end_time = end_time; record->kernel_ready_time = kernel_ready_time; record->task_id = task_id; - record->func_id = func_id; - record->core_type = core_type; perf_buf->count = idx + 1; diff --git a/src/a5/platform/include/aicpu/platform_aicpu_affinity.h b/src/a5/platform/include/aicpu/platform_aicpu_affinity.h new file mode 100644 index 00000000..180b90ab --- /dev/null +++ b/src/a5/platform/include/aicpu/platform_aicpu_affinity.h @@ -0,0 +1,8 @@ +#pragma once +#include <cstdint> + +// Returns true if this thread should call aicpu_execute(). +// Returns false if this thread should exit (dropped).
+// logical_count: desired active threads (from runtime.sche_cpu_num) +// total_launched: actual threads launched (PLATFORM_MAX_AICPU_THREADS_JUST_FOR_LAUNCH) +bool platform_aicpu_affinity_gate(int32_t logical_count, int32_t total_launched); diff --git a/src/a5/platform/include/common/perf_profiling.h b/src/a5/platform/include/common/perf_profiling.h index a5cca3c7..462cdc74 100644 --- a/src/a5/platform/include/common/perf_profiling.h +++ b/src/a5/platform/include/common/perf_profiling.h @@ -75,9 +75,11 @@ struct PerfRecord { uint64_t finish_time; // AICPU timestamp: when AICPU observed task completion (task_status back to 0) // Task identification - uint32_t task_id; // Task unique identifier + uint32_t task_id; // Register dispatch id (per-core monotonic counter, NOT mixed_task_id). + // May collide across cores; use (ring_id, task_id, core_id) as unique key. uint32_t func_id; // Kernel function identifier CoreType core_type; // Core type (AIC/AIV) + uint8_t ring_id; // Ring layer (0 for single-ring / legacy) // Dependency relationship (fanout only) int32_t fanout[RUNTIME_MAX_FANOUT]; // Successor task ID array diff --git a/src/a5/platform/include/common/platform_config.h b/src/a5/platform/include/common/platform_config.h index b407128e..48f00ad1 100644 --- a/src/a5/platform/include/common/platform_config.h +++ b/src/a5/platform/include/common/platform_config.h @@ -40,6 +40,14 @@ constexpr int PLATFORM_AIV_CORES_PER_BLOCKDIM = 2; */ constexpr int PLATFORM_MAX_AICPU_THREADS = 7; +/** + * Maximum AICPU launch threads (physical) + * Upper bound for the number of AICPU threads that can be launched by Host. + * Can be larger than PLATFORM_MAX_AICPU_THREADS to allow threads to be dropped + * from scheduling while still participating in affinity (e.g. 6 launch, 4 active). 
+ */ +constexpr int PLATFORM_MAX_AICPU_THREADS_JUST_FOR_LAUNCH = 6; + // ============================================================================= // Derived Platform Limits // ============================================================================= diff --git a/src/a5/platform/onboard/aicpu/kernel.cpp b/src/a5/platform/onboard/aicpu/kernel.cpp index 5cbcd0d9..84af75d7 100644 --- a/src/a5/platform/onboard/aicpu/kernel.cpp +++ b/src/a5/platform/onboard/aicpu/kernel.cpp @@ -2,11 +2,11 @@ #include "common/unified_log.h" #include "common/kernel_args.h" +#include "common/platform_config.h" #include "aicpu/device_log.h" #include "aicpu/platform_regs.h" - -// Forward declaration (no need for full runtime.h) -class Runtime; +#include "aicpu/platform_aicpu_affinity.h" +#include "runtime.h" // Forward declaration of aicpu_execute (implemented in aicpu_executor.cpp) extern "C" int aicpu_execute(Runtime *arg); @@ -71,6 +71,13 @@ extern "C" __attribute__((visibility("default"))) int DynTileFwkBackendKernelSer // Store platform regs before calling aicpu_execute set_platform_regs(k_args->regs); + // Affinity gate: drop excess threads before entering runtime + if (!platform_aicpu_affinity_gate(runtime->sche_cpu_num, + PLATFORM_MAX_AICPU_THREADS_JUST_FOR_LAUNCH)) { + LOG_INFO("Thread dropped by cluster affinity"); + return 0; + } + LOG_INFO("%s", "DynTileFwkBackendKernelServer: Calling aicpu_execute with Runtime"); int rc = aicpu_execute(runtime); if (rc != 0) { diff --git a/src/a5/platform/onboard/aicpu/platform_aicpu_affinity.cpp b/src/a5/platform/onboard/aicpu/platform_aicpu_affinity.cpp new file mode 100644 index 00000000..dc28166c --- /dev/null +++ b/src/a5/platform/onboard/aicpu/platform_aicpu_affinity.cpp @@ -0,0 +1,140 @@ +#include "aicpu/platform_aicpu_affinity.h" + +#include <atomic> +#include <cstdint> +#ifdef __linux__ +#include <sched.h> +#endif + +#include "common/unified_log.h" + +static constexpr int32_t AICPU_CORES_PER_CHIP = 8; +static constexpr int32_t MAX_CLUSTERS = 2; +static
constexpr int32_t CPUS_PER_CLUSTER = 4; +static constexpr int32_t MAX_GATE_THREADS = 8; + +static std::atomic<uint64_t> s_cpumask{0}; +static std::atomic<int32_t> s_reported{0}; +static std::atomic<int32_t> s_gate_init{0}; +static std::atomic<int32_t> s_gate_ready{0}; + +static int32_t s_thread_cpu[MAX_GATE_THREADS]; +static bool s_thread_survive[MAX_GATE_THREADS]; + +static inline int32_t popcount64(uint64_t v) { + return __builtin_popcountll(static_cast<unsigned long long>(v)); +} + +bool platform_aicpu_affinity_gate(int32_t logical_count, int32_t total_launched) { + if (logical_count >= total_launched) { + return true; + } + + // Assign thread index + int32_t idx = s_reported.fetch_add(1, std::memory_order_acq_rel); + + // Report CPU +#if defined(__aarch64__) + int32_t cpu = sched_getcpu(); +#elif defined(__x86_64__) + int32_t cpu = sched_getcpu(); +#else + int32_t cpu = -1; +#endif + + int32_t normalized_cpu = -1; + if (cpu >= 0) { + if (cpu < 63) { + s_cpumask.fetch_or(1ULL << cpu, std::memory_order_release); + } + normalized_cpu = cpu % AICPU_CORES_PER_CHIP; + } + if (idx < MAX_GATE_THREADS) { + s_thread_cpu[idx] = normalized_cpu; + } + + // Barrier: wait until all total_launched threads have reported + while (popcount64(s_cpumask.load(std::memory_order_acquire)) < total_launched && + s_reported.load(std::memory_order_acquire) < total_launched) { + } + + // CAS winner does cluster classification + int32_t expected = 0; + if (s_gate_init.compare_exchange_strong(expected, 1, + std::memory_order_acq_rel, std::memory_order_acquire)) { + // Initialize survive flags + for (int32_t i = 0; i < total_launched; ++i) { + s_thread_survive[i] = false; + } + + struct ClusterInfo { + int32_t count{0}; + int32_t tids[MAX_GATE_THREADS]; + }; + ClusterInfo clusters[MAX_CLUSTERS]; + + for (int32_t tid = 0; tid < total_launched; ++tid) { + int32_t c = s_thread_cpu[tid]; + if (c < 0) continue; + int32_t cluster_id = c / CPUS_PER_CLUSTER; + if (cluster_id < 0 || cluster_id >= MAX_CLUSTERS) continue; + ClusterInfo& info =
clusters[cluster_id]; + if (info.count < MAX_GATE_THREADS) info.tids[info.count++] = tid; + } + + int32_t major_id = (clusters[0].count >= clusters[1].count) ? 0 : 1; + int32_t minor_id = 1 - major_id; + int32_t major_cnt = clusters[major_id].count; + int32_t minor_cnt = clusters[minor_id].count; + + LOG_INFO("AICPU affinity gate: major=%d(cnt=%d) minor=%d(cnt=%d) logical=%d", + major_id, major_cnt, minor_id, minor_cnt, logical_count); + + if (major_cnt == logical_count && minor_cnt == (total_launched - logical_count)) { + // Expected topology: major cluster threads survive + for (int32_t i = 0; i < clusters[major_id].count; ++i) { + s_thread_survive[clusters[major_id].tids[i]] = true; + } + } else { + // Unexpected topology: fall back to first logical_count threads + LOG_WARN("AICPU affinity gate: unexpected topology (major=%d minor=%d), " + "falling back to index-based cutoff", + major_cnt, minor_cnt); + for (int32_t i = 0; i < logical_count && i < total_launched; ++i) { + s_thread_survive[i] = true; + } + } + + s_gate_ready.store(1, std::memory_order_release); + } + + // Wait for classification to complete + while (s_gate_ready.load(std::memory_order_acquire) == 0) { + } + + bool survive = (idx < total_launched) ? s_thread_survive[idx] : false; + + // Last thread resets state for next invocation + int32_t finished = s_reported.load(std::memory_order_acquire); + (void)finished; + // Reset is deferred: the statics persist but are re-initialized by the CAS winner + // on next call. We reset the atomics after all threads have read their result. + // Use a second atomic counter for cleanup. 
+ static std::atomic<int32_t> s_cleanup{0}; + int32_t cleanup_idx = s_cleanup.fetch_add(1, std::memory_order_acq_rel); + if (cleanup_idx + 1 == total_launched) { + s_cpumask.store(0, std::memory_order_release); + s_reported.store(0, std::memory_order_release); + s_gate_init.store(0, std::memory_order_release); + s_gate_ready.store(0, std::memory_order_release); + s_cleanup.store(0, std::memory_order_release); + } + + if (!survive) { + LOG_INFO("AICPU affinity gate: thread idx=%d cpu=%d DROPPED", idx, normalized_cpu); + } else { + LOG_INFO("AICPU affinity gate: thread idx=%d cpu=%d ACTIVE", idx, normalized_cpu); + } + + return survive; +} diff --git a/src/a5/platform/onboard/host/device_runner.cpp b/src/a5/platform/onboard/host/device_runner.cpp index 86f8b4db..8b7ce174 100644 --- a/src/a5/platform/onboard/host/device_runner.cpp +++ b/src/a5/platform/onboard/host/device_runner.cpp @@ -414,7 +414,8 @@ int DeviceRunner::run(Runtime& runtime, std::cout << "\n=== launch_aicpu_kernel DynTileFwkKernelServer===" << '\n'; // Launch AICPU main kernel - rc = launch_aicpu_kernel(stream_aicpu_, &kernel_args_.args, "DynTileFwkKernelServer", launch_aicpu_num); + rc = launch_aicpu_kernel(stream_aicpu_, &kernel_args_.args, "DynTileFwkKernelServer", + PLATFORM_MAX_AICPU_THREADS_JUST_FOR_LAUNCH); if (rc != 0) { LOG_ERROR("launch_aicpu_kernel (main) failed: %d", rc); if (kernel_args_.args.regs != 0) { diff --git a/src/a5/platform/sim/aicpu/platform_aicpu_affinity.cpp b/src/a5/platform/sim/aicpu/platform_aicpu_affinity.cpp new file mode 100644 index 00000000..9495ee20 --- /dev/null +++ b/src/a5/platform/sim/aicpu/platform_aicpu_affinity.cpp @@ -0,0 +1,32 @@ +#include "aicpu/platform_aicpu_affinity.h" + +#include <atomic> +#include <cstdint> + +#include "common/unified_log.h" + +static std::atomic<int32_t> s_thread_counter{0}; +static std::atomic<int32_t> s_cleanup_counter{0}; + +bool platform_aicpu_affinity_gate(int32_t logical_count, int32_t total_launched) { + if (logical_count >= total_launched) { + return true; + } + + int32_t
idx = s_thread_counter.fetch_add(1, std::memory_order_acq_rel); + bool survive = (idx < logical_count); + + if (!survive) { + LOG_INFO("AICPU affinity gate (sim): thread idx=%d DROPPED (logical=%d, launched=%d)", + idx, logical_count, total_launched); + } + + // Last thread resets state for next invocation + int32_t cleanup_idx = s_cleanup_counter.fetch_add(1, std::memory_order_acq_rel); + if (cleanup_idx + 1 == total_launched) { + s_thread_counter.store(0, std::memory_order_release); + s_cleanup_counter.store(0, std::memory_order_release); + } + + return survive; +} diff --git a/src/a5/platform/sim/host/CMakeLists.txt b/src/a5/platform/sim/host/CMakeLists.txt index bdb0ce53..cb616ed9 100644 --- a/src/a5/platform/sim/host/CMakeLists.txt +++ b/src/a5/platform/sim/host/CMakeLists.txt @@ -32,6 +32,7 @@ list(APPEND HOST_RUNTIME_SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/host_log.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/unified_log_host.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/performance_collector.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/../../sim/aicpu/platform_aicpu_affinity.cpp" ) if(DEFINED CUSTOM_SOURCE_DIRS) diff --git a/src/a5/platform/sim/host/device_runner.cpp b/src/a5/platform/sim/host/device_runner.cpp index 21c56db1..8f8c98d6 100644 --- a/src/a5/platform/sim/host/device_runner.cpp +++ b/src/a5/platform/sim/host/device_runner.cpp @@ -15,6 +15,7 @@ */ #include "device_runner.h" +#include "aicpu/platform_aicpu_affinity.h" // Function pointer types for dynamically loaded executors typedef int (*aicpu_execute_func_t)(Runtime* runtime); @@ -277,11 +278,15 @@ int DeviceRunner::run(Runtime& runtime, // Set platform regs in the AICPU .so before launching threads set_platform_regs_func_(kernel_args_.regs); - // Launch AICPU threads - LOG_INFO("Launching %d AICPU thread(s)", launch_aicpu_num); + // Launch AICPU threads (over-launch for affinity gate) + constexpr int over_launch = PLATFORM_MAX_AICPU_THREADS_JUST_FOR_LAUNCH; + LOG_INFO("Launching %d 
AICPU threads (logical=%d)", over_launch, launch_aicpu_num); std::vector<std::thread> aicpu_threads; - for (int i = 0; i < launch_aicpu_num; i++) { - aicpu_threads.emplace_back([this, &runtime]() { + for (int i = 0; i < over_launch; i++) { + aicpu_threads.emplace_back([this, &runtime, launch_aicpu_num, over_launch]() { + if (!platform_aicpu_affinity_gate(launch_aicpu_num, over_launch)) { + return; + } aicpu_execute_func_(&runtime); }); } diff --git a/src/a5/runtime/host_build_graph/aicore/aicore_executor.cpp b/src/a5/runtime/host_build_graph/aicore/aicore_executor.cpp index d1b222ec..724187db 100644 --- a/src/a5/runtime/host_build_graph/aicore/aicore_executor.cpp +++ b/src/a5/runtime/host_build_graph/aicore/aicore_executor.cpp @@ -26,9 +26,9 @@ __aicore__ __attribute__((weak)) void aicore_execute(__gm__ Runtime* runtime, in // Phase 2: Report physical core ID, signal ready my_hank->physical_core_id = get_physical_core_id(); my_hank->aicore_regs_ready = 1; - dcci(my_hank, SINGLE_CACHE_LINE, CACHELINE_OUT); + dcci(&my_hank->aicore_regs_ready, SINGLE_CACHE_LINE, CACHELINE_OUT); while (my_hank->aicpu_regs_ready == 0) { - dcci(my_hank, SINGLE_CACHE_LINE); + dcci(&my_hank->aicpu_regs_ready, SINGLE_CACHE_LINE); } // Report initial idle status via register write_reg(RegId::COND, AICORE_IDLE_VALUE); @@ -71,9 +71,8 @@ __aicore__ __attribute__((weak)) void aicore_execute(__gm__ Runtime* runtime, in if (profiling_enabled) { uint64_t end_time = get_sys_cnt_aicore(); __gm__ PerfBuffer* perf_buf = (__gm__ PerfBuffer*)my_hank->perf_records_addr; - perf_aicore_record_task(perf_buf, task_ptr->task_id, task_ptr->func_id, - start_time, end_time, kernel_ready_time, - core_type); + perf_aicore_record_task(perf_buf, actual_task_id, + start_time, end_time, kernel_ready_time); } last_task_id = task_id; @@ -83,5 +82,5 @@ __aicore__ __attribute__((weak)) void aicore_execute(__gm__ Runtime* runtime, in } // Flush all dirty cache lines to HBM before kernel exit.
- dcci(my_hank, ENTIRE_DATA_CACHE, CACHELINE_OUT); + dcci(my_hank, SINGLE_CACHE_LINE, CACHELINE_OUT); } diff --git a/src/a5/runtime/host_build_graph/aicpu/aicpu_executor.cpp b/src/a5/runtime/host_build_graph/aicpu/aicpu_executor.cpp index 0fbd7b39..d2923711 100644 --- a/src/a5/runtime/host_build_graph/aicpu/aicpu_executor.cpp +++ b/src/a5/runtime/host_build_graph/aicpu/aicpu_executor.cpp @@ -35,9 +35,9 @@ struct AicpuExecutor { int thread_num_{0}; int cores_total_num_{0}; - int thread_cores_num_{0}; - int aic_per_thread_{0}; // Fixed number of AIC cores per thread - int aiv_per_thread_{0}; // Fixed number of AIV cores per thread + int thread_cores_num_[MAX_AICPU_THREADS]{}; // Total cores (AIC+AIV) assigned to each thread + int aic_per_thread_{0}; // Max AIC cores per thread (ceil), used as local queue cap + int aiv_per_thread_{0}; // Max AIV cores per thread (ceil), used as local queue cap int core_assignments_[MAX_AICPU_THREADS][MAX_CORES_PER_THREAD]; // Core discovery arrays (space-time tradeoff: avoid sorting) @@ -251,10 +251,7 @@ int AicpuExecutor::init(Runtime* runtime) { return -1; } - // Calculate cores per thread - thread_cores_num_ = cores_total_num_ / thread_num_; - - LOG_INFO("Config: threads=%d, cores=%d, cores_per_thread=%d", thread_num_, cores_total_num_, thread_cores_num_); + LOG_INFO("Config: threads=%d, cores=%d", thread_num_, cores_total_num_); for (int i = 0; i < cores_total_num_; i++) { pending_task_ids_[i] = AICPU_TASK_INVALID; @@ -389,60 +386,48 @@ int AicpuExecutor::handshake_all_cores(Runtime* runtime) { return 0; } -// Assign discovered cores to threads (requires even distribution) +// Assign discovered cores to threads using round-robin void AicpuExecutor::assign_cores_to_threads() { - if (aic_count_ % thread_num_ != 0) { - LOG_ERROR("AIC cores (%d) cannot be evenly distributed to %d threads", aic_count_, thread_num_); - init_failed_.store(true, std::memory_order_release); - return; - } - - if (aiv_count_ % thread_num_ != 0) { - 
LOG_ERROR("AIV cores (%d) cannot be evenly distributed to %d threads", aiv_count_, thread_num_); - init_failed_.store(true, std::memory_order_release); - return; - } + // Round-robin: AIC core i → thread (i % thread_num_), AIV core i → thread (i % thread_num_). + // AIC and AIV are assigned independently; no cluster pairing is required. + // aic_per_thread_ / aiv_per_thread_ store the ceiling value and serve as local queue caps. + aic_per_thread_ = (aic_count_ + thread_num_ - 1) / thread_num_; + aiv_per_thread_ = (aiv_count_ + thread_num_ - 1) / thread_num_; - aic_per_thread_ = aic_count_ / thread_num_; - aiv_per_thread_ = aiv_count_ / thread_num_; - - LOG_INFO("Core Assignment: %d AIC/thread, %d AIV/thread", aic_per_thread_, aiv_per_thread_); + LOG_INFO("Core Assignment: %d AIC cores, %d AIV cores across %d threads (max %d AIC/thread, %d AIV/thread)", + aic_count_, aiv_count_, thread_num_, aic_per_thread_, aiv_per_thread_); for (int t = 0; t < thread_num_; t++) { int core_idx = 0; - // Assign AIC cores to this thread - int aic_start = t * aic_per_thread_; - int aic_end = (t + 1) * aic_per_thread_; - for (int i = aic_start; i < aic_end; i++) { + // Assign AIC cores: cores at indices t, t+thread_num_, t+2*thread_num_, ... 
+ for (int i = t; i < aic_count_; i += thread_num_) { core_assignments_[t][core_idx++] = aic_cores_[i].worker_id; } - // Assign AIV cores to this thread - int aiv_start = t * aiv_per_thread_; - int aiv_end = (t + 1) * aiv_per_thread_; - for (int i = aiv_start; i < aiv_end; i++) { + // Assign AIV cores after AIC cores + for (int i = t; i < aiv_count_; i += thread_num_) { core_assignments_[t][core_idx++] = aiv_cores_[i].worker_id; } + thread_cores_num_[t] = core_idx; + char log_buffer[256]; int offset = 0; offset += snprintf( log_buffer + offset, sizeof(log_buffer) - offset, "Thread %d: assigned %d cores - AIC[", t, core_idx); - for (int i = 0; i < aic_per_thread_; i++) { - if (i > 0) offset += snprintf(log_buffer + offset, sizeof(log_buffer) - offset, ","); - offset += - snprintf(log_buffer + offset, sizeof(log_buffer) - offset, "%d", aic_cores_[aic_start + i].worker_id); + for (int k = 0, i = t; i < aic_count_; i += thread_num_, k++) { + if (k > 0) offset += snprintf(log_buffer + offset, sizeof(log_buffer) - offset, ","); + offset += snprintf(log_buffer + offset, sizeof(log_buffer) - offset, "%d", aic_cores_[i].worker_id); } offset += snprintf(log_buffer + offset, sizeof(log_buffer) - offset, "] AIV["); - for (int i = 0; i < aiv_per_thread_; i++) { - if (i > 0) offset += snprintf(log_buffer + offset, sizeof(log_buffer) - offset, ","); - offset += - snprintf(log_buffer + offset, sizeof(log_buffer) - offset, "%d", aiv_cores_[aiv_start + i].worker_id); + for (int k = 0, i = t; i < aiv_count_; i += thread_num_, k++) { + if (k > 0) offset += snprintf(log_buffer + offset, sizeof(log_buffer) - offset, ","); + offset += snprintf(log_buffer + offset, sizeof(log_buffer) - offset, "%d", aiv_cores_[i].worker_id); } offset += snprintf(log_buffer + offset, sizeof(log_buffer) - offset, "]"); @@ -554,9 +539,9 @@ void AicpuExecutor::classify_and_distribute_initial_tasks(Runtime* runtime) { int AicpuExecutor::shutdown_aicore(Runtime* runtime, int thread_idx, const int* 
cur_thread_cores) { Handshake* all_handshakes = (Handshake*)runtime->workers; - LOG_INFO("Thread %d: Shutting down %d cores", thread_idx, thread_cores_num_); + LOG_INFO("Thread %d: Shutting down %d cores", thread_idx, thread_cores_num_[thread_idx]); - for (int i = 0; i < thread_cores_num_; i++) { + for (int i = 0; i < thread_cores_num_[thread_idx]; i++) { int core_id = cur_thread_cores[i]; Handshake* hank = &all_handshakes[core_id]; LOG_INFO("Thread %d: AICPU hank addr = 0x%lx", thread_idx, (uint64_t)hank); @@ -648,6 +633,8 @@ int AicpuExecutor::resolve_and_dispatch(Runtime& runtime, int thread_idx, const if (count > 0) { PerfRecord* record = &perf_buf->records[count - 1]; if (record->task_id == static_cast<uint32_t>(completed_task_id)) { + record->func_id = runtime.tasks[completed_task_id].func_id; + record->core_type = h->core_type; perf_aicpu_record_dispatch_and_finish_time( record, dispatch_timestamps_[core_id], finish_ts); } @@ -783,6 +770,8 @@ if (count > 0) { PerfRecord* record = &perf_buf->records[count - 1]; if (record->task_id == static_cast<uint32_t>(completed_task_id)) { + record->func_id = runtime.tasks[completed_task_id].func_id; + record->core_type = h->core_type; perf_aicpu_record_dispatch_and_finish_time( record, dispatch_timestamps_[core_id], finish_ts); } @@ -995,7 +984,7 @@ int AicpuExecutor::run(Runtime* runtime) { const int* cur_thread_cores = core_assignments_[thread_idx]; LOG_INFO("Thread %d: Runtime has %d tasks", thread_idx, runtime->get_task_count()); - int completed = resolve_and_dispatch(*runtime, thread_idx, cur_thread_cores, thread_cores_num_); + int completed = resolve_and_dispatch(*runtime, thread_idx, cur_thread_cores, thread_cores_num_[thread_idx]); LOG_INFO("Thread %d: Executed %d tasks from runtime", thread_idx, completed); int rc = shutdown_aicore(runtime, thread_idx, cur_thread_cores); @@ -1005,7 +994,7 @@ // Flush
performance buffers for cores managed by this thread if (runtime->enable_profiling) { - perf_aicpu_flush_buffers(runtime, thread_idx, cur_thread_cores, thread_cores_num_); + perf_aicpu_flush_buffers(runtime, thread_idx, cur_thread_cores, thread_cores_num_[thread_idx]); } LOG_INFO("Thread %d: Completed", thread_idx); diff --git a/src/a5/runtime/tensormap_and_ringbuffer/aicore/aicore_executor.cpp b/src/a5/runtime/tensormap_and_ringbuffer/aicore/aicore_executor.cpp index 8a0c1095..796407ce 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/aicore/aicore_executor.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/aicore/aicore_executor.cpp @@ -16,13 +16,13 @@ typedef void (*UnifiedKernelFunc)(__gm__ int64_t*); /** * Execute task from PTO2DispatchPayload. * - * Directly accesses PTO2DispatchPayload fields for task execution, - * matching ref_runtime implementation for a2a3 compatibility. + * Reads function_bin_addr and args from the dispatch payload. * - * @param task_ptr Pointer to PTO2DispatchPayload in global memory + * @param payload Pointer to PTO2DispatchPayload in global memory */ -__aicore__ __attribute__((always_inline)) static void execute_task(__gm__ void* task_ptr) { - __gm__ PTO2DispatchPayload* payload = reinterpret_cast<__gm__ PTO2DispatchPayload*>(task_ptr); +__aicore__ __attribute__((always_inline)) static void execute_task( + __gm__ PTO2DispatchPayload* payload +) { if (payload == nullptr || payload->function_bin_addr == 0) { return; } @@ -40,11 +40,11 @@ __aicore__ __attribute__((always_inline)) static void execute_task(__gm__ void* * 2. Report physical core ID and core type, signal AICore ready * 3. Poll DATA_MAIN_BASE register for task dispatch until exit signal * - * Task dispatch uses PTO2DispatchPayload from per-core payload array. - * Supports performance profiling when runtime->enable_profiling is true. + * Task dispatch reads PTO2DispatchPayload address from Handshake.task. 
+ * Task ID is derived from the register value (task_id + 1 encoding). * * @param runtime Pointer to Runtime in global memory - * @param core_idx Core index (core ID) + * @param core_idx Block index (core ID) * @param core_type Core type (AIC or AIV) */ __aicore__ __attribute__((weak)) void aicore_execute(__gm__ Runtime* runtime, int core_idx, CoreType core_type) { @@ -72,7 +72,7 @@ __aicore__ __attribute__((weak)) void aicore_execute(__gm__ Runtime* runtime, in dcci(my_hank, SINGLE_CACHE_LINE, CACHELINE_OUT); - // Read per-core payload address from hank->task (written by AICPU before aicpu_ready) + // Cache payload address (set once by AICPU during initialization, never changes) __gm__ PTO2DispatchPayload* payload = reinterpret_cast<__gm__ PTO2DispatchPayload*>(my_hank->task); @@ -80,46 +80,49 @@ __aicore__ __attribute__((weak)) void aicore_execute(__gm__ Runtime* runtime, in uint64_t kernel_ready_time = get_sys_cnt_aicore(); // Phase 4: Main execution loop - poll register for tasks until exit signal - uint32_t task_id = AICPU_IDLE_TASK_ID; - uint32_t last_task_id = AICPU_IDLE_TASK_ID; + // Register encoding: AICPU_IDLE_TASK_ID=idle, task_id=task, AICORE_EXIT_SIGNAL=exit + uint32_t reg_val = AICPU_IDLE_TASK_ID; + uint32_t last_reg_val = AICPU_IDLE_TASK_ID; while (true) { - task_id = static_cast(read_reg(RegId::DATA_MAIN_BASE)); - if (task_id == AICORE_EXIT_SIGNAL) { + reg_val = static_cast(read_reg(RegId::DATA_MAIN_BASE)); + if (reg_val == AICORE_EXIT_SIGNAL) { // Signal exit acknowledgment to AICPU write_reg(RegId::COND, AICORE_EXITED_VALUE); break; } - // Execute task if new (task_id encoding: AICPU_IDLE_TASK_ID=idle, task_id=task) - if (task_id == AICPU_IDLE_TASK_ID || task_id == last_task_id) { + // Execute task if new (reg_val encoding: AICPU_IDLE_TASK_ID=idle, task_id=task) + if (reg_val == AICPU_IDLE_TASK_ID || reg_val == last_reg_val) { SPIN_WAIT_HINT(); continue; } { - // Invalidate cache to read fresh payload written by AICPU + uint32_t task_id = 
reg_val; // Decode: register holds task_id directly + + // Invalidate payload buffer (AICPU updates its content each dispatch) dcci(payload, ENTIRE_DATA_CACHE); - write_reg(RegId::COND, MAKE_ACK_VALUE(payload->task_id)); + write_reg(RegId::COND, MAKE_ACK_VALUE(task_id)); // Performance profiling: record start time uint64_t start_time = get_sys_cnt_aicore(); // Execute the task - execute_task(reinterpret_cast<__gm__ void*>(payload)); + execute_task(payload); // Performance profiling: record task execution + // (func_id and core_type are filled by AICPU at completion time) if (profiling_enabled) { uint64_t end_time = get_sys_cnt_aicore(); __gm__ PerfBuffer* perf_buf = (__gm__ PerfBuffer*)my_hank->perf_records_addr; - perf_aicore_record_task(perf_buf, payload->task_id, payload->kernel_id, - start_time, end_time, kernel_ready_time, - core_type); + perf_aicore_record_task(perf_buf, task_id, + start_time, end_time, kernel_ready_time); } - last_task_id = task_id; - write_reg(RegId::COND, MAKE_FIN_VALUE(payload->task_id)); + last_reg_val = reg_val; + write_reg(RegId::COND, MAKE_FIN_VALUE(task_id)); } } diff --git a/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp index aab32d61..ad2b098d 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp @@ -4,15 +4,11 @@ #include #include #include -#include -#include -#include #include #include #include #ifdef __linux__ -#include #include #endif @@ -64,17 +60,18 @@ constexpr int32_t MAX_CORES_PER_THREAD = PLATFORM_MAX_CORES_PER_THREAD; constexpr int32_t MAX_IDLE_ITERATIONS = 800000; // ~20s idle then scheduler gives up (avoid long hang) constexpr int32_t STALL_LOG_INTERVAL = 50000; // DEV_ALWAYS every N idle iters to debug hang +constexpr int32_t FATAL_ERROR_CHECK_INTERVAL = 1024; // Check orchestrator error every N idle iters constexpr int32_t 
STALL_DUMP_READY_MAX = 8; constexpr int32_t STALL_DUMP_WAIT_MAX = 4; constexpr int32_t STALL_DUMP_CORE_MAX = 8; constexpr int32_t PROGRESS_VERBOSE_THRESHOLD = 10; // log every completion for the first N tasks constexpr int32_t PROGRESS_LOG_INTERVAL = 250; // log every N completions after threshold -// PTO2 device-mode state (per-core dispatch payloads) -static PTO2DispatchPayload s_pto2_payload_per_core[RUNTIME_MAX_WORKER]; - static PTO2Runtime *rt{nullptr}; +// Per-core dispatch payload storage (one per physical core) +static PTO2DispatchPayload s_pto2_payload_per_core[RUNTIME_MAX_WORKER]; + // Core information for discovery (with register address for fast dispatch) struct CoreInfo { int32_t worker_id; // Index in runtime.workers[] @@ -98,16 +95,57 @@ struct CoreTypeTracker { idle[idle_count++] = running[idx]; running[idx] = running[--running_count]; } + + int32_t find_idle_index(int32_t core_id) { + for (int32_t i = 0; i < idle_count; i++) { + if (idle[i] == core_id) return i; + } + return -1; + } +}; + +struct Cluster { + int32_t aic_core_id; + int32_t aiv_core_ids[2]; }; struct CoreStateTracker { CoreTypeTracker by_type[2]; // indexed by static_cast(CoreType) + Cluster clusters[MAX_AIC_PER_THREAD]; + int32_t cluster_count; + bool core_idle[MAX_CORES_PER_THREAD]; CoreTypeTracker& aic() { return by_type[0]; } CoreTypeTracker& aiv() { return by_type[1]; } template CoreTypeTracker& get() { return by_type[static_cast(CT)]; } + + int32_t find_cluster_for_shape(PTO2ResourceShape shape) { + for (int32_t i = 0; i < cluster_count; i++) { + Cluster& c = clusters[i]; + switch (shape) { + case PTO2ResourceShape::AIC_ONLY: + if (core_idle[c.aic_core_id]) return i; + break; + case PTO2ResourceShape::AIV_X1: + if (core_idle[c.aiv_core_ids[0]] || core_idle[c.aiv_core_ids[1]]) return i; + break; + case PTO2ResourceShape::AIV_X2: + if (core_idle[c.aiv_core_ids[0]] && core_idle[c.aiv_core_ids[1]]) return i; + break; + case PTO2ResourceShape::AIC_AIV_X1: + if 
(core_idle[c.aic_core_id] && + (core_idle[c.aiv_core_ids[0]] || core_idle[c.aiv_core_ids[1]])) return i; + break; + case PTO2ResourceShape::AIC_AIV_X2: + if (core_idle[c.aic_core_id] && + core_idle[c.aiv_core_ids[0]] && core_idle[c.aiv_core_ids[1]]) return i; + break; + } + } + return -1; + } }; struct AicpuExecutor { @@ -136,11 +174,26 @@ struct AicpuExecutor { // Fast lookup: core_id -> reg_addr (for register-based dispatch) uint64_t core_id_to_reg_addr_[MAX_CORES_PER_THREAD]; + // Per-core monotonic dispatch counter for register protocol uniqueness. + // Multi-ring task_ids can collide in the lower 32 bits (e.g., ring 0 local 0 + // and ring 1 local 0 both truncate to 0), breaking the AICore's last_reg_val + // duplicate detection and causing false-positive COND completion. A per-core + // counter guarantees each dispatch writes a unique DATA_MAIN_BASE value. + uint32_t dispatch_seq_by_core_[RUNTIME_MAX_WORKER]{}; + + // Per-core subtask slot tracking (which PTO2SubtaskSlot is running on each core) + PTO2SubtaskSlot executing_subslot_by_core_[RUNTIME_MAX_WORKER]{}; + + // Per-core slot state tracking (PTO2TaskSlotState* for the running task on each core) + PTO2TaskSlotState* executing_slot_state_by_core_[RUNTIME_MAX_WORKER]{}; + // Platform register base address array (set via get_platform_regs()) uint64_t regs_{0}; - // Track executing task_id per core (AICPU_TASK_INVALID = idle) - int32_t executing_task_ids_[MAX_AICPU_THREADS][MAX_CORES_PER_THREAD]; + // Track executing register task_id per core (AICPU_TASK_INVALID = idle). + // NOTE: this is NOT the mixed_task_id; it is the per-core dispatch id used by the + // register protocol (derived from dispatch_seq_by_core_ and masked by TASK_ID_MASK). 
+ int32_t executing_reg_task_ids_[MAX_AICPU_THREADS][MAX_CORES_PER_THREAD]; CoreStateTracker trackers_[MAX_AICPU_THREADS]; // ===== Task queue state (managed by scheduler ready queues) ===== @@ -175,6 +228,12 @@ struct AicpuExecutor { uint64_t dispatch_timestamps_[RUNTIME_MAX_WORKER]; // Per-core AICPU dispatch timestamp uint32_t core_dispatch_counts_[RUNTIME_MAX_WORKER]; // Per-core total dispatched task counter (for buffer management) + uint64_t* func_id_to_addr_; + uint64_t get_function_bin_addr(int func_id) const { + if (func_id < 0 || func_id >= RUNTIME_MAX_FUNC_ID) return 0; + return func_id_to_addr_[func_id]; + } + // ===== Methods ===== int32_t init(Runtime* runtime); int32_t handshake_all_cores(Runtime* runtime); @@ -188,56 +247,50 @@ struct AicpuExecutor { void diagnose_stuck_state( Runtime* runtime, int32_t thread_idx, const int32_t* cur_thread_cores, int32_t core_num, Handshake* hank); - // Build PTO2DispatchPayload from PTO2TaskDescriptor. - template - void build_pto2_payload(PTO2DispatchPayload* out, - Runtime* runtime, - PTO2TaskDescriptor* task, - PTO2TaskPayload* task_payload) { - out->task_id = task->task_id; - out->kernel_id = task->kernel_id; - out->core_type = CT; - out->function_bin_addr = runtime->get_function_bin_addr(task->kernel_id); + // Build slim PTO2DispatchPayload: only function_bin_addr + args. + // Metadata (mixed_task_id, subslot, kernel_id, core_type) stays in TaskDescriptor. + // Dispatch order: tensor args first, then scalar args. 
+ void build_pto2_payload(PTO2DispatchPayload& out, + int32_t kernel_id, + PTO2TaskPayload& task_pl) { + out.function_bin_addr = get_function_bin_addr(kernel_id); int32_t n = 0; - - for (int32_t i = 0; i < task_payload->param_count; i++) { - if (!task_payload->is_tensor[i]) { - out->args[n++] = task_payload->scalar_value[i]; - } else { - out->args[n++] = reinterpret_cast(&task_payload->tensors[i]); - task_payload->tensors[i].update_start_offset(); - } + for (int32_t i = 0; i < task_pl.tensor_count; i++) { + task_pl.tensors[i].update_start_offset(); + out.args[n++] = reinterpret_cast(&task_pl.tensors[i]); + } + for (int32_t i = 0; i < task_pl.scalar_count; i++) { + out.args[n++] = task_pl.scalars[i]; } - - out->num_args = n; } // Template methods for Phase 1 and Phase 2 template void check_running_cores_for_completion(int32_t thread_idx, CoreTypeTracker& ct, + bool* core_idle, Handshake* hank, - int32_t* executing_task_ids, + int32_t* executing_reg_task_ids, int32_t& completed_this_turn, int32_t& cur_thread_completed, bool& made_progress, - int32_t deferred_release_ids[], + PTO2TaskSlotState* deferred_release_slot_states[], int32_t& deferred_release_count, - PTO2LocalReadyBuffer& local_buf + PTO2LocalReadyBuffer* local_bufs #if PTO2_PROFILING , bool profiling_enabled, + uint32_t& phase_complete_count +#endif +#if PTO2_SCHED_PROFILING + , uint64_t& complete_probe_count, uint64_t& complete_hit_count, - uint32_t& phase_complete_count, uint64_t& notify_edges_total, int32_t& notify_max_degree, uint64_t& notify_tasks_enqueued, uint64_t& fanin_edges_total, - int32_t& fanin_max_degree -#endif -#if PTO2_SCHED_PROFILING - , + int32_t& fanin_max_degree, uint64_t& sched_complete_perf_cycle #endif ) { @@ -245,12 +298,12 @@ struct AicpuExecutor { int32_t core_id = ct.running[i]; uint64_t reg_addr = core_id_to_reg_addr_[core_id]; - int32_t task_id = executing_task_ids[core_id]; + int32_t expected_reg_task_id = executing_reg_task_ids[core_id]; uint64_t reg_val = read_reg(reg_addr, 
RegId::COND); int32_t reg_task_id = EXTRACT_TASK_ID(reg_val); int32_t reg_state = EXTRACT_TASK_STATE(reg_val); - bool done = reg_task_id == task_id && reg_state == TASK_FIN_STATE; -#if PTO2_PROFILING + bool done = reg_task_id == expected_reg_task_id && reg_state == TASK_FIN_STATE; +#if PTO2_SCHED_PROFILING if (profiling_enabled) { complete_probe_count++; if (done) { @@ -260,45 +313,48 @@ struct AicpuExecutor { #endif if (done) { - executing_task_ids[core_id] = AICPU_TASK_INVALID; + executing_reg_task_ids[core_id] = AICPU_TASK_INVALID; + PTO2SubtaskSlot subslot = executing_subslot_by_core_[core_id]; + PTO2TaskSlotState& slot_state = *executing_slot_state_by_core_[core_id]; + + // Two-stage completion: mark subtask done, then handle mixed-task completion + bool mixed_complete = rt->scheduler.on_subtask_complete(slot_state, subslot); + if (mixed_complete) { #if PTO2_SCHED_PROFILING - PTO2DispatchPayload* payload = &s_pto2_payload_per_core[core_id]; - PTO2CompletionStats cstats = rt->scheduler.on_task_complete(task_id, thread_idx, &local_buf); - notify_edges_total += cstats.fanout_edges; - if (cstats.fanout_edges > notify_max_degree) notify_max_degree = cstats.fanout_edges; - notify_tasks_enqueued += cstats.tasks_enqueued; - phase_complete_count++; -#elif PTO2_PROFILING - PTO2DispatchPayload* payload = &s_pto2_payload_per_core[core_id]; - PTO2CompletionStats cstats = rt->scheduler.on_task_complete(task_id, &local_buf); - notify_edges_total += cstats.fanout_edges; - if (cstats.fanout_edges > notify_max_degree) notify_max_degree = cstats.fanout_edges; - notify_tasks_enqueued += cstats.tasks_enqueued; - phase_complete_count++; + PTO2CompletionStats cstats = rt->scheduler.on_mixed_task_complete(slot_state, thread_idx, local_bufs); + notify_edges_total += cstats.fanout_edges; + if (cstats.fanout_edges > notify_max_degree) notify_max_degree = cstats.fanout_edges; + notify_tasks_enqueued += cstats.tasks_enqueued; + phase_complete_count++; #else - 
rt->scheduler.on_task_complete(task_id, &local_buf); + rt->scheduler.on_mixed_task_complete(slot_state, local_bufs); +#if PTO2_PROFILING + phase_complete_count++; #endif - if (deferred_release_count < 64) { - deferred_release_ids[deferred_release_count++] = task_id; - } else { - DEV_ALWAYS("Thread %d: release", thread_idx); - while (deferred_release_count > 0) { +#endif + if (deferred_release_count < 256) { + deferred_release_slot_states[deferred_release_count++] = &slot_state; + } else { + DEV_ALWAYS("Thread %d: release", thread_idx); + while (deferred_release_count > 0) { #if PTO2_SCHED_PROFILING - int32_t fe = - rt->scheduler.on_task_release(deferred_release_ids[--deferred_release_count], thread_idx); + int32_t fe = rt->scheduler.on_task_release( + *deferred_release_slot_states[--deferred_release_count], thread_idx); #else - int32_t fe = rt->scheduler.on_task_release(deferred_release_ids[--deferred_release_count]); + int32_t fe = + rt->scheduler.on_task_release(*deferred_release_slot_states[--deferred_release_count]); #endif - (void)fe; -#if PTO2_PROFILING - fanin_edges_total += fe; - if (fe > fanin_max_degree) fanin_max_degree = fe; + (void)fe; +#if PTO2_SCHED_PROFILING + fanin_edges_total += fe; + if (fe > fanin_max_degree) fanin_max_degree = fe; #endif + } + deferred_release_slot_states[deferred_release_count++] = &slot_state; } - deferred_release_ids[deferred_release_count++] = task_id; } ct.move_running_to_idle(i); - + core_idle[core_id] = true; #if PTO2_PROFILING if (profiling_enabled) { #if PTO2_SCHED_PROFILING @@ -311,9 +367,27 @@ struct AicpuExecutor { uint32_t count = perf_buf->count; if (count > 0) { PerfRecord* record = &perf_buf->records[count - 1]; - if (record->task_id == static_cast(payload->task_id)) { + if (record->task_id == static_cast(expected_reg_task_id)) { + // Fill metadata that AICore doesn't know + int32_t perf_slot_idx = static_cast(executing_subslot_by_core_[core_id]); + record->func_id = slot_state.task->kernel_id[perf_slot_idx]; + 
record->core_type = CT; perf_aicpu_record_dispatch_and_finish_time( record, dispatch_timestamps_[core_id], finish_ts); + + // Fill ring_id from slot state + record->ring_id = slot_state.ring_id; + + // Fill fanout from slot_state's dependency linked list. + // No lock: head-insert guarantees existing nodes' next pointers + // are stable, so this snapshot is consistent (best-effort). + record->fanout_count = 0; + PTO2DepListEntry* cur = slot_state.fanout_head; + while (cur != nullptr && record->fanout_count < RUNTIME_MAX_FANOUT) { + record->fanout[record->fanout_count++] = static_cast( + pto2_task_id_local(cur->slot_state->task->mixed_task_id)); + cur = cur->next; + } } } #if PTO2_SCHED_PROFILING @@ -322,95 +396,148 @@ struct AicpuExecutor { } #endif - DEV_DEBUG("Thread %d: %s core %d completed PTO2 task %d", + DEV_DEBUG("Thread %d: %s core %d completed PTO2 task %d (mixed_complete=%d)", thread_idx, CT == CoreType::AIC ? "AIC" : "AIV", core_id, - task_id); + expected_reg_task_id, + mixed_complete ? 
1 : 0); cur_thread_completed++; - completed_this_turn++; + if (mixed_complete) { + completed_this_turn++; + } made_progress = true; } } } - template - void dispatch_ready_tasks_to_idle_cores(Runtime* runtime, - int32_t thread_idx, - CoreTypeTracker& ct, - int32_t* executing_task_ids, - bool& made_progress, - PTO2TaskDescriptor* task_descriptors, - PTO2TaskPayload* task_payloads, - int32_t window_mask -#if PTO2_PROFILING - , - bool profiling_enabled, - uint64_t& pop_hit, - uint64_t& pop_miss, - uint32_t& phase_dispatch_count -#endif + static const char* shape_name(PTO2ResourceShape shape) { + switch (shape) { + case PTO2ResourceShape::AIC_ONLY: return "AIC_ONLY"; + case PTO2ResourceShape::AIV_X1: return "AIV_X1"; + case PTO2ResourceShape::AIV_X2: return "AIV_X2"; + case PTO2ResourceShape::AIC_AIV_X1: return "AIC_AIV_X1"; + case PTO2ResourceShape::AIC_AIV_X2: return "AIC_AIV_X2"; + } + return "UNKNOWN"; + } + + struct ResourceCount { + int32_t aic; + int32_t aiv; + }; + + static constexpr ResourceCount shape_resource_count(PTO2ResourceShape shape) { + constexpr ResourceCount kTable[PTO2_NUM_RESOURCE_SHAPES] = { + {1, 0}, // AIC_ONLY = 0 + {0, 1}, // AIV_X1 = 1 + {0, 2}, // AIV_X2 = 2 + {1, 1}, // AIC_AIV_X1 = 3 + {1, 2}, // AIC_AIV_X2 = 4 + }; + return kTable[static_cast(shape)]; + } + + /** + * Returns the dispatch probe order for a given scheduler thread. + * Widest shapes first to avoid consuming cluster resources with narrow tasks. + * Even/odd threads use different fallback orders (AIC-first vs AIV-first) + * to reduce contention on the same ready queue across adjacent threads. 
+ */ + static const PTO2ResourceShape* get_dispatch_order(int32_t thread_idx) { + // Even threads: AIC-first fallback after widest + static constexpr PTO2ResourceShape kEvenOrder[PTO2_NUM_RESOURCE_SHAPES] = { + PTO2ResourceShape::AIC_AIV_X2, + PTO2ResourceShape::AIC_AIV_X1, + PTO2ResourceShape::AIC_ONLY, + PTO2ResourceShape::AIV_X2, + PTO2ResourceShape::AIV_X1, + }; + // Odd threads: AIV-first fallback after widest + static constexpr PTO2ResourceShape kOddOrder[PTO2_NUM_RESOURCE_SHAPES] = { + PTO2ResourceShape::AIC_AIV_X2, + PTO2ResourceShape::AIV_X2, + PTO2ResourceShape::AIC_AIV_X1, + PTO2ResourceShape::AIV_X1, + PTO2ResourceShape::AIC_ONLY, + }; + return (thread_idx % 2 == 0) ? kEvenOrder : kOddOrder; + } + + PTO2TaskSlotState* pop_ready_task(PTO2ResourceShape shape, int32_t thread_idx #if PTO2_SCHED_PROFILING - , - uint64_t& sched_dispatch_pop_cycle, - uint64_t& sched_dispatch_setup_cycle + , uint64_t& pop_hit, uint64_t& pop_miss + , uint64_t& sched_dispatch_pop_cycle #endif ) { - if (ct.idle_count > 0 && rt->scheduler.ready_queues[static_cast(CT)].size() > 0) { - for (int32_t i = ct.idle_count - 1; i >= 0; i--) { - int32_t core_id = ct.idle[i]; - + (void)thread_idx; #if PTO2_SCHED_PROFILING - extern uint64_t g_sched_pop_atomic_count[], g_sched_pop_wait_cycle[]; - uint64_t t_pop_start = get_sys_cnt_aicpu(); - int32_t task_id = rt->scheduler.get_ready_task( - g_sched_pop_atomic_count[thread_idx], g_sched_pop_wait_cycle[thread_idx]); - sched_dispatch_pop_cycle += (get_sys_cnt_aicpu() - t_pop_start); + extern uint64_t g_sched_pop_atomic_count[], g_sched_pop_wait_cycle[]; + uint64_t t_pop_start = get_sys_cnt_aicpu(); + PTO2TaskSlotState* slot_state = rt->scheduler.get_ready_task(shape, + g_sched_pop_atomic_count[thread_idx], g_sched_pop_wait_cycle[thread_idx]); + sched_dispatch_pop_cycle += (get_sys_cnt_aicpu() - t_pop_start); #else - int32_t task_id = rt->scheduler.get_ready_task(); -#endif - if (task_id >= 0) { -#if PTO2_PROFILING - pop_hit++; - 
phase_dispatch_count++; + PTO2TaskSlotState* slot_state = rt->scheduler.get_ready_task(shape); #endif + if (slot_state) { #if PTO2_SCHED_PROFILING - uint64_t t_setup_start = get_sys_cnt_aicpu(); -#endif - PTO2TaskDescriptor* task = &task_descriptors[task_id & window_mask]; - PTO2TaskPayload* task_pl = &task_payloads[task_id & window_mask]; - PTO2DispatchPayload* payload = &s_pto2_payload_per_core[core_id]; - build_pto2_payload(payload, runtime, task, task_pl); -#if PTO2_PROFILING - if (profiling_enabled) { - dispatch_timestamps_[core_id] = get_sys_cnt_aicpu(); - if (core_dispatch_counts_[core_id] >= PLATFORM_PROF_BUFFER_SIZE) { - perf_aicpu_switch_buffer(runtime, core_id, thread_idx); - core_dispatch_counts_[core_id] = 0; - } - core_dispatch_counts_[core_id]++; - } + pop_hit++; #endif - write_reg(core_id_to_reg_addr_[core_id], RegId::DATA_MAIN_BASE, static_cast(task_id)); - ct.move_idle_to_running(i); - executing_task_ids[core_id] = task_id; - made_progress = true; + } else { #if PTO2_SCHED_PROFILING - sched_dispatch_setup_cycle += (get_sys_cnt_aicpu() - t_setup_start); + pop_miss++; #endif - DEV_DEBUG("Thread %d: Dispatching PTO2 task %d to %s core %d", - thread_idx, - task_id, - CT == CoreType::AIC ? 
"AIC" : "AIV", - core_id); - } else { + } + return slot_state; + } + + void dispatch_subtask_to_core( + Runtime* runtime, CoreStateTracker& tracker, int32_t* executing_reg_task_ids, + int32_t core_id, CoreType core_type, PTO2TaskSlotState& slot_state, + PTO2SubtaskSlot subslot #if PTO2_PROFILING - pop_miss++; + , bool profiling_enabled, int32_t thread_idx #endif - break; - } + ) { + PTO2DispatchPayload& payload = s_pto2_payload_per_core[core_id]; + PTO2TaskDescriptor& task = *slot_state.task; + int32_t slot_idx = static_cast(subslot); + build_pto2_payload(payload, task.kernel_id[slot_idx], *slot_state.payload); + executing_subslot_by_core_[core_id] = subslot; + executing_slot_state_by_core_[core_id] = &slot_state; +#if PTO2_PROFILING + if (profiling_enabled) { + dispatch_timestamps_[core_id] = get_sys_cnt_aicpu(); + if (core_dispatch_counts_[core_id] >= PLATFORM_PROF_BUFFER_SIZE) { + perf_aicpu_switch_buffer(runtime, core_id, thread_idx); + core_dispatch_counts_[core_id] = 0; } + core_dispatch_counts_[core_id]++; + } +#endif + // Per-core monotonic counter for register protocol uniqueness. + // mixed_task_id encodes (ring_id << 32 | local_id); truncation to + // uint32 loses ring_id, so tasks from different rings with the same + // local_id would write identical DATA_MAIN_BASE values. The AICore + // uses last_reg_val to detect new dispatches and would skip the + // duplicate, while the stale COND register from the previous task + // (same local_id) would cause a false-positive completion. 
+ dispatch_seq_by_core_[core_id]++; + uint32_t reg_task_id = dispatch_seq_by_core_[core_id] & TASK_ID_MASK; + // Skip reserved sentinel values + while (reg_task_id == AICORE_IDLE_TASK_ID || + (reg_task_id + 1) == AICORE_EXIT_SIGNAL) { + dispatch_seq_by_core_[core_id]++; + reg_task_id = dispatch_seq_by_core_[core_id] & TASK_ID_MASK; } + write_reg(core_id_to_reg_addr_[core_id], RegId::DATA_MAIN_BASE, static_cast(reg_task_id)); + + CoreTypeTracker& ct = tracker.by_type[static_cast(core_type)]; + int32_t idle_idx = ct.find_idle_index(core_id); + ct.move_idle_to_running(idle_idx); + tracker.core_idle[core_id] = false; + executing_reg_task_ids[core_id] = reg_task_id; } }; @@ -451,6 +578,7 @@ int32_t AicpuExecutor::handshake_all_cores(Runtime* runtime) { bool handshake_failed = false; for (int32_t i = 0; i < cores_total_num_; i++) { Handshake* hank = &all_handshakes[i]; + while (hank->aicore_regs_ready == 0) { } @@ -510,59 +638,69 @@ int32_t AicpuExecutor::handshake_all_cores(Runtime* runtime) { * (Aligned with host_build_graph mechanism) */ void AicpuExecutor::assign_cores_to_threads() { - // Determine how many cores each thread gets initially: - // - Mixed mode: distribute among scheduler threads only - // - All-orchestrator mode: distribute among all threads (they all transition to schedulers) + // Cluster-aligned round-robin assignment: cluster ci -> sched thread ci % divisor. + // Each cluster = 1 AIC + 2 adjacent AIV; the triple is always kept together. int32_t divisor = (sched_thread_num_ > 0) ? 
sched_thread_num_ : thread_num_; - int32_t aic_per_thread = aic_count_ / divisor; - int32_t aiv_per_thread = aiv_count_ / divisor; + int32_t cluster_count = aic_count_; - DEV_INFO("Assigning cores: %d AIC per thread, %d AIV per thread", aic_per_thread, aiv_per_thread); + DEV_INFO("Assigning cores (round-robin): %d clusters across %d sched threads (%d AIC, %d AIV)", + cluster_count, divisor, aic_count_, aiv_count_); for (int32_t i = 0; i < thread_num_; i++) { for (int32_t j = 0; j < MAX_CORES_PER_THREAD; j++) { - executing_task_ids_[i][j] = AICPU_TASK_INVALID; + executing_reg_task_ids_[i][j] = AICPU_TASK_INVALID; } trackers_[i].aic().running_count = 0; trackers_[i].aiv().running_count = 0; trackers_[i].aic().idle_count = 0; trackers_[i].aiv().idle_count = 0; + trackers_[i].cluster_count = 0; + memset(trackers_[i].core_idle, 0, sizeof(trackers_[i].core_idle)); + core_count_per_thread_[i] = 0; } - for (int32_t t = 0; t < thread_num_; t++) { - if (sched_thread_num_ > 0 && t >= sched_thread_num_) { - // Orchestrator thread: no cores - core_count_per_thread_[t] = 0; - DEV_INFO("Thread %d: orchestrator (0 cores)", t); - continue; - } + // Mark orchestrator threads explicitly (no cores). + for (int32_t t = divisor; t < thread_num_; t++) { + DEV_INFO("Thread %d: orchestrator (0 cores)", t); + } - int32_t core_idx = 0; + // Per-sched-thread running core index used while filling core_assignments_. 
+ int32_t core_idx[MAX_AICPU_THREADS] = {}; - // Assign AIC cores - int32_t aic_start = t * aic_per_thread; - for (int32_t i = 0; i < aic_per_thread; i++) { - int32_t worker_id = aic_cores_[aic_start + i].worker_id; - core_assignments_[t][core_idx++] = worker_id; - trackers_[t].aic().idle[trackers_[t].aic().idle_count++] = worker_id; - DEV_INFO("Thread %d: assigned AIC worker_id=%d", t, worker_id); - } + for (int32_t ci = 0; ci < cluster_count; ci++) { + int32_t t = ci % divisor; + CoreStateTracker& tracker = trackers_[t]; + int32_t& idx = core_idx[t]; - // Assign AIV cores - int32_t aiv_start = t * aiv_per_thread; - for (int32_t i = 0; i < aiv_per_thread; i++) { - int32_t worker_id = aiv_cores_[aiv_start + i].worker_id; - core_assignments_[t][core_idx++] = worker_id; - trackers_[t].aiv().idle[trackers_[t].aiv().idle_count++] = worker_id; - DEV_INFO("Thread %d: assigned AIV worker_id=%d", t, worker_id); - } + int32_t aic_wid = aic_cores_[ci].worker_id; + int32_t aiv0_wid = aiv_cores_[2 * ci].worker_id; + int32_t aiv1_wid = aiv_cores_[2 * ci + 1].worker_id; + + tracker.clusters[tracker.cluster_count++] = {aic_wid, {aiv0_wid, aiv1_wid}}; - core_count_per_thread_[t] = core_idx; + core_assignments_[t][idx++] = aic_wid; + tracker.aic().idle[tracker.aic().idle_count++] = aic_wid; + tracker.core_idle[aic_wid] = true; - DEV_INFO("Thread %d: total %d cores", t, core_idx); + core_assignments_[t][idx++] = aiv0_wid; + core_assignments_[t][idx++] = aiv1_wid; + tracker.aiv().idle[tracker.aiv().idle_count++] = aiv0_wid; + tracker.aiv().idle[tracker.aiv().idle_count++] = aiv1_wid; + tracker.core_idle[aiv0_wid] = true; + tracker.core_idle[aiv1_wid] = true; + + DEV_INFO("Thread %d: cluster %d (AIC=%d, AIV0=%d, AIV1=%d)", + t, ci, aic_wid, aiv0_wid, aiv1_wid); } - thread_cores_num_ = aic_per_thread + aiv_per_thread; + for (int32_t t = 0; t < divisor; t++) { + core_count_per_thread_[t] = core_idx[t]; + DEV_INFO("Thread %d: total %d cores (%d clusters)", t, core_idx[t], 
trackers_[t].cluster_count); + } + + // Max clusters any single sched thread can hold: ceil(cluster_count / divisor). + int32_t max_clusters_per_thread = (cluster_count + divisor - 1) / divisor; + thread_cores_num_ = max_clusters_per_thread * 3; } /** @@ -571,100 +709,90 @@ void AicpuExecutor::assign_cores_to_threads() { * Writes into new_core_assignments_ / new_core_count_per_thread_. */ void AicpuExecutor::reassign_cores_for_all_threads() { - // Calculate how many AIC/AIV each thread should have - - DEV_INFO("Reassigning cores for all %d threads: %d AIC, %d AIV", thread_num_, aic_count_, aiv_count_); + DEV_INFO("Reassigning cores (cluster-aligned) for %d threads: %d AIC, %d AIV", + thread_num_, aic_count_, aiv_count_); - int32_t aic_running_cores[128]; - int32_t aic_running_task_ids[128]; - int32_t aic_idle_cores[128]; - int32_t aic_running_cores_num = 0; - int32_t aic_idle_cores_num = 0; + // Collect running/idle state from all threads before reassignment + int32_t running_cores[128]; + int32_t running_task_ids[128]; + int32_t running_count = 0; - int32_t aiv_running_cores[128]; - int32_t aiv_running_task_ids[128]; - int32_t aiv_idle_cores[128]; - int32_t aiv_running_cores_num = 0; - int32_t aiv_idle_cores_num = 0; + bool was_idle[MAX_CORES_PER_THREAD]; + memset(was_idle, 0, sizeof(was_idle)); for (int32_t i = 0; i < thread_num_; i++) { - core_count_per_thread_[i] = 0; for (int32_t j = 0; j < trackers_[i].aic().running_count; j++) { int32_t core_id = trackers_[i].aic().running[j]; - aic_running_cores[aic_running_cores_num] = core_id; - aic_running_task_ids[aic_running_cores_num] = executing_task_ids_[i][core_id]; - aic_running_cores_num++; + running_cores[running_count] = core_id; + running_task_ids[running_count] = executing_reg_task_ids_[i][core_id]; + running_count++; } for (int32_t j = 0; j < trackers_[i].aic().idle_count; j++) { - aic_idle_cores[aic_idle_cores_num++] = trackers_[i].aic().idle[j]; + was_idle[trackers_[i].aic().idle[j]] = true; } for (int32_t 
j = 0; j < trackers_[i].aiv().running_count; j++) { int32_t core_id = trackers_[i].aiv().running[j]; - aiv_running_cores[aiv_running_cores_num] = core_id; - aiv_running_task_ids[aiv_running_cores_num] = executing_task_ids_[i][core_id]; - aiv_running_cores_num++; + running_cores[running_count] = core_id; + running_task_ids[running_count] = executing_reg_task_ids_[i][core_id]; + running_count++; } for (int32_t j = 0; j < trackers_[i].aiv().idle_count; j++) { - aiv_idle_cores[aiv_idle_cores_num++] = trackers_[i].aiv().idle[j]; + was_idle[trackers_[i].aiv().idle[j]] = true; } + } + + // Reset all trackers + for (int32_t i = 0; i < thread_num_; i++) { + core_count_per_thread_[i] = 0; trackers_[i].aic().running_count = 0; trackers_[i].aic().idle_count = 0; trackers_[i].aiv().running_count = 0; trackers_[i].aiv().idle_count = 0; + trackers_[i].cluster_count = 0; + memset(trackers_[i].core_idle, 0, sizeof(trackers_[i].core_idle)); for (int32_t j = 0; j < MAX_CORES_PER_THREAD; j++) { - executing_task_ids_[i][j] = AICPU_TASK_INVALID; + executing_reg_task_ids_[i][j] = AICPU_TASK_INVALID; } } - for (int32_t i = 0; i < aic_count_; i++) { - int32_t thread_idx = i % thread_num_; - int32_t core_id = aic_cores_[i].worker_id; - core_assignments_[thread_idx][core_count_per_thread_[thread_idx]++] = core_id; - bool found = false; - for (int32_t j = 0; j < aic_running_cores_num; j++) { - if (core_id == aic_running_cores[j]) { - trackers_[thread_idx].aic().running[trackers_[thread_idx].aic().running_count++] = core_id; - executing_task_ids_[thread_idx][core_id] = aic_running_task_ids[j]; - found = true; - break; - } - } - if (!found) { - for (int32_t j = 0; j < aic_idle_cores_num; j++) { - if (core_id == aic_idle_cores[j]) { - trackers_[thread_idx].aic().idle[trackers_[thread_idx].aic().idle_count++] = core_id; - break; - } - } - } - } - for (int32_t i = 0; i < aiv_count_; i++) { - int32_t thread_idx = i % thread_num_; - int32_t core_id = aiv_cores_[i].worker_id; - 
core_assignments_[thread_idx][core_count_per_thread_[thread_idx]++] = core_id; - bool found = false; - for (int32_t j = 0; j < aiv_running_cores_num; j++) { - if (core_id == aiv_running_cores[j]) { - trackers_[thread_idx].aiv().running[trackers_[thread_idx].aiv().running_count++] = core_id; - executing_task_ids_[thread_idx][core_id] = aiv_running_task_ids[j]; - found = true; - break; + + // Restore a single core's running/idle state into its new thread's tracker + auto reassign_core = [&](int32_t worker_id, CoreTypeTracker& type_tracker, + CoreStateTracker& tracker, int32_t thread_idx) { + core_assignments_[thread_idx][core_count_per_thread_[thread_idx]++] = worker_id; + for (int32_t j = 0; j < running_count; j++) { + if (running_cores[j] == worker_id) { + type_tracker.running[type_tracker.running_count++] = worker_id; + executing_reg_task_ids_[thread_idx][worker_id] = running_task_ids[j]; + return; } } - if (!found) { - for (int32_t j = 0; j < aiv_idle_cores_num; j++) { - if (core_id == aiv_idle_cores[j]) { - trackers_[thread_idx].aiv().idle[trackers_[thread_idx].aiv().idle_count++] = core_id; - break; - } - } + if (was_idle[worker_id]) { + type_tracker.idle[type_tracker.idle_count++] = worker_id; + tracker.core_idle[worker_id] = true; } + }; + + // Assign whole clusters round-robin across all threads + for (int32_t ci = 0; ci < aic_count_; ci++) { + int32_t t = ci % thread_num_; + CoreStateTracker& tracker = trackers_[t]; + + int32_t aic_wid = aic_cores_[ci].worker_id; + int32_t aiv0_wid = aiv_cores_[2 * ci].worker_id; + int32_t aiv1_wid = aiv_cores_[2 * ci + 1].worker_id; + + tracker.clusters[tracker.cluster_count++] = {aic_wid, {aiv0_wid, aiv1_wid}}; + + reassign_core(aic_wid, tracker.aic(), tracker, t); + reassign_core(aiv0_wid, tracker.aiv(), tracker, t); + reassign_core(aiv1_wid, tracker.aiv(), tracker, t); } // Log final distribution for verification DEV_INFO("Core reassignment complete:"); for (int32_t t = 0; t < thread_num_; t++) { - DEV_INFO(" Thread %d: 
%d cores (AIC: running=%d idle=%d, AIV: running=%d idle=%d)", - t, core_count_per_thread_[t], + DEV_INFO(" Thread %d: %d cores, %d clusters (AIC: running=%d idle=%d, AIV: running=%d idle=%d)", + t, core_count_per_thread_[t], trackers_[t].cluster_count, trackers_[t].aic().running_count, trackers_[t].aic().idle_count, trackers_[t].aiv().running_count, trackers_[t].aiv().idle_count); } @@ -684,6 +812,8 @@ int32_t AicpuExecutor::init(Runtime* runtime) { return -1; } + func_id_to_addr_ = runtime->func_id_to_addr_; + // Read execution parameters from runtime thread_num_ = runtime->sche_cpu_num; orch_thread_num_ = runtime->orch_thread_num; @@ -718,7 +848,10 @@ int32_t AicpuExecutor::init(Runtime* runtime) { // Task count comes from PTO2 shared memory if (runtime->get_pto2_gm_sm_ptr()) { auto* header = static_cast(runtime->get_pto2_gm_sm_ptr()); - int32_t pto2_count = header->current_task_index.load(std::memory_order_acquire); + int32_t pto2_count = 0; + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + pto2_count += header->rings[r].fc.current_task_index.load(std::memory_order_acquire); + } total_tasks_ = pto2_count > 0 ? 
pto2_count : 0; } else { total_tasks_ = 0; @@ -737,6 +870,12 @@ int32_t AicpuExecutor::init(Runtime* runtime) { core_dispatch_counts_[i] = 0; } + // Clear per-core dispatch payloads and subslot tracking + memset(s_pto2_payload_per_core, 0, sizeof(s_pto2_payload_per_core)); + memset(dispatch_seq_by_core_, 0, sizeof(dispatch_seq_by_core_)); + memset(executing_subslot_by_core_, 0, sizeof(executing_subslot_by_core_)); + memset(executing_slot_state_by_core_, 0, sizeof(executing_slot_state_by_core_)); + DEV_INFO("Init: PTO2 mode, task count from shared memory"); finished_count_.store(0, std::memory_order_release); @@ -770,7 +909,7 @@ int32_t AicpuExecutor::shutdown_aicore(Runtime* runtime, int32_t thread_idx, con int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime* runtime, int32_t thread_idx) { int32_t &core_num = core_count_per_thread_[thread_idx]; - int32_t* executing_task_ids = executing_task_ids_[thread_idx]; + int32_t* executing_reg_task_ids = executing_reg_task_ids_[thread_idx]; CoreStateTracker& tracker = trackers_[thread_idx]; DEV_INFO("Thread %d: resolve_and_dispatch_pto2 entry", thread_idx); @@ -782,25 +921,13 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime* runtime, int32_t threa DEV_INFO("Thread %d: sm_base=%p", thread_idx, sm_base); PTO2SharedMemoryHeader* header = static_cast(sm_base); - DEV_INFO("Thread %d: header=%p, task_desc_offset=%d, window_size=%d", - thread_idx, (void*)header, header->task_descriptors_offset, - header->task_window_size); - - PTO2TaskDescriptor* task_descriptors = reinterpret_cast( - static_cast(sm_base) + header->task_descriptors_offset); - PTO2TaskPayload* task_payloads = reinterpret_cast( - reinterpret_cast(task_descriptors) + - PTO2_ALIGN_UP(header->task_window_size * sizeof(PTO2TaskDescriptor), PTO2_ALIGN_SIZE)); - DEV_INFO("Thread %d: task_descriptors=%p", - thread_idx, (void*)task_descriptors); - - int32_t window_size = header->task_window_size; - if (window_size <= 0 || window_size > PTO2_TASK_WINDOW_SIZE) 
window_size = PTO2_TASK_WINDOW_SIZE; - int32_t window_mask = window_size - 1; + DEV_INFO("Thread %d: header=%p, task_desc_offset[0]=%lu, window_size=%lu", + thread_idx, (void*)header, (unsigned long)header->rings[0].task_descriptors_offset, + (unsigned long)header->rings[0].task_window_size); Handshake* hank = static_cast(runtime->workers); - DEV_INFO("Thread %d: hank=%p, window_size=%d", - thread_idx, (void*)hank, window_size); + DEV_INFO("Thread %d: hank=%p, window_size=%lu", + thread_idx, (void*)hank, (unsigned long)header->rings[0].task_window_size); // One-time init: assign perf buffers (one thread does it; others wait) if (!pto2_init_done_.exchange(true, std::memory_order_acq_rel)) { @@ -839,9 +966,12 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime* runtime, int32_t threa uint64_t sched_complete_cycle = 0; uint64_t sched_dispatch_cycle = 0; uint64_t sched_idle_cycle = 0; + uint64_t sched_loop_count = 0; + uint32_t phase_complete_count = 0; + uint32_t phase_dispatch_count = 0; +#if PTO2_SCHED_PROFILING uint64_t complete_probe_count = 0; uint64_t complete_hit_count = 0; - uint64_t sched_loop_count = 0; uint64_t notify_edges_total = 0; int32_t notify_max_degree = 0; uint64_t notify_tasks_enqueued = 0; @@ -849,24 +979,23 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime* runtime, int32_t threa int32_t fanin_max_degree = 0; uint64_t pop_hit = 0; uint64_t pop_miss = 0; - uint32_t phase_complete_count = 0; - uint32_t phase_dispatch_count = 0; uint64_t local_dispatch_count = 0; uint64_t local_overflow_count = 0; -#if PTO2_SCHED_PROFILING uint64_t sched_complete_perf_cycle = 0; uint64_t sched_dispatch_pop_cycle = 0; uint64_t sched_dispatch_setup_cycle = 0; #endif #endif - // Local-first dispatch buffer (stack-allocated, one per scheduling thread). + // Local-first dispatch buffers (stack-allocated, one per CoreType per scheduling thread). // Initialized once; must be empty at the start of each iteration. 
- constexpr int LOCAL_READY_CAP = 64; - int32_t local_task_ids[LOCAL_READY_CAP]; - PTO2LocalReadyBuffer local_buf; - local_buf.reset(local_task_ids, LOCAL_READY_CAP); - int32_t deferred_release_ids[128]; + constexpr int LOCAL_READY_CAP_PER_TYPE = 256; + PTO2TaskSlotState* local_aic_ptrs[LOCAL_READY_CAP_PER_TYPE]; + PTO2TaskSlotState* local_aiv_ptrs[LOCAL_READY_CAP_PER_TYPE]; + PTO2LocalReadyBuffer local_bufs[PTO2_LOCAL_DISPATCH_TYPE_NUM]; // [0]=AIC, [1]=AIV + local_bufs[0].reset(local_aic_ptrs, LOCAL_READY_CAP_PER_TYPE); + local_bufs[1].reset(local_aiv_ptrs, LOCAL_READY_CAP_PER_TYPE); + PTO2TaskSlotState* deferred_release_slot_states[256]; int32_t deferred_release_count = 0; bool cores_released = false; @@ -882,6 +1011,20 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime* runtime, int32_t threa if (tracker.aic().running_count == 0 && tracker.aiv().running_count == 0) { bool orch_done = orchestrator_done_; if (orch_done) { + // Check for orchestrator fatal error — exit immediately + int32_t orch_err = header->orch_error_code.load(std::memory_order_acquire); + if (orch_err != PTO2_ERROR_NONE) { + DEV_ERROR("Thread %d: Fatal error (code=%d), sending EXIT_SIGNAL to all cores. 
" + "completed_tasks=%d, total_tasks=%d", + thread_idx, orch_err, + completed_tasks_.load(std::memory_order_relaxed), + total_tasks_); + emergency_shutdown(runtime); + completed_.store(true, std::memory_order_release); + break; + } + + // Normal exit: all tasks complete task_count = total_tasks_; if (task_count > 0 && completed_tasks_.load(std::memory_order_relaxed) >= task_count) { completed_.store(true, std::memory_order_release); @@ -921,21 +1064,21 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime* runtime, int32_t threa // Check AIC running cores bool try_completed = false; - always_assert(local_buf.count == 0); // Invariant: previous iteration fully consumed + always_assert(local_bufs[0].count == 0 && local_bufs[1].count == 0); // Invariant: previous iteration fully consumed if (tracker.aic().running_count > 0) { try_completed = true; check_running_cores_for_completion( - thread_idx, tracker.aic(), hank, executing_task_ids, + thread_idx, tracker.aic(), tracker.core_idle, hank, executing_reg_task_ids, completed_this_turn, cur_thread_completed, made_progress, - deferred_release_ids, deferred_release_count, - local_buf + deferred_release_slot_states, deferred_release_count, + local_bufs #if PTO2_PROFILING - , profiling_enabled, complete_probe_count, complete_hit_count, phase_complete_count, - notify_edges_total, notify_max_degree, notify_tasks_enqueued, - fanin_edges_total, fanin_max_degree + , profiling_enabled, phase_complete_count #endif #if PTO2_SCHED_PROFILING - , sched_complete_perf_cycle + , complete_probe_count, complete_hit_count, + notify_edges_total, notify_max_degree, notify_tasks_enqueued, + fanin_edges_total, fanin_max_degree, sched_complete_perf_cycle #endif ); } @@ -944,21 +1087,24 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime* runtime, int32_t threa if (tracker.aiv().running_count > 0) { try_completed = true; check_running_cores_for_completion( - thread_idx, tracker.aiv(), hank, executing_task_ids, + thread_idx, 
tracker.aiv(), tracker.core_idle, hank, executing_reg_task_ids, completed_this_turn, cur_thread_completed, made_progress, - deferred_release_ids, deferred_release_count, - local_buf + deferred_release_slot_states, deferred_release_count, + local_bufs #if PTO2_PROFILING - , profiling_enabled, complete_probe_count, complete_hit_count, phase_complete_count, - notify_edges_total, notify_max_degree, notify_tasks_enqueued, - fanin_edges_total, fanin_max_degree + , profiling_enabled, phase_complete_count #endif #if PTO2_SCHED_PROFILING - , sched_complete_perf_cycle + , complete_probe_count, complete_hit_count, + notify_edges_total, notify_max_degree, notify_tasks_enqueued, + fanin_edges_total, fanin_max_degree, sched_complete_perf_cycle #endif ); } if (completed_this_turn > 0) { +#if PTO2_SCHED_PROFILING + rt->scheduler.tasks_completed.fetch_add(completed_this_turn, std::memory_order_relaxed); +#endif int32_t prev = completed_tasks_.fetch_add(completed_this_turn, std::memory_order_relaxed); int32_t new_total = prev + completed_this_turn; last_progress_count = new_total; @@ -976,116 +1122,168 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime* runtime, int32_t threa if (!try_completed) { CYCLE_COUNT_LAP(sched_idle_cycle); } else { + CYCLE_COUNT_LAP(sched_complete_cycle); if (profiling_enabled && phase_complete_count > 0) { perf_aicpu_record_phase( thread_idx, AicpuPhaseId::SCHED_COMPLETE, _t0_phase, _t1, sched_loop_count, phase_complete_count); _t0_phase = _t1; phase_complete_count = 0; } - CYCLE_COUNT_LAP(sched_complete_cycle); } #endif - // Phase 2: Local dispatch — match local_buf tasks to idle cores (zero MPMC operations) + // Phase 2: Local dispatch — drain local_bufs, match to idle clusters (zero MPMC operations) // Phase 3: Global queue — push overflow to readyQ + fill remaining idle cores from readyQ bool try_pushed = false; - // Local dispatch: drain local_buf, match to idle cores by type - int32_t overflow_ids[LOCAL_READY_CAP]; + // Local dispatch: drain 
both per-CoreType local_bufs, match to idle clusters by shape + PTO2TaskSlotState* overflow_ptrs[LOCAL_READY_CAP_PER_TYPE * PTO2_LOCAL_DISPATCH_TYPE_NUM]; int overflow_count = 0; - while (local_buf.count > 0) { - int32_t task_id = local_buf.pop(); - PTO2TaskDescriptor* task = &task_descriptors[task_id & window_mask]; - CoreType ct_type = static_cast(task->worker_type); - CoreTypeTracker& ct = (ct_type == CoreType::AIC) ? tracker.aic() : tracker.aiv(); + for (int bi = 0; bi < PTO2_LOCAL_DISPATCH_TYPE_NUM; bi++) { + while (local_bufs[bi].count > 0) { + PTO2TaskSlotState* slot_state = local_bufs[bi].pop(); + PTO2ResourceShape shape = pto2_active_mask_to_shape(slot_state->active_mask); + int32_t ci = tracker.find_cluster_for_shape(shape); + + if (ci >= 0) { + try_pushed = true; + Cluster& c = tracker.clusters[ci]; +#if PTO2_SCHED_PROFILING + uint64_t t_setup_start = get_sys_cnt_aicpu(); +#endif + ResourceCount rc = shape_resource_count(shape); - if (ct.idle_count > 0) { - try_pushed = true; - int32_t idle_idx = ct.idle_count - 1; - int32_t core_id = ct.idle[idle_idx]; - PTO2TaskPayload* task_pl = &task_payloads[task_id & window_mask]; - PTO2DispatchPayload* payload = &s_pto2_payload_per_core[core_id]; - if (ct_type == CoreType::AIC) { - build_pto2_payload(payload, runtime, task, task_pl); - } else { - build_pto2_payload(payload, runtime, task, task_pl); - } + if (rc.aic) { + dispatch_subtask_to_core(runtime, tracker, executing_reg_task_ids, + c.aic_core_id, CoreType::AIC, *slot_state, PTO2SubtaskSlot::AIC #if PTO2_PROFILING - if (profiling_enabled) { - dispatch_timestamps_[core_id] = get_sys_cnt_aicpu(); - if (core_dispatch_counts_[core_id] >= PLATFORM_PROF_BUFFER_SIZE) { - perf_aicpu_switch_buffer(runtime, core_id, thread_idx); - core_dispatch_counts_[core_id] = 0; + , profiling_enabled, thread_idx +#endif + ); } - core_dispatch_counts_[core_id]++; - } - pop_hit++; - phase_dispatch_count++; - local_dispatch_count++; + if (rc.aiv >= 1) { + int32_t aiv0 = 
tracker.core_idle[c.aiv_core_ids[0]] ? c.aiv_core_ids[0] : c.aiv_core_ids[1]; + dispatch_subtask_to_core(runtime, tracker, executing_reg_task_ids, + aiv0, CoreType::AIV, *slot_state, PTO2SubtaskSlot::AIV0 +#if PTO2_PROFILING + , profiling_enabled, thread_idx #endif - write_reg(core_id_to_reg_addr_[core_id], RegId::DATA_MAIN_BASE, - static_cast(task_id)); - ct.move_idle_to_running(idle_idx); - executing_task_ids[core_id] = task_id; - made_progress = true; - DEV_DEBUG("Thread %d: Dispatching PTO2 task %d to core %d (local)", - thread_idx, task_id, core_id); - } else { - overflow_ids[overflow_count++] = task_id; + ); + } + if (rc.aiv >= 2) { + dispatch_subtask_to_core(runtime, tracker, executing_reg_task_ids, + c.aiv_core_ids[1], CoreType::AIV, *slot_state, PTO2SubtaskSlot::AIV1 +#if PTO2_PROFILING + , profiling_enabled, thread_idx +#endif + ); + } #if PTO2_PROFILING - local_overflow_count++; + phase_dispatch_count++; #endif +#if PTO2_SCHED_PROFILING + pop_hit++; + local_dispatch_count++; + sched_dispatch_setup_cycle += (get_sys_cnt_aicpu() - t_setup_start); +#endif + made_progress = true; + DEV_DEBUG("Thread %d: Dispatching %s task %lld to cluster %d (local)", + thread_idx, + shape_name(shape), + (long long)pto2_task_id_raw(slot_state->task->mixed_task_id), + ci); + } else { + overflow_ptrs[overflow_count++] = slot_state; +#if PTO2_SCHED_PROFILING + local_overflow_count++; +#endif + } } } - // Push overflow to global readyQ + // Push overflow to global readyQ (shape-based) for (int i = 0; i < overflow_count; i++) { - PTO2TaskDescriptor* task = &task_descriptors[overflow_ids[i] & window_mask]; - rt->scheduler.ready_queues[task->worker_type].push(overflow_ids[i]); + rt->scheduler.requeue_ready_task(*overflow_ptrs[i]); } - // Global dispatch: fill remaining idle cores from global readyQ - // Process AIC cores if CUBE queue has tasks - if (tracker.aic().idle_count > 0 && rt->scheduler.ready_queues[PTO2_WORKER_CUBE].size() > 0) { - try_pushed = true; - 
dispatch_ready_tasks_to_idle_cores( - runtime, thread_idx, tracker.aic(), executing_task_ids, made_progress, - task_descriptors, task_payloads, window_mask + // Phase 3: Global dispatch — fill remaining idle cores from global readyQ (cluster-based) + const PTO2ResourceShape* dispatch_order = get_dispatch_order(thread_idx); + + for (int32_t si = 0; si < PTO2_NUM_RESOURCE_SHAPES; si++) { + PTO2ResourceShape shape = dispatch_order[si]; + if (rt->scheduler.ready_queues[static_cast(shape)].size() == 0) continue; + + while (true) { + int32_t ci = tracker.find_cluster_for_shape(shape); + if (ci < 0) break; + + PTO2TaskSlotState* slot_state = pop_ready_task(shape, thread_idx +#if PTO2_SCHED_PROFILING + , pop_hit, pop_miss + , sched_dispatch_pop_cycle +#endif + ); + if (!slot_state) break; + + try_pushed = true; #if PTO2_PROFILING - , profiling_enabled, pop_hit, pop_miss, phase_dispatch_count + phase_dispatch_count++; #endif #if PTO2_SCHED_PROFILING - , sched_dispatch_pop_cycle, sched_dispatch_setup_cycle + uint64_t t_setup_start = get_sys_cnt_aicpu(); #endif - ); - } + Cluster& c = tracker.clusters[ci]; + ResourceCount rc = shape_resource_count(shape); - // Process AIV cores if VECTOR queue has tasks - if (tracker.aiv().idle_count > 0 && rt->scheduler.ready_queues[PTO2_WORKER_VECTOR].size() > 0) { - try_pushed = true; - dispatch_ready_tasks_to_idle_cores( - runtime, thread_idx, tracker.aiv(), executing_task_ids, made_progress, - task_descriptors, task_payloads, window_mask + if (rc.aic) { + dispatch_subtask_to_core(runtime, tracker, executing_reg_task_ids, + c.aic_core_id, CoreType::AIC, *slot_state, PTO2SubtaskSlot::AIC +#if PTO2_PROFILING + , profiling_enabled, thread_idx +#endif + ); + } + if (rc.aiv >= 1) { + int32_t aiv_id = tracker.core_idle[c.aiv_core_ids[0]] + ? 
c.aiv_core_ids[0] : c.aiv_core_ids[1]; + dispatch_subtask_to_core(runtime, tracker, executing_reg_task_ids, + aiv_id, CoreType::AIV, *slot_state, PTO2SubtaskSlot::AIV0 +#if PTO2_PROFILING + , profiling_enabled, thread_idx +#endif + ); + } + if (rc.aiv >= 2) { + dispatch_subtask_to_core(runtime, tracker, executing_reg_task_ids, + c.aiv_core_ids[1], CoreType::AIV, *slot_state, PTO2SubtaskSlot::AIV1 #if PTO2_PROFILING - , profiling_enabled, pop_hit, pop_miss, phase_dispatch_count + , profiling_enabled, thread_idx #endif + ); + } + made_progress = true; #if PTO2_SCHED_PROFILING - , sched_dispatch_pop_cycle, sched_dispatch_setup_cycle + sched_dispatch_setup_cycle += (get_sys_cnt_aicpu() - t_setup_start); #endif - ); + DEV_DEBUG("Thread %d: Dispatching %s task %lld to cluster %d", + thread_idx, + shape_name(shape), + (long long)pto2_task_id_raw(slot_state->task->mixed_task_id), + ci); + } } #if PTO2_PROFILING if (!try_pushed) { CYCLE_COUNT_LAP(sched_idle_cycle); } else { + CYCLE_COUNT_LAP(sched_dispatch_cycle); if (profiling_enabled && phase_dispatch_count > 0) { perf_aicpu_record_phase( thread_idx, AicpuPhaseId::SCHED_DISPATCH, _t0_phase, _t1, sched_loop_count, phase_dispatch_count); _t0_phase = _t1; phase_dispatch_count = 0; } - CYCLE_COUNT_LAP(sched_dispatch_cycle); #endif } @@ -1097,45 +1295,65 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime* runtime, int32_t threa // freeing heap space for the orchestrator without blocking completion polling. 
while (deferred_release_count > 0) { #if PTO2_SCHED_PROFILING - int32_t fe = rt->scheduler.on_task_release(deferred_release_ids[--deferred_release_count], thread_idx); + int32_t fe = rt->scheduler.on_task_release(*deferred_release_slot_states[--deferred_release_count], thread_idx); #else - int32_t fe = rt->scheduler.on_task_release(deferred_release_ids[--deferred_release_count]); + int32_t fe = rt->scheduler.on_task_release(*deferred_release_slot_states[--deferred_release_count]); #endif (void)fe; -#if PTO2_PROFILING +#if PTO2_SCHED_PROFILING fanin_edges_total += fe; if (fe > fanin_max_degree) fanin_max_degree = fe; #endif } idle_iterations++; + + // Check for orchestrator fatal error during idle (every 1024 iterations) + // orch_error_code is set in shared memory by the orchestrator's spin loop + // BEFORE orchestrator_done_ is set, so this catches errors earlier. + if (idle_iterations % FATAL_ERROR_CHECK_INTERVAL == 0) { + int32_t orch_err = header->orch_error_code.load(std::memory_order_acquire); + if (orch_err != PTO2_ERROR_NONE) { + DEV_ERROR("Thread %d: Fatal error detected (code=%d), sending EXIT_SIGNAL to all cores", + thread_idx, orch_err); + emergency_shutdown(runtime); + completed_.store(true, std::memory_order_release); + break; + } + } + if (thread_idx == 0 && task_count > 0 && idle_iterations % STALL_LOG_INTERVAL == 0) { int32_t c = completed_tasks_.load(std::memory_order_relaxed); DEV_ALWAYS("PTO2 stall: no progress for %d iterations, completed=%d total=%d (last progress at %d)", idle_iterations, c, task_count, last_progress_count); // Scan all task slots to find truly stuck tasks using scheduler state PTO2SchedulerState* sched = &rt->scheduler; + PTO2SharedMemoryHeader* sm_header_diag = static_cast(sm_base); int32_t cnt_ready = 0, cnt_waiting = 0, cnt_inflight = 0; - for (int32_t si = 0; si < task_count; si++) { - int32_t slot = si & window_mask; - PTO2TaskState st = sched->task_state[slot].load(std::memory_order_relaxed); - int32_t rc = 
sched->fanin_refcount[slot].load(std::memory_order_relaxed); - int32_t fi = task_descriptors[slot].fanin_count; - int32_t kid = task_descriptors[slot].kernel_id; - if (st >= PTO2_TASK_COMPLETED) continue; // Already done - if (st == PTO2_TASK_READY || st == PTO2_TASK_RUNNING) { cnt_inflight++; continue; } - // PENDING - if (rc >= fi) { - // Ready (all deps satisfied) but not enqueued — this is the real bug - cnt_ready++; - if (cnt_ready <= STALL_DUMP_READY_MAX) { - DEV_ALWAYS(" STUCK-READY slot=%d kernel_id=%d refcount=%d fanin=%d state=%d", - slot, kid, rc, fi, (int32_t)st); - } - } else { - cnt_waiting++; - if (cnt_waiting <= STALL_DUMP_WAIT_MAX) { - DEV_ALWAYS(" STUCK-WAIT slot=%d kernel_id=%d refcount=%d fanin=%d state=%d", - slot, kid, rc, fi, (int32_t)st); + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + int32_t ring_task_count = + sm_header_diag->rings[r].fc.current_task_index.load(std::memory_order_relaxed); + for (int32_t si = 0; si < ring_task_count; si++) { + PTO2TaskSlotState& slot_state = sched->get_slot_state(r, si); + PTO2TaskState st = slot_state.task_state.load(std::memory_order_relaxed); + int32_t rc = slot_state.fanin_refcount.load(std::memory_order_relaxed); + int32_t fi = slot_state.fanin_count; + int32_t kid = slot_state.task->kernel_id[0]; + if (st >= PTO2_TASK_COMPLETED) continue; // Already done + if (st == PTO2_TASK_READY || st == PTO2_TASK_RUNNING) { cnt_inflight++; continue; } + // PENDING + if (rc >= fi) { + // Ready (all deps satisfied) but not enqueued — this is the real bug + cnt_ready++; + if (cnt_ready <= STALL_DUMP_READY_MAX) { + DEV_ALWAYS(" STUCK-READY ring=%d task_id=%lld kernel_id=%d refcount=%d fanin=%d state=%d", + r, (long long)pto2_task_id_raw(slot_state.task->mixed_task_id), kid, rc, fi, (int32_t)st); + } + } else { + cnt_waiting++; + if (cnt_waiting <= STALL_DUMP_WAIT_MAX) { + DEV_ALWAYS(" STUCK-WAIT ring=%d task_id=%lld kernel_id=%d refcount=%d fanin=%d state=%d", + r, (long 
long)pto2_task_id_raw(slot_state.task->mixed_task_id), kid, rc, fi, (int32_t)st); + } } } } @@ -1150,33 +1368,40 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime* runtime, int32_t threa // Dump AIC running cores for (int32_t ci = 0; ci < tracker.aic().running_count && ci < STALL_DUMP_CORE_MAX; ci++) { int32_t cid = tracker.aic().running[ci]; - Handshake* hh = &hank[cid]; - int32_t hw_task_id = -1; + int32_t sw_tid = executing_reg_task_ids[cid]; int32_t hw_kernel = -1; - if (hh->task != 0) { - const PTO2DispatchPayload* pl = reinterpret_cast((uintptr_t)hh->task); - hw_task_id = pl->task_id; - hw_kernel = pl->kernel_id; + if (sw_tid >= 0 && executing_slot_state_by_core_[cid]) { + int32_t diag_slot = static_cast(executing_subslot_by_core_[cid]); + hw_kernel = executing_slot_state_by_core_[cid]->task->kernel_id[diag_slot]; } - DEV_ALWAYS(" AIC core[%d] cid=%d sw_task=%d hw_task=%d hw_kernel=%d", - ci, cid, executing_task_ids[cid], hw_task_id, hw_kernel); + uint64_t cond_reg = read_reg(core_id_to_reg_addr_[cid], RegId::COND); + DEV_ALWAYS(" core=%d cond=0x%x(state=%d,id=%d) exec_id=%d kernel=%d", + cid, (unsigned)cond_reg, + EXTRACT_TASK_STATE(cond_reg), EXTRACT_TASK_ID(cond_reg), + sw_tid, hw_kernel); } // Dump AIV running cores for (int32_t ci = 0; ci < tracker.aiv().running_count && ci < STALL_DUMP_CORE_MAX; ci++) { int32_t cid = tracker.aiv().running[ci]; - Handshake* hh = &hank[cid]; - int32_t hw_task_id = -1; + int32_t sw_tid = executing_reg_task_ids[cid]; int32_t hw_kernel = -1; - if (hh->task != 0) { - const PTO2DispatchPayload* pl = reinterpret_cast((uintptr_t)hh->task); - hw_task_id = pl->task_id; - hw_kernel = pl->kernel_id; + if (sw_tid >= 0 && executing_slot_state_by_core_[cid]) { + int32_t diag_slot = static_cast(executing_subslot_by_core_[cid]); + hw_kernel = executing_slot_state_by_core_[cid]->task->kernel_id[diag_slot]; } uint64_t cond_reg = read_reg(core_id_to_reg_addr_[cid], RegId::COND); - DEV_ALWAYS(" core=%d cond=0x%x(state=%d,id=%d) 
exec_id=%d payload_task=%d kernel=%d", + DEV_ALWAYS(" core=%d cond=0x%x(state=%d,id=%d) exec_id=%d kernel=%d", cid, (unsigned)cond_reg, EXTRACT_TASK_STATE(cond_reg), EXTRACT_TASK_ID(cond_reg), - executing_task_ids[cid], hw_task_id, hw_kernel); + sw_tid, hw_kernel); + } + // Dump cluster state + for (int32_t cli = 0; cli < tracker.cluster_count && cli < STALL_DUMP_CORE_MAX; cli++) { + Cluster& cl = tracker.clusters[cli]; + DEV_ALWAYS(" cluster[%d] aic=%d(%s) aiv0=%d(%s) aiv1=%d(%s)", + cli, cl.aic_core_id, tracker.core_idle[cl.aic_core_id] ? "idle" : "busy", + cl.aiv_core_ids[0], tracker.core_idle[cl.aiv_core_ids[0]] ? "idle" : "busy", + cl.aiv_core_ids[1], tracker.core_idle[cl.aiv_core_ids[1]] ? "idle" : "busy"); } } if (idle_iterations > MAX_IDLE_ITERATIONS) { @@ -1186,12 +1411,12 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime* runtime, int32_t threa SPIN_WAIT_HINT(); } #if PTO2_PROFILING + CYCLE_COUNT_LAP(sched_idle_cycle); if (profiling_enabled) { perf_aicpu_record_phase(thread_idx, AicpuPhaseId::SCHED_IDLE_WAIT, _t0_phase, _t1, sched_loop_count, 0); _t0_phase = _t1; } - CYCLE_COUNT_LAP(sched_idle_cycle); #endif } } @@ -1335,7 +1560,7 @@ int32_t AicpuExecutor::run(Runtime* runtime) { DEV_ALWAYS("Thread %d: Start", thread_idx); - // Orchestrator threads: thread_idx >= sched_thread_num_ + // Orchestrator check if (thread_idx >= sched_thread_num_) { int32_t orch_idx = thread_idx - sched_thread_num_; if (runtime->get_orch_built_on_host()) { @@ -1453,8 +1678,12 @@ int32_t AicpuExecutor::run(Runtime* runtime) { if (runtime->pto2_heap_size > 0) { heap_size = runtime->pto2_heap_size; } - DEV_INFO("Thread %d: Ring sizes: task_window=%lu, heap=%lu", - thread_idx, (unsigned long)task_window_size, (unsigned long)heap_size); + int32_t dep_pool_capacity = PTO2_DEP_LIST_POOL_SIZE; + if (runtime->pto2_dep_pool_size > 0) { + dep_pool_capacity = static_cast(runtime->pto2_dep_pool_size); + } + DEV_INFO("Thread %d: Ring sizes: task_window=%lu, heap=%lu, dep_pool=%d", + 
thread_idx, (unsigned long)task_window_size, (unsigned long)heap_size, dep_pool_capacity); void* sm_ptr = runtime->get_pto2_gm_sm_ptr(); void* gm_heap = runtime->get_pto2_gm_heap_ptr(); @@ -1471,7 +1700,8 @@ int32_t AicpuExecutor::run(Runtime* runtime) { } rt = pto2_runtime_create_from_sm(PTO2_MODE_EXECUTE, - sm_handle, gm_heap, heap_size, orch_thread_num_); + sm_handle, gm_heap, heap_size, orch_thread_num_, + dep_pool_capacity); if (!rt) { DEV_ERROR("Thread %d: Failed to create PTO2Runtime", thread_idx); pto2_sm_destroy(sm_handle); @@ -1480,6 +1710,16 @@ int32_t AicpuExecutor::run(Runtime* runtime) { return -1; } +#if PTO2_PROFILING + for (int i = 0; i < orch_thread_num_; i++) { + rt->orchestrators[i].enable_profiling = runtime->enable_profiling; + } +#endif + + // With multi-ring, slot_states are per-ring inside the scheduler. + // Fanout fill-in in complete_perf_records is disabled (slot_states_ptr = nullptr). + runtime->set_pto2_slot_states_ptr(nullptr); + // Store shared state for other orchestrator threads orch_func_ = orch_func; orch_args_cached_ = args; @@ -1517,6 +1757,13 @@ int32_t AicpuExecutor::run(Runtime* runtime) { pto2_set_orch_thread_idx(orch_idx); +#if PTO2_PROFILING + // Each orchestrator thread sets its own phase buffer index (thread-local) + if (runtime->enable_profiling) { + perf_aicpu_set_orch_thread_idx(thread_idx); + } +#endif + // Call orchestration function wrapped in an outer scope DEV_ALWAYS("Thread %d: Calling aicpu_orchestration_entry from SO (orch_idx=%d/(0~%d))", thread_idx, orch_idx, orch_thread_num_ - 1); @@ -1534,34 +1781,53 @@ int32_t AicpuExecutor::run(Runtime* runtime) { // Print orchestrator profiling data #if PTO2_ORCH_PROFILING PTO2OrchProfilingData p = pto2_orchestrator_get_profiling(); - uint64_t total = p.sync_cycle + p.alloc_cycle + p.params_cycle + - p.lookup_cycle + p.heap_cycle + p.insert_cycle + - p.fanin_cycle; + uint64_t total = p.sync_cycle + p.alloc_cycle + p.params_cycle + p.lookup_cycle + p.heap_cycle + + 
p.insert_cycle + p.fanin_cycle; if (total == 0) total = 1; // avoid div-by-zero - DEV_ALWAYS("Thread %d: === Orchestrator Profiling: %lld tasks, total=%.3fus ===", thread_idx, - (long long)p.submit_count, cycles_to_us(total)); - DEV_ALWAYS("Thread %d: sync_tensormap : %.3fus (%.1f%%)", thread_idx, cycles_to_us(p.sync_cycle), p.sync_cycle * 100.0 / total); - DEV_ALWAYS("Thread %d: task_ring_alloc: %.3fus (%.1f%%) work=%.3fus wait=%.3fus atomics=%llu", thread_idx, - cycles_to_us(p.alloc_cycle), p.alloc_cycle * 100.0 / total, - cycles_to_us(p.alloc_cycle - p.alloc_wait_cycle), cycles_to_us(p.alloc_wait_cycle), + DEV_ALWAYS("Thread %d: === Orchestrator Profiling: %lld tasks, total=%.3fus ===", + thread_idx, + (long long)p.submit_count, + cycles_to_us(total)); + DEV_ALWAYS("Thread %d: task_ring_alloc: %.3fus (%.1f%%) work=%.3fus wait=%.3fus atomics=%llu", + thread_idx, + cycles_to_us(p.alloc_cycle), + p.alloc_cycle * 100.0 / total, + cycles_to_us(p.alloc_cycle - p.alloc_wait_cycle), + cycles_to_us(p.alloc_wait_cycle), (unsigned long long)p.alloc_atomic_count); - DEV_ALWAYS("Thread %d: param_copy : %.3fus (%.1f%%) atomics=%llu", thread_idx, - cycles_to_us(p.params_cycle), p.params_cycle * 100.0 / total, - (unsigned long long)p.params_atomic_count); - DEV_ALWAYS("Thread %d: lookup+dep : %.3fus (%.1f%%)", thread_idx, cycles_to_us(p.lookup_cycle), p.lookup_cycle * 100.0 / total); - DEV_ALWAYS("Thread %d: heap_alloc : %.3fus (%.1f%%) work=%.3fus wait=%.3fus atomics=%llu", thread_idx, - cycles_to_us(p.heap_cycle), p.heap_cycle * 100.0 / total, - cycles_to_us(p.heap_cycle - p.heap_wait_cycle), cycles_to_us(p.heap_wait_cycle), + DEV_ALWAYS("Thread %d: heap_alloc : %.3fus (%.1f%%) work=%.3fus wait=%.3fus atomics=%llu", + thread_idx, + cycles_to_us(p.heap_cycle), + p.heap_cycle * 100.0 / total, + cycles_to_us(p.heap_cycle - p.heap_wait_cycle), + cycles_to_us(p.heap_wait_cycle), (unsigned long long)p.heap_atomic_count); - DEV_ALWAYS("Thread %d: tensormap_ins : %.3fus (%.1f%%)", 
thread_idx, cycles_to_us(p.insert_cycle), p.insert_cycle * 100.0 / total); - DEV_ALWAYS("Thread %d: fanin+ready : %.3fus (%.1f%%) work=%.3fus wait=%.3fus atomics=%llu", thread_idx, - cycles_to_us(p.fanin_cycle), p.fanin_cycle * 100.0 / total, - cycles_to_us(p.fanin_cycle - p.fanin_wait_cycle), cycles_to_us(p.fanin_wait_cycle), + DEV_ALWAYS("Thread %d: sync_tensormap : %.3fus (%.1f%%)", + thread_idx, + cycles_to_us(p.sync_cycle), + p.sync_cycle * 100.0 / total); + DEV_ALWAYS("Thread %d: lookup+dep : %.3fus (%.1f%%)", + thread_idx, + cycles_to_us(p.lookup_cycle), + p.lookup_cycle * 100.0 / total); + DEV_ALWAYS("Thread %d: tensormap_ins : %.3fus (%.1f%%)", + thread_idx, + cycles_to_us(p.insert_cycle), + p.insert_cycle * 100.0 / total); + DEV_ALWAYS("Thread %d: param_copy : %.3fus (%.1f%%) atomics=%llu", + thread_idx, + cycles_to_us(p.params_cycle), + p.params_cycle * 100.0 / total, + (unsigned long long)p.params_atomic_count); + DEV_ALWAYS("Thread %d: fanin+ready : %.3fus (%.1f%%) work=%.3fus wait=%.3fus atomics=%llu", + thread_idx, + cycles_to_us(p.fanin_cycle), + p.fanin_cycle * 100.0 / total, + cycles_to_us(p.fanin_cycle - p.fanin_wait_cycle), + cycles_to_us(p.fanin_wait_cycle), (unsigned long long)p.fanin_atomic_count); - DEV_ALWAYS("Thread %d: scope_end : %.3fus atomics=%llu", thread_idx, - cycles_to_us(p.scope_end_cycle), - (unsigned long long)p.scope_end_atomic_count); - DEV_ALWAYS("Thread %d: avg/task : %.3fus", thread_idx, + DEV_ALWAYS("Thread %d: avg/task : %.3fus", + thread_idx, p.submit_count > 0 ? cycles_to_us(total) / p.submit_count : 0.0); #if PTO2_TENSORMAP_PROFILING @@ -1616,8 +1882,13 @@ int32_t AicpuExecutor::run(Runtime* runtime) { void* sm = runtime->get_pto2_gm_sm_ptr(); PTO2SharedMemoryHeader* sm_header = static_cast(sm); - int32_t pto2_task_count = - sm_header ? 
sm_header->current_task_index.load(std::memory_order_acquire) : 0; + int32_t pto2_task_count = 0; + if (sm_header) { + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + pto2_task_count += + sm_header->rings[r].fc.current_task_index.load(std::memory_order_acquire); + } + } #if PTO2_PROFILING DEV_ALWAYS("PTO2 total submitted tasks = %d, already executed %d tasks", pto2_task_count, completed_tasks_.load(std::memory_order_acquire)); #endif @@ -1626,28 +1897,49 @@ int32_t AicpuExecutor::run(Runtime* runtime) { perf_aicpu_update_total_tasks(runtime, static_cast(pto2_task_count)); } orchestrator_done_ = true; + { + int32_t orch_err = 0; + void* sm = runtime->get_pto2_gm_sm_ptr(); + if (sm) { + orch_err = static_cast(sm)->orch_error_code.load( + std::memory_order_relaxed); + } + + // Fatal error: shutdown AICore immediately before core transition. + if (orch_err != PTO2_ERROR_NONE) { + emergency_shutdown(runtime); + completed_.store(true, std::memory_order_release); + } + } - // Compute new core assignments for all threads and initialize donated slots - DEV_INFO("Thread %d: Set orchestrator_done=true, requesting core transition", thread_idx); + // Skip core transition on fatal error — cores already shut down above + if (completed_.load(std::memory_order_acquire)) { + // Signal transition to unblock scheduler threads waiting at core transition + transition_requested_.store(true, std::memory_order_release); + reassigned_.store(true, std::memory_order_release); + } else { + // Compute new core assignments for all threads and initialize donated slots + DEV_INFO("Thread %d: Set orchestrator_done=true, requesting core transition", thread_idx); #if PTO2_PROFILING - // Benchmark: record orchestrator end timestamp before waiting for schedulers - DEV_ALWAYS("BENCHMARK: thread=%d end=%llu", thread_idx, (unsigned long long)get_sys_cnt_aicpu()); + // Benchmark: record orchestrator end timestamp before waiting for schedulers + DEV_ALWAYS("BENCHMARK: thread=%d end=%llu", thread_idx, 
(unsigned long long)get_sys_cnt_aicpu()); #endif - transition_requested_.store(true, std::memory_order_release); - - // Wait for scheduler threads to acknowledge transition request - // All-orchestrator mode (sched_thread_num_ == 0): skip the wait - if (sched_thread_num_ > 0) { - while (wait_reassign_.load(std::memory_order_acquire) != sched_thread_num_) { - if (completed_.load(std::memory_order_acquire)) { - break; + transition_requested_.store(true, std::memory_order_release); + + // Wait for scheduler threads to acknowledge transition request + // All-orchestrator mode (sched_thread_num_ == 0): skip the wait + if (sched_thread_num_ > 0) { + while (wait_reassign_.load(std::memory_order_acquire) != sched_thread_num_) { + if (completed_.load(std::memory_order_acquire)) { + break; + } + SPIN_WAIT_HINT(); } - SPIN_WAIT_HINT(); } - } - if (!completed_.load(std::memory_order_acquire)) { - reassign_cores_for_all_threads(); - reassigned_.store(true, std::memory_order_release); + if (!completed_.load(std::memory_order_acquire)) { + reassign_cores_for_all_threads(); + reassigned_.store(true, std::memory_order_release); + } } } else { // Non-last orchestrator: wait for last orchestrator to finish setup @@ -1677,18 +1969,26 @@ int32_t AicpuExecutor::run(Runtime* runtime) { always_assert(rt != nullptr); int32_t completed = resolve_and_dispatch_pto2(runtime, thread_idx); DEV_INFO("Thread %d: Executed %d tasks from runtime", thread_idx, completed); + } - // After transition, use new core assignments for shutdown + // Always shutdown AICore — even if completed_ was already true. + // platform_deinit_aicore_regs is idempotent; orchestrator threads have + // core_count_per_thread_ == 0 so they skip the loop harmlessly. 
+ { const int32_t* shutdown_cores = core_assignments_[thread_idx]; int32_t shutdown_count = core_count_per_thread_[thread_idx]; #if PTO2_PROFILING // Benchmark: record scheduler end timestamp before shutdown cleanup - DEV_ALWAYS("Thread=%d end=%llu", - thread_idx, (unsigned long long)get_sys_cnt_aicpu()); + if (shutdown_count > 0) { + DEV_ALWAYS("Thread=%d end=%llu", + thread_idx, (unsigned long long)get_sys_cnt_aicpu()); + } #endif - auto rc = shutdown_aicore(runtime, thread_idx, shutdown_cores, shutdown_count); - if (rc != 0) { - return rc; + if (shutdown_count > 0) { + auto rc = shutdown_aicore(runtime, thread_idx, shutdown_cores, shutdown_count); + if (rc != 0) { + return rc; + } } } @@ -1722,8 +2022,11 @@ void AicpuExecutor::deinit(Runtime* runtime) { core_dispatch_counts_[i] = 0; } - // Clear per-core dispatch payloads to prevent stale data on next round + // Clear per-core dispatch payloads and subslot tracking memset(s_pto2_payload_per_core, 0, sizeof(s_pto2_payload_per_core)); + memset(dispatch_seq_by_core_, 0, sizeof(dispatch_seq_by_core_)); + memset(executing_subslot_by_core_, 0, sizeof(executing_subslot_by_core_)); + memset(executing_slot_state_by_core_, 0, sizeof(executing_slot_state_by_core_)); completed_tasks_.store(0, std::memory_order_release); total_tasks_ = 0; @@ -1750,7 +2053,7 @@ void AicpuExecutor::deinit(Runtime* runtime) { } for (int32_t i = 0; i < thread_num_; i++) { for (int32_t j = 0; j < MAX_CORES_PER_THREAD; j++) { - executing_task_ids_[i][j] = AICPU_TASK_INVALID; + executing_reg_task_ids_[i][j] = AICPU_TASK_INVALID; } } regs_ = 0; @@ -1787,6 +2090,7 @@ void AicpuExecutor::diagnose_stuck_state(Runtime* runtime, int32_t thread_idx, const int32_t* cur_thread_cores, int32_t core_num, Handshake* hank) { (void)runtime; + PTO2SchedulerState* sched = &rt->scheduler; DEV_ALWAYS("========== DIAGNOSTIC REPORT: Thread %d ==========", thread_idx); int32_t completed = completed_tasks_.load(std::memory_order_acquire); @@ -1794,13 +2098,16 @@ void 
AicpuExecutor::diagnose_stuck_state(Runtime* runtime, int32_t thread_idx, DEV_ALWAYS("Progress: %d/%d tasks (%.1f%%)", completed, total, total > 0 ? completed * 100.0 / total : 0.0); - uint64_t aic_ready = 0, aiv_ready = 0; + uint64_t aic_ready = 0, aiv_ready = 0, aiv_x2_ready = 0, mixed_x1_ready = 0, mixed_x2_ready = 0; if (rt) { - PTO2SchedulerState* sched = &rt->scheduler; - aic_ready = sched->ready_queues[PTO2_WORKER_CUBE].size(); - aiv_ready = sched->ready_queues[PTO2_WORKER_VECTOR].size(); + aic_ready = sched->ready_queues[static_cast(PTO2ResourceShape::AIC_ONLY)].size(); + aiv_ready = sched->ready_queues[static_cast(PTO2ResourceShape::AIV_X1)].size(); + aiv_x2_ready = sched->ready_queues[static_cast(PTO2ResourceShape::AIV_X2)].size(); + mixed_x1_ready = sched->ready_queues[static_cast(PTO2ResourceShape::AIC_AIV_X1)].size(); + mixed_x2_ready = sched->ready_queues[static_cast(PTO2ResourceShape::AIC_AIV_X2)].size(); } - DEV_ALWAYS("Ready Queues: AIC=%lu, AIV=%lu", aic_ready, aiv_ready); + DEV_ALWAYS("Ready Queues: AIC=%lu, AIV=%lu, AIV_X2=%lu, AIC_AIV_X1=%lu, AIC_AIV_X2=%lu", + aic_ready, aiv_ready, aiv_x2_ready, mixed_x1_ready, mixed_x2_ready); int32_t busy_cores = 0; int32_t idle_cores = 0; @@ -1815,16 +2122,20 @@ void AicpuExecutor::diagnose_stuck_state(Runtime* runtime, int32_t thread_idx, uint64_t reg_val = read_reg(reg_addr, RegId::COND); int32_t reg_task_id = EXTRACT_TASK_ID(reg_val); int32_t reg_state = EXTRACT_TASK_STATE(reg_val); - int32_t task_id = executing_task_ids_[thread_idx][core_id]; + int32_t task_id = executing_reg_task_ids_[thread_idx][core_id]; if (reg_state != TASK_FIN_STATE || task_id >= 0) { busy_cores++; if (task_id >= 0) { - PTO2DispatchPayload* payload = &s_pto2_payload_per_core[core_id]; - DEV_ALWAYS(" Core %d [%s, BUSY]: COND=0x%lx (reg_task_id=%d, reg_state=%s), executing_task_id=%d, kernel_id=%d", + int32_t kernel_id = -1; + if (rt && rt->sm_handle && executing_slot_state_by_core_[core_id]) { + int32_t diag_slot = 
static_cast(executing_subslot_by_core_[core_id]); + kernel_id = executing_slot_state_by_core_[core_id]->task->kernel_id[diag_slot]; + } + DEV_ALWAYS(" Core %d [%s, BUSY]: COND=0x%lx (reg_task_id=%d, reg_state=%s), executing_reg_task_id=%d, kernel_id=%d", core_id, core_type_str, reg_val, reg_task_id, reg_state == TASK_FIN_STATE ? "FIN" : "ACK", - payload->task_id, payload->kernel_id); + task_id, kernel_id); } else { DEV_ALWAYS(" Core %d [%s, BUSY]: COND=0x%lx (reg_task_id=%d, reg_state=%s) but task_id not tracked", core_id, core_type_str, reg_val, reg_task_id, diff --git a/src/a5/runtime/tensormap_and_ringbuffer/docs/MULTI_RING.md b/src/a5/runtime/tensormap_and_ringbuffer/docs/MULTI_RING.md new file mode 100644 index 00000000..22de1070 --- /dev/null +++ b/src/a5/runtime/tensormap_and_ringbuffer/docs/MULTI_RING.md @@ -0,0 +1,236 @@ +# Multi-Ring Buffer Architecture + +> Extension to the PTO2 runtime. For the base architecture, see [RUNTIME_LOGIC.md](RUNTIME_LOGIC.md). + +## 1. Problem + +The single-ring design uses one `last_task_alive` watermark shared by HeapRing, TaskRing, and DepPool. When tasks from an inner scope (e.g., per-block iteration) complete, their resources cannot be reclaimed until **all** prior tasks — including those from the outer scope — also complete. This wastes ring capacity and can trigger deadlocks when ring sizes are small. + +## 2. Solution + +Split HeapRing, TaskRing, and DepPool into arrays of `PTO2_MAX_RING_DEPTH` (4) independent instances. Each scope depth maps to its own ring, with an independent `last_task_alive` watermark. + +``` +Scope depth 0 ──► rings[0] = { HeapRing, TaskRing, DepPool } +Scope depth 1 ──► rings[1] = { HeapRing, TaskRing, DepPool } +Scope depth 2 ──► rings[2] = { HeapRing, TaskRing, DepPool } +Scope depth ≥3 ──► rings[3] = { HeapRing, TaskRing, DepPool } (clamped) +``` + +Inner-scope tasks can now be reclaimed independently without waiting for outer-scope tasks to complete. + +## 3. 
Task ID Encoding + +Task IDs are widened from 32-bit to 64-bit to carry the ring identity: + +``` +mixed_task_id.raw = (ring_id << 32) | local_id +``` + +Helper functions in `pto_runtime2_types.h`: + +| Function | Purpose | +|----------|---------| +| `pto2_make_task_id(ring_id, local_id)` | Compose a 64-bit task ID (`PTO2TaskId`) | +| `pto2_task_id_ring(task_id)` | Extract `ring_id` (bits 63-32) | +| `pto2_task_id_local(task_id)` | Extract `local_id` (bits 31-0) | + +Type changes: + +| Field | Before | After | +|-------|--------|-------| +| `PTO2TaskDescriptor.mixed_task_id` | `int32_t` | `PTO2TaskId` | +| `PTO2TensorMapEntry.producer_task_id` | `int32_t` | `PTO2TaskId` | +| `PTO2TaskSlotState.ring_id` | N/A | `uint8_t` (new, denormalized for fast access) | + +## 4. Data Structures + +### 4.1 PTO2RingSet (new) + +Bundles the three per-ring resources into a single aggregate (`pto_ring_buffer.h`): + +```cpp +struct PTO2RingSet { + PTO2HeapRing heap_ring; + PTO2TaskRing task_ring; + PTO2DepListPool dep_pool; +}; +``` + +### 4.2 PTO2OrchestratorState (modified) + +```cpp +// Before: single ring +PTO2HeapRing heap_ring; +PTO2TaskRing task_ring; +PTO2DepListPool dep_pool; + +// After: per-ring array +PTO2RingSet rings[PTO2_MAX_RING_DEPTH]; +int32_t dep_pool_last_reclaimed[PTO2_MAX_RING_DEPTH]; +``` + +Ring selection: `current_ring_id() = min(scope_stack_top, PTO2_MAX_RING_DEPTH - 1)`. 
+
+### 4.3 PTO2SharedMemoryHeader (modified)
+
+Per-ring flow control and per-ring layout info are grouped together:
+
+```cpp
+struct PTO2RingFlowControl {
+    std::atomic<int32_t> current_task_index;  // task ring head
+    std::atomic<int64_t> last_task_alive;     // task ring tail
+    std::atomic<uint64_t> heap_top;           // heap alloc pointer
+    std::atomic<uint64_t> heap_tail;          // heap reclaim pointer
+};
+
+struct PTO2SharedMemoryRingHeader {
+    PTO2RingFlowControl fc;
+    uint64_t task_window_size;
+    uint64_t heap_size;
+    uint64_t task_descriptors_offset;
+};
+
+// In header:
+PTO2SharedMemoryRingHeader rings[PTO2_MAX_RING_DEPTH];
+```
+
+The global `heap_tail_gen` ticket counter is removed; each ring's scheduler state serializes ring-advance via a per-ring try-lock.
+
+### 4.4 PTO2SharedMemoryHandle (modified)
+
+Per-ring descriptor and payload arrays:
+
+```cpp
+PTO2TaskDescriptor* task_descriptors[PTO2_MAX_RING_DEPTH];
+PTO2TaskPayload* task_payloads[PTO2_MAX_RING_DEPTH];
+```
+
+### 4.5 PTO2SchedulerState (modified)
+
+```cpp
+struct RingSchedState {
+    PTO2TaskSlotState* slot_states;
+    int32_t task_window_size;
+    int32_t task_window_mask;
+    std::atomic<bool> advance_lock;
+};
+
+RingSchedState ring_sched_states[PTO2_MAX_RING_DEPTH];
+```
+
+### 4.6 PTO2TensorMap (modified)
+
+```cpp
+PTO2TensorMapEntry** task_entry_heads[PTO2_MAX_RING_DEPTH];
+int64_t last_task_alives[PTO2_MAX_RING_DEPTH];
+```
+
+Entry validity checks and `cleanup_retired` operate per-ring:
+
+```cpp
+bool entry_valid(const PTO2TensorMapEntry& e) {
+    int32_t ring = pto2_task_id_ring(e.producer_task_id);
+    int32_t local = pto2_task_id_local(e.producer_task_id);
+    return local >= last_task_alives[ring];
+}
+```
+
+### 4.7 Unchanged Structures
+
+| Structure | Reason |
+|-----------|--------|
+| `PTO2DepListEntry` | Stores `PTO2TaskSlotState*` pointer — naturally crosses ring boundaries |
+| `PTO2TaskPayload` | `fanin_slot_states[]` are pointers — no ring coupling |
+| `PTO2ReadyQueue` | Global ready queues shared across all rings (tasks ready to
dispatch regardless of origin ring) | +| `PTO2DispatchPayload` | Built per-dispatch, no ring state needed | + +## 5. Reclamation + +### 5.1 Per-Ring Watermark Advancement + +Each ring's `last_task_alive` advances independently: + +``` +advance_ring_pointers(ring_id): + la = rings[ring_id].fc.last_task_alive + while task_state[la & mask] >= CONSUMED: + advance heap_tail from packed_buffer_end + reset fanin_refcount + CAS(last_task_alive, la, la+1) + la++ +``` + +Per-ring try-locks in the scheduler state prevent concurrent scheduler threads from interleaving heap_tail writes within the same ring. + +### 5.2 Cross-Ring Dependencies + +Dependency edges use `PTO2TaskSlotState*` pointers, which naturally span rings: + +- Ring 1 task depends on ring 0 producer → ring 0's `fanout_head` linked list contains a ring 1 `PTO2TaskSlotState*` +- When ring 0 task completes, it walks its fanout list and decrements ring 1 consumers' `fanin_refcount` +- No special cross-ring logic needed — pointer-based design is ring-agnostic + +### 5.3 DepPool Reclamation + +``` +pto2_dep_pool_reclaim(ring_id): + la = rings[ring_id].fc.last_task_alive + newest_consumed = la - 1 + mark = task_payloads[ring_id][slot(newest_consumed)].dep_pool_mark + if mark > 0: + rings[ring_id].dep_pool.advance_tail(mark) +``` + +Note: dep entries from ring N's pool may appear in ring M's fanout lists. Reclamation is safe because the entries are accessed during fanout traversal (completion time), which always happens before the consumer task — and therefore the dep entry — becomes eligible for reclamation. + +## 6. AICPU Register Protocol Fix + +The AICore dispatch protocol uses 32-bit registers. With multi-ring, `mixed_task_id` truncation to 32-bit loses the `ring_id`, causing collisions: + +``` +Ring 0, local_id=0 → DATA_MAIN_BASE = 0 + 1 = 1 +Ring 1, local_id=0 → DATA_MAIN_BASE = 0 + 1 = 1 (collision!) 
+``` + +AICore uses `last_reg_val` to detect new dispatches — identical values cause skipped tasks and false completions from stale COND registers. + +**Fix**: Per-core monotonic dispatch counter `s_dispatch_seq[core_id]` replaces `mixed_task_id` in register writes, guaranteeing unique `DATA_MAIN_BASE` values per core regardless of ring origin. + +## 7. Configuration + +### 7.1 Compile-Time Defaults (per ring) + +| Constant | Default | Total (×4 rings) | +|----------|---------|-------------------| +| `PTO2_TASK_WINDOW_SIZE` | 16384 | 65536 | +| `PTO2_HEAP_SIZE` | 256 MB | 1 GB | +| `PTO2_DEP_LIST_POOL_SIZE` | 16384 | 65536 | + +### 7.2 Runtime Environment Overrides + +Uniform (applies to all rings): + +``` +PTO2_RING_TASK_WINDOW=1024 +PTO2_RING_HEAP=1048576 +PTO2_RING_DEP_POOL=1024 +``` + +In `kernel_config.py`: + +```python +RUNTIME_ENV = { + "PTO2_RING_TASK_WINDOW": "128", + "PTO2_RING_HEAP": "262144", + "PTO2_RING_DEP_POOL": "256", +} +``` + +### 7.3 Sizing Guidelines + +- `task_window` must be ≥ max tasks in any single scope + headroom for concurrent scopes +- `heap` must accommodate peak output buffer allocation across all in-flight tasks on that ring +- `dep_pool` must be ≥ total dependency entries for all in-flight tasks on that ring +- On hardware, back-pressure latency is higher than in simulation — size conservatively +- Adding inner `PTO2_SCOPE` reduces peak per-ring usage, enabling smaller sizes diff --git a/src/a5/runtime/tensormap_and_ringbuffer/docs/RUNTIME_LOGIC.md b/src/a5/runtime/tensormap_and_ringbuffer/docs/RUNTIME_LOGIC.md index 5e9455c2..d2b7b981 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/docs/RUNTIME_LOGIC.md +++ b/src/a5/runtime/tensormap_and_ringbuffer/docs/RUNTIME_LOGIC.md @@ -61,6 +61,7 @@ The primary production runtime. 
Uses ring buffers for task slots and output memo - **Memory**: GM Heap ring for output buffer allocation - **Dependencies**: automatically derived from tensor read/write patterns via TensorMap - **Thread model**: 3 scheduler threads + 1 orchestrator thread on AICPU +- **Multi-ring**: HeapRing, TaskRing, and DepPool are split into `PTO2_MAX_RING_DEPTH` (4) independent instances for nested scope isolation. See [MULTI_RING.md](MULTI_RING.md) for details. - **Use case**: production workloads; supports streaming, flow control, and large batch sizes --- @@ -102,7 +103,7 @@ Two platform implementations exist under `src/platform/`, sharing a common inter ## 3. Shared Memory Layout -The orchestrator and schedulers communicate through a contiguous shared memory region in Global Memory (GM): +The orchestrator and schedulers communicate through a contiguous shared memory region in Global Memory (GM). Each ring level has its own TaskDescriptor and DepListPool sections. See [MULTI_RING.md §4.3–4.4](MULTI_RING.md) for the per-ring shared memory header and handle layout. ``` ┌─────────────────────────────┐ offset 0 @@ -146,6 +147,8 @@ Alignment is 64 bytes (`PTO2_ALIGN_SIZE`). ## 4. Ring Buffer Mechanisms +> **Multi-ring extension**: All three ring buffers (TaskRing, HeapRing, DepPool) are replicated per scope depth. Each ring level has independent watermarks and reclamation. See [MULTI_RING.md](MULTI_RING.md) for details. + ### 4.1 Task Ring The task ring manages task slot allocation with back-pressure flow control. @@ -318,23 +321,31 @@ When `pto2_submit_task` processes parameters: ## 6. 
Task Descriptor and States
 
-### 6.1 PTO2TaskDescriptor
+### 6.1 PTO2TaskDescriptor (Hot Path)
 
 | Field | Description |
 |-------|-------------|
-| `task_id` | Monotonically increasing ID |
-| `kernel_id` | Function ID (maps to compiled kernel binary) |
-| `worker_type` | CUBE (AIC), VECTOR (AIV), AI_CPU, or ACCELERATOR |
-| `fanin_head` | Head of fanin dependency list (pointer into DepListPool) |
+| `mixed_task_id` | Canonical mixed-task ID (64-bit: `ring_id << 32 \| local_id`). See [MULTI_RING.md §3](MULTI_RING.md). |
+| `kernel_id[3]` | Per-slot kernel IDs: `[AIC, AIV0, AIV1]`; `INVALID_KERNEL_ID` = inactive |
+| `active_mask` | Bitmask of active subtask slots: `bit0=AIC`, `bit1=AIV0`, `bit2=AIV1` |
+| `subtask_done_mask` | Atomic bitmask; each subtask sets its done bit on completion |
 | `fanin_count` | Number of producer dependencies |
 | `fanout_lock` | Per-task spinlock for concurrent fanout modification |
 | `fanout_head` | Head of fanout consumer list (pointer, protected by `fanout_lock`) |
 | `fanout_count` | 1 (scope ref) + number of consumers |
 | `packed_buffer_base` | Start of packed buffer in GM Heap |
 | `packed_buffer_end` | End of packed buffer (for heap reclamation) |
-| `is_active` | Task slot is in use |
-| `params[16]` | Tensor and scalar parameters (`PTOParam` array) |
+
+### 6.1b PTO2TaskPayload (Cold Path)
+
+| Field | Description |
+|-------|-------------|
+| `tensors[16]` | Tensor descriptors for parameters |
+| `scalar_value[16]` | Scalar parameter values |
+| `is_tensor[16]` | Whether each parameter is tensor or scalar |
 | `param_count` | Number of valid parameters |
+| `fanin_slot_states[]` | Producer slot state pointers (used by `on_task_release`) |
+| `fanin_actual_count` | Actual fanin count |
 
 ### 6.2 Task State Machine
 
@@ -361,7 +372,7 @@ In the scheduler's `task_state[]` array (`std::atomic`):
 
 The orchestrator runs on AICPU Thread 3 and builds the task graph by calling the user-provided orchestration function.
Key members: -- `task_ring`, `heap_ring`, `dep_pool`: ring buffer state +- `rings[PTO2_MAX_RING_DEPTH]`: per-ring `PTO2RingSet` (HeapRing + TaskRing + DepPool). See [MULTI_RING.md §4.2](MULTI_RING.md). - `tensor_map`, `tensor_pool`: dependency tracking - `scope_tasks[]`, `scope_begins[]`, `scope_stack_top`: scope nesting stack (flat buffer partitioned by level) - `scheduler`: pointer to scheduler state (for simulated mode or `init_task_on_submit`) @@ -406,8 +417,8 @@ Scopes control the lifetime of intermediate buffers. Each scope: ```cpp PTO2_SCOPE(rt) { // Tasks submitted here belong to this scope - pto2_rt_submit_task(rt, FUNC_QK, PTO2_WORKER_CUBE, params, n); - pto2_rt_submit_task(rt, FUNC_SF, PTO2_WORKER_VECTOR, params, n); + pto2_rt_submit_aic_task(rt, FUNC_QK, params, n); + pto2_rt_submit_aiv_task(rt, FUNC_SF, params, n); } // scope_end: scope reference released from all tasks above ``` @@ -435,11 +446,11 @@ Each scheduler thread runs a tight loop with two main phases: **Phase 1 — Completion Handling**: - Poll register `COND` on each managed core -- When `TASK_FIN_STATE` detected: record completion timestamps, mark `task_state[slot] = COMPLETED`, acquire fanout lock, traverse fanout list (incrementing consumers' `fanin_refcount`), mark `task_state[slot] = CONSUMED`, advance `last_task_alive` watermark +- When `TASK_FIN_STATE` detected: record completion timestamps, call `on_subtask_complete(mixed_task_id, subslot)` to set the done bit; when `subtask_done_mask == active_mask`, trigger `on_mixed_task_complete(mixed_task_id)` which marks `task_state[slot] = COMPLETED`, acquires fanout lock, traverses fanout list (incrementing consumers' `fanin_refcount`), marks `task_state[slot] = CONSUMED`, and advances `last_task_alive` watermark **Phase 2 — Dispatch**: -- For each idle core: pop a task from the ready queue (lock-free MPMC Vyukov queue, one per worker type) -- Build `PTO2DispatchPayload` from `TaskDescriptor` +- For each idle core: pop a task from the matching 
shape-based ready queue (lock-free MPMC Vyukov queue, one per resource shape) +- Build `PTO2DispatchPayload` from `TaskDescriptor` with `mixed_task_id`, `subslot`, `kernel_id`, and `core_type` - Write task pointer to `Handshake.task`, signal AICore via register `DATA_MAIN_BASE` After these phases, the scheduler updates profiling headers and checks for termination (all tasks completed and orchestrator done). @@ -448,9 +459,9 @@ After these phases, the scheduler updates profiling headers and checks for termi Ready queues use a lock-free bounded MPMC (Vyukov) design: -- One `PTO2ReadyQueue` per worker type (4 types: CUBE, VECTOR, AI_CPU, ACCELERATOR) -- **Push**: any thread (orchestrator via `init_task`, or scheduler on completion) pushes newly-ready tasks -- **Pop**: scheduler threads pop from the queue matching the idle core's worker type +- One `PTO2ReadyQueue` per resource shape (5 shapes: `AIC_ONLY`, `AIV_X1`, `AIV_X2`, `AIC_AIV_X1`, `AIC_AIV_X2`) +- **Push**: any thread (orchestrator via `init_task`, or scheduler on completion) pushes newly-ready tasks to the queue matching `pto2_active_mask_to_shape(task->active_mask)` +- **Pop**: scheduler threads pop from the queue matching the idle core's resource shape - Per-slot sequence counters prevent ABA problems - `enqueue_pos` and `dequeue_pos` are on separate cache lines to avoid false sharing @@ -485,7 +496,9 @@ Each AICore worker has a `Handshake` struct in shared memory: ### 9.2 Register-Based Dispatch -Instead of polling `Handshake.task_status`, the production protocol uses hardware registers: +Instead of polling `Handshake.task_status`, the production protocol uses hardware registers. + +> **Multi-ring note**: `mixed_task_id` is 64-bit but registers are 32-bit. A per-core monotonic dispatch counter (`s_dispatch_seq`) replaces `mixed_task_id` in register writes to prevent collisions. See [MULTI_RING.md §6](MULTI_RING.md). 
| Register | Direction | Usage | |----------|-----------|-------| @@ -505,8 +518,10 @@ Built by the scheduler from `PTO2TaskDescriptor`: | Field | Description | |-------|-------------| -| `task_id` | Task identifier | -| `kernel_id` | Function ID | +| `mixed_task_id` | Mixed-task identifier (for completion aggregation) | +| `subslot` | Which subtask slot this dispatch represents (`AIC`, `AIV0`, or `AIV1`) | +| `kernel_id` | Function ID for this subtask slot | +| `core_type` | AIC or AIV | | `function_bin_addr` | GM address of compiled kernel binary | | `num_args` | Number of arguments | | `args[]` | Tensor addresses and scalar values | @@ -557,7 +572,9 @@ The orchestration API is defined in `pto_orchestration_api.h`. Orchestration cod | Function/Macro | Purpose | |----------------|---------| -| `pto2_rt_submit_task(rt, kernel_id, worker_type, params, n)` | Submit a task with parameters | +| `pto2_rt_submit_task(rt, mixed_kernels, params, n)` | Submit a mixed task with `MixedKernels` struct | +| `pto2_rt_submit_aic_task(rt, kernel_id, params, n)` | Convenience: submit AIC-only task | +| `pto2_rt_submit_aiv_task(rt, kernel_id, params, n)` | Convenience: submit AIV-only task | | `PTO2_SCOPE(rt) { ... }` | RAII scope for buffer lifetime | | `pto2_rt_orchestration_done(rt)` | Signal orchestration complete | | `pto2_rt_init_tensor_pool(rt)` | Initialize tensor pool for `make_tensor()` | @@ -573,14 +590,17 @@ The orchestration API is defined in `pto_orchestration_api.h`. 
Orchestration cod | `make_inout_param(tensor)` | INOUT parameter — read then written | | `make_scalar_param(value)` | 64-bit scalar parameter | -### 11.3 Worker Types +### 11.3 Resource Shapes + +Tasks are queued by resource shape, which is derived from the `active_mask` in the `MixedKernels` struct: -| Type | Target | -|------|--------| -| `PTO2_WORKER_CUBE` | AIC cores (matrix multiplication) | -| `PTO2_WORKER_VECTOR` | AIV cores (vector operations) | -| `PTO2_WORKER_AI_CPU` | AICPU (scalar ops, control flow) | -| `PTO2_WORKER_ACCELERATOR` | Fixed-function accelerators (DMA, etc.) | +| Shape | Active Mask | Description | +|-------|-------------|-------------| +| `AIC_ONLY` | AIC only | AIC cores (matrix multiplication) | +| `AIV_X1` | AIV0 or AIV1 only | Single AIV core (vector operations) | +| `AIV_X2` | AIV0 + AIV1 | Two AIV cores | +| `AIC_AIV_X1` | AIC + one AIV | AIC + single AIV core | +| `AIC_AIV_X2` | AIC + AIV0 + AIV1 | Full cluster (AIC + two AIV cores) | ### 11.4 Orchestration Export Interface diff --git a/src/a5/runtime/tensormap_and_ringbuffer/docs/SUBMIT_BY_CLUSTER.md b/src/a5/runtime/tensormap_and_ringbuffer/docs/SUBMIT_BY_CLUSTER.md new file mode 100644 index 00000000..72619284 --- /dev/null +++ b/src/a5/runtime/tensormap_and_ringbuffer/docs/SUBMIT_BY_CLUSTER.md @@ -0,0 +1,226 @@ +# Submit by Cluster - Requirements and Main-Branch-Aligned Design + +## 1. Goal + +Define a single, main-branch-aligned specification for PTO2 cluster submission that combines: + +1. Product requirements (what must be true). +2. Runtime design (how it is implemented on current main baseline). + +The target model is: one submitted graph node is one `MixedTask`, and dispatch/completion is mixed-task-granular. + +## 2. Background and Motivation + +Future Ascend hardware is expected to provide stronger locality within an AICore cluster (`1 AIC + 2 AIV`). +The runtime therefore needs a "submit together, run together" model for related AIC/AIV kernels. 
+ +Legacy per-task submit (`kernel_id + worker_type`) cannot express atomic co-dispatch of multiple kernels to one cluster. + +## 3. Scope + +### In Scope + +1. New orchestration-facing submit API for cluster-aware mixed submission. +2. Runtime/backend scheduler and executor changes to treat a mixed submit as one atomic scheduling unit. +3. Dependency gating, readiness, dispatch, completion, and reclamation at mixed-task granularity. +4. AIV slot equivalence (`AIV0` and `AIV1` are equivalent execution targets). + +### Out of Scope + +1. User-facing cluster pinning (`allocate_cluster/free_cluster`-style APIs). +2. New worker types beyond AIC/AIV. +3. Cross-cluster user placement policies. +4. Hardware topology changes beyond `1 AIC + 2 AIV` per cluster. + +## 4. Main-Branch Baseline Constraints + +Design must preserve the current main runtime architecture: + +1. Multi-orchestrator runtime wiring (`orchestrators[]`, `orch_count`, thread-local `pto2_current_orch_idx`). +2. Executor threading split (orchestrator threads vs scheduler threads), and post-orchestrator transition (`transition_requested_` + `reassign_cores_for_all_threads()`). +3. Shared-memory hot/cold split (`PTO2TaskDescriptor` hot + `PTO2TaskPayload` cold). + +## 5. Terminology + +1. `cluster`: one physical unit with `1 AIC + 2 AIV`. +2. `MixedKernels`: 3 submit slots (`AIC`, `AIV0`, `AIV1`) with `INVALID_KERNEL_ID` for inactive slots. +3. `MixedTask`: one runtime graph node created by one submit call. +4. `active_mask`: bitmask of active subtask slots. +5. `resource shape`: normalized lane demand class of a mixed task. + +## 6. 
API Contract + +```cpp +inline constexpr int32_t INVALID_KERNEL_ID = -1; + +struct MixedKernels { + int32_t aic_kernel_id{INVALID_KERNEL_ID}; + int32_t aiv0_kernel_id{INVALID_KERNEL_ID}; + int32_t aiv1_kernel_id{INVALID_KERNEL_ID}; +}; + +static inline void pto2_rt_submit_task(PTO2Runtime* rt, + const MixedKernels& mixed_kernels, + PTOParam* params, + int32_t num_params); + +static inline void pto2_rt_submit_aic_task(PTO2Runtime* rt, + int32_t kernel_id, + PTOParam* params, + int32_t num_params); + +static inline void pto2_rt_submit_aiv_task(PTO2Runtime* rt, + int32_t kernel_id, + PTOParam* params, + int32_t num_params); +``` + +Rules: + +1. One submit call creates one `MixedTask`. +2. All active slots share the same `params` and `num_params`. +3. At least one slot must be active. +4. `aiv0_kernel_id` and `aiv1_kernel_id` are semantically equivalent. +5. Wrappers are orchestration sugar only (inline in orchestration API); no dedicated runtime ops entries. +6. Submit-contract types are defined once in a shared header-only submit-types surface consumed by orchestration and runtime headers. +7. Invalid submits follow existing PTO2 behavior (`always_assert`), not a new recoverable return-code API. + +## 7. Data Model (Requirements + Design) + +`PTO2TaskDescriptor` (hot path) carries mixed-task identity/state: + +1. `mixed_task_id` +2. `active_mask` +3. `subtask_done_mask` +4. `kernel_id[3]` for `(AIC, AIV0, AIV1)` +5. dependency heads/counters and packed-buffer metadata + +`PTO2TaskPayload` (cold path) carries: + +1. shared params/tensors/scalars copied once per mixed submit +2. fanin mixed-task IDs +3. other cold-path submit metadata + +Producer identity in TensorMap is mixed-task ID end-to-end. + +## 8. Scheduling Model + +### 8.1 Resource Shapes + +Runtime uses shape-based ready queues (not worker-type queues): + +1. `AIC_ONLY` +2. `AIV_X1` +3. `AIV_X2` +4. `AIC_AIV_X1` +5. `AIC_AIV_X2` + +Queueing key is normalized resource shape (not raw slot label). 
+ +### 8.2 Atomic Cluster Dispatch + +1. Dispatch decision unit is one mixed task. +2. For multi-slot mixed tasks, partial launch is forbidden. +3. A mixed task is dispatchable only when one local owned cluster can satisfy all required lanes. +4. Compatible mixed tasks may co-reside over time if they use disjoint free lanes. + +### 8.3 Dependency and Completion + +1. Fanin release/readiness remains dependency-correct and graph-level. +2. Two-stage completion: + - `on_subtask_complete(mixed_task_id, subslot)` + - `on_mixed_task_complete(mixed_task_id)` only when `subtask_done_mask == active_mask` +3. Downstream release is triggered once per mixed task completion, not once per subslot. + +## 9. Executor Ownership and Numbering + +### 9.1 Canonical Flattened Numbering (Unchanged) + +Given `block_dim` clusters: + +1. AIC IDs: `[0, block_dim)` +2. AIV IDs: `[block_dim, 3 * block_dim)` +3. Cluster `i`: `{i, block_dim + i, 2 * block_dim + i}` + +This project-defined flattened numbering is kept unchanged. + +### 9.2 Cluster Ownership + +1. One cluster must be owned by one scheduler domain/thread at a time. +2. No split-cluster ownership in either: + - initial `assign_cores_to_threads()` + - post-orchestrator `reassign_cores_for_all_threads()` +3. Lane occupancy bookkeeping must remain consistent with ownership after reassignment. + +## 10. Functional Requirements + +### 10.1 Valid Mixed Shapes + +1. AIC only +2. AIV only (1 or 2 AIV lanes) +3. AIC + 1 AIV +4. AIC + 2 AIV + +### 10.2 Runtime Behavior per Submit + +1. Validate submit arguments. +2. Allocate mixed-task ID and initialize descriptor/payload once. +3. Build fanin/fanout at mixed-task granularity. +4. Enqueue by shape when ready. +5. Dispatch all active lanes atomically when resources allow. +6. Aggregate completion and release downstream once. + +## 11. Non-Functional Requirements + +1. Correctness: no dependency violation, no partial mixed-task dispatch. +2. 
Determinism: dependency-correct ordering preserved; AIV lane choice may vary but remains semantically equivalent. +3. Fairness: resource-aware polling heuristic is allowed; strict starvation-free guarantee across all shapes is not required. +4. Performance: no obvious regression for non-cluster workflows. +5. Observability: lifecycle visibility for submit/ready/dispatch/block/complete. + +## 12. Acceptance Criteria + +Feature is accepted when: + +1. Orchestration compiles and submits via `MixedKernels` API/wrappers. +2. Scheduler dispatches each mixed task as one cluster scheduling decision. +3. Dependencies gate mixed-task readiness correctly. +4. AIV execution remains cluster-local and semantically equivalent across lanes. +5. Existing non-cluster workflows continue to pass without behavior regression. +6. Cluster ownership is never split across scheduler domains before/after transition. + +## 13. Verification Matrix + +Recommended validation coverage: + +1. Mapping correctness for cluster-to-core ID relation. +2. Atomic dispatch for multi-slot shapes. +3. Dependency gating and completion aggregation (`done_mask == active_mask`). +4. Lane-occupancy co-residency behavior for compatible shapes. +5. Multi-orchestrator and core-transition ownership stability. +6. Invalid submit handling (`always_assert` path). +7. Regression coverage for existing examples/tests. + +Milestone command (device): + +```bash +python examples/scripts/run_example.py \ + -k tests/device_tests/tensormap_and_ringbuffer/batch_paged_attention/kernels \ + -g tests/device_tests/tensormap_and_ringbuffer/batch_paged_attention/golden.py \ + -p a2a3 -d 9 +``` + +Final validation: + +```bash +./ci.sh +``` + +## 14. Resolved Decisions + +1. Legacy orchestration-facing single-task submit is replaced by mixed submit contract. +2. Invalid mixed submits fail with existing submit-time assert behavior. +3. Per-cluster concurrent capacity is lane-occupancy-driven, not a fixed constant. +4. 
Submit-contract types live in one shared header-only surface. +5. Resource-aware dispatch heuristics are allowed without a strict starvation-free guarantee. + diff --git a/src/a5/runtime/tensormap_and_ringbuffer/docs/device_log_profiling.md b/src/a5/runtime/tensormap_and_ringbuffer/docs/device_log_profiling.md index 3b23d7f7..c619f36a 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/docs/device_log_profiling.md +++ b/src/a5/runtime/tensormap_and_ringbuffer/docs/device_log_profiling.md @@ -110,9 +110,9 @@ The scheduler loop runs four phases each iteration. Each phase's time is accumul | Phase | What it does | Inline stats | |-------|-------------|-------------| -| **complete** | Polls handshake on each managed core; when a core completes, traverses fanout list (notify consumers) and fanin list (release producers) via `on_task_complete` | `fanout`: edges/max_degree/avg for consumer notification; `fanin`: edges/max_degree/avg for producer release | +| **complete** | Polls handshake on each managed core; when a core completes, calls `on_subtask_complete(mixed_task_id, subslot)` to set the done bit; when `subtask_done_mask == active_mask`, triggers `on_mixed_task_complete` which traverses fanout list (notify consumers) and fanin list (release producers) | `fanout`: edges/max_degree/avg for consumer notification; `fanin`: edges/max_degree/avg for producer release | | **scan** | Updates the perf profiling header with latest scheduler state | — | -| **dispatch** | For each idle core, pops a task from the ready queue via `pto2_scheduler_get_ready_task`, builds the dispatch payload, and writes the task to the core's handshake register | `pop`: `hit` = successful pops (task dispatched), `miss` = empty queue pops, `hit_rate` = hit/(hit+miss) | +| **dispatch** | For each idle core, pops a task from the shape-based ready queue via `get_ready_task(shape)`, builds the dispatch payload, and writes the task to the core's handshake register | `pop`: `hit` = successful pops (task 
dispatched), `miss` = empty queue pops, `hit_rate` = hit/(hit+miss) | | **idle** | Scheduler loop iteration where no progress was made (no completions, no dispatches) | — | **Interpreting phase percentages:** diff --git a/src/a5/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md b/src/a5/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md index 2b4d8a9e..47a65ac7 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md +++ b/src/a5/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md @@ -9,10 +9,12 @@ PTO Runtime2 uses a hierarchical profiling system with compile-time macros to co ## Profiling Macro Hierarchy ``` -PTO2_PROFILING (base level, default=0) +PTO2_PROFILING (base level, default=1) ├── PTO2_ORCH_PROFILING (orchestrator, default=0, requires PTO2_PROFILING=1) +| └──PTO2_TENSORMAP_PROFILING (tensormap, default=0, requires PTO2_ORCH_PROFILING=1) ├── PTO2_SCHED_PROFILING (scheduler, default=0, requires PTO2_PROFILING=1) -└── PTO2_TENSORMAP_PROFILING (tensormap, default=0, requires PTO2_PROFILING=1) +└── --enable-profiling (Dump profiling merged swimlane json file for visualization, requires PTO2_PROFILING=1) + ``` ### Compile-Time Validation @@ -28,8 +30,8 @@ Each sub-level macro requires `PTO2_PROFILING=1`: #error "PTO2_SCHED_PROFILING requires PTO2_PROFILING=1" #endif -#if PTO2_TENSORMAP_PROFILING && !PTO2_PROFILING -#error "PTO2_TENSORMAP_PROFILING requires PTO2_PROFILING=1" +#if PTO2_TENSORMAP_PROFILING && !PTO2_ORCH_PROFILING +#error "PTO2_TENSORMAP_PROFILING requires PTO2_ORCH_PROFILING=1" #endif ``` @@ -194,7 +196,7 @@ runtime->enable_profiling = true; ## Common Profiling Configurations -### Development (default) +### Development (minimal overhead) ```bash # No profiling overhead PTO2_PROFILING=0 diff --git a/src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp b/src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp index c9238d89..ae22d562 100644 --- 
a/src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp @@ -265,10 +265,12 @@ extern "C" int init_runtime_impl(Runtime *runtime, { runtime->pto2_task_window_size = parse_env_uint64("PTO2_RING_TASK_WINDOW", 4, true); runtime->pto2_heap_size = parse_env_uint64("PTO2_RING_HEAP", 1024, true); - if (runtime->pto2_task_window_size || runtime->pto2_heap_size) { - LOG_INFO("Ring buffer overrides: task_window=%lu heap=%lu", + runtime->pto2_dep_pool_size = parse_env_uint64("PTO2_RING_DEP_POOL", 4, false); + if (runtime->pto2_task_window_size || runtime->pto2_heap_size || runtime->pto2_dep_pool_size) { + LOG_INFO("Ring buffer overrides: task_window=%lu heap=%lu dep_pool=%lu", (unsigned long)(runtime->pto2_task_window_size ? runtime->pto2_task_window_size : PTO2_TASK_WINDOW_SIZE), - (unsigned long)(runtime->pto2_heap_size ? runtime->pto2_heap_size : PTO2_HEAP_SIZE)); + (unsigned long)(runtime->pto2_heap_size ? runtime->pto2_heap_size : PTO2_HEAP_SIZE), + (unsigned long)(runtime->pto2_dep_pool_size ? runtime->pto2_dep_pool_size : PTO2_DEP_LIST_POOL_SIZE)); } } @@ -276,15 +278,16 @@ extern "C" int init_runtime_impl(Runtime *runtime, uint64_t eff_heap_size = runtime->pto2_heap_size ? runtime->pto2_heap_size : PTO2_HEAP_SIZE; uint64_t eff_task_window_size = runtime->pto2_task_window_size ? 
runtime->pto2_task_window_size : PTO2_TASK_WINDOW_SIZE; - // Allocate GM heap for orchestrator output buffers + // Allocate GM heap for orchestrator output buffers (all rings combined) + uint64_t total_heap_size = eff_heap_size * PTO2_MAX_RING_DEPTH; long long t_heap_start = _now_ms(); - void* gm_heap = runtime->host_api.device_malloc(eff_heap_size); + void* gm_heap = runtime->host_api.device_malloc(total_heap_size); long long t_heap_end = _now_ms(); if (gm_heap == nullptr) { LOG_ERROR("Failed to allocate GM heap"); return -1; } - runtime->record_tensor_pair(nullptr, gm_heap, eff_heap_size); + runtime->record_tensor_pair(nullptr, gm_heap, total_heap_size); runtime->set_pto2_gm_heap(gm_heap); // Allocate PTO2 shared memory diff --git a/src/a5/runtime/tensormap_and_ringbuffer/orchestration/pto_orchestration_api.h b/src/a5/runtime/tensormap_and_ringbuffer/orchestration/pto_orchestration_api.h index ee54cbd2..ff7d2b18 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/orchestration/pto_orchestration_api.h +++ b/src/a5/runtime/tensormap_and_ringbuffer/orchestration/pto_orchestration_api.h @@ -22,18 +22,9 @@ #include // Type headers needed by orchestration -#include "pto_types.h" // PTOParam, make_input_param, make_output_param, etc. 
+#include "pto_types.h" // PTOParam, PTOTensorEntry, PTOParamType #include "tensor.h" // Tensor, make_tensor, make_tensor_external - -// Worker type constants (duplicated from pto_runtime2_types.h to avoid -// pulling in the full types header with its internal structures) -typedef enum { - PTO2_WORKER_CUBE = 0, - PTO2_WORKER_VECTOR = 1, - PTO2_WORKER_AI_CPU = 2, - PTO2_WORKER_ACCELERATOR = 3, - PTO2_NUM_WORKER_TYPES = 4 -} PTO2WorkerType; +#include "pto_submit_types.h" // MixedKernels, INVALID_KERNEL_ID, subtask slots // ============================================================================= // Ops Table and Opaque Runtime @@ -51,12 +42,12 @@ typedef struct PTO2Runtime PTO2Runtime; * Populated by the runtime; called by orchestration through inline wrappers. */ typedef struct PTO2RuntimeOps { - void (*submit_task)(PTO2Runtime* rt, int32_t kernel_id, - PTO2WorkerType worker_type, - PTOParam* params, int32_t num_params); + void (*submit_task)(PTO2Runtime* rt, const MixedKernels& mixed_kernels, + const PTOParam& params); void (*scope_begin)(PTO2Runtime* rt); void (*scope_end)(PTO2Runtime* rt); void (*orchestration_done)(PTO2Runtime* rt); + bool (*is_fatal)(PTO2Runtime* rt); // Logging (populated by runtime, called by orchestration) void (*log_error)(const char* func, const char* fmt, ...); @@ -81,10 +72,29 @@ struct PTO2Runtime { // Inline Convenience Wrappers (call through ops table) // ============================================================================= -static inline void pto2_rt_submit_task(PTO2Runtime* rt, int32_t kernel_id, - PTO2WorkerType worker_type, - PTOParam* params, int32_t num_params) { - rt->ops->submit_task(rt, kernel_id, worker_type, params, num_params); +static inline void pto2_rt_submit_task(PTO2Runtime* rt, const MixedKernels& mixed_kernels, + const PTOParam& params) { + rt->ops->submit_task(rt, mixed_kernels, params); +} + +/** + * Convenience wrapper: submit an AIC-only task. 
+ */ +static inline void pto2_rt_submit_aic_task(PTO2Runtime* rt, int32_t kernel_id, + const PTOParam& params) { + MixedKernels mk; + mk.aic_kernel_id = kernel_id; + rt->ops->submit_task(rt, mk, params); +} + +/** + * Convenience wrapper: submit an AIV-only task (uses AIV0 slot). + */ +static inline void pto2_rt_submit_aiv_task(PTO2Runtime* rt, int32_t kernel_id, + const PTOParam& params) { + MixedKernels mk; + mk.aiv0_kernel_id = kernel_id; + rt->ops->submit_task(rt, mk, params); } static inline void pto2_rt_scope_begin(PTO2Runtime* rt) { @@ -99,6 +109,10 @@ static inline void pto2_rt_orchestration_done(PTO2Runtime* rt) { rt->ops->orchestration_done(rt); } +static inline bool pto2_rt_is_fatal(PTO2Runtime* rt) { + return rt->ops->is_fatal(rt); +} + // ============================================================================= // Logging Macros for Orchestration (call through ops table) // ============================================================================= diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto2_dispatch_payload.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto2_dispatch_payload.h index 8d9abfdc..94f2da37 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto2_dispatch_payload.h +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto2_dispatch_payload.h @@ -1,9 +1,13 @@ /** * @file pto2_dispatch_payload.h - * @brief Handshake dispatch payload aligned with runtime2 PTO2TaskDescriptor + * @brief Minimal dispatch payload for AICore kernel execution * - * Shared between AICPU (pack from PTO2TaskDescriptor) and AICore (unpack to run kernel). - * When merging runtime2 into rt2, Handshake.task points to PTO2DispatchPayload. + * Shared between AICPU (builds in-place) and AICore (reads to run kernel). + * Handshake.task points to PTO2DispatchPayload embedded in PTO2TaskPayload. + * + * Only contains fields AICore needs to execute: function address + arguments. 
+ * Metadata (task_id, kernel_id, core_type) lives in PTO2TaskDescriptor and + * is accessed by AICPU when needed (profiling, diagnostics). */ #ifndef RT2_PTO2_DISPATCH_PAYLOAD_H_ @@ -11,24 +15,19 @@ #include -#include "common/core_type.h" - /** Max arguments per task; must match RUNTIME_MAX_ARGS and PTO2_MAX_OUTPUTS */ #ifndef PTO2_DISPATCH_MAX_ARGS #define PTO2_DISPATCH_MAX_ARGS 128 #endif /** - * Dispatch payload: execution-relevant fields from PTO2TaskDescriptor. - * AICPU packs this from PTO2TaskDescriptor; AICore unpacks to run kernel. + * Dispatch payload: minimal execution interface for AICore. + * Layout: function_bin_addr followed by args[]. + * AICore reads function_bin_addr, casts to UnifiedKernelFunc, calls with args. */ struct PTO2DispatchPayload { - int32_t task_id; /**< Task ID (for completion_queue) */ - int32_t kernel_id; /**< InCore function id (debug/trace) */ - CoreType core_type; /**< AIC or AIV */ uint64_t function_bin_addr; /**< Kernel entry in GM: (UnifiedKernelFunc)function_bin_addr */ - int32_t num_args; /**< Number of valid args[] */ - uint64_t args[PTO2_DISPATCH_MAX_ARGS]; /**< Kernel arguments (GM pointers) */ + uint64_t args[PTO2_DISPATCH_MAX_ARGS]; /**< Kernel arguments (GM pointers + scalars) */ }; #endif // RT2_PTO2_DISPATCH_PAYLOAD_H_ diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp index b5251fd7..d62e0f9a 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp @@ -16,6 +16,7 @@ #include "common/unified_log.h" #include "pto_runtime2_types.h" +#include "pto_shared_memory.h" #include "pto_tensormap.h" #include "pto_types.h" #include "tensor.h" @@ -23,7 +24,7 @@ // ============================================================================= // Orchestrator Profiling (compile-time toggle) // 
============================================================================= -#if PTO2_PROFILING +#if PTO2_ORCH_PROFILING #include "aicpu/device_time.h" #include "aicpu/performance_collector_aicpu.h" // Weak fallback for builds that don't link device_time.cpp (e.g. host). @@ -43,7 +44,7 @@ __attribute__((weak, visibility("hidden"))) uint64_t get_sys_cnt_aicpu() { retur // Also hidden to prevent HOST .so from polluting the global symbol table. __attribute__((weak, visibility("hidden"))) void perf_aicpu_record_orch_phase( AicpuPhaseId, uint64_t, uint64_t, uint32_t, uint32_t) {} -// Accumulated nanoseconds per sub-step +// Accumulated cycles per sub-step (only needed for ORCH_PROFILING export) static uint64_t g_orch_sync_cycle = 0; // tensormap sync static uint64_t g_orch_alloc_cycle = 0; // task ring alloc static uint64_t g_orch_params_cycle = 0; // param copy @@ -54,7 +55,6 @@ static uint64_t g_orch_fanin_cycle = 0; // fanin list + early-return check static uint64_t g_orch_scope_end_cycle = 0; // scope_end overhead static int64_t g_orch_submit_count = 0; static uint32_t g_orch_submit_idx = 0; -#if PTO2_ORCH_PROFILING uint64_t g_orch_alloc_wait_cycle = 0; uint64_t g_orch_heap_wait_cycle = 0; uint64_t g_orch_fanin_wait_cycle = 0; @@ -64,21 +64,35 @@ uint64_t g_orch_heap_atomic_count = 0; uint64_t g_orch_fanin_atomic_count = 0; uint64_t g_orch_finalize_atomic_count = 0; uint64_t g_orch_scope_end_atomic_count = 0; -#elif PTO2_SCHED_PROFILING -// When only PTO2_SCHED_PROFILING is enabled, shared methods still need -// orch counters as targets for orchestrator-context calls. 
-uint64_t g_orch_fanin_atomic_count = 0; -uint64_t g_orch_fanin_wait_cycle = 0; -uint64_t g_orch_finalize_atomic_count = 0; -uint64_t g_orch_scope_end_atomic_count = 0; -#endif #define CYCLE_COUNT_START() uint64_t _t0 = get_sys_cnt_aicpu(), _t1 #define CYCLE_COUNT_LAP(acc) do { _t1 = get_sys_cnt_aicpu(); acc += (_t1 - _t0); _t0 = _t1; } while(0) -#define CYCLE_COUNT_LAP_RECORD(acc, phase_id, tid) do { \ - _t1 = get_sys_cnt_aicpu(); \ - acc += (_t1 - _t0); \ - _t0 = _t1; \ -} while(0) +#define CYCLE_COUNT_LAP_RECORD(acc, phase_id, tid) \ + do { \ + _t1 = get_sys_cnt_aicpu(); \ + acc += (_t1 - _t0); \ + perf_aicpu_record_orch_phase((phase_id), _t0, _t1, g_orch_submit_idx, (tid)); \ + _t0 = _t1; \ + } while (0) +#elif PTO2_PROFILING +#include "aicpu/device_time.h" +#include "aicpu/performance_collector_aicpu.h" +__attribute__((weak, visibility("hidden"))) uint64_t get_sys_cnt_aicpu() { return 0; } +__attribute__((weak, visibility("hidden"))) void perf_aicpu_record_orch_phase( + AicpuPhaseId, uint64_t, uint64_t, uint32_t, uint32_t) {} +// submit_idx needed for swimlane task_id tagging (no cycle accumulation at this level) +static uint32_t g_orch_submit_idx = 0; +#define CYCLE_COUNT_START() \ + bool _prof_active = orch->enable_profiling; \ + uint64_t _t0 = _prof_active ? 
get_sys_cnt_aicpu() : 0, _t1 = 0 +#define CYCLE_COUNT_LAP(acc) do { } while(0) +#define CYCLE_COUNT_LAP_RECORD(acc, phase_id, tid) \ + do { \ + if (_prof_active) { \ + _t1 = get_sys_cnt_aicpu(); \ + perf_aicpu_record_orch_phase((phase_id), _t0, _t1, g_orch_submit_idx, (tid)); \ + _t0 = _t1; \ + } \ + } while (0) #else #define CYCLE_COUNT_START() #define CYCLE_COUNT_LAP(acc) @@ -96,46 +110,63 @@ bool pto2_orchestrator_init( orch->sm_handle = sm_handle; orch->gm_heap_base = gm_heap; - orch->gm_heap_size = heap_size; - - // Initialize heap ring buffer - pto2_heap_ring_init(&orch->heap_ring, gm_heap, heap_size, - &sm_handle->header->heap_tail, - &sm_handle->header->heap_top); - - // Initialize task ring buffer - pto2_task_ring_init(&orch->task_ring, - sm_handle->task_descriptors, - sm_handle->header->task_window_size, - &sm_handle->header->last_task_alive, - &sm_handle->header->current_task_index); - - // Allocate and initialize dependency list pool (per-orchestrator, no shared memory) - PTO2DepListEntry* dep_entries = (PTO2DepListEntry*)calloc(dep_pool_capacity, sizeof(PTO2DepListEntry)); - if (!dep_entries) { - return false; + orch->gm_heap_size = heap_size * PTO2_MAX_RING_DEPTH; + orch->fatal = false; + + // Initialize per-ring resources + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + // Each ring gets its own heap region + void* ring_heap_base = (char*)gm_heap + r * heap_size; + auto &fc = sm_handle->header->rings[r].fc; + + // Initialize heap ring buffer + pto2_heap_ring_init(&orch->rings[r].heap_ring, ring_heap_base, heap_size, &fc.heap_tail, &fc.heap_top); + orch->rings[r].heap_ring.error_code_ptr = &sm_handle->header->orch_error_code; + + // Initialize task ring buffer + pto2_task_ring_init(&orch->rings[r].task_ring, + sm_handle->task_descriptors[r], + sm_handle->header->rings[r].task_window_size, + &fc.last_task_alive, + &fc.current_task_index); + orch->rings[r].task_ring.error_code_ptr = &sm_handle->header->orch_error_code; + + // Allocate and initialize 
dependency list pool (per-ring) + PTO2DepListEntry* dep_entries = (PTO2DepListEntry*)calloc(dep_pool_capacity, sizeof(PTO2DepListEntry)); + if (!dep_entries) { + // Cleanup previously allocated rings + for (int j = 0; j < r; j++) { + free(orch->rings[j].dep_pool.base); + } + return false; + } + orch->rings[r].dep_pool.init(dep_entries, dep_pool_capacity, &sm_handle->header->orch_error_code); } - pto2_dep_pool_init(&orch->dep_pool, dep_entries, dep_pool_capacity); - orch->dep_pool_cur_entry = nullptr; - orch->dep_pool_last_reclaimed = 0; - // Initialize TensorMap - if (!orch->tensor_map.init_default(sm_handle->header->task_window_size)) { - free(dep_entries); + // Initialize TensorMap with per-ring task window sizes + int32_t task_window_sizes[PTO2_MAX_RING_DEPTH]; + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + task_window_sizes[r] = sm_handle->header->rings[r].task_window_size; + } + if (!orch->tensor_map.init_default(task_window_sizes)) { + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + free(orch->rings[r].dep_pool.base); + } return false; } orch->tensor_map.orch = orch; - orch->tensormap_last_cleanup = 0; // Initialize scope stack: one flat buffer for task IDs + one array for begin offsets uint64_t max_depth = PTO2_MAX_SCOPE_DEPTH; int32_t init_cap = PTO2_SCOPE_TASKS_INIT_CAP; - orch->scope_tasks = (int32_t*)malloc(init_cap * sizeof(int32_t)); + orch->scope_tasks = (PTO2TaskSlotState**)malloc(init_cap * sizeof(PTO2TaskSlotState*)); orch->scope_begins = (int32_t*)malloc(max_depth * sizeof(int32_t)); if (!orch->scope_tasks || !orch->scope_begins) { free(orch->scope_tasks); free(orch->scope_begins); - free(dep_entries); + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + free(orch->rings[r].dep_pool.base); + } orch->tensor_map.destroy(); return false; } @@ -150,8 +181,10 @@ bool pto2_orchestrator_init( void pto2_orchestrator_destroy(PTO2OrchestratorState* orch) { orch->tensor_map.destroy(); - free(orch->dep_pool.base); - orch->dep_pool.base = NULL; + for 
(int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + free(orch->rings[r].dep_pool.base); + orch->rings[r].dep_pool.base = NULL; + } free(orch->scope_tasks); orch->scope_tasks = NULL; @@ -161,97 +194,25 @@ void pto2_orchestrator_destroy(PTO2OrchestratorState* orch) { void pto2_orchestrator_set_scheduler(PTO2OrchestratorState* orch, PTO2SchedulerState* scheduler) { orch->scheduler = scheduler; - orch->init_task_on_submit = true; // Default: initialize task on submit -} - -void pto2_orchestrator_set_scheduler_mode( - PTO2OrchestratorState* orch, PTO2SchedulerState* scheduler, bool init_on_submit) { - orch->scheduler = scheduler; - orch->init_task_on_submit = init_on_submit; -} - -// ============================================================================= -// Dep Pool Reclamation -// ============================================================================= - -/** - * Reclaim dead dep pool entries based on scheduler's last_task_alive. - * Safe to call multiple times — only advances tail forward. - */ -static void pto2_dep_pool_reclaim(PTO2OrchestratorState* orch) { - int32_t last_alive = orch->sm_handle->header->last_task_alive.load(std::memory_order_acquire); - if (last_alive > orch->dep_pool_last_reclaimed && last_alive > 0) { - int32_t newest_consumed = last_alive - 1; - int32_t slot_rc = orch->task_ring.get_task_slot(newest_consumed); - int32_t mark = orch->sm_handle->task_payloads[slot_rc].dep_pool_mark; - if (mark > 0) { - orch->dep_pool.advance_tail(mark); - } - orch->dep_pool_last_reclaimed = last_alive; - } -} - -/** - * Ensure dep pool has at least `needed` entries available. - * Spin-waits for reclamation if under pressure. Detects deadlock if no progress. 
- */ -static void pto2_dep_pool_ensure_space(PTO2OrchestratorState* orch, int32_t needed) { - if (pto2_dep_pool_available(&orch->dep_pool) >= needed) return; - - int spin_count = 0; - while (pto2_dep_pool_available(&orch->dep_pool) < needed) { - pto2_dep_pool_reclaim(orch); - if (pto2_dep_pool_available(&orch->dep_pool) >= needed) return; - - spin_count++; - if (spin_count >= PTO2_DEP_POOL_SPIN_LIMIT) { - auto& pool = orch->dep_pool; - int32_t used = pool.top - pool.tail; - int32_t last_alive = orch->sm_handle->header->last_task_alive.load(std::memory_order_acquire); - int32_t current = orch->task_ring.current_index_ptr->load(std::memory_order_acquire); - LOG_ERROR("========================================"); - LOG_ERROR("FATAL: Dependency Pool Deadlock Detected!"); - LOG_ERROR("========================================"); - LOG_ERROR("DepListPool cannot reclaim space after %d spins.", spin_count); - LOG_ERROR(" - Pool used: %d / %d (%.1f%%)", used, pool.capacity, - (pool.capacity > 0) ? 
(100.0 * used / pool.capacity) : 0.0); - LOG_ERROR(" - Pool top: %d (linear)", pool.top); - LOG_ERROR(" - Pool tail: %d (linear)", pool.tail); - LOG_ERROR(" - High water: %d", pool.high_water); - LOG_ERROR(" - Needed: %d entries", needed); - LOG_ERROR(" - last_task_alive: %d", last_alive); - LOG_ERROR(" - current_task: %d", current); - LOG_ERROR(" - In-flight tasks: %d", current - last_alive); - LOG_ERROR("Root Cause:"); - LOG_ERROR(" Too many concurrent tasks consuming dep pool entries"); - LOG_ERROR(" relative to the pool capacity (%d).", pool.capacity); - LOG_ERROR("Solution:"); - LOG_ERROR(" Increase dep pool capacity (current: %d, recommended: %d)", pool.capacity, pool.high_water * 2); - LOG_ERROR(" Compile-time: PTO2_DEP_LIST_POOL_SIZE in pto_runtime2_types.h"); - LOG_ERROR(" Runtime env: PTO2_RING_DEP_POOL=%d", pool.high_water * 2); - LOG_ERROR("========================================"); - exit(1); - } - SPIN_WAIT_HINT(); - } } // ============================================================================= // Scope Management // ============================================================================= -static void scope_tasks_push(PTO2OrchestratorState* orch, int32_t task_id) { +static void scope_tasks_push(PTO2OrchestratorState* orch, PTO2TaskSlotState *task_slot_state) { if (orch->scope_tasks_size >= orch->scope_tasks_capacity) { int32_t new_cap = orch->scope_tasks_capacity * 2; - int32_t* new_buf = (int32_t*)realloc(orch->scope_tasks, new_cap * sizeof(int32_t)); + PTO2TaskSlotState** new_buf = (PTO2TaskSlotState**)realloc(orch->scope_tasks, new_cap * sizeof(PTO2TaskSlotState*)); assert(new_buf && "Failed to grow scope task buffer"); orch->scope_tasks = new_buf; orch->scope_tasks_capacity = new_cap; } - orch->scope_tasks[orch->scope_tasks_size++] = task_id; + orch->scope_tasks[orch->scope_tasks_size++] = task_slot_state; } void pto2_scope_begin(PTO2OrchestratorState* orch) { + if (orch->fatal) { return; } assert(orch->scope_stack_top < 
(int32_t)(orch->scope_stack_capacity - 1) && "Scope stack overflow"); ++orch->scope_stack_top; @@ -259,9 +220,10 @@ void pto2_scope_begin(PTO2OrchestratorState* orch) { } void pto2_scope_end(PTO2OrchestratorState* orch) { + if (orch->fatal) { return; } assert(orch->scope_stack_top >= 0 && "Scope stack underflow"); -#if PTO2_PROFILING +#if PTO2_ORCH_PROFILING uint64_t _se0 = get_sys_cnt_aicpu(); #endif @@ -275,7 +237,7 @@ void pto2_scope_end(PTO2OrchestratorState* orch) { // Rewind the task buffer — these entries are no longer needed orch->scope_tasks_size = begin; -#if PTO2_PROFILING +#if PTO2_ORCH_PROFILING uint64_t _se1 = get_sys_cnt_aicpu(); g_orch_scope_end_cycle += (_se1 - _se0); // perf_aicpu_record_orch_phase(AicpuPhaseId::ORCH_SCOPE_END, _se0, _se1, g_orch_submit_idx, -1); @@ -285,108 +247,207 @@ void pto2_scope_end(PTO2OrchestratorState* orch) { // ============================================================================= // Task Submission // ============================================================================= -void pto2_submit_task( - PTO2OrchestratorState* orch, int32_t kernel_id, PTO2WorkerType worker_type, PTOParam* params, int32_t num_params) { +void pto2_submit_mixed_task( + PTO2OrchestratorState* orch, const MixedKernels& mixed_kernels, const PTOParam& params) { CYCLE_COUNT_START(); - // === STEP 0: Sync TensorMap validity and optional cleanup === - orch->tensor_map.sync_tensormap(); + // Fast path after fatal error — all subsequent submits are no-ops + if (orch->fatal) { + return; + } - // Reclaim dead dep pool entries based on scheduler's last_task_alive - pto2_dep_pool_reclaim(orch); + // Validate PTOParam construction (errors recorded by add_input/add_output/etc.) + if (params.has_error) { + LOG_ERROR("========================================"); + LOG_ERROR("FATAL: Invalid PTOParam Detected!"); + LOG_ERROR("========================================"); + LOG_ERROR("Error: %s", params.error_msg ? 
params.error_msg : "(unknown)"); + LOG_ERROR(" tensor_count: %d, scalar_count: %d", params.tensor_count, params.scalar_count); + LOG_ERROR("This is a bug in the orchestration code."); + LOG_ERROR("========================================"); + orch->sm_handle->header->orch_error_code.store( + PTO2_ERROR_INVALID_PARAM, std::memory_order_release); + orch->fatal = true; + return; + } - CYCLE_COUNT_LAP_RECORD(g_orch_sync_cycle, AicpuPhaseId::ORCH_SYNC, -1); + + // Determine which ring this task belongs to + uint8_t ring_id = orch->current_ring_id(); + auto& task_ring = orch->rings[ring_id].task_ring; + PTO2SchedulerState* sched = orch->scheduler; + PTO2RingFlowControl &fc = orch->sm_handle->header->rings[ring_id].fc; + + // === Validate submit inputs === + uint8_t active_mask = pto2_mixed_kernels_to_active_mask(mixed_kernels); + always_assert(active_mask != 0 && "MixedKernels must have at least one active slot"); + + // Normalize single-AIV tasks: if only aiv1 is set, move it to the aiv0 slot. + // This guarantees the dispatch path can always use PTO2SubtaskSlot::AIV0 for + // AIV_X1 and AIC_AIV_X1 shapes without inspecting active_mask. + MixedKernels normalized = mixed_kernels; + bool has_aiv0 = (active_mask & PTO2_SUBTASK_MASK_AIV0) != 0; + bool has_aiv1 = (active_mask & PTO2_SUBTASK_MASK_AIV1) != 0; + if (has_aiv1 && !has_aiv0) { + normalized.aiv0_kernel_id = normalized.aiv1_kernel_id; + normalized.aiv1_kernel_id = INVALID_KERNEL_ID; + active_mask = pto2_mixed_kernels_to_active_mask(normalized); + } // Submission without an open scope is illegal always_assert(orch->scope_stack_top >= 0 && "Cannot submit task outside a scope"); + // === Scope deadlock pre-check === + // Tasks within a scope hold a fanout_count reference released only at scope_end. + // If scope task count >= window_size, no slots can ever be reclaimed → deadlock. 
+ { + int32_t scope_task_count = orch->scope_tasks_size - orch->scope_begins[orch->scope_stack_top]; + if (scope_task_count >= task_ring.window_size - 1) { + int32_t total_submitted = task_ring.current_index_ptr->load(std::memory_order_acquire); + int32_t last_alive = task_ring.last_alive_ptr->load(std::memory_order_acquire); + int32_t active_count = total_submitted - last_alive; + + LOG_ERROR("========================================"); + LOG_ERROR("FATAL: Scope Deadlock Detected! (ring %d)", ring_id); + LOG_ERROR("========================================"); + LOG_ERROR("Tasks in current scope (%d) >= task_window_size (%d).", + scope_task_count, task_ring.window_size); + LOG_ERROR(" scope_depth: %d", orch->scope_stack_top + 1); + LOG_ERROR(" ring_id: %d", ring_id); + LOG_ERROR(" scope_task_count: %d", scope_task_count); + LOG_ERROR(" total_submitted: %d", total_submitted); + LOG_ERROR(" last_task_alive: %d", last_alive); + LOG_ERROR(" active_tasks: %d / %d", active_count, task_ring.window_size); + LOG_ERROR("Root Cause:"); + LOG_ERROR(" Tasks within a scope hold a fanout_count reference that is only"); + LOG_ERROR(" released at scope_end. When scope task count >= window_size,"); + LOG_ERROR(" no slots can be reclaimed -> deadlock."); + LOG_ERROR("Solution:"); + LOG_ERROR(" 1. Reduce tasks per scope (use batching/unroll)"); + LOG_ERROR(" 2. Increase task window (current: %d)", task_ring.window_size); + LOG_ERROR(" Compile-time: PTO2_TASK_WINDOW_SIZE in pto_runtime2_types.h"); + LOG_ERROR(" Runtime env: PTO2_RING_TASK_WINDOW="); + LOG_ERROR(" 3. 
Split work across multiple scopes"); + LOG_ERROR("========================================"); + orch->sm_handle->header->orch_error_code.store( + PTO2_ERROR_SCOPE_DEADLOCK, std::memory_order_release); + orch->fatal = true; + return; + } + } + // === STEP 1: Allocate task slot from Task Ring (blocks until available) === - auto& task_ring = orch->task_ring; - int32_t task_id = task_ring.pto2_task_ring_alloc(); - int32_t slot = task_ring.get_task_slot(task_id); + int32_t local_id = task_ring.pto2_task_ring_alloc(); + if (local_id < 0) { orch->fatal = true; return; } + int32_t slot = task_ring.get_task_slot(local_id); + PTO2TaskId mixed_task_id = pto2_make_task_id(ring_id, static_cast<uint32_t>(local_id)); PTO2TaskDescriptor& task = task_ring.get_task_by_slot(slot); - PTO2TaskPayload* payload = &orch->sm_handle->task_payloads[slot]; - - // Initialize task descriptor - task.task_id = task_id; - task.kernel_id = kernel_id; - task.worker_type = worker_type; - task.fanin_count = 0; - task.fanout_head = nullptr; - task.fanout_lock.store(0, std::memory_order_relaxed); - // Initial fanout_count = 1 (the owning scope holds one reference) - task.fanout_count = 1; - task.packed_buffer_base = NULL; - task.packed_buffer_end = NULL; + PTO2TaskPayload* payload = &orch->sm_handle->task_payloads[ring_id][slot]; + + // Early write-prefetch payload GM cache lines to issue RFO in background. + // ~130 lines of computation (output_size, lookup, insert) follow before + // param_copy writes, giving ample time for prefetch to complete. + // Use locality=3 (PSTL1KEEP) so prefetched CLs survive lookup/insert eviction.
+ for (int32_t i = 0; i < params.tensor_count; i++) { + __builtin_prefetch(&payload->tensors[i], 1, 3); + __builtin_prefetch(reinterpret_cast(&payload->tensors[i]) + 64, 1, 3); + } + for (int32_t i = 0; i < params.scalar_count; i += 8) { + __builtin_prefetch(&payload->scalars[i], 1, 3); + } + __builtin_prefetch(payload, 1, 3); + __builtin_prefetch(reinterpret_cast(payload) + 64, 1, 3); + __builtin_prefetch(reinterpret_cast(payload) + 128, 1, 3); + + // Initialize slot state (scheduler-private) + if (sched) { + auto& rs = sched->ring_sched_states[ring_id]; + PTO2TaskSlotState& slot_state = rs.get_slot_state_by_slot(slot); + slot_state.fanin_count = 0; + slot_state.fanout_head = nullptr; + slot_state.fanout_lock.store(0, std::memory_order_relaxed); + // Initial fanout_count = 1 (the owning scope holds one reference) + slot_state.fanout_count = 1; + slot_state.fanout_refcount.store(0, std::memory_order_release); + slot_state.fanin_refcount.store(0, std::memory_order_release); + slot_state.payload = payload; + slot_state.task = &task; + slot_state.active_mask = active_mask; + slot_state.subtask_done_mask.store(0, std::memory_order_relaxed); + slot_state.ring_id = ring_id; + scope_tasks_push(orch, &slot_state); + } else { + scope_tasks_push(orch, nullptr); + } // Register this task in its owning scope - scope_tasks_push(orch, task_id); - - CYCLE_COUNT_LAP_RECORD(g_orch_alloc_cycle, AicpuPhaseId::ORCH_ALLOC, task_id); - // Temporary storage for fanin - int32_t fanin_temp[PTO2_MAX_INPUTS]; + // Temporary storage for fanin (cached slot state pointers, avoids repeated ring/slot lookups) + PTO2TaskSlotState* fanin_states[PTO2_MAX_INPUTS]; int32_t fanin_count = 0; - payload->param_count = num_params; - for (int i = 0; i < num_params; i++) { - payload->is_tensor[i] = params[i].type != PTOParamType::SCALAR; - if (payload->is_tensor[i]) { - payload->tensors[i].copy(*params[i].tensor); - } else { - payload->scalar_value[i] = params[i].scalar_value; - } - } - - 
CYCLE_COUNT_LAP_RECORD(g_orch_params_cycle, AicpuPhaseId::ORCH_PARAMS, task_id); -#if PTO2_ORCH_PROFILING - g_orch_params_atomic_count += 2; // fanout_lock.store + fanout_count.store -#endif + CYCLE_COUNT_LAP_RECORD(g_orch_alloc_cycle, AicpuPhaseId::ORCH_ALLOC, local_id); - // Temporary storage for collecting output sizes + // === STEP 2: Calculate output size + heap alloc (read from params only, no GM access) === int32_t total_output_size = 0; - for (int i = 0; i < num_params; i++) { - if (params[i].type != PTOParamType::OUTPUT) { - continue; - } - // Only allocate from ring buffer when caller did not provide an address - if (payload->tensors[i].buffer.addr == 0) { - total_output_size += PTO2_ALIGN_UP(payload->tensors[i].buffer.size, PTO2_PACKED_OUTPUT_ALIGN); + for (int i = 0; i < params.tensor_count; i++) { + if (params.tensor_types[i] == PTOParamType::OUTPUT + && params.tensors[i]->buffer.addr == 0) { + total_output_size += PTO2_ALIGN_UP(params.tensors[i]->buffer.size, PTO2_PACKED_OUTPUT_ALIGN); } } + void* local_packed_base = nullptr; + void* local_packed_end = nullptr; if (total_output_size > 0) { - task.packed_buffer_base = orch->pto2_alloc_packed_buffer(total_output_size); - task.packed_buffer_end = (char*)task.packed_buffer_base + total_output_size; + local_packed_base = orch->pto2_alloc_packed_buffer(total_output_size); + if (!local_packed_base) { orch->fatal = true; return; } + local_packed_end = (char*)local_packed_base + total_output_size; } - CYCLE_COUNT_LAP_RECORD(g_orch_heap_cycle, AicpuPhaseId::ORCH_HEAP, task_id); + CYCLE_COUNT_LAP_RECORD(g_orch_heap_cycle, AicpuPhaseId::ORCH_HEAP, local_id); #if PTO2_ORCH_PROFILING if (total_output_size > 0) { g_orch_heap_atomic_count += 1; // heap_top.store in pto2_alloc_packed_buffer } #endif - // === STEP 2: First pass - set output addr and process tensor === + // === STEP 3: Sync TensorMap validity and optional cleanup === + // Read current last_task_alive from shared memory for this ring + int32_t 
sm_last_task_alive = fc.last_task_alive.load(std::memory_order_acquire); + + orch->tensor_map.sync_tensormap(ring_id, sm_last_task_alive); + + if (sched) { + orch->rings[ring_id].dep_pool.reclaim(*sched, ring_id, sm_last_task_alive); + } + + CYCLE_COUNT_LAP_RECORD(g_orch_sync_cycle, AicpuPhaseId::ORCH_SYNC, local_id); + + // === STEP 4: Lookup inputs + assign output addrs (all from params, no GM) === int32_t offset = 0; - for (int i = 0; i < num_params; i++) { - PTOParamType ptype = params[i].type; + for (int i = 0; i < params.tensor_count; i++) { + PTOParamType ptype = params.tensor_types[i]; switch (ptype) { case PTOParamType::INOUT: case PTOParamType::INPUT: { - // Look up producer via TensorMap + if (params.tensors[i]->manual_dep) break; + // Look up producer via TensorMap (reads from cached stack tensor) PTO2LookupResult lookup_result; - orch->tensor_map.lookup(payload->tensors[i], lookup_result); + orch->tensor_map.lookup(*params.tensors[i], lookup_result); for (int r = 0; r < lookup_result.count; r++) { PTO2TensorMapEntry& entry = *lookup_result.entries[r].entry; auto overlap_status = lookup_result.entries[r].overlap_status; // Check if this producer is already in fanin list (avoid duplicates) - int producer_task_id = entry.producer_task_id; + auto prod_ring = entry.producer_task_id.ring(); + auto prod_local = entry.producer_task_id.local(); + PTO2TaskSlotState* prod_state = + &sched->ring_sched_states[prod_ring].get_slot_state_by_task_id(prod_local); bool already_added = false; for (int j = 0; j < fanin_count; j++) { - if (fanin_temp[j] == producer_task_id) { + if (fanin_states[j] == prod_state) { already_added = true; break; } @@ -395,7 +456,7 @@ void pto2_submit_task( if (!already_added) { // Add to fanin list (this task depends on producer) if (fanin_count < PTO2_MAX_INPUTS) { - fanin_temp[fanin_count++] = producer_task_id; + fanin_states[fanin_count++] = prod_state; } } if (ptype == PTOParamType::INOUT && overlap_status == OverlapStatus::COVERED) { @@ 
-412,95 +473,110 @@ void pto2_submit_task( } case PTOParamType::OUTPUT: { - auto& tensor = payload->tensors[i]; + Tensor& tensor = *params.tensors[i]; if (tensor.buffer.addr == 0) { - uint64_t alloc_addr = reinterpret_cast((char*)task.packed_buffer_base + offset); + uint64_t alloc_addr = reinterpret_cast((char*)local_packed_base + offset); tensor.buffer.addr = alloc_addr; - // Write back allocated address to caller's original Tensor - params[i].tensor->buffer.addr = alloc_addr; offset += PTO2_ALIGN_UP(tensor.buffer.size, PTO2_PACKED_OUTPUT_ALIGN); } break; } - default: - break; } } - CYCLE_COUNT_LAP_RECORD(g_orch_lookup_cycle, AicpuPhaseId::ORCH_LOOKUP, task_id); - + CYCLE_COUNT_LAP_RECORD(g_orch_lookup_cycle, AicpuPhaseId::ORCH_LOOKUP, local_id); - // === STEP 4: Second pass - register outputs in TensorMap === - for (int i = 0; i < num_params; i++) { - PTOParamType ptype = params[i].type; + // === STEP 5: Register outputs/inouts in TensorMap (must be separate from lookup) === + for (int i = 0; i < params.tensor_count; i++) { + PTOParamType ptype = params.tensor_types[i]; if (ptype == PTOParamType::OUTPUT || ptype == PTOParamType::INOUT) { - // Register in TensorMap: this tensor is produced by task_id - orch->tensor_map.insert(payload->tensors[i], task_id, ptype == PTOParamType::OUTPUT); + if (!params.tensors[i]->manual_dep) { + orch->tensor_map.insert(*params.tensors[i], mixed_task_id, ptype == PTOParamType::OUTPUT); + } } } - CYCLE_COUNT_LAP_RECORD(g_orch_insert_cycle, AicpuPhaseId::ORCH_INSERT, task_id); + CYCLE_COUNT_LAP_RECORD(g_orch_insert_cycle, AicpuPhaseId::ORCH_INSERT, local_id); + + // === STEP 6: Batch-write to GM (single cache line burst) === + // Deferred from allocation phase to avoid scattered GM writes that get + // evicted by TensorMap lookup/insert cache pressure. 
+ __builtin_prefetch(&task, 1, 1); + task.mixed_task_id = mixed_task_id; + task.kernel_id[static_cast(PTO2SubtaskSlot::AIC)] = normalized.aic_kernel_id; + task.kernel_id[static_cast(PTO2SubtaskSlot::AIV0)] = normalized.aiv0_kernel_id; + task.kernel_id[static_cast(PTO2SubtaskSlot::AIV1)] = normalized.aiv1_kernel_id; + task.packed_buffer_base = local_packed_base; + task.packed_buffer_end = local_packed_end; + + // Prefetch producer slot_states and cur_slot_state (written at init but likely + // evicted by lookup/insert/heap). param_copy below provides hide time. + if (sched) { + auto& rs = sched->ring_sched_states[ring_id]; + __builtin_prefetch(&rs.get_slot_state_by_slot(slot), 1, 0); + for (int i = 0; i < fanin_count; i++) { + __builtin_prefetch(fanin_states[i], 1, 0); + } + } - // === STEP 5: Finalize fanin list === - // First build the fanin list - if (orch->scheduler) { - PTO2SchedulerState* sched = orch->scheduler; + payload->init(params); + CYCLE_COUNT_LAP_RECORD(g_orch_params_cycle, AicpuPhaseId::ORCH_PARAMS, local_id); +#if PTO2_ORCH_PROFILING + g_orch_params_atomic_count += 2; // fanout_lock.store + fanout_count.store +#endif + + // === STEP 7: Finalize fanin list === + // First build the fanin list + if (sched) { + auto& rs = sched->ring_sched_states[ring_id]; + PTO2TaskSlotState& cur_slot_state = rs.get_slot_state_by_slot(slot); // Initialize scheduler state BEFORE adding to producer fanout lists, - // so concurrent on_task_complete can safely access task_state/fanout_refcount. - sched->task_state[slot].store(PTO2_TASK_PENDING, std::memory_order_relaxed); - sched->fanout_refcount[slot].store(0, std::memory_order_relaxed); + // so concurrent on_mixed_task_complete can safely access task_state/fanout_refcount. 
+ cur_slot_state.task_state.store(PTO2_TASK_PENDING, std::memory_order_relaxed); + cur_slot_state.fanout_refcount.store(0, std::memory_order_relaxed); + auto& dep_pool = orch->rings[ring_id].dep_pool; // Ensure dep pool has space: fanin_count entries + 1 pre-alloc - pto2_dep_pool_ensure_space(orch, fanin_count + 1); - - auto& dep_pool = orch->dep_pool; - if (orch->dep_pool_cur_entry == nullptr) { - orch->dep_pool_cur_entry = &dep_pool.alloc(); - } + dep_pool.ensure_space(*sched, fc, ring_id, fanin_count + 1); int32_t early_finished = 0; - task.fanin_count = fanin_count + 1; // +1 redundance for not being ready too early + cur_slot_state.fanin_count = fanin_count + 1; // +1 redundance for not being ready too early payload->fanin_actual_count = fanin_count; for (int i = 0; i < fanin_count; i++) { - payload->fanin_tasks[i] = fanin_temp[i]; + payload->fanin_slot_states[i] = fanin_states[i]; } for (int i = 0; i < fanin_count; i++) { - int32_t producer_task_id = fanin_temp[i]; - // Add this task to producer's fanout list (with spinlock) - int32_t prod_slot = task_ring.get_task_slot(producer_task_id); - PTO2TaskDescriptor& producer = task_ring.get_task_by_slot(prod_slot); - orch->dep_pool_cur_entry->task_id = task_id; - orch->dep_pool_cur_entry->next = producer.fanout_head; + PTO2TaskSlotState& producer_slot_state = *fanin_states[i]; #if PTO2_ORCH_PROFILING || PTO2_SCHED_PROFILING - pto2_fanout_lock(producer, g_orch_fanin_atomic_count, g_orch_fanin_wait_cycle); + pto2_fanout_lock(producer_slot_state, g_orch_fanin_atomic_count, g_orch_fanin_wait_cycle); #else - pto2_fanout_lock(producer); + pto2_fanout_lock(producer_slot_state); #endif // Normal path: prepend consumer to producer's fanout list - producer.fanout_count += 1; - int32_t prod_state = sched->task_state[prod_slot].load(std::memory_order_acquire); + producer_slot_state.fanout_count += 1; + int32_t prod_state = producer_slot_state.task_state.load(std::memory_order_acquire); if (prod_state >= PTO2_TASK_COMPLETED) { 
// Early return optimization: if producer already completed, we can skip adding dependency and directly // decrement fanin_count early_finished++; } else { - producer.fanout_head = orch->dep_pool_cur_entry; - } - pto2_fanout_unlock(producer); - if (producer.fanout_head == orch->dep_pool_cur_entry) { - orch->dep_pool_cur_entry = &dep_pool.alloc(); + producer_slot_state.fanout_head = dep_pool.prepend(producer_slot_state.fanout_head, &cur_slot_state); } + pto2_fanout_unlock(producer_slot_state); } - // Combined release: merge early_finished batch + init_task's +1 release + // Combined release: merge early_finished batch with the +1 init release // into a single atomic fetch_add (saves one acq_rel cache-line bounce per task). int32_t initial_refcount = early_finished + 1; // +1 for the init release - int32_t new_rc = sched->fanin_refcount[slot].fetch_add(initial_refcount, std::memory_order_acq_rel) + int32_t new_rc = cur_slot_state.fanin_refcount.fetch_add(initial_refcount, std::memory_order_acq_rel) + initial_refcount; if (new_rc >= fanin_count + 1) { - sched->ready_queues[task.worker_type].push(task_id); + PTO2ResourceShape shape = pto2_active_mask_to_shape(active_mask); + sched->ready_queues[static_cast(shape)].push(&cur_slot_state); } -#if PTO2_ORCH_PROFILING || PTO2_SCHED_PROFILING + // Record dep pool watermark in local slot state (used by tail reclamation) + cur_slot_state.dep_pool_mark = orch->rings[ring_id].dep_pool.top; +#if PTO2_ORCH_PROFILING // Per producer: fetch_add(fanout_count) + load(task_state) + store(unlock) = 3 atomics // Lock atomics (loads + CAS) are counted inside pto2_fanout_lock g_orch_fanin_atomic_count += fanin_count * 3; @@ -510,14 +586,13 @@ void pto2_submit_task( #endif } - // Record dep pool watermark for this task (used by tail reclamation) - payload->dep_pool_mark = orch->dep_pool.top; - - CYCLE_COUNT_LAP_RECORD(g_orch_fanin_cycle, AicpuPhaseId::ORCH_FANIN, task_id); + CYCLE_COUNT_LAP_RECORD(g_orch_fanin_cycle, 
AicpuPhaseId::ORCH_FANIN, local_id); #if PTO2_PROFILING orch->tasks_submitted++; +#if PTO2_ORCH_PROFILING g_orch_submit_count++; +#endif g_orch_submit_idx++; #endif } @@ -527,13 +602,21 @@ void pto2_submit_task( // ============================================================================= void pto2_orchestrator_done(PTO2OrchestratorState* orch) { - int32_t total_tasks = orch->task_ring.current_index_ptr->load(std::memory_order_acquire); - LOG_INFO("=== [Orchestrator] total_tasks=%d ===", total_tasks); - LOG_INFO("=== [DepPool] top=%d tail=%d used=%d high_water=%d capacity=%d ===", - orch->dep_pool.top, orch->dep_pool.tail, - orch->dep_pool.top - orch->dep_pool.tail, - orch->dep_pool.high_water, orch->dep_pool.capacity); + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + int32_t total_tasks = orch->rings[r].task_ring.current_index_ptr->load(std::memory_order_acquire); + if (total_tasks > 0) { + LOG_INFO("=== [Orchestrator] ring %d: total_tasks=%d ===", r, total_tasks); + } + auto& pool = orch->rings[r].dep_pool; + if (pool.top > 0) { + LOG_INFO("=== [DepPool %d] top=%d tail=%d used=%d high_water=%d capacity=%d ===", + r, pool.top, pool.tail, pool.top - pool.tail, pool.high_water, pool.capacity); + } + } orch->sm_handle->header->orchestrator_done.store(1, std::memory_order_release); +#if !PTO2_ORCH_PROFILING && PTO2_PROFILING + g_orch_submit_idx = 0; +#endif } // ============================================================================= @@ -548,9 +631,18 @@ void pto2_orchestrator_print_stats(PTO2OrchestratorState* orch) { LOG_INFO("Bytes allocated: %lld", (long long)orch->bytes_allocated); #endif LOG_INFO("Current scope depth: %d", orch->scope_stack_top + 1); - LOG_INFO("Task ring active: %d", pto2_task_ring_active_count(&orch->task_ring)); - LOG_INFO("Heap ring used: %" PRIu64 " / %" PRIu64, orch->heap_ring.top_ptr->load(std::memory_order_relaxed), orch->heap_ring.size); - LOG_INFO("Dep pool used: %d / %d", pto2_dep_pool_used(&orch->dep_pool), 
orch->dep_pool.capacity); + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + int32_t active = pto2_task_ring_active_count(&orch->rings[r].task_ring); + if (active > 0) { + LOG_INFO("Ring %d task active: %d", r, active); + LOG_INFO("Ring %d heap used: %" PRIu64 " / %" PRIu64, r, + orch->rings[r].heap_ring.top_ptr->load(std::memory_order_relaxed), + orch->rings[r].heap_ring.size); + LOG_INFO("Ring %d dep pool: %d / %d", r, + orch->rings[r].dep_pool.used(), + orch->rings[r].dep_pool.capacity); + } + } LOG_INFO("TensorMap valid: %d", orch->tensor_map.valid_count()); LOG_INFO("==============================="); } diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h index e0cabd04..a2d4898d 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h @@ -20,6 +20,7 @@ #include "pto_ring_buffer.h" #include "pto_runtime2_types.h" +#include "pto_submit_types.h" #include "pto_scheduler.h" #include "pto_shared_memory.h" #include "pto_tensormap.h" @@ -38,22 +39,17 @@ struct PTO2OrchestratorState { // === SHARED MEMORY ACCESS === PTO2SharedMemoryHandle* sm_handle; - // === RING BUFFERS === - PTO2HeapRing heap_ring; // Output buffer allocation - PTO2TaskRing task_ring; // Task slot allocation - PTO2DepListPool dep_pool; // Dependency list storage (per-orchestrator, no atomics needed) - PTO2DepListEntry* dep_pool_cur_entry; - int32_t dep_pool_last_reclaimed; // last_task_alive value at last reclamation + // === PER-RING RESOURCES === + PTO2RingSet rings[PTO2_MAX_RING_DEPTH]; // === TENSOR MAP (Private) === PTO2TensorMap tensor_map; // Producer lookup - int32_t tensormap_last_cleanup; // Last cleanup threshold // === SCOPE STACK (Private) === // Single contiguous buffer of task IDs, partitioned by scope level. // scope_begins[i] is the index into scope_tasks where scope i starts. 
// Tasks for the top scope occupy [scope_begins[top], scope_tasks_size). - int32_t* scope_tasks; // Flat buffer of task IDs (all scopes concatenated) + PTO2TaskSlotState** scope_tasks; // Flat buffer of taskSlotState (all scopes concatenated) int32_t scope_tasks_size; // Number of task IDs currently in the buffer int32_t scope_tasks_capacity; // Allocated capacity of scope_tasks int32_t* scope_begins; // scope_begins[i] = start index of scope i in scope_tasks @@ -64,11 +60,19 @@ struct PTO2OrchestratorState { // Note: In simulated mode, orchestrator and scheduler share address space // In real mode, they communicate via shared memory only PTO2SchedulerState* scheduler; // For simulated mode only - bool init_task_on_submit; // If true, call scheduler_init_task on submit +#if PTO2_PROFILING + // Runtime profiling switch copied from Runtime::enable_profiling. + bool enable_profiling; +#endif // === GM HEAP (for output buffers) === void* gm_heap_base; // Base address of GM heap - uint64_t gm_heap_size; // Size of GM heap + uint64_t gm_heap_size; // Total size of GM heap (all rings) + + // === FATAL ERROR === + // Fatal error flag (single-thread access by orchestrator, no atomic needed) + // Cross-thread notification uses shared memory orch_error_code (atomic) + bool fatal; // === STATISTICS === #if PTO2_PROFILING @@ -78,22 +82,31 @@ struct PTO2OrchestratorState { #endif /** - * Allocate packed output buffer for a task + * Get current ring index from scope depth. + * Maps scope depth to ring_id: min(scope_depth, PTO2_MAX_RING_DEPTH - 1) + */ + uint8_t current_ring_id() const { + int32_t depth = scope_stack_top; + if (depth < 0) depth = 0; + return depth < PTO2_MAX_RING_DEPTH ? 
static_cast(depth) : PTO2_MAX_RING_DEPTH - 1; + } + + /** + * Allocate packed output buffer from current ring's heap */ void* pto2_alloc_packed_buffer(int32_t total_size) { if (total_size <= 0) { return NULL; } - void* buffer = heap_ring.pto2_heap_ring_alloc(total_size); + uint8_t rid = current_ring_id(); + void* buffer = rings[rid].heap_ring.pto2_heap_ring_alloc(total_size); #if PTO2_PROFILING buffers_allocated++; bytes_allocated += total_size; #endif - // heap_top is now updated atomically inside pto2_heap_ring_alloc via CAS - return buffer; } }; @@ -125,16 +138,6 @@ void pto2_orchestrator_destroy(PTO2OrchestratorState* orch); */ void pto2_orchestrator_set_scheduler(PTO2OrchestratorState* orch, PTO2SchedulerState* scheduler); -/** - * Set scheduler reference with mode control - * - * @param orch Orchestrator state - * @param scheduler Scheduler state - * @param init_on_submit If true, init task on submit (single-threaded mode) - * If false, scheduler thread polls for new tasks (multi-threaded) - */ -void pto2_orchestrator_set_scheduler_mode( - PTO2OrchestratorState* orch, PTO2SchedulerState* scheduler, bool init_on_submit); // ============================================================================= // Scope Management @@ -174,16 +177,12 @@ void pto2_scope_end(PTO2OrchestratorState* orch); * 6. 
Initializes task state in scheduler * * @param orch Orchestrator state - * @param kernel_id InCore function ID - * @param worker_type Target worker type (CUBE, VECTOR, AI_CPU, ACCELERATOR) - * @param params Array of task parameters - * @param num_params Number of parameters + * @param mixed_kernels Kernel IDs for AIC/AIV0/AIV1 slots + * @param params Aggregated tensor and scalar parameters */ -void pto2_submit_task(PTO2OrchestratorState* orch, - int32_t kernel_id, - PTO2WorkerType worker_type, - PTOParam* params, - int32_t num_params); +void pto2_submit_mixed_task(PTO2OrchestratorState* orch, + const MixedKernels& mixed_kernels, + const PTOParam& params); // ============================================================================= // Flow Control @@ -229,7 +228,6 @@ struct PTO2OrchProfilingData { uint64_t alloc_wait_cycle; // Cycles spent waiting in task_ring_alloc uint64_t heap_wait_cycle; // Cycles spent waiting in heap_ring_alloc uint64_t fanin_wait_cycle; // Cycles spent waiting in fanout_lock - uint64_t finalize_wait_cycle; // Cycles spent in ready queue push CAS retries // Atomic operation counts per phase uint64_t alloc_atomic_count; uint64_t params_atomic_count; diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.cpp index 65607e5f..daac7846 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.cpp @@ -12,6 +12,7 @@ #include #include // for exit() #include "common/unified_log.h" +#include "pto_scheduler.h" // ============================================================================= // Heap Ring Buffer Implementation @@ -42,23 +43,62 @@ void pto2_task_ring_init(PTO2TaskRing* ring, PTO2TaskDescriptor* descriptors, // ============================================================================= // Dependency List Pool Implementation // 
============================================================================= +void PTO2DepListPool::reclaim(PTO2SchedulerState& sched, uint8_t ring_id, int32_t sm_last_task_alive) { + if (sm_last_task_alive >= last_reclaimed + PTO2_DEP_POOL_CLEANUP_INTERVAL && sm_last_task_alive > 0) { + int32_t mark = sched.ring_sched_states[ring_id].get_slot_state_by_task_id(sm_last_task_alive - 1).dep_pool_mark; + if (mark > 0) { + advance_tail(mark); + } + last_reclaimed = sm_last_task_alive; + } +} -void pto2_dep_pool_init(PTO2DepListPool* pool, PTO2DepListEntry* base, int32_t capacity) { - pool->base = base; - pool->capacity = capacity; - pool->top = 1; // Start from 1, 0 means NULL/empty - pool->tail = 1; // Match initial top (no reclaimable entries yet) - pool->high_water = 0; +void PTO2DepListPool::ensure_space( + PTO2SchedulerState& sched, PTO2RingFlowControl& fc, uint8_t ring_id, int32_t needed) { + if (available() >= needed) return; - // Initialize entry 0 as NULL marker - pool->base[0].task_id = -1; - pool->base[0].next = nullptr; -} + int spin_count = 0; + int32_t prev_last_alive = fc.last_task_alive.load(std::memory_order_acquire); + while (available() < needed) { + reclaim(sched, ring_id, prev_last_alive); + if (available() >= needed) return; -int32_t pto2_dep_pool_used(PTO2DepListPool* pool) { - return pool->top - pool->tail; -} + spin_count++; -int32_t pto2_dep_pool_available(PTO2DepListPool* pool) { - return pool->capacity - (pool->top - pool->tail); -} + // Progress detection: reset spin counter if last_task_alive advances + int32_t cur_last_alive = fc.last_task_alive.load(std::memory_order_acquire); + if (cur_last_alive > prev_last_alive) { + spin_count = 0; + prev_last_alive = cur_last_alive; + } + + if (spin_count >= PTO2_DEP_POOL_SPIN_LIMIT) { + int32_t current = fc.current_task_index.load(std::memory_order_acquire); + LOG_ERROR("========================================"); + LOG_ERROR("FATAL: Dependency Pool Deadlock Detected! 
(ring %d)", ring_id); + LOG_ERROR("========================================"); + LOG_ERROR("DepListPool cannot reclaim space after %d spins (no progress).", spin_count); + LOG_ERROR(" - Pool used: %d / %d (%.1f%%)", + used(), + capacity, + (capacity > 0) ? (100.0 * used() / capacity) : 0.0); + LOG_ERROR(" - Pool top: %d (linear)", top); + LOG_ERROR(" - Pool tail: %d (linear)", tail); + LOG_ERROR(" - High water: %d", high_water); + LOG_ERROR(" - Needed: %d entries", needed); + LOG_ERROR(" - last_task_alive: %d (stuck here)", cur_last_alive); + LOG_ERROR(" - current_task: %d", current); + LOG_ERROR(" - In-flight tasks: %d", current - cur_last_alive); + LOG_ERROR("Diagnosis:"); + LOG_ERROR(" last_task_alive is not advancing, so dep pool tail"); + LOG_ERROR(" cannot reclaim. Check TaskRing diagnostics for root cause."); + LOG_ERROR("Solution:"); + LOG_ERROR(" Increase dep pool capacity (current: %d, recommended: %d)", capacity, high_water * 2); + LOG_ERROR(" Compile-time: PTO2_DEP_LIST_POOL_SIZE in pto_runtime2_types.h"); + LOG_ERROR(" Runtime env: PTO2_RING_DEP_POOL=%d", high_water * 2); + LOG_ERROR("========================================"); + exit(1); + } + SPIN_WAIT_HINT(); + } +} \ No newline at end of file diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h index 4c05514d..dc60228a 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h @@ -27,12 +27,13 @@ #define PTO_RING_BUFFER_H #include -#include // for exit() #include "pto_runtime2_types.h" #include "pto_shared_memory.h" #include "common/unified_log.h" +struct PTO2SchedulerState; // Forward declaration for dep_pool reclaim + // Set to 1 to enable periodic BLOCKED/Unblocked messages during spin-wait. 
#ifndef PTO2_SPIN_VERBOSE_LOGGING #define PTO2_SPIN_VERBOSE_LOGGING 1 @@ -67,6 +68,9 @@ struct PTO2HeapRing { // Reference to shared memory tail (for back-pressure) std::atomic* tail_ptr; // Points to header->heap_tail + // Error code pointer for fatal error reporting (→ sm_header->orch_error_code) + std::atomic* error_code_ptr = nullptr; + /** * Allocate memory from heap ring * @@ -75,7 +79,7 @@ struct PTO2HeapRing { * Never splits a buffer across the wrap-around boundary. * * @param size Requested size in bytes - * @return Pointer to allocated memory, never NULL (stalls instead) + * @return Pointer to allocated memory, or nullptr on fatal error */ void* pto2_heap_ring_alloc(uint64_t size) { // Align size for DMA efficiency @@ -160,7 +164,10 @@ struct PTO2HeapRing { LOG_ERROR(" Runtime env: PTO2_RING_HEAP= (e.g. %lu)", (unsigned long)(this->size * 2)); LOG_ERROR("========================================"); - exit(1); + if (error_code_ptr) { + error_code_ptr->store(PTO2_ERROR_HEAP_RING_DEADLOCK, std::memory_order_release); + } + return nullptr; } SPIN_WAIT_HINT(); @@ -264,6 +271,9 @@ struct PTO2TaskRing { // Reference to shared memory last_task_alive (for back-pressure) std::atomic* last_alive_ptr; // Points to header->last_task_alive + // Error code pointer for fatal error reporting (→ sm_header->orch_error_code) + std::atomic* error_code_ptr = nullptr; + /** * Allocate a task slot from task ring * @@ -275,6 +285,7 @@ struct PTO2TaskRing { int32_t pto2_task_ring_alloc() { // Spin-wait if window is full (back-pressure from Scheduler) int spin_count = 0; + int32_t prev_last_alive = last_alive_ptr->load(std::memory_order_acquire); #if PTO2_SPIN_VERBOSE_LOGGING bool notified = false; #endif @@ -310,50 +321,60 @@ struct PTO2TaskRing { if (!waiting) { wait_start = get_sys_cnt_aicpu(); waiting = true; } #endif + // Progress detection: reset spin counter if last_task_alive advances + int32_t cur_last_alive = last_alive_ptr->load(std::memory_order_acquire); + if 
(cur_last_alive > prev_last_alive) { +#if PTO2_SPIN_VERBOSE_LOGGING + LOG_INFO("[TaskRing] Progress: last_alive %d -> %d (reset spin_count=%d)", + prev_last_alive, cur_last_alive, spin_count); +#endif + spin_count = 0; + prev_last_alive = cur_last_alive; + } + #if PTO2_SPIN_VERBOSE_LOGGING // Periodic block notification - if (spin_count % PTO2_BLOCK_NOTIFY_INTERVAL == 0 && spin_count < PTO2_FLOW_CONTROL_SPIN_LIMIT) { - int32_t last_alive = last_alive_ptr->load(std::memory_order_acquire); + if (spin_count % PTO2_BLOCK_NOTIFY_INTERVAL == 0 && spin_count > 0 && spin_count < PTO2_FLOW_CONTROL_SPIN_LIMIT) { int32_t current = current_index_ptr->load(std::memory_order_acquire); - int32_t active_count = current - last_alive; + int32_t active_count = current - cur_last_alive; LOG_WARN("[TaskRing] BLOCKED (Flow Control): current=%d, last_alive=%d, " "active=%d/%d (%.1f%%), spins=%d", - current, last_alive, active_count, window_size, + current, cur_last_alive, active_count, window_size, 100.0 * active_count / window_size, spin_count); notified = true; } #endif - // Check for potential deadlock + // Deadlock: no progress after SPIN_LIMIT spins if (spin_count >= PTO2_FLOW_CONTROL_SPIN_LIMIT) { - int32_t last_alive = last_alive_ptr->load(std::memory_order_acquire); int32_t current = current_index_ptr->load(std::memory_order_acquire); - int32_t active_count = current - last_alive; + int32_t active_count = current - cur_last_alive; LOG_ERROR("========================================"); LOG_ERROR("FATAL: Flow Control Deadlock Detected!"); LOG_ERROR("========================================"); LOG_ERROR("Task Ring is FULL and no progress after %d spins.", spin_count); - LOG_ERROR("Flow Control Status:"); LOG_ERROR(" - Current task index: %d", current); - LOG_ERROR(" - Last task alive: %d", last_alive); - LOG_ERROR(" - Active tasks: %d", active_count); - LOG_ERROR(" - Window size: %d", window_size); + LOG_ERROR(" - Last task alive: %d (stuck here)", cur_last_alive); + LOG_ERROR(" - 
Active tasks: %d / %d", active_count, window_size); LOG_ERROR(" - Window utilization: %.1f%%", 100.0 * active_count / window_size); - LOG_ERROR("Root Cause:"); - LOG_ERROR(" Tasks cannot transition to CONSUMED state because:"); - LOG_ERROR(" - fanout_count includes 1 for the owning scope"); - LOG_ERROR(" - scope_end() requires orchestrator to continue"); - LOG_ERROR(" - But orchestrator is blocked waiting for task ring space"); - LOG_ERROR(" This creates a circular dependency (deadlock)."); + LOG_ERROR("Diagnosis:"); + LOG_ERROR(" last_task_alive is stuck at %d, meaning task %d", + cur_last_alive, cur_last_alive); + LOG_ERROR(" cannot transition to CONSUMED. Possible causes:"); + LOG_ERROR(" 1. Task %d still executing (subtasks not complete)", cur_last_alive); + LOG_ERROR(" 2. Task %d fanout not fully released (downstream not done)", cur_last_alive); + LOG_ERROR(" 3. Scope reference not released (scope_end not called)"); + LOG_ERROR(" 4. Orchestrator blocked here -> can't call scope_end -> circular wait"); LOG_ERROR("Solution:"); LOG_ERROR(" Increase task window size (current: %d, recommended: %d)", window_size, active_count * 2); LOG_ERROR(" Compile-time: PTO2_TASK_WINDOW_SIZE in pto_runtime2_types.h"); LOG_ERROR(" Runtime env: PTO2_RING_TASK_WINDOW= (e.g. 
%d)", active_count * 2); LOG_ERROR("========================================"); - - // Abort program - exit(1); + if (error_code_ptr) { + error_code_ptr->store(PTO2_ERROR_FLOW_CONTROL_DEADLOCK, std::memory_order_release); + } + return -1; } SPIN_WAIT_HINT(); @@ -373,9 +394,6 @@ struct PTO2TaskRing { // Check if there's room (leave at least 1 slot empty) if (active_count < window_size - 1) { - int32_t slot = task_id & (window_size - 1); - PTO2TaskDescriptor* task = &descriptors[slot]; - task->task_id = task_id; return task_id; } @@ -452,13 +470,54 @@ struct PTO2DepListPool { int32_t top; // Linear next-allocation counter (starts from 1) int32_t tail; // Linear first-alive counter (entries before this are dead) int32_t high_water; // Peak concurrent usage (top - tail) + int32_t last_reclaimed{0}; // last_task_alive at last successful reclamation + + // Error code pointer for fatal error reporting (→ sm_header->orch_error_code) + std::atomic* error_code_ptr = nullptr; + + /** + * Initialize dependency list pool + * + * @param base Pool base address from shared memory + * @param capacity Total number of entries + */ + void init(PTO2DepListEntry* in_base, int32_t in_capacity, std::atomic* in_error_code_ptr) { + base = in_base; + capacity = in_capacity; + top = 1; // Start from 1, 0 means NULL/empty + tail = 1; // Match initial top (no reclaimable entries yet) + high_water = 0; + last_reclaimed = 0; + + // Initialize entry 0 as NULL marker + base[0].slot_state = nullptr; + base[0].next = nullptr; + + error_code_ptr = in_error_code_ptr; + } + + /** + * Reclaim dead entries based on scheduler's slot state dep_pool_mark. + * Safe to call multiple times — only advances tail forward. 
+ * + * @param sched Scheduler state (for reading slot dep_pool_mark) + * @param ring_id Ring layer index + * @param sm_last_task_alive Current last_task_alive from shared memory + */ + void reclaim(PTO2SchedulerState& sched, uint8_t ring_id, int32_t sm_last_task_alive); + + /** + * Ensure dep pool for a specific ring has at least `needed` entries available. + * Spin-waits for reclamation if under pressure. Detects deadlock if no progress. + */ + void ensure_space(PTO2SchedulerState& sched, PTO2RingFlowControl &fc, uint8_t ring_id, int32_t needed); /** * Allocate a single entry from the pool (single-thread per pool instance) * - * @return Reference to allocated entry + * @return Pointer to allocated entry, or nullptr on fatal error */ - PTO2DepListEntry& alloc() { + PTO2DepListEntry* alloc() { int32_t used = top - tail; if (used >= capacity) { LOG_ERROR("========================================"); @@ -473,13 +532,16 @@ struct PTO2DepListPool { LOG_ERROR(" Compile-time: PTO2_DEP_LIST_POOL_SIZE in pto_runtime2_types.h"); LOG_ERROR(" Runtime env: PTO2_RING_DEP_POOL=%d", capacity * 2); LOG_ERROR("========================================"); - exit(1); + if (error_code_ptr) { + error_code_ptr->store(PTO2_ERROR_DEP_POOL_OVERFLOW, std::memory_order_release); + } + return nullptr; } int32_t idx = top % capacity; top++; used++; if (used > high_water) high_water = used; - return base[idx]; + return &base[idx]; } /** @@ -498,14 +560,15 @@ struct PTO2DepListPool { * O(1) operation: allocates new entry and links to current head. 
* * @param current_head Current list head offset (0 = empty list) - * @param task_id Task ID to prepend + * @param task_slot Task slot to prepend * @return New head offset */ - PTO2DepListEntry* pto2_dep_list_prepend(PTO2DepListEntry* cur, int32_t task_id) { - PTO2DepListEntry& new_entry = alloc(); - new_entry.task_id = task_id; - new_entry.next = cur; - return &new_entry; + PTO2DepListEntry* prepend(PTO2DepListEntry* cur, PTO2TaskSlotState* slot_state) { + PTO2DepListEntry* new_entry = alloc(); + if (!new_entry) return nullptr; + new_entry->slot_state = slot_state; + new_entry->next = cur; + return new_entry; } /** @@ -515,21 +578,28 @@ struct PTO2DepListPool { if (offset <= 0) return NULL; return &base[offset]; } + + int32_t used() const { + return top - tail; + } + + int32_t available() const { + return capacity - used(); + } }; -/** - * Initialize dependency list pool - * - * @param pool Pool to initialize - * @param base Pool base address from shared memory - * @param capacity Total number of entries - */ -void pto2_dep_pool_init(PTO2DepListPool* pool, PTO2DepListEntry* base, int32_t capacity); +// ============================================================================= +// Ring Set (per-depth aggregate) +// ============================================================================= /** - * Get pool usage statistics + * Groups a HeapRing, TaskRing, and DepPool into one per-depth unit. + * PTO2_MAX_RING_DEPTH instances provide independent reclamation per scope depth. 
*/ -int32_t pto2_dep_pool_used(PTO2DepListPool* pool); -int32_t pto2_dep_pool_available(PTO2DepListPool* pool); +struct PTO2RingSet { + PTO2HeapRing heap_ring; + PTO2TaskRing task_ring; + PTO2DepListPool dep_pool; +}; #endif // PTO_RING_BUFFER_H diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp index 76f6ee4a..19807408 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp @@ -26,11 +26,10 @@ void pto2_set_orch_thread_idx(int idx) { // Orchestration Ops Table (function-pointer dispatch for orchestration .so) // ============================================================================= -static void submit_task_impl(PTO2Runtime* rt, int32_t kernel_id, - PTO2WorkerType worker_type, - PTOParam* params, int32_t num_params) { - pto2_submit_task(&rt->orchestrators[pto2_current_orch_idx], kernel_id, worker_type, - params, num_params); +static void submit_task_impl(PTO2Runtime* rt, const MixedKernels& mixed_kernels, + const PTOParam& params) { + pto2_submit_mixed_task(&rt->orchestrators[pto2_current_orch_idx], mixed_kernels, + params); } void pto2_rt_scope_begin(PTO2Runtime* rt) { @@ -45,11 +44,16 @@ void pto2_rt_orchestration_done(PTO2Runtime* rt) { pto2_orchestrator_done(&rt->orchestrators[pto2_current_orch_idx]); } +static bool is_fatal_impl(PTO2Runtime* rt) { + return rt->orchestrators[pto2_current_orch_idx].fatal; +} + static const PTO2RuntimeOps s_runtime_ops = { .submit_task = submit_task_impl, .scope_begin = pto2_rt_scope_begin, .scope_end = pto2_rt_scope_end, .orchestration_done = pto2_rt_orchestration_done, + .is_fatal = is_fatal_impl, .log_error = unified_log_error, .log_warn = unified_log_warn, .log_info = unified_log_info, @@ -69,7 +73,8 @@ PTO2Runtime* pto2_runtime_create(PTO2RuntimeMode mode) { PTO2Runtime* pto2_runtime_create_custom(PTO2RuntimeMode mode, uint64_t 
task_window_size, - uint64_t heap_size) { + uint64_t heap_size, + int32_t dep_pool_capacity) { // Allocate runtime context PTO2Runtime* rt = (PTO2Runtime*)calloc(1, sizeof(PTO2Runtime)); if (!rt) { @@ -85,16 +90,17 @@ PTO2Runtime* pto2_runtime_create_custom(PTO2RuntimeMode mode, return NULL; } - // Allocate GM heap for output buffers - rt->gm_heap_size = heap_size; + // Allocate GM heap for output buffers (all rings combined) + uint64_t total_heap_size = heap_size * PTO2_MAX_RING_DEPTH; + rt->gm_heap_size = total_heap_size; #if defined(_POSIX_C_SOURCE) && _POSIX_C_SOURCE >= 200112L - if (posix_memalign(&rt->gm_heap, PTO2_ALIGN_SIZE, heap_size) != 0) { + if (posix_memalign(&rt->gm_heap, PTO2_ALIGN_SIZE, total_heap_size) != 0) { pto2_sm_destroy(rt->sm_handle); free(rt); return NULL; } #else - rt->gm_heap = aligned_alloc(PTO2_ALIGN_SIZE, heap_size); + rt->gm_heap = aligned_alloc(PTO2_ALIGN_SIZE, total_heap_size); if (!rt->gm_heap) { pto2_sm_destroy(rt->sm_handle); free(rt); @@ -105,15 +111,15 @@ PTO2Runtime* pto2_runtime_create_custom(PTO2RuntimeMode mode, // Initialize first orchestrator if (!pto2_orchestrator_init(&rt->orchestrators[0], rt->sm_handle, - rt->gm_heap, heap_size)) { + rt->gm_heap, heap_size, dep_pool_capacity)) { free(rt->gm_heap); pto2_sm_destroy(rt->sm_handle); free(rt); return NULL; } - // Initialize scheduler - if (!pto2_scheduler_init(&rt->scheduler, rt->sm_handle, rt->gm_heap)) { + // Initialize scheduler (heap_size = per-ring heap size) + if (!pto2_scheduler_init(&rt->scheduler, rt->sm_handle, rt->gm_heap, heap_size)) { pto2_orchestrator_destroy(&rt->orchestrators[0]); free(rt->gm_heap); pto2_sm_destroy(rt->sm_handle); @@ -131,7 +137,8 @@ PTO2Runtime* pto2_runtime_create_from_sm(PTO2RuntimeMode mode, PTO2SharedMemoryHandle* sm_handle, void* gm_heap, uint64_t heap_size, - int orch_count) { + int orch_count, + int32_t dep_pool_capacity) { if (!sm_handle) return NULL; if (orch_count < 1) orch_count = 1; if (orch_count > PTO2_MAX_ORCH_THREADS) 
orch_count = PTO2_MAX_ORCH_THREADS; @@ -143,14 +150,14 @@ PTO2Runtime* pto2_runtime_create_from_sm(PTO2RuntimeMode mode, rt->mode = mode; rt->sm_handle = sm_handle; rt->gm_heap = gm_heap; - rt->gm_heap_size = heap_size > 0 ? heap_size : 0; + rt->gm_heap_size = heap_size > 0 ? heap_size * PTO2_MAX_RING_DEPTH : 0; rt->gm_heap_owned = false; rt->orch_count = orch_count; // Initialize all orchestrator states for (int i = 0; i < orch_count; i++) { if (!pto2_orchestrator_init(&rt->orchestrators[i], rt->sm_handle, - rt->gm_heap, rt->gm_heap_size)) { + rt->gm_heap, heap_size, dep_pool_capacity)) { for (int j = 0; j < i; j++) { pto2_orchestrator_destroy(&rt->orchestrators[j]); } @@ -159,8 +166,8 @@ PTO2Runtime* pto2_runtime_create_from_sm(PTO2RuntimeMode mode, } } - // Initialize scheduler - if (!pto2_scheduler_init(&rt->scheduler, rt->sm_handle, rt->gm_heap)) { + // Initialize scheduler (heap_size = per-ring heap size) + if (!pto2_scheduler_init(&rt->scheduler, rt->sm_handle, rt->gm_heap, heap_size)) { for (int i = 0; i < orch_count; i++) { pto2_orchestrator_destroy(&rt->orchestrators[i]); } diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h index cc3dc170..c66c5fe0 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h @@ -26,6 +26,7 @@ #define PTO_RUNTIME2_H #include "pto_runtime2_types.h" +#include "pto_submit_types.h" #include "pto_shared_memory.h" #include "pto_ring_buffer.h" #include "pto_tensormap.h" @@ -58,12 +59,12 @@ enum PTO2RuntimeMode { typedef struct PTO2Runtime PTO2Runtime; // forward declare for ops signatures struct PTO2RuntimeOps { - void (*submit_task)(PTO2Runtime* rt, int32_t kernel_id, - PTO2WorkerType worker_type, - PTOParam* params, int32_t num_params); + void (*submit_task)(PTO2Runtime* rt, const MixedKernels& mixed_kernels, + const PTOParam& params); void 
(*scope_begin)(PTO2Runtime* rt); void (*scope_end)(PTO2Runtime* rt); void (*orchestration_done)(PTO2Runtime* rt); + bool (*is_fatal)(PTO2Runtime* rt); // Logging (populated by runtime, called by orchestration) void (*log_error)(const char* func, const char* fmt, ...); @@ -123,7 +124,8 @@ PTO2Runtime* pto2_runtime_create(PTO2RuntimeMode mode); */ PTO2Runtime* pto2_runtime_create_custom(PTO2RuntimeMode mode, uint64_t task_window_size, - uint64_t heap_size); + uint64_t heap_size, + int32_t dep_pool_capacity = PTO2_DEP_LIST_POOL_SIZE); /** * Create runtime from existing shared memory and GM heap (e.g. on device). @@ -139,7 +141,8 @@ PTO2Runtime* pto2_runtime_create_from_sm(PTO2RuntimeMode mode, PTO2SharedMemoryHandle* sm_handle, void* gm_heap, uint64_t heap_size, - int orch_count = 1); + int orch_count = 1, + int32_t dep_pool_capacity = PTO2_DEP_LIST_POOL_SIZE); /** * Destroy runtime and free all resources diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h index 3ed22d87..141be544 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h @@ -20,6 +20,7 @@ #include #include "pto_types.h" +#include "pto_submit_types.h" // ============================================================================= // Profiling Configuration @@ -53,34 +54,47 @@ #error "PTO2_TENSORMAP_PROFILING requires PTO2_ORCH_PROFILING=1" #endif +// ============================================================================= +// AICPU Error Codes (written to shared memory for Host-side diagnosis) +// ============================================================================= + +// Orchestrator errors (1-99): detected in orchestrator thread +#define PTO2_ERROR_NONE 0 +#define PTO2_ERROR_SCOPE_DEADLOCK 1 +#define PTO2_ERROR_HEAP_RING_DEADLOCK 2 +#define PTO2_ERROR_FLOW_CONTROL_DEADLOCK 3 +#define 
PTO2_ERROR_DEP_POOL_OVERFLOW 4 +#define PTO2_ERROR_INVALID_PARAM 5 // PTOParam construction error (invalid params) + +// Scheduler errors (100+): detected in scheduler threads +#define PTO2_ERROR_SCHEDULER_TIMEOUT 100 + // ============================================================================= // Configuration Constants // ============================================================================= // Task management -// NOTE: PTO2_TASK_WINDOW_SIZE is now the DEFAULT value only. +// NOTE: PTO2_TASK_WINDOW_SIZE is now a per-ring default value. // Actual window size is passed at runtime to pto2_runtime_create_threaded_custom(). // Use pto2_task_slot(sched, task_id) for slot calculation. -#define PTO2_TASK_WINDOW_SIZE 131072 // Default task window size (power of 2) +#define PTO2_TASK_WINDOW_SIZE 16384 // Default per-ring task window size (power of 2) + +// Multi-ring: number of independent ring layers (HeapRing + TaskRing + DepPool per layer) +// Scope depth maps to ring index via: min(scope_depth, PTO2_MAX_RING_DEPTH - 1) +#define PTO2_MAX_RING_DEPTH 4 -// Memory pools -#define PTO2_HEAP_SIZE (1024 * 1024 * 1024) // 1GB default heap -#define PTO2_DEP_LIST_POOL_SIZE 65536 // Dependency list pool entries +// Memory pools (per-ring defaults; total = value × PTO2_MAX_RING_DEPTH) +#define PTO2_HEAP_SIZE (256 * 1024 * 1024) // 256MB per ring (1GB total) +#define PTO2_DEP_LIST_POOL_SIZE 16384 // Per-ring dependency list pool entries #define PTO2_TENSORMAP_POOL_SIZE (65536) // TensorMap entry pool #define PTO2_TENSORMAP_NUM_BUCKETS 65536 // Power of 2 for fast hash -// Task parameters -#define PTO2_MAX_PARAMS 128 // Maximum parameters per task (tensors + scalars) -#define PTO2_MAX_OUTPUTS 16 // Maximum outputs per task -#define PTO2_MAX_INPUTS 16 // Maximum inputs per task -#define PTO2_MAX_INOUTS 8 // Maximum in-out params per task - // Scope management #define PTO2_MAX_SCOPE_DEPTH 64 // Maximum nesting depth #define PTO2_SCOPE_TASKS_INIT_CAP 65536 // Initial 
capacity for scope task buffer

 // Ready queue
-#define PTO2_READY_QUEUE_SIZE 65536 // Per-worker-type queue size
+#define PTO2_READY_QUEUE_SIZE 65536 // Per-shape queue size

 // Memory alignment
 #define PTO2_ALIGN_SIZE 64 // Cache line alignment
@@ -89,6 +103,50 @@
 // TensorMap cleanup interval
 #define PTO2_TENSORMAP_CLEANUP_INTERVAL 64 // Cleanup every N retired tasks
+#define PTO2_DEP_POOL_CLEANUP_INTERVAL 64 // Cleanup every N retired tasks
+
+// =============================================================================
+// Multi-Ring task_id Encoding
+// =============================================================================
+
+/**
+ * TaskId: 64-bit encoding used across Runtime2.
+ *
+ * raw encoding: (ring_id << 32) | local_id
+ *
+ * ring_id: which ring layer (0..PTO2_MAX_RING_DEPTH-1)
+ * local_id: per-ring monotonic counter
+ */
+struct PTO2TaskId {
+ uint64_t raw;
+
+ constexpr PTO2TaskId() : raw(0) {}
+ constexpr explicit PTO2TaskId(uint64_t v) : raw(v) {}
+
+ constexpr uint8_t ring() const { return static_cast<uint8_t>(raw >> 32); }
+ constexpr uint32_t local() const { return static_cast<uint32_t>(raw & 0xFFFFFFFFu); }
+
+ constexpr bool operator==(const PTO2TaskId& other) const { return raw == other.raw; }
+ constexpr bool operator!=(const PTO2TaskId& other) const { return raw != other.raw; }
+};
+
+static_assert(sizeof(PTO2TaskId) == 8, "PTO2TaskId must stay 8 bytes (shared memory ABI)");
+
+static inline PTO2TaskId pto2_make_task_id(uint8_t ring_id, uint32_t local_id) {
+ return PTO2TaskId{(static_cast<uint64_t>(ring_id) << 32) | static_cast<uint64_t>(local_id)};
+}
+
+static inline uint8_t pto2_task_id_ring(PTO2TaskId task_id) {
+ return task_id.ring();
+}
+
+static inline uint32_t pto2_task_id_local(PTO2TaskId task_id) {
+ return task_id.local();
+}
+
+static inline uint64_t pto2_task_id_raw(PTO2TaskId task_id) {
+ return task_id.raw;
+}

 // =============================================================================
 // Worker Types
@@ -261,9 +319,10 @@ typedef struct {
 *
 * Used
for both fanin_list and fanout_list */ +struct PTO2TaskSlotState; // Forward declaration struct PTO2DepListEntry { - int32_t task_id; // The dependent/dependency task ID - PTO2DepListEntry* next; // next entry + PTO2TaskSlotState* slot_state; // Consumer slot state (direct pointer) + PTO2DepListEntry* next; // next entry }; // ============================================================================= @@ -271,53 +330,107 @@ struct PTO2DepListEntry { // ============================================================================= /** - * Task descriptor structure + * Task descriptor structure (shared memory) * * Stored in the TaskDescriptor ring buffer in shared memory. - * Contains both static info (set at submission) and dynamic state. + * Contains static identification and buffer pointers only. + * Dynamic scheduling state (fanin/fanout/task_state) is in PTO2TaskSlotState. * - * Concurrency notes: - * - fanout_head, fanout_count protected by fanout_lock (per-task spinlock) - * - fanin_count set once at submission, read-only after (hot path for ready check) - * - fanin_tasks stored in TaskPayload (cold path for release) - * - Other fields set by Orchestrator, read by Scheduler + * Fields set by Orchestrator at submission, read by Scheduler for dispatch. 
*/ struct PTO2TaskDescriptor { - // Task identification - int32_t task_id; // Unique task identifier (absolute, not wrapped) - int32_t kernel_id; // InCore function to execute - int32_t worker_type; // Target: CUBE, VECTOR, AI_CPU, ACCELERATOR - // Dependency lists (linked list heads - offsets into DepListPool) - // Fanin: producers this task depends on (set once at submission) - int32_t fanin_count; // Number of producer dependencies - - // Fanout: consumers that depend on this task (grows as consumers submit) - // PROTECTED BY fanout_lock - std::atomic fanout_lock; // Per-task spinlock (0=unlocked, 1=locked) - PTO2DepListEntry* fanout_head; // Pointer to first fanout entry (nullptr = empty), PROTECTED BY fanout_lock - int32_t fanout_count; // 1 (owning scope) + number of consumers + // Mixed-task identification (encodes ring_id in upper 32 bits) + PTO2TaskId mixed_task_id; // raw: (ring_id << 32) | local_id + + // Per-slot kernel IDs (INVALID_KERNEL_ID = inactive) + int32_t kernel_id[PTO2_SUBTASK_SLOT_COUNT]; // Packed output buffer (all outputs packed into single contiguous buffer) void* packed_buffer_base; // Start of packed buffer in GM Heap void* packed_buffer_end; // End of packed buffer (for heap reclamation) }; +// ============================================================================= +// Per-Slot Scheduling State +// ============================================================================= + /** * Task payload data (cold path - only accessed during orchestration and dispatch) * - * Separated from PTO2TaskDescriptor to keep the descriptor cache-friendly - * for the scheduler's hot completion path (~80 bytes vs ~2912 bytes). + * Layout: metadata (counts, fanin pointers) packed in the first 3 cache lines, + * followed by bulk tensor and scalar data. This gives sequential write access + * during orchestration and groups scheduler-hot fields (fanin_actual_count + + * fanin_slot_states) together for on_task_release. 
*/ struct PTO2TaskPayload { - Tensor tensors[PTO2_MAX_PARAMS]; - uint64_t scalar_value[PTO2_MAX_PARAMS]; - bool is_tensor[PTO2_MAX_PARAMS]; - int param_count{0}; - int32_t fanin_tasks[PTO2_MAX_INPUTS]; // Producer task IDs (cold path, used by on_task_release) - int32_t fanin_actual_count{0}; // Actual fanin count (without the +1 redundance) - int32_t dep_pool_mark{0}; // Dep pool top after this task's submission (for reclamation) + // === Cache line 0 (64B) — metadata === + int32_t tensor_count{0}; + int32_t scalar_count{0}; + int32_t fanin_actual_count{0}; // Actual fanin count (without the +1 redundance) + int32_t _reserved{0}; // Reserved (dep_pool_mark moved to SlotState for local access) + PTO2TaskSlotState* fanin_slot_states[PTO2_MAX_INPUTS]; // Producer slot states (used by on_task_release) + // === Cache lines 3-34 (2048B) — tensors (alignas(64) forces alignment) === + Tensor tensors[PTO2_MAX_TENSOR_PARAMS]; + // === Cache lines 35-50 (1024B) — scalars === + uint64_t scalars[PTO2_MAX_SCALAR_PARAMS]; + + void init(const PTOParam& params) { + tensor_count = params.tensor_count; + scalar_count = params.scalar_count; + auto src_tensors = params.tensors; + for (int32_t i = 0; i < params.tensor_count; i++) { + tensors[i].copy(*src_tensors[i]); + } + static_assert(sizeof(scalars) == sizeof(params.scalars)); + // Round up to cache line boundary. Both arrays are 1024B so no overrun. + // Eliminates branches; extra bytes within the same CL have zero additional cost. + memcpy(scalars, params.scalars, + PTO2_ALIGN_UP(params.scalar_count * sizeof(uint64_t), 64)); + } }; +/** + * Per-task slot scheduling state (scheduler-private, NOT in shared memory) + * + * Consolidates all hot-path scheduling fields into a single cache-friendly + * structure (32 bytes = half a cache line). Accessing any field of a task's + * slot state brings all related fields into the same cache line. 
+ *
+ * Concurrency notes:
+ * - fanout_head, fanout_count protected by fanout_lock (per-task spinlock)
+ * - fanin_count set once at submission, read-only after (hot path for ready check)
+ * - task_state, fanin_refcount, fanout_refcount updated atomically
+ */
+struct alignas(64) PTO2TaskSlotState {
+ // Fanout lock + list (accessed together under lock in on_task_complete)
+ std::atomic<int32_t> fanout_lock; // Per-task spinlock (0=unlocked, 1=locked)
+ int32_t fanout_count; // 1 (owning scope) + number of consumers
+
+ PTO2DepListEntry* fanout_head; // Pointer to first fanout entry (nullptr = empty)
+
+ // Task state (completion, consumed check, ready check)
+ std::atomic<int32_t> task_state; // PENDING/READY/RUNNING/COMPLETED/CONSUMED
+
+ // Fanin (accessed together in release_fanin_and_check_ready)
+ std::atomic<int32_t> fanin_refcount; // Dynamic: counts completed producers
+ int32_t fanin_count; // Number of producer dependencies (set once)
+
+ // Fanout refcount (accessed with fanout_count in check_and_handle_consumed)
+ std::atomic<int32_t> fanout_refcount; // Dynamic: counts released references
+
+ PTO2TaskPayload* payload;
+
+ PTO2TaskDescriptor* task;
+
+ // Hot-path completion fields (moved from TaskDescriptor to avoid cross-struct access)
+ uint8_t active_mask; // Bitmask of active subtask slots (set once)
+ std::atomic<uint8_t> subtask_done_mask; // Each subtask sets its done bit on completion
+ uint8_t ring_id; // Ring layer this task belongs to (for per-ring reclamation)
+ int32_t dep_pool_mark{0}; // Dep pool top after this task's submission (orchestrator-only, local memory)
+};
+
+static_assert(sizeof(PTO2TaskSlotState) == 64);
+
 // =============================================================================
 // Cycle Cost Function Type
 // =============================================================================
@@ -381,20 +494,20 @@ typedef void (*PTO2InCoreFunc)(void** args, int32_t num_args);
 #endif

 #if PTO2_ORCH_PROFILING || PTO2_SCHED_PROFILING
-static inline void
pto2_fanout_lock(PTO2TaskDescriptor& task, +static inline void pto2_fanout_lock(PTO2TaskSlotState& slot_state, uint64_t& atomic_count, uint64_t& wait_cycle) { uint64_t t0 = get_sys_cnt_aicpu(); bool contended = false; uint32_t atomic_ops = 0; for (;;) { - while (task.fanout_lock.load(std::memory_order_acquire) != 0) { + while (slot_state.fanout_lock.load(std::memory_order_acquire) != 0) { contended = true; atomic_ops++; // each load = 1 atomic SPIN_WAIT_HINT(); } int32_t expected = 0; - if (task.fanout_lock.compare_exchange_weak(expected, 1, + if (slot_state.fanout_lock.compare_exchange_weak(expected, 1, std::memory_order_acquire, std::memory_order_relaxed)) { atomic_ops++; // successful CAS = 1 atomic atomic_count += atomic_ops; @@ -409,21 +522,21 @@ static inline void pto2_fanout_lock(PTO2TaskDescriptor& task, } #endif -static inline void pto2_fanout_lock(PTO2TaskDescriptor& task) { +static inline void pto2_fanout_lock(PTO2TaskSlotState& slot_state) { for (;;) { - while (task.fanout_lock.load(std::memory_order_acquire) != 0) { + while (slot_state.fanout_lock.load(std::memory_order_acquire) != 0) { SPIN_WAIT_HINT(); } int32_t expected = 0; - if (task.fanout_lock.compare_exchange_weak(expected, 1, + if (slot_state.fanout_lock.compare_exchange_weak(expected, 1, std::memory_order_acquire, std::memory_order_relaxed)) { return; } } } -static inline void pto2_fanout_unlock(PTO2TaskDescriptor& task) { - task.fanout_lock.store(0, std::memory_order_release); +static inline void pto2_fanout_unlock(PTO2TaskSlotState& slot_state) { + slot_state.fanout_lock.store(0, std::memory_order_release); } #endif // PTO_RUNTIME2_TYPES_H diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.cpp index 16c4ea7f..7e2abca3 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.cpp @@ -85,7 +85,7 @@ bool 
pto2_ready_queue_init(PTO2ReadyQueue* queue, uint64_t capacity) { for (uint64_t i = 0; i < capacity; i++) { queue->slots[i].sequence.store((int64_t)i, std::memory_order_relaxed); - queue->slots[i].task_id = -1; + queue->slots[i].slot_state = nullptr; } return true; @@ -102,75 +102,79 @@ void pto2_ready_queue_destroy(PTO2ReadyQueue* queue) { // Scheduler Initialization // ============================================================================= +bool PTO2SchedulerState::RingSchedState::init( + PTO2SharedMemoryHandle* sm_handle, int32_t ring_id, + void* gm_heap_base, uint64_t per_ring_heap_size) { + task_descriptors = sm_handle->task_descriptors[ring_id]; + heap_base = (char*)gm_heap_base + ring_id * per_ring_heap_size; + task_window_size = sm_handle->header->rings[ring_id].task_window_size; + task_window_mask = static_cast(task_window_size - 1); + last_task_alive = 0; + last_heap_consumed = 0; + heap_tail = 0; + slot_states = nullptr; + advance_lock.store(0, std::memory_order_relaxed); + + // Allocate per-task slot state array (dynamically sized based on runtime window_size) + slot_states = new (std::nothrow) PTO2TaskSlotState[task_window_size]; + if (!slot_states) { + return false; + } + + // Zero-initialize all per-task slot state fields. 
+ for (uint64_t i = 0; i < task_window_size; i++) { + slot_states[i].fanout_lock.store(0, std::memory_order_relaxed); + slot_states[i].fanout_count = 0; + slot_states[i].fanout_head = nullptr; + slot_states[i].task_state.store(static_cast(0), std::memory_order_relaxed); + slot_states[i].fanin_refcount.store(0, std::memory_order_relaxed); + slot_states[i].fanin_count = 0; + slot_states[i].fanout_refcount.store(0, std::memory_order_relaxed); + slot_states[i].payload = nullptr; + slot_states[i].task = nullptr; + slot_states[i].active_mask = 0; + slot_states[i].subtask_done_mask.store(0, std::memory_order_relaxed); + slot_states[i].ring_id = 0; + } + + return true; +} + +void PTO2SchedulerState::RingSchedState::destroy() { + if (!slot_states) return; + delete[] slot_states; + slot_states = nullptr; +} + bool pto2_scheduler_init(PTO2SchedulerState* sched, PTO2SharedMemoryHandle* sm_handle, - void* heap_base) { + void* gm_heap_base, uint64_t per_ring_heap_size) { sched->sm_handle = sm_handle; - sched->heap_base = heap_base; - sched->task_state = nullptr; - sched->fanin_refcount = nullptr; - sched->fanout_refcount = nullptr; -#if PTO2_PROFILING +#if PTO2_SCHED_PROFILING sched->tasks_completed.store(0, std::memory_order_relaxed); sched->tasks_consumed.store(0, std::memory_order_relaxed); #endif - sched->ring_advance_lock.store(0, std::memory_order_relaxed); - - // Get runtime task_window_size from shared memory header - uint64_t window_size = sm_handle->header->task_window_size; - sched->task_window_size = window_size; - sched->task_window_mask = window_size - 1; // For fast modulo (window_size must be power of 2) - - // Initialize local copies of ring pointers - sched->last_task_alive = 0; - sched->last_heap_consumed = 0; - sched->heap_tail = 0; - // Allocate per-task state arrays (dynamically sized based on runtime window_size) - sched->task_state = new (std::nothrow) std::atomic[window_size]; - if (!sched->task_state) { - return false; - } - - sched->fanin_refcount = 
new (std::nothrow) std::atomic[window_size]; - if (!sched->fanin_refcount) { - delete[] sched->task_state; - sched->task_state = nullptr; - return false; - } - - sched->fanout_refcount = new (std::nothrow) std::atomic[window_size]; - if (!sched->fanout_refcount) { - delete[] sched->fanin_refcount; - delete[] sched->task_state; - sched->fanin_refcount = nullptr; - sched->task_state = nullptr; - return false; - } - - // Zero-initialize all per-task state arrays. - // new[] default-initializes std::atomic which leaves values indeterminate. - // Scheduler logic (e.g. fanin_refcount fetch_add in release_fanin_and_check_ready) - // assumes slots start at zero before init_task writes them. - for (uint64_t i = 0; i < window_size; i++) { - sched->task_state[i].store(static_cast(0), std::memory_order_relaxed); - sched->fanin_refcount[i].store(0, std::memory_order_relaxed); - sched->fanout_refcount[i].store(0, std::memory_order_relaxed); + // Initialize per-ring state + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + if (!sched->ring_sched_states[r].init(sm_handle, r, gm_heap_base, per_ring_heap_size)) { + for (int j = 0; j < r; j++) { + sched->ring_sched_states[j].destroy(); + } + return false; + } } - // Initialize ready queues - for (int i = 0; i < PTO2_NUM_WORKER_TYPES; i++) { + // Initialize ready queues (one per resource shape, global) + for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) { if (!pto2_ready_queue_init(&sched->ready_queues[i], PTO2_READY_QUEUE_SIZE)) { // Cleanup on failure for (int j = 0; j < i; j++) { pto2_ready_queue_destroy(&sched->ready_queues[j]); } - delete[] sched->fanout_refcount; - delete[] sched->fanin_refcount; - delete[] sched->task_state; - sched->fanout_refcount = nullptr; - sched->fanin_refcount = nullptr; - sched->task_state = nullptr; + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + sched->ring_sched_states[r].destroy(); + } return false; } } @@ -179,22 +183,11 @@ bool pto2_scheduler_init(PTO2SchedulerState* sched, } void 
pto2_scheduler_destroy(PTO2SchedulerState* sched) { - if (sched->task_state) { - delete[] sched->task_state; - sched->task_state = nullptr; - } - - if (sched->fanin_refcount) { - delete[] sched->fanin_refcount; - sched->fanin_refcount = nullptr; - } - - if (sched->fanout_refcount) { - delete[] sched->fanout_refcount; - sched->fanout_refcount = nullptr; + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + sched->ring_sched_states[r].destroy(); } - for (int i = 0; i < PTO2_NUM_WORKER_TYPES; i++) { + for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) { pto2_ready_queue_destroy(&sched->ready_queues[i]); } } @@ -205,9 +198,15 @@ void pto2_scheduler_destroy(PTO2SchedulerState* sched) { void pto2_scheduler_print_stats(PTO2SchedulerState* sched) { LOG_INFO("=== Scheduler Statistics ==="); - LOG_INFO("last_task_alive: %d", sched->last_task_alive); - LOG_INFO("heap_tail: %" PRIu64, sched->heap_tail); -#if PTO2_PROFILING + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + if (sched->ring_sched_states[r].last_task_alive > 0 || + sched->ring_sched_states[r].heap_tail > 0) { + LOG_INFO("Ring %d:", r); + LOG_INFO(" last_task_alive: %d", sched->ring_sched_states[r].last_task_alive); + LOG_INFO(" heap_tail: %" PRIu64, sched->ring_sched_states[r].heap_tail); + } + } +#if PTO2_SCHED_PROFILING LOG_INFO("tasks_completed: %lld", (long long)sched->tasks_completed.load(std::memory_order_relaxed)); LOG_INFO("tasks_consumed: %lld", (long long)sched->tasks_consumed.load(std::memory_order_relaxed)); #endif @@ -217,10 +216,10 @@ void pto2_scheduler_print_stats(PTO2SchedulerState* sched) { void pto2_scheduler_print_queues(PTO2SchedulerState* sched) { LOG_INFO("=== Ready Queues ==="); - const char* worker_names[] = {"CUBE", "VECTOR", "AI_CPU", "ACCELERATOR"}; + const char* shape_names[] = {"AIC_ONLY", "AIV_X1", "AIV_X2", "AIC_AIV_X1", "AIC_AIV_X2"}; - for (int i = 0; i < PTO2_NUM_WORKER_TYPES; i++) { - LOG_INFO(" %s: count=%" PRIu64, worker_names[i], + for (int i = 0; i < 
PTO2_NUM_RESOURCE_SHAPES; i++) { + LOG_INFO(" %s: count=%" PRIu64, shape_names[i], sched->ready_queues[i].size()); } diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.h index b8bd1983..b3b16ef0 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.h +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.h @@ -2,10 +2,11 @@ * PTO Runtime2 - Scheduler Interface * * The Scheduler is responsible for: - * 1. Maintaining per-worker-type ready queues + * 1. Maintaining per-resource-shape ready queues * 2. Tracking task state (PENDING -> READY -> RUNNING -> COMPLETED -> CONSUMED) * 3. Managing fanin/fanout refcounts for dependency resolution * 4. Advancing last_task_alive for heap reclamation + * 5. Two-stage mixed-task completion (subtask done bits → mixed-task complete) * * The Scheduler runs on Device AI_CPU and processes: * - Task state transitions based on fanin_refcount @@ -41,44 +42,44 @@ */ struct PTO2ReadyQueueSlot { std::atomic sequence; - int32_t task_id; - int32_t _pad; + PTO2TaskSlotState* slot_state; }; /** * Thread-local ready buffer for local-first dispatch optimization. * - * One buffer per scheduling thread (mixed worker types). + * Two buffers per scheduling thread, one per CoreType (AIC=0, AIV=1). * Initialized once before the scheduling loop; must be empty at * the start of each iteration (verified by always_assert). * - * Phase 1 fills this buffer via on_task_complete(). - * Phase 2 drains it: matched tasks dispatch to idle cores, - * unmatched tasks are stored in an overflow array for Phase 3. - * Phase 3 pushes overflow to global readyQ and fills remaining - * idle cores from global readyQ. + * Phase 1 fills per-CoreType buffers via on_task_complete(). + * dispatch_ready_tasks_to_idle_cores drains them: local-first via + * get_ready_task, then remaining tasks pushed to global readyQ. 
*/ +// Number of CoreType values eligible for local dispatch (AIC=0, AIV=1) +static constexpr int PTO2_LOCAL_DISPATCH_TYPE_NUM = 2; + struct PTO2LocalReadyBuffer { - int32_t* task_ids = nullptr; // Points to caller's stack array + PTO2TaskSlotState** slot_states = nullptr; int count = 0; int capacity = 0; - void reset(int32_t* buf, int cap) { - task_ids = buf; + void reset(PTO2TaskSlotState** buf, int cap) { + slot_states = buf; count = 0; capacity = cap; } - bool try_push(int32_t task_id) { - if (task_ids && count < capacity) { - task_ids[count++] = task_id; + bool try_push(PTO2TaskSlotState* s) { + if (slot_states && count < capacity) { + slot_states[count++] = s; return true; } return false; } - int32_t pop() { - return (count > 0) ? task_ids[--count] : -1; // LIFO: better cache locality + PTO2TaskSlotState* pop() { + return (count > 0) ? slot_states[--count] : nullptr; } }; @@ -110,7 +111,7 @@ struct alignas(64) PTO2ReadyQueue { return (e >= d) ? (e - d) : 0; } - bool push(int32_t task_id) { + bool push(PTO2TaskSlotState* slot_state) { uint64_t pos; PTO2ReadyQueueSlot* slot; while (true) { @@ -128,13 +129,13 @@ struct alignas(64) PTO2ReadyQueue { } } - slot->task_id = task_id; + slot->slot_state = slot_state; slot->sequence.store((int64_t)(pos + 1), std::memory_order_release); return true; } #if PTO2_ORCH_PROFILING || PTO2_SCHED_PROFILING - bool push(int32_t task_id, uint64_t& atomic_count, uint64_t& wait_cycle) { + bool push(PTO2TaskSlotState* slot_state, uint64_t& atomic_count, uint64_t& wait_cycle) { uint64_t pos; PTO2ReadyQueueSlot* slot; uint64_t t0 = get_sys_cnt_aicpu(); @@ -166,18 +167,18 @@ struct alignas(64) PTO2ReadyQueue { wait_cycle += (get_sys_cnt_aicpu() - t0); } - slot->task_id = task_id; + slot->slot_state = slot_state; slot->sequence.store((int64_t)(pos + 1), std::memory_order_release); return true; } #endif - int32_t pop() { + PTO2TaskSlotState* pop() { // Fast-path: skip slot load when queue is clearly empty uint64_t d = 
dequeue_pos.load(std::memory_order_relaxed); uint64_t e = enqueue_pos.load(std::memory_order_relaxed); if (d >= e) { - return -1; + return nullptr; } uint64_t pos; @@ -192,23 +193,23 @@ struct alignas(64) PTO2ReadyQueue { std::memory_order_relaxed, std::memory_order_relaxed)) break; } else if (diff < 0) { - return -1; // Queue empty + return nullptr; // Queue empty } } - int32_t task_id = slot->task_id; + PTO2TaskSlotState* result = slot->slot_state; slot->sequence.store((int64_t)(pos + mask + 1), std::memory_order_release); - return task_id; + return result; } #if PTO2_SCHED_PROFILING - int32_t pop(uint64_t& atomic_count, uint64_t& wait_cycle) { + PTO2TaskSlotState* pop(uint64_t& atomic_count, uint64_t& wait_cycle) { // Fast-path: skip slot load when queue is clearly empty uint64_t d = dequeue_pos.load(std::memory_order_relaxed); uint64_t e = enqueue_pos.load(std::memory_order_relaxed); atomic_count += 2; // dequeue_pos.load + enqueue_pos.load if (d >= e) { - return -1; + return nullptr; } uint64_t pos; @@ -232,7 +233,7 @@ struct alignas(64) PTO2ReadyQueue { atomic_ops++; // failed CAS } else if (diff < 0) { atomic_count += atomic_ops; - return -1; // Queue empty + return nullptr; // Queue empty } else { contended = true; } @@ -243,9 +244,9 @@ struct alignas(64) PTO2ReadyQueue { wait_cycle += (get_sys_cnt_aicpu() - t0); } - int32_t task_id = slot->task_id; + PTO2TaskSlotState* result = slot->slot_state; slot->sequence.store((int64_t)(pos + mask + 1), std::memory_order_release); - return task_id; + return result; } #endif }; @@ -259,12 +260,13 @@ void pto2_ready_queue_destroy(PTO2ReadyQueue* queue); // ============================================================================= /** - * Statistics returned by on_task_complete + * Statistics returned by mixed-task completion processing */ struct PTO2CompletionStats { int32_t fanout_edges; // Number of fanout edges traversed (notify consumers) int32_t tasks_enqueued; // Number of consumers that became READY int32_t 
fanin_edges; // Number of fanin edges traversed (release producers) + bool mixed_task_completed; // True only when this callback completed a mixed task }; /** @@ -278,111 +280,111 @@ struct PTO2SchedulerState { // Shared memory access PTO2SharedMemoryHandle* sm_handle; - // Local copies of ring pointers (written to shared memory after update) - int32_t last_task_alive; // Task ring tail (advances on COMPLETED for slot reuse) - int32_t last_heap_consumed; // Heap watermark (advances on CONSUMED for buffer reuse) - uint64_t heap_tail; // Heap ring tail (offset from heap_base) + // Per-ring state + struct RingSchedState { + PTO2TaskDescriptor* task_descriptors; + PTO2TaskSlotState* slot_states; + int32_t last_task_alive; + int32_t last_heap_consumed; + uint64_t heap_tail; + void* heap_base; + int32_t task_window_mask; + uint64_t task_window_size; + // Try-lock used to advance this ring's pointers (CONSUMED scanning + heap tail update). + std::atomic advance_lock; + + bool init(PTO2SharedMemoryHandle* sm_handle, int32_t ring_id, + void* gm_heap_base, uint64_t per_ring_heap_size); + void destroy(); + + PTO2TaskSlotState& get_slot_state_by_task_id(int32_t local_id) { + return slot_states[local_id & task_window_mask]; + } + PTO2TaskSlotState& get_slot_state_by_slot(int32_t slot) { + return slot_states[slot]; + } - // Heap base address (for converting absolute pointers to offsets) - void* heap_base; + void sync_to_sm(PTO2SharedMemoryRingHeader& ring) { + ring.fc.last_task_alive.store(last_task_alive, std::memory_order_release); + ring.fc.heap_tail.store(heap_tail, std::memory_order_release); + } - // === DYNAMIC CONFIGURATION === - uint64_t task_window_size; // Task window size (power of 2) - uint64_t task_window_mask; // task_window_size - 1 (for fast modulo) + void advance_ring_pointers(PTO2SharedMemoryRingHeader& ring) { + int32_t current_task_index = ring.fc.current_task_index.load(std::memory_order_acquire); - // === PRIVATE DATA (not in shared memory) === + while 
(last_task_alive < current_task_index) { + PTO2TaskSlotState& slot_state = get_slot_state_by_task_id(last_task_alive); + if (slot_state.task_state.load(std::memory_order_acquire) != PTO2_TASK_CONSUMED) { + break; + } + last_task_alive++; + } - // Per-task state arrays (dynamically allocated, indexed by task_id & task_window_mask) - std::atomic* task_state; // PENDING/READY/RUNNING/COMPLETED/CONSUMED - std::atomic* fanin_refcount; // Dynamic: counts completed producers - std::atomic* fanout_refcount; // Dynamic: counts released references + if (last_task_alive > 0) { + int32_t last_consumed_id = last_task_alive - 1; + PTO2TaskSlotState& slot_state = get_slot_state_by_task_id(last_consumed_id); + PTO2TaskDescriptor& task = *slot_state.task; + if (task.packed_buffer_end != NULL) { + heap_tail = (uint64_t)((char*)task.packed_buffer_end - (char*)heap_base); + } + } - // Ready queues (one per worker type) - PTO2ReadyQueue ready_queues[PTO2_NUM_WORKER_TYPES]; + sync_to_sm(ring); + } + } ring_sched_states[PTO2_MAX_RING_DEPTH]; + + // Ready queues remain global (scheduling is ring-agnostic) + PTO2ReadyQueue ready_queues[PTO2_NUM_RESOURCE_SHAPES]; // Statistics -#if PTO2_PROFILING +#if PTO2_SCHED_PROFILING std::atomic tasks_completed; std::atomic tasks_consumed; #endif - std::atomic ring_advance_lock{0}; // Try-lock for advance_ring_pointers - // ========================================================================= // Inline hot-path methods // ========================================================================= - - int32_t pto2_task_slot(int32_t task_id) { - return task_id & task_window_mask; - } - - void sync_to_sm() { - PTO2SharedMemoryHeader* header = sm_handle->header; - header->last_task_alive.store(last_task_alive, std::memory_order_release); - header->heap_tail.store(heap_tail, std::memory_order_release); - header->heap_tail_gen.store(last_task_alive, std::memory_order_release); + PTO2TaskSlotState& get_slot_state(int32_t ring_id, int32_t local_id) { + 
return ring_sched_states[ring_id].get_slot_state_by_task_id(local_id); } - - void advance_ring_pointers() { - PTO2SharedMemoryHeader* header = sm_handle->header; - int32_t current_task_index = header->current_task_index.load(std::memory_order_acquire); - - while (last_task_alive < current_task_index) { - int32_t slot = pto2_task_slot(last_task_alive); - if (task_state[slot].load(std::memory_order_acquire) != PTO2_TASK_CONSUMED) { - break; - } - last_task_alive++; - } - - if (last_task_alive > 0) { - int32_t last_consumed_id = last_task_alive - 1; - PTO2TaskDescriptor* last_consumed = pto2_sm_get_task(sm_handle, last_consumed_id); - if (last_consumed->packed_buffer_end != NULL) { - heap_tail = (uint64_t)((char*)last_consumed->packed_buffer_end - (char*)heap_base); - } - } - - sync_to_sm(); + PTO2TaskSlotState& get_slot_state_by_slot(int32_t ring_id, int32_t slot) { + return ring_sched_states[ring_id].get_slot_state_by_slot(slot); } - void check_and_handle_consumed(int32_t slot, PTO2TaskDescriptor& task) { - if (fanout_refcount[slot].load(std::memory_order_acquire) != task.fanout_count) return; + void check_and_handle_consumed(PTO2TaskSlotState& slot_state) { + if (slot_state.fanout_refcount.load(std::memory_order_acquire) != slot_state.fanout_count) return; PTO2TaskState expected = PTO2_TASK_COMPLETED; - if (!task_state[slot].compare_exchange_strong(expected, PTO2_TASK_CONSUMED, + if (!slot_state.task_state.compare_exchange_strong(expected, PTO2_TASK_CONSUMED, std::memory_order_acq_rel, std::memory_order_acquire)) { return; } -#if PTO2_PROFILING +#if PTO2_SCHED_PROFILING tasks_consumed.fetch_add(1, std::memory_order_relaxed); #endif - fanout_refcount[slot].store(0, std::memory_order_release); - fanin_refcount[slot].store(0, std::memory_order_release); - // Try-lock — if another thread is advancing, it will scan our CONSUMED task + int32_t ring_id = slot_state.ring_id; + // Try-lock — if another thread is advancing this ring, it will scan our CONSUMED task int32_t 
expected_lock = 0; - if (ring_advance_lock.compare_exchange_strong(expected_lock, 1, + if (ring_sched_states[ring_id].advance_lock.compare_exchange_strong(expected_lock, 1, std::memory_order_acquire, std::memory_order_relaxed)) { - advance_ring_pointers(); - ring_advance_lock.store(0, std::memory_order_release); + ring_sched_states[ring_id].advance_ring_pointers(sm_handle->header->rings[ring_id]); + ring_sched_states[ring_id].advance_lock.store(0, std::memory_order_release); } } #if PTO2_ORCH_PROFILING || PTO2_SCHED_PROFILING - void check_and_handle_consumed(int32_t task_id, PTO2TaskDescriptor& task, - uint64_t& atomic_count) { - int32_t slot = pto2_task_slot(task_id); - - int32_t fc = task.fanout_count; - int32_t rc = fanout_refcount[slot].load(std::memory_order_acquire); + void check_and_handle_consumed(PTO2TaskSlotState& slot_state, uint64_t& atomic_count) { + int32_t fc = slot_state.fanout_count; + int32_t rc = slot_state.fanout_refcount.load(std::memory_order_acquire); atomic_count += 2; // fanout_count.load + fanout_refcount.load if (rc != fc) return; PTO2TaskState expected = PTO2_TASK_COMPLETED; - if (!task_state[slot].compare_exchange_strong(expected, PTO2_TASK_CONSUMED, + if (!slot_state.task_state.compare_exchange_strong(expected, PTO2_TASK_CONSUMED, std::memory_order_acq_rel, std::memory_order_acquire)) { atomic_count += 1; // failed CAS return; @@ -390,20 +392,17 @@ struct PTO2SchedulerState { atomic_count += 1; // successful CAS -#if PTO2_PROFILING +#if PTO2_SCHED_PROFILING tasks_consumed.fetch_add(1, std::memory_order_relaxed); #endif - fanout_refcount[slot].store(0, std::memory_order_release); - fanin_refcount[slot].store(0, std::memory_order_release); - - atomic_count += 2; // fanout_refcount.store + fanin_refcount.store - // Try-lock — if another thread is advancing, it will scan our CONSUMED task + int32_t ring_id = slot_state.ring_id; + // Try-lock — if another thread is advancing this ring, it will scan our CONSUMED task int32_t expected_lock = 
0; - if (ring_advance_lock.compare_exchange_strong(expected_lock, 1, + if (ring_sched_states[ring_id].advance_lock.compare_exchange_strong(expected_lock, 1, std::memory_order_acquire, std::memory_order_relaxed)) { - advance_ring_pointers(); - ring_advance_lock.store(0, std::memory_order_release); + ring_sched_states[ring_id].advance_ring_pointers(sm_handle->header->rings[ring_id]); + ring_sched_states[ring_id].advance_lock.store(0, std::memory_order_release); atomic_count += 2; // try-lock CAS + unlock store } else { atomic_count += 1; // failed try-lock CAS @@ -411,41 +410,37 @@ struct PTO2SchedulerState { } #endif - void release_producer(int32_t producer_id) { - int32_t slot = pto2_task_slot(producer_id); - PTO2TaskDescriptor& producer = pto2_sm_get_task_by_slot(sm_handle, slot); - fanout_refcount[slot].fetch_add(1, std::memory_order_acq_rel); - check_and_handle_consumed(slot, producer); + void release_producer(PTO2TaskSlotState& slot_state) { + slot_state.fanout_refcount.fetch_add(1, std::memory_order_acq_rel); + check_and_handle_consumed(slot_state); } #if PTO2_ORCH_PROFILING || PTO2_SCHED_PROFILING - void release_producer(int32_t producer_id, uint64_t& atomic_count) { - int32_t slot = pto2_task_slot(producer_id); - PTO2TaskDescriptor& producer = pto2_sm_get_task_by_slot(sm_handle, slot); - fanout_refcount[slot].fetch_add(1, std::memory_order_acq_rel); + void release_producer(PTO2TaskSlotState& slot_state, uint64_t& atomic_count) { + slot_state.fanout_refcount.fetch_add(1, std::memory_order_acq_rel); atomic_count += 1; // fanout_refcount.fetch_add - check_and_handle_consumed(producer_id, producer, atomic_count); + check_and_handle_consumed(slot_state, atomic_count); } #endif - bool release_fanin_and_check_ready(int32_t task_id, - PTO2TaskDescriptor* task, - PTO2LocalReadyBuffer* local_buf = nullptr) { - int32_t slot = pto2_task_slot(task_id); - + bool release_fanin_and_check_ready(PTO2TaskSlotState& slot_state, + PTO2LocalReadyBuffer* local_bufs = nullptr) { // 
Atomically increment fanin_refcount and check if all producers are done // ACQ_REL on fanin_refcount already synchronizes with the orchestrator's - // release in init_task, making fanin_count visible — plain load suffices. - int32_t new_refcount = fanin_refcount[slot].fetch_add(1, std::memory_order_acq_rel) + 1; + // init release, making fanin_count visible — plain load suffices. + int32_t new_refcount = slot_state.fanin_refcount.fetch_add(1, std::memory_order_acq_rel) + 1; - if (new_refcount == task->fanin_count) { - // Local-first: try thread-local buffer before global queue + if (new_refcount == slot_state.fanin_count) { + // Local-first: try per-CoreType thread-local buffer before global queue + // Route by active_mask: AIC-containing tasks → buf[0], AIV-only → buf[1] + PTO2ResourceShape shape = pto2_active_mask_to_shape(slot_state.active_mask); bool pushed_local = false; - if (local_buf) { - pushed_local = local_buf->try_push(task_id); + if (local_bufs) { + int32_t buf_idx = (slot_state.active_mask & 0x01) ? 
0 : 1; + pushed_local = local_bufs[buf_idx].try_push(&slot_state); } if (!pushed_local) { - ready_queues[task->worker_type].push(task_id); + ready_queues[static_cast(shape)].push(&slot_state); } return true; } @@ -453,26 +448,26 @@ struct PTO2SchedulerState { } #if PTO2_ORCH_PROFILING || PTO2_SCHED_PROFILING - bool release_fanin_and_check_ready(int32_t task_id, PTO2TaskDescriptor* task, + bool release_fanin_and_check_ready(PTO2TaskSlotState& slot_state, uint64_t& atomic_count, uint64_t& push_wait, - PTO2LocalReadyBuffer* local_buf = nullptr) { - int32_t slot = pto2_task_slot(task_id); - - int32_t new_refcount = fanin_refcount[slot].fetch_add(1, std::memory_order_acq_rel) + 1; + PTO2LocalReadyBuffer* local_bufs = nullptr) { + int32_t new_refcount = slot_state.fanin_refcount.fetch_add(1, std::memory_order_acq_rel) + 1; atomic_count += 1; // fanin_refcount.fetch_add - if (new_refcount == task->fanin_count) { + if (new_refcount == slot_state.fanin_count) { PTO2TaskState expected = PTO2_TASK_PENDING; - if (task_state[slot].compare_exchange_strong( + if (slot_state.task_state.compare_exchange_strong( expected, PTO2_TASK_READY, std::memory_order_acq_rel, std::memory_order_acquire)) { atomic_count += 1; // CAS(task_state PENDING→READY) - // Local-first: try thread-local buffer before global queue + // Local-first: try per-CoreType thread-local buffer before global queue + PTO2ResourceShape shape = pto2_active_mask_to_shape(slot_state.active_mask); bool pushed_local = false; - if (local_buf) { - pushed_local = local_buf->try_push(task_id); + if (local_bufs) { + int32_t buf_idx = (slot_state.active_mask & 0x01) ? 
0 : 1; + pushed_local = local_bufs[buf_idx].try_push(&slot_state); } if (!pushed_local) { - ready_queues[task->worker_type].push(task_id, atomic_count, push_wait); + ready_queues[static_cast(shape)].push(&slot_state, atomic_count, push_wait); } return true; } @@ -481,89 +476,108 @@ struct PTO2SchedulerState { } #endif - void init_task(int32_t task_id, PTO2TaskDescriptor* task) { - int32_t slot = pto2_task_slot(task_id); - - task_state[slot].store(PTO2_TASK_PENDING, std::memory_order_relaxed); // Orchestrator is the unique owner - - // Reset fanout_refcount for new task lifecycle. - // Do NOT reset fanin_refcount — it may have been incremented by - // concurrent on_task_complete between Step 5 and Step 6. - fanout_refcount[slot].store(0, std::memory_order_relaxed); - -#if PTO2_ORCH_PROFILING || PTO2_SCHED_PROFILING - extern uint64_t g_orch_finalize_atomic_count; - extern uint64_t g_orch_finalize_wait_cycle; - release_fanin_and_check_ready(task_id, task, - g_orch_finalize_atomic_count, g_orch_finalize_wait_cycle); -#else - release_fanin_and_check_ready(task_id, task); -#endif + PTO2TaskSlotState* get_ready_task(PTO2ResourceShape shape) { + return ready_queues[static_cast(shape)].pop(); } template - int32_t get_ready_task() { - return ready_queues[static_cast(CT)].pop(); + PTO2TaskSlotState* get_ready_task(PTO2LocalReadyBuffer* local_bufs) { + constexpr int ct = static_cast(CT); + if (local_bufs && local_bufs[ct].count > 0) { + return local_bufs[ct].pop(); + } + return ready_queues[ct].pop(); } #if PTO2_SCHED_PROFILING + PTO2TaskSlotState* get_ready_task(PTO2ResourceShape shape, uint64_t& atomic_count, uint64_t& wait_cycle) { + return ready_queues[static_cast(shape)].pop(atomic_count, wait_cycle); + } + template - int32_t get_ready_task(uint64_t& atomic_count, uint64_t& wait_cycle) { - return ready_queues[static_cast(CT)].pop(atomic_count, wait_cycle); + PTO2TaskSlotState* get_ready_task(PTO2LocalReadyBuffer* local_bufs, + uint64_t& atomic_count, uint64_t& wait_cycle) 
{ + constexpr int ct = static_cast(CT); + if (local_bufs && local_bufs[ct].count > 0) { + return local_bufs[ct].pop(); + } + return ready_queues[ct].pop(atomic_count, wait_cycle); } #endif - void on_scope_end(const int32_t* task_ids, int32_t count) { -#if PTO2_ORCH_PROFILING || PTO2_SCHED_PROFILING + /** + * Requeue a ready task that could not be dispatched (no suitable cluster). + * Pushes the task back into its shape-based queue. + */ + void requeue_ready_task(PTO2TaskSlotState& slot_state) { + PTO2ResourceShape shape = pto2_active_mask_to_shape(slot_state.active_mask); + ready_queues[static_cast(shape)].push(&slot_state); + } + + void on_scope_end(PTO2TaskSlotState** task_slot_states, int32_t count) { +#if PTO2_ORCH_PROFILING extern uint64_t g_orch_scope_end_atomic_count; for (int32_t i = 0; i < count; i++) { - release_producer(task_ids[i], g_orch_scope_end_atomic_count); + release_producer(*task_slot_states[i], g_orch_scope_end_atomic_count); } #else for (int32_t i = 0; i < count; i++) { - release_producer(task_ids[i]); + release_producer(*task_slot_states[i]); } #endif } + /** + * Two-stage completion: first stage. + * Called when a single subtask (AIC, AIV0, or AIV1) finishes. + * Sets the corresponding done bit in subtask_done_mask. + * + * @return true if this subtask was the last one, completing the mixed task. + */ + bool on_subtask_complete(PTO2TaskSlotState& slot_state, PTO2SubtaskSlot subslot) { + uint8_t done_bit = (1u << static_cast(subslot)); + uint8_t prev_mask = slot_state.subtask_done_mask.fetch_or(done_bit, std::memory_order_acq_rel); + uint8_t new_mask = prev_mask | done_bit; + + return new_mask == slot_state.active_mask; + } + + /** + * Two-stage completion: second stage. + * Called exactly once when all subtasks of a mixed task are done + * (i.e., on_subtask_complete returned true). + * Handles fanout notification, fanin release, and self-consumption check. 
+ */ #if PTO2_SCHED_PROFILING - PTO2CompletionStats on_task_complete(int32_t task_id, int thread_idx, - PTO2LocalReadyBuffer* local_buf = nullptr) { - PTO2CompletionStats stats = {0, 0, 0}; -#elif PTO2_PROFILING - PTO2CompletionStats on_task_complete(int32_t task_id, - PTO2LocalReadyBuffer* local_buf = nullptr) { - PTO2CompletionStats stats = {0, 0, 0}; + PTO2CompletionStats #else - void on_task_complete(int32_t task_id, - PTO2LocalReadyBuffer* local_buf = nullptr) { + void #endif - int32_t slot = pto2_task_slot(task_id); - PTO2TaskDescriptor& task = pto2_sm_get_task_by_slot(sm_handle, task_id); - -#if PTO2_PROFILING - tasks_completed.fetch_add(1, std::memory_order_relaxed); + on_mixed_task_complete(PTO2TaskSlotState& slot_state, +#if PTO2_SCHED_PROFILING + int thread_idx, #endif + PTO2LocalReadyBuffer* local_bufs = nullptr) { +#if PTO2_SCHED_PROFILING + PTO2CompletionStats stats = {0, 0, 0, true}; +#endif #if PTO2_SCHED_PROFILING extern uint64_t g_sched_lock_cycle[], g_sched_fanout_cycle[]; - extern uint64_t g_sched_self_consumed_cycle[]; extern uint64_t g_sched_lock_atomic_count[], g_sched_lock_wait_cycle[]; extern uint64_t g_sched_fanout_atomic_count[], g_sched_push_wait_cycle[]; - extern uint64_t g_sched_self_atomic_count[]; - extern uint64_t g_sched_complete_count[]; uint64_t lock_atomics = 0, lock_wait = 0; PTO2_SCHED_CYCLE_START(); #endif #if PTO2_SCHED_PROFILING - pto2_fanout_lock(task, lock_atomics, lock_wait); + pto2_fanout_lock(slot_state, lock_atomics, lock_wait); #else - pto2_fanout_lock(task); + pto2_fanout_lock(slot_state); #endif - task_state[slot].store(PTO2_TASK_COMPLETED, std::memory_order_release); - PTO2DepListEntry* current = task.fanout_head; // Protected by fanout_lock - pto2_fanout_unlock(task); + slot_state.task_state.store(PTO2_TASK_COMPLETED, std::memory_order_release); + PTO2DepListEntry* current = slot_state.fanout_head; // Protected by fanout_lock + pto2_fanout_unlock(slot_state); #if PTO2_SCHED_PROFILING lock_atomics += 2; // 
state.store + unlock.store @@ -577,24 +591,15 @@ struct PTO2SchedulerState { uint64_t fanout_atomics = 0, push_wait = 0; #endif while (current != nullptr) { - int32_t consumer_id = current->task_id; - PTO2TaskDescriptor* consumer = pto2_sm_get_task(sm_handle, consumer_id); -#if PTO2_PROFILING - stats.fanout_edges++; -#endif + PTO2TaskSlotState& consumer_slot = *current->slot_state; #if PTO2_SCHED_PROFILING - if (release_fanin_and_check_ready(consumer_id, consumer, - fanout_atomics, push_wait, local_buf)) { -#if PTO2_PROFILING - stats.tasks_enqueued++; -#endif - } -#elif PTO2_PROFILING - if (release_fanin_and_check_ready(consumer_id, consumer, local_buf)) { + stats.fanout_edges++; + if (release_fanin_and_check_ready(consumer_slot, + fanout_atomics, push_wait, local_bufs)) { stats.tasks_enqueued++; } #else - release_fanin_and_check_ready(consumer_id, consumer, local_buf); + release_fanin_and_check_ready(consumer_slot, local_bufs); #endif current = current->next; } @@ -603,9 +608,6 @@ struct PTO2SchedulerState { g_sched_fanout_atomic_count[thread_idx] += fanout_atomics; g_sched_push_wait_cycle[thread_idx] += push_wait; PTO2_SCHED_CYCLE_LAP(g_sched_fanout_cycle[thread_idx]); -#endif - -#if PTO2_PROFILING return stats; #endif } @@ -616,7 +618,7 @@ struct PTO2SchedulerState { */ #if PTO2_SCHED_PROFILING - int32_t on_task_release(int32_t task_id, int32_t thread_idx) { + int32_t on_task_release(PTO2TaskSlotState& slot_state, int32_t thread_idx) { PTO2_SCHED_CYCLE_START(); extern uint64_t g_sched_fanin_cycle[], g_sched_fanin_atomic_count[]; extern uint64_t g_sched_self_atomic_count[]; @@ -624,16 +626,15 @@ struct PTO2SchedulerState { extern uint64_t g_sched_complete_count[]; uint64_t fanin_atomics = 0; #else - int32_t on_task_release(int32_t task_id) { + int32_t on_task_release(PTO2TaskSlotState& slot_state) { #endif - int32_t slot = pto2_task_slot(task_id); - PTO2TaskPayload* payload = &sm_handle->task_payloads[slot]; + PTO2TaskPayload* payload = slot_state.payload; 
int32_t fanin_edges = payload->fanin_actual_count; for (int32_t i = 0; i < fanin_edges; i++) { #if PTO2_SCHED_PROFILING - release_producer(payload->fanin_tasks[i], fanin_atomics); + release_producer(*payload->fanin_slot_states[i], fanin_atomics); #else - release_producer(payload->fanin_tasks[i]); + release_producer(*payload->fanin_slot_states[i]); #endif } #if PTO2_SCHED_PROFILING @@ -644,12 +645,12 @@ struct PTO2SchedulerState { // Self consumed check #if PTO2_SCHED_PROFILING uint64_t self_atomics = 0; - check_and_handle_consumed(slot, task, self_atomics); + check_and_handle_consumed(slot_state, self_atomics); g_sched_self_atomic_count[thread_idx] += self_atomics; PTO2_SCHED_CYCLE_LAP(g_sched_self_consumed_cycle[thread_idx]); g_sched_complete_count[thread_idx]++; #else - check_and_handle_consumed(slot, pto2_sm_get_task_by_slot(sm_handle, slot)); + check_and_handle_consumed(slot_state); #endif return fanin_edges; } @@ -661,7 +662,7 @@ struct PTO2SchedulerState { bool pto2_scheduler_init(PTO2SchedulerState* sched, PTO2SharedMemoryHandle* sm_handle, - void* heap_base); + void* gm_heap_base, uint64_t per_ring_heap_size); void pto2_scheduler_destroy(PTO2SchedulerState* sched); // ============================================================================= @@ -678,7 +679,7 @@ const char* pto2_task_state_name(PTO2TaskState state); #if PTO2_SCHED_PROFILING struct PTO2SchedProfilingData { - // Sub-phase cycle breakdown within on_task_complete + // Sub-phase cycle breakdown within on_mixed_task_complete uint64_t lock_cycle; // pto2_fanout_lock + state store + unlock uint64_t fanout_cycle; // fanout traversal uint64_t fanin_cycle; // fanin traversal diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.cpp index f9f0f65f..be9bd3e0 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.cpp +++ 
b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.cpp @@ -1,9 +1,9 @@ /** * PTO Runtime2 - Shared Memory Implementation - * + * * Implements shared memory allocation, initialization, and management * for Orchestrator-Scheduler communication. - * + * * Based on: docs/runtime_buffer_manager_methods.md */ @@ -18,16 +18,24 @@ // ============================================================================= uint64_t pto2_sm_calculate_size(uint64_t task_window_size) { + uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]; + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + task_window_sizes[r] = task_window_size; + } + return pto2_sm_calculate_size_per_ring(task_window_sizes); +} + +uint64_t pto2_sm_calculate_size_per_ring(const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]) { uint64_t size = 0; // Header (aligned to cache line) size += PTO2_ALIGN_UP(sizeof(PTO2SharedMemoryHeader), PTO2_ALIGN_SIZE); - // Task descriptors (hot: dependency metadata only) - size += PTO2_ALIGN_UP(task_window_size * sizeof(PTO2TaskDescriptor), PTO2_ALIGN_SIZE); - - // Task payloads (cold: tensors/scalars, only accessed during orchestration and dispatch) - size += PTO2_ALIGN_UP(task_window_size * sizeof(PTO2TaskPayload), PTO2_ALIGN_SIZE); + // Per-ring task descriptors and payloads + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + size += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskDescriptor), PTO2_ALIGN_SIZE); + size += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskPayload), PTO2_ALIGN_SIZE); + } return size; } @@ -36,6 +44,33 @@ uint64_t pto2_sm_calculate_size(uint64_t task_window_size) { // Creation and Destruction // ============================================================================= +static void pto2_sm_setup_pointers_per_ring( + PTO2SharedMemoryHandle* handle, + const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]) { + char* ptr = (char*)handle->sm_base; + + // Header + handle->header = (PTO2SharedMemoryHeader*)ptr; + ptr += 
PTO2_ALIGN_UP(sizeof(PTO2SharedMemoryHeader), PTO2_ALIGN_SIZE); + + // Per-ring task descriptors and payloads + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + handle->task_descriptors[r] = (PTO2TaskDescriptor*)ptr; + ptr += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskDescriptor), PTO2_ALIGN_SIZE); + + handle->task_payloads[r] = (PTO2TaskPayload*)ptr; + ptr += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskPayload), PTO2_ALIGN_SIZE); + } +} + +static void pto2_sm_setup_pointers(PTO2SharedMemoryHandle* handle, uint64_t task_window_size) { + uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]; + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + task_window_sizes[r] = task_window_size; + } + pto2_sm_setup_pointers_per_ring(handle, task_window_sizes); +} + PTO2SharedMemoryHandle* pto2_sm_create(uint64_t task_window_size, uint64_t heap_size) { // Allocate handle @@ -68,18 +103,7 @@ PTO2SharedMemoryHandle* pto2_sm_create(uint64_t task_window_size, memset(handle->sm_base, 0, static_cast(sm_size)); // Set up pointers - char* ptr = (char*)handle->sm_base; - - // Header - handle->header = (PTO2SharedMemoryHeader*)ptr; - ptr += PTO2_ALIGN_UP(sizeof(PTO2SharedMemoryHeader), PTO2_ALIGN_SIZE); - - // Task descriptors - handle->task_descriptors = (PTO2TaskDescriptor*)ptr; - ptr += PTO2_ALIGN_UP(task_window_size * sizeof(PTO2TaskDescriptor), PTO2_ALIGN_SIZE); - - // Task payloads (cold data) - handle->task_payloads = (PTO2TaskPayload*)ptr; + pto2_sm_setup_pointers(handle, task_window_size); // Initialize header pto2_sm_init_header(handle, task_window_size, heap_size); @@ -108,57 +132,71 @@ PTO2SharedMemoryHandle* pto2_sm_create_from_buffer(void* sm_base, handle->sm_size = sm_size; handle->is_owner = false; - char* ptr = (char*)sm_base; - handle->header = (PTO2SharedMemoryHeader*)ptr; - ptr += PTO2_ALIGN_UP(sizeof(PTO2SharedMemoryHeader), PTO2_ALIGN_SIZE); - handle->task_descriptors = (PTO2TaskDescriptor*)ptr; - ptr += PTO2_ALIGN_UP(task_window_size * 
sizeof(PTO2TaskDescriptor), PTO2_ALIGN_SIZE); - handle->task_payloads = (PTO2TaskPayload*)ptr; - + pto2_sm_setup_pointers(handle, task_window_size); pto2_sm_init_header(handle, task_window_size, heap_size); - + return handle; } void pto2_sm_destroy(PTO2SharedMemoryHandle* handle) { if (!handle) return; - + if (handle->is_owner && handle->sm_base) { free(handle->sm_base); } - + free(handle); } // ============================================================================= // Initialization // ============================================================================= -// +// // no need init data in pool, init pool data when used void pto2_sm_init_header(PTO2SharedMemoryHandle* handle, uint64_t task_window_size, uint64_t heap_size) { + uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]; + uint64_t heap_sizes[PTO2_MAX_RING_DEPTH]; + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + task_window_sizes[r] = task_window_size; + heap_sizes[r] = heap_size; + } + pto2_sm_init_header_per_ring(handle, task_window_sizes, heap_sizes); +} + +void pto2_sm_init_header_per_ring( + PTO2SharedMemoryHandle* handle, + const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH], + const uint64_t heap_sizes[PTO2_MAX_RING_DEPTH]) { PTO2SharedMemoryHeader* header = handle->header; - // Flow control pointers (start at 0) - header->current_task_index.store(0, std::memory_order_relaxed); - header->heap_top.store(0, std::memory_order_relaxed); - header->orchestrator_done.store(0, std::memory_order_relaxed); - header->last_task_alive.store(0, std::memory_order_relaxed); - header->heap_tail.store(0, std::memory_order_relaxed); - header->heap_tail_gen.store(0, std::memory_order_relaxed); + // Per-ring flow control (start at 0) + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + header->rings[r].fc.init(); + } - // Layout info - header->task_window_size = task_window_size; - header->heap_size = heap_size; + header->orchestrator_done.store(0, std::memory_order_relaxed); - // Calculate offsets + // 
Per-ring layout info uint64_t offset = PTO2_ALIGN_UP(sizeof(PTO2SharedMemoryHeader), PTO2_ALIGN_SIZE); - header->task_descriptors_offset = offset; + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + header->rings[r].task_window_size = task_window_sizes[r]; + header->rings[r].heap_size = heap_sizes[r]; + header->rings[r].task_descriptors_offset = offset; + offset += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskDescriptor), PTO2_ALIGN_SIZE); + offset += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskPayload), PTO2_ALIGN_SIZE); + } header->total_size = handle->sm_size; header->graph_output_ptr.store(0, std::memory_order_relaxed); header->graph_output_size.store(0, std::memory_order_relaxed); + + // Error reporting + header->orch_error_code.store(PTO2_ERROR_NONE, std::memory_order_relaxed); + header->sched_error_bitmap.store(0, std::memory_order_relaxed); + header->sched_error_code.store(PTO2_ERROR_NONE, std::memory_order_relaxed); + header->sched_error_thread.store(-1, std::memory_order_relaxed); } // ============================================================================= @@ -173,16 +211,24 @@ void pto2_sm_print_layout(PTO2SharedMemoryHandle* handle) { LOG_INFO("=== PTO2 Shared Memory Layout ==="); LOG_INFO("Base address: %p", handle->sm_base); LOG_INFO("Total size: %" PRIu64 " bytes", h->total_size); - LOG_INFO("Task window size: %" PRIu64, h->task_window_size); - LOG_INFO("Heap size: %" PRIu64 " bytes", h->heap_size); - LOG_INFO("Offsets:"); - LOG_INFO(" TaskDescriptors: %" PRIu64 " (0x%" PRIx64 ")", h->task_descriptors_offset, h->task_descriptors_offset); - LOG_INFO("Flow control:"); - LOG_INFO(" heap_top: %" PRIu64, h->heap_top.load(std::memory_order_acquire)); - LOG_INFO(" heap_tail: %" PRIu64, h->heap_tail.load(std::memory_order_acquire)); - LOG_INFO(" current_task_index: %d", h->current_task_index.load(std::memory_order_acquire)); - LOG_INFO(" orchestrator_done: %d", h->orchestrator_done.load(std::memory_order_acquire)); - LOG_INFO(" 
last_task_alive: %d", h->last_task_alive.load(std::memory_order_acquire)); + LOG_INFO("Ring depth: %d", PTO2_MAX_RING_DEPTH); + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + LOG_INFO("Ring %d:", r); + LOG_INFO(" task_window_size: %" PRIu64, h->rings[r].task_window_size); + LOG_INFO(" heap_size: %" PRIu64 " bytes", h->rings[r].heap_size); + LOG_INFO(" descriptors_off: %" PRIu64 " (0x%" PRIx64 ")", + h->rings[r].task_descriptors_offset, h->rings[r].task_descriptors_offset); + LOG_INFO(" heap_top: %" PRIu64, h->rings[r].fc.heap_top.load(std::memory_order_acquire)); + LOG_INFO(" heap_tail: %" PRIu64, h->rings[r].fc.heap_tail.load(std::memory_order_acquire)); + LOG_INFO(" current_task_idx: %d", h->rings[r].fc.current_task_index.load(std::memory_order_acquire)); + LOG_INFO(" last_task_alive: %d", h->rings[r].fc.last_task_alive.load(std::memory_order_acquire)); + } + LOG_INFO("orchestrator_done: %d", h->orchestrator_done.load(std::memory_order_acquire)); + LOG_INFO("Error state:"); + LOG_INFO(" orch_error_code: %d", h->orch_error_code.load(std::memory_order_relaxed)); + LOG_INFO(" sched_error_bitmap: 0x%x", h->sched_error_bitmap.load(std::memory_order_relaxed)); + LOG_INFO(" sched_error_code: %d", h->sched_error_code.load(std::memory_order_relaxed)); + LOG_INFO(" sched_error_thread: %d", h->sched_error_thread.load(std::memory_order_relaxed)); LOG_INFO("================================"); } @@ -193,21 +239,35 @@ bool pto2_sm_validate(PTO2SharedMemoryHandle* handle) { PTO2SharedMemoryHeader* h = handle->header; + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + if (!h->rings[r].fc.validate(handle, r)) return false; + } + + return true; +} + +bool PTO2RingFlowControl::validate(PTO2SharedMemoryHandle* handle, int32_t ring_id) const { + if (!handle) return false; + if (!handle->header) return false; + if (ring_id < 0 || ring_id >= PTO2_MAX_RING_DEPTH) return false; + + const PTO2SharedMemoryHeader* h = handle->header; + // Check that offsets are within bounds - if 
(h->task_descriptors_offset >= h->total_size) return false; + if (h->rings[ring_id].task_descriptors_offset >= h->total_size) return false; // Check pointer alignment - if ((uintptr_t)handle->task_descriptors % PTO2_ALIGN_SIZE != 0) return false; + if ((uintptr_t)handle->task_descriptors[ring_id] % PTO2_ALIGN_SIZE != 0) return false; // Check flow control pointer sanity - int32_t current_task_index = h->current_task_index.load(std::memory_order_acquire); - int32_t last_task_alive = h->last_task_alive.load(std::memory_order_acquire); - uint64_t heap_top = h->heap_top.load(std::memory_order_acquire); - uint64_t heap_tail = h->heap_tail.load(std::memory_order_acquire); - if (current_task_index < 0) return false; - if (last_task_alive < 0) return false; - if (heap_top > h->heap_size) return false; - if (heap_tail > h->heap_size) return false; + int32_t current = current_task_index.load(std::memory_order_acquire); + int32_t last_alive = last_task_alive.load(std::memory_order_acquire); + uint64_t top = heap_top.load(std::memory_order_acquire); + uint64_t tail = heap_tail.load(std::memory_order_acquire); + if (current < 0) return false; + if (last_alive < 0) return false; + if (top > h->rings[ring_id].heap_size) return false; + if (tail > h->rings[ring_id].heap_size) return false; return true; } diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h index 6b8ce35e..e3ada51f 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h @@ -3,13 +3,17 @@ * * Defines the shared memory structure for Orchestrator-Scheduler communication. 
* - * Memory Layout: + * Memory Layout (per-ring sections repeat for each ring 0..PTO2_MAX_RING_DEPTH-1): * +---------------------------+ - * | SharedMemoryHeader | (flow control + sync) + * | SharedMemoryHeader | (per-ring flow control + sync) * +---------------------------+ - * | TaskDescriptor[] | (ring buffer) + * | Ring 0: TaskDescriptor[] | + * | Ring 0: TaskPayload[] | * +---------------------------+ - * | TaskPayload[] | (cold task data) + * | Ring 1: TaskDescriptor[] | + * | Ring 1: TaskPayload[] | + * +---------------------------+ + * | ... | * +---------------------------+ * * Design principles: @@ -33,32 +37,56 @@ extern "C" { // Shared Memory Header // ============================================================================= +struct PTO2SharedMemoryHandle; + /** - * Shared memory header structure - * - * Contains flow control pointers and layout information. + * Per-ring flow control state in shared memory. * Written/read by Orchestrator and Scheduler for synchronization. 
*/ -typedef struct { - // === FLOW CONTROL POINTERS === - +struct PTO2RingFlowControl { // Written by Orchestrator, Read by Scheduler std::atomic<uint64_t> heap_top; // Heap ring allocation pointer std::atomic<int32_t> current_task_index; // Task ring head (next to allocate) - std::atomic<int32_t> orchestrator_done; // Flag: orchestration complete - + int32_t _pad0; // Alignment padding + // Written by Scheduler, Read by Orchestrator (for back-pressure) - std::atomic<uint64_t> heap_tail; // Heap ring free pointer (on-device, matches pto2_heap_ring_init) + std::atomic<uint64_t> heap_tail; // Heap ring free pointer std::atomic<int32_t> last_task_alive; // Task ring tail (oldest active task) - std::atomic<uint64_t> heap_tail_gen; // Ticket counter for serialized heap_tail writes - // (ensures concurrent threads write in task order) + int32_t _pad1; // Alignment padding + + void init() { + heap_top.store(0, std::memory_order_relaxed); + current_task_index.store(0, std::memory_order_relaxed); + heap_tail.store(0, std::memory_order_relaxed); + last_task_alive.store(0, std::memory_order_relaxed); + } + + bool validate(PTO2SharedMemoryHandle* handle, int32_t ring_id) const; +}; - // === LAYOUT INFO (set once at init) === - uint64_t task_window_size; // PTO2_TASK_WINDOW_SIZE - uint64_t heap_size; // Total heap size +/** + * Per-ring shared memory header section. + * + * Groups flow-control and layout info for a single ring to avoid parallel arrays. + */ +struct PTO2SharedMemoryRingHeader { + PTO2RingFlowControl fc; + uint64_t task_window_size; + uint64_t heap_size; + uint64_t task_descriptors_offset; // Offset from SM base, in bytes +}; - // Offsets into shared memory (relative to SM_Base) - uint64_t task_descriptors_offset; // Offset to TaskDescriptor array +/** + * Shared memory header structure + * + * Contains per-ring flow control and global layout information. 
+ */ +struct alignas(PTO2_ALIGN_SIZE) PTO2SharedMemoryHeader { + // === PER-RING FLOW CONTROL + LAYOUT INFO (set once at init) === + PTO2SharedMemoryRingHeader rings[PTO2_MAX_RING_DEPTH]; + + // === GLOBAL FIELDS === + std::atomic<int32_t> orchestrator_done; // Flag: orchestration complete // Total shared memory size (for validation) uint64_t total_size; @@ -68,10 +96,21 @@ typedef struct { std::atomic<uint64_t> graph_output_ptr; // Address where final output was written (packed buffer) std::atomic<uint64_t> graph_output_size; // Size in bytes - // Padding to 128-byte cache line - uint64_t _padding[4]; + // === ERROR REPORTING === + + // Orchestrator fatal error code (Orchestrator → Scheduler, AICPU → Host) + // Non-zero signals fatal error. Written by orchestrator, read by scheduler and host. + std::atomic<int32_t> orch_error_code; -} PTO2SharedMemoryHeader; + // Scheduler error state (Scheduler → Host, independent of orchestrator) + // Written by scheduler threads on timeout; read by orchestrator and host. + std::atomic<uint32_t> sched_error_bitmap; // Bit X set = thread X had error + std::atomic<int32_t> sched_error_code; // Last scheduler error code (last-writer-wins) + std::atomic<int32_t> sched_error_thread; // Thread index of last error writer +}; + +static_assert(sizeof(PTO2SharedMemoryHeader) % PTO2_ALIGN_SIZE == 0, + "PTO2SharedMemoryHeader must be aligned to cache line (PTO2_ALIGN_SIZE)"); // ============================================================================= // Shared Memory Handle @@ -81,19 +120,19 @@ typedef struct { * Handle for shared memory access * Provides both Orchestrator and Scheduler views of the same memory */ -typedef struct { +struct PTO2SharedMemoryHandle { void* sm_base; // Base address of shared memory uint64_t sm_size; // Total size of shared memory - // Quick pointers into shared memory regions + // Quick pointers into shared memory regions (per-ring) PTO2SharedMemoryHeader* header; - PTO2TaskDescriptor* task_descriptors; - PTO2TaskPayload* task_payloads; - + PTO2TaskDescriptor* 
task_descriptors[PTO2_MAX_RING_DEPTH]; + PTO2TaskPayload* task_payloads[PTO2_MAX_RING_DEPTH]; + // Ownership flag bool is_owner; // True if this handle allocated the memory - -} PTO2SharedMemoryHandle; + +}; // ============================================================================= // Shared Memory API @@ -102,16 +141,24 @@ typedef struct { /** * Calculate required shared memory size * - * @param task_window_size Number of task slots + * @param task_window_size Number of task slots per ring * @return Total bytes required */ uint64_t pto2_sm_calculate_size(uint64_t task_window_size); +/** + * Calculate required shared memory size for per-ring task windows. + * + * @param task_window_sizes Array of window sizes per ring + * @return Total bytes required + */ +uint64_t pto2_sm_calculate_size_per_ring(const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]); + /** * Create shared memory for Orchestrator and Scheduler * - * @param task_window_size Number of task slots - * @param heap_size Heap size for output buffers + * @param task_window_size Number of task slots per ring + * @param heap_size Heap size per ring for output buffers * @return Handle with both views, or NULL on failure */ PTO2SharedMemoryHandle* pto2_sm_create(uint64_t task_window_size, @@ -128,8 +175,8 @@ PTO2SharedMemoryHandle* pto2_sm_create_default(void); * * @param sm_base Base address of pre-allocated buffer * @param sm_size Total size in bytes - * @param task_window_size Number of task slots (must match buffer layout) - * @param heap_size Heap size (for layout; buffer has no heap region) + * @param task_window_size Number of task slots per ring (must match buffer layout) + * @param heap_size Heap size per ring (for layout; buffer has no heap region) * @return Handle, or NULL on failure */ PTO2SharedMemoryHandle* pto2_sm_create_from_buffer(void* sm_base, @@ -151,23 +198,12 @@ void pto2_sm_init_header(PTO2SharedMemoryHandle* handle, uint64_t heap_size); /** - * Get task descriptor by task ID - * 
Uses runtime window_size for ring buffer indexing (not compile-time constant) + * Initialize shared memory header with per-ring layout information. */ -static inline PTO2TaskDescriptor* pto2_sm_get_task(PTO2SharedMemoryHandle* handle, - int32_t task_id) { - uint64_t window_mask = handle->header->task_window_size - 1; - return &handle->task_descriptors[task_id & window_mask]; -} - -/** - * Get task descriptor by task slot - * Uses runtime window_size for ring buffer indexing (not compile-time constant) - */ -static inline PTO2TaskDescriptor& pto2_sm_get_task_by_slot(PTO2SharedMemoryHandle* handle, - int32_t slot) { - return handle->task_descriptors[slot]; -} +void pto2_sm_init_header_per_ring( + PTO2SharedMemoryHandle* handle, + const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH], + const uint64_t heap_sizes[PTO2_MAX_RING_DEPTH]); // ============================================================================= // Debug Utilities diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_submit_types.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_submit_types.h new file mode 100644 index 00000000..177781a3 --- /dev/null +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_submit_types.h @@ -0,0 +1,97 @@ +/** + * PTO Submit Types - Shared submit-contract definitions + * + * Header-only definitions shared by orchestration-facing and runtime-facing + * headers. Keeps orchestration slim (no dependency on pto_runtime2_types.h). 
+ */ + +#ifndef PTO_SUBMIT_TYPES_H +#define PTO_SUBMIT_TYPES_H + +#include <cstdint> + +inline constexpr int32_t INVALID_KERNEL_ID = -1; + +/** + * Subtask slot count: AIC, AIV0, AIV1 + */ +inline constexpr int32_t PTO2_SUBTASK_SLOT_COUNT = 3; + +/** + * Subtask slot indices + */ +enum class PTO2SubtaskSlot : uint8_t { + AIC = 0, + AIV0 = 1, + AIV1 = 2, +}; + +/** + * Subtask mask bits (for active_mask / subtask_done_mask) + */ +inline constexpr uint8_t PTO2_SUBTASK_MASK_AIC = (1u << 0); // 0x1 +inline constexpr uint8_t PTO2_SUBTASK_MASK_AIV0 = (1u << 1); // 0x2 +inline constexpr uint8_t PTO2_SUBTASK_MASK_AIV1 = (1u << 2); // 0x4 + +/** + * Test whether a subtask slot is active in a given mask + */ +static inline bool pto2_subtask_active(uint8_t mask, PTO2SubtaskSlot slot) { + return (mask & (1u << static_cast<uint8_t>(slot))) != 0; +} + +/** + * Mixed-task submit contract. + * + * Each field holds either a valid kernel ID or INVALID_KERNEL_ID (inactive). + * At least one slot must be valid. + */ +struct MixedKernels { + int32_t aic_kernel_id{INVALID_KERNEL_ID}; + int32_t aiv0_kernel_id{INVALID_KERNEL_ID}; + int32_t aiv1_kernel_id{INVALID_KERNEL_ID}; +}; + +/** + * Resource shape — classifies a MixedKernels into one of 5 queue buckets. + */ +enum class PTO2ResourceShape : uint8_t { + AIC_ONLY = 0, // AIC only + AIV_X1 = 1, // One AIV slot + AIV_X2 = 2, // Both AIV slots + AIC_AIV_X1 = 3, // AIC + one AIV + AIC_AIV_X2 = 4, // AIC + both AIV +}; + +inline constexpr int32_t PTO2_NUM_RESOURCE_SHAPES = 5; + +/** + * Derive resource shape from active_mask. + * Caller must ensure active_mask is valid (at least one bit set). 
+ */ +static inline PTO2ResourceShape pto2_active_mask_to_shape(uint8_t active_mask) { + bool has_aic = (active_mask & PTO2_SUBTASK_MASK_AIC) != 0; + int aiv_count = ((active_mask & PTO2_SUBTASK_MASK_AIV0) != 0) + + ((active_mask & PTO2_SUBTASK_MASK_AIV1) != 0); + + if (has_aic) { + if (aiv_count == 0) return PTO2ResourceShape::AIC_ONLY; + if (aiv_count == 1) return PTO2ResourceShape::AIC_AIV_X1; + return PTO2ResourceShape::AIC_AIV_X2; + } + if (aiv_count == 1) return PTO2ResourceShape::AIV_X1; + return PTO2ResourceShape::AIV_X2; +} + +/** + * Compute active_mask from MixedKernels. + */ +static inline uint8_t pto2_mixed_kernels_to_active_mask(const MixedKernels& mk) { + uint8_t mask = 0; + if (mk.aic_kernel_id != INVALID_KERNEL_ID) mask |= PTO2_SUBTASK_MASK_AIC; + if (mk.aiv0_kernel_id != INVALID_KERNEL_ID) mask |= PTO2_SUBTASK_MASK_AIV0; + if (mk.aiv1_kernel_id != INVALID_KERNEL_ID) mask |= PTO2_SUBTASK_MASK_AIV1; + return mask; +} + +#endif // PTO_SUBMIT_TYPES_H diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.cpp index b0f78124..50cd57b1 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.cpp @@ -38,7 +38,7 @@ uint64_t g_insert_count = 0; // Initialization and Destruction // ============================================================================= -bool PTO2TensorMap::init(int32_t new_num_buckets, int32_t new_pool_size, int32_t new_task_window_size) { +bool PTO2TensorMap::init(int32_t new_num_buckets, int32_t new_pool_size, const int32_t new_task_window_sizes[PTO2_MAX_RING_DEPTH]) { // Validate power of 2 for fast modulo if ((new_num_buckets & (new_num_buckets - 1)) != 0) { return false; // num_buckets must be power of 2 @@ -57,13 +57,14 @@ bool PTO2TensorMap::init(int32_t new_num_buckets, int32_t new_pool_size, int32_t num_buckets = new_num_buckets; - // Allocate entry 
pool - entry_pool = (PTO2TensorMapEntry*)calloc(new_pool_size, sizeof(PTO2TensorMapEntry)); + // Allocate entry pool (64-byte aligned for cache-line-aligned entries) + entry_pool = (PTO2TensorMapEntry*)aligned_alloc(alignof(PTO2TensorMapEntry), new_pool_size * sizeof(PTO2TensorMapEntry)); if (!entry_pool) { free(buckets); buckets = NULL; return false; } + memset(entry_pool, 0, new_pool_size * sizeof(PTO2TensorMapEntry)); // Allocate free entry list free_entry_list = (PTO2TensorMapEntry**)calloc(new_pool_size, sizeof(PTO2TensorMapEntry*)); @@ -86,35 +87,42 @@ bool PTO2TensorMap::init(int32_t new_num_buckets, int32_t new_pool_size, int32_t entry_pool[i].prev_in_bucket = nullptr; entry_pool[i].next_in_task = nullptr; entry_pool[i].prev_in_task = nullptr; - entry_pool[i].producer_task_id = -1; + entry_pool[i].producer_task_id = PTO2TaskId{}; } - // Allocate per-task entry tracking - task_entry_head = (PTO2TensorMapEntry**)malloc(new_task_window_size * sizeof(PTO2TensorMapEntry*)); - if (!task_entry_head) { - free(entry_pool); - free(buckets); - free(free_entry_list); - entry_pool = NULL; - buckets = NULL; - free_entry_list = NULL; - return false; + // Allocate per-ring per-task entry tracking (each ring has its own window size) + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + task_entry_heads[r] = (PTO2TensorMapEntry**)malloc(new_task_window_sizes[r] * sizeof(PTO2TensorMapEntry*)); + if (!task_entry_heads[r]) { + // Cleanup previously allocated rings + for (int j = 0; j < r; j++) { + free(task_entry_heads[j]); + task_entry_heads[j] = NULL; + } + free(entry_pool); + free(buckets); + free(free_entry_list); + entry_pool = NULL; + buckets = NULL; + free_entry_list = NULL; + return false; + } + for (int32_t i = 0; i < new_task_window_sizes[r]; i++) { + task_entry_heads[r][i] = nullptr; + } + task_window_sizes[r] = new_task_window_sizes[r]; } - // Initialize all task entry heads to -1 (no entries) - for (int32_t i = 0; i < new_task_window_size; i++) { - task_entry_head[i] 
= nullptr; + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + last_task_alives[r] = 0; + last_cleanup[r] = 0; } - task_window_size = new_task_window_size; - - last_task_alive = 0; - return true; } -bool PTO2TensorMap::init_default(int32_t new_task_window_size) { - return init(PTO2_TENSORMAP_NUM_BUCKETS, PTO2_TENSORMAP_POOL_SIZE, new_task_window_size); +bool PTO2TensorMap::init_default(const int32_t new_task_window_sizes[PTO2_MAX_RING_DEPTH]) { + return init(PTO2_TENSORMAP_NUM_BUCKETS, PTO2_TENSORMAP_POOL_SIZE, new_task_window_sizes); } void PTO2TensorMap::destroy() { @@ -128,9 +136,11 @@ void PTO2TensorMap::destroy() { entry_pool = NULL; } - if (task_entry_head) { - free(task_entry_head); - task_entry_head = NULL; + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + if (task_entry_heads[r]) { + free(task_entry_heads[r]); + task_entry_heads[r] = NULL; + } } if (free_entry_list) { @@ -193,7 +203,9 @@ void PTO2TensorMap::print_stats() { LOG_INFO("Empty buckets: %d", empty_buckets); LOG_INFO("Max chain len: %d", max_chain); LOG_INFO("Avg chain len: %.2f", non_empty_buckets > 0 ? 
(float)total_chain / non_empty_buckets : 0); - LOG_INFO("Last task alive: %d", last_task_alive); + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + LOG_INFO("Last task alive[%d]: %d", r, last_task_alives[r]); + } LOG_INFO("============================"); } @@ -209,20 +221,13 @@ int32_t PTO2TensorMap::valid_count() { return count; } -void PTO2TensorMap::sync_tensormap() { - constexpr int MIN_FREE_NUM = 1024; - always_assert(orch != nullptr); - while(true) { - // Read current last_task_alive from shared memory - int32_t new_last_task_alive = - orch->sm_handle->header->last_task_alive.load(std::memory_order_acquire); - sync_validity(new_last_task_alive); - if ((pool_size - next_entry_idx + free_num < MIN_FREE_NUM) || new_last_task_alive - orch->tensormap_last_cleanup >= PTO2_TENSORMAP_CLEANUP_INTERVAL) { - cleanup_retired(orch->tensormap_last_cleanup, new_last_task_alive); - orch->tensormap_last_cleanup = new_last_task_alive; - } else { - break; - } +void PTO2TensorMap::sync_tensormap(uint8_t ring_id, int32_t sm_last_task_alive) { + sync_validity(ring_id, sm_last_task_alive); + // Only attempt cleanup when last_task_alive has actually advanced; + // otherwise cleanup_retired would empty-loop and we'd spin forever. + if (sm_last_task_alive - last_cleanup[ring_id] >= PTO2_TENSORMAP_CLEANUP_INTERVAL) { + cleanup_retired(ring_id, last_cleanup[ring_id], sm_last_task_alive); + last_cleanup[ring_id] = sm_last_task_alive; } } diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h index 2f3f3e5d..9d1bb56a 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h @@ -8,9 +8,8 @@ * Key design features: * 1. Ring buffer pool for entries (no malloc/free) * 2. Lazy invalidation (entries become stale when producer retires) - * 3. Chain truncation optimization (truncate on first stale entry) - * 4. 
Per-task entry tracking for efficient cleanup - * 5. OVERLAP DETECTION: Detects dependencies for overlapping sub-regions + * 3. Per-task per-ring entry tracking for efficient cleanup + * 4. OVERLAP DETECTION: Detects dependencies for overlapping sub-regions * * Hash table with chaining: * - buckets[] array of head offsets @@ -59,30 +58,99 @@ extern uint64_t g_insert_count; // ============================================================================= /** - * TensorMap entry structure - * Maps tensor region -> producer task ID + * TensorMap entry structure — cache-line optimized for lookup * - * Stored in ring buffer pool with lazy invalidation: - * - Entry is valid only if producer_task_id >= last_task_alive - * - Stale entries ignored during lookup - * - Pool wraps around, overwriting stale entries + * Cache line 1 (64B, lookup hot path): + * next_in_bucket, producer_task_id, buffer_addr — chain traversal + validity + hash match + * version, ndims, is_all_offset_zero, with_alloc, bucket_index — overlap fast path + * shapes[5] — overlap comparison * - * Chain truncation optimization: - * - Entries in bucket chains sorted by task_id (newest first) - * - When lookup hits stale entry, truncate rest of chain + * Cache line 2 (64B, insert/remove/slow-path only): + * prev_in_bucket, next_in_task, prev_in_task — chain manipulation + * offsets[5] — only read when !is_all_offset_zero + * + * When is_all_offset_zero is true, lookup touches only cache line 1. + * Entry size: 128B (2 cache lines) vs previous 192B (3 cache lines with embedded Tensor). 
*/ -struct PTO2TensorMapEntry { - bool with_alloc{true}; // True if entry is task output, False if entry is task inout - int32_t producer_task_id; // Task that produces this region - PTO2TensorMapEntry* next_in_bucket; // Offset to next entry in hash bucket (-1 = end) - PTO2TensorMapEntry* prev_in_bucket; // Offset to prev entry in hash bucket (-1 = head is buckets[bucket]) - PTO2TensorMapEntry* next_in_task; // Offset to next entry for same task (-1 = end) - PTO2TensorMapEntry* prev_in_task; // Offset to prev entry for same task (-1 = head is task_entry_head[slot]) - int32_t bucket_index; // != -1 if entry is linked in a bucket chain - // CRITICAL: Must be set -1 before overwriting! - Tensor tensor; // Tensor descriptor key +struct alignas(64) PTO2TensorMapEntry { + // === Cache line 1 (64B) — lookup hot path === + PTO2TensorMapEntry* next_in_bucket; // 8B: next entry in hash bucket chain + PTO2TaskId producer_task_id; // 8B: raw (ring_id << 32) | local_id + uint64_t buffer_addr; // 8B: tensor base address (hash key) + int32_t version; // 4B: tensor version for overlap detection + uint32_t ndims; // 4B: number of dimensions + int32_t bucket_index; // 4B: bucket index (-1 if unlinked) + bool is_all_offset_zero; // 1B: fast-path flag + bool with_alloc; // 1B: true=OUTPUT, false=INOUT + // padding: 2B + uint32_t shapes[RUNTIME_MAX_TENSOR_DIMS]; // 20B: shape per dimension + // padding: 4B to fill 64B + + // === Cache line 2 (64B) — insert/remove/slow-path === + PTO2TensorMapEntry* prev_in_bucket; // 8B: prev in hash bucket chain + PTO2TensorMapEntry* next_in_task; // 8B: next entry for same task + PTO2TensorMapEntry* prev_in_task; // 8B: prev entry for same task + uint32_t offsets[RUNTIME_MAX_TENSOR_DIMS]; // 20B: only when !is_all_offset_zero + // padding: 20B to fill 64B + + /** + * Copy overlap-relevant fields from a Tensor into this entry. 
+ */ + void copy_from_tensor(const Tensor& t) { + buffer_addr = t.buffer.addr; + version = t.version; + ndims = t.ndims; + is_all_offset_zero = t.is_all_offset_zero; + for (uint32_t i = 0; i < t.ndims; i++) { + shapes[i] = t.shapes[i]; + } + if (!t.is_all_offset_zero) { + for (uint32_t i = 0; i < t.ndims; i++) { + offsets[i] = t.offsets[i]; + } + } + } + + /** + * Check overlap between input tensor and this entry (the producer output). + * Mirrors Tensor::is_overlap() logic but operates on entry fields directly. + */ + OverlapStatus check_overlap(const Tensor& input) const { + debug_assert(input.buffer.addr == buffer_addr); + debug_assert(input.version >= version); + if (input.version > version) { + return OverlapStatus::OTHER; + } + // Fast path: both have zero offsets → ranges are [0, shape[i]) + if (input.is_all_offset_zero && is_all_offset_zero) { + bool contains = true; + for (uint32_t i = 0; i < ndims; i++) { + if (input.shapes[i] < shapes[i]) { + contains = false; + break; + } + } + return contains ? OverlapStatus::COVERED : OverlapStatus::OTHER; + } + // Slow path: at least one has non-zero offsets + bool contains = true; + for (uint32_t i = 0; i < ndims; i++) { + uint64_t in_off = input.is_all_offset_zero ? 0 : input.offsets[i]; + uint64_t ent_off = is_all_offset_zero ? 0 : offsets[i]; + Segment in_range{in_off, in_off + (uint64_t)input.shapes[i]}; + Segment ent_range{ent_off, ent_off + (uint64_t)shapes[i]}; + if (!in_range.line_segment_intersection(ent_range)) { + return OverlapStatus::NO_OVERLAP; + } else if (!in_range.contains(ent_range)) { + contains = false; + } + } + return contains ? 
OverlapStatus::COVERED : OverlapStatus::OTHER; + } }; +static_assert(sizeof(PTO2TensorMapEntry) == 128, "TensorMapEntry must be exactly 2 cache lines (128 bytes)"); + /** * Stack-allocated lookup result (avoids heap allocation per lookup) */ @@ -122,13 +190,16 @@ struct PTO2TensorMap { int32_t next_entry_idx; // id when next entry insert int32_t free_num; // free entry number in entry pool - // Per-task entry tracking (for efficient bucket cleanup) - PTO2TensorMapEntry** task_entry_head; // Per-task head offset (-1 = no entries) - // Indexed by task_id % task_window_size - int32_t task_window_size; // Runtime task window size (for slot masking) + // Per-ring per-task entry tracking (for efficient bucket cleanup) + // Indexed by [ring_id][local_id & (task_window_sizes[ring_id] - 1)] + PTO2TensorMapEntry** task_entry_heads[PTO2_MAX_RING_DEPTH]; + int32_t task_window_sizes[PTO2_MAX_RING_DEPTH]; // Per-ring task window size (for slot masking) - // Validity threshold (for lazy invalidation) - int32_t last_task_alive; // Cached value from shared memory + // Per-ring validity threshold (for lazy invalidation) + int32_t last_task_alives[PTO2_MAX_RING_DEPTH]; // Cached from shared memory per ring + + // Per-ring cleanup progress (for periodic cleanup_retired) + int32_t last_cleanup[PTO2_MAX_RING_DEPTH]{}; PTO2OrchestratorState* orch{nullptr}; @@ -162,9 +233,6 @@ struct PTO2TensorMap { entry.next_in_bucket->prev_in_bucket = entry.prev_in_bucket; } - // Clear tensor AFTER bucket chain manipulation (hash computation needs valid tensor) - entry.tensor = Tensor(); - free_entry_list[free_num++] = &entry; entry.bucket_index = -1; entry.next_in_bucket = nullptr; @@ -184,12 +252,12 @@ struct PTO2TensorMap { * @param pool_size Size of entry pool * @return true on success, false on allocation failure */ - bool init(int32_t num_buckets, int32_t pool_size, int32_t task_window_size); + bool init(int32_t num_buckets, int32_t pool_size, const int32_t 
task_window_sizes[PTO2_MAX_RING_DEPTH]); /** * Initialize TensorMap with default sizes */ - bool init_default(int32_t task_window_size); + bool init_default(const int32_t task_window_sizes[PTO2_MAX_RING_DEPTH]); /** * Destroy TensorMap and free resources @@ -202,24 +270,23 @@ struct PTO2TensorMap { * * @param last_task_alive Current value from shared memory */ - void sync_validity(int32_t last_task_alive) { this->last_task_alive = last_task_alive; } + void sync_validity(int32_t ring_id, int32_t last_task_alive) { + this->last_task_alives[ring_id] = last_task_alive; + } /** * Lookup producer for a tensor region * * Searches the hash table for a matching region. * Returns producer entry if found and valid. - * - * Chain truncation: When first stale entry is found, truncates - * the rest of the chain (all subsequent entries are also stale). + * Stale entries from different rings are skipped (not truncated). * * @param tensor Tensor to look up * @param result Output: stack-allocated result buffer */ void lookup(const Tensor& tensor, PTO2LookupResult& result) { uint32_t bucket_index = hash(tensor.buffer.addr); - PTO2TensorMapEntry** prev_ptr = &buckets[bucket_index]; // For truncation - PTO2TensorMapEntry* cur_entry = *prev_ptr; + PTO2TensorMapEntry* cur_entry = buckets[bucket_index]; result.count = 0; #if PTO2_TENSORMAP_PROFILING @@ -228,33 +295,36 @@ struct PTO2TensorMap { #endif while (cur_entry != nullptr) { + // Prefetch next entry to hide pointer-chasing latency. + // entry_valid() + is_overlap() computation provides hide time. 
+ PTO2TensorMapEntry* next_entry = cur_entry->next_in_bucket; + if (next_entry) __builtin_prefetch(next_entry, 0, 0); + #if PTO2_TENSORMAP_PROFILING chain_len++; #endif - // Check validity first + // Skip stale entries (no chain truncation — entries from different + // rings can be interleaved, so a stale entry from one ring does NOT + // imply subsequent entries from other rings are also stale) if (!entry_valid(*cur_entry)) { - // ========== STALE ENTRY: Truncate chain here ========== - // All subsequent entries are guaranteed to be stale too! - // Truncate: unlink this and all following entries - *prev_ptr = nullptr; // Terminate chain at previous entry - - // Mark truncated entries as not in bucket (for correct reuse) - while (cur_entry != nullptr) { - PTO2TensorMapEntry* next_entry = cur_entry->next_in_bucket; - remove_entry(*cur_entry); - cur_entry = next_entry; - } - break; + cur_entry = next_entry; + continue; } // Entry is valid - check if regions OVERLAP (not just exact match) // Since we hash only by base_ptr, all entries in this bucket have // potential to overlap. We must check actual byte-range overlap. - if (tensor.buffer.addr == cur_entry->tensor.buffer.addr) { + if (tensor.buffer.addr == cur_entry->buffer_addr) { + // Double prefetch: check_overlap provides enough hide time + // to also warm up the entry after next. 
+ if (next_entry) { + PTO2TensorMapEntry* next_next = next_entry->next_in_bucket; + if (next_next) __builtin_prefetch(next_next, 0, 0); + } #if PTO2_TENSORMAP_PROFILING g_lookup_overlap_checks++; #endif - auto overlap_status = tensor.is_overlap(cur_entry->tensor); + auto overlap_status = cur_entry->check_overlap(tensor); if (overlap_status != OverlapStatus::NO_OVERLAP) { result.push(cur_entry, overlap_status); #if PTO2_TENSORMAP_PROFILING @@ -264,8 +334,7 @@ struct PTO2TensorMap { } // Move to next entry - prev_ptr = &cur_entry->next_in_bucket; - cur_entry = *prev_ptr; + cur_entry = next_entry; } #if PTO2_TENSORMAP_PROFILING g_lookup_chain_total += chain_len; @@ -282,21 +351,30 @@ struct PTO2TensorMap { * @param tensor Tensor produced * @param producer_task_id Task ID of producer */ - void insert(const Tensor& tensor, int32_t producer_task_id, bool with_alloc) { + void insert(const Tensor& tensor, PTO2TaskId producer_task_id, bool with_alloc) { #if PTO2_TENSORMAP_PROFILING g_insert_count++; #endif + // Prefetch bucket head and task_entry_head early; new_entry() + field + // initialization below provides hide time for these RFOs. 
+ uint32_t bucket_index = hash(tensor.buffer.addr); + __builtin_prefetch(&buckets[bucket_index], 1, 0); + auto ring_id = producer_task_id.ring(); + auto local_id = producer_task_id.local(); + int32_t task_slot = local_id & (task_window_sizes[ring_id] - 1); + __builtin_prefetch(&task_entry_heads[ring_id][task_slot], 1, 0); + // Allocate entry from ring buffer pool PTO2TensorMapEntry* entry = new_entry(); // Initialize new entry - entry->tensor.copy(tensor); + entry->copy_from_tensor(tensor); entry->producer_task_id = producer_task_id; entry->with_alloc = with_alloc; // Insert at head of hash bucket (maintains task_id descending order) - entry->bucket_index = hash(tensor.buffer.addr); - entry->next_in_bucket = buckets[entry->bucket_index]; + entry->bucket_index = bucket_index; + entry->next_in_bucket = buckets[bucket_index]; // Update old head's prev pointer if (entry->next_in_bucket != nullptr) { entry->next_in_bucket->prev_in_bucket = entry; @@ -304,15 +382,14 @@ struct PTO2TensorMap { buckets[entry->bucket_index] = entry; entry->prev_in_bucket = nullptr; // New head has no predecessor - // Link to task's entry list (for cleanup) - int32_t task_slot = producer_task_id & (task_window_size - 1); - entry->next_in_task = task_entry_head[task_slot]; + // Link to task's entry list (for cleanup), indexed by ring and local slot + entry->next_in_task = task_entry_heads[ring_id][task_slot]; entry->prev_in_task = nullptr; // New head has no predecessor // Update old head's prev pointer if (entry->next_in_task != nullptr) { entry->next_in_task->prev_in_task = entry; } - task_entry_head[task_slot] = entry; + task_entry_heads[ring_id][task_slot] = entry; } /** @@ -324,23 +401,25 @@ struct PTO2TensorMap { * @param old_last_task_alive Previous threshold * @param new_last_task_alive New threshold */ - void cleanup_retired(int32_t old_last_task_alive, int32_t new_last_task_alive) { - // Iterate through retired tasks and remove their entries from bucket chains - for (int32_t task_id 
= old_last_task_alive; task_id < new_last_task_alive; task_id++) { - int32_t task_slot = task_id & (task_window_size - 1); - PTO2TensorMapEntry* cur_entry = task_entry_head[task_slot]; + void cleanup_retired(int32_t ring_id, int32_t old_last_task_alive, int32_t new_last_task_alive) { + // Iterate through retired tasks on this ring and remove their entries + for (int32_t local_id = old_last_task_alive; local_id < new_last_task_alive; local_id++) { + int32_t task_slot = local_id & (task_window_sizes[ring_id] - 1); + PTO2TensorMapEntry* cur_entry = task_entry_heads[ring_id][task_slot]; while (cur_entry != nullptr) { PTO2TensorMapEntry* next_entry = cur_entry->next_in_task; // Save before clearing // Only remove if this entry belongs to the retiring task // (slot may have been reused by a newer task) - debug_assert(cur_entry->producer_task_id == task_id); + debug_assert(cur_entry->producer_task_id == + pto2_make_task_id(static_cast(ring_id), + static_cast(local_id))); free_entry(*cur_entry); cur_entry = next_entry; } - // Clear task's entry head (slot will be reused by task_id + TASK_WINDOW_SIZE) - task_entry_head[task_slot] = nullptr; + // Clear task's entry head (slot will be reused by local_id + task_window_sizes[ring_id]) + task_entry_heads[ring_id][task_slot] = nullptr; } } @@ -364,7 +443,9 @@ struct PTO2TensorMap { * Check if entry is valid (producer has not retired) */ bool entry_valid(const PTO2TensorMapEntry& entry) const { - return entry.producer_task_id >= last_task_alive; + int32_t ring_id = pto2_task_id_ring(entry.producer_task_id); + int32_t local_id = static_cast(pto2_task_id_local(entry.producer_task_id)); + return local_id >= last_task_alives[ring_id]; } void remove_entry(PTO2TensorMapEntry& entry) { @@ -380,9 +461,11 @@ struct PTO2TensorMap { always_assert(entry.bucket_index != -1); // 必须保证仍在桶中 // Update predecessor's next pointer (O(1) via prev_in_task) if (entry.prev_in_task == nullptr) { - // Entry is the head of its task chain, update 
task_entry_head - int32_t task_slot = entry.producer_task_id & (task_window_size - 1); - task_entry_head[task_slot] = entry.next_in_task; + // Entry is the head of its task chain, update task_entry_heads + int32_t ring_id = pto2_task_id_ring(entry.producer_task_id); + int32_t local_id = static_cast(pto2_task_id_local(entry.producer_task_id)); + int32_t task_slot = local_id & (task_window_sizes[ring_id] - 1); + task_entry_heads[ring_id][task_slot] = entry.next_in_task; } else { entry.prev_in_task->next_in_task = entry.next_in_task; } @@ -420,7 +503,7 @@ struct PTO2TensorMap { * Called periodically to refresh the lazy invalidation threshold. * Also triggers cleanup if threshold has advanced significantly. */ - void sync_tensormap(); + void sync_tensormap(uint8_t ring_id, int32_t sm_last_task_alive); }; #if PTO2_TENSORMAP_PROFILING diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_types.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_types.h index 1245e4d2..238b6522 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_types.h +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_types.h @@ -2,8 +2,7 @@ * Orchestration Build Graph Types - Data structures for orchestration runtime extensions * * Standalone header defining orchestration-specific types for: - * - PTOParam: Parameter descriptor for pto_submit_task API - * - PTOWorkerType: Worker types for heterogeneous scheduling + * - PTOParam: Aggregated parameter container for pto_submit_task API * * Tensor descriptor types (Tensor, PTOBufferHandle, PTOOverlapStrategy) are * defined in tensor.h. 
@@ -16,10 +15,21 @@ #define ORCH_BUILD_GRAPH_PTO_TYPES_H #include -#include +#include + +#if defined(__aarch64__) +#include +#endif #include "tensor.h" +// Task parameters +#define PTO2_MAX_TENSOR_PARAMS 16 // Maximum tensor parameters per task +#define PTO2_MAX_SCALAR_PARAMS 128 // Maximum scalar parameters per task +#define PTO2_MAX_OUTPUTS 16 // Maximum outputs per task +#define PTO2_MAX_INPUTS 16 // Maximum inputs per task +#define PTO2_MAX_INOUTS 8 // Maximum in-out params per task + // ============================================================================= // Parameter Types (for pto_submit_task API) // ============================================================================= @@ -31,64 +41,156 @@ enum class PTOParamType : int32_t { INPUT = 0, // Read-only input buffer OUTPUT = 1, // Write-only output buffer (NULL addr: runtime allocates; non-NULL: use as-is) INOUT = 2, // Read-then-write: consumer of prior producer + modifier for downstream - SCALAR = 3 // Raw scalar value (no buffer, no dependency tracking) }; /** - * Parameter Descriptor for pto_submit_task + * Aggregated parameter container for pto_submit_task * - * Holds a pointer to the caller's Tensor (reference semantics). The runtime - * copies the Tensor into the task descriptor for scheduler access, and - * writes allocated OUTPUT addresses back through the pointer. + * Tensor pointers and types are stored in separate parallel arrays for + * efficient bulk copy: the runtime can memcpy the pointer array and type + * array independently, avoiding per-element branching. + * Tensors are dispatched first in kernel args, followed by scalars. 
* * Example: - * Tensor td_a = make_tensor_external(dev_a, size); - * Tensor td_c = make_tensor(size); - * PTOParam params[] = { - * make_input_param(td_a), - * make_output_param(td_c), - * }; - * pto2_rt_submit_task(rt, func_id, worker_type, params, 2); + * Tensor td_a = make_tensor_external(dev_a, shapes, 2); + * Tensor td_c = make_tensor(shapes, 2); + * PTOParam params; + * params.add_input(td_a); + * params.add_output(td_c); + * params.add_scalar(some_value); + * pto2_rt_submit_aic_task(rt, kernel_id, params); * // td_c.buffer.addr is already updated via pointer write-back */ struct PTOParam { - PTOParamType type; // PTOParamType::INPUT, PTOParamType::OUTPUT, or PTOParamType::SCALAR - Tensor* tensor{nullptr}; // Pointer to caller's Tensor (reference semantics) - uint64_t scalar_value{0}; // Raw value for PTOParamType::SCALAR (e.g., encoded float, int size) -}; + Tensor* tensors[PTO2_MAX_TENSOR_PARAMS]; + PTOParamType tensor_types[PTO2_MAX_TENSOR_PARAMS]; + uint64_t scalars[PTO2_MAX_SCALAR_PARAMS]; + int32_t tensor_count{0}; + int32_t scalar_count{0}; + bool has_error{false}; + const char* error_msg{nullptr}; -// ============================================================================= -// Factory Helpers -// ============================================================================= + void reset() { + tensor_count = 0; + scalar_count = 0; + has_error = false; + error_msg = nullptr; + } + + void set_error(const char* msg) { + if (!has_error) { + has_error = true; + error_msg = msg; + } + } + + bool check_add_tensor_valid() { + if (scalar_count != 0) { + set_error("add_input/add_output/add_inout called after add_scalar: " + "all tensors must be added before any scalars"); + return false; + } + if (tensor_count >= PTO2_MAX_TENSOR_PARAMS) { + set_error("Too many tensor params (exceeds PTO2_MAX_TENSOR_PARAMS=32)"); + return false; + } + return true; + } -static inline PTOParam make_scalar_param(uint64_t value) { - PTOParam p; - p.type = PTOParamType::SCALAR; - 
p.scalar_value = value; - return p; -} - -static inline PTOParam make_input_param(Tensor& tensor) { - assert(tensor.buffer.addr != 0 && "INPUT param must have a non-NULL buffer address"); - PTOParam p; - p.type = PTOParamType::INPUT; - p.tensor = &tensor; - return p; -} - -static inline PTOParam make_output_param(Tensor& tensor) { - PTOParam p; - p.type = PTOParamType::OUTPUT; - p.tensor = &tensor; - return p; -} - -static inline PTOParam make_inout_param(Tensor& tensor) { - assert(tensor.buffer.addr != 0 && "INOUT param must have a non-NULL buffer address"); - PTOParam p; - p.type = PTOParamType::INOUT; - p.tensor = &tensor; - return p; -} + void add_input(Tensor& t) { + if (!check_add_tensor_valid()) { return; } + if (t.buffer.addr == 0) { + set_error("INPUT tensor must have a non-NULL buffer address"); + return; + } + tensors[tensor_count] = &t; + tensor_types[tensor_count] = PTOParamType::INPUT; + tensor_count++; + } + + void add_output(Tensor& t) { + if (!check_add_tensor_valid()) { return; } + tensors[tensor_count] = &t; + tensor_types[tensor_count] = PTOParamType::OUTPUT; + tensor_count++; + } + + void add_inout(Tensor& t) { + if (!check_add_tensor_valid()) { return; } + if (t.buffer.addr == 0) { + set_error("INOUT tensor must have a non-NULL buffer address"); + return; + } + tensors[tensor_count] = &t; + tensor_types[tensor_count] = PTOParamType::INOUT; + tensor_count++; + } + + void add_scalar(uint64_t v) { + if (scalar_count >= PTO2_MAX_SCALAR_PARAMS) { + set_error("Too many scalar params (exceeds PTO2_MAX_SCALAR_PARAMS=128)"); + return; + } + scalars[scalar_count++] = v; + } + + void add_scalars(const uint64_t* values, int count) { + if (scalar_count + count > PTO2_MAX_SCALAR_PARAMS) { + set_error("Too many scalar params (exceeds PTO2_MAX_SCALAR_PARAMS=128)"); + return; + } + memcpy(&scalars[scalar_count], values, count * sizeof(uint64_t)); + scalar_count += count; + } + + /** + * Zero-extend int32 bit patterns into uint64 scalar slots. 
+ * Negative values are treated as their unsigned 32-bit representation + * (e.g., -1 → 0x00000000FFFFFFFF, not 0xFFFFFFFFFFFFFFFF). + * Uses NEON to process 4 elements per iteration on aarch64. + */ + void add_scalars_i32(const int32_t* values, int count) { + if (scalar_count + count > PTO2_MAX_SCALAR_PARAMS) { + set_error("Too many scalar params (exceeds PTO2_MAX_SCALAR_PARAMS=128)"); + return; + } + uint64_t* dst = &scalars[scalar_count]; +#if defined(__aarch64__) + int i = 0; + for (; i + 4 <= count; i += 4) { + uint32x4_t v = vld1q_u32(reinterpret_cast(values + i)); + uint64x2_t lo = vmovl_u32(vget_low_u32(v)); + uint64x2_t hi = vmovl_u32(vget_high_u32(v)); + vst1q_u64(dst + i, lo); + vst1q_u64(dst + i + 2, hi); + } + for (; i < count; i++) { + dst[i] = static_cast(static_cast(values[i])); + } +#else + for (int i = 0; i < count; i++) { + dst[i] = static_cast(static_cast(values[i])); + } +#endif + scalar_count += count; + } + + /** + * Copy scalars from another PTOParam's scalar array. + * Useful when multiple tasks share the same scalar data (e.g., block indices). 
+ */ + void copy_scalars_from(const PTOParam& src, int src_offset, int count) { + if (src_offset + count > src.scalar_count) { + set_error("Source scalar range out of bounds in copy_scalars_from"); + return; + } + if (scalar_count + count > PTO2_MAX_SCALAR_PARAMS) { + set_error("Too many scalar params (exceeds PTO2_MAX_SCALAR_PARAMS=128)"); + return; + } + memcpy(&scalars[scalar_count], &src.scalars[src_offset], count * sizeof(uint64_t)); + scalar_count += count; + } +}; #endif // ORCH_BUILD_GRAPH_PTO_TYPES_H diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.cpp index e9c1a52d..f4ccf0d5 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.cpp @@ -26,6 +26,7 @@ Runtime::Runtime() { ready_queue_shards = RUNTIME_DEFAULT_READY_QUEUE_SHARDS; pto2_task_window_size = 0; pto2_heap_size = 0; + pto2_dep_pool_size = 0; // Initialize tensor pairs tensor_pair_count = 0; @@ -34,6 +35,7 @@ Runtime::Runtime() { orch_built_on_host_ = true; pto2_gm_sm_ptr_ = nullptr; pto2_gm_heap_ptr_ = nullptr; + pto2_slot_states_ptr_ = nullptr; orch_args_ = nullptr; orch_arg_count_ = 0; @@ -93,6 +95,7 @@ int Runtime::get_orch_arg_count() const { return orch_arg_count_; } void Runtime::set_orch_built_on_host(bool v) { orch_built_on_host_ = v; } void Runtime::set_pto2_gm_sm_ptr(void* p) { pto2_gm_sm_ptr_ = p; } void Runtime::set_pto2_gm_heap(void* p) { pto2_gm_heap_ptr_ = p; } +void Runtime::set_pto2_slot_states_ptr(void* p) { pto2_slot_states_ptr_ = p; } void Runtime::set_orch_args(uint64_t* args, int count) { orch_arg_count_ = count <= RUNTIME_MAX_ARGS ? 
count : RUNTIME_MAX_ARGS; if (args && orch_arg_count_ > 0) { @@ -165,11 +168,18 @@ void Runtime::complete_perf_records(PerfBuffer* perf_buf) { return; } - // Get PTO2 data structures + // Get slot states for fanout traversal + // With multi-ring, slot_states are per-ring inside the scheduler and + // pto2_slot_states_ptr_ is nullptr. Fanout and ring_id are filled on the + // AICPU side (aicpu_executor.cpp) where slot_state is directly available. + PTO2TaskSlotState* slot_states = static_cast(pto2_slot_states_ptr_); + if (slot_states == nullptr) { + return; + } + + // Get window mask from shared memory header (ring 0 for legacy single-ring path) PTO2SharedMemoryHeader* header = static_cast(sm_base); - PTO2TaskDescriptor* task_descriptors = reinterpret_cast( - static_cast(sm_base) + header->task_descriptors_offset); - int32_t window_mask = header->task_window_size - 1; + int32_t window_mask = static_cast(header->rings[0].task_window_size) - 1; uint32_t count = perf_buf->count; @@ -177,16 +187,25 @@ void Runtime::complete_perf_records(PerfBuffer* perf_buf) { PerfRecord* record = &perf_buf->records[i]; int32_t task_id = record->task_id; - // Get TaskDescriptor from PTO2 shared memory + // Get slot state for fanout traversal int32_t slot = task_id & window_mask; - PTO2TaskDescriptor* task = &task_descriptors[slot]; + PTO2TaskSlotState& ss = slot_states[slot]; // Fill fanout information by traversing the linked list record->fanout_count = 0; - PTO2DepListEntry* cur = task->fanout_head; + PTO2DepListEntry* cur = ss.fanout_head; while (cur != nullptr && record->fanout_count < RUNTIME_MAX_FANOUT) { - record->fanout[record->fanout_count++] = cur->task_id; + // PerfRecord.fanout stores 32-bit legacy task IDs. Our multi-ring task ID + // encodes ring_id in the upper 32 bits, so only the legacy single-ring + // case (ring_id==0) is representable here. 
+ uint64_t mixed = pto2_task_id_raw(cur->slot_state->task->mixed_task_id); + if ((mixed >> 32) != 0) { + // Skip: cannot represent (ring_id, local_id) in a 32-bit fanout slot. + cur = cur->next; + continue; + } + record->fanout[record->fanout_count++] = static_cast(mixed & 0xFFFFFFFFu); cur = cur->next; } } diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.h index b0e1e1de..62508b8b 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.h +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.h @@ -150,6 +150,7 @@ class Runtime { // Ring buffer size overrides (0 = use compile-time defaults) uint64_t pto2_task_window_size; uint64_t pto2_heap_size; + uint64_t pto2_dep_pool_size; // PTO2 integration: kernel_id -> GM function_bin_addr mapping // NOTE: Made public for direct access from aicore code @@ -172,6 +173,7 @@ class Runtime { bool orch_built_on_host_; void* pto2_gm_sm_ptr_; // GM pointer to PTO2 shared memory (device) void* pto2_gm_heap_ptr_; // GM heap for orchestrator output buffers (device) + void* pto2_slot_states_ptr_; // Pointer to PTO2TaskSlotState array (scheduler-private, for profiling) uint64_t* orch_args_; // Arguments for device orchestration int orch_arg_count_; uint64_t orch_args_storage_[RUNTIME_MAX_ARGS]; // Copy of args for device @@ -237,6 +239,7 @@ class Runtime { void set_orch_built_on_host(bool v); void set_pto2_gm_sm_ptr(void* p); void set_pto2_gm_heap(void* p); + void set_pto2_slot_states_ptr(void* p); void set_orch_args(uint64_t* args, int count); // Device orchestration SO binary (for dlopen on AICPU thread 3) diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/tensor.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/tensor.h index 51c46234..10b5b582 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/tensor.h +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/tensor.h @@ -36,7 +36,7 @@ struct Segment { }; /** - * 
Tensor descriptor for Task input/output + * Tensor descriptor for Task input/output (128B = 2 cache lines) * * Describes a memory access pattern on Global Memory (GM) using * raw_shapes (underlying buffer dimensions), shapes (current view dimensions), @@ -46,22 +46,32 @@ struct Segment { * - `raw_shapes[]`, `shapes[]`, `offsets[]` are in ELEMENTS * - `dtype` specifies element type for interpreting buffer contents * - * Example: buffer.addr=base, dtype=FLOAT32, raw_shapes=[10, 6], shapes=[3, 6], offsets=[1, 0] - * Memory access pattern: - * - Start at buffer.addr + (1*6+0)*4 = buffer.addr + 24 bytes - * - Inner dim: access 6 consecutive elements - * - Outer dim: 3 rows with stride 6 elements (derived from raw_shapes[1]) + * Fast-path flags (both on cache line 1): + * - is_all_offset_zero: when true, offsets[] are implicitly zero — skip offset read/write + * - is_raw_eq_shapes: when true, raw_shapes[] == shapes[] — skip raw_shapes read/write, + * use shapes[] wherever raw_shapes would be needed + * + * When BOTH flags are true, cache line 2 is never accessed. + * + * Layout: cache line 1 holds hot-path fields (buffer, start_offset, version, + * dtype, ndims, flags, shapes); cache line 2 holds warm-path fields (raw_shapes, offsets). 
*/ -struct Tensor { - // === Data fields (same layout as former TensorData) === +struct alignas(64) Tensor { + // === Cache line 1 (64B) — hot path === PTOBufferHandle buffer; // Underlying memory buffer (addr in bytes, size in bytes) + uint64_t start_offset; // Cached 1D element offset (precomputed from raw_shapes + offsets), only calc before incore, useless in orch int32_t version; // Tensor version for overlap detection - uint64_t start_offset; // Cached 1D element offset (precomputed from raw_shapes + offsets) - uint64_t ndims; // Number of dimensions used DataType dtype; // Data type of tensor elements - uint64_t raw_shapes[RUNTIME_MAX_TENSOR_DIMS]; // Underlying buffer shape per dimension - uint64_t shapes[RUNTIME_MAX_TENSOR_DIMS]; // Current view shape per dimension - uint64_t offsets[RUNTIME_MAX_TENSOR_DIMS]; // Multi-dimensional offset per dimension + uint32_t ndims; // Number of dimensions used + bool is_all_offset_zero; // True when all offsets[] are zero (skip offset read/write) + bool is_raw_eq_shapes; // True when raw_shapes[] == shapes[] (skip raw_shapes read/write) + bool manual_dep; // True when dependency is managed manually (skip tensormap lookup/insert) + uint32_t shapes[RUNTIME_MAX_TENSOR_DIMS]; // Current view shape per dimension + uint32_t __padding__; + + // === Cache line 2 (64B) — warm path === + uint32_t raw_shapes[RUNTIME_MAX_TENSOR_DIMS]; // Underlying buffer shape per dimension + uint32_t offsets[RUNTIME_MAX_TENSOR_DIMS]; // Multi-dimensional offset per dimension Tensor() = default; Tensor(const Tensor&) = default; @@ -70,68 +80,121 @@ struct Tensor { Tensor& operator=(Tensor&&) = default; ~Tensor() = default; + /// Return the effective raw_shapes pointer (shapes[] when is_raw_eq_shapes). + /// Avoids cache line 2 access for the common case. + const uint32_t* get_raw_shapes() const { + return is_raw_eq_shapes ? 
shapes : raw_shapes; + } + Tensor(void* addr, uint64_t buffer_size_bytes, - const uint64_t raw_shapes[], - const uint64_t shapes[], - const uint64_t offsets[], - uint64_t ndims, + const uint32_t raw_shapes[], + const uint32_t shapes[], + const uint32_t offsets[], + uint32_t ndims, DataType dtype, - int32_t version) { - init(addr, buffer_size_bytes, raw_shapes, shapes, offsets, ndims, dtype, version); + int32_t version, + bool is_all_offset_zero = false, + bool is_raw_eq_shapes = false, + bool manual_dep = false) { + init(addr, buffer_size_bytes, raw_shapes, shapes, offsets, ndims, dtype, version, + is_all_offset_zero, is_raw_eq_shapes, manual_dep); } // --- Initialization --- void init(void* addr, uint64_t buffer_size_bytes, - const uint64_t in_raw_shapes[], - const uint64_t in_shapes[], - const uint64_t in_offsets[], - uint64_t in_ndims, + const uint32_t in_raw_shapes[], + const uint32_t in_shapes[], + const uint32_t in_offsets[], + uint32_t in_ndims, DataType in_dtype, - int32_t in_version) { + int32_t in_version, + bool in_is_all_offset_zero = false, + bool in_is_raw_eq_shapes = false, + bool in_manual_dep = false) { buffer = {reinterpret_cast(addr), buffer_size_bytes}; ndims = in_ndims; dtype = in_dtype; version = in_version; - for (uint64_t i = 0; i < in_ndims; i++) { - raw_shapes[i] = in_raw_shapes[i]; + is_all_offset_zero = in_is_all_offset_zero; + is_raw_eq_shapes = in_is_raw_eq_shapes; + manual_dep = in_manual_dep; + for (uint32_t i = 0; i < in_ndims; i++) { shapes[i] = in_shapes[i]; - offsets[i] = in_offsets[i]; + } + if (!in_is_raw_eq_shapes) { + for (uint32_t i = 0; i < in_ndims; i++) { + raw_shapes[i] = in_raw_shapes[i]; + } + } + if (!in_is_all_offset_zero) { + for (uint32_t i = 0; i < in_ndims; i++) { + offsets[i] = in_offsets[i]; + } } } void init(const Tensor& other) { - buffer = other.buffer; - version = other.version; - ndims = other.ndims; - dtype = other.dtype; - for (uint64_t i = 0; i < ndims; i++) { - raw_shapes[i] = other.raw_shapes[i]; - 
shapes[i] = other.shapes[i]; - offsets[i] = other.offsets[i]; + memcpy(this, &other, 64); // fast copy cache line 1 + if (!other.is_raw_eq_shapes) { + for (uint32_t i = 0; i < ndims; i++) { + raw_shapes[i] = other.raw_shapes[i]; + } + } + if (!other.is_all_offset_zero) { + for (uint32_t i = 0; i < ndims; i++) { + offsets[i] = other.offsets[i]; + } } } - void init_with_view(const Tensor& other, const uint64_t view_shapes[], const uint64_t view_offsets[]) { + void init_with_view(const Tensor& other, const uint32_t view_shapes[], const uint32_t view_offsets[], bool in_manual_dep = false) { buffer = other.buffer; ndims = other.ndims; dtype = other.dtype; version = other.version; - for (uint64_t i = 0; i < ndims; i++) { - raw_shapes[i] = other.raw_shapes[i]; + manual_dep = in_manual_dep; + // view always diverges shapes from raw_shapes, so is_raw_eq_shapes = false. + // Read parent's effective raw_shapes (avoids parent cache line 2 when parent is_raw_eq_shapes). + is_raw_eq_shapes = false; + const uint32_t* parent_raw = other.get_raw_shapes(); + for (uint32_t i = 0; i < ndims; i++) { + raw_shapes[i] = parent_raw[i]; shapes[i] = view_shapes[i]; - offsets[i] = other.offsets[i] + view_offsets[i]; } + // Compute offsets and zero-flag + bool all_zero = true; + if (other.is_all_offset_zero) { + for (uint32_t i = 0; i < ndims; i++) { + if (view_offsets[i] != 0) { all_zero = false; break; } + } + if (!all_zero) { + for (uint32_t i = 0; i < ndims; i++) { + offsets[i] = view_offsets[i]; + } + } + } else { + all_zero = false; + for (uint32_t i = 0; i < ndims; i++) { + offsets[i] = other.offsets[i] + view_offsets[i]; + } + } + is_all_offset_zero = all_zero; } // --- Operations --- void update_start_offset() { + if (is_all_offset_zero) { + start_offset = 0; + return; + } + const uint32_t* rs = get_raw_shapes(); uint64_t result = 0; uint64_t stride = 1; for (int i = static_cast(ndims) - 1; i >= 0; i--) { result += offsets[i] * stride; - stride *= raw_shapes[i]; + stride *= rs[i]; } 
start_offset = result; } @@ -140,17 +203,17 @@ struct Tensor { init(other); } - Tensor view(const uint64_t view_shapes[], const uint64_t view_offsets[]) const { + Tensor view(const uint32_t view_shapes[], const uint32_t view_offsets[], bool manual_dep = false) const { Tensor result; - result.init_with_view(*this, view_shapes, view_offsets); + result.init_with_view(*this, view_shapes, view_offsets, manual_dep); return result; } bool is_contiguous() const { - if (ndims == 0) { + if (is_raw_eq_shapes || ndims == 0) { return true; } - for (uint64_t i = 1; i < ndims; i++) { + for (uint32_t i = 1; i < ndims; i++) { if (shapes[i] != raw_shapes[i]) { return false; } @@ -158,38 +221,45 @@ struct Tensor { return true; } - bool valid_reshape(const uint64_t new_shapes[], uint64_t new_ndims) const { + bool valid_reshape(const uint32_t new_shapes[], uint32_t new_ndims) const { uint64_t x = numel(); uint64_t y = 1; - for (size_t i = 0; i < new_ndims; i++) { + for (uint32_t i = 0; i < new_ndims; i++) { y *= new_shapes[i]; } return x == y; } - Tensor reshape(const uint64_t new_shapes[], uint64_t new_ndims) const { + Tensor reshape(const uint32_t new_shapes[], uint32_t new_ndims, bool manual_dep = false) const { debug_assert(valid_reshape(new_shapes, new_ndims)); always_assert(is_contiguous()); Tensor result; result.copy(*this); result.ndims = new_ndims; - for (uint64_t i = 0; i < new_ndims; i++) { - result.raw_shapes[i] = new_shapes[i]; + result.is_all_offset_zero = true; + result.is_raw_eq_shapes = true; + result.manual_dep = manual_dep; + for (uint32_t i = 0; i < new_ndims; i++) { result.shapes[i] = new_shapes[i]; - result.offsets[i] = 0; } return result; } - bool valid_transpose(uint64_t x, uint64_t y) const { return x < ndims && y < ndims; } + bool valid_transpose(uint32_t x, uint32_t y) const { return x < ndims && y < ndims; } - Tensor transpose(uint64_t x, uint64_t y) const { + Tensor transpose(uint32_t x, uint32_t y, bool manual_dep = false) const { 
debug_assert(valid_transpose(x, y)); Tensor result; result.copy(*this); - std::swap(result.raw_shapes[x], result.raw_shapes[y]); + result.manual_dep = manual_dep; + // transpose swaps the same dims in both arrays, so equality is preserved std::swap(result.shapes[x], result.shapes[y]); - std::swap(result.offsets[x], result.offsets[y]); + if (!result.is_raw_eq_shapes) { + std::swap(result.raw_shapes[x], result.raw_shapes[y]); + } + if (!result.is_all_offset_zero) { + std::swap(result.offsets[x], result.offsets[y]); + } return result; } @@ -198,7 +268,7 @@ struct Tensor { return 0; } uint64_t total = 1; - for (uint64_t i = 0; i < ndims; i++) { + for (uint32_t i = 0; i < ndims; i++) { total *= shapes[i]; } return total; @@ -206,29 +276,6 @@ struct Tensor { bool is_same_memref(const Tensor& other) const { return buffer.addr == other.buffer.addr; } - OverlapStatus is_overlap(const Tensor& pre_task_output) const { - debug_assert(is_same_memref(pre_task_output)); - debug_assert(version >= pre_task_output.version); - if (version > pre_task_output.version) { - return OverlapStatus::OTHER; - } - bool contains = true; - for (uint64_t i = 0; i < ndims; i++) { - Segment input_range_dim_i{offsets[i], offsets[i] + shapes[i]}; - Segment output_range_dim_i{ - pre_task_output.offsets[i], pre_task_output.offsets[i] + pre_task_output.shapes[i]}; - if (!input_range_dim_i.line_segment_intersection(output_range_dim_i)) { - return OverlapStatus::NO_OVERLAP; - } else if (!input_range_dim_i.contains(output_range_dim_i)) { - contains = false; - } - } - if (contains) { - return OverlapStatus::COVERED; - } - return OverlapStatus::OTHER; - } - std::string dump() const { std::stringstream ss; std::string indent = " "; @@ -239,16 +286,17 @@ struct Tensor { ss << indent << "ndims: " << ndims << std::endl; ss << indent << "version: " << version << std::endl; + const uint32_t* rs = get_raw_shapes(); ss << indent << "raw_shapes: ["; - for (uint64_t i = 0; i < ndims; i++) { + for (uint32_t i = 0; i < 
ndims; i++) { if (i > 0) { ss << ", "; } - ss << raw_shapes[i]; + ss << rs[i]; } ss << "]" << std::endl; ss << indent << "shapes: ["; - for (uint64_t i = 0; i < ndims; i++) { + for (uint32_t i = 0; i < ndims; i++) { if (i > 0) { ss << ", "; } @@ -256,11 +304,11 @@ struct Tensor { } ss << "]" << std::endl; ss << indent << "offsets: ["; - for (uint64_t i = 0; i < ndims; i++) { + for (uint32_t i = 0; i < ndims; i++) { if (i > 0) { ss << ", "; } - ss << offsets[i]; + ss << (is_all_offset_zero ? 0u : offsets[i]); } ss << "]" << std::endl; ss << "}" << std::endl; @@ -268,6 +316,9 @@ struct Tensor { } }; +static_assert(sizeof(Tensor) == 128, "Tensor must be exactly 2 cache lines (128 bytes)"); +static_assert(offsetof(Tensor, raw_shapes) == 64); + using TensorData = Tensor; // ============================================================================= @@ -277,16 +328,18 @@ using TensorData = Tensor; * Create a Tensor for pre-allocated external memory. */ static inline Tensor make_tensor_external(void* addr, - const uint64_t shapes[], - uint64_t ndims, + const uint32_t shapes[], + uint32_t ndims, DataType dtype = DataType::FLOAT32, + bool manual_dep = false, int32_t version = 0) { - static uint64_t zero_offsets[RUNTIME_MAX_TENSOR_DIMS] = {}; + static uint32_t zero_offsets[RUNTIME_MAX_TENSOR_DIMS] = {}; uint64_t total = 1; - for (uint64_t i = 0; i < ndims; i++) { + for (uint32_t i = 0; i < ndims; i++) { total *= shapes[i]; } - return Tensor(addr, total * get_element_size(dtype), shapes, shapes, zero_offsets, ndims, dtype, version); + return Tensor(addr, total * get_element_size(dtype), shapes, shapes, zero_offsets, ndims, dtype, version, + /*is_all_offset_zero=*/true, /*is_raw_eq_shapes=*/true, manual_dep); } /** @@ -295,14 +348,16 @@ static inline Tensor make_tensor_external(void* addr, * The runtime allocates from the heap ring and fills buffer.addr during pto2_submit_task * when this tensor is passed as OUTPUT param. No buffer content is ever copied. 
*/ -static inline Tensor make_tensor(const uint64_t shapes[], - uint64_t ndims, +static inline Tensor make_tensor(const uint32_t shapes[], + uint32_t ndims, DataType dtype = DataType::FLOAT32, + bool manual_dep = false, int32_t version = 0) { - static uint64_t zero_offsets[RUNTIME_MAX_TENSOR_DIMS] = {}; + static uint32_t zero_offsets[RUNTIME_MAX_TENSOR_DIMS] = {}; uint64_t total = 1; - for (uint64_t i = 0; i < ndims; i++) { + for (uint32_t i = 0; i < ndims; i++) { total *= shapes[i]; } - return Tensor(0, total * get_element_size(dtype), shapes, shapes, zero_offsets, ndims, dtype, version); + return Tensor(0, total * get_element_size(dtype), shapes, shapes, zero_offsets, ndims, dtype, version, + /*is_all_offset_zero=*/true, /*is_raw_eq_shapes=*/true, manual_dep); } diff --git a/tests/device_tests/a5/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_pv_matmul.cpp b/tests/device_tests/a5/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_pv_matmul.cpp index dc9499cf..fc620749 100644 --- a/tests/device_tests/a5/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_pv_matmul.cpp +++ b/tests/device_tests/a5/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_pv_matmul.cpp @@ -59,15 +59,17 @@ static __aicore__ void pv_matmul_impl(__gm__ Tensor* pij, __gm__ Tensor* vj, __g TASSIGN(bTile, 0x0); TASSIGN(cTile, 0x0); - // Load pij and vj to L1 + // Load pij and vj to L1 with separate events for pipeline overlap TLOAD(aMatTile, pijGlobal); + set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); // A load done TLOAD(bMatTile, vjGlobal); + set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID1); // B load done - set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); + // Move A to L0A as soon as A load completes (B may still be loading) wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); - - // Move to L0A/L0B TMOV(aTile, aMatTile); + // Move B to L0B after B load completes + wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID1); TMOV(bTile, bMatTile); set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); @@ -80,6 +82,9 @@ 
static __aicore__ void pv_matmul_impl(__gm__ Tensor* pij, __gm__ Tensor* vj, __g wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0); TSTORE(oiGlobal, cTile); + + set_flag(PIPE_FIX, PIPE_S, EVENT_ID7); + wait_flag(PIPE_FIX, PIPE_S, EVENT_ID7); } extern "C" __aicore__ void kernel_entry(__gm__ int64_t* args) { diff --git a/tests/device_tests/a5/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_qk_matmul.cpp b/tests/device_tests/a5/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_qk_matmul.cpp index b9f17ecb..2b5126a3 100644 --- a/tests/device_tests/a5/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_qk_matmul.cpp +++ b/tests/device_tests/a5/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_qk_matmul.cpp @@ -60,15 +60,17 @@ static __aicore__ void qk_matmul_impl(__gm__ Tensor* qi, __gm__ Tensor* kj, __gm TASSIGN(bTile, 0x0); TASSIGN(cTile, 0x0); - // // Load A and B to L1 + // Load A and B to L1 with separate events for pipeline overlap TLOAD(aMatTile, qiGlobal); + set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); // A load done TLOAD(bMatTile, kjGlobal); + set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID1); // B load done - set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); + // Move A to L0A as soon as A load completes (B may still be loading) wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); - - // Move from L1 to L0A/L0B TMOV(aTile, aMatTile); + // Move B to L0B after B load completes + wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID1); TMOV(bTile, bMatTile); set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); @@ -81,6 +83,9 @@ static __aicore__ void qk_matmul_impl(__gm__ Tensor* qi, __gm__ Tensor* kj, __gm wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0); TSTORE(sijGlobal, cTile); + + set_flag(PIPE_FIX, PIPE_S, EVENT_ID7); + wait_flag(PIPE_FIX, PIPE_S, EVENT_ID7); } extern "C" __aicore__ void kernel_entry(__gm__ int64_t* args) { diff --git a/tests/device_tests/a5/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_online_update.cpp 
b/tests/device_tests/a5/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_online_update.cpp index 3c4d227f..5fe7e365 100644 --- a/tests/device_tests/a5/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_online_update.cpp +++ b/tests/device_tests/a5/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_online_update.cpp @@ -4,11 +4,11 @@ // Case1: oi/oi_new are (16, 128), mij/lij/mi/li are 16-element vectors // Case2: oi/oi_new are (64, 128), mij/lij/mi/li are 64-element vectors // -// Scalar layout strategy: -// M scalar floats stored contiguously in GM can be loaded as either: -// - ND (kScalarRows, kScalarCols) RowMajor for element-wise ops (TMAX, TSUB, TEXP, TMUL, TADD) -// - DN (kAlignedRows, 1) ColMajor for row-broadcast ops (TROWEXPANDMUL, TROWEXPANDDIV) -// Conversion between layouts uses GM round-trip: ND TSTORE → DN TLOAD. +// Scalar layout strategy using TRESHAPE (zero-copy UB reshape): +// Scalars loaded as DN ColMajor (M, 1) for TROWEXPANDMUL/TROWEXPANDDIV. +// For element-wise ops (TMAX, TSUB, TEXP, etc.), TRESHAPE to RowMajor (1, M). +// After arithmetic, TRESHAPE back to ColMajor (M, 1) for row-broadcast ops. +// This eliminates the GM round-trip (TSTORE ND → TLOAD DN) used in the original. 
#include #include @@ -43,11 +43,6 @@ static __aicore__ void online_update_impl(__gm__ Tensor* mij, __gm__ float* oi_ptr = reinterpret_cast<__gm__ float*>(oi->buffer.addr); __gm__ float* dst_ptr = reinterpret_cast<__gm__ float*>(dst->buffer.addr); - // Scalar tile dimensions for RowMajor layout: - // kScalarCols = 32 bytes / 4 bytes per float = 8 floats per row (one 32-byte block) - // kScalarRows = M / 8 (M=16 → 2 rows, M=64 → 8 rows) - constexpr int kScalarCols = 32 / sizeof(float); - constexpr int kScalarRows = M / kScalarCols; // Aligned rows for ColMajor DN tiles (32-byte alignment) constexpr int kAlignedRows = ((M * sizeof(float) + 31) / 32) * (32 / sizeof(float)); @@ -56,77 +51,84 @@ static __aicore__ void online_update_impl(__gm__ Tensor* mij, // Data (M, N) RowMajor using GlobalDataMxN = GlobalTensor, pto::Stride<1, 1, 1, N, 1>>; - // Scalar ND: M contiguous floats as (kScalarRows, kScalarCols) RowMajor + // Scalar DN: M contiguous floats as (kAlignedRows, 1) ColMajor for TROWEXPAND ops and loading + using GlobalScalarDN = GlobalTensor, pto::Stride<1, 1, 1, 1, 1>, Layout::DN>; + + // Scalar ND: for storing mi_new and li_new back to GM + constexpr int kScalarCols = 32 / sizeof(float); + constexpr int kScalarRows = M / kScalarCols; using GlobalScalarND = GlobalTensor, pto::Stride<1, 1, 1, kScalarCols, 1>>; - // Scalar DN: same M contiguous floats as (kAlignedRows, 1) ColMajor - using GlobalScalarDN = GlobalTensor, pto::Stride<1, 1, 1, 1, 1>, Layout::DN>; - // --- GlobalTensor instances --- GlobalDataMxN oiNewGlobal(oi_new_ptr + oi_new->start_offset); GlobalDataMxN oiGlobal(oi_ptr + oi->start_offset); GlobalDataMxN dstGlobal(dst_ptr + dst->start_offset); - // ND globals for scalar element-wise operations - GlobalScalarND mijGlobalND(mij_ptr + mij->start_offset); - GlobalScalarND lijGlobalND(lij_ptr + lij->start_offset); - GlobalScalarND miGlobalND(mi_ptr + mi->start_offset); - GlobalScalarND liGlobalND(li_ptr + li->start_offset); - - // DN globals aliased to 
same GM for ColMajor reload (used after ND TSTORE) + // DN globals for loading scalars as ColMajor GlobalScalarDN mijGlobalDN(mij_ptr + mij->start_offset); GlobalScalarDN lijGlobalDN(lij_ptr + lij->start_offset); + GlobalScalarDN miGlobalDN(mi_ptr + mi->start_offset); GlobalScalarDN liGlobalDN(li_ptr + li->start_offset); + // ND globals for storing scalar results + GlobalScalarND miGlobalND(mi_ptr + mi->start_offset); + GlobalScalarND liGlobalND(li_ptr + li->start_offset); + // --- Tile types --- using TileDataMxN = Tile; + using TileScalarDN = Tile; + + // RowMajor (1, M) tiles for element-wise arithmetic via TRESHAPE + using TileScalarRow = Tile; + + // ND tile for storing back to GM using TileScalarND = Tile; - using TileScalarDN = Tile; // --- UB memory layout --- constexpr int kDataBytes = M * N * sizeof(float); - constexpr int kScalarNDBytes = kScalarRows * kScalarCols * sizeof(float); constexpr int kScalarDNBytes = kAlignedRows * sizeof(float); // Data tiles TileDataMxN oiNewTile; TileDataMxN oiTile; - // Scalar ND tiles for element-wise arithmetic - TileScalarND mijND, lijND, miND, liND; - TileScalarND miNewND, alphaND, betaND, tmpND; + // Scalar DN tiles loaded from GM (ColMajor) + TileScalarDN mijDN, lijDN, miDN, liDN; - // Scalar DN tiles for TROWEXPAND operations - TileScalarDN alphaDN, betaDN, liDN; + // Temporary DN tiles for results + TileScalarDN miNewDN, alphaDN, betaDN, liNewDN, tmpDN; TASSIGN(oiNewTile, 0); TASSIGN(oiTile, kDataBytes); - TASSIGN(mijND, 2 * kDataBytes); - TASSIGN(lijND, 2 * kDataBytes + kScalarNDBytes); - TASSIGN(miND, 2 * kDataBytes + 2 * kScalarNDBytes); - TASSIGN(liND, 2 * kDataBytes + 3 * kScalarNDBytes); - TASSIGN(miNewND, 2 * kDataBytes + 4 * kScalarNDBytes); - TASSIGN(alphaND, 2 * kDataBytes + 5 * kScalarNDBytes); - TASSIGN(betaND, 2 * kDataBytes + 6 * kScalarNDBytes); - TASSIGN(tmpND, 2 * kDataBytes + 7 * kScalarNDBytes); - TASSIGN(alphaDN, 2 * kDataBytes + 8 * kScalarNDBytes); - TASSIGN(betaDN, 2 * kDataBytes + 8 * 
kScalarNDBytes + kScalarDNBytes); - TASSIGN(liDN, 2 * kDataBytes + 8 * kScalarNDBytes + 2 * kScalarDNBytes); + TASSIGN(mijDN, 2 * kDataBytes); + TASSIGN(lijDN, 2 * kDataBytes + kScalarDNBytes); + TASSIGN(miDN, 2 * kDataBytes + 2 * kScalarDNBytes); + TASSIGN(liDN, 2 * kDataBytes + 3 * kScalarDNBytes); + TASSIGN(miNewDN, 2 * kDataBytes + 4 * kScalarDNBytes); + TASSIGN(alphaDN, 2 * kDataBytes + 5 * kScalarDNBytes); + TASSIGN(betaDN, 2 * kDataBytes + 6 * kScalarDNBytes); + TASSIGN(liNewDN, 2 * kDataBytes + 7 * kScalarDNBytes); + TASSIGN(tmpDN, 2 * kDataBytes + 8 * kScalarDNBytes); if (is_first) { // --- First block: copy inputs to accumulators --- TLOAD(oiNewTile, oiNewGlobal); - TLOAD(mijND, mijGlobalND); - TLOAD(lijND, lijGlobalND); + TLOAD(mijDN, mijGlobalDN); + TLOAD(lijDN, lijGlobalDN); set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); - // Passthrough to MTE3 (no V compute needed) + // Store mi = mij, li = lij, oi = oi_new + // Alias ND tiles to the same UB as DN tiles for storing as ND format + TileScalarND mijND, lijND; + TASSIGN(mijND, 2 * kDataBytes); // alias same UB as mijDN + TASSIGN(lijND, 2 * kDataBytes + kScalarDNBytes); // alias same UB as lijDN + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); TSTORE(miGlobalND, mijND); // mi = mij @@ -135,13 +137,10 @@ static __aicore__ void online_update_impl(__gm__ Tensor* mij, if (is_last) { // Single block: normalize dst = oi_new / lij - // lij stored to li buffer in ND format; reload as DN for TROWEXPANDDIV - set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); - wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); - TLOAD(liDN, liGlobalDN); - set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); - wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); - TROWEXPANDDIV(oiNewTile, oiNewTile, liDN); + // lijDN already in ColMajor DN format, use directly for TROWEXPANDDIV + set_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); + TROWEXPANDDIV(oiNewTile, oiNewTile, 
lijDN); set_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); TSTORE(dstGlobal, oiNewTile); @@ -149,66 +148,75 @@ static __aicore__ void online_update_impl(__gm__ Tensor* mij, } else { // --- Subsequent blocks: accumulate --- - // Phase 1: Load all inputs + // Load all inputs TLOAD(oiNewTile, oiNewGlobal); TLOAD(oiTile, oiGlobal); - TLOAD(mijND, mijGlobalND); - TLOAD(lijND, lijGlobalND); - TLOAD(miND, miGlobalND); - TLOAD(liND, liGlobalND); + TLOAD(mijDN, mijGlobalDN); + TLOAD(lijDN, lijGlobalDN); + TLOAD(miDN, miGlobalDN); + TLOAD(liDN, liGlobalDN); set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); - // Phase 2: Scalar arithmetic in RowMajor (kScalarRows, kScalarCols) - // to resolve RAW hazards on shared UB tiles. - TMAX(miNewND, miND, mijND); // mi_new = max(mi, mij) - TSUB(alphaND, miND, miNewND); // alpha = mi - mi_new - TEXP(alphaND, alphaND); // alpha = exp(mi - mi_new) - TSUB(betaND, mijND, miNewND); // beta = mij - mi_new - TEXP(betaND, betaND); // beta = exp(mij - mi_new) - TMUL(liND, alphaND, liND); // li = alpha * li - TMUL(tmpND, betaND, lijND); // tmp = beta * lij - TADD(liND, liND, tmpND); // li = alpha * li + beta * lij (= li_new) - - // Phase 3: Store scalar results to GM (ND format) - // mi_new → mi accumulator, li_new → li accumulator - // alpha → mij buffer (reuse), beta → lij buffer (reuse) - set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); - wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); - TSTORE(miGlobalND, miNewND); // persist mi_new - TSTORE(liGlobalND, liND); // persist li_new - TSTORE(mijGlobalND, alphaND); // temp: alpha to mij buffer - TSTORE(lijGlobalND, betaND); // temp: beta to lij buffer - - // Phase 4: Reload alpha, beta (and li if last) as ColMajor DN - set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); - wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); - TLOAD(alphaDN, mijGlobalDN); // alpha from mij buffer as DN - TLOAD(betaDN, lijGlobalDN); // beta from lij buffer as DN - if (is_last) { - 
TLOAD(liDN, liGlobalDN); // li_new from li buffer as DN - } - set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); - wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); - - // Phase 5: Scale data tiles using row-broadcast multiply + // TRESHAPE: ColMajor(M,1) → RowMajor(1,M) for element-wise arithmetic + TileScalarRow miRow, mijRow, liRow, lijRow; + TRESHAPE(miRow, miDN); + TRESHAPE(mijRow, mijDN); + TRESHAPE(liRow, liDN); + TRESHAPE(lijRow, lijDN); + + // Scalar arithmetic in RowMajor (1, M) layout + TileScalarRow miNewRow, alphaRow, betaRow, liNewRow, tmpRow; + TASSIGN(miNewRow, 2 * kDataBytes + 4 * kScalarDNBytes); + TASSIGN(alphaRow, 2 * kDataBytes + 5 * kScalarDNBytes); + TASSIGN(betaRow, 2 * kDataBytes + 6 * kScalarDNBytes); + TASSIGN(liNewRow, 2 * kDataBytes + 7 * kScalarDNBytes); + TASSIGN(tmpRow, 2 * kDataBytes + 8 * kScalarDNBytes); + + TMAX(miNewRow, miRow, mijRow); // mi_new = max(mi, mij) + TSUB(alphaRow, miRow, miNewRow); // alpha_exp = mi - mi_new + TEXP(alphaRow, alphaRow); // alpha = exp(mi - mi_new) + TSUB(betaRow, mijRow, miNewRow); // beta_exp = mij - mi_new + TEXP(betaRow, betaRow); // beta = exp(mij - mi_new) + TMUL(tmpRow, alphaRow, liRow); // alpha * li + TMUL(liNewRow, betaRow, lijRow); // beta * lij + TADD(liNewRow, tmpRow, liNewRow); // li_new = alpha*li + beta*lij + + // TRESHAPE back: RowMajor(1,M) → ColMajor(M,1) for TROWEXPANDMUL + TRESHAPE(alphaDN, alphaRow); + TRESHAPE(betaDN, betaRow); + + // Scale data tiles using row-broadcast multiply TROWEXPANDMUL(oiTile, oiTile, alphaDN); // oi *= alpha - TROWEXPANDMUL(oiNewTile, oiNewTile, betaDN); // oi_new *= beta - TADD(oiTile, oiTile, oiNewTile); // oi = alpha*oi + beta*oi_new + TROWEXPANDMUL(oiNewTile, oiNewTile, betaDN); // oi_new *= beta + TADD(oiTile, oiTile, oiNewTile); // oi = alpha*oi + beta*oi_new + + // Store mi_new and li_new to GM (ND format) + // Alias ND tiles to the same UB locations as miNewRow and liNewRow + TileScalarND miNewND, liNewND; + TASSIGN(miNewND, 2 * kDataBytes + 4 * kScalarDNBytes); + 
TASSIGN(liNewND, 2 * kDataBytes + 7 * kScalarDNBytes); if (is_last) { - // Phase 6: Normalize and output - TROWEXPANDDIV(oiTile, oiTile, liDN); // dst = oi / li_new - set_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); - wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); + // Normalize and output: dst = oi / li_new + TRESHAPE(liNewDN, liNewRow); + TROWEXPANDDIV(oiTile, oiTile, liNewDN); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(miGlobalND, miNewND); // persist mi_new + TSTORE(liGlobalND, liNewND); // persist li_new TSTORE(dstGlobal, oiTile); } else { - // Phase 6: Store updated accumulators - set_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); - wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); + // Store updated accumulators + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(miGlobalND, miNewND); // persist mi_new + TSTORE(liGlobalND, liNewND); // persist li_new TSTORE(oiGlobal, oiTile); } } + set_flag(PIPE_MTE3, PIPE_S, EVENT_ID7); + wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID7); } extern "C" __aicore__ void kernel_entry(__gm__ int64_t* args) { diff --git a/tests/device_tests/a5/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp b/tests/device_tests/a5/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp index eec1d4dd..9b5fbf28 100644 --- a/tests/device_tests/a5/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp +++ b/tests/device_tests/a5/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp @@ -95,29 +95,39 @@ static __aicore__ void softmax_prepare_impl(__gm__ Tensor* sij, TROWMAX(maxTile, sijTile, tmpTile); TROWEXPANDSUB(pijTile, sijTile, maxTile); TEXP(pijTile, pijTile); - // Truncate pij to bf16 first, then compute lij from truncated values (matches golden) + // Truncate pij to bf16 first TCVT(pijBf16Tile, pijTile, RoundMode::CAST_ROUND); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); // pij bf16 ready, can store early 
+ + // Continue computing: bf16 → f32 and rowsum while pij store proceeds in parallel TCVT(pijTile, pijBf16Tile, RoundMode::CAST_ROUND); TROWSUM(sumTile, pijTile, tmpTile); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); // sum ready - set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + // Store pij (overlaps with TCVT + TROWSUM above) wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(pijGlobal, pijBf16Tile); + + // Store max and sum TSTORE(mijGlobal, maxTile); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); TSTORE(lijGlobal, sumTile); - TSTORE(pijGlobal, pijBf16Tile); + + set_flag(PIPE_MTE3, PIPE_S, EVENT_ID7); + wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID7); } extern "C" __aicore__ void kernel_entry(__gm__ int64_t* args) { __gm__ Tensor* sij = reinterpret_cast<__gm__ Tensor*>(args[0]); + __gm__ Tensor* pij = reinterpret_cast<__gm__ Tensor*>(args[1]); + __gm__ Tensor* mij = reinterpret_cast<__gm__ Tensor*>(args[2]); + __gm__ Tensor* lij = reinterpret_cast<__gm__ Tensor*>(args[3]); union { uint64_t u; float f; } scale_conv; - scale_conv.u = static_cast<uint64_t>(args[1]); + scale_conv.u = static_cast<uint64_t>(args[4]); float scale_value = scale_conv.f; - __gm__ Tensor* pij = reinterpret_cast<__gm__ Tensor*>(args[2]); - __gm__ Tensor* mij = reinterpret_cast<__gm__ Tensor*>(args[3]); - __gm__ Tensor* lij = reinterpret_cast<__gm__ Tensor*>(args[4]); uint64_t q_tile_size = static_cast<uint64_t>(sij->shapes[0]); if (q_tile_size == 16) { diff --git a/tests/device_tests/a5/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp b/tests/device_tests/a5/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp index a3417a8c..b632e4e9 100644 --- a/tests/device_tests/a5/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp +++ b/tests/device_tests/a5/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp @@ -118,10 +118,10 @@ __attribute__((visibility("default"))) void
aicpu_orchestration_entry(PTO2Runtim // key_cache_size = batch * block_num * block_size * head_dim * data_type // value_cache_size = batch * block_num * block_size * head_dim * data_type // out = batch * num_heads * head_dim * data_type - uint64_t query_shapes[2] = {batch * num_heads, head_dim}; - uint64_t key_cache_shapes[2] = {batch * block_num * block_size, head_dim}; - uint64_t value_cache_shapes[2] = {batch * block_num * block_size, head_dim}; - uint64_t out_shapes[2] = {batch * num_heads, head_dim}; + uint32_t query_shapes[2] = {(uint32_t)(batch * num_heads), (uint32_t)head_dim}; + uint32_t key_cache_shapes[2] = {(uint32_t)(batch * block_num * block_size), (uint32_t)head_dim}; + uint32_t value_cache_shapes[2] = {(uint32_t)(batch * block_num * block_size), (uint32_t)head_dim}; + uint32_t out_shapes[2] = {(uint32_t)(batch * num_heads), (uint32_t)head_dim}; Tensor query = make_tensor_external(host_query, query_shapes, 2, data_type); Tensor key_cache = make_tensor_external(host_key_cache, key_cache_shapes, 2, data_type); Tensor value_cache = make_tensor_external(host_value_cache, value_cache_shapes, 2, data_type); @@ -144,63 +144,63 @@ __attribute__((visibility("default"))) void aicpu_orchestration_entry(PTO2Runtim CYCLE_COUNT_LAP(prof_scope); uint64_t cur_offset = b_idx * q_head_num + q_idx * q_tile; - uint64_t oi_shapes[2] = {q_tile, head_dim}; - uint64_t li_shapes[1] = {q_tile}; - uint64_t mi_shapes[1] = {q_tile}; + uint32_t oi_shapes[2] = {(uint32_t)q_tile, (uint32_t)head_dim}; + uint32_t li_shapes[1] = {(uint32_t)q_tile}; + uint32_t mi_shapes[1] = {(uint32_t)q_tile}; Tensor oi = make_tensor(oi_shapes, 2, DataType::FLOAT32); Tensor li_update = make_tensor(li_shapes, 1, DataType::FLOAT32); Tensor mi_update = make_tensor(mi_shapes, 1, DataType::FLOAT32); prof_make_count += 3; CYCLE_COUNT_LAP(prof_make_tensor); - uint64_t qi_shapes[2] = {q_tile, head_dim}; - uint64_t qi_offsets[2] = {cur_offset, 0}; + uint32_t qi_shapes[2] = {(uint32_t)q_tile, 
(uint32_t)head_dim}; + uint32_t qi_offsets[2] = {(uint32_t)cur_offset, 0}; Tensor qi = query.view(qi_shapes, qi_offsets); - uint64_t out_view_shapes[2] = {q_tile, head_dim}; - uint64_t out_view_offsets[2] = {cur_offset, 0}; + uint32_t out_view_shapes[2] = {(uint32_t)q_tile, (uint32_t)head_dim}; + uint32_t out_view_offsets[2] = {(uint32_t)cur_offset, 0}; Tensor out_view = out.view(out_view_shapes, out_view_offsets); prof_view_count += 2; CYCLE_COUNT_LAP(prof_tensor_view); - PTOParam params_inplace[] = { - make_output_param(oi), - make_output_param(li_update), - make_output_param(mi_update), - }; + PTOParam params_inplace; + params_inplace.add_output(oi); + params_inplace.add_output(li_update); + params_inplace.add_output(mi_update); CYCLE_COUNT_LAP(prof_param_setup); - pto2_rt_submit_task(rt, FUNC_AIV_HUB, PTO2_WORKER_VECTOR, params_inplace, 3); + pto2_rt_submit_aiv_task(rt, FUNC_AIV_HUB, params_inplace); prof_submit_count++; CYCLE_COUNT_LAP(prof_submit_task); for (uint64_t bn = 0; bn < bn_this_batch; bn++) { + PTO2_SCOPE_GUARD(rt); + uint64_t cur_block_idx = host_block_table[b_idx * block_num + bn]; uint64_t valid_len = std::min(block_size, cur_seq - bn * block_size); CYCLE_COUNT_LAP(prof_param_extract); - uint64_t kv_shapes[2] = {block_size, head_dim}; - uint64_t kv_offsets[2] = {cur_block_idx * block_size, 0}; + uint32_t kv_shapes[2] = {(uint32_t)block_size, (uint32_t)head_dim}; + uint32_t kv_offsets[2] = {(uint32_t)(cur_block_idx * block_size), 0}; Tensor kj = key_cache.view(kv_shapes, kv_offsets); Tensor vj = value_cache.view(kv_shapes, kv_offsets); prof_view_count += 2; CYCLE_COUNT_LAP(prof_tensor_view); - uint64_t sij_shapes[2] = {q_tile, block_size}; + uint32_t sij_shapes[2] = {(uint32_t)q_tile, (uint32_t)block_size}; Tensor sij = make_tensor(sij_shapes, 2, DataType::FLOAT32); Tensor pij_f16 = make_tensor(sij_shapes, 2, data_type); prof_make_count += 2; CYCLE_COUNT_LAP(prof_make_tensor); - PTOParam params_qk[] = { - make_input_param(qi), - 
make_input_param(kj), - make_output_param(sij), - }; + PTOParam params_qk; + params_qk.add_input(qi); + params_qk.add_input(kj); + params_qk.add_output(sij); CYCLE_COUNT_LAP(prof_param_setup); - pto2_rt_submit_task(rt, FUNC_QK_MATMUL, PTO2_WORKER_CUBE, params_qk, 3); + pto2_rt_submit_aic_task(rt, FUNC_QK_MATMUL, params_qk); prof_submit_count++; CYCLE_COUNT_LAP(prof_submit_task); - uint64_t sij_valid_shapes[2] = {q_tile, valid_len}; - uint64_t sij_valid_offsets[2] = {0, 0}; + uint32_t sij_valid_shapes[2] = {(uint32_t)q_tile, (uint32_t)valid_len}; + uint32_t sij_valid_offsets[2] = {0, 0}; Tensor sij_valid = sij.view(sij_valid_shapes, sij_valid_offsets); prof_view_count += 1; CYCLE_COUNT_LAP(prof_tensor_view); @@ -210,30 +210,28 @@ __attribute__((visibility("default"))) void aicpu_orchestration_entry(PTO2Runtim prof_make_count += 2; CYCLE_COUNT_LAP(prof_make_tensor); - PTOParam params_sf[] = { - make_input_param(sij_valid), - make_scalar_param(float_to_u64(scale_value)), - make_output_param(pij_f16), - make_output_param(mi), - make_output_param(li), - }; + PTOParam params_sf; + params_sf.add_input(sij_valid); + params_sf.add_output(pij_f16); + params_sf.add_output(mi); + params_sf.add_output(li); + params_sf.add_scalar(float_to_u64(scale_value)); CYCLE_COUNT_LAP(prof_param_setup); - pto2_rt_submit_task(rt, FUNC_SOFTMAX_PREPARE, PTO2_WORKER_VECTOR, params_sf, 5); + pto2_rt_submit_aiv_task(rt, FUNC_SOFTMAX_PREPARE, params_sf); prof_submit_count++; CYCLE_COUNT_LAP(prof_submit_task); - uint64_t oi_tmp_shapes[2] = {q_tile, head_dim}; + uint32_t oi_tmp_shapes[2] = {(uint32_t)q_tile, (uint32_t)head_dim}; Tensor oi_tmp = make_tensor(oi_tmp_shapes, 2, DataType::FLOAT32); prof_make_count += 1; CYCLE_COUNT_LAP(prof_make_tensor); - PTOParam params_pv[] = { - make_input_param(pij_f16), - make_input_param(vj), - make_output_param(oi_tmp), - }; + PTOParam params_pv; + params_pv.add_input(pij_f16); + params_pv.add_input(vj); + params_pv.add_output(oi_tmp); 
CYCLE_COUNT_LAP(prof_param_setup); - pto2_rt_submit_task(rt, FUNC_PV_MATMUL, PTO2_WORKER_CUBE, params_pv, 3); + pto2_rt_submit_aic_task(rt, FUNC_PV_MATMUL, params_pv); prof_submit_count++; CYCLE_COUNT_LAP(prof_submit_task); @@ -241,19 +239,18 @@ __attribute__((visibility("default"))) void aicpu_orchestration_entry(PTO2Runtim uint64_t is_last = (bn == bn_this_batch - 1) ? 1 : 0; CYCLE_COUNT_LAP(prof_param_extract); - PTOParam params_up[] = { - make_input_param(mi), - make_input_param(li), - make_input_param(oi_tmp), - make_inout_param(mi_update), - make_inout_param(li_update), - make_inout_param(oi), - make_output_param(out_view), - make_scalar_param(is_first), - make_scalar_param(is_last), - }; + PTOParam params_up; + params_up.add_input(mi); + params_up.add_input(li); + params_up.add_input(oi_tmp); + params_up.add_inout(mi_update); + params_up.add_inout(li_update); + params_up.add_inout(oi); + params_up.add_output(out_view); + params_up.add_scalar(is_first); + params_up.add_scalar(is_last); CYCLE_COUNT_LAP(prof_param_setup); - pto2_rt_submit_task(rt, FUNC_ONLINE_UPDATE, PTO2_WORKER_VECTOR, params_up, 9); + pto2_rt_submit_aiv_task(rt, FUNC_ONLINE_UPDATE, params_up); prof_submit_count++; CYCLE_COUNT_LAP(prof_submit_task); } @@ -291,4 +288,4 @@ __attribute__((visibility("default"))) void aicpu_orchestration_entry(PTO2Runtim #undef CYCLE_COUNT_LAP } -} // extern "C" \ No newline at end of file +} // extern "C"