From 634255a1ba49a458545bdef62bfb14b6ef2712cc Mon Sep 17 00:00:00 2001 From: Chao Wang <26245345+ChaoWao@users.noreply.github.com> Date: Wed, 11 Mar 2026 15:30:54 +0800 Subject: [PATCH 1/3] Refactor: unify paged attention benchmark cases and dtype handling (#256) - Standardize ALL_CASES to 3 identical cases in paged_attention, batch_paged_attention, and paged_attention_unroll for fair comparison - Case1: QHEADS=16, HEADDIM=128, BLOCKSIZE=128, batch=256 - Case2: QHEADS=64, HEADDIM=128, BLOCKSIZE=64, batch=64 - Case3: QHEADS=64, HEADDIM=256, BLOCKSIZE=64, batch=64 - All cases: KVHEADS=1, context_len=8192, query_seqlen=1 - Remove CaseVarSeq from batch_paged_attention (not needed for benchmark) - Add dtype field to paged_attention_unroll cases and parameterize generate_inputs/paged_attention to read dtype from params --- .../batch_paged_attention/golden.py | 15 ++++---- .../paged_attention/golden.py | 14 ++++++-- .../paged_attention_unroll/golden.py | 34 +++++++++++++------ 3 files changed, 43 insertions(+), 20 deletions(-) diff --git a/tests/device_tests/tensormap_and_ringbuffer/batch_paged_attention/golden.py b/tests/device_tests/tensormap_and_ringbuffer/batch_paged_attention/golden.py index b07f0d53..8cce3b8a 100644 --- a/tests/device_tests/tensormap_and_ringbuffer/batch_paged_attention/golden.py +++ b/tests/device_tests/tensormap_and_ringbuffer/batch_paged_attention/golden.py @@ -13,12 +13,12 @@ ALL_CASES = { "Case1": { - "batch": 64, + "batch": 256, "num_heads": 16, "kv_head_num": 1, "head_dim": 128, "block_size": 128, - "context_len": 8193, + "context_len": 8192, "max_model_len": 32768, "dtype": "bfloat16", }, @@ -32,14 +32,13 @@ "max_model_len": 32768, "dtype": "bfloat16", }, - "CaseVarSeq": { + "Case3": { "batch": 64, - "num_heads": 16, + "num_heads": 64, "kv_head_num": 1, - "head_dim": 128, - "block_size": 128, - "context_len": 8193, - "context_lens_list": [8193, 4096, 1024, 256, 8000, 512, 2048, 7777], + "head_dim": 256, + "block_size": 64, + "context_len": 
8192, "max_model_len": 32768, "dtype": "bfloat16", }, diff --git a/tests/device_tests/tensormap_and_ringbuffer/paged_attention/golden.py b/tests/device_tests/tensormap_and_ringbuffer/paged_attention/golden.py index 787ad2c7..898c4fad 100644 --- a/tests/device_tests/tensormap_and_ringbuffer/paged_attention/golden.py +++ b/tests/device_tests/tensormap_and_ringbuffer/paged_attention/golden.py @@ -13,12 +13,12 @@ ALL_CASES = { "Case1": { - "batch": 64, + "batch": 256, "num_heads": 16, "kv_head_num": 1, "head_dim": 128, "block_size": 128, - "context_len": 8193, + "context_len": 8192, "max_model_len": 32768, "dtype": "bfloat16", }, @@ -32,6 +32,16 @@ "max_model_len": 32768, "dtype": "bfloat16", }, + "Case3": { + "batch": 64, + "num_heads": 64, + "kv_head_num": 1, + "head_dim": 256, + "block_size": 64, + "context_len": 8192, + "max_model_len": 32768, + "dtype": "bfloat16", + }, } DEFAULT_CASE = "Case1" diff --git a/tests/device_tests/tensormap_and_ringbuffer/paged_attention_unroll/golden.py b/tests/device_tests/tensormap_and_ringbuffer/paged_attention_unroll/golden.py index 6dc39319..a5d9089e 100644 --- a/tests/device_tests/tensormap_and_ringbuffer/paged_attention_unroll/golden.py +++ b/tests/device_tests/tensormap_and_ringbuffer/paged_attention_unroll/golden.py @@ -24,13 +24,14 @@ # All test cases - production scale ALL_CASES = { "Case1": { - "batch": 64, + "batch": 256, "num_heads": 16, "kv_head_num": 1, "head_dim": 128, "block_size": 128, - "context_len": 8193, + "context_len": 8192, "max_model_len": 32768, + "dtype": "bfloat16", }, "Case2": { "batch": 64, @@ -40,6 +41,17 @@ "block_size": 64, "context_len": 8192, "max_model_len": 32768, + "dtype": "bfloat16", + }, + "Case3": { + "batch": 64, + "num_heads": 64, + "kv_head_num": 1, + "head_dim": 256, + "block_size": 64, + "context_len": 8192, + "max_model_len": 32768, + "dtype": "bfloat16", }, } @@ -55,6 +67,7 @@ def generate_inputs(params: dict) -> list: block_size = params["block_size"] context_len = 
params["context_len"] max_model_len = params["max_model_len"] + dtype = getattr(torch, params.get("dtype", "bfloat16")) max_num_blocks_per_req = max_model_len // block_size cur_valid_blocks = (context_len + block_size - 1) // block_size @@ -77,15 +90,15 @@ def generate_inputs(params: dict) -> list: dtype=torch.int64, ) - query_bf16 = torch.empty(batch, 1, num_heads * head_dim).uniform_(-0.5, 0.5).to(torch.bfloat16) - query_bf16 = query_bf16.reshape(batch, num_heads, head_dim) + query_raw = torch.empty(batch, 1, num_heads * head_dim).uniform_(-0.5, 0.5).to(dtype) + query_raw = query_raw.reshape(batch, num_heads, head_dim) - key_bf16 = torch.empty(total_blocks, block_size, kv_head_num, head_dim).uniform_(-0.5, 0.5).to(torch.bfloat16) - value_bf16 = torch.empty(total_blocks, block_size, kv_head_num, head_dim).uniform_(-1, 1).to(torch.bfloat16) + key_raw = torch.empty(total_blocks, block_size, kv_head_num, head_dim).uniform_(-0.5, 0.5).to(dtype) + value_raw = torch.empty(total_blocks, block_size, kv_head_num, head_dim).uniform_(-1, 1).to(dtype) - query = query_bf16.flatten() - key_cache = key_bf16.flatten() - value_cache = value_bf16.flatten() + query = query_raw.flatten() + key_cache = key_raw.flatten() + value_cache = value_raw.flatten() block_table_flat = block_table.flatten() out = torch.zeros(batch * num_heads * head_dim, dtype=torch.float32) @@ -133,6 +146,7 @@ def paged_attention( out: (batch * num_heads, head_dim) float32 """ assert num_kv_heads == 1 + input_dtype = query.dtype batch, num_heads_dim, head_dim = query.shape _, block_size, _, _ = key_cache.shape @@ -189,7 +203,7 @@ def paged_attention( pij = torch.exp(sij - mij) pij = pij.masked_fill(~valid_mask, 0.0) pij = pij.masked_fill(~batch_mask, 0.0) - pij = pij.to(torch.bfloat16).to(torch.float32) + pij = pij.to(input_dtype).to(torch.float32) lij = pij.sum(dim=-1, keepdim=True) # (batch, q_tile_size, 1) # PV matmul: (batch, q_tile_size, head_dim) From e2e38b9f0b52cd557f825af287211577b7572ec1 Mon Sep 17 
00:00:00 2001 From: jvjhfhg Date: Wed, 11 Mar 2026 18:29:57 +0800 Subject: [PATCH 2/3] Refactor: cluster-based mixed-task dispatch for AICPU executor (#249) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Add: MixedKernels type and resource shape definitions - Add pto_submit_types.h with MixedKernels struct, PTO2ResourceShape enum, PTO2SubtaskSlot enum, and active_mask/shape conversion helpers - Remove PTO2WorkerType enum from pto_runtime2_types.h (superseded by resource shapes) * Refactor: submit API from (kernel_id, worker_type) to MixedKernels - Change submit_task signature to take MixedKernels& instead of (kernel_id, worker_type), enabling multi-kernel mixed-task submission - Add pto2_rt_submit_aic_task / pto2_rt_submit_aiv_task convenience wrappers for single-kernel tasks - Implement pto2_submit_mixed_task with active_mask computation, AIV normalization (aiv1-only → aiv0 slot), and shape-based queue routing - Add mixed_task_id and subslot fields to PTO2DispatchPayload - Migrate all orchestration call sites to new API * Refactor: two-stage completion and shape-based ready queues in scheduler - Change ready queues from worker-type indexed to shape-based indexed (PTO2_NUM_RESOURCE_SHAPES queues instead of PTO2_NUM_WORKER_TYPES) - Add on_subtask_complete() for per-core subtask done-bit tracking - Rename on_task_complete to on_mixed_task_complete (fires only when all subtasks of a mixed task finish) - Route release_fanin_and_check_ready enqueue through shape-based queue using pto2_active_mask_to_shape() - Remove stale extern declarations left from self-consumed check move * Refactor: cluster-based dispatch and core assignment in executor - Add Cluster struct (1 AIC + 2 AIV) and extend CoreStateTracker with clusters[], core_idle[], and find_cluster_for_shape() - Add shape_resource_count() constexpr lookup and get_dispatch_order() with even/odd thread differentiation for queue probe order - Extract pop_ready_task() and 
dispatch_subtask_to_core() helpers - Replace 5 duplicated dispatch blocks with unified table-driven loop - Adapt local-first dispatch to cluster model (find_cluster_for_shape instead of per-type idle pool, overflow to shape-based global queue) - Rewrite assign/reassign_cores_to_threads for cluster-aligned assignment - Wire completion path through on_subtask_complete/on_mixed_task_complete - Fix completed_tasks_ to increment only on mixed-task completion, not per-subtask, preventing early scheduler termination * Add: mixed_example covering all 5 resource shapes - AIC_AIV_X2 (matmul + add + mul), AIC_ONLY (matmul), AIV_X1 (add), AIV_X2 (add + mul), AIC_AIV_X1 (matmul + add) per iteration - 5 kernels: matmul, add, mul, add_standalone, mul_standalone - 9 output tensors with golden verification (4 iterations × 5 shapes) * Docs: submit by cluster docs * Fix review comment --- .../orchestration/paged_attention_orch.cpp | 10 +- .../kernels/orchestration/bgemm_orch.cpp | 4 +- .../docs/INCORE_ORCHESTRATION_GUIDE.md | 21 +- .../mixed_example/golden.py | 130 ++++ .../kernels/aic/kernel_matmul.cpp | 126 ++++ .../mixed_example/kernels/aiv/kernel_add.cpp | 89 +++ .../kernels/aiv/kernel_add_standalone.cpp | 74 ++ .../mixed_example/kernels/aiv/kernel_mul.cpp | 90 +++ .../kernels/aiv/kernel_mul_standalone.cpp | 74 ++ .../mixed_example/kernels/kernel_config.py | 58 ++ .../kernels/orchestration/mixed_orch.cpp | 221 ++++++ .../orchestration/paged_attention_orch.cpp | 10 +- .../orchestration/example_orchestration.cpp | 10 +- .../aicore/aicore_executor.cpp | 6 +- .../aicpu/aicpu_executor.cpp | 665 +++++++++++------- .../docs/RUNTIME_LOGIC.md | 65 +- .../docs/SUBMIT_BY_CLUSTER.md | 226 ++++++ .../docs/device_log_profiling.md | 4 +- .../orchestration/pto_orchestration_api.h | 39 +- .../runtime/pto2_dispatch_payload.h | 4 +- .../runtime/pto_orchestrator.cpp | 39 +- .../runtime/pto_orchestrator.h | 6 +- .../runtime/pto_ring_buffer.h | 2 +- .../runtime/pto_runtime2.cpp | 7 +- 
.../runtime/pto_runtime2.h | 4 +- .../runtime/pto_runtime2_types.h | 21 +- .../runtime/pto_scheduler.cpp | 12 +- .../runtime/pto_scheduler.h | 99 ++- .../runtime/pto_submit_types.h | 97 +++ .../orchestration/alternating_orch.cpp | 4 +- .../orchestration/paged_attention_orch.cpp | 10 +- .../kernels/orchestration/bgemm_orch.cpp | 4 +- .../orchestration/paged_attention_orch.cpp | 10 +- .../orchestration/paged_attention_orch.cpp | 10 +- 34 files changed, 1862 insertions(+), 389 deletions(-) create mode 100644 examples/tensormap_and_ringbuffer/mixed_example/golden.py create mode 100644 examples/tensormap_and_ringbuffer/mixed_example/kernels/aic/kernel_matmul.cpp create mode 100644 examples/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_add.cpp create mode 100644 examples/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_add_standalone.cpp create mode 100644 examples/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_mul.cpp create mode 100644 examples/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_mul_standalone.cpp create mode 100644 examples/tensormap_and_ringbuffer/mixed_example/kernels/kernel_config.py create mode 100644 examples/tensormap_and_ringbuffer/mixed_example/kernels/orchestration/mixed_orch.cpp create mode 100644 src/a2a3/runtime/tensormap_and_ringbuffer/docs/SUBMIT_BY_CLUSTER.md create mode 100644 src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_submit_types.h diff --git a/examples/tensormap_and_ringbuffer/batch_paged_attention/kernels/orchestration/paged_attention_orch.cpp b/examples/tensormap_and_ringbuffer/batch_paged_attention/kernels/orchestration/paged_attention_orch.cpp index 7e7b3b68..56ac566c 100644 --- a/examples/tensormap_and_ringbuffer/batch_paged_attention/kernels/orchestration/paged_attention_orch.cpp +++ b/examples/tensormap_and_ringbuffer/batch_paged_attention/kernels/orchestration/paged_attention_orch.cpp @@ -135,7 +135,7 @@ void aicpu_orchestration_entry(PTO2Runtime* rt, uint64_t* args, int 
arg_count, i make_output_param(li_batch), make_output_param(mi_batch), }; - pto2_rt_submit_task(rt, FUNC_AIV_HUB, PTO2_WORKER_VECTOR, params_hub, 3); + pto2_rt_submit_aiv_task(rt, FUNC_AIV_HUB, params_hub, 3); for (uint64_t bn = 0; bn < max_bn; bn++) { uint64_t sij_shapes[2] = {chunk_bc * q_tile, block_size}; @@ -160,7 +160,7 @@ void aicpu_orchestration_entry(PTO2Runtime* rt, uint64_t* args, int arg_count, i make_scalar_param(num_heads), make_scalar_param(batch_start), }; - pto2_rt_submit_task(rt, FUNC_QK_MATMUL, PTO2_WORKER_CUBE, params_qk, 10); + pto2_rt_submit_aic_task(rt, FUNC_QK_MATMUL, params_qk, 10); PTOParam params_sf[] = { make_input_param(sij_b), @@ -173,7 +173,7 @@ void aicpu_orchestration_entry(PTO2Runtime* rt, uint64_t* args, int arg_count, i make_scalar_param(bn), make_scalar_param(batch_start), }; - pto2_rt_submit_task(rt, FUNC_SOFTMAX_PREPARE, PTO2_WORKER_VECTOR, params_sf, 9); + pto2_rt_submit_aiv_task(rt, FUNC_SOFTMAX_PREPARE, params_sf, 9); PTOParam params_pv[] = { make_input_param(pij_b), @@ -185,7 +185,7 @@ void aicpu_orchestration_entry(PTO2Runtime* rt, uint64_t* args, int arg_count, i make_scalar_param(block_num), make_scalar_param(batch_start), }; - pto2_rt_submit_task(rt, FUNC_PV_MATMUL, PTO2_WORKER_CUBE, params_pv, 8); + pto2_rt_submit_aic_task(rt, FUNC_PV_MATMUL, params_pv, 8); uint64_t is_first = (bn == 0) ? 1 : 0; uint64_t is_last = (bn == max_bn - 1) ? 
1 : 0; @@ -204,7 +204,7 @@ void aicpu_orchestration_entry(PTO2Runtime* rt, uint64_t* args, int arg_count, i make_scalar_param(num_heads), make_scalar_param(batch_start), }; - pto2_rt_submit_task(rt, FUNC_ONLINE_UPDATE, PTO2_WORKER_VECTOR, params_up, 13); + pto2_rt_submit_aiv_task(rt, FUNC_ONLINE_UPDATE, params_up, 13); } } } diff --git a/examples/tensormap_and_ringbuffer/bgemm/kernels/orchestration/bgemm_orch.cpp b/examples/tensormap_and_ringbuffer/bgemm/kernels/orchestration/bgemm_orch.cpp index e3936359..6febf360 100644 --- a/examples/tensormap_and_ringbuffer/bgemm/kernels/orchestration/bgemm_orch.cpp +++ b/examples/tensormap_and_ringbuffer/bgemm/kernels/orchestration/bgemm_orch.cpp @@ -120,7 +120,7 @@ void aicpu_orchestration_entry(PTO2Runtime* rt, uint64_t* args, int arg_count, i make_input_param(B_view), make_output_param(P), }; - pto2_rt_submit_task(rt, FUNC_GEMM_TILE, PTO2_WORKER_CUBE, + pto2_rt_submit_aic_task(rt, FUNC_GEMM_TILE, params_gemm, 3); // gemm // C[m,n] += P @@ -128,7 +128,7 @@ void aicpu_orchestration_entry(PTO2Runtime* rt, uint64_t* args, int arg_count, i make_inout_param(C_view), make_input_param(P), }; - pto2_rt_submit_task(rt, FUNC_TILE_ADD, PTO2_WORKER_VECTOR, + pto2_rt_submit_aiv_task(rt, FUNC_TILE_ADD, params_add, 2); // add } } diff --git a/examples/tensormap_and_ringbuffer/docs/INCORE_ORCHESTRATION_GUIDE.md b/examples/tensormap_and_ringbuffer/docs/INCORE_ORCHESTRATION_GUIDE.md index fbb18761..86700292 100644 --- a/examples/tensormap_and_ringbuffer/docs/INCORE_ORCHESTRATION_GUIDE.md +++ b/examples/tensormap_and_ringbuffer/docs/INCORE_ORCHESTRATION_GUIDE.md @@ -31,17 +31,28 @@ Validate `arg_count` in `aicpu_orchestration_config` and interpret pointers as d 2. Wrap orchestration in scopes with `PTO2_SCOPE(rt)` to control tensor lifetimes. 3. Use `make_tensor_external` for input/output buffers and `make_tensor` for intermediates. 4. 
Build `PTOParam` arrays with `make_input_param`, `make_output_param`, `make_inout_param`, and `make_scalar_param`. -5. Submit tasks with `pto2_rt_submit_task(rt, func_id, worker_type, params, num_params)`. +5. Submit tasks with one of: + - `pto2_rt_submit_aic_task(rt, kernel_id, params, num_params)` — AIC (CUBE) task + - `pto2_rt_submit_aiv_task(rt, kernel_id, params, num_params)` — AIV (VECTOR) task + - `pto2_rt_submit_task(rt, mixed_kernels, params, num_params)` — mixed task with a `MixedKernels` struct Dependencies are inferred by TensorMap from input/inout/output tensors, so you do not add explicit edges. -## Worker Types And Kernel IDs -- Worker types come from `pto_orchestration_api.h` (`PTO2_WORKER_CUBE`, `PTO2_WORKER_VECTOR`, etc.). +## Submit API And Kernel IDs +- Submit helpers are defined in `pto_orchestration_api.h`. +- `pto2_rt_submit_aic_task` and `pto2_rt_submit_aiv_task` are convenience wrappers around `pto2_rt_submit_task` with a `MixedKernels` struct. +- For mixed AIC+AIV tasks, construct a `MixedKernels` struct directly: + ```cpp + MixedKernels mk; + mk.aic_kernel_id = FUNC_QK; + mk.aiv0_kernel_id = FUNC_SF; + pto2_rt_submit_task(rt, mk, params, num_params); + ``` - Kernel `func_id` values are defined in `kernels/kernel_config.py` under `KERNELS`. ## Completion Semantics Do not call `pto2_rt_orchestration_done` yourself in device mode. The executor wraps the entry call in an outer scope and signals completion after `aicpu_orchestration_entry` returns. 
## Examples -- `examples/tensormap_and_ringbuffer/vector_example/kernels/orchestration/example_orchestration.cpp` -- `examples/tensormap_and_ringbuffer/bgemm/kernels/orchestration/bgemm_orch.cpp` +- `examples/tensormap_and_ringbuffer/vector_example/kernels/orchestration/example_orchestration.cpp` (AIV-only tasks) +- `examples/tensormap_and_ringbuffer/bgemm/kernels/orchestration/bgemm_orch.cpp` (mixed AIC + AIV tasks) diff --git a/examples/tensormap_and_ringbuffer/mixed_example/golden.py b/examples/tensormap_and_ringbuffer/mixed_example/golden.py new file mode 100644 index 00000000..a6412a15 --- /dev/null +++ b/examples/tensormap_and_ringbuffer/mixed_example/golden.py @@ -0,0 +1,130 @@ +""" +Golden test specification for mixed AIC+AIV example. + +Covers all 5 resource shapes per iteration: + 1. AIC_AIV_X2: C = A@B, F = D+E, I = G*H + 2. AIC_ONLY: J = A@B + 3. AIV_X1: K = D+E + 4. AIV_X2: L = D+E, M = G*H + 5. AIC_AIV_X1: N = A@B, O = D+E + +All use 128x128 float32 tiles, repeated over num_iters slices. 
+ +Args layout (30 args): + [ptr_A..ptr_O, size_A..size_O] +""" + +import ctypes +import torch + +__outputs__ = ["C", "F", "I", "J", "K", "L", "M", "N", "O"] +RTOL = 1e-3 +ATOL = 1e-3 + +ALL_CASES = { + "case1": {"num_iters": 4}, + "case2": {"num_iters": 1}, +} + +DEFAULT_CASE = "case1" + +MATMUL_SIZE = 128 +TILE_ELEMS = 128 * 128 + + +def generate_inputs(params: dict) -> list: + num_iters = params["num_iters"] + + torch.manual_seed(42) + + # Matmul inputs (shared by AIC tasks) + A = torch.randn(MATMUL_SIZE, MATMUL_SIZE, dtype=torch.float32) * 0.01 + B = torch.randn(MATMUL_SIZE, MATMUL_SIZE, dtype=torch.float32) * 0.01 + + # Add inputs (shared by AIV add tasks) + D = torch.randn(TILE_ELEMS, dtype=torch.float32) * 0.01 + E = torch.randn(TILE_ELEMS, dtype=torch.float32) * 0.01 + + # Mul inputs (shared by AIV mul tasks) + G = torch.randn(TILE_ELEMS, dtype=torch.float32) * 0.01 + H = torch.randn(TILE_ELEMS, dtype=torch.float32) * 0.01 + + # Output buffers (num_iters slices each) + C = torch.zeros(num_iters, TILE_ELEMS, dtype=torch.float32) # AIC_AIV_X2 matmul + F = torch.zeros(num_iters, TILE_ELEMS, dtype=torch.float32) # AIC_AIV_X2 add + I = torch.zeros(num_iters, TILE_ELEMS, dtype=torch.float32) # AIC_AIV_X2 mul + J = torch.zeros(num_iters, TILE_ELEMS, dtype=torch.float32) # AIC_ONLY matmul + K = torch.zeros(num_iters, TILE_ELEMS, dtype=torch.float32) # AIV_X1 add + L = torch.zeros(num_iters, TILE_ELEMS, dtype=torch.float32) # AIV_X2 add + M = torch.zeros(num_iters, TILE_ELEMS, dtype=torch.float32) # AIV_X2 mul + N = torch.zeros(num_iters, TILE_ELEMS, dtype=torch.float32) # AIC_AIV_X1 matmul + O = torch.zeros(num_iters, TILE_ELEMS, dtype=torch.float32) # AIC_AIV_X1 add + + A_flat = A.flatten() + B_flat = B.flatten() + + return [ + ("A", A_flat), + ("B", B_flat), + ("C", C.flatten()), + ("D", D), + ("E", E), + ("F", F.flatten()), + ("G", G), + ("H", H), + ("I", I.flatten()), + ("J", J.flatten()), + ("K", K.flatten()), + ("L", L.flatten()), + ("M", M.flatten()), + 
("N", N.flatten()), + ("O", O.flatten()), + ("size_A", ctypes.c_int64(A_flat.nbytes)), + ("size_B", ctypes.c_int64(B_flat.nbytes)), + ("size_C", ctypes.c_int64(C.flatten().nbytes)), + ("size_D", ctypes.c_int64(D.nbytes)), + ("size_E", ctypes.c_int64(E.nbytes)), + ("size_F", ctypes.c_int64(F.flatten().nbytes)), + ("size_G", ctypes.c_int64(G.nbytes)), + ("size_H", ctypes.c_int64(H.nbytes)), + ("size_I", ctypes.c_int64(I.flatten().nbytes)), + ("size_J", ctypes.c_int64(J.flatten().nbytes)), + ("size_K", ctypes.c_int64(K.flatten().nbytes)), + ("size_L", ctypes.c_int64(L.flatten().nbytes)), + ("size_M", ctypes.c_int64(M.flatten().nbytes)), + ("size_N", ctypes.c_int64(N.flatten().nbytes)), + ("size_O", ctypes.c_int64(O.flatten().nbytes)), + ] + + +def compute_golden(tensors: dict, params: dict) -> None: + num_iters = params["num_iters"] + + A = torch.as_tensor(tensors["A"]).reshape(MATMUL_SIZE, MATMUL_SIZE) + B = torch.as_tensor(tensors["B"]).reshape(MATMUL_SIZE, MATMUL_SIZE) + D = torch.as_tensor(tensors["D"]) + E = torch.as_tensor(tensors["E"]) + G = torch.as_tensor(tensors["G"]) + H = torch.as_tensor(tensors["H"]) + + golden_matmul = torch.matmul(A, B).flatten() + golden_add = D + E + golden_mul = G * H + + for name in ["C", "J", "N"]: + out = torch.as_tensor(tensors[name]).reshape(num_iters, TILE_ELEMS) + for i in range(num_iters): + out[i] = golden_matmul + tensors[name][:] = out.flatten() + + for name in ["F", "K", "L", "O"]: + out = torch.as_tensor(tensors[name]).reshape(num_iters, TILE_ELEMS) + for i in range(num_iters): + out[i] = golden_add + tensors[name][:] = out.flatten() + + for name in ["I", "M"]: + out = torch.as_tensor(tensors[name]).reshape(num_iters, TILE_ELEMS) + for i in range(num_iters): + out[i] = golden_mul + tensors[name][:] = out.flatten() diff --git a/examples/tensormap_and_ringbuffer/mixed_example/kernels/aic/kernel_matmul.cpp b/examples/tensormap_and_ringbuffer/mixed_example/kernels/aic/kernel_matmul.cpp new file mode 100644 index 
00000000..186abb95 --- /dev/null +++ b/examples/tensormap_and_ringbuffer/mixed_example/kernels/aic/kernel_matmul.cpp @@ -0,0 +1,126 @@ +/** + * Matrix Multiplication Kernel (Cube Core) + * + * Computes: C = A @ B (TILE x TILE x TILE matmul) + * Uses TMATMUL instruction + * + * Args (Tensor*): + * args[0] = A (INPUT) - TILE x TILE + * args[1] = B (INPUT) - TILE x TILE + * args[2] = C (OUTPUT) - TILE x TILE + */ + +#include +#include +#include +#include + +#include "tensor.h" + +using namespace pto; + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#define __aicore__ [aicore] +#endif + +template +AICORE constexpr inline T CeilAlign(T num_1, T num_2) { + if (num_2 == 0) { + return 0; + } + return (num_1 + num_2 - 1) / num_2 * num_2; +} + +static __aicore__ inline int get_num_tiles(__gm__ Tensor* tensor, uint64_t tile_elems) { + uint64_t total_elems = tensor->shapes[0]; + return static_cast(total_elems / tile_elems); +} + +template +static __aicore__ void matmul_impl( + __gm__ float* input_a, + __gm__ float* input_b, + __gm__ float* output) { + + constexpr int blockAlign = C0_SIZE_BYTE / sizeof(float); + constexpr int M = CeilAlign(TILE, 16); + constexpr int K = CeilAlign(TILE, blockAlign); + constexpr int N = CeilAlign(TILE, blockAlign); + + using GlobalDataA = GlobalTensor, + Stride<1 * TILE * TILE, 1 * TILE * TILE, TILE * TILE, TILE, 1>>; + using GlobalDataB = GlobalTensor, + Stride<1 * TILE * TILE, 1 * TILE * TILE, TILE * TILE, TILE, 1>>; + using GlobalDataC = GlobalTensor, + Stride<1 * TILE * TILE, 1 * TILE * TILE, TILE * TILE, TILE, 1>>; + + GlobalDataA src0Global(input_a); + GlobalDataB src1Global(input_b); + GlobalDataC dstGlobal(output); + + using TileMatA = Tile; + using TileMatB = Tile; + + using LeftTile = TileLeft; + using RightTile = TileRight; + using AccTile = TileAcc; + + TileMatA aMatTile; + TileMatB bMatTile; + TASSIGN(aMatTile, 0x0); + TASSIGN(bMatTile, 0x20000); + + LeftTile aTile; + RightTile bTile; + AccTile cTile; + TASSIGN(aTile, 
0x0); + TASSIGN(bTile, 0x0); + TASSIGN(cTile, 0x0); + + TLOAD(aMatTile, src0Global); + TLOAD(bMatTile, src1Global); + + set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); + + TMOV(aTile, aMatTile); + TMOV(bTile, bMatTile); + + set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); + wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); + + TMATMUL(cTile, aTile, bTile); + + set_flag(PIPE_M, PIPE_FIX, EVENT_ID0); + wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0); + + TSTORE(dstGlobal, cTile); + + set_flag(PIPE_FIX, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_FIX, PIPE_MTE2, EVENT_ID0); +} + +extern "C" __aicore__ void kernel_entry(__gm__ int64_t* args) { + __gm__ Tensor* input_a = reinterpret_cast<__gm__ Tensor*>(args[0]); + __gm__ Tensor* input_b = reinterpret_cast<__gm__ Tensor*>(args[1]); + __gm__ Tensor* output = reinterpret_cast<__gm__ Tensor*>(args[2]); + + constexpr uint64_t TILE_ELEMS = 128 * 128; + int num_tiles = get_num_tiles(input_a, TILE_ELEMS); + + __gm__ float* base_a = reinterpret_cast<__gm__ float*>(input_a->buffer.addr) + input_a->start_offset; + __gm__ float* base_b = reinterpret_cast<__gm__ float*>(input_b->buffer.addr) + input_b->start_offset; + __gm__ float* base_c = reinterpret_cast<__gm__ float*>(output->buffer.addr) + output->start_offset; + + for (int tile_idx = 0; tile_idx < num_tiles; tile_idx++) { + __gm__ float* a_ptr = base_a + (tile_idx * TILE_ELEMS); + __gm__ float* b_ptr = base_b + (tile_idx * TILE_ELEMS); + __gm__ float* c_ptr = base_c + (tile_idx * TILE_ELEMS); + + matmul_impl<128>(a_ptr, b_ptr, c_ptr); + } +} diff --git a/examples/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_add.cpp b/examples/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_add.cpp new file mode 100644 index 00000000..be9b0ebc --- /dev/null +++ b/examples/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_add.cpp @@ -0,0 +1,89 @@ +/** + * Element-wise Tensor Addition Kernel (for mixed task) + * + * Implements: out[i] = src0[i] + src1[i] 
+ * Tile size: 128 x 128 + * + * In the mixed task, this kernel shares the param list with the matmul kernel. + * Matmul uses args[0..2], this kernel uses args[3..5]. + * + * Args (Tensor*): + * args[3] = src0 (INPUT) - 128 x 128 + * args[4] = src1 (INPUT) - 128 x 128 + * args[5] = out (OUTPUT) - 128 x 128 + */ + +#include +#include + +#include "tensor.h" + +using namespace pto; + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#define __aicore__ [aicore] +#endif + +static __aicore__ inline int get_num_tiles(__gm__ Tensor* tensor, uint64_t tile_elems) { + uint64_t total_elems = tensor->shapes[0]; + return static_cast(total_elems / tile_elems); +} + +template +static __aicore__ void add_impl( + __gm__ float* src0, + __gm__ float* src1, + __gm__ float* out) { + + using DynShapeDim5 = Shape<1, 1, 1, ROWS, COLS>; + using DynStridDim5 = Stride<1, 1, 1, COLS, 1>; + using GlobalData = GlobalTensor; + using TileData = Tile; + + TileData src0Tile(ROWS, COLS); + TileData src1Tile(ROWS, COLS); + TileData dstTile(ROWS, COLS); + TASSIGN(src0Tile, 0x0); + TASSIGN(src1Tile, 0x10000); + TASSIGN(dstTile, 0x20000); + + GlobalData src0Global(src0); + GlobalData src1Global(src1); + GlobalData dstGlobal(out); + + TLOAD(src0Tile, src0Global); + TLOAD(src1Tile, src1Global); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TADD(dstTile, src0Tile, src1Tile); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(dstGlobal, dstTile); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); +} + +extern "C" __aicore__ void kernel_entry(__gm__ int64_t* args) { + __gm__ Tensor* src0_tensor = reinterpret_cast<__gm__ Tensor*>(args[3]); + __gm__ Tensor* src1_tensor = reinterpret_cast<__gm__ Tensor*>(args[4]); + __gm__ Tensor* out_tensor = reinterpret_cast<__gm__ Tensor*>(args[5]); + + constexpr uint64_t TILE_ELEMS = 128 * 128; + int num_tiles = get_num_tiles(src0_tensor, 
TILE_ELEMS); + + __gm__ float* base_src0 = reinterpret_cast<__gm__ float*>(src0_tensor->buffer.addr) + src0_tensor->start_offset; + __gm__ float* base_src1 = reinterpret_cast<__gm__ float*>(src1_tensor->buffer.addr) + src1_tensor->start_offset; + __gm__ float* base_out = reinterpret_cast<__gm__ float*>(out_tensor->buffer.addr) + out_tensor->start_offset; + + for (int tile_idx = 0; tile_idx < num_tiles; tile_idx++) { + __gm__ float* src0_ptr = base_src0 + (tile_idx * TILE_ELEMS); + __gm__ float* src1_ptr = base_src1 + (tile_idx * TILE_ELEMS); + __gm__ float* out_ptr = base_out + (tile_idx * TILE_ELEMS); + + add_impl<128, 128>(src0_ptr, src1_ptr, out_ptr); + } +} diff --git a/examples/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_add_standalone.cpp b/examples/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_add_standalone.cpp new file mode 100644 index 00000000..4475907e --- /dev/null +++ b/examples/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_add_standalone.cpp @@ -0,0 +1,74 @@ +/** + * Standalone Element-wise Addition Kernel + * + * Implements: out[i] = src0[i] + src1[i] + * Tile size: 128 x 128 + * + * Reads args[0..2] — for standalone AIV_X1 tasks or AIV0 slot in AIV_X2. 
+ * + * Args (Tensor*): + * args[0] = src0 (INPUT) - 128 x 128 + * args[1] = src1 (INPUT) - 128 x 128 + * args[2] = out (OUTPUT) - 128 x 128 + */ + +#include +#include + +#include "tensor.h" + +using namespace pto; + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#define __aicore__ [aicore] +#endif + +template +static __aicore__ void add_impl( + __gm__ float* src0, + __gm__ float* src1, + __gm__ float* out) { + + using DynShapeDim5 = Shape<1, 1, 1, ROWS, COLS>; + using DynStridDim5 = Stride<1, 1, 1, COLS, 1>; + using GlobalData = GlobalTensor; + using TileData = Tile; + + TileData src0Tile(ROWS, COLS); + TileData src1Tile(ROWS, COLS); + TileData dstTile(ROWS, COLS); + TASSIGN(src0Tile, 0x0); + TASSIGN(src1Tile, 0x10000); + TASSIGN(dstTile, 0x20000); + + GlobalData src0Global(src0); + GlobalData src1Global(src1); + GlobalData dstGlobal(out); + + TLOAD(src0Tile, src0Global); + TLOAD(src1Tile, src1Global); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TADD(dstTile, src0Tile, src1Tile); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(dstGlobal, dstTile); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); +} + +extern "C" __aicore__ void kernel_entry(__gm__ int64_t* args) { + __gm__ Tensor* src0_tensor = reinterpret_cast<__gm__ Tensor*>(args[0]); + __gm__ Tensor* src1_tensor = reinterpret_cast<__gm__ Tensor*>(args[1]); + __gm__ Tensor* out_tensor = reinterpret_cast<__gm__ Tensor*>(args[2]); + + __gm__ float* src0 = reinterpret_cast<__gm__ float*>(src0_tensor->buffer.addr) + src0_tensor->start_offset; + __gm__ float* src1 = reinterpret_cast<__gm__ float*>(src1_tensor->buffer.addr) + src1_tensor->start_offset; + __gm__ float* out = reinterpret_cast<__gm__ float*>(out_tensor->buffer.addr) + out_tensor->start_offset; + + add_impl<128, 128>(src0, src1, out); +} diff --git 
a/examples/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_mul.cpp b/examples/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_mul.cpp new file mode 100644 index 00000000..d5117419 --- /dev/null +++ b/examples/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_mul.cpp @@ -0,0 +1,90 @@ +/** + * Element-wise Tensor Multiplication Kernel (for mixed task, AIV1 slot) + * + * Implements: out[i] = src0[i] * src1[i] + * Tile size: 128 x 128 + * + * In the mixed task, this kernel occupies the AIV1 slot and shares the param + * list with the matmul kernel (args[0..2]) and add kernel (args[3..5]). + * This kernel uses args[6..8]. + * + * Args (Tensor*): + * args[6] = src0 (INPUT) - 128 x 128 + * args[7] = src1 (INPUT) - 128 x 128 + * args[8] = out (OUTPUT) - 128 x 128 + */ + +#include +#include + +#include "tensor.h" + +using namespace pto; + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#define __aicore__ [aicore] +#endif + +static __aicore__ inline int get_num_tiles(__gm__ Tensor* tensor, uint64_t tile_elems) { + uint64_t total_elems = tensor->shapes[0]; + return static_cast(total_elems / tile_elems); +} + +template +static __aicore__ void mul_impl( + __gm__ float* src0, + __gm__ float* src1, + __gm__ float* out) { + + using DynShapeDim5 = Shape<1, 1, 1, ROWS, COLS>; + using DynStridDim5 = Stride<1, 1, 1, COLS, 1>; + using GlobalData = GlobalTensor; + using TileData = Tile; + + TileData src0Tile(ROWS, COLS); + TileData src1Tile(ROWS, COLS); + TileData dstTile(ROWS, COLS); + TASSIGN(src0Tile, 0x0); + TASSIGN(src1Tile, 0x10000); + TASSIGN(dstTile, 0x20000); + + GlobalData src0Global(src0); + GlobalData src1Global(src1); + GlobalData dstGlobal(out); + + TLOAD(src0Tile, src0Global); + TLOAD(src1Tile, src1Global); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TMUL(dstTile, src0Tile, src1Tile); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(dstGlobal, 
dstTile); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); +} + +extern "C" __aicore__ void kernel_entry(__gm__ int64_t* args) { + __gm__ Tensor* src0_tensor = reinterpret_cast<__gm__ Tensor*>(args[6]); + __gm__ Tensor* src1_tensor = reinterpret_cast<__gm__ Tensor*>(args[7]); + __gm__ Tensor* out_tensor = reinterpret_cast<__gm__ Tensor*>(args[8]); + + constexpr uint64_t TILE_ELEMS = 128 * 128; + int num_tiles = get_num_tiles(src0_tensor, TILE_ELEMS); + + __gm__ float* base_src0 = reinterpret_cast<__gm__ float*>(src0_tensor->buffer.addr) + src0_tensor->start_offset; + __gm__ float* base_src1 = reinterpret_cast<__gm__ float*>(src1_tensor->buffer.addr) + src1_tensor->start_offset; + __gm__ float* base_out = reinterpret_cast<__gm__ float*>(out_tensor->buffer.addr) + out_tensor->start_offset; + + for (int tile_idx = 0; tile_idx < num_tiles; tile_idx++) { + __gm__ float* src0_ptr = base_src0 + (tile_idx * TILE_ELEMS); + __gm__ float* src1_ptr = base_src1 + (tile_idx * TILE_ELEMS); + __gm__ float* out_ptr = base_out + (tile_idx * TILE_ELEMS); + + mul_impl<128, 128>(src0_ptr, src1_ptr, out_ptr); + } +} diff --git a/examples/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_mul_standalone.cpp b/examples/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_mul_standalone.cpp new file mode 100644 index 00000000..3b44b721 --- /dev/null +++ b/examples/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_mul_standalone.cpp @@ -0,0 +1,74 @@ +/** + * Standalone Element-wise Multiplication Kernel (AIV1 slot) + * + * Implements: out[i] = src0[i] * src1[i] + * Tile size: 128 x 128 + * + * Reads args[3..5] — for AIV1 slot in AIV_X2 tasks where AIV0 uses args[0..2]. 
+ * + * Args (Tensor*): + * args[3] = src0 (INPUT) - 128 x 128 + * args[4] = src1 (INPUT) - 128 x 128 + * args[5] = out (OUTPUT) - 128 x 128 + */ + +#include +#include + +#include "tensor.h" + +using namespace pto; + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#define __aicore__ [aicore] +#endif + +template +static __aicore__ void mul_impl( + __gm__ float* src0, + __gm__ float* src1, + __gm__ float* out) { + + using DynShapeDim5 = Shape<1, 1, 1, ROWS, COLS>; + using DynStridDim5 = Stride<1, 1, 1, COLS, 1>; + using GlobalData = GlobalTensor; + using TileData = Tile; + + TileData src0Tile(ROWS, COLS); + TileData src1Tile(ROWS, COLS); + TileData dstTile(ROWS, COLS); + TASSIGN(src0Tile, 0x0); + TASSIGN(src1Tile, 0x10000); + TASSIGN(dstTile, 0x20000); + + GlobalData src0Global(src0); + GlobalData src1Global(src1); + GlobalData dstGlobal(out); + + TLOAD(src0Tile, src0Global); + TLOAD(src1Tile, src1Global); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TMUL(dstTile, src0Tile, src1Tile); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(dstGlobal, dstTile); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); +} + +extern "C" __aicore__ void kernel_entry(__gm__ int64_t* args) { + __gm__ Tensor* src0_tensor = reinterpret_cast<__gm__ Tensor*>(args[3]); + __gm__ Tensor* src1_tensor = reinterpret_cast<__gm__ Tensor*>(args[4]); + __gm__ Tensor* out_tensor = reinterpret_cast<__gm__ Tensor*>(args[5]); + + __gm__ float* src0 = reinterpret_cast<__gm__ float*>(src0_tensor->buffer.addr) + src0_tensor->start_offset; + __gm__ float* src1 = reinterpret_cast<__gm__ float*>(src1_tensor->buffer.addr) + src1_tensor->start_offset; + __gm__ float* out = reinterpret_cast<__gm__ float*>(out_tensor->buffer.addr) + out_tensor->start_offset; + + mul_impl<128, 128>(src0, src1, out); +} diff --git 
a/examples/tensormap_and_ringbuffer/mixed_example/kernels/kernel_config.py b/examples/tensormap_and_ringbuffer/mixed_example/kernels/kernel_config.py new file mode 100644 index 00000000..4637f3ce --- /dev/null +++ b/examples/tensormap_and_ringbuffer/mixed_example/kernels/kernel_config.py @@ -0,0 +1,58 @@ +""" +Kernel configuration for mixed AIC+AIV example (tensormap_and_ringbuffer Runtime). + +Covers all 5 resource shapes: + - AIC_ONLY: standalone matmul + - AIV_X1: standalone add + - AIV_X2: add (AIV0) + mul (AIV1) + - AIC_AIV_X1: matmul (AIC) + add (AIV0) + - AIC_AIV_X2: matmul (AIC) + add (AIV0) + mul (AIV1) +""" + +from pathlib import Path + +_KERNELS_ROOT = Path(__file__).parent + +ORCHESTRATION = { + "source": str(_KERNELS_ROOT / "orchestration" / "mixed_orch.cpp"), + "function_name": "aicpu_orchestration_entry", +} + +KERNELS = [ + { + "func_id": 0, + "name": "MATMUL", + "source": str(_KERNELS_ROOT / "aic" / "kernel_matmul.cpp"), + "core_type": "aic", + }, + { + "func_id": 1, + "name": "ADD", + "source": str(_KERNELS_ROOT / "aiv" / "kernel_add.cpp"), + "core_type": "aiv", + }, + { + "func_id": 2, + "name": "MUL", + "source": str(_KERNELS_ROOT / "aiv" / "kernel_mul.cpp"), + "core_type": "aiv", + }, + { + "func_id": 3, + "name": "ADD_STANDALONE", + "source": str(_KERNELS_ROOT / "aiv" / "kernel_add_standalone.cpp"), + "core_type": "aiv", + }, + { + "func_id": 4, + "name": "MUL_STANDALONE", + "source": str(_KERNELS_ROOT / "aiv" / "kernel_mul_standalone.cpp"), + "core_type": "aiv", + }, +] + +RUNTIME_CONFIG = { + "runtime": "tensormap_and_ringbuffer", + "aicpu_thread_num": 4, + "block_dim": 3, +} diff --git a/examples/tensormap_and_ringbuffer/mixed_example/kernels/orchestration/mixed_orch.cpp b/examples/tensormap_and_ringbuffer/mixed_example/kernels/orchestration/mixed_orch.cpp new file mode 100644 index 00000000..a97753ec --- /dev/null +++ b/examples/tensormap_and_ringbuffer/mixed_example/kernels/orchestration/mixed_orch.cpp @@ -0,0 +1,221 @@ +/** + * Mixed 
AIC+AIV Orchestration Function (tensormap_and_ringbuffer Runtime) + * + * Covers all 5 resource shapes per iteration: + * 1. AIC_AIV_X2: AIC matmul(A,B->C) + AIV0 add(D,E->F) + AIV1 mul(G,H->I) + * 2. AIC_ONLY: matmul(A,B->J) + * 3. AIV_X1: add(D,E->K) + * 4. AIV_X2: AIV0 add(D,E->L) + AIV1 mul(G,H->M) + * 5. AIC_AIV_X1: AIC matmul(A,B->N) + AIV0 add(D,E->O) + * + * Args layout (30 args): + * [ptr_A, ptr_B, ptr_C, ptr_D, ptr_E, ptr_F, + * ptr_G, ptr_H, ptr_I, ptr_J, ptr_K, ptr_L, + * ptr_M, ptr_N, ptr_O, + * size_A, size_B, size_C, size_D, size_E, size_F, + * size_G, size_H, size_I, size_J, size_K, size_L, + * size_M, size_N, size_O] + */ + +#include +#include + +#include "pto_orchestration_api.h" + +// Mixed-task kernels (args offset matches param position in mixed param list) +#define FUNC_MATMUL 0 // AIC: reads args[0..2] +#define FUNC_ADD 1 // AIV0 in mixed: reads args[3..5] +#define FUNC_MUL 2 // AIV1 in mixed: reads args[6..8] +// Standalone kernels (read args[0..2] or args[3..5]) +#define FUNC_ADD_STANDALONE 3 // AIV: reads args[0..2] +#define FUNC_MUL_STANDALONE 4 // AIV1 in AIV_X2: reads args[3..5] + +#define ARG_PTR_A 0 +#define ARG_PTR_B 1 +#define ARG_PTR_C 2 +#define ARG_PTR_D 3 +#define ARG_PTR_E 4 +#define ARG_PTR_F 5 +#define ARG_PTR_G 6 +#define ARG_PTR_H 7 +#define ARG_PTR_I 8 +#define ARG_PTR_J 9 +#define ARG_PTR_K 10 +#define ARG_PTR_L 11 +#define ARG_PTR_M 12 +#define ARG_PTR_N 13 +#define ARG_PTR_O 14 +#define ARG_SIZE_A 15 +#define ARG_SIZE_B 16 +#define ARG_SIZE_C 17 +#define ARG_SIZE_D 18 +#define ARG_SIZE_E 19 +#define ARG_SIZE_F 20 +#define ARG_SIZE_G 21 +#define ARG_SIZE_H 22 +#define ARG_SIZE_I 23 +#define ARG_SIZE_J 24 +#define ARG_SIZE_K 25 +#define ARG_SIZE_L 26 +#define ARG_SIZE_M 27 +#define ARG_SIZE_N 28 +#define ARG_SIZE_O 29 + +static constexpr uint64_t TILE_ELEMS = 128 * 128; + +extern "C" { + +__attribute__((visibility("default"))) +PTO2OrchestrationConfig aicpu_orchestration_config(uint64_t* args, int arg_count) { + 
(void)args; + (void)arg_count; + return PTO2OrchestrationConfig{ + .expected_arg_count = 30, + }; +} + +__attribute__((visibility("default"))) +void aicpu_orchestration_entry(PTO2Runtime* rt, uint64_t* args, int arg_count, int orch_thread_num, int orch_thread_index) { + (void)arg_count; + (void)orch_thread_num; + (void)orch_thread_index; + + void* dev_A = (void*)(uintptr_t)args[ARG_PTR_A]; + void* dev_B = (void*)(uintptr_t)args[ARG_PTR_B]; + void* dev_C = (void*)(uintptr_t)args[ARG_PTR_C]; + void* dev_D = (void*)(uintptr_t)args[ARG_PTR_D]; + void* dev_E = (void*)(uintptr_t)args[ARG_PTR_E]; + void* dev_F = (void*)(uintptr_t)args[ARG_PTR_F]; + void* dev_G = (void*)(uintptr_t)args[ARG_PTR_G]; + void* dev_H = (void*)(uintptr_t)args[ARG_PTR_H]; + void* dev_I = (void*)(uintptr_t)args[ARG_PTR_I]; + void* dev_J = (void*)(uintptr_t)args[ARG_PTR_J]; + void* dev_K = (void*)(uintptr_t)args[ARG_PTR_K]; + void* dev_L = (void*)(uintptr_t)args[ARG_PTR_L]; + void* dev_M = (void*)(uintptr_t)args[ARG_PTR_M]; + void* dev_N = (void*)(uintptr_t)args[ARG_PTR_N]; + void* dev_O = (void*)(uintptr_t)args[ARG_PTR_O]; + size_t size_C = (size_t)args[ARG_SIZE_C]; + + int num_iters = (int)(size_C / (TILE_ELEMS * sizeof(float))); + + LOG_INFO(rt, "[mixed_orch] num_iters=%d", num_iters); + + // Input tensors (shared across all tasks) + uint64_t ab_shapes[1] = {TILE_ELEMS}; + Tensor ext_A = make_tensor_external(dev_A, ab_shapes, 1, DataType::FLOAT32); + Tensor ext_B = make_tensor_external(dev_B, ab_shapes, 1, DataType::FLOAT32); + + uint64_t de_shapes[1] = {TILE_ELEMS}; + Tensor ext_D = make_tensor_external(dev_D, de_shapes, 1, DataType::FLOAT32); + Tensor ext_E = make_tensor_external(dev_E, de_shapes, 1, DataType::FLOAT32); + + uint64_t gh_shapes[1] = {TILE_ELEMS}; + Tensor ext_G = make_tensor_external(dev_G, gh_shapes, 1, DataType::FLOAT32); + Tensor ext_H = make_tensor_external(dev_H, gh_shapes, 1, DataType::FLOAT32); + + // Output tensors (full buffers, one slice per iteration) + uint64_t 
out_shapes[1] = {(uint64_t)num_iters * TILE_ELEMS}; + Tensor ext_C = make_tensor_external(dev_C, out_shapes, 1, DataType::FLOAT32); + Tensor ext_F = make_tensor_external(dev_F, out_shapes, 1, DataType::FLOAT32); + Tensor ext_I = make_tensor_external(dev_I, out_shapes, 1, DataType::FLOAT32); + Tensor ext_J = make_tensor_external(dev_J, out_shapes, 1, DataType::FLOAT32); + Tensor ext_K = make_tensor_external(dev_K, out_shapes, 1, DataType::FLOAT32); + Tensor ext_L = make_tensor_external(dev_L, out_shapes, 1, DataType::FLOAT32); + Tensor ext_M = make_tensor_external(dev_M, out_shapes, 1, DataType::FLOAT32); + Tensor ext_N = make_tensor_external(dev_N, out_shapes, 1, DataType::FLOAT32); + Tensor ext_O = make_tensor_external(dev_O, out_shapes, 1, DataType::FLOAT32); + + for (int i = 0; i < num_iters; i++) { + PTO2_SCOPE(rt) { + uint64_t view_shapes[1] = {TILE_ELEMS}; + uint64_t view_offsets[1] = {(uint64_t)i * TILE_ELEMS}; + + Tensor C_view = ext_C.view(view_shapes, view_offsets); + Tensor F_view = ext_F.view(view_shapes, view_offsets); + Tensor I_view = ext_I.view(view_shapes, view_offsets); + Tensor J_view = ext_J.view(view_shapes, view_offsets); + Tensor K_view = ext_K.view(view_shapes, view_offsets); + Tensor L_view = ext_L.view(view_shapes, view_offsets); + Tensor M_view = ext_M.view(view_shapes, view_offsets); + Tensor N_view = ext_N.view(view_shapes, view_offsets); + Tensor O_view = ext_O.view(view_shapes, view_offsets); + + // 1. AIC_AIV_X2: matmul + add + mul + { + MixedKernels mk; + mk.aic_kernel_id = FUNC_MATMUL; + mk.aiv0_kernel_id = FUNC_ADD; + mk.aiv1_kernel_id = FUNC_MUL; + PTOParam params[9] = { + make_input_param(ext_A), + make_input_param(ext_B), + make_output_param(C_view), + make_input_param(ext_D), + make_input_param(ext_E), + make_output_param(F_view), + make_input_param(ext_G), + make_input_param(ext_H), + make_output_param(I_view), + }; + pto2_rt_submit_task(rt, mk, params, 9); + } + + // 2. 
AIC_ONLY: standalone matmul + { + PTOParam params[3] = { + make_input_param(ext_A), + make_input_param(ext_B), + make_output_param(J_view), + }; + pto2_rt_submit_aic_task(rt, FUNC_MATMUL, params, 3); + } + + // 3. AIV_X1: standalone add + { + PTOParam params[3] = { + make_input_param(ext_D), + make_input_param(ext_E), + make_output_param(K_view), + }; + pto2_rt_submit_aiv_task(rt, FUNC_ADD_STANDALONE, params, 3); + } + + // 4. AIV_X2: add (AIV0) + mul (AIV1) + { + MixedKernels mk; + mk.aiv0_kernel_id = FUNC_ADD_STANDALONE; + mk.aiv1_kernel_id = FUNC_MUL_STANDALONE; + PTOParam params[6] = { + make_input_param(ext_D), + make_input_param(ext_E), + make_output_param(L_view), + make_input_param(ext_G), + make_input_param(ext_H), + make_output_param(M_view), + }; + pto2_rt_submit_task(rt, mk, params, 6); + } + + // 5. AIC_AIV_X1: matmul (AIC) + add (AIV0) + { + MixedKernels mk; + mk.aic_kernel_id = FUNC_MATMUL; + mk.aiv0_kernel_id = FUNC_ADD; + PTOParam params[6] = { + make_input_param(ext_A), + make_input_param(ext_B), + make_output_param(N_view), + make_input_param(ext_D), + make_input_param(ext_E), + make_output_param(O_view), + }; + pto2_rt_submit_task(rt, mk, params, 6); + } + } + } + + LOG_INFO(rt, "[mixed_orch] Submitted %d iterations x 5 shapes = %d tasks", num_iters, num_iters * 5); +} + +} // extern "C" diff --git a/examples/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp b/examples/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp index 9184031e..9bc691a9 100644 --- a/examples/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp +++ b/examples/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp @@ -141,7 +141,7 @@ void aicpu_orchestration_entry(PTO2Runtime* rt, uint64_t* args, int arg_count, i make_output_param(li_update), make_output_param(mi_update), }; - pto2_rt_submit_task(rt, FUNC_AIV_HUB, PTO2_WORKER_VECTOR, 
params_inplace, 3); // create_inplace + pto2_rt_submit_aiv_task(rt, FUNC_AIV_HUB, params_inplace, 3); // create_inplace for (uint64_t bn = 0; bn < bn_this_batch; bn++) { uint64_t cur_block_idx = host_block_table[b_idx * block_num + bn]; @@ -160,7 +160,7 @@ void aicpu_orchestration_entry(PTO2Runtime* rt, uint64_t* args, int arg_count, i make_input_param(kj), make_output_param(sij), }; - pto2_rt_submit_task(rt, FUNC_QK_MATMUL, PTO2_WORKER_CUBE, params_qk, 3); // c1 + pto2_rt_submit_aic_task(rt, FUNC_QK_MATMUL, params_qk, 3); // c1 uint64_t sij_valid_shapes[2] = {q_tile, valid_len}; uint64_t sij_valid_offsets[2] = {0, 0}; @@ -174,7 +174,7 @@ void aicpu_orchestration_entry(PTO2Runtime* rt, uint64_t* args, int arg_count, i make_output_param(mi), make_output_param(li), }; - pto2_rt_submit_task(rt, FUNC_SOFTMAX_PREPARE, PTO2_WORKER_VECTOR, params_sf, 5); // v1 + pto2_rt_submit_aiv_task(rt, FUNC_SOFTMAX_PREPARE, params_sf, 5); // v1 uint64_t oi_tmp_shapes[2] = {q_tile, head_dim}; Tensor oi_tmp = make_tensor(oi_tmp_shapes, 2, DataType::FLOAT32); @@ -184,7 +184,7 @@ void aicpu_orchestration_entry(PTO2Runtime* rt, uint64_t* args, int arg_count, i make_input_param(vj), make_output_param(oi_tmp), }; - pto2_rt_submit_task(rt, FUNC_PV_MATMUL, PTO2_WORKER_CUBE, params_pv, 3); // c2 + pto2_rt_submit_aic_task(rt, FUNC_PV_MATMUL, params_pv, 3); // c2 uint64_t is_first = (bn == 0) ? 1 : 0; uint64_t is_last = (bn == bn_this_batch - 1) ? 
1 : 0; @@ -200,7 +200,7 @@ void aicpu_orchestration_entry(PTO2Runtime* rt, uint64_t* args, int arg_count, i make_scalar_param(is_first), make_scalar_param(is_last), }; - pto2_rt_submit_task(rt, FUNC_ONLINE_UPDATE, PTO2_WORKER_VECTOR, params_up, 9); // v2 + pto2_rt_submit_aiv_task(rt, FUNC_ONLINE_UPDATE, params_up, 9); // v2 } } } diff --git a/examples/tensormap_and_ringbuffer/vector_example/kernels/orchestration/example_orchestration.cpp b/examples/tensormap_and_ringbuffer/vector_example/kernels/orchestration/example_orchestration.cpp index 4e6df402..c55eccf5 100644 --- a/examples/tensormap_and_ringbuffer/vector_example/kernels/orchestration/example_orchestration.cpp +++ b/examples/tensormap_and_ringbuffer/vector_example/kernels/orchestration/example_orchestration.cpp @@ -107,7 +107,7 @@ void aicpu_orchestration_entry(PTO2Runtime* rt, uint64_t* args, int arg_count, i make_input_param(ext_b), make_output_param(c), }; - pto2_rt_submit_task(rt, 0, PTO2_WORKER_VECTOR, params_t0, 3); // kernel_add + pto2_rt_submit_aiv_task(rt, 0, params_t0, 3); // kernel_add // Inner scope: owns t1, t2, t3, t4; intermediates d, e, g release on scope end. // c flows in from outer scope (outer-scope tensors are visible to inner scopes). 
@@ -123,7 +123,7 @@ void aicpu_orchestration_entry(PTO2Runtime* rt, uint64_t* args, int arg_count, i make_output_param(d), make_scalar_param((uint64_t)3), }; - pto2_rt_submit_task(rt, 1, PTO2_WORKER_VECTOR, params_t1, 3); // kernel_add_scalar + pto2_rt_submit_aiv_task(rt, 1, params_t1, 3); // kernel_add_scalar // t2: e = c + 2 (kernel_id=1, kernel_add_scalar) PTOParam params_t2[] = { @@ -132,7 +132,7 @@ void aicpu_orchestration_entry(PTO2Runtime* rt, uint64_t* args, int arg_count, i make_output_param(e), make_scalar_param((uint64_t)3), }; - pto2_rt_submit_task(rt, 1, PTO2_WORKER_VECTOR, params_t2, 3); // kernel_add_scalar + pto2_rt_submit_aiv_task(rt, 1, params_t2, 3); // kernel_add_scalar // t3: g = d * e (kernel_id=2, kernel_mul) PTOParam params_t3[] = { @@ -141,7 +141,7 @@ void aicpu_orchestration_entry(PTO2Runtime* rt, uint64_t* args, int arg_count, i make_output_param(g), make_scalar_param((uint64_t)3), }; - pto2_rt_submit_task(rt, 2, PTO2_WORKER_VECTOR, params_t3, 3); // kernel_mul + pto2_rt_submit_aiv_task(rt, 2, params_t3, 3); // kernel_mul // t4: f = g + c (kernel_id=0, kernel_add) PTOParam params_t4[] = { @@ -149,7 +149,7 @@ void aicpu_orchestration_entry(PTO2Runtime* rt, uint64_t* args, int arg_count, i make_input_param(c), make_output_param(ext_f), }; - pto2_rt_submit_task(rt, 0, PTO2_WORKER_VECTOR, params_t4, 3); // kernel_add + pto2_rt_submit_aiv_task(rt, 0, params_t4, 3); // kernel_add } // inner scope ends: releases d, e, g } diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/aicore/aicore_executor.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/aicore/aicore_executor.cpp index c687086b..b9057a99 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/aicore/aicore_executor.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/aicore/aicore_executor.cpp @@ -106,7 +106,7 @@ __aicore__ __attribute__((weak)) void aicore_execute(__gm__ Runtime* runtime, in __gm__ PTO2DispatchPayload* payload = my_payload; - write_reg(RegId::COND, 
MAKE_ACK_VALUE(payload->task_id)); + write_reg(RegId::COND, MAKE_ACK_VALUE(payload->mixed_task_id)); // Performance profiling: record start time uint64_t start_time = 0; @@ -121,13 +121,13 @@ __aicore__ __attribute__((weak)) void aicore_execute(__gm__ Runtime* runtime, in if (profiling_enabled) { uint64_t end_time = get_sys_cnt_aicore(); __gm__ PerfBuffer* perf_buf = (__gm__ PerfBuffer*)my_hank->perf_records_addr; - perf_aicore_record_task(perf_buf, payload->task_id, payload->kernel_id, + perf_aicore_record_task(perf_buf, payload->mixed_task_id, payload->kernel_id, start_time, end_time, kernel_ready_time, core_type); } last_task_id = task_id; - write_reg(RegId::COND, MAKE_FIN_VALUE(payload->task_id)); + write_reg(RegId::COND, MAKE_FIN_VALUE(payload->mixed_task_id)); } } diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp index 89f13b47..69ddda44 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp @@ -98,16 +98,57 @@ struct CoreTypeTracker { idle[idle_count++] = running[idx]; running[idx] = running[--running_count]; } + + int32_t find_idle_index(int32_t core_id) { + for (int32_t i = 0; i < idle_count; i++) { + if (idle[i] == core_id) return i; + } + return -1; + } +}; + +struct Cluster { + int32_t aic_core_id; + int32_t aiv_core_ids[2]; }; struct CoreStateTracker { CoreTypeTracker by_type[2]; // indexed by static_cast(CoreType) + Cluster clusters[MAX_AIC_PER_THREAD]; + int32_t cluster_count; + bool core_idle[MAX_CORES_PER_THREAD]; CoreTypeTracker& aic() { return by_type[0]; } CoreTypeTracker& aiv() { return by_type[1]; } template CoreTypeTracker& get() { return by_type[static_cast(CT)]; } + + int32_t find_cluster_for_shape(PTO2ResourceShape shape) { + for (int32_t i = 0; i < cluster_count; i++) { + Cluster& c = clusters[i]; + switch (shape) { + case 
PTO2ResourceShape::AIC_ONLY: + if (core_idle[c.aic_core_id]) return i; + break; + case PTO2ResourceShape::AIV_X1: + if (core_idle[c.aiv_core_ids[0]] || core_idle[c.aiv_core_ids[1]]) return i; + break; + case PTO2ResourceShape::AIV_X2: + if (core_idle[c.aiv_core_ids[0]] && core_idle[c.aiv_core_ids[1]]) return i; + break; + case PTO2ResourceShape::AIC_AIV_X1: + if (core_idle[c.aic_core_id] && + (core_idle[c.aiv_core_ids[0]] || core_idle[c.aiv_core_ids[1]])) return i; + break; + case PTO2ResourceShape::AIC_AIV_X2: + if (core_idle[c.aic_core_id] && + core_idle[c.aiv_core_ids[0]] && core_idle[c.aiv_core_ids[1]]) return i; + break; + } + } + return -1; + } }; struct AicpuExecutor { @@ -188,16 +229,19 @@ struct AicpuExecutor { void diagnose_stuck_state( Runtime* runtime, int32_t thread_idx, const int32_t* cur_thread_cores, int32_t core_num, Handshake* hank); - // Build PTO2DispatchPayload from PTO2TaskDescriptor. - template + // Build PTO2DispatchPayload from PTO2TaskDescriptor for a specific subtask slot. 
void build_pto2_payload(PTO2DispatchPayload* out, Runtime* runtime, PTO2TaskDescriptor* task, - PTO2TaskPayload* task_payload) { - out->task_id = task->task_id; - out->kernel_id = task->kernel_id; - out->core_type = CT; - out->function_bin_addr = runtime->get_function_bin_addr(task->kernel_id); + PTO2TaskPayload* task_payload, + PTO2SubtaskSlot subslot, + CoreType core_type) { + int32_t slot_idx = static_cast(subslot); + out->mixed_task_id = task->mixed_task_id; + out->subslot = subslot; + out->kernel_id = task->kernel_id[slot_idx]; + out->core_type = core_type; + out->function_bin_addr = runtime->get_function_bin_addr(task->kernel_id[slot_idx]); int32_t n = 0; for (int32_t i = 0; i < task_payload->param_count; i++) { @@ -216,6 +260,7 @@ struct AicpuExecutor { template void check_running_cores_for_completion(int32_t thread_idx, CoreTypeTracker& ct, + bool* core_idle, Handshake* hank, int32_t* executing_task_ids, int32_t& completed_this_turn, @@ -261,43 +306,49 @@ struct AicpuExecutor { if (done) { executing_task_ids[core_id] = AICPU_TASK_INVALID; -#if PTO2_SCHED_PROFILING PTO2DispatchPayload* payload = &s_pto2_payload_per_core[core_id]; - PTO2CompletionStats cstats = rt->scheduler.on_task_complete(task_id, thread_idx, local_bufs); - notify_edges_total += cstats.fanout_edges; - if (cstats.fanout_edges > notify_max_degree) notify_max_degree = cstats.fanout_edges; - notify_tasks_enqueued += cstats.tasks_enqueued; - phase_complete_count++; + int32_t mixed_task_id = payload->mixed_task_id; + PTO2SubtaskSlot subslot = payload->subslot; + + // Two-stage completion: mark subtask done, then handle mixed-task completion + bool mixed_complete = rt->scheduler.on_subtask_complete(mixed_task_id, subslot); + if (mixed_complete) { +#if PTO2_SCHED_PROFILING + PTO2CompletionStats cstats = rt->scheduler.on_mixed_task_complete(mixed_task_id, thread_idx, local_bufs); + notify_edges_total += cstats.fanout_edges; + if (cstats.fanout_edges > notify_max_degree) notify_max_degree = 
cstats.fanout_edges; + notify_tasks_enqueued += cstats.tasks_enqueued; + phase_complete_count++; #elif PTO2_PROFILING - PTO2DispatchPayload* payload = &s_pto2_payload_per_core[core_id]; - PTO2CompletionStats cstats = rt->scheduler.on_task_complete(task_id, local_bufs); - notify_edges_total += cstats.fanout_edges; - if (cstats.fanout_edges > notify_max_degree) notify_max_degree = cstats.fanout_edges; - notify_tasks_enqueued += cstats.tasks_enqueued; - phase_complete_count++; + PTO2CompletionStats cstats = rt->scheduler.on_mixed_task_complete(mixed_task_id, local_bufs); + notify_edges_total += cstats.fanout_edges; + if (cstats.fanout_edges > notify_max_degree) notify_max_degree = cstats.fanout_edges; + notify_tasks_enqueued += cstats.tasks_enqueued; + phase_complete_count++; #else - rt->scheduler.on_task_complete(task_id, local_bufs); + rt->scheduler.on_mixed_task_complete(mixed_task_id, local_bufs); #endif - if (deferred_release_count < 64) { - deferred_release_ids[deferred_release_count++] = task_id; - } else { - DEV_ALWAYS("Thread %d: release", thread_idx); - while (deferred_release_count > 0) { + if (deferred_release_count < 64) { + deferred_release_ids[deferred_release_count++] = mixed_task_id; + } else { + DEV_ALWAYS("Thread %d: release", thread_idx); + while (deferred_release_count > 0) { #if PTO2_SCHED_PROFILING - int32_t fe = - rt->scheduler.on_task_release(deferred_release_ids[--deferred_release_count], thread_idx); + int32_t fe = + rt->scheduler.on_task_release(deferred_release_ids[--deferred_release_count], thread_idx); #else - int32_t fe = rt->scheduler.on_task_release(deferred_release_ids[--deferred_release_count]); + int32_t fe = rt->scheduler.on_task_release(deferred_release_ids[--deferred_release_count]); #endif - (void)fe; + (void)fe; #if PTO2_PROFILING - fanin_edges_total += fe; - if (fe > fanin_max_degree) fanin_max_degree = fe; + fanin_edges_total += fe; + if (fe > fanin_max_degree) fanin_max_degree = fe; #endif + } } } 
ct.move_running_to_idle(i); - + core_idle[core_id] = true; #if PTO2_PROFILING if (profiling_enabled) { #if PTO2_SCHED_PROFILING @@ -310,7 +361,7 @@ struct AicpuExecutor { uint32_t count = perf_buf->count; if (count > 0) { PerfRecord* record = &perf_buf->records[count - 1]; - if (record->task_id == static_cast(payload->task_id)) { + if (record->task_id == static_cast(payload->mixed_task_id)) { perf_aicpu_record_dispatch_and_finish_time( record, dispatch_timestamps_[core_id], finish_ts); } @@ -321,114 +372,132 @@ struct AicpuExecutor { } #endif - DEV_DEBUG("Thread %d: %s core %d completed PTO2 task %d", + DEV_DEBUG("Thread %d: %s core %d completed PTO2 task %d (mixed_complete=%d)", thread_idx, CT == CoreType::AIC ? "AIC" : "AIV", core_id, - task_id); + task_id, + mixed_complete ? 1 : 0); cur_thread_completed++; - completed_this_turn++; + if (mixed_complete) { + completed_this_turn++; + } made_progress = true; } } } - template - void dispatch_ready_tasks_to_idle_cores(Runtime* runtime, - int32_t thread_idx, - CoreTypeTracker& ct, - int32_t* executing_task_ids, - bool& made_progress, - PTO2TaskDescriptor* task_descriptors, - PTO2TaskPayload* task_payloads, - int32_t window_mask, - PTO2LocalReadyBuffer* local_bufs -#if PTO2_PROFILING - , - bool profiling_enabled, - uint64_t& pop_hit, - uint64_t& pop_miss, - uint32_t& phase_dispatch_count, - uint64_t& local_dispatch_count, - uint64_t& local_overflow_count -#endif -#if PTO2_SCHED_PROFILING - , - uint64_t& sched_dispatch_pop_cycle, - uint64_t& sched_dispatch_setup_cycle -#endif - ) { - constexpr int ct_idx = static_cast(CT); + static const char* shape_name(PTO2ResourceShape shape) { + switch (shape) { + case PTO2ResourceShape::AIC_ONLY: return "AIC_ONLY"; + case PTO2ResourceShape::AIV_X1: return "AIV_X1"; + case PTO2ResourceShape::AIV_X2: return "AIV_X2"; + case PTO2ResourceShape::AIC_AIV_X1: return "AIC_AIV_X1"; + case PTO2ResourceShape::AIC_AIV_X2: return "AIC_AIV_X2"; + } + return "UNKNOWN"; + } - for (int32_t i = 
ct.idle_count - 1; i >= 0; i--) { - int32_t core_id = ct.idle[i]; + struct ResourceCount { + int32_t aic; + int32_t aiv; + }; + + static constexpr ResourceCount shape_resource_count(PTO2ResourceShape shape) { + constexpr ResourceCount kTable[PTO2_NUM_RESOURCE_SHAPES] = { + {1, 0}, // AIC_ONLY = 0 + {0, 1}, // AIV_X1 = 1 + {0, 2}, // AIV_X2 = 2 + {1, 1}, // AIC_AIV_X1 = 3 + {1, 2}, // AIC_AIV_X2 = 4 + }; + return kTable[static_cast(shape)]; + } + + /** + * Returns the dispatch probe order for a given scheduler thread. + * Widest shapes first to avoid consuming cluster resources with narrow tasks. + * Even/odd threads use different fallback orders (AIC-first vs AIV-first) + * to reduce contention on the same ready queue across adjacent threads. + */ + static const PTO2ResourceShape* get_dispatch_order(int32_t thread_idx) { + // Even threads: AIC-first fallback after widest + static constexpr PTO2ResourceShape kEvenOrder[PTO2_NUM_RESOURCE_SHAPES] = { + PTO2ResourceShape::AIC_AIV_X2, + PTO2ResourceShape::AIC_AIV_X1, + PTO2ResourceShape::AIC_ONLY, + PTO2ResourceShape::AIV_X2, + PTO2ResourceShape::AIV_X1, + }; + // Odd threads: AIV-first fallback after widest + static constexpr PTO2ResourceShape kOddOrder[PTO2_NUM_RESOURCE_SHAPES] = { + PTO2ResourceShape::AIC_AIV_X2, + PTO2ResourceShape::AIV_X2, + PTO2ResourceShape::AIC_AIV_X1, + PTO2ResourceShape::AIV_X1, + PTO2ResourceShape::AIC_ONLY, + }; + return (thread_idx % 2 == 0) ? 
kEvenOrder : kOddOrder; + } + int32_t pop_ready_task(PTO2ResourceShape shape, int32_t thread_idx #if PTO2_PROFILING - int local_count_before = local_bufs[ct_idx].count; + , uint64_t& pop_hit, uint64_t& pop_miss #endif #if PTO2_SCHED_PROFILING - extern uint64_t g_sched_pop_atomic_count[], g_sched_pop_wait_cycle[]; - uint64_t t_pop_start = get_sys_cnt_aicpu(); - int32_t task_id = rt->scheduler.get_ready_task( - local_bufs, - g_sched_pop_atomic_count[thread_idx], g_sched_pop_wait_cycle[thread_idx]); - sched_dispatch_pop_cycle += (get_sys_cnt_aicpu() - t_pop_start); -#else - int32_t task_id = rt->scheduler.get_ready_task(local_bufs); -#endif - if (task_id >= 0) { -#if PTO2_PROFILING - pop_hit++; - phase_dispatch_count++; - if (local_bufs[ct_idx].count < local_count_before) { - local_dispatch_count++; - } + , uint64_t& sched_dispatch_pop_cycle #endif + ) { + (void)thread_idx; #if PTO2_SCHED_PROFILING - uint64_t t_setup_start = get_sys_cnt_aicpu(); + extern uint64_t g_sched_pop_atomic_count[], g_sched_pop_wait_cycle[]; + uint64_t t_pop_start = get_sys_cnt_aicpu(); + int32_t task_id = rt->scheduler.get_ready_task(shape, + g_sched_pop_atomic_count[thread_idx], g_sched_pop_wait_cycle[thread_idx]); + sched_dispatch_pop_cycle += (get_sys_cnt_aicpu() - t_pop_start); +#else + int32_t task_id = rt->scheduler.get_ready_task(shape); #endif - PTO2TaskDescriptor* task = &task_descriptors[task_id & window_mask]; - PTO2TaskPayload* task_pl = &task_payloads[task_id & window_mask]; - PTO2DispatchPayload* payload = &s_pto2_payload_per_core[core_id]; - build_pto2_payload(payload, runtime, task, task_pl); + if (task_id >= 0) { #if PTO2_PROFILING - if (profiling_enabled) { - dispatch_timestamps_[core_id] = get_sys_cnt_aicpu(); - if (core_dispatch_counts_[core_id] >= PLATFORM_PROF_BUFFER_SIZE) { - perf_aicpu_switch_buffer(runtime, core_id, thread_idx); - core_dispatch_counts_[core_id] = 0; - } - core_dispatch_counts_[core_id]++; - } -#endif - write_reg(core_id_to_reg_addr_[core_id], 
RegId::DATA_MAIN_BASE, static_cast(task_id + 1)); - ct.move_idle_to_running(i); - executing_task_ids[core_id] = task_id; - made_progress = true; -#if PTO2_SCHED_PROFILING - sched_dispatch_setup_cycle += (get_sys_cnt_aicpu() - t_setup_start); + pop_hit++; #endif - DEV_DEBUG("Thread %d: Dispatching PTO2 task %d to %s core %d", - thread_idx, - task_id, - CT == CoreType::AIC ? "AIC" : "AIV", - core_id); - } else { + } else { #if PTO2_PROFILING - pop_miss++; + pop_miss++; #endif - break; - } } + return task_id; + } - // Drain remaining local tasks to global queue (idle cores exhausted) - while (local_bufs[ct_idx].count > 0) { - int32_t task_id = local_bufs[ct_idx].pop(); - rt->scheduler.ready_queues[ct_idx].push(task_id); + void dispatch_subtask_to_core( + Runtime* runtime, CoreStateTracker& tracker, int32_t* executing_task_ids, + int32_t core_id, CoreType core_type, + int32_t task_id, PTO2TaskDescriptor* task, PTO2TaskPayload* task_pl, + PTO2SubtaskSlot subslot #if PTO2_PROFILING - local_overflow_count++; + , bool profiling_enabled, int32_t thread_idx #endif + ) { + PTO2DispatchPayload* payload = &s_pto2_payload_per_core[core_id]; + build_pto2_payload(payload, runtime, task, task_pl, subslot, core_type); +#if PTO2_PROFILING + if (profiling_enabled) { + dispatch_timestamps_[core_id] = get_sys_cnt_aicpu(); + if (core_dispatch_counts_[core_id] >= PLATFORM_PROF_BUFFER_SIZE) { + perf_aicpu_switch_buffer(runtime, core_id, thread_idx); + core_dispatch_counts_[core_id] = 0; + } + core_dispatch_counts_[core_id]++; } +#endif + write_reg(core_id_to_reg_addr_[core_id], RegId::DATA_MAIN_BASE, static_cast(task_id + 1)); + + CoreTypeTracker& ct = tracker.by_type[static_cast(core_type)]; + int32_t idle_idx = ct.find_idle_index(core_id); + ct.move_idle_to_running(idle_idx); + tracker.core_idle[core_id] = false; + executing_task_ids[core_id] = task_id; } }; @@ -525,14 +594,13 @@ int32_t AicpuExecutor::handshake_all_cores(Runtime* runtime) { * (Aligned with host_build_graph mechanism) */ 
void AicpuExecutor::assign_cores_to_threads() { - // Determine how many cores each thread gets initially: - // - Mixed mode: distribute among scheduler threads only - // - All-orchestrator mode: distribute among all threads (they all transition to schedulers) + // Cluster-aligned assignment: each cluster = 1 AIC + 2 AIV (adjacent pair) int32_t divisor = (sched_thread_num_ > 0) ? sched_thread_num_ : thread_num_; - int32_t aic_per_thread = aic_count_ / divisor; - int32_t aiv_per_thread = aiv_count_ / divisor; + int32_t cluster_count = aic_count_; + int32_t clusters_per_thread = cluster_count / divisor; - DEV_INFO("Assigning cores: %d AIC per thread, %d AIV per thread", aic_per_thread, aiv_per_thread); + DEV_INFO("Assigning cores: %d clusters, %d per thread (%d AIC, %d AIV)", + cluster_count, clusters_per_thread, aic_count_, aiv_count_); for (int32_t i = 0; i < thread_num_; i++) { for (int32_t j = 0; j < MAX_CORES_PER_THREAD; j++) { @@ -542,6 +610,8 @@ void AicpuExecutor::assign_cores_to_threads() { trackers_[i].aiv().running_count = 0; trackers_[i].aic().idle_count = 0; trackers_[i].aiv().idle_count = 0; + trackers_[i].cluster_count = 0; + memset(trackers_[i].core_idle, 0, sizeof(trackers_[i].core_idle)); } for (int32_t t = 0; t < thread_num_; t++) { @@ -553,31 +623,36 @@ void AicpuExecutor::assign_cores_to_threads() { } int32_t core_idx = 0; + CoreStateTracker& tracker = trackers_[t]; - // Assign AIC cores - int32_t aic_start = t * aic_per_thread; - for (int32_t i = 0; i < aic_per_thread; i++) { - int32_t worker_id = aic_cores_[aic_start + i].worker_id; - core_assignments_[t][core_idx++] = worker_id; - trackers_[t].aic().idle[trackers_[t].aic().idle_count++] = worker_id; - DEV_INFO("Thread %d: assigned AIC worker_id=%d", t, worker_id); - } + for (int32_t c = 0; c < clusters_per_thread; c++) { + int32_t ci = t * clusters_per_thread + c; + int32_t aic_wid = aic_cores_[ci].worker_id; + int32_t aiv0_wid = aiv_cores_[2 * ci].worker_id; + int32_t aiv1_wid = aiv_cores_[2 * 
ci + 1].worker_id; + + tracker.clusters[tracker.cluster_count++] = {aic_wid, {aiv0_wid, aiv1_wid}}; - // Assign AIV cores - int32_t aiv_start = t * aiv_per_thread; - for (int32_t i = 0; i < aiv_per_thread; i++) { - int32_t worker_id = aiv_cores_[aiv_start + i].worker_id; - core_assignments_[t][core_idx++] = worker_id; - trackers_[t].aiv().idle[trackers_[t].aiv().idle_count++] = worker_id; - DEV_INFO("Thread %d: assigned AIV worker_id=%d", t, worker_id); + core_assignments_[t][core_idx++] = aic_wid; + tracker.aic().idle[tracker.aic().idle_count++] = aic_wid; + tracker.core_idle[aic_wid] = true; + + core_assignments_[t][core_idx++] = aiv0_wid; + core_assignments_[t][core_idx++] = aiv1_wid; + tracker.aiv().idle[tracker.aiv().idle_count++] = aiv0_wid; + tracker.aiv().idle[tracker.aiv().idle_count++] = aiv1_wid; + tracker.core_idle[aiv0_wid] = true; + tracker.core_idle[aiv1_wid] = true; + + DEV_INFO("Thread %d: cluster %d (AIC=%d, AIV0=%d, AIV1=%d)", + t, ci, aic_wid, aiv0_wid, aiv1_wid); } core_count_per_thread_[t] = core_idx; - - DEV_INFO("Thread %d: total %d cores", t, core_idx); + DEV_INFO("Thread %d: total %d cores (%d clusters)", t, core_idx, clusters_per_thread); } - thread_cores_num_ = aic_per_thread + aiv_per_thread; + thread_cores_num_ = clusters_per_thread * 3; } /** @@ -586,100 +661,90 @@ void AicpuExecutor::assign_cores_to_threads() { * Writes into new_core_assignments_ / new_core_count_per_thread_. 
*/ void AicpuExecutor::reassign_cores_for_all_threads() { - // Calculate how many AIC/AIV each thread should have + DEV_INFO("Reassigning cores (cluster-aligned) for all %d threads: %d AIC, %d AIV", + thread_num_, aic_count_, aiv_count_); - DEV_INFO("Reassigning cores for all %d threads: %d AIC, %d AIV", thread_num_, aic_count_, aiv_count_); + // Collect running/idle state from all threads before reassignment + int32_t running_cores[128]; + int32_t running_task_ids[128]; + int32_t running_count = 0; - int32_t aic_running_cores[128]; - int32_t aic_running_task_ids[128]; - int32_t aic_idle_cores[128]; - int32_t aic_running_cores_num = 0; - int32_t aic_idle_cores_num = 0; - - int32_t aiv_running_cores[128]; - int32_t aiv_running_task_ids[128]; - int32_t aiv_idle_cores[128]; - int32_t aiv_running_cores_num = 0; - int32_t aiv_idle_cores_num = 0; + bool was_idle[MAX_CORES_PER_THREAD]; + memset(was_idle, 0, sizeof(was_idle)); for (int32_t i = 0; i < thread_num_; i++) { - core_count_per_thread_[i] = 0; for (int32_t j = 0; j < trackers_[i].aic().running_count; j++) { int32_t core_id = trackers_[i].aic().running[j]; - aic_running_cores[aic_running_cores_num] = core_id; - aic_running_task_ids[aic_running_cores_num] = executing_task_ids_[i][core_id]; - aic_running_cores_num++; + running_cores[running_count] = core_id; + running_task_ids[running_count] = executing_task_ids_[i][core_id]; + running_count++; } for (int32_t j = 0; j < trackers_[i].aic().idle_count; j++) { - aic_idle_cores[aic_idle_cores_num++] = trackers_[i].aic().idle[j]; + was_idle[trackers_[i].aic().idle[j]] = true; } for (int32_t j = 0; j < trackers_[i].aiv().running_count; j++) { int32_t core_id = trackers_[i].aiv().running[j]; - aiv_running_cores[aiv_running_cores_num] = core_id; - aiv_running_task_ids[aiv_running_cores_num] = executing_task_ids_[i][core_id]; - aiv_running_cores_num++; + running_cores[running_count] = core_id; + running_task_ids[running_count] = executing_task_ids_[i][core_id]; + 
running_count++; } for (int32_t j = 0; j < trackers_[i].aiv().idle_count; j++) { - aiv_idle_cores[aiv_idle_cores_num++] = trackers_[i].aiv().idle[j]; + was_idle[trackers_[i].aiv().idle[j]] = true; } + } + + // Reset all trackers + for (int32_t i = 0; i < thread_num_; i++) { + core_count_per_thread_[i] = 0; trackers_[i].aic().running_count = 0; trackers_[i].aic().idle_count = 0; trackers_[i].aiv().running_count = 0; trackers_[i].aiv().idle_count = 0; + trackers_[i].cluster_count = 0; + memset(trackers_[i].core_idle, 0, sizeof(trackers_[i].core_idle)); for (int32_t j = 0; j < MAX_CORES_PER_THREAD; j++) { executing_task_ids_[i][j] = AICPU_TASK_INVALID; } } - for (int32_t i = 0; i < aic_count_; i++) { - int32_t thread_idx = i % thread_num_; - int32_t core_id = aic_cores_[i].worker_id; - core_assignments_[thread_idx][core_count_per_thread_[thread_idx]++] = core_id; - bool found = false; - for (int32_t j = 0; j < aic_running_cores_num; j++) { - if (core_id == aic_running_cores[j]) { - trackers_[thread_idx].aic().running[trackers_[thread_idx].aic().running_count++] = core_id; - executing_task_ids_[thread_idx][core_id] = aic_running_task_ids[j]; - found = true; - break; - } - } - if (!found) { - for (int32_t j = 0; j < aic_idle_cores_num; j++) { - if (core_id == aic_idle_cores[j]) { - trackers_[thread_idx].aic().idle[trackers_[thread_idx].aic().idle_count++] = core_id; - break; - } - } - } - } - for (int32_t i = 0; i < aiv_count_; i++) { - int32_t thread_idx = i % thread_num_; - int32_t core_id = aiv_cores_[i].worker_id; - core_assignments_[thread_idx][core_count_per_thread_[thread_idx]++] = core_id; - bool found = false; - for (int32_t j = 0; j < aiv_running_cores_num; j++) { - if (core_id == aiv_running_cores[j]) { - trackers_[thread_idx].aiv().running[trackers_[thread_idx].aiv().running_count++] = core_id; - executing_task_ids_[thread_idx][core_id] = aiv_running_task_ids[j]; - found = true; - break; + + // Restore a single core's running/idle state into its new thread's 
tracker + auto reassign_core = [&](int32_t worker_id, CoreTypeTracker& type_tracker, + CoreStateTracker& tracker, int32_t thread_idx) { + core_assignments_[thread_idx][core_count_per_thread_[thread_idx]++] = worker_id; + for (int32_t j = 0; j < running_count; j++) { + if (running_cores[j] == worker_id) { + type_tracker.running[type_tracker.running_count++] = worker_id; + executing_task_ids_[thread_idx][worker_id] = running_task_ids[j]; + return; } } - if (!found) { - for (int32_t j = 0; j < aiv_idle_cores_num; j++) { - if (core_id == aiv_idle_cores[j]) { - trackers_[thread_idx].aiv().idle[trackers_[thread_idx].aiv().idle_count++] = core_id; - break; - } - } + if (was_idle[worker_id]) { + type_tracker.idle[type_tracker.idle_count++] = worker_id; + tracker.core_idle[worker_id] = true; } + }; + + // Assign whole clusters round-robin across all threads + for (int32_t ci = 0; ci < aic_count_; ci++) { + int32_t t = ci % thread_num_; + CoreStateTracker& tracker = trackers_[t]; + + int32_t aic_wid = aic_cores_[ci].worker_id; + int32_t aiv0_wid = aiv_cores_[2 * ci].worker_id; + int32_t aiv1_wid = aiv_cores_[2 * ci + 1].worker_id; + + tracker.clusters[tracker.cluster_count++] = {aic_wid, {aiv0_wid, aiv1_wid}}; + + reassign_core(aic_wid, tracker.aic(), tracker, t); + reassign_core(aiv0_wid, tracker.aiv(), tracker, t); + reassign_core(aiv1_wid, tracker.aiv(), tracker, t); } // Log final distribution for verification DEV_INFO("Core reassignment complete:"); for (int32_t t = 0; t < thread_num_; t++) { - DEV_INFO(" Thread %d: %d cores (AIC: running=%d idle=%d, AIV: running=%d idle=%d)", - t, core_count_per_thread_[t], + DEV_INFO(" Thread %d: %d cores, %d clusters (AIC: running=%d idle=%d, AIV: running=%d idle=%d)", + t, core_count_per_thread_[t], trackers_[t].cluster_count, trackers_[t].aic().running_count, trackers_[t].aic().idle_count, trackers_[t].aiv().running_count, trackers_[t].aiv().idle_count); } @@ -942,7 +1007,7 @@ int32_t 
AicpuExecutor::resolve_and_dispatch_pto2(Runtime* runtime, int32_t threa if (tracker.aic().running_count > 0) { try_completed = true; check_running_cores_for_completion( - thread_idx, tracker.aic(), hank, executing_task_ids, + thread_idx, tracker.aic(), tracker.core_idle, hank, executing_task_ids, completed_this_turn, cur_thread_completed, made_progress, deferred_release_ids, deferred_release_count, local_bufs @@ -961,7 +1026,7 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime* runtime, int32_t threa if (tracker.aiv().running_count > 0) { try_completed = true; check_running_cores_for_completion( - thread_idx, tracker.aiv(), hank, executing_task_ids, + thread_idx, tracker.aiv(), tracker.core_idle, hank, executing_task_ids, completed_this_turn, cur_thread_completed, made_progress, deferred_release_ids, deferred_release_count, local_bufs @@ -1003,45 +1068,153 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime* runtime, int32_t threa } #endif - // Dispatch: local queue first (zero MPMC operations), then global readyQ + // Phase 2: Local dispatch — drain local_bufs, match to idle clusters (zero MPMC operations) + // Phase 3: Global queue — push overflow to readyQ + fill remaining idle cores from readyQ bool try_pushed = false; - // Process AIC cores: local AIC buffer + global CUBE queue - // Enter when local buffer has tasks (even if no idle cores, to drain to global queue) - // or when idle cores can be filled from global queue - if (local_bufs[0].count > 0 || - (tracker.aic().idle_count > 0 && rt->scheduler.ready_queues[PTO2_WORKER_CUBE].size() > 0)) { - try_pushed = true; - dispatch_ready_tasks_to_idle_cores( - runtime, thread_idx, tracker.aic(), executing_task_ids, made_progress, - task_descriptors, task_payloads, window_mask, - local_bufs + // Local dispatch: drain both per-CoreType local_bufs, match to idle clusters by shape + int32_t overflow_ids[LOCAL_READY_CAP_PER_TYPE * PTO2_LOCAL_DISPATCH_TYPE_NUM]; + int overflow_count = 0; + for (int bi = 
0; bi < PTO2_LOCAL_DISPATCH_TYPE_NUM; bi++) { + while (local_bufs[bi].count > 0) { + int32_t task_id = local_bufs[bi].pop(); + PTO2TaskDescriptor* task = &task_descriptors[task_id & window_mask]; + PTO2ResourceShape shape = pto2_active_mask_to_shape(task->active_mask); + int32_t ci = tracker.find_cluster_for_shape(shape); + + if (ci >= 0) { + try_pushed = true; + PTO2TaskPayload* task_pl = &task_payloads[task_id & window_mask]; + Cluster& c = tracker.clusters[ci]; +#if PTO2_SCHED_PROFILING + uint64_t t_setup_start = get_sys_cnt_aicpu(); +#endif + ResourceCount rc = shape_resource_count(shape); + + if (rc.aic) { + dispatch_subtask_to_core(runtime, tracker, executing_task_ids, + c.aic_core_id, CoreType::AIC, + task_id, task, task_pl, PTO2SubtaskSlot::AIC +#if PTO2_PROFILING + , profiling_enabled, thread_idx +#endif + ); + } + if (rc.aiv >= 1) { + int32_t aiv0 = tracker.core_idle[c.aiv_core_ids[0]] ? c.aiv_core_ids[0] : c.aiv_core_ids[1]; + dispatch_subtask_to_core(runtime, tracker, executing_task_ids, + aiv0, CoreType::AIV, + task_id, task, task_pl, PTO2SubtaskSlot::AIV0 +#if PTO2_PROFILING + , profiling_enabled, thread_idx +#endif + ); + } + if (rc.aiv >= 2) { + dispatch_subtask_to_core(runtime, tracker, executing_task_ids, + c.aiv_core_ids[1], CoreType::AIV, + task_id, task, task_pl, PTO2SubtaskSlot::AIV1 +#if PTO2_PROFILING + , profiling_enabled, thread_idx +#endif + ); + } #if PTO2_PROFILING - , profiling_enabled, pop_hit, pop_miss, phase_dispatch_count - , local_dispatch_count, local_overflow_count + pop_hit++; + phase_dispatch_count++; + local_dispatch_count++; #endif #if PTO2_SCHED_PROFILING - , sched_dispatch_pop_cycle, sched_dispatch_setup_cycle + sched_dispatch_setup_cycle += (get_sys_cnt_aicpu() - t_setup_start); #endif - ); + made_progress = true; + DEV_DEBUG("Thread %d: Dispatching %s task %d to cluster %d (local)", + thread_idx, shape_name(shape), task_id, ci); + } else { + overflow_ids[overflow_count++] = task_id; +#if PTO2_PROFILING + 
local_overflow_count++; +#endif + } + } } - // Process AIV cores: local AIV buffer + global VECTOR queue - if (local_bufs[1].count > 0 || - (tracker.aiv().idle_count > 0 && rt->scheduler.ready_queues[PTO2_WORKER_VECTOR].size() > 0)) { - try_pushed = true; - dispatch_ready_tasks_to_idle_cores( - runtime, thread_idx, tracker.aiv(), executing_task_ids, made_progress, - task_descriptors, task_payloads, window_mask, - local_bufs + // Push overflow to global readyQ (shape-based) + for (int i = 0; i < overflow_count; i++) { + PTO2TaskDescriptor* task = &task_descriptors[overflow_ids[i] & window_mask]; + PTO2ResourceShape shape = pto2_active_mask_to_shape(task->active_mask); + rt->scheduler.ready_queues[static_cast(shape)].push(overflow_ids[i]); + } + + // Phase 3: Global dispatch — fill remaining idle cores from global readyQ (cluster-based) + const PTO2ResourceShape* dispatch_order = get_dispatch_order(thread_idx); + + for (int32_t si = 0; si < PTO2_NUM_RESOURCE_SHAPES; si++) { + PTO2ResourceShape shape = dispatch_order[si]; + if (rt->scheduler.ready_queues[static_cast(shape)].size() == 0) continue; + + while (true) { + int32_t ci = tracker.find_cluster_for_shape(shape); + if (ci < 0) break; + + int32_t task_id = pop_ready_task(shape, thread_idx #if PTO2_PROFILING - , profiling_enabled, pop_hit, pop_miss, phase_dispatch_count - , local_dispatch_count, local_overflow_count + , pop_hit, pop_miss #endif #if PTO2_SCHED_PROFILING - , sched_dispatch_pop_cycle, sched_dispatch_setup_cycle + , sched_dispatch_pop_cycle #endif - ); + ); + if (task_id < 0) break; + + try_pushed = true; +#if PTO2_PROFILING + phase_dispatch_count++; +#endif +#if PTO2_SCHED_PROFILING + uint64_t t_setup_start = get_sys_cnt_aicpu(); +#endif + Cluster& c = tracker.clusters[ci]; + PTO2TaskDescriptor* task = &task_descriptors[task_id & window_mask]; + PTO2TaskPayload* task_pl = &task_payloads[task_id & window_mask]; + ResourceCount rc = shape_resource_count(shape); + + if (rc.aic) { + 
dispatch_subtask_to_core(runtime, tracker, executing_task_ids, + c.aic_core_id, CoreType::AIC, task_id, task, task_pl, + PTO2SubtaskSlot::AIC +#if PTO2_PROFILING + , profiling_enabled, thread_idx +#endif + ); + } + if (rc.aiv >= 1) { + int32_t aiv_id = tracker.core_idle[c.aiv_core_ids[0]] + ? c.aiv_core_ids[0] : c.aiv_core_ids[1]; + dispatch_subtask_to_core(runtime, tracker, executing_task_ids, + aiv_id, CoreType::AIV, task_id, task, task_pl, + PTO2SubtaskSlot::AIV0 +#if PTO2_PROFILING + , profiling_enabled, thread_idx +#endif + ); + } + if (rc.aiv >= 2) { + dispatch_subtask_to_core(runtime, tracker, executing_task_ids, + c.aiv_core_ids[1], CoreType::AIV, task_id, task, task_pl, + PTO2SubtaskSlot::AIV1 +#if PTO2_PROFILING + , profiling_enabled, thread_idx +#endif + ); + } + made_progress = true; +#if PTO2_SCHED_PROFILING + sched_dispatch_setup_cycle += (get_sys_cnt_aicpu() - t_setup_start); +#endif + DEV_DEBUG("Thread %d: Dispatching %s task %d to cluster %d", + thread_idx, shape_name(shape), task_id, ci); + } } #if PTO2_PROFILING @@ -1089,7 +1262,7 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime* runtime, int32_t threa PTO2TaskState st = sched->task_state[slot].load(std::memory_order_relaxed); int32_t rc = sched->fanin_refcount[slot].load(std::memory_order_relaxed); int32_t fi = task_descriptors[slot].fanin_count; - int32_t kid = task_descriptors[slot].kernel_id; + int32_t kid = task_descriptors[slot].kernel_id[0]; if (st >= PTO2_TASK_COMPLETED) continue; // Already done if (st == PTO2_TASK_READY || st == PTO2_TASK_RUNNING) { cnt_inflight++; continue; } // PENDING @@ -1124,7 +1297,7 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime* runtime, int32_t threa int32_t hw_kernel = -1; if (hh->task != 0) { const PTO2DispatchPayload* pl = reinterpret_cast((uintptr_t)hh->task); - hw_task_id = pl->task_id; + hw_task_id = pl->mixed_task_id; hw_kernel = pl->kernel_id; } DEV_ALWAYS(" AIC core[%d] cid=%d sw_task=%d hw_task=%d hw_kernel=%d", @@ -1138,7 
+1311,7 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime* runtime, int32_t threa int32_t hw_kernel = -1; if (hh->task != 0) { const PTO2DispatchPayload* pl = reinterpret_cast((uintptr_t)hh->task); - hw_task_id = pl->task_id; + hw_task_id = pl->mixed_task_id; hw_kernel = pl->kernel_id; } uint64_t cond_reg = read_reg(core_id_to_reg_addr_[cid], RegId::COND); @@ -1147,6 +1320,14 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime* runtime, int32_t threa EXTRACT_TASK_STATE(cond_reg), EXTRACT_TASK_ID(cond_reg), executing_task_ids[cid], hw_task_id, hw_kernel); } + // Dump cluster state + for (int32_t cli = 0; cli < tracker.cluster_count && cli < STALL_DUMP_CORE_MAX; cli++) { + Cluster& cl = tracker.clusters[cli]; + DEV_ALWAYS(" cluster[%d] aic=%d(%s) aiv0=%d(%s) aiv1=%d(%s)", + cli, cl.aic_core_id, tracker.core_idle[cl.aic_core_id] ? "idle" : "busy", + cl.aiv_core_ids[0], tracker.core_idle[cl.aiv_core_ids[0]] ? "idle" : "busy", + cl.aiv_core_ids[1], tracker.core_idle[cl.aiv_core_ids[1]] ? "idle" : "busy"); + } } if (idle_iterations > MAX_IDLE_ITERATIONS) { DEV_ERROR("Thread %d: PTO2 timeout after %d idle iterations", thread_idx, idle_iterations); @@ -1761,13 +1942,17 @@ void AicpuExecutor::diagnose_stuck_state(Runtime* runtime, int32_t thread_idx, DEV_ALWAYS("Progress: %d/%d tasks (%.1f%%)", completed, total, total > 0 ? 
completed * 100.0 / total : 0.0); - uint64_t aic_ready = 0, aiv_ready = 0; + uint64_t aic_ready = 0, aiv_ready = 0, aiv_x2_ready = 0, mixed_x1_ready = 0, mixed_x2_ready = 0; if (rt) { PTO2SchedulerState* sched = &rt->scheduler; - aic_ready = sched->ready_queues[PTO2_WORKER_CUBE].size(); - aiv_ready = sched->ready_queues[PTO2_WORKER_VECTOR].size(); + aic_ready = sched->ready_queues[static_cast(PTO2ResourceShape::AIC_ONLY)].size(); + aiv_ready = sched->ready_queues[static_cast(PTO2ResourceShape::AIV_X1)].size(); + aiv_x2_ready = sched->ready_queues[static_cast(PTO2ResourceShape::AIV_X2)].size(); + mixed_x1_ready = sched->ready_queues[static_cast(PTO2ResourceShape::AIC_AIV_X1)].size(); + mixed_x2_ready = sched->ready_queues[static_cast(PTO2ResourceShape::AIC_AIV_X2)].size(); } - DEV_ALWAYS("Ready Queues: AIC=%lu, AIV=%lu", aic_ready, aiv_ready); + DEV_ALWAYS("Ready Queues: AIC=%lu, AIV=%lu, AIV_X2=%lu, AIC_AIV_X1=%lu, AIC_AIV_X2=%lu", + aic_ready, aiv_ready, aiv_x2_ready, mixed_x1_ready, mixed_x2_ready); int32_t busy_cores = 0; int32_t idle_cores = 0; @@ -1791,7 +1976,7 @@ void AicpuExecutor::diagnose_stuck_state(Runtime* runtime, int32_t thread_idx, DEV_ALWAYS(" Core %d [%s, BUSY]: COND=0x%lx (reg_task_id=%d, reg_state=%s), executing_task_id=%d, kernel_id=%d", core_id, core_type_str, reg_val, reg_task_id, reg_state == TASK_FIN_STATE ? 
"FIN" : "ACK", - payload->task_id, payload->kernel_id); + payload->mixed_task_id, payload->kernel_id); } else { DEV_ALWAYS(" Core %d [%s, BUSY]: COND=0x%lx (reg_task_id=%d, reg_state=%s) but task_id not tracked", core_id, core_type_str, reg_val, reg_task_id, diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/docs/RUNTIME_LOGIC.md b/src/a2a3/runtime/tensormap_and_ringbuffer/docs/RUNTIME_LOGIC.md index 7c302db7..a90ca050 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/docs/RUNTIME_LOGIC.md +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/docs/RUNTIME_LOGIC.md @@ -318,23 +318,31 @@ When `pto2_submit_task` processes parameters: ## 6. Task Descriptor and States -### 6.1 PTO2TaskDescriptor +### 6.1 PTO2TaskDescriptor (Hot Path) | Field | Description | |-------|-------------| -| `task_id` | Monotonically increasing ID | -| `kernel_id` | Function ID (maps to compiled kernel binary) | -| `worker_type` | CUBE (AIC), VECTOR (AIV), AI_CPU, or ACCELERATOR | -| `fanin_head` | Head of fanin dependency list (pointer into DepListPool) | +| `mixed_task_id` | Canonical mixed-task ID (monotonically increasing) | +| `kernel_id[3]` | Per-slot kernel IDs: `[AIC, AIV0, AIV1]`; `INVALID_KERNEL_ID` = inactive | +| `active_mask` | Bitmask of active subtask slots: `bit0=AIC`, `bit1=AIV0`, `bit2=AIV1` | +| `subtask_done_mask` | Atomic bitmask; each subtask sets its done bit on completion | | `fanin_count` | Number of producer dependencies | | `fanout_lock` | Per-task spinlock for concurrent fanout modification | | `fanout_head` | Head of fanout consumer list (pointer, protected by `fanout_lock`) | | `fanout_count` | 1 (scope ref) + number of consumers | | `packed_buffer_base` | Start of packed buffer in GM Heap | | `packed_buffer_end` | End of packed buffer (for heap reclamation) | -| `is_active` | Task slot is in use | -| `params[16]` | Tensor and scalar parameters (`PTOParam` array) | + +### 6.1b PTO2TaskPayload (Cold Path) + +| Field | Description | +|-------|-------------| +| 
`tensors[16]` | Tensor descriptors for parameters | +| `scalar_value[16]` | Scalar parameter values | +| `is_tensor[16]` | Whether each parameter is tensor or scalar | | `param_count` | Number of valid parameters | +| `fanin_tasks[]` | Producer task IDs (used by `on_task_release`) | +| `fanin_actual_count` | Actual fanin count | ### 6.2 Task State Machine @@ -406,8 +414,8 @@ Scopes control the lifetime of intermediate buffers. Each scope: ```cpp PTO2_SCOPE(rt) { // Tasks submitted here belong to this scope - pto2_rt_submit_task(rt, FUNC_QK, PTO2_WORKER_CUBE, params, n); - pto2_rt_submit_task(rt, FUNC_SF, PTO2_WORKER_VECTOR, params, n); + pto2_rt_submit_aic_task(rt, FUNC_QK, params, n); + pto2_rt_submit_aiv_task(rt, FUNC_SF, params, n); } // scope_end: scope reference released from all tasks above ``` @@ -435,11 +443,11 @@ Each scheduler thread runs a tight loop with two main phases: **Phase 1 — Completion Handling**: - Poll register `COND` on each managed core -- When `TASK_FIN_STATE` detected: record completion timestamps, mark `task_state[slot] = COMPLETED`, acquire fanout lock, traverse fanout list (incrementing consumers' `fanin_refcount`), mark `task_state[slot] = CONSUMED`, advance `last_task_alive` watermark +- When `TASK_FIN_STATE` detected: record completion timestamps, call `on_subtask_complete(mixed_task_id, subslot)` to set the done bit; when `subtask_done_mask == active_mask`, trigger `on_mixed_task_complete(mixed_task_id)` which marks `task_state[slot] = COMPLETED`, acquires fanout lock, traverses fanout list (incrementing consumers' `fanin_refcount`), marks `task_state[slot] = CONSUMED`, and advances `last_task_alive` watermark **Phase 2 — Dispatch**: -- For each idle core: pop a task from the ready queue (lock-free MPMC Vyukov queue, one per worker type) -- Build `PTO2DispatchPayload` from `TaskDescriptor` +- For each idle core: pop a task from the matching shape-based ready queue (lock-free MPMC Vyukov queue, one per resource shape) +- Build 
`PTO2DispatchPayload` from `TaskDescriptor` with `mixed_task_id`, `subslot`, `kernel_id`, and `core_type` - Write task pointer to `Handshake.task`, signal AICore via register `DATA_MAIN_BASE` After these phases, the scheduler updates profiling headers and checks for termination (all tasks completed and orchestrator done). @@ -448,9 +456,9 @@ After these phases, the scheduler updates profiling headers and checks for termi Ready queues use a lock-free bounded MPMC (Vyukov) design: -- One `PTO2ReadyQueue` per worker type (4 types: CUBE, VECTOR, AI_CPU, ACCELERATOR) -- **Push**: any thread (orchestrator via `init_task`, or scheduler on completion) pushes newly-ready tasks -- **Pop**: scheduler threads pop from the queue matching the idle core's worker type +- One `PTO2ReadyQueue` per resource shape (5 shapes: `AIC_ONLY`, `AIV_X1`, `AIV_X2`, `AIC_AIV_X1`, `AIC_AIV_X2`) +- **Push**: any thread (orchestrator via `init_task`, or scheduler on completion) pushes newly-ready tasks to the queue matching `pto2_active_mask_to_shape(task->active_mask)` +- **Pop**: scheduler threads pop from the queue matching the idle core's resource shape - Per-slot sequence counters prevent ABA problems - `enqueue_pos` and `dequeue_pos` are on separate cache lines to avoid false sharing @@ -505,8 +513,10 @@ Built by the scheduler from `PTO2TaskDescriptor`: | Field | Description | |-------|-------------| -| `task_id` | Task identifier | -| `kernel_id` | Function ID | +| `mixed_task_id` | Mixed-task identifier (for completion aggregation) | +| `subslot` | Which subtask slot this dispatch represents (`AIC`, `AIV0`, or `AIV1`) | +| `kernel_id` | Function ID for this subtask slot | +| `core_type` | AIC or AIV | | `function_bin_addr` | GM address of compiled kernel binary | | `num_args` | Number of arguments | | `args[]` | Tensor addresses and scalar values | @@ -557,7 +567,9 @@ The orchestration API is defined in `pto_orchestration_api.h`. 
Orchestration cod | Function/Macro | Purpose | |----------------|---------| -| `pto2_rt_submit_task(rt, kernel_id, worker_type, params, n)` | Submit a task with parameters | +| `pto2_rt_submit_task(rt, mixed_kernels, params, n)` | Submit a mixed task with `MixedKernels` struct | +| `pto2_rt_submit_aic_task(rt, kernel_id, params, n)` | Convenience: submit AIC-only task | +| `pto2_rt_submit_aiv_task(rt, kernel_id, params, n)` | Convenience: submit AIV-only task | | `PTO2_SCOPE(rt) { ... }` | RAII scope for buffer lifetime | | `pto2_rt_orchestration_done(rt)` | Signal orchestration complete | | `pto2_rt_init_tensor_pool(rt)` | Initialize tensor pool for `make_tensor()` | @@ -573,14 +585,17 @@ The orchestration API is defined in `pto_orchestration_api.h`. Orchestration cod | `make_inout_param(tensor)` | INOUT parameter — read then written | | `make_scalar_param(value)` | 64-bit scalar parameter | -### 11.3 Worker Types +### 11.3 Resource Shapes + +Tasks are queued by resource shape, which is derived from the `active_mask` in the `MixedKernels` struct: -| Type | Target | -|------|--------| -| `PTO2_WORKER_CUBE` | AIC cores (matrix multiplication) | -| `PTO2_WORKER_VECTOR` | AIV cores (vector operations) | -| `PTO2_WORKER_AI_CPU` | AICPU (scalar ops, control flow) | -| `PTO2_WORKER_ACCELERATOR` | Fixed-function accelerators (DMA, etc.) 
| +| Shape | Active Mask | Description | +|-------|-------------|-------------| +| `AIC_ONLY` | AIC only | AIC cores (matrix multiplication) | +| `AIV_X1` | AIV0 or AIV1 only | Single AIV core (vector operations) | +| `AIV_X2` | AIV0 + AIV1 | Two AIV cores | +| `AIC_AIV_X1` | AIC + one AIV | AIC + single AIV core | +| `AIC_AIV_X2` | AIC + AIV0 + AIV1 | Full cluster (AIC + two AIV cores) | ### 11.4 Orchestration Export Interface diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/docs/SUBMIT_BY_CLUSTER.md b/src/a2a3/runtime/tensormap_and_ringbuffer/docs/SUBMIT_BY_CLUSTER.md new file mode 100644 index 00000000..72619284 --- /dev/null +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/docs/SUBMIT_BY_CLUSTER.md @@ -0,0 +1,226 @@ +# Submit by Cluster - Requirements and Main-Branch-Aligned Design + +## 1. Goal + +Define a single, main-branch-aligned specification for PTO2 cluster submission that combines: + +1. Product requirements (what must be true). +2. Runtime design (how it is implemented on current main baseline). + +The target model is: one submitted graph node is one `MixedTask`, and dispatch/completion is mixed-task-granular. + +## 2. Background and Motivation + +Future Ascend hardware is expected to provide stronger locality within an AICore cluster (`1 AIC + 2 AIV`). +The runtime therefore needs a "submit together, run together" model for related AIC/AIV kernels. + +Legacy per-task submit (`kernel_id + worker_type`) cannot express atomic co-dispatch of multiple kernels to one cluster. + +## 3. Scope + +### In Scope + +1. New orchestration-facing submit API for cluster-aware mixed submission. +2. Runtime/backend scheduler and executor changes to treat a mixed submit as one atomic scheduling unit. +3. Dependency gating, readiness, dispatch, completion, and reclamation at mixed-task granularity. +4. AIV slot equivalence (`AIV0` and `AIV1` are equivalent execution targets). + +### Out of Scope + +1. 
User-facing cluster pinning (`allocate_cluster/free_cluster`-style APIs). +2. New worker types beyond AIC/AIV. +3. Cross-cluster user placement policies. +4. Hardware topology changes beyond `1 AIC + 2 AIV` per cluster. + +## 4. Main-Branch Baseline Constraints + +Design must preserve the current main runtime architecture: + +1. Multi-orchestrator runtime wiring (`orchestrators[]`, `orch_count`, thread-local `pto2_current_orch_idx`). +2. Executor threading split (orchestrator threads vs scheduler threads), and post-orchestrator transition (`transition_requested_` + `reassign_cores_for_all_threads()`). +3. Shared-memory hot/cold split (`PTO2TaskDescriptor` hot + `PTO2TaskPayload` cold). + +## 5. Terminology + +1. `cluster`: one physical unit with `1 AIC + 2 AIV`. +2. `MixedKernels`: 3 submit slots (`AIC`, `AIV0`, `AIV1`) with `INVALID_KERNEL_ID` for inactive slots. +3. `MixedTask`: one runtime graph node created by one submit call. +4. `active_mask`: bitmask of active subtask slots. +5. `resource shape`: normalized lane demand class of a mixed task. + +## 6. API Contract + +```cpp +inline constexpr int32_t INVALID_KERNEL_ID = -1; + +struct MixedKernels { + int32_t aic_kernel_id{INVALID_KERNEL_ID}; + int32_t aiv0_kernel_id{INVALID_KERNEL_ID}; + int32_t aiv1_kernel_id{INVALID_KERNEL_ID}; +}; + +static inline void pto2_rt_submit_task(PTO2Runtime* rt, + const MixedKernels& mixed_kernels, + PTOParam* params, + int32_t num_params); + +static inline void pto2_rt_submit_aic_task(PTO2Runtime* rt, + int32_t kernel_id, + PTOParam* params, + int32_t num_params); + +static inline void pto2_rt_submit_aiv_task(PTO2Runtime* rt, + int32_t kernel_id, + PTOParam* params, + int32_t num_params); +``` + +Rules: + +1. One submit call creates one `MixedTask`. +2. All active slots share the same `params` and `num_params`. +3. At least one slot must be active. +4. `aiv0_kernel_id` and `aiv1_kernel_id` are semantically equivalent. +5. 
Wrappers are orchestration sugar only (inline in orchestration API); no dedicated runtime ops entries. +6. Submit-contract types are defined once in a shared header-only submit-types surface consumed by orchestration and runtime headers. +7. Invalid submits follow existing PTO2 behavior (`always_assert`), not a new recoverable return-code API. + +## 7. Data Model (Requirements + Design) + +`PTO2TaskDescriptor` (hot path) carries mixed-task identity/state: + +1. `mixed_task_id` +2. `active_mask` +3. `subtask_done_mask` +4. `kernel_id[3]` for `(AIC, AIV0, AIV1)` +5. dependency heads/counters and packed-buffer metadata + +`PTO2TaskPayload` (cold path) carries: + +1. shared params/tensors/scalars copied once per mixed submit +2. fanin mixed-task IDs +3. other cold-path submit metadata + +Producer identity in TensorMap is mixed-task ID end-to-end. + +## 8. Scheduling Model + +### 8.1 Resource Shapes + +Runtime uses shape-based ready queues (not worker-type queues): + +1. `AIC_ONLY` +2. `AIV_X1` +3. `AIV_X2` +4. `AIC_AIV_X1` +5. `AIC_AIV_X2` + +Queueing key is normalized resource shape (not raw slot label). + +### 8.2 Atomic Cluster Dispatch + +1. Dispatch decision unit is one mixed task. +2. For multi-slot mixed tasks, partial launch is forbidden. +3. A mixed task is dispatchable only when one local owned cluster can satisfy all required lanes. +4. Compatible mixed tasks may co-reside over time if they use disjoint free lanes. + +### 8.3 Dependency and Completion + +1. Fanin release/readiness remains dependency-correct and graph-level. +2. Two-stage completion: + - `on_subtask_complete(mixed_task_id, subslot)` + - `on_mixed_task_complete(mixed_task_id)` only when `subtask_done_mask == active_mask` +3. Downstream release is triggered once per mixed task completion, not once per subslot. + +## 9. Executor Ownership and Numbering + +### 9.1 Canonical Flattened Numbering (Unchanged) + +Given `block_dim` clusters: + +1. AIC IDs: `[0, block_dim)` +2. 
AIV IDs: `[block_dim, 3 * block_dim)` +3. Cluster `i`: `{i, block_dim + i, 2 * block_dim + i}` + +This project-defined flattened numbering is kept unchanged. + +### 9.2 Cluster Ownership + +1. One cluster must be owned by one scheduler domain/thread at a time. +2. No split-cluster ownership in either: + - initial `assign_cores_to_threads()` + - post-orchestrator `reassign_cores_for_all_threads()` +3. Lane occupancy bookkeeping must remain consistent with ownership after reassignment. + +## 10. Functional Requirements + +### 10.1 Valid Mixed Shapes + +1. AIC only +2. AIV only (1 or 2 AIV lanes) +3. AIC + 1 AIV +4. AIC + 2 AIV + +### 10.2 Runtime Behavior per Submit + +1. Validate submit arguments. +2. Allocate mixed-task ID and initialize descriptor/payload once. +3. Build fanin/fanout at mixed-task granularity. +4. Enqueue by shape when ready. +5. Dispatch all active lanes atomically when resources allow. +6. Aggregate completion and release downstream once. + +## 11. Non-Functional Requirements + +1. Correctness: no dependency violation, no partial mixed-task dispatch. +2. Determinism: dependency-correct ordering preserved; AIV lane choice may vary but remains semantically equivalent. +3. Fairness: resource-aware polling heuristic is allowed; strict starvation-free guarantee across all shapes is not required. +4. Performance: no obvious regression for non-cluster workflows. +5. Observability: lifecycle visibility for submit/ready/dispatch/block/complete. + +## 12. Acceptance Criteria + +Feature is accepted when: + +1. Orchestration compiles and submits via `MixedKernels` API/wrappers. +2. Scheduler dispatches each mixed task as one cluster scheduling decision. +3. Dependencies gate mixed-task readiness correctly. +4. AIV execution remains cluster-local and semantically equivalent across lanes. +5. Existing non-cluster workflows continue to pass without behavior regression. +6. Cluster ownership is never split across scheduler domains before/after transition. 
+ +## 13. Verification Matrix + +Recommended validation coverage: + +1. Mapping correctness for cluster-to-core ID relation. +2. Atomic dispatch for multi-slot shapes. +3. Dependency gating and completion aggregation (`subtask_done_mask == active_mask`). +4. Lane-occupancy co-residency behavior for compatible shapes. +5. Multi-orchestrator and core-transition ownership stability. +6. Invalid submit handling (`always_assert` path). +7. Regression coverage for existing examples/tests. + +Milestone command (device): + +```bash +python examples/scripts/run_example.py \ + -k tests/device_tests/tensormap_and_ringbuffer/batch_paged_attention/kernels \ + -g tests/device_tests/tensormap_and_ringbuffer/batch_paged_attention/golden.py \ + -p a2a3 -d 9 +``` + +Final validation: + +```bash +./ci.sh +``` + +## 14. Resolved Decisions + +1. Legacy orchestration-facing single-task submit is replaced by mixed submit contract. +2. Invalid mixed submits fail with existing submit-time assert behavior. +3. Per-cluster concurrent capacity is lane-occupancy-driven, not a fixed constant. +4. Submit-contract types live in one shared header-only surface. +5. Resource-aware dispatch heuristics are allowed without a strict starvation-free guarantee. + diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/docs/device_log_profiling.md b/src/a2a3/runtime/tensormap_and_ringbuffer/docs/device_log_profiling.md index 3b23d7f7..c619f36a 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/docs/device_log_profiling.md +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/docs/device_log_profiling.md @@ -110,9 +110,9 @@ The scheduler loop runs four phases each iteration. 
Each phase's time is accumul | Phase | What it does | Inline stats | |-------|-------------|-------------| -| **complete** | Polls handshake on each managed core; when a core completes, traverses fanout list (notify consumers) and fanin list (release producers) via `on_task_complete` | `fanout`: edges/max_degree/avg for consumer notification; `fanin`: edges/max_degree/avg for producer release | +| **complete** | Polls handshake on each managed core; when a core completes, calls `on_subtask_complete(mixed_task_id, subslot)` to set the done bit; when `subtask_done_mask == active_mask`, triggers `on_mixed_task_complete` which traverses fanout list (notify consumers) and fanin list (release producers) | `fanout`: edges/max_degree/avg for consumer notification; `fanin`: edges/max_degree/avg for producer release | | **scan** | Updates the perf profiling header with latest scheduler state | — | -| **dispatch** | For each idle core, pops a task from the ready queue via `pto2_scheduler_get_ready_task`, builds the dispatch payload, and writes the task to the core's handshake register | `pop`: `hit` = successful pops (task dispatched), `miss` = empty queue pops, `hit_rate` = hit/(hit+miss) | +| **dispatch** | For each idle core, pops a task from the shape-based ready queue via `get_ready_task(shape)`, builds the dispatch payload, and writes the task to the core's handshake register | `pop`: `hit` = successful pops (task dispatched), `miss` = empty queue pops, `hit_rate` = hit/(hit+miss) | | **idle** | Scheduler loop iteration where no progress was made (no completions, no dispatches) | — | **Interpreting phase percentages:** diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/orchestration/pto_orchestration_api.h b/src/a2a3/runtime/tensormap_and_ringbuffer/orchestration/pto_orchestration_api.h index ee54cbd2..4cc39212 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/orchestration/pto_orchestration_api.h +++ 
b/src/a2a3/runtime/tensormap_and_ringbuffer/orchestration/pto_orchestration_api.h @@ -24,16 +24,7 @@ // Type headers needed by orchestration #include "pto_types.h" // PTOParam, make_input_param, make_output_param, etc. #include "tensor.h" // Tensor, make_tensor, make_tensor_external - -// Worker type constants (duplicated from pto_runtime2_types.h to avoid -// pulling in the full types header with its internal structures) -typedef enum { - PTO2_WORKER_CUBE = 0, - PTO2_WORKER_VECTOR = 1, - PTO2_WORKER_AI_CPU = 2, - PTO2_WORKER_ACCELERATOR = 3, - PTO2_NUM_WORKER_TYPES = 4 -} PTO2WorkerType; +#include "pto_submit_types.h" // MixedKernels, INVALID_KERNEL_ID, subtask slots // ============================================================================= // Ops Table and Opaque Runtime @@ -51,8 +42,7 @@ typedef struct PTO2Runtime PTO2Runtime; * Populated by the runtime; called by orchestration through inline wrappers. */ typedef struct PTO2RuntimeOps { - void (*submit_task)(PTO2Runtime* rt, int32_t kernel_id, - PTO2WorkerType worker_type, + void (*submit_task)(PTO2Runtime* rt, const MixedKernels& mixed_kernels, PTOParam* params, int32_t num_params); void (*scope_begin)(PTO2Runtime* rt); void (*scope_end)(PTO2Runtime* rt); @@ -81,10 +71,29 @@ struct PTO2Runtime { // Inline Convenience Wrappers (call through ops table) // ============================================================================= -static inline void pto2_rt_submit_task(PTO2Runtime* rt, int32_t kernel_id, - PTO2WorkerType worker_type, +static inline void pto2_rt_submit_task(PTO2Runtime* rt, const MixedKernels& mixed_kernels, PTOParam* params, int32_t num_params) { - rt->ops->submit_task(rt, kernel_id, worker_type, params, num_params); + rt->ops->submit_task(rt, mixed_kernels, params, num_params); +} + +/** + * Convenience wrapper: submit an AIC-only task. 
+ */ +static inline void pto2_rt_submit_aic_task(PTO2Runtime* rt, int32_t kernel_id, + PTOParam* params, int32_t num_params) { + MixedKernels mk; + mk.aic_kernel_id = kernel_id; + rt->ops->submit_task(rt, mk, params, num_params); +} + +/** + * Convenience wrapper: submit an AIV-only task (uses AIV0 slot). + */ +static inline void pto2_rt_submit_aiv_task(PTO2Runtime* rt, int32_t kernel_id, + PTOParam* params, int32_t num_params) { + MixedKernels mk; + mk.aiv0_kernel_id = kernel_id; + rt->ops->submit_task(rt, mk, params, num_params); } static inline void pto2_rt_scope_begin(PTO2Runtime* rt) { diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto2_dispatch_payload.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto2_dispatch_payload.h index 385d7bf0..3ad84225 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto2_dispatch_payload.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto2_dispatch_payload.h @@ -12,6 +12,7 @@ #include #include "common/core_type.h" +#include "pto_submit_types.h" /** Max arguments per task; must match RUNTIME_MAX_ARGS and PTO2_MAX_OUTPUTS */ #ifndef PTO2_DISPATCH_MAX_ARGS @@ -23,7 +24,8 @@ * AICPU packs this from PTO2TaskDescriptor; AICore unpacks to run kernel. 
*/ struct PTO2DispatchPayload { - int32_t task_id; /**< Task ID (for completion_queue) */ + int32_t mixed_task_id; /**< Mixed-task ID (for completion aggregation) */ + PTO2SubtaskSlot subslot; /**< Which subtask slot this dispatch represents */ int32_t kernel_id; /**< InCore function id (debug/trace) */ CoreType core_type; /**< AIC or AIV */ uint64_t function_bin_addr; /**< Kernel entry in GM: (UnifiedKernelFunc)function_bin_addr */ diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp index 1f6d9d67..09f909eb 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp @@ -218,10 +218,27 @@ void pto2_scope_end(PTO2OrchestratorState* orch) { // ============================================================================= // Task Submission // ============================================================================= -void pto2_submit_task( - PTO2OrchestratorState* orch, int32_t kernel_id, PTO2WorkerType worker_type, PTOParam* params, int32_t num_params) { +void pto2_submit_mixed_task( + PTO2OrchestratorState* orch, const MixedKernels& mixed_kernels, PTOParam* params, int32_t num_params) { CYCLE_COUNT_START(); + // === Validate submit inputs === + uint8_t active_mask = pto2_mixed_kernels_to_active_mask(mixed_kernels); + always_assert(active_mask != 0 && "MixedKernels must have at least one active slot"); + always_assert((params != nullptr || num_params == 0) && "params must not be null when num_params > 0"); + + // Normalize single-AIV tasks: if only aiv1 is set, move it to the aiv0 slot. + // This guarantees the dispatch path can always use PTO2SubtaskSlot::AIV0 for + // AIV_X1 and AIC_AIV_X1 shapes without inspecting active_mask. 
+ MixedKernels normalized = mixed_kernels; + bool has_aiv0 = (active_mask & PTO2_SUBTASK_MASK_AIV0) != 0; + bool has_aiv1 = (active_mask & PTO2_SUBTASK_MASK_AIV1) != 0; + if (has_aiv1 && !has_aiv0) { + normalized.aiv0_kernel_id = normalized.aiv1_kernel_id; + normalized.aiv1_kernel_id = INVALID_KERNEL_ID; + active_mask = pto2_mixed_kernels_to_active_mask(normalized); + } + // === STEP 0: Sync TensorMap validity and optional cleanup === orch->tensor_map.sync_tensormap(); @@ -238,10 +255,13 @@ void pto2_submit_task( PTO2TaskDescriptor& task = task_ring.get_task_by_slot(slot); PTO2TaskPayload* payload = &orch->sm_handle->task_payloads[slot]; - // Initialize task descriptor - task.task_id = task_id; - task.kernel_id = kernel_id; - task.worker_type = worker_type; + // Initialize mixed-task descriptor + task.mixed_task_id = task_id; + task.kernel_id[static_cast(PTO2SubtaskSlot::AIC)] = normalized.aic_kernel_id; + task.kernel_id[static_cast(PTO2SubtaskSlot::AIV0)] = normalized.aiv0_kernel_id; + task.kernel_id[static_cast(PTO2SubtaskSlot::AIV1)] = normalized.aiv1_kernel_id; + task.active_mask = active_mask; + task.subtask_done_mask.store(0, std::memory_order_relaxed); task.fanin_count = 0; task.fanout_head = nullptr; task.fanout_lock.store(0, std::memory_order_relaxed); @@ -364,7 +384,7 @@ void pto2_submit_task( for (int i = 0; i < num_params; i++) { PTOParamType ptype = params[i].type; if (ptype == PTOParamType::OUTPUT || ptype == PTOParamType::INOUT) { - // Register in TensorMap: this tensor is produced by task_id + // Register in TensorMap: this tensor is produced by task_id (mixed_task_id) orch->tensor_map.insert(payload->tensors[i], task_id, ptype == PTOParamType::OUTPUT); } } @@ -377,7 +397,7 @@ void pto2_submit_task( PTO2SchedulerState* sched = orch->scheduler; // Initialize scheduler state BEFORE adding to producer fanout lists, - // so concurrent on_task_complete can safely access task_state/fanout_refcount. 
+ // so concurrent on_mixed_task_complete can safely access task_state/fanout_refcount. sched->task_state[slot].store(PTO2_TASK_PENDING, std::memory_order_relaxed); sched->fanout_refcount[slot].store(0, std::memory_order_relaxed); @@ -425,7 +445,8 @@ void pto2_submit_task( int32_t new_rc = sched->fanin_refcount[slot].fetch_add(initial_refcount, std::memory_order_acq_rel) + initial_refcount; if (new_rc >= fanin_count + 1) { - sched->ready_queues[task.worker_type].push(task_id); + PTO2ResourceShape shape = pto2_active_mask_to_shape(active_mask); + sched->ready_queues[static_cast(shape)].push(task_id); } #if PTO2_ORCH_PROFILING || PTO2_SCHED_PROFILING // Per producer: fetch_add(fanout_count) + load(task_state) + store(unlock) = 3 atomics diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h index c4fa970c..4fdd1473 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h @@ -20,6 +20,7 @@ #include "pto_ring_buffer.h" #include "pto_runtime2_types.h" +#include "pto_submit_types.h" #include "pto_scheduler.h" #include "pto_shared_memory.h" #include "pto_tensormap.h" @@ -178,9 +179,8 @@ void pto2_scope_end(PTO2OrchestratorState* orch); * @param params Array of task parameters * @param num_params Number of parameters */ -void pto2_submit_task(PTO2OrchestratorState* orch, - int32_t kernel_id, - PTO2WorkerType worker_type, +void pto2_submit_mixed_task(PTO2OrchestratorState* orch, + const MixedKernels& mixed_kernels, PTOParam* params, int32_t num_params); diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h index ed760359..48e52c2f 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h 
@@ -358,7 +358,7 @@ struct PTO2TaskRing { if (active_count < window_size - 1) { int32_t slot = task_id & (window_size - 1); PTO2TaskDescriptor* task = &descriptors[slot]; - task->task_id = task_id; + task->mixed_task_id = task_id; return task_id; } diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp index 76f6ee4a..8ebb0033 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp @@ -26,11 +26,10 @@ void pto2_set_orch_thread_idx(int idx) { // Orchestration Ops Table (function-pointer dispatch for orchestration .so) // ============================================================================= -static void submit_task_impl(PTO2Runtime* rt, int32_t kernel_id, - PTO2WorkerType worker_type, +static void submit_task_impl(PTO2Runtime* rt, const MixedKernels& mixed_kernels, PTOParam* params, int32_t num_params) { - pto2_submit_task(&rt->orchestrators[pto2_current_orch_idx], kernel_id, worker_type, - params, num_params); + pto2_submit_mixed_task(&rt->orchestrators[pto2_current_orch_idx], mixed_kernels, + params, num_params); } void pto2_rt_scope_begin(PTO2Runtime* rt) { diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h index cc3dc170..e09521ce 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h @@ -26,6 +26,7 @@ #define PTO_RUNTIME2_H #include "pto_runtime2_types.h" +#include "pto_submit_types.h" #include "pto_shared_memory.h" #include "pto_ring_buffer.h" #include "pto_tensormap.h" @@ -58,8 +59,7 @@ enum PTO2RuntimeMode { typedef struct PTO2Runtime PTO2Runtime; // forward declare for ops signatures struct PTO2RuntimeOps { - void (*submit_task)(PTO2Runtime* rt, int32_t kernel_id, - 
PTO2WorkerType worker_type, + void (*submit_task)(PTO2Runtime* rt, const MixedKernels& mixed_kernels, PTOParam* params, int32_t num_params); void (*scope_begin)(PTO2Runtime* rt); void (*scope_end)(PTO2Runtime* rt); diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h index 33556475..60c19ecd 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h @@ -20,6 +20,7 @@ #include #include "pto_types.h" +#include "pto_submit_types.h" // ============================================================================= // Profiling Configuration @@ -79,7 +80,7 @@ #define PTO2_SCOPE_TASKS_INIT_CAP 65536 // Initial capacity for scope task buffer // Ready queue -#define PTO2_READY_QUEUE_SIZE 65536 // Per-worker-type queue size (16x larger to avoid queue full) +#define PTO2_READY_QUEUE_SIZE 65536 // Per-shape queue size (16x larger to avoid queue full) // Memory alignment #define PTO2_ALIGN_SIZE 64 // Cache line alignment @@ -282,13 +283,21 @@ struct PTO2DepListEntry { * - Other fields set by Orchestrator, read by Scheduler */ struct PTO2TaskDescriptor { - // Task identification - int32_t task_id; // Unique task identifier (absolute, not wrapped) - int32_t kernel_id; // InCore function to execute - int32_t worker_type; // Target: CUBE, VECTOR, AI_CPU, ACCELERATOR + // Mixed-task identification + int32_t mixed_task_id; // Canonical mixed-task ID + + // Per-slot kernel IDs (INVALID_KERNEL_ID = inactive) + int32_t kernel_id[PTO2_SUBTASK_SLOT_COUNT]; + + // Active subtask mask: bit0=AIC, bit1=AIV0, bit2=AIV1 + uint8_t active_mask; + + // Completion aggregation: each subtask sets its done bit atomically + std::atomic subtask_done_mask; + // Dependency lists (linked list heads - offsets into DepListPool) // Fanin: producers this task depends on (set once at submission) - int32_t 
fanin_count; // Number of producer dependencies + int32_t fanin_count; // Number of producer dependencies // Fanout: consumers that depend on this task (grows as consumers submit) // PROTECTED BY fanout_lock diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.cpp index 16c4ea7f..7cf8ab7f 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.cpp @@ -158,8 +158,8 @@ bool pto2_scheduler_init(PTO2SchedulerState* sched, sched->fanout_refcount[i].store(0, std::memory_order_relaxed); } - // Initialize ready queues - for (int i = 0; i < PTO2_NUM_WORKER_TYPES; i++) { + // Initialize ready queues (one per resource shape) + for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) { if (!pto2_ready_queue_init(&sched->ready_queues[i], PTO2_READY_QUEUE_SIZE)) { // Cleanup on failure for (int j = 0; j < i; j++) { @@ -194,7 +194,7 @@ void pto2_scheduler_destroy(PTO2SchedulerState* sched) { sched->fanout_refcount = nullptr; } - for (int i = 0; i < PTO2_NUM_WORKER_TYPES; i++) { + for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) { pto2_ready_queue_destroy(&sched->ready_queues[i]); } } @@ -217,10 +217,10 @@ void pto2_scheduler_print_stats(PTO2SchedulerState* sched) { void pto2_scheduler_print_queues(PTO2SchedulerState* sched) { LOG_INFO("=== Ready Queues ==="); - const char* worker_names[] = {"CUBE", "VECTOR", "AI_CPU", "ACCELERATOR"}; + const char* shape_names[] = {"AIC_ONLY", "AIV_X1", "AIV_X2", "AIC_AIV_X1", "AIC_AIV_X2"}; - for (int i = 0; i < PTO2_NUM_WORKER_TYPES; i++) { - LOG_INFO(" %s: count=%" PRIu64, worker_names[i], + for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) { + LOG_INFO(" %s: count=%" PRIu64, shape_names[i], sched->ready_queues[i].size()); } diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.h 
b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.h index f94e4466..5d9dd77d 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.h @@ -2,10 +2,11 @@ * PTO Runtime2 - Scheduler Interface * * The Scheduler is responsible for: - * 1. Maintaining per-worker-type ready queues + * 1. Maintaining per-resource-shape ready queues * 2. Tracking task state (PENDING -> READY -> RUNNING -> COMPLETED -> CONSUMED) * 3. Managing fanin/fanout refcounts for dependency resolution * 4. Advancing last_task_alive for heap reclamation + * 5. Two-stage mixed-task completion (subtask done bits → mixed-task complete) * * The Scheduler runs on Device AI_CPU and processes: * - Task state transitions based on fanin_refcount @@ -260,12 +261,13 @@ void pto2_ready_queue_destroy(PTO2ReadyQueue* queue); // ============================================================================= /** - * Statistics returned by on_task_complete + * Statistics returned by mixed-task completion processing */ struct PTO2CompletionStats { int32_t fanout_edges; // Number of fanout edges traversed (notify consumers) int32_t tasks_enqueued; // Number of consumers that became READY int32_t fanin_edges; // Number of fanin edges traversed (release producers) + bool mixed_task_completed; // True only when this callback completed a mixed task }; /** @@ -298,8 +300,8 @@ struct PTO2SchedulerState { std::atomic* fanin_refcount; // Dynamic: counts completed producers std::atomic* fanout_refcount; // Dynamic: counts released references - // Ready queues (one per worker type) - PTO2ReadyQueue ready_queues[PTO2_NUM_WORKER_TYPES]; + // Ready queues (one per resource shape) + PTO2ReadyQueue ready_queues[PTO2_NUM_RESOURCE_SHAPES]; // Statistics #if PTO2_PROFILING @@ -441,12 +443,15 @@ struct PTO2SchedulerState { if (new_refcount == task->fanin_count) { // Local-first: try per-CoreType thread-local buffer before global queue 
+ // Route by active_mask: AIC-containing tasks → buf[0], AIV-only → buf[1] + PTO2ResourceShape shape = pto2_active_mask_to_shape(task->active_mask); bool pushed_local = false; - if (local_bufs && task->worker_type >= 0 && task->worker_type < PTO2_LOCAL_DISPATCH_TYPE_NUM) { - pushed_local = local_bufs[task->worker_type].try_push(task_id); + if (local_bufs) { + int32_t buf_idx = (task->active_mask & 0x01) ? 0 : 1; + pushed_local = local_bufs[buf_idx].try_push(task_id); } if (!pushed_local) { - ready_queues[task->worker_type].push(task_id); + ready_queues[static_cast(shape)].push(task_id); } return true; } @@ -468,12 +473,14 @@ struct PTO2SchedulerState { expected, PTO2_TASK_READY, std::memory_order_acq_rel, std::memory_order_acquire)) { atomic_count += 1; // CAS(task_state PENDING→READY) // Local-first: try per-CoreType thread-local buffer before global queue + PTO2ResourceShape shape = pto2_active_mask_to_shape(task->active_mask); bool pushed_local = false; - if (local_bufs && task->worker_type >= 0 && task->worker_type < PTO2_LOCAL_DISPATCH_TYPE_NUM) { - pushed_local = local_bufs[task->worker_type].try_push(task_id); + if (local_bufs) { + int32_t buf_idx = (task->active_mask & 0x01) ? 0 : 1; + pushed_local = local_bufs[buf_idx].try_push(task_id); } if (!pushed_local) { - ready_queues[task->worker_type].push(task_id, atomic_count, push_wait); + ready_queues[static_cast(shape)].push(task_id, atomic_count, push_wait); } return true; } @@ -489,7 +496,7 @@ struct PTO2SchedulerState { // Reset fanout_refcount for new task lifecycle. // Do NOT reset fanin_refcount — it may have been incremented by - // concurrent on_task_complete between Step 5 and Step 6. + // concurrent on_mixed_task_complete between Step 5 and Step 6. 
fanout_refcount[slot].store(0, std::memory_order_relaxed); #if PTO2_ORCH_PROFILING || PTO2_SCHED_PROFILING @@ -502,9 +509,8 @@ struct PTO2SchedulerState { #endif } - template - int32_t get_ready_task() { - return ready_queues[static_cast(CT)].pop(); + int32_t get_ready_task(PTO2ResourceShape shape) { + return ready_queues[static_cast(shape)].pop(); } template @@ -517,9 +523,8 @@ struct PTO2SchedulerState { } #if PTO2_SCHED_PROFILING - template - int32_t get_ready_task(uint64_t& atomic_count, uint64_t& wait_cycle) { - return ready_queues[static_cast(CT)].pop(atomic_count, wait_cycle); + int32_t get_ready_task(PTO2ResourceShape shape, uint64_t& atomic_count, uint64_t& wait_cycle) { + return ready_queues[static_cast(shape)].pop(atomic_count, wait_cycle); } template @@ -533,6 +538,17 @@ struct PTO2SchedulerState { } #endif + /** + * Requeue a ready task that could not be dispatched (no suitable cluster). + * Pushes the task back into its shape-based queue. + */ + void requeue_ready_task(int32_t task_id) { + int32_t slot = pto2_task_slot(task_id); + PTO2TaskDescriptor& task = pto2_sm_get_task_by_slot(sm_handle, slot); + PTO2ResourceShape shape = pto2_active_mask_to_shape(task.active_mask); + ready_queues[static_cast(shape)].push(task_id); + } + void on_scope_end(const int32_t* task_ids, int32_t count) { #if PTO2_ORCH_PROFILING || PTO2_SCHED_PROFILING extern uint64_t g_orch_scope_end_atomic_count; @@ -546,19 +562,43 @@ struct PTO2SchedulerState { #endif } + /** + * Two-stage completion: first stage. + * Called when a single subtask (AIC, AIV0, or AIV1) finishes. + * Sets the corresponding done bit in subtask_done_mask. + * + * @return true if this subtask was the last one, completing the mixed task. 
+ */ + bool on_subtask_complete(int32_t mixed_task_id, PTO2SubtaskSlot subslot) { + int32_t slot = pto2_task_slot(mixed_task_id); + PTO2TaskDescriptor& task = pto2_sm_get_task_by_slot(sm_handle, slot); + + uint8_t done_bit = (1u << static_cast(subslot)); + uint8_t prev_mask = task.subtask_done_mask.fetch_or(done_bit, std::memory_order_acq_rel); + uint8_t new_mask = prev_mask | done_bit; + + return new_mask == task.active_mask; + } + + /** + * Two-stage completion: second stage. + * Called exactly once when all subtasks of a mixed task are done + * (i.e., on_subtask_complete returned true). + * Handles fanout notification, fanin release, and self-consumption check. + */ #if PTO2_SCHED_PROFILING - PTO2CompletionStats on_task_complete(int32_t task_id, int thread_idx, - PTO2LocalReadyBuffer* local_bufs = nullptr) { - PTO2CompletionStats stats = {0, 0, 0}; + PTO2CompletionStats on_mixed_task_complete(int32_t mixed_task_id, int thread_idx, + PTO2LocalReadyBuffer* local_bufs = nullptr) { + PTO2CompletionStats stats = {0, 0, 0, true}; #elif PTO2_PROFILING - PTO2CompletionStats on_task_complete(int32_t task_id, - PTO2LocalReadyBuffer* local_bufs = nullptr) { - PTO2CompletionStats stats = {0, 0, 0}; + PTO2CompletionStats on_mixed_task_complete(int32_t mixed_task_id, + PTO2LocalReadyBuffer* local_bufs = nullptr) { + PTO2CompletionStats stats = {0, 0, 0, true}; #else - void on_task_complete(int32_t task_id, - PTO2LocalReadyBuffer* local_bufs = nullptr) { + void on_mixed_task_complete(int32_t mixed_task_id, + PTO2LocalReadyBuffer* local_bufs = nullptr) { #endif - int32_t slot = pto2_task_slot(task_id); + int32_t slot = pto2_task_slot(mixed_task_id); PTO2TaskDescriptor& task = pto2_sm_get_task_by_slot(sm_handle, slot); #if PTO2_PROFILING @@ -567,11 +607,8 @@ struct PTO2SchedulerState { #if PTO2_SCHED_PROFILING extern uint64_t g_sched_lock_cycle[], g_sched_fanout_cycle[]; - extern uint64_t g_sched_self_consumed_cycle[]; extern uint64_t g_sched_lock_atomic_count[], 
g_sched_lock_wait_cycle[]; extern uint64_t g_sched_fanout_atomic_count[], g_sched_push_wait_cycle[]; - extern uint64_t g_sched_self_atomic_count[]; - extern uint64_t g_sched_complete_count[]; uint64_t lock_atomics = 0, lock_wait = 0; PTO2_SCHED_CYCLE_START(); #endif @@ -664,7 +701,7 @@ struct PTO2SchedulerState { // Self consumed check #if PTO2_SCHED_PROFILING uint64_t self_atomics = 0; - check_and_handle_consumed(slot, task, self_atomics); + check_and_handle_consumed(slot, pto2_sm_get_task_by_slot(sm_handle, slot), self_atomics); g_sched_self_atomic_count[thread_idx] += self_atomics; PTO2_SCHED_CYCLE_LAP(g_sched_self_consumed_cycle[thread_idx]); g_sched_complete_count[thread_idx]++; @@ -698,7 +735,7 @@ const char* pto2_task_state_name(PTO2TaskState state); #if PTO2_SCHED_PROFILING struct PTO2SchedProfilingData { - // Sub-phase cycle breakdown within on_task_complete + // Sub-phase cycle breakdown within on_mixed_task_complete uint64_t lock_cycle; // pto2_fanout_lock + state store + unlock uint64_t fanout_cycle; // fanout traversal uint64_t fanin_cycle; // fanin traversal diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_submit_types.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_submit_types.h new file mode 100644 index 00000000..177781a3 --- /dev/null +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_submit_types.h @@ -0,0 +1,97 @@ +/** + * PTO Submit Types - Shared submit-contract definitions + * + * Header-only definitions shared by orchestration-facing and runtime-facing + * headers. Keeps orchestration slim (no dependency on pto_runtime2_types.h). 
+ */ + +#ifndef PTO_SUBMIT_TYPES_H +#define PTO_SUBMIT_TYPES_H + +#include <cstdint> + +inline constexpr int32_t INVALID_KERNEL_ID = -1; + +/** + * Subtask slot count: AIC, AIV0, AIV1 + */ +inline constexpr int32_t PTO2_SUBTASK_SLOT_COUNT = 3; + +/** + * Subtask slot indices + */ +enum class PTO2SubtaskSlot : uint8_t { + AIC = 0, + AIV0 = 1, + AIV1 = 2, +}; + +/** + * Subtask mask bits (for active_mask / subtask_done_mask) + */ +inline constexpr uint8_t PTO2_SUBTASK_MASK_AIC = (1u << 0); // 0x1 +inline constexpr uint8_t PTO2_SUBTASK_MASK_AIV0 = (1u << 1); // 0x2 +inline constexpr uint8_t PTO2_SUBTASK_MASK_AIV1 = (1u << 2); // 0x4 + +/** + * Test whether a subtask slot is active in a given mask + */ +static inline bool pto2_subtask_active(uint8_t mask, PTO2SubtaskSlot slot) { + return (mask & (1u << static_cast<uint8_t>(slot))) != 0; +} + +/** + * Mixed-task submit contract. + * + * Each field holds either a valid kernel ID or INVALID_KERNEL_ID (inactive). + * At least one slot must be valid. + */ +struct MixedKernels { + int32_t aic_kernel_id{INVALID_KERNEL_ID}; + int32_t aiv0_kernel_id{INVALID_KERNEL_ID}; + int32_t aiv1_kernel_id{INVALID_KERNEL_ID}; +}; + +/** + * Resource shape — classifies a MixedKernels into one of 5 queue buckets. + */ +enum class PTO2ResourceShape : uint8_t { + AIC_ONLY = 0, // AIC only + AIV_X1 = 1, // One AIV slot + AIV_X2 = 2, // Both AIV slots + AIC_AIV_X1 = 3, // AIC + one AIV + AIC_AIV_X2 = 4, // AIC + both AIV +}; + +inline constexpr int32_t PTO2_NUM_RESOURCE_SHAPES = 5; + +/** + * Derive resource shape from active_mask. + * Caller must ensure active_mask is valid (at least one bit set). 
+ */ +static inline PTO2ResourceShape pto2_active_mask_to_shape(uint8_t active_mask) { + bool has_aic = (active_mask & PTO2_SUBTASK_MASK_AIC) != 0; + int aiv_count = ((active_mask & PTO2_SUBTASK_MASK_AIV0) != 0) + + ((active_mask & PTO2_SUBTASK_MASK_AIV1) != 0); + + if (has_aic) { + if (aiv_count == 0) return PTO2ResourceShape::AIC_ONLY; + if (aiv_count == 1) return PTO2ResourceShape::AIC_AIV_X1; + return PTO2ResourceShape::AIC_AIV_X2; + } + if (aiv_count == 1) return PTO2ResourceShape::AIV_X1; + return PTO2ResourceShape::AIV_X2; +} + +/** + * Compute active_mask from MixedKernels. + */ +static inline uint8_t pto2_mixed_kernels_to_active_mask(const MixedKernels& mk) { + uint8_t mask = 0; + if (mk.aic_kernel_id != INVALID_KERNEL_ID) mask |= PTO2_SUBTASK_MASK_AIC; + if (mk.aiv0_kernel_id != INVALID_KERNEL_ID) mask |= PTO2_SUBTASK_MASK_AIV0; + if (mk.aiv1_kernel_id != INVALID_KERNEL_ID) mask |= PTO2_SUBTASK_MASK_AIV1; + return mask; +} + +#endif // PTO_SUBMIT_TYPES_H diff --git a/tests/device_tests/tensormap_and_ringbuffer/alternating_matmul_add/kernels/orchestration/alternating_orch.cpp b/tests/device_tests/tensormap_and_ringbuffer/alternating_matmul_add/kernels/orchestration/alternating_orch.cpp index 2674005e..a522d153 100644 --- a/tests/device_tests/tensormap_and_ringbuffer/alternating_matmul_add/kernels/orchestration/alternating_orch.cpp +++ b/tests/device_tests/tensormap_and_ringbuffer/alternating_matmul_add/kernels/orchestration/alternating_orch.cpp @@ -121,7 +121,7 @@ void aicpu_orchestration_entry(PTO2Runtime* rt, uint64_t* args, int arg_count) { make_input_param(B_view), make_output_param(C_view), }; - pto2_rt_submit_task(rt, FUNC_MATMUL, PTO2_WORKER_CUBE, params_matmul, 3); + pto2_rt_submit_aic_task(rt, FUNC_MATMUL, params_matmul, 3); total_matmul++; } @@ -142,7 +142,7 @@ void aicpu_orchestration_entry(PTO2Runtime* rt, uint64_t* args, int arg_count) { make_input_param(Y_view), make_output_param(Z_view), }; - pto2_rt_submit_task(rt, FUNC_ADD, 
PTO2_WORKER_VECTOR, params_add, 3); + pto2_rt_submit_aiv_task(rt, FUNC_ADD, params_add, 3); total_add++; } } diff --git a/tests/device_tests/tensormap_and_ringbuffer/batch_paged_attention/kernels/orchestration/paged_attention_orch.cpp b/tests/device_tests/tensormap_and_ringbuffer/batch_paged_attention/kernels/orchestration/paged_attention_orch.cpp index 3c25e9f1..f841e272 100644 --- a/tests/device_tests/tensormap_and_ringbuffer/batch_paged_attention/kernels/orchestration/paged_attention_orch.cpp +++ b/tests/device_tests/tensormap_and_ringbuffer/batch_paged_attention/kernels/orchestration/paged_attention_orch.cpp @@ -135,7 +135,7 @@ void aicpu_orchestration_entry(PTO2Runtime* rt, uint64_t* args, int arg_count, i make_output_param(li_batch), make_output_param(mi_batch), }; - pto2_rt_submit_task(rt, FUNC_AIV_HUB, PTO2_WORKER_VECTOR, params_hub, 3); + pto2_rt_submit_aiv_task(rt, FUNC_AIV_HUB, params_hub, 3); for (uint64_t bn = 0; bn < max_bn; bn++) { PTO2_SCOPE(rt) { @@ -161,7 +161,7 @@ void aicpu_orchestration_entry(PTO2Runtime* rt, uint64_t* args, int arg_count, i make_scalar_param(num_heads), make_scalar_param(batch_start), }; - pto2_rt_submit_task(rt, FUNC_QK_MATMUL, PTO2_WORKER_CUBE, params_qk, 10); + pto2_rt_submit_aic_task(rt, FUNC_QK_MATMUL, params_qk, 10); PTOParam params_sf[] = { make_input_param(sij_b), @@ -174,7 +174,7 @@ void aicpu_orchestration_entry(PTO2Runtime* rt, uint64_t* args, int arg_count, i make_scalar_param(bn), make_scalar_param(batch_start), }; - pto2_rt_submit_task(rt, FUNC_SOFTMAX_PREPARE, PTO2_WORKER_VECTOR, params_sf, 9); + pto2_rt_submit_aiv_task(rt, FUNC_SOFTMAX_PREPARE, params_sf, 9); PTOParam params_pv[] = { make_input_param(pij_b), @@ -186,7 +186,7 @@ void aicpu_orchestration_entry(PTO2Runtime* rt, uint64_t* args, int arg_count, i make_scalar_param(block_num), make_scalar_param(batch_start), }; - pto2_rt_submit_task(rt, FUNC_PV_MATMUL, PTO2_WORKER_CUBE, params_pv, 8); + pto2_rt_submit_aic_task(rt, FUNC_PV_MATMUL, params_pv, 8); 
uint64_t is_first = (bn == 0) ? 1 : 0; uint64_t is_last = (bn == max_bn - 1) ? 1 : 0; @@ -205,7 +205,7 @@ void aicpu_orchestration_entry(PTO2Runtime* rt, uint64_t* args, int arg_count, i make_scalar_param(num_heads), make_scalar_param(batch_start), }; - pto2_rt_submit_task(rt, FUNC_ONLINE_UPDATE, PTO2_WORKER_VECTOR, params_up, 13); + pto2_rt_submit_aiv_task(rt, FUNC_ONLINE_UPDATE, params_up, 13); } } } diff --git a/tests/device_tests/tensormap_and_ringbuffer/benchmark_bgemm/kernels/orchestration/bgemm_orch.cpp b/tests/device_tests/tensormap_and_ringbuffer/benchmark_bgemm/kernels/orchestration/bgemm_orch.cpp index 563795a5..fb65329c 100644 --- a/tests/device_tests/tensormap_and_ringbuffer/benchmark_bgemm/kernels/orchestration/bgemm_orch.cpp +++ b/tests/device_tests/tensormap_and_ringbuffer/benchmark_bgemm/kernels/orchestration/bgemm_orch.cpp @@ -124,7 +124,7 @@ void aicpu_orchestration_entry(PTO2Runtime* rt, uint64_t* args, int arg_count) { make_output_param(P), make_input_param(ext_config), }; - pto2_rt_submit_task(rt, FUNC_GEMM_TILE, PTO2_WORKER_CUBE, + pto2_rt_submit_aic_task(rt, FUNC_GEMM_TILE, params_gemm, 4); total_gemm++; @@ -133,7 +133,7 @@ void aicpu_orchestration_entry(PTO2Runtime* rt, uint64_t* args, int arg_count) { make_input_param(P), make_input_param(ext_config), }; - pto2_rt_submit_task(rt, FUNC_TILE_ADD, PTO2_WORKER_VECTOR, + pto2_rt_submit_aiv_task(rt, FUNC_TILE_ADD, params_add, 3); total_add++; } diff --git a/tests/device_tests/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp b/tests/device_tests/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp index a3417a8c..3f061be8 100644 --- a/tests/device_tests/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp +++ b/tests/device_tests/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp @@ -167,7 +167,7 @@ __attribute__((visibility("default"))) void 
aicpu_orchestration_entry(PTO2Runtim make_output_param(mi_update), }; CYCLE_COUNT_LAP(prof_param_setup); - pto2_rt_submit_task(rt, FUNC_AIV_HUB, PTO2_WORKER_VECTOR, params_inplace, 3); + pto2_rt_submit_aiv_task(rt, FUNC_AIV_HUB, params_inplace, 3); prof_submit_count++; CYCLE_COUNT_LAP(prof_submit_task); @@ -195,7 +195,7 @@ __attribute__((visibility("default"))) void aicpu_orchestration_entry(PTO2Runtim make_output_param(sij), }; CYCLE_COUNT_LAP(prof_param_setup); - pto2_rt_submit_task(rt, FUNC_QK_MATMUL, PTO2_WORKER_CUBE, params_qk, 3); + pto2_rt_submit_aic_task(rt, FUNC_QK_MATMUL, params_qk, 3); prof_submit_count++; CYCLE_COUNT_LAP(prof_submit_task); @@ -218,7 +218,7 @@ __attribute__((visibility("default"))) void aicpu_orchestration_entry(PTO2Runtim make_output_param(li), }; CYCLE_COUNT_LAP(prof_param_setup); - pto2_rt_submit_task(rt, FUNC_SOFTMAX_PREPARE, PTO2_WORKER_VECTOR, params_sf, 5); + pto2_rt_submit_aiv_task(rt, FUNC_SOFTMAX_PREPARE, params_sf, 5); prof_submit_count++; CYCLE_COUNT_LAP(prof_submit_task); @@ -233,7 +233,7 @@ __attribute__((visibility("default"))) void aicpu_orchestration_entry(PTO2Runtim make_output_param(oi_tmp), }; CYCLE_COUNT_LAP(prof_param_setup); - pto2_rt_submit_task(rt, FUNC_PV_MATMUL, PTO2_WORKER_CUBE, params_pv, 3); + pto2_rt_submit_aic_task(rt, FUNC_PV_MATMUL, params_pv, 3); prof_submit_count++; CYCLE_COUNT_LAP(prof_submit_task); @@ -253,7 +253,7 @@ __attribute__((visibility("default"))) void aicpu_orchestration_entry(PTO2Runtim make_scalar_param(is_last), }; CYCLE_COUNT_LAP(prof_param_setup); - pto2_rt_submit_task(rt, FUNC_ONLINE_UPDATE, PTO2_WORKER_VECTOR, params_up, 9); + pto2_rt_submit_aiv_task(rt, FUNC_ONLINE_UPDATE, params_up, 9); prof_submit_count++; CYCLE_COUNT_LAP(prof_submit_task); } diff --git a/tests/device_tests/tensormap_and_ringbuffer/paged_attention_unroll/kernels/orchestration/paged_attention_orch.cpp b/tests/device_tests/tensormap_and_ringbuffer/paged_attention_unroll/kernels/orchestration/paged_attention_orch.cpp 
index cf028cb6..8e67888e 100644 --- a/tests/device_tests/tensormap_and_ringbuffer/paged_attention_unroll/kernels/orchestration/paged_attention_orch.cpp +++ b/tests/device_tests/tensormap_and_ringbuffer/paged_attention_unroll/kernels/orchestration/paged_attention_orch.cpp @@ -159,7 +159,7 @@ __attribute__((visibility("default"))) void aicpu_orchestration_entry(PTO2Runtim make_output_param(mi_update), }; CYCLE_COUNT_LAP(prof_param_setup); - pto2_rt_submit_task(rt, FUNC_AIV_HUB, PTO2_WORKER_VECTOR, params_inplace, 3); + pto2_rt_submit_aiv_task(rt, FUNC_AIV_HUB, params_inplace, 3); prof_submit_count++; CYCLE_COUNT_LAP(prof_submit_task); @@ -198,7 +198,7 @@ __attribute__((visibility("default"))) void aicpu_orchestration_entry(PTO2Runtim make_scalar_param(block_indices[7]), }; CYCLE_COUNT_LAP(prof_param_setup); - pto2_rt_submit_task(rt, FUNC_QK_MATMUL, PTO2_WORKER_CUBE, params_qk, 12); + pto2_rt_submit_aic_task(rt, FUNC_QK_MATMUL, params_qk, 12); prof_submit_count++; CYCLE_COUNT_LAP(prof_submit_task); @@ -220,7 +220,7 @@ __attribute__((visibility("default"))) void aicpu_orchestration_entry(PTO2Runtim make_scalar_param(valid_len_last), }; CYCLE_COUNT_LAP(prof_param_setup); - pto2_rt_submit_task(rt, FUNC_SOFTMAX_PREPARE, PTO2_WORKER_VECTOR, params_sf, 7); + pto2_rt_submit_aiv_task(rt, FUNC_SOFTMAX_PREPARE, params_sf, 7); prof_submit_count++; CYCLE_COUNT_LAP(prof_submit_task); @@ -245,7 +245,7 @@ __attribute__((visibility("default"))) void aicpu_orchestration_entry(PTO2Runtim make_scalar_param(block_indices[7]), }; CYCLE_COUNT_LAP(prof_param_setup); - pto2_rt_submit_task(rt, FUNC_PV_MATMUL, PTO2_WORKER_CUBE, params_pv, 12); + pto2_rt_submit_aic_task(rt, FUNC_PV_MATMUL, params_pv, 12); prof_submit_count++; CYCLE_COUNT_LAP(prof_submit_task); @@ -266,7 +266,7 @@ __attribute__((visibility("default"))) void aicpu_orchestration_entry(PTO2Runtim make_scalar_param(is_last), }; CYCLE_COUNT_LAP(prof_param_setup); - pto2_rt_submit_task(rt, FUNC_ONLINE_UPDATE, PTO2_WORKER_VECTOR, 
params_up, 9); + pto2_rt_submit_aiv_task(rt, FUNC_ONLINE_UPDATE, params_up, 9); prof_submit_count++; CYCLE_COUNT_LAP(prof_submit_task); } From 83537f90c388625d727a3a4993a496758fd7ef6d Mon Sep 17 00:00:00 2001 From: ChaoZheng109 Date: Wed, 11 Mar 2026 20:07:02 +0800 Subject: [PATCH 3/3] Refactor: reorganize examples and tests by architecture prefix - Move examples from runtime-first layout (host_build_graph/, aicpu_build_graph/, tensormap_and_ringbuffer/) to arch-first layout (a2a3//, a5//) - Move device tests to matching tests/device_tests// layout - Update ci.sh to extract arch from path and track per-task platforms, replacing global HW_PLATFORM/SIM_PLATFORM variables - Add print_log_on_fail param to run_task() and fix attempt number display (off-by-one) in summary output - Update benchmark_rounds.sh with -p/--platform flag to derive arch from platform name - Update CLAUDE.md example path to new layout --- CLAUDE.md | 4 +- ci.sh | 63 ++-- .../aicpu_build_graph/bgemm/README.md | 0 .../aicpu_build_graph/bgemm/golden.py | 0 .../bgemm/kernels/aic/kernel_gemm_tile.cpp | 0 .../bgemm/kernels/aiv/kernel_tile_add.cpp | 0 .../bgemm/kernels/kernel_config.py | 0 .../kernels/orchestration/bgemm_orch.cpp | 0 .../docs/INCORE_ORCHESTRATION_GUIDE.md | 0 .../vector_example/README.md | 0 .../vector_example/golden.py | 0 .../vector_example/kernels/aiv/kernel_add.cpp | 0 .../kernels/aiv/kernel_add_scalar.cpp | 0 .../vector_example/kernels/aiv/kernel_mul.cpp | 0 .../vector_example/kernels/kernel_config.py | 0 .../kernels/orchestration/orchestration.cpp | 0 .../host_build_graph/bgemm/README.md | 0 .../host_build_graph/bgemm/golden.py | 0 .../bgemm/kernels/aic/kernel_gemm_tile.cpp | 0 .../bgemm/kernels/aiv/kernel_tile_add.cpp | 0 .../bgemm/kernels/kernel_config.py | 0 .../kernels/orchestration/bgemm_orch.cpp | 0 .../docs/INCORE_ORCHESTRATION_GUIDE.md | 0 .../host_build_graph/matmul/golden.py | 0 .../matmul/kernels/aic/kernel_matmul.cpp | 0 .../matmul/kernels/aiv/kernel_add_exp.cpp | 0 
.../matmul/kernels/aiv/kernel_log_sqrt.cpp | 0 .../matmul/kernels/kernel_config.py | 0 .../kernels/orchestration/matmul_orch.cpp | 0 .../paged_attention/golden.py | 0 .../kernels/aic/aic_pv_matmul.cpp | 0 .../kernels/aic/aic_qk_matmul.cpp | 0 .../kernels/aiv/aiv_online_update.cpp | 0 .../kernels/aiv/aiv_softmax_prepare.cpp | 0 .../paged_attention/kernels/kernel_config.py | 0 .../orchestration/paged_attention_orch.cpp | 0 .../host_build_graph/vector_example/README.md | 0 .../host_build_graph/vector_example/golden.py | 0 .../vector_example/kernels/aiv/kernel_add.cpp | 0 .../kernels/aiv/kernel_add_scalar.cpp | 0 .../vector_example/kernels/aiv/kernel_mul.cpp | 0 .../vector_example/kernels/kernel_config.py | 0 .../kernels/orchestration/example_orch.cpp | 0 .../batch_paged_attention/golden.py | 0 .../kernels/aic/aic_hub.cpp | 0 .../kernels/aic/aic_pv_matmul.cpp | 0 .../kernels/aic/aic_qk_matmul.cpp | 0 .../kernels/aiv/aiv_hub.cpp | 0 .../kernels/aiv/aiv_online_update.cpp | 0 .../kernels/aiv/aiv_softmax_prepare.cpp | 0 .../kernels/kernel_config.py | 0 .../orchestration/paged_attention_orch.cpp | 0 .../tensormap_and_ringbuffer/bgemm/golden.py | 0 .../bgemm/kernels/aic/kernel_gemm_tile.cpp | 0 .../bgemm/kernels/aiv/kernel_tile_add.cpp | 0 .../bgemm/kernels/kernel_config.py | 0 .../kernels/orchestration/bgemm_orch.cpp | 0 .../docs/INCORE_ORCHESTRATION_GUIDE.md | 0 .../mixed_example/golden.py | 0 .../kernels/aic/kernel_matmul.cpp | 0 .../mixed_example/kernels/aiv/kernel_add.cpp | 0 .../kernels/aiv/kernel_add_standalone.cpp | 0 .../mixed_example/kernels/aiv/kernel_mul.cpp | 0 .../kernels/aiv/kernel_mul_standalone.cpp | 0 .../mixed_example/kernels/kernel_config.py | 0 .../kernels/orchestration/mixed_orch.cpp | 0 .../multi-round-paged-attention/golden.py | 0 .../kernels/kernel_config.py | 0 .../paged_attention/TFILLPAD_INPLACE_BUG.md | 0 .../paged_attention/golden.py | 0 .../paged_attention/kernels/aic/aic_hub.cpp | 0 .../kernels/aic/aic_pv_matmul.cpp | 0 
.../kernels/aic/aic_qk_matmul.cpp | 0 .../paged_attention/kernels/aiv/aiv_hub.cpp | 0 .../kernels/aiv/aiv_online_update.cpp | 0 .../kernels/aiv/aiv_softmax_prepare.cpp | 0 .../paged_attention/kernels/kernel_config.py | 0 .../orchestration/paged_attention_orch.cpp | 0 .../vector_example/golden.py | 0 .../vector_example/kernels/aiv/kernel_add.cpp | 0 .../kernels/aiv/kernel_add_scalar.cpp | 0 .../vector_example/kernels/aiv/kernel_mul.cpp | 0 .../vector_example/kernels/kernel_config.py | 0 .../orchestration/example_orchestration.cpp | 0 .../paged_attention/golden.py | 45 +++ .../kernels/aic/aic_pv_matmul.cpp | 90 ++++++ .../kernels/aic/aic_qk_matmul.cpp | 91 ++++++ .../kernels/aiv/aiv_online_update.cpp | 220 +++++++++++++ .../kernels/aiv/aiv_softmax_prepare.cpp | 94 ++++++ .../paged_attention/kernels/kernel_config.py | 42 +++ .../orchestration/paged_attention_orch.cpp | 256 +++++++++++++++ .../paged_attention/TFILLPAD_INPLACE_BUG.md | 205 ++++++++++++ .../paged_attention/golden.py | 67 ++++ .../paged_attention/kernels/aic/aic_hub.cpp | 0 .../kernels/aic/aic_pv_matmul.cpp | 89 ++++++ .../kernels/aic/aic_qk_matmul.cpp | 90 ++++++ .../paged_attention/kernels/aiv/aiv_hub.cpp | 0 .../kernels/aiv/aiv_online_update.cpp | 224 +++++++++++++ .../kernels/aiv/aiv_softmax_prepare.cpp | 147 +++++++++ .../paged_attention/kernels/kernel_config.py | 46 +++ .../orchestration/paged_attention_orch.cpp | 214 +++++++++++++ .../paged_attention/README.md | 0 .../paged_attention/golden.py | 0 .../kernels/aic/aic_pv_matmul.cpp | 0 .../kernels/aic/aic_qk_matmul.cpp | 0 .../kernels/aiv/aiv_online_update.cpp | 0 .../kernels/aiv/aiv_softmax_prepare.cpp | 0 .../paged_attention/kernels/kernel_config.py | 0 .../orchestration/paged_attention_orch.cpp | 0 .../paged_attention/README.md | 0 .../paged_attention/golden.py | 0 .../kernels/aic/aic_pv_matmul.cpp | 0 .../kernels/aic/aic_qk_matmul.cpp | 0 .../kernels/aiv/aiv_online_update.cpp | 0 .../kernels/aiv/aiv_softmax_prepare.cpp | 0 
.../paged_attention/kernels/kernel_config.py | 0 .../orchestration/paged_attention_orch.cpp | 0 .../alternating_matmul_add/golden.py | 0 .../kernels/aic/kernel_matmul.cpp | 0 .../kernels/aiv/kernel_add.cpp | 0 .../kernels/kernel_config.py | 0 .../orchestration/alternating_orch.cpp | 0 .../batch_paged_attention/golden.py | 0 .../kernels/aic/aic_hub.cpp | 0 .../kernels/aic/aic_pv_matmul.cpp | 0 .../kernels/aic/aic_qk_matmul.cpp | 0 .../kernels/aiv/aiv_hub.cpp | 0 .../kernels/aiv/aiv_online_update.cpp | 0 .../kernels/aiv/aiv_softmax_prepare.cpp | 0 .../kernels/kernel_config.py | 0 .../orchestration/paged_attention_orch.cpp | 0 .../benchmark_bgemm/golden.py | 0 .../kernels/aic/kernel_gemm_tile.cpp | 0 .../kernels/aiv/kernel_tile_add.cpp | 0 .../benchmark_bgemm/kernels/kernel_config.py | 0 .../kernels/orchestration/bgemm_orch.cpp | 0 .../paged_attention/golden.py | 0 .../paged_attention}/kernels/aic/aic_hub.cpp | 0 .../kernels/aic/aic_pv_matmul.cpp | 0 .../kernels/aic/aic_qk_matmul.cpp | 0 .../paged_attention}/kernels/aiv/aiv_hub.cpp | 0 .../kernels/aiv/aiv_online_update.cpp | 0 .../kernels/aiv/aiv_softmax_prepare.cpp | 0 .../paged_attention/kernels/kernel_config.py | 0 .../orchestration/paged_attention_orch.cpp | 0 .../paged_attention_unroll/golden.py | 0 .../kernels/aic/aic_hub.cpp | 18 ++ .../kernels/aic/aic_pv_matmul.cpp | 0 .../kernels/aic/aic_qk_matmul.cpp | 0 .../kernels/aiv/aiv_hub.cpp | 18 ++ .../kernels/aiv/aiv_online_update.cpp | 0 .../kernels/aiv/aiv_softmax_prepare.cpp | 0 .../kernels/kernel_config.py | 0 .../orchestration/paged_attention_orch.cpp | 0 .../paged_attention/README.md | 192 ++++++++++++ .../paged_attention/golden.py | 45 +++ .../kernels/aic/aic_pv_matmul.cpp | 97 ++++++ .../kernels/aic/aic_qk_matmul.cpp | 98 ++++++ .../kernels/aiv/aiv_online_update.cpp | 227 ++++++++++++++ .../kernels/aiv/aiv_softmax_prepare.cpp | 123 ++++++++ .../paged_attention/kernels/kernel_config.py | 43 +++ .../orchestration/paged_attention_orch.cpp | 261 ++++++++++++++++ 
.../paged_attention/golden.py | 55 ++++ .../paged_attention/kernels/aic/aic_hub.cpp | 18 ++ .../kernels/aic/aic_pv_matmul.cpp | 97 ++++++ .../kernels/aic/aic_qk_matmul.cpp | 98 ++++++ .../paged_attention/kernels/aiv/aiv_hub.cpp | 18 ++ .../kernels/aiv/aiv_online_update.cpp | 232 ++++++++++++++ .../kernels/aiv/aiv_softmax_prepare.cpp | 128 ++++++++ .../paged_attention/kernels/kernel_config.py | 45 +++ .../orchestration/paged_attention_orch.cpp | 294 ++++++++++++++++++ tools/benchmark_rounds.sh | 22 +- 172 files changed, 4088 insertions(+), 28 deletions(-) rename examples/{ => a2a3}/aicpu_build_graph/bgemm/README.md (100%) rename examples/{ => a2a3}/aicpu_build_graph/bgemm/golden.py (100%) rename examples/{ => a2a3}/aicpu_build_graph/bgemm/kernels/aic/kernel_gemm_tile.cpp (100%) rename examples/{ => a2a3}/aicpu_build_graph/bgemm/kernels/aiv/kernel_tile_add.cpp (100%) rename examples/{ => a2a3}/aicpu_build_graph/bgemm/kernels/kernel_config.py (100%) rename examples/{ => a2a3}/aicpu_build_graph/bgemm/kernels/orchestration/bgemm_orch.cpp (100%) rename examples/{ => a2a3}/aicpu_build_graph/docs/INCORE_ORCHESTRATION_GUIDE.md (100%) rename examples/{ => a2a3}/aicpu_build_graph/vector_example/README.md (100%) rename examples/{ => a2a3}/aicpu_build_graph/vector_example/golden.py (100%) rename examples/{ => a2a3}/aicpu_build_graph/vector_example/kernels/aiv/kernel_add.cpp (100%) rename examples/{ => a2a3}/aicpu_build_graph/vector_example/kernels/aiv/kernel_add_scalar.cpp (100%) rename examples/{ => a2a3}/aicpu_build_graph/vector_example/kernels/aiv/kernel_mul.cpp (100%) rename examples/{ => a2a3}/aicpu_build_graph/vector_example/kernels/kernel_config.py (100%) rename examples/{ => a2a3}/aicpu_build_graph/vector_example/kernels/orchestration/orchestration.cpp (100%) rename examples/{ => a2a3}/host_build_graph/bgemm/README.md (100%) rename examples/{ => a2a3}/host_build_graph/bgemm/golden.py (100%) rename examples/{ => 
a2a3}/host_build_graph/bgemm/kernels/aic/kernel_gemm_tile.cpp (100%) rename examples/{ => a2a3}/host_build_graph/bgemm/kernels/aiv/kernel_tile_add.cpp (100%) rename examples/{ => a2a3}/host_build_graph/bgemm/kernels/kernel_config.py (100%) rename examples/{ => a2a3}/host_build_graph/bgemm/kernels/orchestration/bgemm_orch.cpp (100%) rename examples/{ => a2a3}/host_build_graph/docs/INCORE_ORCHESTRATION_GUIDE.md (100%) rename examples/{ => a2a3}/host_build_graph/matmul/golden.py (100%) rename examples/{ => a2a3}/host_build_graph/matmul/kernels/aic/kernel_matmul.cpp (100%) rename examples/{ => a2a3}/host_build_graph/matmul/kernels/aiv/kernel_add_exp.cpp (100%) rename examples/{ => a2a3}/host_build_graph/matmul/kernels/aiv/kernel_log_sqrt.cpp (100%) rename examples/{ => a2a3}/host_build_graph/matmul/kernels/kernel_config.py (100%) rename examples/{ => a2a3}/host_build_graph/matmul/kernels/orchestration/matmul_orch.cpp (100%) rename examples/{ => a2a3}/host_build_graph/paged_attention/golden.py (100%) rename examples/{ => a2a3}/host_build_graph/paged_attention/kernels/aic/aic_pv_matmul.cpp (100%) rename examples/{ => a2a3}/host_build_graph/paged_attention/kernels/aic/aic_qk_matmul.cpp (100%) rename examples/{ => a2a3}/host_build_graph/paged_attention/kernels/aiv/aiv_online_update.cpp (100%) rename examples/{ => a2a3}/host_build_graph/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp (100%) rename examples/{ => a2a3}/host_build_graph/paged_attention/kernels/kernel_config.py (100%) rename examples/{ => a2a3}/host_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp (100%) rename examples/{ => a2a3}/host_build_graph/vector_example/README.md (100%) rename examples/{ => a2a3}/host_build_graph/vector_example/golden.py (100%) rename examples/{ => a2a3}/host_build_graph/vector_example/kernels/aiv/kernel_add.cpp (100%) rename examples/{ => a2a3}/host_build_graph/vector_example/kernels/aiv/kernel_add_scalar.cpp (100%) rename examples/{ => 
a2a3}/host_build_graph/vector_example/kernels/aiv/kernel_mul.cpp (100%) rename examples/{ => a2a3}/host_build_graph/vector_example/kernels/kernel_config.py (100%) rename examples/{ => a2a3}/host_build_graph/vector_example/kernels/orchestration/example_orch.cpp (100%) rename examples/{ => a2a3}/tensormap_and_ringbuffer/batch_paged_attention/golden.py (100%) rename examples/{ => a2a3}/tensormap_and_ringbuffer/batch_paged_attention/kernels/aic/aic_hub.cpp (100%) rename examples/{ => a2a3}/tensormap_and_ringbuffer/batch_paged_attention/kernels/aic/aic_pv_matmul.cpp (100%) rename examples/{ => a2a3}/tensormap_and_ringbuffer/batch_paged_attention/kernels/aic/aic_qk_matmul.cpp (100%) rename examples/{ => a2a3}/tensormap_and_ringbuffer/batch_paged_attention/kernels/aiv/aiv_hub.cpp (100%) rename examples/{ => a2a3}/tensormap_and_ringbuffer/batch_paged_attention/kernels/aiv/aiv_online_update.cpp (100%) rename examples/{ => a2a3}/tensormap_and_ringbuffer/batch_paged_attention/kernels/aiv/aiv_softmax_prepare.cpp (100%) rename examples/{ => a2a3}/tensormap_and_ringbuffer/batch_paged_attention/kernels/kernel_config.py (100%) rename examples/{ => a2a3}/tensormap_and_ringbuffer/batch_paged_attention/kernels/orchestration/paged_attention_orch.cpp (100%) rename examples/{ => a2a3}/tensormap_and_ringbuffer/bgemm/golden.py (100%) rename examples/{ => a2a3}/tensormap_and_ringbuffer/bgemm/kernels/aic/kernel_gemm_tile.cpp (100%) rename examples/{ => a2a3}/tensormap_and_ringbuffer/bgemm/kernels/aiv/kernel_tile_add.cpp (100%) rename examples/{ => a2a3}/tensormap_and_ringbuffer/bgemm/kernels/kernel_config.py (100%) rename examples/{ => a2a3}/tensormap_and_ringbuffer/bgemm/kernels/orchestration/bgemm_orch.cpp (100%) rename examples/{ => a2a3}/tensormap_and_ringbuffer/docs/INCORE_ORCHESTRATION_GUIDE.md (100%) rename examples/{ => a2a3}/tensormap_and_ringbuffer/mixed_example/golden.py (100%) rename examples/{ => a2a3}/tensormap_and_ringbuffer/mixed_example/kernels/aic/kernel_matmul.cpp (100%) 
rename examples/{ => a2a3}/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_add.cpp (100%) rename examples/{ => a2a3}/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_add_standalone.cpp (100%) rename examples/{ => a2a3}/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_mul.cpp (100%) rename examples/{ => a2a3}/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_mul_standalone.cpp (100%) rename examples/{ => a2a3}/tensormap_and_ringbuffer/mixed_example/kernels/kernel_config.py (100%) rename examples/{ => a2a3}/tensormap_and_ringbuffer/mixed_example/kernels/orchestration/mixed_orch.cpp (100%) rename examples/{ => a2a3}/tensormap_and_ringbuffer/multi-round-paged-attention/golden.py (100%) rename examples/{ => a2a3}/tensormap_and_ringbuffer/multi-round-paged-attention/kernels/kernel_config.py (100%) rename examples/{ => a2a3}/tensormap_and_ringbuffer/paged_attention/TFILLPAD_INPLACE_BUG.md (100%) rename examples/{ => a2a3}/tensormap_and_ringbuffer/paged_attention/golden.py (100%) rename examples/{ => a2a3}/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_hub.cpp (100%) rename examples/{ => a2a3}/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_pv_matmul.cpp (100%) rename examples/{ => a2a3}/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_qk_matmul.cpp (100%) rename examples/{ => a2a3}/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_hub.cpp (100%) rename examples/{ => a2a3}/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_online_update.cpp (100%) rename examples/{ => a2a3}/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp (100%) rename examples/{ => a2a3}/tensormap_and_ringbuffer/paged_attention/kernels/kernel_config.py (100%) rename examples/{ => a2a3}/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp (100%) rename examples/{ => a2a3}/tensormap_and_ringbuffer/vector_example/golden.py (100%) rename examples/{ => 
a2a3}/tensormap_and_ringbuffer/vector_example/kernels/aiv/kernel_add.cpp (100%) rename examples/{ => a2a3}/tensormap_and_ringbuffer/vector_example/kernels/aiv/kernel_add_scalar.cpp (100%) rename examples/{ => a2a3}/tensormap_and_ringbuffer/vector_example/kernels/aiv/kernel_mul.cpp (100%) rename examples/{ => a2a3}/tensormap_and_ringbuffer/vector_example/kernels/kernel_config.py (100%) rename examples/{ => a2a3}/tensormap_and_ringbuffer/vector_example/kernels/orchestration/example_orchestration.cpp (100%) create mode 100644 examples/a5/host_build_graph/paged_attention/golden.py create mode 100644 examples/a5/host_build_graph/paged_attention/kernels/aic/aic_pv_matmul.cpp create mode 100644 examples/a5/host_build_graph/paged_attention/kernels/aic/aic_qk_matmul.cpp create mode 100644 examples/a5/host_build_graph/paged_attention/kernels/aiv/aiv_online_update.cpp create mode 100644 examples/a5/host_build_graph/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp create mode 100644 examples/a5/host_build_graph/paged_attention/kernels/kernel_config.py create mode 100644 examples/a5/host_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp create mode 100644 examples/a5/tensormap_and_ringbuffer/paged_attention/TFILLPAD_INPLACE_BUG.md create mode 100644 examples/a5/tensormap_and_ringbuffer/paged_attention/golden.py rename {tests/device_tests => examples/a5}/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_hub.cpp (100%) create mode 100644 examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_pv_matmul.cpp create mode 100644 examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_qk_matmul.cpp rename {tests/device_tests => examples/a5}/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_hub.cpp (100%) create mode 100644 examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_online_update.cpp create mode 100644 examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp 
create mode 100644 examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/kernel_config.py create mode 100644 examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp rename tests/device_tests/{ => a2a3}/aicpu_build_graph/paged_attention/README.md (100%) rename tests/device_tests/{ => a2a3}/aicpu_build_graph/paged_attention/golden.py (100%) rename tests/device_tests/{ => a2a3}/aicpu_build_graph/paged_attention/kernels/aic/aic_pv_matmul.cpp (100%) rename tests/device_tests/{ => a2a3}/aicpu_build_graph/paged_attention/kernels/aic/aic_qk_matmul.cpp (100%) rename tests/device_tests/{ => a2a3}/aicpu_build_graph/paged_attention/kernels/aiv/aiv_online_update.cpp (100%) rename tests/device_tests/{ => a2a3}/aicpu_build_graph/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp (100%) rename tests/device_tests/{ => a2a3}/aicpu_build_graph/paged_attention/kernels/kernel_config.py (100%) rename tests/device_tests/{ => a2a3}/aicpu_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp (100%) rename tests/device_tests/{ => a2a3}/host_build_graph/paged_attention/README.md (100%) rename tests/device_tests/{ => a2a3}/host_build_graph/paged_attention/golden.py (100%) rename tests/device_tests/{ => a2a3}/host_build_graph/paged_attention/kernels/aic/aic_pv_matmul.cpp (100%) rename tests/device_tests/{ => a2a3}/host_build_graph/paged_attention/kernels/aic/aic_qk_matmul.cpp (100%) rename tests/device_tests/{ => a2a3}/host_build_graph/paged_attention/kernels/aiv/aiv_online_update.cpp (100%) rename tests/device_tests/{ => a2a3}/host_build_graph/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp (100%) rename tests/device_tests/{ => a2a3}/host_build_graph/paged_attention/kernels/kernel_config.py (100%) rename tests/device_tests/{ => a2a3}/host_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp (100%) rename tests/device_tests/{ => a2a3}/tensormap_and_ringbuffer/alternating_matmul_add/golden.py 
(100%) rename tests/device_tests/{ => a2a3}/tensormap_and_ringbuffer/alternating_matmul_add/kernels/aic/kernel_matmul.cpp (100%) rename tests/device_tests/{ => a2a3}/tensormap_and_ringbuffer/alternating_matmul_add/kernels/aiv/kernel_add.cpp (100%) rename tests/device_tests/{ => a2a3}/tensormap_and_ringbuffer/alternating_matmul_add/kernels/kernel_config.py (100%) rename tests/device_tests/{ => a2a3}/tensormap_and_ringbuffer/alternating_matmul_add/kernels/orchestration/alternating_orch.cpp (100%) rename tests/device_tests/{ => a2a3}/tensormap_and_ringbuffer/batch_paged_attention/golden.py (100%) rename tests/device_tests/{ => a2a3}/tensormap_and_ringbuffer/batch_paged_attention/kernels/aic/aic_hub.cpp (100%) rename tests/device_tests/{ => a2a3}/tensormap_and_ringbuffer/batch_paged_attention/kernels/aic/aic_pv_matmul.cpp (100%) rename tests/device_tests/{ => a2a3}/tensormap_and_ringbuffer/batch_paged_attention/kernels/aic/aic_qk_matmul.cpp (100%) rename tests/device_tests/{ => a2a3}/tensormap_and_ringbuffer/batch_paged_attention/kernels/aiv/aiv_hub.cpp (100%) rename tests/device_tests/{ => a2a3}/tensormap_and_ringbuffer/batch_paged_attention/kernels/aiv/aiv_online_update.cpp (100%) rename tests/device_tests/{ => a2a3}/tensormap_and_ringbuffer/batch_paged_attention/kernels/aiv/aiv_softmax_prepare.cpp (100%) rename tests/device_tests/{ => a2a3}/tensormap_and_ringbuffer/batch_paged_attention/kernels/kernel_config.py (100%) rename tests/device_tests/{ => a2a3}/tensormap_and_ringbuffer/batch_paged_attention/kernels/orchestration/paged_attention_orch.cpp (100%) rename tests/device_tests/{ => a2a3}/tensormap_and_ringbuffer/benchmark_bgemm/golden.py (100%) rename tests/device_tests/{ => a2a3}/tensormap_and_ringbuffer/benchmark_bgemm/kernels/aic/kernel_gemm_tile.cpp (100%) rename tests/device_tests/{ => a2a3}/tensormap_and_ringbuffer/benchmark_bgemm/kernels/aiv/kernel_tile_add.cpp (100%) rename tests/device_tests/{ => 
a2a3}/tensormap_and_ringbuffer/benchmark_bgemm/kernels/kernel_config.py (100%) rename tests/device_tests/{ => a2a3}/tensormap_and_ringbuffer/benchmark_bgemm/kernels/orchestration/bgemm_orch.cpp (100%) rename tests/device_tests/{ => a2a3}/tensormap_and_ringbuffer/paged_attention/golden.py (100%) rename tests/device_tests/{tensormap_and_ringbuffer/paged_attention_unroll => a2a3/tensormap_and_ringbuffer/paged_attention}/kernels/aic/aic_hub.cpp (100%) rename tests/device_tests/{ => a2a3}/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_pv_matmul.cpp (100%) rename tests/device_tests/{ => a2a3}/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_qk_matmul.cpp (100%) rename tests/device_tests/{tensormap_and_ringbuffer/paged_attention_unroll => a2a3/tensormap_and_ringbuffer/paged_attention}/kernels/aiv/aiv_hub.cpp (100%) rename tests/device_tests/{ => a2a3}/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_online_update.cpp (100%) rename tests/device_tests/{ => a2a3}/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp (100%) rename tests/device_tests/{ => a2a3}/tensormap_and_ringbuffer/paged_attention/kernels/kernel_config.py (100%) rename tests/device_tests/{ => a2a3}/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp (100%) rename tests/device_tests/{ => a2a3}/tensormap_and_ringbuffer/paged_attention_unroll/golden.py (100%) create mode 100644 tests/device_tests/a2a3/tensormap_and_ringbuffer/paged_attention_unroll/kernels/aic/aic_hub.cpp rename tests/device_tests/{ => a2a3}/tensormap_and_ringbuffer/paged_attention_unroll/kernels/aic/aic_pv_matmul.cpp (100%) rename tests/device_tests/{ => a2a3}/tensormap_and_ringbuffer/paged_attention_unroll/kernels/aic/aic_qk_matmul.cpp (100%) create mode 100644 tests/device_tests/a2a3/tensormap_and_ringbuffer/paged_attention_unroll/kernels/aiv/aiv_hub.cpp rename tests/device_tests/{ => 
a2a3}/tensormap_and_ringbuffer/paged_attention_unroll/kernels/aiv/aiv_online_update.cpp (100%) rename tests/device_tests/{ => a2a3}/tensormap_and_ringbuffer/paged_attention_unroll/kernels/aiv/aiv_softmax_prepare.cpp (100%) rename tests/device_tests/{ => a2a3}/tensormap_and_ringbuffer/paged_attention_unroll/kernels/kernel_config.py (100%) rename tests/device_tests/{ => a2a3}/tensormap_and_ringbuffer/paged_attention_unroll/kernels/orchestration/paged_attention_orch.cpp (100%) create mode 100644 tests/device_tests/a5/host_build_graph/paged_attention/README.md create mode 100644 tests/device_tests/a5/host_build_graph/paged_attention/golden.py create mode 100644 tests/device_tests/a5/host_build_graph/paged_attention/kernels/aic/aic_pv_matmul.cpp create mode 100644 tests/device_tests/a5/host_build_graph/paged_attention/kernels/aic/aic_qk_matmul.cpp create mode 100644 tests/device_tests/a5/host_build_graph/paged_attention/kernels/aiv/aiv_online_update.cpp create mode 100644 tests/device_tests/a5/host_build_graph/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp create mode 100644 tests/device_tests/a5/host_build_graph/paged_attention/kernels/kernel_config.py create mode 100644 tests/device_tests/a5/host_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp create mode 100644 tests/device_tests/a5/tensormap_and_ringbuffer/paged_attention/golden.py create mode 100644 tests/device_tests/a5/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_hub.cpp create mode 100644 tests/device_tests/a5/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_pv_matmul.cpp create mode 100644 tests/device_tests/a5/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_qk_matmul.cpp create mode 100644 tests/device_tests/a5/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_hub.cpp create mode 100644 tests/device_tests/a5/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_online_update.cpp create mode 100644 
tests/device_tests/a5/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp create mode 100644 tests/device_tests/a5/tensormap_and_ringbuffer/paged_attention/kernels/kernel_config.py create mode 100644 tests/device_tests/a5/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp diff --git a/CLAUDE.md b/CLAUDE.md index 330d8991..ea8e3e2e 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -35,8 +35,8 @@ PTO Runtime compiles three independent programs (Host `.so`, AICPU `.so`, AICore ### Run a single example ```bash python examples/scripts/run_example.py \ - -k examples/host_build_graph/vector_example/kernels \ - -g examples/host_build_graph/vector_example/golden.py \ + -k examples/a2a3/host_build_graph/vector_example/kernels \ + -g examples/a2a3/host_build_graph/vector_example/golden.py \ -p a2a3sim ``` diff --git a/ci.sh b/ci.sh index e4d9db03..17541d88 100755 --- a/ci.sh +++ b/ci.sh @@ -200,8 +200,10 @@ DEVICE_TESTS_DIR="tests/device_tests" declare -a HW_TASK_NAMES=() declare -a HW_TASK_DIRS=() +declare -a HW_TASK_PLATS=() declare -a SIM_TASK_NAMES=() declare -a SIM_TASK_DIRS=() +declare -a SIM_TASK_PLATS=() # Discover examples while IFS= read -r -d '' example_dir; do @@ -211,15 +213,21 @@ while IFS= read -r -d '' example_dir; do [[ -f "$kernel_config" && -f "$golden" ]] || continue example_name="${example_dir#$EXAMPLES_DIR/}" - example_runtime="${example_name%%/*}" # Extract runtime from path + example_arch="${example_name%%/*}" # Extract arch (a2a3/a5) from path + example_rest="${example_name#*/}" + example_runtime="${example_rest%%/*}" # Extract runtime from path # Filter by runtime if specified - if [[ -n "$RUNTIME" && "$example_name" != "$RUNTIME"/* ]]; then + if [[ -n "$RUNTIME" && "$example_runtime" != "$RUNTIME" ]]; then continue fi - # Filter by platform's supported runtimes + # Filter by platform's arch and supported runtimes if [[ -n "$PLATFORM" ]]; then + platform_base="${PLATFORM%sim}" + if [[ 
"$example_arch" != "$platform_base" ]]; then + continue # Skip examples not matching platform arch + fi platform_runtimes="$(get_platform_runtimes "$PLATFORM")" if [[ ! " $platform_runtimes " =~ " $example_runtime " ]]; then continue # Skip unsupported runtime for this platform @@ -230,18 +238,23 @@ while IFS= read -r -d '' example_dir; do if [[ "$PLATFORM" =~ sim$ ]]; then SIM_TASK_NAMES+=("example:${example_name}") SIM_TASK_DIRS+=("${example_dir}") + SIM_TASK_PLATS+=("${PLATFORM}") else HW_TASK_NAMES+=("example:${example_name}") HW_TASK_DIRS+=("${example_dir}") + HW_TASK_PLATS+=("${PLATFORM}") fi elif [[ "$OS" == "Darwin" ]]; then SIM_TASK_NAMES+=("example:${example_name}") SIM_TASK_DIRS+=("${example_dir}") + SIM_TASK_PLATS+=("${example_arch}sim") else HW_TASK_NAMES+=("example:${example_name}") HW_TASK_DIRS+=("${example_dir}") + HW_TASK_PLATS+=("${example_arch}") SIM_TASK_NAMES+=("example:${example_name}") SIM_TASK_DIRS+=("${example_dir}") + SIM_TASK_PLATS+=("${example_arch}sim") fi done < <(find "$EXAMPLES_DIR" -mindepth 1 -type d -print0 | sort -z) @@ -257,15 +270,21 @@ if [[ -d "$DEVICE_TESTS_DIR" ]]; then golden="${test_dir}/golden.py" [[ -f "$kernel_config" && -f "$golden" ]] || continue test_name="${test_dir#$DEVICE_TESTS_DIR/}" - test_runtime="${test_name%%/*}" # Extract runtime from path + test_arch="${test_name%%/*}" # Extract arch (a2a3/a5) from path + test_rest="${test_name#*/}" + test_runtime="${test_rest%%/*}" # Extract runtime from path # Filter by runtime if specified - if [[ -n "$RUNTIME" && "$test_name" != "$RUNTIME"/* ]]; then + if [[ -n "$RUNTIME" && "$test_runtime" != "$RUNTIME" ]]; then continue fi - # Filter by platform's supported runtimes + # Filter by platform's arch and supported runtimes if [[ -n "$PLATFORM" ]]; then + platform_base="${PLATFORM%sim}" + if [[ "$test_arch" != "$platform_base" ]]; then + continue # Skip tests not matching platform arch + fi platform_runtimes="$(get_platform_runtimes "$PLATFORM")" if [[ ! 
" $platform_runtimes " =~ " $test_runtime " ]]; then continue # Skip unsupported runtime for this platform @@ -274,6 +293,7 @@ if [[ -d "$DEVICE_TESTS_DIR" ]]; then HW_TASK_NAMES+=("device_test:${test_name}") HW_TASK_DIRS+=("${test_dir}") + HW_TASK_PLATS+=("${PLATFORM:-${test_arch}}") done < <(find "$DEVICE_TESTS_DIR" -mindepth 1 -type d -print0 | sort -z) else echo "Skipping device tests (hardware platforms only)" @@ -282,10 +302,6 @@ fi echo "Discovered ${#HW_TASK_NAMES[@]} hardware tasks, ${#SIM_TASK_NAMES[@]} simulation tasks" -# Determine platforms for execution -HW_PLATFORM="${PLATFORM:-a2a3}" -SIM_PLATFORM="${PLATFORM:-a2a3sim}" - MAX_RETRIES=3 # ---- Unified task runner ---- @@ -293,7 +309,7 @@ MAX_RETRIES=3 # Log naming: ${safe_name}_${platform}_attempt${attempt}.log # Result format: name|platform|PASS/FAIL|device:X|attempt:N|Xs run_task() { - local name="$1" dir="$2" platform="$3" attempt="$4" device_id="$5" + local name="$1" dir="$2" platform="$3" attempt="$4" device_id="$5" print_log_on_fail="${6:-true}" local safe_name="${name//[:\/]/_}" local task_log="${LOG_DIR}/${safe_name}_${platform}_attempt${attempt}.log" local start_time=$SECONDS @@ -319,9 +335,11 @@ run_task() { else status="FAIL" echo "[${platform}${device_id:+:dev${device_id}}] FAIL: $name (${elapsed}s)" - echo "--- LOG: $name (attempt $attempt) ---" - cat "$task_log" - echo "--- END ---" + if [[ "$print_log_on_fail" == "true" ]]; then + echo "--- LOG: $name (attempt $attempt) ---" + cat "$task_log" + echo "--- END ---" + fi fi echo "${name}|${platform}|${status}|device:${device_id:-sim}|attempt:${attempt}|${elapsed}s" \ >> "$RESULTS_FILE" @@ -348,7 +366,7 @@ run_sim_tasks() { local -a pids=() for idx in "${indices[@]}"; do ( - if run_task "${SIM_TASK_NAMES[$idx]}" "${SIM_TASK_DIRS[$idx]}" "$SIM_PLATFORM" "$attempt"; then + if run_task "${SIM_TASK_NAMES[$idx]}" "${SIM_TASK_DIRS[$idx]}" "${SIM_TASK_PLATS[$idx]}" "$attempt"; then echo "${idx}|PASS" >> "$sim_marker" else echo "${idx}|FAIL" >> 
"$sim_marker" @@ -359,7 +377,7 @@ run_sim_tasks() { for pid in "${pids[@]}"; do wait "$pid" 2>/dev/null || true; done else for idx in "${indices[@]}"; do - if run_task "${SIM_TASK_NAMES[$idx]}" "${SIM_TASK_DIRS[$idx]}" "$SIM_PLATFORM" "$attempt"; then + if run_task "${SIM_TASK_NAMES[$idx]}" "${SIM_TASK_DIRS[$idx]}" "${SIM_TASK_PLATS[$idx]}" "$attempt"; then echo "${idx}|PASS" >> "$sim_marker" else echo "${idx}|FAIL" >> "$sim_marker" @@ -406,7 +424,7 @@ run_hw_tasks() { IFS=':' read -r idx attempt <<< "$entry" - if run_task "${HW_TASK_NAMES[$idx]}" "${HW_TASK_DIRS[$idx]}" "$HW_PLATFORM" "$attempt" "$device_id"; then + if run_task "${HW_TASK_NAMES[$idx]}" "${HW_TASK_DIRS[$idx]}" "${HW_TASK_PLATS[$idx]}" "$attempt" "$device_id" "false"; then echo "${idx}|PASS" >> "$hw_marker" else next=$((attempt + 1)) @@ -414,9 +432,14 @@ run_hw_tasks() { flock "$lock" bash -c "echo '${idx}:${next}' >> \"$queue\"" else echo "${idx}|FAIL" >> "$hw_marker" + local safe_name="${HW_TASK_NAMES[$idx]//[:\/]/_}" + local last_log="${LOG_DIR}/${safe_name}_${HW_TASK_PLATS[$idx]}_attempt${attempt}.log" + echo "--- LOG: ${HW_TASK_NAMES[$idx]} (attempt $attempt) ---" + cat "$last_log" + echo "--- END ---" + echo "[${HW_TASK_PLATS[$idx]}:dev${device_id}] Device quarantined after exhausting retries" + break fi - echo "[${HW_PLATFORM}:dev${device_id}] Device quarantined after failure" - break fi done ) & @@ -606,7 +629,7 @@ for i in "${!TASK_ORDER[@]}"; do platform="${FINAL_PLATFORM[$i]}" device="${FINAL_DEVICE[$i]}" - attempt="${FINAL_ATTEMPT[$i]}" + attempt=$(( FINAL_ATTEMPT[$i] + 1 )) timing="${FINAL_TIMING[$i]}" if [[ "$result" == "FAIL" ]]; then diff --git a/examples/aicpu_build_graph/bgemm/README.md b/examples/a2a3/aicpu_build_graph/bgemm/README.md similarity index 100% rename from examples/aicpu_build_graph/bgemm/README.md rename to examples/a2a3/aicpu_build_graph/bgemm/README.md diff --git a/examples/aicpu_build_graph/bgemm/golden.py b/examples/a2a3/aicpu_build_graph/bgemm/golden.py 
similarity index 100% rename from examples/aicpu_build_graph/bgemm/golden.py rename to examples/a2a3/aicpu_build_graph/bgemm/golden.py diff --git a/examples/aicpu_build_graph/bgemm/kernels/aic/kernel_gemm_tile.cpp b/examples/a2a3/aicpu_build_graph/bgemm/kernels/aic/kernel_gemm_tile.cpp similarity index 100% rename from examples/aicpu_build_graph/bgemm/kernels/aic/kernel_gemm_tile.cpp rename to examples/a2a3/aicpu_build_graph/bgemm/kernels/aic/kernel_gemm_tile.cpp diff --git a/examples/aicpu_build_graph/bgemm/kernels/aiv/kernel_tile_add.cpp b/examples/a2a3/aicpu_build_graph/bgemm/kernels/aiv/kernel_tile_add.cpp similarity index 100% rename from examples/aicpu_build_graph/bgemm/kernels/aiv/kernel_tile_add.cpp rename to examples/a2a3/aicpu_build_graph/bgemm/kernels/aiv/kernel_tile_add.cpp diff --git a/examples/aicpu_build_graph/bgemm/kernels/kernel_config.py b/examples/a2a3/aicpu_build_graph/bgemm/kernels/kernel_config.py similarity index 100% rename from examples/aicpu_build_graph/bgemm/kernels/kernel_config.py rename to examples/a2a3/aicpu_build_graph/bgemm/kernels/kernel_config.py diff --git a/examples/aicpu_build_graph/bgemm/kernels/orchestration/bgemm_orch.cpp b/examples/a2a3/aicpu_build_graph/bgemm/kernels/orchestration/bgemm_orch.cpp similarity index 100% rename from examples/aicpu_build_graph/bgemm/kernels/orchestration/bgemm_orch.cpp rename to examples/a2a3/aicpu_build_graph/bgemm/kernels/orchestration/bgemm_orch.cpp diff --git a/examples/aicpu_build_graph/docs/INCORE_ORCHESTRATION_GUIDE.md b/examples/a2a3/aicpu_build_graph/docs/INCORE_ORCHESTRATION_GUIDE.md similarity index 100% rename from examples/aicpu_build_graph/docs/INCORE_ORCHESTRATION_GUIDE.md rename to examples/a2a3/aicpu_build_graph/docs/INCORE_ORCHESTRATION_GUIDE.md diff --git a/examples/aicpu_build_graph/vector_example/README.md b/examples/a2a3/aicpu_build_graph/vector_example/README.md similarity index 100% rename from examples/aicpu_build_graph/vector_example/README.md rename to 
examples/a2a3/aicpu_build_graph/vector_example/README.md diff --git a/examples/aicpu_build_graph/vector_example/golden.py b/examples/a2a3/aicpu_build_graph/vector_example/golden.py similarity index 100% rename from examples/aicpu_build_graph/vector_example/golden.py rename to examples/a2a3/aicpu_build_graph/vector_example/golden.py diff --git a/examples/aicpu_build_graph/vector_example/kernels/aiv/kernel_add.cpp b/examples/a2a3/aicpu_build_graph/vector_example/kernels/aiv/kernel_add.cpp similarity index 100% rename from examples/aicpu_build_graph/vector_example/kernels/aiv/kernel_add.cpp rename to examples/a2a3/aicpu_build_graph/vector_example/kernels/aiv/kernel_add.cpp diff --git a/examples/aicpu_build_graph/vector_example/kernels/aiv/kernel_add_scalar.cpp b/examples/a2a3/aicpu_build_graph/vector_example/kernels/aiv/kernel_add_scalar.cpp similarity index 100% rename from examples/aicpu_build_graph/vector_example/kernels/aiv/kernel_add_scalar.cpp rename to examples/a2a3/aicpu_build_graph/vector_example/kernels/aiv/kernel_add_scalar.cpp diff --git a/examples/aicpu_build_graph/vector_example/kernels/aiv/kernel_mul.cpp b/examples/a2a3/aicpu_build_graph/vector_example/kernels/aiv/kernel_mul.cpp similarity index 100% rename from examples/aicpu_build_graph/vector_example/kernels/aiv/kernel_mul.cpp rename to examples/a2a3/aicpu_build_graph/vector_example/kernels/aiv/kernel_mul.cpp diff --git a/examples/aicpu_build_graph/vector_example/kernels/kernel_config.py b/examples/a2a3/aicpu_build_graph/vector_example/kernels/kernel_config.py similarity index 100% rename from examples/aicpu_build_graph/vector_example/kernels/kernel_config.py rename to examples/a2a3/aicpu_build_graph/vector_example/kernels/kernel_config.py diff --git a/examples/aicpu_build_graph/vector_example/kernels/orchestration/orchestration.cpp b/examples/a2a3/aicpu_build_graph/vector_example/kernels/orchestration/orchestration.cpp similarity index 100% rename from 
examples/aicpu_build_graph/vector_example/kernels/orchestration/orchestration.cpp rename to examples/a2a3/aicpu_build_graph/vector_example/kernels/orchestration/orchestration.cpp diff --git a/examples/host_build_graph/bgemm/README.md b/examples/a2a3/host_build_graph/bgemm/README.md similarity index 100% rename from examples/host_build_graph/bgemm/README.md rename to examples/a2a3/host_build_graph/bgemm/README.md diff --git a/examples/host_build_graph/bgemm/golden.py b/examples/a2a3/host_build_graph/bgemm/golden.py similarity index 100% rename from examples/host_build_graph/bgemm/golden.py rename to examples/a2a3/host_build_graph/bgemm/golden.py diff --git a/examples/host_build_graph/bgemm/kernels/aic/kernel_gemm_tile.cpp b/examples/a2a3/host_build_graph/bgemm/kernels/aic/kernel_gemm_tile.cpp similarity index 100% rename from examples/host_build_graph/bgemm/kernels/aic/kernel_gemm_tile.cpp rename to examples/a2a3/host_build_graph/bgemm/kernels/aic/kernel_gemm_tile.cpp diff --git a/examples/host_build_graph/bgemm/kernels/aiv/kernel_tile_add.cpp b/examples/a2a3/host_build_graph/bgemm/kernels/aiv/kernel_tile_add.cpp similarity index 100% rename from examples/host_build_graph/bgemm/kernels/aiv/kernel_tile_add.cpp rename to examples/a2a3/host_build_graph/bgemm/kernels/aiv/kernel_tile_add.cpp diff --git a/examples/host_build_graph/bgemm/kernels/kernel_config.py b/examples/a2a3/host_build_graph/bgemm/kernels/kernel_config.py similarity index 100% rename from examples/host_build_graph/bgemm/kernels/kernel_config.py rename to examples/a2a3/host_build_graph/bgemm/kernels/kernel_config.py diff --git a/examples/host_build_graph/bgemm/kernels/orchestration/bgemm_orch.cpp b/examples/a2a3/host_build_graph/bgemm/kernels/orchestration/bgemm_orch.cpp similarity index 100% rename from examples/host_build_graph/bgemm/kernels/orchestration/bgemm_orch.cpp rename to examples/a2a3/host_build_graph/bgemm/kernels/orchestration/bgemm_orch.cpp diff --git 
a/examples/host_build_graph/docs/INCORE_ORCHESTRATION_GUIDE.md b/examples/a2a3/host_build_graph/docs/INCORE_ORCHESTRATION_GUIDE.md similarity index 100% rename from examples/host_build_graph/docs/INCORE_ORCHESTRATION_GUIDE.md rename to examples/a2a3/host_build_graph/docs/INCORE_ORCHESTRATION_GUIDE.md diff --git a/examples/host_build_graph/matmul/golden.py b/examples/a2a3/host_build_graph/matmul/golden.py similarity index 100% rename from examples/host_build_graph/matmul/golden.py rename to examples/a2a3/host_build_graph/matmul/golden.py diff --git a/examples/host_build_graph/matmul/kernels/aic/kernel_matmul.cpp b/examples/a2a3/host_build_graph/matmul/kernels/aic/kernel_matmul.cpp similarity index 100% rename from examples/host_build_graph/matmul/kernels/aic/kernel_matmul.cpp rename to examples/a2a3/host_build_graph/matmul/kernels/aic/kernel_matmul.cpp diff --git a/examples/host_build_graph/matmul/kernels/aiv/kernel_add_exp.cpp b/examples/a2a3/host_build_graph/matmul/kernels/aiv/kernel_add_exp.cpp similarity index 100% rename from examples/host_build_graph/matmul/kernels/aiv/kernel_add_exp.cpp rename to examples/a2a3/host_build_graph/matmul/kernels/aiv/kernel_add_exp.cpp diff --git a/examples/host_build_graph/matmul/kernels/aiv/kernel_log_sqrt.cpp b/examples/a2a3/host_build_graph/matmul/kernels/aiv/kernel_log_sqrt.cpp similarity index 100% rename from examples/host_build_graph/matmul/kernels/aiv/kernel_log_sqrt.cpp rename to examples/a2a3/host_build_graph/matmul/kernels/aiv/kernel_log_sqrt.cpp diff --git a/examples/host_build_graph/matmul/kernels/kernel_config.py b/examples/a2a3/host_build_graph/matmul/kernels/kernel_config.py similarity index 100% rename from examples/host_build_graph/matmul/kernels/kernel_config.py rename to examples/a2a3/host_build_graph/matmul/kernels/kernel_config.py diff --git a/examples/host_build_graph/matmul/kernels/orchestration/matmul_orch.cpp b/examples/a2a3/host_build_graph/matmul/kernels/orchestration/matmul_orch.cpp similarity index 
100% rename from examples/host_build_graph/matmul/kernels/orchestration/matmul_orch.cpp rename to examples/a2a3/host_build_graph/matmul/kernels/orchestration/matmul_orch.cpp diff --git a/examples/host_build_graph/paged_attention/golden.py b/examples/a2a3/host_build_graph/paged_attention/golden.py similarity index 100% rename from examples/host_build_graph/paged_attention/golden.py rename to examples/a2a3/host_build_graph/paged_attention/golden.py diff --git a/examples/host_build_graph/paged_attention/kernels/aic/aic_pv_matmul.cpp b/examples/a2a3/host_build_graph/paged_attention/kernels/aic/aic_pv_matmul.cpp similarity index 100% rename from examples/host_build_graph/paged_attention/kernels/aic/aic_pv_matmul.cpp rename to examples/a2a3/host_build_graph/paged_attention/kernels/aic/aic_pv_matmul.cpp diff --git a/examples/host_build_graph/paged_attention/kernels/aic/aic_qk_matmul.cpp b/examples/a2a3/host_build_graph/paged_attention/kernels/aic/aic_qk_matmul.cpp similarity index 100% rename from examples/host_build_graph/paged_attention/kernels/aic/aic_qk_matmul.cpp rename to examples/a2a3/host_build_graph/paged_attention/kernels/aic/aic_qk_matmul.cpp diff --git a/examples/host_build_graph/paged_attention/kernels/aiv/aiv_online_update.cpp b/examples/a2a3/host_build_graph/paged_attention/kernels/aiv/aiv_online_update.cpp similarity index 100% rename from examples/host_build_graph/paged_attention/kernels/aiv/aiv_online_update.cpp rename to examples/a2a3/host_build_graph/paged_attention/kernels/aiv/aiv_online_update.cpp diff --git a/examples/host_build_graph/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp b/examples/a2a3/host_build_graph/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp similarity index 100% rename from examples/host_build_graph/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp rename to examples/a2a3/host_build_graph/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp diff --git a/examples/host_build_graph/paged_attention/kernels/kernel_config.py 
b/examples/a2a3/host_build_graph/paged_attention/kernels/kernel_config.py similarity index 100% rename from examples/host_build_graph/paged_attention/kernels/kernel_config.py rename to examples/a2a3/host_build_graph/paged_attention/kernels/kernel_config.py diff --git a/examples/host_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp b/examples/a2a3/host_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp similarity index 100% rename from examples/host_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp rename to examples/a2a3/host_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp diff --git a/examples/host_build_graph/vector_example/README.md b/examples/a2a3/host_build_graph/vector_example/README.md similarity index 100% rename from examples/host_build_graph/vector_example/README.md rename to examples/a2a3/host_build_graph/vector_example/README.md diff --git a/examples/host_build_graph/vector_example/golden.py b/examples/a2a3/host_build_graph/vector_example/golden.py similarity index 100% rename from examples/host_build_graph/vector_example/golden.py rename to examples/a2a3/host_build_graph/vector_example/golden.py diff --git a/examples/host_build_graph/vector_example/kernels/aiv/kernel_add.cpp b/examples/a2a3/host_build_graph/vector_example/kernels/aiv/kernel_add.cpp similarity index 100% rename from examples/host_build_graph/vector_example/kernels/aiv/kernel_add.cpp rename to examples/a2a3/host_build_graph/vector_example/kernels/aiv/kernel_add.cpp diff --git a/examples/host_build_graph/vector_example/kernels/aiv/kernel_add_scalar.cpp b/examples/a2a3/host_build_graph/vector_example/kernels/aiv/kernel_add_scalar.cpp similarity index 100% rename from examples/host_build_graph/vector_example/kernels/aiv/kernel_add_scalar.cpp rename to examples/a2a3/host_build_graph/vector_example/kernels/aiv/kernel_add_scalar.cpp diff --git 
a/examples/host_build_graph/vector_example/kernels/aiv/kernel_mul.cpp b/examples/a2a3/host_build_graph/vector_example/kernels/aiv/kernel_mul.cpp similarity index 100% rename from examples/host_build_graph/vector_example/kernels/aiv/kernel_mul.cpp rename to examples/a2a3/host_build_graph/vector_example/kernels/aiv/kernel_mul.cpp diff --git a/examples/host_build_graph/vector_example/kernels/kernel_config.py b/examples/a2a3/host_build_graph/vector_example/kernels/kernel_config.py similarity index 100% rename from examples/host_build_graph/vector_example/kernels/kernel_config.py rename to examples/a2a3/host_build_graph/vector_example/kernels/kernel_config.py diff --git a/examples/host_build_graph/vector_example/kernels/orchestration/example_orch.cpp b/examples/a2a3/host_build_graph/vector_example/kernels/orchestration/example_orch.cpp similarity index 100% rename from examples/host_build_graph/vector_example/kernels/orchestration/example_orch.cpp rename to examples/a2a3/host_build_graph/vector_example/kernels/orchestration/example_orch.cpp diff --git a/examples/tensormap_and_ringbuffer/batch_paged_attention/golden.py b/examples/a2a3/tensormap_and_ringbuffer/batch_paged_attention/golden.py similarity index 100% rename from examples/tensormap_and_ringbuffer/batch_paged_attention/golden.py rename to examples/a2a3/tensormap_and_ringbuffer/batch_paged_attention/golden.py diff --git a/examples/tensormap_and_ringbuffer/batch_paged_attention/kernels/aic/aic_hub.cpp b/examples/a2a3/tensormap_and_ringbuffer/batch_paged_attention/kernels/aic/aic_hub.cpp similarity index 100% rename from examples/tensormap_and_ringbuffer/batch_paged_attention/kernels/aic/aic_hub.cpp rename to examples/a2a3/tensormap_and_ringbuffer/batch_paged_attention/kernels/aic/aic_hub.cpp diff --git a/examples/tensormap_and_ringbuffer/batch_paged_attention/kernels/aic/aic_pv_matmul.cpp b/examples/a2a3/tensormap_and_ringbuffer/batch_paged_attention/kernels/aic/aic_pv_matmul.cpp similarity index 100% rename from 
examples/tensormap_and_ringbuffer/batch_paged_attention/kernels/aic/aic_pv_matmul.cpp rename to examples/a2a3/tensormap_and_ringbuffer/batch_paged_attention/kernels/aic/aic_pv_matmul.cpp diff --git a/examples/tensormap_and_ringbuffer/batch_paged_attention/kernels/aic/aic_qk_matmul.cpp b/examples/a2a3/tensormap_and_ringbuffer/batch_paged_attention/kernels/aic/aic_qk_matmul.cpp similarity index 100% rename from examples/tensormap_and_ringbuffer/batch_paged_attention/kernels/aic/aic_qk_matmul.cpp rename to examples/a2a3/tensormap_and_ringbuffer/batch_paged_attention/kernels/aic/aic_qk_matmul.cpp diff --git a/examples/tensormap_and_ringbuffer/batch_paged_attention/kernels/aiv/aiv_hub.cpp b/examples/a2a3/tensormap_and_ringbuffer/batch_paged_attention/kernels/aiv/aiv_hub.cpp similarity index 100% rename from examples/tensormap_and_ringbuffer/batch_paged_attention/kernels/aiv/aiv_hub.cpp rename to examples/a2a3/tensormap_and_ringbuffer/batch_paged_attention/kernels/aiv/aiv_hub.cpp diff --git a/examples/tensormap_and_ringbuffer/batch_paged_attention/kernels/aiv/aiv_online_update.cpp b/examples/a2a3/tensormap_and_ringbuffer/batch_paged_attention/kernels/aiv/aiv_online_update.cpp similarity index 100% rename from examples/tensormap_and_ringbuffer/batch_paged_attention/kernels/aiv/aiv_online_update.cpp rename to examples/a2a3/tensormap_and_ringbuffer/batch_paged_attention/kernels/aiv/aiv_online_update.cpp diff --git a/examples/tensormap_and_ringbuffer/batch_paged_attention/kernels/aiv/aiv_softmax_prepare.cpp b/examples/a2a3/tensormap_and_ringbuffer/batch_paged_attention/kernels/aiv/aiv_softmax_prepare.cpp similarity index 100% rename from examples/tensormap_and_ringbuffer/batch_paged_attention/kernels/aiv/aiv_softmax_prepare.cpp rename to examples/a2a3/tensormap_and_ringbuffer/batch_paged_attention/kernels/aiv/aiv_softmax_prepare.cpp diff --git a/examples/tensormap_and_ringbuffer/batch_paged_attention/kernels/kernel_config.py 
b/examples/a2a3/tensormap_and_ringbuffer/batch_paged_attention/kernels/kernel_config.py similarity index 100% rename from examples/tensormap_and_ringbuffer/batch_paged_attention/kernels/kernel_config.py rename to examples/a2a3/tensormap_and_ringbuffer/batch_paged_attention/kernels/kernel_config.py diff --git a/examples/tensormap_and_ringbuffer/batch_paged_attention/kernels/orchestration/paged_attention_orch.cpp b/examples/a2a3/tensormap_and_ringbuffer/batch_paged_attention/kernels/orchestration/paged_attention_orch.cpp similarity index 100% rename from examples/tensormap_and_ringbuffer/batch_paged_attention/kernels/orchestration/paged_attention_orch.cpp rename to examples/a2a3/tensormap_and_ringbuffer/batch_paged_attention/kernels/orchestration/paged_attention_orch.cpp diff --git a/examples/tensormap_and_ringbuffer/bgemm/golden.py b/examples/a2a3/tensormap_and_ringbuffer/bgemm/golden.py similarity index 100% rename from examples/tensormap_and_ringbuffer/bgemm/golden.py rename to examples/a2a3/tensormap_and_ringbuffer/bgemm/golden.py diff --git a/examples/tensormap_and_ringbuffer/bgemm/kernels/aic/kernel_gemm_tile.cpp b/examples/a2a3/tensormap_and_ringbuffer/bgemm/kernels/aic/kernel_gemm_tile.cpp similarity index 100% rename from examples/tensormap_and_ringbuffer/bgemm/kernels/aic/kernel_gemm_tile.cpp rename to examples/a2a3/tensormap_and_ringbuffer/bgemm/kernels/aic/kernel_gemm_tile.cpp diff --git a/examples/tensormap_and_ringbuffer/bgemm/kernels/aiv/kernel_tile_add.cpp b/examples/a2a3/tensormap_and_ringbuffer/bgemm/kernels/aiv/kernel_tile_add.cpp similarity index 100% rename from examples/tensormap_and_ringbuffer/bgemm/kernels/aiv/kernel_tile_add.cpp rename to examples/a2a3/tensormap_and_ringbuffer/bgemm/kernels/aiv/kernel_tile_add.cpp diff --git a/examples/tensormap_and_ringbuffer/bgemm/kernels/kernel_config.py b/examples/a2a3/tensormap_and_ringbuffer/bgemm/kernels/kernel_config.py similarity index 100% rename from 
examples/tensormap_and_ringbuffer/bgemm/kernels/kernel_config.py rename to examples/a2a3/tensormap_and_ringbuffer/bgemm/kernels/kernel_config.py diff --git a/examples/tensormap_and_ringbuffer/bgemm/kernels/orchestration/bgemm_orch.cpp b/examples/a2a3/tensormap_and_ringbuffer/bgemm/kernels/orchestration/bgemm_orch.cpp similarity index 100% rename from examples/tensormap_and_ringbuffer/bgemm/kernels/orchestration/bgemm_orch.cpp rename to examples/a2a3/tensormap_and_ringbuffer/bgemm/kernels/orchestration/bgemm_orch.cpp diff --git a/examples/tensormap_and_ringbuffer/docs/INCORE_ORCHESTRATION_GUIDE.md b/examples/a2a3/tensormap_and_ringbuffer/docs/INCORE_ORCHESTRATION_GUIDE.md similarity index 100% rename from examples/tensormap_and_ringbuffer/docs/INCORE_ORCHESTRATION_GUIDE.md rename to examples/a2a3/tensormap_and_ringbuffer/docs/INCORE_ORCHESTRATION_GUIDE.md diff --git a/examples/tensormap_and_ringbuffer/mixed_example/golden.py b/examples/a2a3/tensormap_and_ringbuffer/mixed_example/golden.py similarity index 100% rename from examples/tensormap_and_ringbuffer/mixed_example/golden.py rename to examples/a2a3/tensormap_and_ringbuffer/mixed_example/golden.py diff --git a/examples/tensormap_and_ringbuffer/mixed_example/kernels/aic/kernel_matmul.cpp b/examples/a2a3/tensormap_and_ringbuffer/mixed_example/kernels/aic/kernel_matmul.cpp similarity index 100% rename from examples/tensormap_and_ringbuffer/mixed_example/kernels/aic/kernel_matmul.cpp rename to examples/a2a3/tensormap_and_ringbuffer/mixed_example/kernels/aic/kernel_matmul.cpp diff --git a/examples/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_add.cpp b/examples/a2a3/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_add.cpp similarity index 100% rename from examples/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_add.cpp rename to examples/a2a3/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_add.cpp diff --git 
a/examples/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_add_standalone.cpp b/examples/a2a3/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_add_standalone.cpp similarity index 100% rename from examples/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_add_standalone.cpp rename to examples/a2a3/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_add_standalone.cpp diff --git a/examples/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_mul.cpp b/examples/a2a3/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_mul.cpp similarity index 100% rename from examples/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_mul.cpp rename to examples/a2a3/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_mul.cpp diff --git a/examples/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_mul_standalone.cpp b/examples/a2a3/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_mul_standalone.cpp similarity index 100% rename from examples/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_mul_standalone.cpp rename to examples/a2a3/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_mul_standalone.cpp diff --git a/examples/tensormap_and_ringbuffer/mixed_example/kernels/kernel_config.py b/examples/a2a3/tensormap_and_ringbuffer/mixed_example/kernels/kernel_config.py similarity index 100% rename from examples/tensormap_and_ringbuffer/mixed_example/kernels/kernel_config.py rename to examples/a2a3/tensormap_and_ringbuffer/mixed_example/kernels/kernel_config.py diff --git a/examples/tensormap_and_ringbuffer/mixed_example/kernels/orchestration/mixed_orch.cpp b/examples/a2a3/tensormap_and_ringbuffer/mixed_example/kernels/orchestration/mixed_orch.cpp similarity index 100% rename from examples/tensormap_and_ringbuffer/mixed_example/kernels/orchestration/mixed_orch.cpp rename to examples/a2a3/tensormap_and_ringbuffer/mixed_example/kernels/orchestration/mixed_orch.cpp diff --git 
a/examples/tensormap_and_ringbuffer/multi-round-paged-attention/golden.py b/examples/a2a3/tensormap_and_ringbuffer/multi-round-paged-attention/golden.py similarity index 100% rename from examples/tensormap_and_ringbuffer/multi-round-paged-attention/golden.py rename to examples/a2a3/tensormap_and_ringbuffer/multi-round-paged-attention/golden.py diff --git a/examples/tensormap_and_ringbuffer/multi-round-paged-attention/kernels/kernel_config.py b/examples/a2a3/tensormap_and_ringbuffer/multi-round-paged-attention/kernels/kernel_config.py similarity index 100% rename from examples/tensormap_and_ringbuffer/multi-round-paged-attention/kernels/kernel_config.py rename to examples/a2a3/tensormap_and_ringbuffer/multi-round-paged-attention/kernels/kernel_config.py diff --git a/examples/tensormap_and_ringbuffer/paged_attention/TFILLPAD_INPLACE_BUG.md b/examples/a2a3/tensormap_and_ringbuffer/paged_attention/TFILLPAD_INPLACE_BUG.md similarity index 100% rename from examples/tensormap_and_ringbuffer/paged_attention/TFILLPAD_INPLACE_BUG.md rename to examples/a2a3/tensormap_and_ringbuffer/paged_attention/TFILLPAD_INPLACE_BUG.md diff --git a/examples/tensormap_and_ringbuffer/paged_attention/golden.py b/examples/a2a3/tensormap_and_ringbuffer/paged_attention/golden.py similarity index 100% rename from examples/tensormap_and_ringbuffer/paged_attention/golden.py rename to examples/a2a3/tensormap_and_ringbuffer/paged_attention/golden.py diff --git a/examples/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_hub.cpp b/examples/a2a3/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_hub.cpp similarity index 100% rename from examples/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_hub.cpp rename to examples/a2a3/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_hub.cpp diff --git a/examples/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_pv_matmul.cpp b/examples/a2a3/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_pv_matmul.cpp similarity index 
100% rename from examples/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_pv_matmul.cpp rename to examples/a2a3/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_pv_matmul.cpp diff --git a/examples/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_qk_matmul.cpp b/examples/a2a3/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_qk_matmul.cpp similarity index 100% rename from examples/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_qk_matmul.cpp rename to examples/a2a3/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_qk_matmul.cpp diff --git a/examples/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_hub.cpp b/examples/a2a3/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_hub.cpp similarity index 100% rename from examples/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_hub.cpp rename to examples/a2a3/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_hub.cpp diff --git a/examples/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_online_update.cpp b/examples/a2a3/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_online_update.cpp similarity index 100% rename from examples/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_online_update.cpp rename to examples/a2a3/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_online_update.cpp diff --git a/examples/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp b/examples/a2a3/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp similarity index 100% rename from examples/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp rename to examples/a2a3/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp diff --git a/examples/tensormap_and_ringbuffer/paged_attention/kernels/kernel_config.py b/examples/a2a3/tensormap_and_ringbuffer/paged_attention/kernels/kernel_config.py similarity index 100% rename from 
examples/tensormap_and_ringbuffer/paged_attention/kernels/kernel_config.py rename to examples/a2a3/tensormap_and_ringbuffer/paged_attention/kernels/kernel_config.py diff --git a/examples/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp b/examples/a2a3/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp similarity index 100% rename from examples/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp rename to examples/a2a3/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp diff --git a/examples/tensormap_and_ringbuffer/vector_example/golden.py b/examples/a2a3/tensormap_and_ringbuffer/vector_example/golden.py similarity index 100% rename from examples/tensormap_and_ringbuffer/vector_example/golden.py rename to examples/a2a3/tensormap_and_ringbuffer/vector_example/golden.py diff --git a/examples/tensormap_and_ringbuffer/vector_example/kernels/aiv/kernel_add.cpp b/examples/a2a3/tensormap_and_ringbuffer/vector_example/kernels/aiv/kernel_add.cpp similarity index 100% rename from examples/tensormap_and_ringbuffer/vector_example/kernels/aiv/kernel_add.cpp rename to examples/a2a3/tensormap_and_ringbuffer/vector_example/kernels/aiv/kernel_add.cpp diff --git a/examples/tensormap_and_ringbuffer/vector_example/kernels/aiv/kernel_add_scalar.cpp b/examples/a2a3/tensormap_and_ringbuffer/vector_example/kernels/aiv/kernel_add_scalar.cpp similarity index 100% rename from examples/tensormap_and_ringbuffer/vector_example/kernels/aiv/kernel_add_scalar.cpp rename to examples/a2a3/tensormap_and_ringbuffer/vector_example/kernels/aiv/kernel_add_scalar.cpp diff --git a/examples/tensormap_and_ringbuffer/vector_example/kernels/aiv/kernel_mul.cpp b/examples/a2a3/tensormap_and_ringbuffer/vector_example/kernels/aiv/kernel_mul.cpp similarity index 100% rename from examples/tensormap_and_ringbuffer/vector_example/kernels/aiv/kernel_mul.cpp rename to 
examples/a2a3/tensormap_and_ringbuffer/vector_example/kernels/aiv/kernel_mul.cpp diff --git a/examples/tensormap_and_ringbuffer/vector_example/kernels/kernel_config.py b/examples/a2a3/tensormap_and_ringbuffer/vector_example/kernels/kernel_config.py similarity index 100% rename from examples/tensormap_and_ringbuffer/vector_example/kernels/kernel_config.py rename to examples/a2a3/tensormap_and_ringbuffer/vector_example/kernels/kernel_config.py diff --git a/examples/tensormap_and_ringbuffer/vector_example/kernels/orchestration/example_orchestration.cpp b/examples/a2a3/tensormap_and_ringbuffer/vector_example/kernels/orchestration/example_orchestration.cpp similarity index 100% rename from examples/tensormap_and_ringbuffer/vector_example/kernels/orchestration/example_orchestration.cpp rename to examples/a2a3/tensormap_and_ringbuffer/vector_example/kernels/orchestration/example_orchestration.cpp diff --git a/examples/a5/host_build_graph/paged_attention/golden.py b/examples/a5/host_build_graph/paged_attention/golden.py new file mode 100644 index 00000000..17fafcdb --- /dev/null +++ b/examples/a5/host_build_graph/paged_attention/golden.py @@ -0,0 +1,45 @@ +"""Paged Attention Golden - host_build_graph example (small scale, float16).""" + +from paged_attention_golden import ( + generate_inputs as _generate_inputs, + compute_golden, + run_golden_test, +) + +__outputs__ = ["out"] + +RTOL = 1e-2 +ATOL = 1e-2 + +ALL_CASES = { + "Case1": { + "batch": 1, + "num_heads": 16, + "kv_head_num": 1, + "head_dim": 16, + "block_size": 16, + "context_len": 16, + "max_model_len": 256, + "dtype": "float16", + }, + "Case2": { + "batch": 1, + "num_heads": 16, + "kv_head_num": 1, + "head_dim": 16, + "block_size": 16, + "context_len": 64, + "max_model_len": 256, + "dtype": "float16", + }, +} + +DEFAULT_CASE = "Case1" + + +def generate_inputs(params: dict) -> list: + return _generate_inputs(params, return_all_sizes=True) + + +if __name__ == "__main__": + run_golden_test(ALL_CASES, DEFAULT_CASE, 
generate_inputs) diff --git a/examples/a5/host_build_graph/paged_attention/kernels/aic/aic_pv_matmul.cpp b/examples/a5/host_build_graph/paged_attention/kernels/aic/aic_pv_matmul.cpp new file mode 100644 index 00000000..a59b1243 --- /dev/null +++ b/examples/a5/host_build_graph/paged_attention/kernels/aic/aic_pv_matmul.cpp @@ -0,0 +1,90 @@ +// PV Matmul Kernel: pij(M, K) @ vj(K, N) -> oi_new(M, N) +// +// Fixed tile size: (16, 16) @ (16, 16) -> (16, 16) +// +// pij is float16 (converted from fp32 in softmax_prepare via TCVT). +// vj is stored as (K, N) = (block_size, head_dim) in row-major (ND) layout. +// Standard non-transposed B pattern: ND GlobalB + ColMajor/RowMajor TileMatB. + +#include +#include + +using namespace pto; + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#define __aicore__ [aicore] +#endif + +static __aicore__ void pv_matmul_impl(__gm__ uint8_t* pij_raw, __gm__ uint8_t* vj_raw, __gm__ uint8_t* oi_raw) +{ + constexpr int M = 16, K = 16, N = 16; + + __gm__ half* pij = reinterpret_cast<__gm__ half*>(pij_raw); + __gm__ half* vj = reinterpret_cast<__gm__ half*>(vj_raw); + __gm__ float* oi = reinterpret_cast<__gm__ float*>(oi_raw); + + // pij (M, K) fp16, vj (K, N) fp16 in ND (row-major), oi_new (M, N) fp32 + using GlobalA = GlobalTensor, pto::Stride>; + using GlobalB = GlobalTensor, pto::Stride>; + using GlobalOut = GlobalTensor, pto::Stride>; + + GlobalA pijGlobal(pij); + GlobalB vjGlobal(vj); + GlobalOut oiGlobal(oi); + + // L1 Mat tiles: standard ND pattern for both A and B + using TileMatA = Tile; + using TileMatB = Tile; + + // L0 tiles + using LeftTile = TileLeft; + using RightTile = TileRight; + using AccTile = TileAcc; + + TileMatA aMatTile; + TileMatB bMatTile; + TASSIGN(aMatTile, 0x0); + TASSIGN(bMatTile, 0x20000); + + LeftTile aTile; + RightTile bTile; + AccTile cTile; + TASSIGN(aTile, 0x0); + TASSIGN(bTile, 0x0); + TASSIGN(cTile, 0x0); + + // Load pij and vj to L1 + TLOAD(aMatTile, pijGlobal); + TLOAD(bMatTile, vjGlobal); + 
+ set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); + + // Move to L0A/L0B + TMOV(aTile, aMatTile); + TMOV(bTile, bMatTile); + + set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); + wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); + + // Single matmul: (M,K) x (K,N) -> (M,N) + TMATMUL(cTile, aTile, bTile); + + set_flag(PIPE_M, PIPE_FIX, EVENT_ID0); + wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0); + + TSTORE(oiGlobal, cTile); +} + +extern "C" __aicore__ void kernel_entry(__gm__ int64_t* args) +{ + __gm__ uint8_t* pij = reinterpret_cast<__gm__ uint8_t*>(args[0]); + __gm__ uint8_t* vj = reinterpret_cast<__gm__ uint8_t*>(args[1]); + __gm__ uint8_t* oi_new = reinterpret_cast<__gm__ uint8_t*>(args[2]); + + pv_matmul_impl(pij, vj, oi_new); +} diff --git a/examples/a5/host_build_graph/paged_attention/kernels/aic/aic_qk_matmul.cpp b/examples/a5/host_build_graph/paged_attention/kernels/aic/aic_qk_matmul.cpp new file mode 100644 index 00000000..a173def0 --- /dev/null +++ b/examples/a5/host_build_graph/paged_attention/kernels/aic/aic_qk_matmul.cpp @@ -0,0 +1,91 @@ +// QK Matmul Kernel: qi(M, K) @ kj.T(K, N) -> sij(M, N) +// +// Fixed tile size: (16, 16) @ (16, 16).T -> (16, 16) +// +// kj is stored as (N, K) = (block_size, head_dim) in row-major memory. +// This is equivalent to (K, N) in column-major (DN) layout. +// Using DN GlobalB + RowMajor/ColMajor TileMatB to handle the transposed B pattern. 
+ +#include +#include + +using namespace pto; + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#define __aicore__ [aicore] +#endif + +static __aicore__ void qk_matmul_impl(__gm__ uint8_t* qi_raw, __gm__ uint8_t* kj_raw, __gm__ uint8_t* sij_raw) +{ + constexpr int M = 16, K = 16, N = 16; + + __gm__ half* qi = reinterpret_cast<__gm__ half*>(qi_raw); + __gm__ half* kj = reinterpret_cast<__gm__ half*>(kj_raw); + __gm__ float* sij = reinterpret_cast<__gm__ float*>(sij_raw); + + // qi (M, K) fp16 in ND (row-major) layout + using GlobalA = GlobalTensor, pto::Stride>; + // kj stored as (N, K) row-major = (K, N) column-major -> DN layout + using GlobalB = GlobalTensor, pto::Stride, Layout::DN>; + using GlobalOut = GlobalTensor, pto::Stride>; + + GlobalA qiGlobal(qi); + GlobalB kjGlobal(kj); + GlobalOut sijGlobal(sij); + + // L1 Mat tiles: A is standard ND, B uses transposed-B pattern (RowMajor/ColMajor) + using TileMatA = Tile; + using TileMatB = Tile; + + // L0 tiles + using LeftTile = TileLeft; + using RightTile = TileRight; + using AccTile = TileAcc; + + TileMatA aMatTile; + TileMatB bMatTile; + TASSIGN(aMatTile, 0x0); + TASSIGN(bMatTile, 0x20000); + + LeftTile aTile; + RightTile bTile; + AccTile cTile; + TASSIGN(aTile, 0x0); + TASSIGN(bTile, 0x0); + TASSIGN(cTile, 0x0); + + // Load qi and kj to L1 + TLOAD(aMatTile, qiGlobal); + TLOAD(bMatTile, kjGlobal); + + set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); + + // Move to L0A/L0B + TMOV(aTile, aMatTile); + TMOV(bTile, bMatTile); + + set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); + wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); + + // Single matmul: (M,K) x (K,N) -> (M,N) + TMATMUL(cTile, aTile, bTile); + + set_flag(PIPE_M, PIPE_FIX, EVENT_ID0); + wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0); + + TSTORE(sijGlobal, cTile); +} + +extern "C" __aicore__ void kernel_entry(__gm__ int64_t* args) +{ + __gm__ uint8_t* qi = reinterpret_cast<__gm__ uint8_t*>(args[0]); + __gm__ uint8_t* kj = 
reinterpret_cast<__gm__ uint8_t*>(args[1]); + __gm__ uint8_t* sij = reinterpret_cast<__gm__ uint8_t*>(args[2]); + + qk_matmul_impl(qi, kj, sij); +} diff --git a/examples/a5/host_build_graph/paged_attention/kernels/aiv/aiv_online_update.cpp b/examples/a5/host_build_graph/paged_attention/kernels/aiv/aiv_online_update.cpp new file mode 100644 index 00000000..6f2ecd65 --- /dev/null +++ b/examples/a5/host_build_graph/paged_attention/kernels/aiv/aiv_online_update.cpp @@ -0,0 +1,220 @@ +// Online Softmax Update + Normalize Kernel (AIV) +// +// Fixed tile size: oi/oi_new are (16, 16), mij/lij/mi/li are 16-element vectors +// +// Scalar layout strategy: +// M scalar floats stored contiguously in GM can be loaded as either: +// - ND (kScalarRows, kScalarCols) RowMajor for element-wise ops (TMAX, TSUB, TEXP, TMUL, TADD) +// - DN (kAlignedRows, 1) ColMajor for row-broadcast ops (TROWEXPANDMUL, TROWEXPANDDIV) +// Conversion between layouts uses GM round-trip: ND TSTORE -> DN TLOAD. + +#include +#include + +using namespace pto; + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#define __aicore__ [aicore] +#endif + +static __aicore__ void online_update_impl(__gm__ uint8_t* mij_raw, __gm__ uint8_t* lij_raw, + __gm__ uint8_t* oi_new_raw, __gm__ uint8_t* mi_raw, + __gm__ uint8_t* li_raw, __gm__ uint8_t* oi_raw, + int is_first, int is_last, __gm__ uint8_t* dst_raw) +{ + constexpr int M = 16, N = 16; + + __gm__ float* mij_ptr = reinterpret_cast<__gm__ float*>(mij_raw); + __gm__ float* lij_ptr = reinterpret_cast<__gm__ float*>(lij_raw); + __gm__ float* oi_new_ptr = reinterpret_cast<__gm__ float*>(oi_new_raw); + __gm__ float* mi_ptr = reinterpret_cast<__gm__ float*>(mi_raw); + __gm__ float* li_ptr = reinterpret_cast<__gm__ float*>(li_raw); + __gm__ float* oi_ptr = reinterpret_cast<__gm__ float*>(oi_raw); + __gm__ float* dst_ptr = reinterpret_cast<__gm__ float*>(dst_raw); + + // Scalar tile dimensions for RowMajor layout: + // kScalarCols = 32 bytes / 4 bytes per float = 8 
floats per row (one 32-byte block) + // kScalarRows = M / 8 (M=16 -> 2 rows) + constexpr int kScalarCols = 32 / sizeof(float); + constexpr int kScalarRows = M / kScalarCols; + // Aligned rows for ColMajor DN tiles (32-byte alignment) + constexpr int kAlignedRows = ((M * sizeof(float) + 31) / 32) * (32 / sizeof(float)); + + // --- GlobalTensor types --- + + // Data (M, N) RowMajor + using GlobalDataMxN = GlobalTensor, pto::Stride<1, 1, 1, N, 1>>; + + // Scalar ND: M contiguous floats as (kScalarRows, kScalarCols) RowMajor + using GlobalScalarND = GlobalTensor, + pto::Stride<1, 1, 1, kScalarCols, 1>>; + + // Scalar DN: same M contiguous floats as (kAlignedRows, 1) ColMajor + using GlobalScalarDN = GlobalTensor, + pto::Stride<1, 1, 1, 1, 1>, Layout::DN>; + + // --- GlobalTensor instances --- + + GlobalDataMxN oiNewGlobal(oi_new_ptr); + GlobalDataMxN oiGlobal(oi_ptr); + GlobalDataMxN dstGlobal(dst_ptr); + + // ND globals for scalar element-wise operations + GlobalScalarND mijGlobalND(mij_ptr); + GlobalScalarND lijGlobalND(lij_ptr); + GlobalScalarND miGlobalND(mi_ptr); + GlobalScalarND liGlobalND(li_ptr); + + // DN globals aliased to same GM for ColMajor reload (used after ND TSTORE) + GlobalScalarDN mijGlobalDN(mij_ptr); + GlobalScalarDN lijGlobalDN(lij_ptr); + GlobalScalarDN liGlobalDN(li_ptr); + + // --- Tile types --- + + using TileDataMxN = Tile; + using TileScalarND = Tile; + using TileScalarDN = Tile; + + // --- UB memory layout --- + + constexpr int kDataBytes = M * N * sizeof(float); + constexpr int kScalarNDBytes = kScalarRows * kScalarCols * sizeof(float); + constexpr int kScalarDNBytes = kAlignedRows * sizeof(float); + + // Data tiles + TileDataMxN oiNewTile; + TileDataMxN oiTile; + + // Scalar ND tiles for element-wise arithmetic + TileScalarND mijND, lijND, miND, liND; + TileScalarND miNewND, alphaND, betaND, tmpND; + + // Scalar DN tiles for TROWEXPAND operations + TileScalarDN alphaDN, betaDN, liDN; + + TASSIGN(oiNewTile, 0); + TASSIGN(oiTile, 
kDataBytes); + TASSIGN(mijND, 2 * kDataBytes); + TASSIGN(lijND, 2 * kDataBytes + kScalarNDBytes); + TASSIGN(miND, 2 * kDataBytes + 2 * kScalarNDBytes); + TASSIGN(liND, 2 * kDataBytes + 3 * kScalarNDBytes); + TASSIGN(miNewND, 2 * kDataBytes + 4 * kScalarNDBytes); + TASSIGN(alphaND, 2 * kDataBytes + 5 * kScalarNDBytes); + TASSIGN(betaND, 2 * kDataBytes + 6 * kScalarNDBytes); + TASSIGN(tmpND, 2 * kDataBytes + 7 * kScalarNDBytes); + TASSIGN(alphaDN, 2 * kDataBytes + 8 * kScalarNDBytes); + TASSIGN(betaDN, 2 * kDataBytes + 8 * kScalarNDBytes + kScalarDNBytes); + TASSIGN(liDN, 2 * kDataBytes + 8 * kScalarNDBytes + 2 * kScalarDNBytes); + + if (is_first) { + // --- First block: copy inputs to accumulators --- + TLOAD(oiNewTile, oiNewGlobal); + TLOAD(mijND, mijGlobalND); + TLOAD(lijND, lijGlobalND); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + + // Passthrough to MTE3 (no V compute needed) + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(miGlobalND, mijND); // mi = mij + TSTORE(liGlobalND, lijND); // li = lij + TSTORE(oiGlobal, oiNewTile); // oi = oi_new + + if (is_last) { + // Single block: normalize dst = oi_new / lij + // lij stored to li buffer in ND format; reload as DN for TROWEXPANDDIV + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + TLOAD(liDN, liGlobalDN); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TROWEXPANDDIV(oiNewTile, oiNewTile, liDN); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); + TSTORE(dstGlobal, oiNewTile); + } + } else { + // --- Subsequent blocks: accumulate --- + + // Phase 1: Load all inputs + TLOAD(oiNewTile, oiNewGlobal); + TLOAD(oiTile, oiGlobal); + TLOAD(mijND, mijGlobalND); + TLOAD(lijND, lijGlobalND); + TLOAD(miND, miGlobalND); + TLOAD(liND, liGlobalND); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, 
EVENT_ID0); + + // Phase 2: Scalar arithmetic in RowMajor (kScalarRows, kScalarCols) + // to resolve RAW hazards on shared UB tiles. + TMAX(miNewND, miND, mijND); // mi_new = max(mi, mij) + TSUB(alphaND, miND, miNewND); // alpha = mi - mi_new + TEXP(alphaND, alphaND); // alpha = exp(mi - mi_new) + TSUB(betaND, mijND, miNewND); // beta = mij - mi_new + TEXP(betaND, betaND); // beta = exp(mij - mi_new) + TMUL(liND, alphaND, liND); // li = alpha * li + TMUL(tmpND, betaND, lijND); // tmp = beta * lij + TADD(liND, liND, tmpND); // li = alpha * li + beta * lij (= li_new) + + // Phase 3: Store scalar results to GM (ND format) + // mi_new -> mi accumulator, li_new -> li accumulator + // alpha -> mij buffer (reuse), beta -> lij buffer (reuse) + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(miGlobalND, miNewND); // persist mi_new + TSTORE(liGlobalND, liND); // persist li_new + TSTORE(mijGlobalND, alphaND); // temp: alpha to mij buffer + TSTORE(lijGlobalND, betaND); // temp: beta to lij buffer + + // Phase 4: Reload alpha, beta (and li if last) as ColMajor DN + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + TLOAD(alphaDN, mijGlobalDN); // alpha from mij buffer as DN + TLOAD(betaDN, lijGlobalDN); // beta from lij buffer as DN + if (is_last) { + TLOAD(liDN, liGlobalDN); // li_new from li buffer as DN + } + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + + // Phase 5: Scale data tiles using row-broadcast multiply + TROWEXPANDMUL(oiTile, oiTile, alphaDN); // oi *= alpha + TROWEXPANDMUL(oiNewTile, oiNewTile, betaDN); // oi_new *= beta + TADD(oiTile, oiTile, oiNewTile); // oi = alpha*oi + beta*oi_new + + if (is_last) { + // Phase 6: Normalize and output + TROWEXPANDDIV(oiTile, oiTile, liDN); // dst = oi / li_new + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); + TSTORE(dstGlobal, oiTile); + } else { + // Phase 6: Store 
updated accumulators + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); + TSTORE(oiGlobal, oiTile); + } + } +} + +extern "C" __aicore__ void kernel_entry(__gm__ int64_t* args) { + __gm__ uint8_t* mij = reinterpret_cast<__gm__ uint8_t*>(args[0]); + __gm__ uint8_t* lij = reinterpret_cast<__gm__ uint8_t*>(args[1]); + __gm__ uint8_t* oi_new = reinterpret_cast<__gm__ uint8_t*>(args[2]); + __gm__ uint8_t* mi = reinterpret_cast<__gm__ uint8_t*>(args[3]); + __gm__ uint8_t* li = reinterpret_cast<__gm__ uint8_t*>(args[4]); + __gm__ uint8_t* oi = reinterpret_cast<__gm__ uint8_t*>(args[5]); + int is_first = static_cast(args[6]); + int is_last = static_cast(args[7]); + __gm__ uint8_t* dst = reinterpret_cast<__gm__ uint8_t*>(args[8]); + + online_update_impl(mij, lij, oi_new, mi, li, oi, is_first, is_last, dst); +} diff --git a/examples/a5/host_build_graph/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp b/examples/a5/host_build_graph/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp new file mode 100644 index 00000000..7b168049 --- /dev/null +++ b/examples/a5/host_build_graph/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp @@ -0,0 +1,94 @@ +// Softmax Preparation Kernel (AIV) +// +// Fixed tile size: sij is (16, 16) +// +// Computes: +// sij_scale = sij * scale +// mij = row_max(sij_scale) -> (M, 1) +// pij = exp(sij_scale - mij) -> (M, N) +// lij = row_sum(pij) -> (M, 1) + +#include +#include + +using namespace pto; + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#define __aicore__ [aicore] +#endif + +static __aicore__ void softmax_prepare_impl(__gm__ uint8_t* sij_raw, float scale_value, + __gm__ uint8_t* pij_raw, __gm__ uint8_t* mij_raw, + __gm__ uint8_t* lij_raw) +{ + constexpr int M = 16, N = 16; + + __gm__ float* sij = reinterpret_cast<__gm__ float*>(sij_raw); + __gm__ half* pij = reinterpret_cast<__gm__ half*>(pij_raw); + __gm__ float* mij = reinterpret_cast<__gm__ float*>(mij_raw); + __gm__ float* lij = 
reinterpret_cast<__gm__ float*>(lij_raw); + + constexpr int kAlignedRows = ((M * sizeof(float) + 31) / 32) * (32 / sizeof(float)); + + using GlobalDataMxN = GlobalTensor, pto::Stride<1, 1, 1, N, 1>>; + using GlobalDataMxN_f16 = GlobalTensor, pto::Stride<1, 1, 1, N, 1>>; + using GlobalScalarDN = GlobalTensor, pto::Stride<1, 1, 1, 1, 1>, Layout::DN>; + + GlobalDataMxN sijGlobal(sij); + GlobalDataMxN_f16 pijGlobal(pij); + GlobalScalarDN mijGlobal(mij); + GlobalScalarDN lijGlobal(lij); + + using TileVecMxN = Tile; + using TileVecMxN_f16 = Tile; + using TileScalarDN = Tile; + + TileVecMxN sijTile; + TileVecMxN pijTile; + TileVecMxN tmpTile; + TileScalarDN maxTile; + TileScalarDN sumTile; + TileVecMxN_f16 pijF16Tile; + + TASSIGN(sijTile, 0x0); + TASSIGN(pijTile, M * N * sizeof(float)); + TASSIGN(tmpTile, 2 * M * N * sizeof(float)); + TASSIGN(maxTile, 3 * M * N * sizeof(float)); + TASSIGN(sumTile, 3 * M * N * sizeof(float) + kAlignedRows * sizeof(float)); + TASSIGN(pijF16Tile, 3 * M * N * sizeof(float) + 2 * kAlignedRows * sizeof(float)); + + TLOAD(sijTile, sijGlobal); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + + TMULS(sijTile, sijTile, scale_value); + TROWMAX(maxTile, sijTile, tmpTile); + TROWEXPANDSUB(pijTile, sijTile, maxTile); + TEXP(pijTile, pijTile); + // Truncate pij to fp16 first, then compute lij from truncated values (matches golden) + TCVT(pijF16Tile, pijTile, RoundMode::CAST_ROUND); + TCVT(pijTile, pijF16Tile, RoundMode::CAST_ROUND); + TROWSUM(sumTile, pijTile, tmpTile); + + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(mijGlobal, maxTile); + TSTORE(lijGlobal, sumTile); + TSTORE(pijGlobal, pijF16Tile); +} + +extern "C" __aicore__ void kernel_entry(__gm__ int64_t* args) { + __gm__ uint8_t* sij = reinterpret_cast<__gm__ uint8_t*>(args[0]); + union { uint64_t u; float f; } scale_conv; + scale_conv.u = static_cast(args[1]); + float scale_value = scale_conv.f; + __gm__ 
uint8_t* pij = reinterpret_cast<__gm__ uint8_t*>(args[2]); + __gm__ uint8_t* mij = reinterpret_cast<__gm__ uint8_t*>(args[3]); + __gm__ uint8_t* lij = reinterpret_cast<__gm__ uint8_t*>(args[4]); + + softmax_prepare_impl(sij, scale_value, pij, mij, lij); +} diff --git a/examples/a5/host_build_graph/paged_attention/kernels/kernel_config.py b/examples/a5/host_build_graph/paged_attention/kernels/kernel_config.py new file mode 100644 index 00000000..d826b9fc --- /dev/null +++ b/examples/a5/host_build_graph/paged_attention/kernels/kernel_config.py @@ -0,0 +1,42 @@ +""" +Paged Attention Kernel and Orchestration Configuration + +Defines the kernels and orchestration function for paged attention +with AIC/AIV subgraph splitting: + +AIC Kernels (Matrix Multiplication): + - aic_qk_matmul: Q @ K^T computation + - aic_pv_matmul: P @ V computation + +AIV Kernels (Vector Operations): + - aiv_softmax_prepare: scale, rowmax, exp, rowsum + - aiv_online_update: online softmax accumulation + fused normalization +""" + +from pathlib import Path + +_KERNELS_ROOT = Path(__file__).parent + +# Orchestration config +ORCHESTRATION = { + "source": str(_KERNELS_ROOT / "orchestration" / "paged_attention_orch.cpp"), + "function_name": "build_paged_attention_graph", +} + +# Kernel configs +KERNELS = [ + # AIC kernels (matrix multiplication using Cube unit) + {"func_id": 0, "name": "QK", "source": str(_KERNELS_ROOT / "aic" / "aic_qk_matmul.cpp"), "core_type": "aic"}, + {"func_id": 2, "name": "PV", "source": str(_KERNELS_ROOT / "aic" / "aic_pv_matmul.cpp"), "core_type": "aic"}, + # AIV kernels (vector operations) + {"func_id": 1, "name": "SF", "source": str(_KERNELS_ROOT / "aiv" / "aiv_softmax_prepare.cpp"), "core_type": "aiv"}, + {"func_id": 3, "name": "UP", "source": str(_KERNELS_ROOT / "aiv" / "aiv_online_update.cpp"), "core_type": "aiv"}, +] + +# Runtime configuration +RUNTIME_CONFIG = { + "runtime": "host_build_graph", + "aicpu_thread_num": 3, + "orch_thread_num": 0, + "block_dim": 3, +} diff 
--git a/examples/a5/host_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp b/examples/a5/host_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp new file mode 100644 index 00000000..5b29b587 --- /dev/null +++ b/examples/a5/host_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp @@ -0,0 +1,256 @@ +/** + * Paged Attention Orchestration - Small Scale (16x16) + * + * Supports small-scale paged attention with: + * Query: (batch, q_head_num, head_dim) fp16 + * Key: (total_blocks, block_size, kv_head_num, head_dim) fp16 (NOT transposed) + * Value: (total_blocks, block_size, kv_head_num, head_dim) fp16 + * Output: (batch * q_head_num, head_dim) float32 + * + * Head tiling: q_tile_size = min(num_heads, 128) + * GQA: kv_head_num can differ from q_head_num + */ + +#include "runtime.h" +#include +#include +#include + +#define FUNC_QK_MATMUL 0 +#define FUNC_SOFTMAX_PREPARE 1 +#define FUNC_PV_MATMUL 2 +#define FUNC_ONLINE_UPDATE 3 + +extern "C" { + +int build_paged_attention_graph(Runtime* runtime, uint64_t* args, int arg_count) { + if (arg_count < 14) { + std::cerr << "Expected at least 14 args, got " << arg_count << '\n'; + return -1; + } + + void* host_query = reinterpret_cast(args[0]); + void* host_key_cache = reinterpret_cast(args[1]); + void* host_value_cache = reinterpret_cast(args[2]); + int* host_block_table = reinterpret_cast(args[3]); + int* host_context_lens = reinterpret_cast(args[4]); + void* host_out = reinterpret_cast(args[5]); + int64_t* host_config = reinterpret_cast(args[6]); + + size_t query_size = static_cast(args[7]); + size_t key_cache_size = static_cast(args[8]); + size_t value_cache_size = static_cast(args[9]); + size_t block_table_size = static_cast(args[10]); + size_t context_lens_size = static_cast(args[11]); + size_t out_size = static_cast(args[12]); + size_t config_size = static_cast(args[13]); + + int batch = static_cast(host_config[0]); + int num_heads = 
static_cast(host_config[1]); + int kv_head_num = static_cast(host_config[2]); + int head_dim = static_cast(host_config[3]); + int block_size = static_cast(host_config[4]); + int max_num_blocks = static_cast(host_config[5]); + uint64_t scale_value_bits = static_cast(host_config[6]); + + int q_tile_size = std::min(num_heads, 128); + int num_head_tiles = (num_heads + q_tile_size - 1) / q_tile_size; + + std::cout << "\n=== build_paged_attention_graph ===" << '\n'; + std::cout << "batch=" << batch << ", num_heads=" << num_heads + << ", kv_head_num=" << kv_head_num << ", head_dim=" << head_dim << '\n'; + std::cout << "block_size=" << block_size << ", max_num_blocks=" << max_num_blocks << '\n'; + std::cout << "q_tile_size=" << q_tile_size << ", num_head_tiles=" << num_head_tiles << '\n'; + + // Allocate device memory for inputs/outputs + void* dev_query = runtime->host_api.device_malloc(query_size); + void* dev_key_cache = runtime->host_api.device_malloc(key_cache_size); + void* dev_value_cache = runtime->host_api.device_malloc(value_cache_size); + void* dev_out = runtime->host_api.device_malloc(out_size); + + if (!dev_query || !dev_key_cache || !dev_value_cache || !dev_out) { + std::cerr << "Error: Failed to allocate device memory\n"; + return -1; + } + + runtime->host_api.copy_to_device(dev_query, host_query, query_size); + runtime->host_api.copy_to_device(dev_key_cache, host_key_cache, key_cache_size); + runtime->host_api.copy_to_device(dev_value_cache, host_value_cache, value_cache_size); + runtime->record_tensor_pair(host_out, dev_out, out_size); + + // Buffer sizes depend on q_tile_size and block_size + size_t sij_size = static_cast(q_tile_size) * block_size * sizeof(float); + size_t pij_size = static_cast(q_tile_size) * block_size * sizeof(uint16_t); + size_t mij_size = static_cast(q_tile_size) * sizeof(float); + size_t lij_size = mij_size; + size_t oi_new_size = static_cast(q_tile_size) * head_dim * sizeof(float); + + // Per-batch-per-block intermediate buffers + 
int total_buffers = batch * max_num_blocks; + void** dev_sij_arr = new void*[total_buffers]; + void** dev_pij_arr = new void*[total_buffers]; + void** dev_mij_arr = new void*[total_buffers]; + void** dev_lij_arr = new void*[total_buffers]; + void** dev_oi_new_arr = new void*[total_buffers]; + + for (int i = 0; i < total_buffers; i++) { + dev_sij_arr[i] = runtime->host_api.device_malloc(sij_size); + dev_pij_arr[i] = runtime->host_api.device_malloc(pij_size); + dev_mij_arr[i] = runtime->host_api.device_malloc(mij_size); + dev_lij_arr[i] = runtime->host_api.device_malloc(lij_size); + dev_oi_new_arr[i] = runtime->host_api.device_malloc(oi_new_size); + } + + // Per-(batch, head_tile) accumulators + int total_accums = batch * num_head_tiles; + size_t mi_size = static_cast(q_tile_size) * sizeof(float); + size_t li_size = mi_size; + size_t oi_size = static_cast(q_tile_size) * head_dim * sizeof(float); + + void** dev_mi_arr = new void*[total_accums]; + void** dev_li_arr = new void*[total_accums]; + void** dev_oi_arr = new void*[total_accums]; + + for (int i = 0; i < total_accums; i++) { + dev_mi_arr[i] = runtime->host_api.device_malloc(mi_size); + dev_li_arr[i] = runtime->host_api.device_malloc(li_size); + dev_oi_arr[i] = runtime->host_api.device_malloc(oi_size); + } + + std::cout << "Allocated " << total_buffers << " per-block buffers\n"; + std::cout << "Allocated " << total_accums << " per-(batch,head_tile) accumulators\n"; + + int total_tasks = 0; + + for (int b_idx = 0; b_idx < batch; b_idx++) { + int cur_seq = host_context_lens[b_idx]; + int bn_this_batch = (cur_seq + block_size - 1) / block_size; + + for (int ht = 0; ht < num_head_tiles; ht++) { + int cur_offset = ht * q_tile_size; + + // Query: (batch, q_head_num, head_dim) fp16 + // qi points to heads [cur_offset .. 
cur_offset+q_tile_size) for batch b_idx + uint8_t* qi_ptr = reinterpret_cast(dev_query) + + static_cast(b_idx * num_heads + cur_offset) * head_dim * sizeof(uint16_t); + + // Output: (batch * q_head_num, head_dim) float32 + uint8_t* out_ptr = reinterpret_cast(dev_out) + + static_cast(b_idx * num_heads + cur_offset) * head_dim * sizeof(float); + + // GQA: which kv_head this head tile maps to + int kv_head_idx = cur_offset / (num_heads / kv_head_num); + + // Per-(batch, head_tile) accumulators + int accum_idx = b_idx * num_head_tiles + ht; + void* dev_mi = dev_mi_arr[accum_idx]; + void* dev_li = dev_li_arr[accum_idx]; + void* dev_oi = dev_oi_arr[accum_idx]; + + int t_up_prev = -1; + + for (int bn = 0; bn < bn_this_batch; bn++) { + int cur_block_idx = host_block_table[b_idx * max_num_blocks + bn]; + + // Key: (total_blocks, block_size, kv_head_num, head_dim) fp16 + uint8_t* kj_ptr = reinterpret_cast(dev_key_cache) + + (static_cast(cur_block_idx) * block_size * kv_head_num + kv_head_idx) + * head_dim * sizeof(uint16_t); + + // Value: (total_blocks, block_size, kv_head_num, head_dim) fp16 + uint8_t* vj_ptr = reinterpret_cast(dev_value_cache) + + (static_cast(cur_block_idx) * block_size * kv_head_num + kv_head_idx) + * head_dim * sizeof(uint16_t); + + int buf_idx = b_idx * max_num_blocks + bn; + void* dev_sij = dev_sij_arr[buf_idx]; + void* dev_pij = dev_pij_arr[buf_idx]; + void* dev_mij = dev_mij_arr[buf_idx]; + void* dev_lij = dev_lij_arr[buf_idx]; + void* dev_oi_new = dev_oi_new_arr[buf_idx]; + + // QK: qi(M, K) @ kj.T(K, N) -> sij(M, N) + uint64_t qk_args[6] = { + reinterpret_cast(qi_ptr), + reinterpret_cast(kj_ptr), + reinterpret_cast(dev_sij), + static_cast(q_tile_size), + static_cast(head_dim), + static_cast(block_size) + }; + int t_qk = runtime->add_task(qk_args, 6, FUNC_QK_MATMUL, CoreType::AIC); + total_tasks++; + + // SF: scale, rowmax, exp, rowsum -> pij, mij, lij + uint64_t sf_args[7] = { + reinterpret_cast(dev_sij), + scale_value_bits, + 
reinterpret_cast(dev_pij), + reinterpret_cast(dev_mij), + reinterpret_cast(dev_lij), + static_cast(q_tile_size), + static_cast(block_size) + }; + int t_sf = runtime->add_task(sf_args, 7, FUNC_SOFTMAX_PREPARE, CoreType::AIV); + total_tasks++; + + // PV: pij(M, K') @ vj(K', N') -> oi_new(M, N') + uint64_t pv_args[6] = { + reinterpret_cast(dev_pij), + reinterpret_cast(vj_ptr), + reinterpret_cast(dev_oi_new), + static_cast(q_tile_size), + static_cast(block_size), + static_cast(head_dim) + }; + int t_pv = runtime->add_task(pv_args, 6, FUNC_PV_MATMUL, CoreType::AIC); + total_tasks++; + + runtime->add_successor(t_qk, t_sf); + runtime->add_successor(t_sf, t_pv); + + // Online Update: serialized across blocks (each depends on previous) + int is_first = (bn == 0) ? 1 : 0; + int is_last = (bn == bn_this_batch - 1) ? 1 : 0; + + uint64_t up_args[11] = { + reinterpret_cast(dev_mij), + reinterpret_cast(dev_lij), + reinterpret_cast(dev_oi_new), + reinterpret_cast(dev_mi), + reinterpret_cast(dev_li), + reinterpret_cast(dev_oi), + static_cast(is_first), + static_cast(is_last), + reinterpret_cast(out_ptr), + static_cast(q_tile_size), + static_cast(head_dim) + }; + int t_up = runtime->add_task(up_args, 11, FUNC_ONLINE_UPDATE, CoreType::AIV); + total_tasks++; + + runtime->add_successor(t_pv, t_up); + if (t_up_prev >= 0) { + runtime->add_successor(t_up_prev, t_up); + } + t_up_prev = t_up; + } + } + } + + delete[] dev_sij_arr; + delete[] dev_pij_arr; + delete[] dev_mij_arr; + delete[] dev_lij_arr; + delete[] dev_oi_new_arr; + delete[] dev_mi_arr; + delete[] dev_li_arr; + delete[] dev_oi_arr; + + std::cout << "Created " << total_tasks << " tasks\n"; + runtime->print_runtime(); + + return 0; +} + +} diff --git a/examples/a5/tensormap_and_ringbuffer/paged_attention/TFILLPAD_INPLACE_BUG.md b/examples/a5/tensormap_and_ringbuffer/paged_attention/TFILLPAD_INPLACE_BUG.md new file mode 100644 index 00000000..5d83385a --- /dev/null +++ 
b/examples/a5/tensormap_and_ringbuffer/paged_attention/TFILLPAD_INPLACE_BUG.md @@ -0,0 +1,205 @@ +# TFILLPAD_INPLACE Bug at Small Tile Width (N ≤ 16) + +## Summary + +`TFILLPAD_INPLACE` produces incorrect padding results on Ascend A2/A3 hardware when +the tile column count `N` is small (e.g. N=16 for float32). The bug manifests as +corrupted data in the padded region for certain `valid_len` values, causing downstream +softmax and attention computations to produce wrong results. + +## Affected Configuration + +- **Platform**: Ascend A2/A3 (tested on hardware, also reproduces on simulator) +- **Data type**: float32 (sizeof=4) +- **Tile shape**: (M, N) = (16, 16) — i.e. 2 × 32-byte blocks per row +- **PTO source**: `include/pto/npu/a2a3/TFillPad.hpp` + +The bug does NOT reproduce at larger N values (N=32, 64, 128) where the same +`valid_len` values work correctly. + +## Reproduction + +In the paged attention example (`examples/tensormap_and_ringbuffer/paged_attention/`), +the softmax preparation kernel uses `TFILLPAD_INPLACE` to mask invalid key positions +with `-inf` before computing softmax: + +```cpp +// Tile types +using TileSijDyn = Tile; +using TileSijPad = Tile; + +TileSijDyn sijDynTile(valid_len); // valid_len = number of valid columns +TileSijPad sijPadTile; +// Both assigned to same UB address (in-place) +TASSIGN(sijDynTile, 0x0); +TASSIGN(sijPadTile, 0x0); + +// After loading sij from GM: +TFILLPAD_INPLACE(sijPadTile, sijDynTile); +// Expected: columns [valid_len, 16) filled with -inf (0xff800000) +// Actual: corrupted for certain valid_len values +``` + +### Test Matrix (N=16, float32, on hardware) + +| valid_len | context_len | blocks | TFILLPAD_INPLACE only | SetValue only | TFILLPAD + SetValue | +|-----------|-------------|--------|-----------------------|---------------|---------------------| +| 1 | 17 | 2 | FAIL (27/256) | PASS | PASS | +| 7 | 23 | 2 | FAIL (29/256) | PASS | PASS | +| 8 | 24 | 2 | FAIL (28/256) | FAIL (182/256)| PASS | +| 9 | 25 | 2 | 
PASS | PASS | PASS | +| 12 | 28 | 2 | PASS | PASS | PASS | +| 15 | 31 | 2 | PASS | PASS | PASS | +| 16 (full) | 32 | 2 | PASS | PASS | PASS | +| 1 | 33 | 3 | FAIL (25/256) | FAIL (88/256) | PASS | + +### Cross-dimension validation (confirming N=16 is the trigger) + +| num_heads | head_dim | block_size (=N) | context_len | valid_len | Result | +|-----------|----------|-----------------|-------------|-----------|--------| +| 16 | 16 | **16** | 33 | 1 | FAIL | +| 16 | 16 | **32** | 33 | 1 | PASS | +| 16 | **32** | **16** | 33 | 1 | FAIL | + +block_size determines N in the softmax tile (M, N). When block_size=32 (N=32), +the same valid_len=1 passes. When block_size=16 (N=16), it fails regardless of +head_dim. + +## Root Cause Analysis + +The bug is in the `TFillPad` function in `include/pto/npu/a2a3/TFillPad.hpp`. +The function has two internal code paths for filling padding: + +### Path A: `Handle32BAlignedPad_Other` (lines 103-134) + +Fills the **partial 32-byte block** at the boundary using `vector_dup` with a +norm-mode bitmask. This path is reliable. + +### Path B: `PadRightSingleRow` + `PadRightRemainingRows` (lines 136-167) + +Fills **complete 32-byte blocks** to the right of the boundary. Uses `vector_dup` +for row 0, then `vcopy` with `srcRepeatStride=0` (broadcast) to replicate to +remaining rows. 
**This path has the bug.** + +### Which path runs depends on `valid_len` + +The key variable is `srcValidCol32B` — the valid_len rounded up to the next +32-byte-aligned element count: + +``` +elements_per_block = 32 / sizeof(float) = 8 +srcValidCol32B = ceil(valid_len / 8) * 8 +padOffset = srcValidCol32B +padCols = N - srcValidCol32B // columns for Path B +pad_32B = srcValidCol32B - valid_len // columns for Path A +``` + +For N=16 (2 blocks of 8 elements each): + +``` +valid_len ∈ [1, 8]: + srcValidCol32B = 8 + padOffset = 8, padCols = 8 → Path B runs (fills block 1) + pad_32B = 8 - valid_len → Path A runs if valid_len < 8 + +valid_len ∈ [9, 15]: + srcValidCol32B = 16 + padOffset = 16, padCols = 0 → Path B is a NO-OP + pad_32B = 16 - valid_len → Path A runs (fills within block 1) + +valid_len = 16: + No padding needed (full block) +``` + +**Pattern: valid_len ≤ 8 → Path B runs → BUG. valid_len ≥ 9 → only Path A → OK.** + +### Path B code trace (the buggy path) + +```cpp +// PadRightSingleRow: fill row 0's right padding +set_mask_count(); +set_vector_mask(0, padCols); // padCols = 8 +vector_dup(dstPtr + padOffset, dupPadValue, 1, 1, 1, 8, 0); +// ^-- dstPtr + 8 (element 8 of row 0) +pipe_barrier(PIPE_V); + +// PadRightRemainingRows: broadcast row 0's pattern to rows 1..M-1 +dstRepeatStride = N * sizeof(float) / 32; // = 16 * 4 / 32 = 2 +_dstPtr = dstPtr + padOffset + copyDstCols; // = dstPtr + 8 + 16 = dstPtr + 24 +fillRow = M - 1; // = 15 + +vcopy(_dstPtr, dstPtr + padOffset, 15, 1, 0, 2, 0); +// dst src rep dB sB dR sR +// row1:8 row0:8 15 1 0 2 0 +// +// dstRepeatStride=2 (64 bytes = 1 row), srcRepeatStride=0 (broadcast) +// mask: counter mode, 8 elements (inherited from PadRightSingleRow) +``` + +The `vcopy` with `srcRepeatStride=0` and `dstRepeatStride=2` at N=16 appears to +produce incorrect results on hardware. The exact hardware failure mode is unclear, +but it consistently corrupts the padding data. 
+ +### Why valid_len=8 is special + +When `valid_len=8`: +- `pad_32B = 8 - 8 = 0` → Path A computes `mask = 0xff >> 8 << 8 = 0` +- `set_vector_mask(0, 0)` is called, then `vector_dup` with zero mask +- This is effectively a no-op, but may have undefined behavior on hardware +- Path B still runs and produces incorrect results +- Additionally, `SetValue`-only workaround also fails for valid_len=8, + suggesting the zero-mask `vector_dup` in Path A corrupts pipeline state + +## Workaround + +The working fix uses **both** `TFILLPAD_INPLACE` and scalar `SetValue` writes: + +```cpp +// Step 1: TFILLPAD_INPLACE sets up vector pipeline state correctly +// (mask modes, barriers, etc.) even though its data output is buggy +TFILLPAD_INPLACE(sijPadTile, sijDynTile); + +// Step 2: SetValue patches the actual data with correct -inf values +if (valid_len < static_cast(N)) { + constexpr float NEG_INF = -__builtin_huge_valf(); + for (int r = 0; r < M; r++) { + for (uint64_t c = valid_len; c < N; c++) { + sijTile.SetValue(static_cast(r * N + c), NEG_INF); + } + } +} +``` + +**Why both are needed:** + +| Approach | valid_len=1 | valid_len=7 | valid_len=8 | +|------------------------|-------------|-------------|-------------| +| TFILLPAD_INPLACE only | FAIL | FAIL | FAIL | +| SetValue only | PASS | PASS | FAIL | +| TFILLPAD + SetValue | PASS | PASS | PASS | + +- `TFILLPAD_INPLACE` alone: Path B produces wrong data +- `SetValue` alone: works for most cases, but valid_len=8 fails because + Path A's zero-mask `vector_dup` (which runs before SetValue in the + TFILLPAD-only case) apparently sets up necessary pipeline state that + subsequent vector operations depend on +- Both together: TFILLPAD handles pipeline state, SetValue fixes the data + +## Scope + +- **Affected**: Any `TFILLPAD_INPLACE` call with float32 tiles where + `N ≤ 16` and `valid_len ≤ N/2` (i.e. 
valid data fits within the first + 32-byte block of each row) +- **Not affected**: N ≥ 32 (tested with N=32, 64, 128 — all pass) +- **Not affected**: Full tiles (valid_len == N) +- **Likely affected**: float16/bfloat16 tiles with N ≤ 32 (untested, but + the same code path would be triggered since elements_per_block=16 for + 16-bit types, and the same vcopy broadcast pattern is used) + +## Files + +- Bug location: `include/pto/npu/a2a3/TFillPad.hpp`, functions + `PadRightSingleRow` (line 136) and `PadRightRemainingRows` (line 146) +- Workaround applied in: `examples/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp` +- Test configuration: `examples/tensormap_and_ringbuffer/paged_attention/golden.py` diff --git a/examples/a5/tensormap_and_ringbuffer/paged_attention/golden.py b/examples/a5/tensormap_and_ringbuffer/paged_attention/golden.py new file mode 100644 index 00000000..6eeb936e --- /dev/null +++ b/examples/a5/tensormap_and_ringbuffer/paged_attention/golden.py @@ -0,0 +1,67 @@ +"""Paged Attention Golden - tensormap_and_ringbuffer example (small scale, float16).""" + +from paged_attention_golden import ( + generate_inputs as _generate_inputs, + compute_golden, + run_golden_test, +) + +__outputs__ = ["out"] + +RTOL = 1e-2 +ATOL = 1e-2 + +ALL_CASES = { + "Case1": { + "batch": 1, + "num_heads": 16, + "kv_head_num": 1, + "head_dim": 16, + "block_size": 16, + "context_len": 33, + "max_model_len": 256, + "dtype": "float16", + }, + "Case2": { + "batch": 1, + "num_heads": 16, + "kv_head_num": 1, + "head_dim": 16, + "block_size": 16, + "context_len": 128, + "max_model_len": 256, + "dtype": "float16", + }, + "CaseVarSeq2": { + "batch": 2, + "num_heads": 16, + "kv_head_num": 1, + "head_dim": 16, + "block_size": 16, + "context_len": 33, + "context_lens_list": [33, 17], + "max_model_len": 256, + "dtype": "float16", + }, + "CaseVarSeq4": { + "batch": 4, + "num_heads": 16, + "kv_head_num": 1, + "head_dim": 16, + "block_size": 16, + "context_len": 128, 
+ "context_lens_list": [33, 64, 128, 15], + "max_model_len": 256, + "dtype": "float16", + }, +} + +DEFAULT_CASE = "Case1" + + +def generate_inputs(params: dict) -> list: + return _generate_inputs(params, return_all_sizes=False) + + +if __name__ == "__main__": + run_golden_test(ALL_CASES, DEFAULT_CASE, generate_inputs) diff --git a/tests/device_tests/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_hub.cpp b/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_hub.cpp similarity index 100% rename from tests/device_tests/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_hub.cpp rename to examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_hub.cpp diff --git a/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_pv_matmul.cpp b/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_pv_matmul.cpp new file mode 100644 index 00000000..0f9b0ae5 --- /dev/null +++ b/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_pv_matmul.cpp @@ -0,0 +1,89 @@ +// PV Matmul Kernel: pij(M, K) @ vj(K, N) -> oi_new(M, N) +// +// Fixed tile size: (16, 16) @ (16, 16) -> (16, 16) +// +// pij is float16 (converted from fp32 in softmax_prepare via TCVT). +// vj is stored as (K, N) = (block_size, head_dim) in row-major (ND) layout. +// Standard non-transposed B pattern: ND GlobalB + ColMajor/RowMajor TileMatB. 
+ +#include +#include + +#include "tensor.h" + +using namespace pto; + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#define __aicore__ [aicore] +#endif + +template +static __aicore__ void pv_matmul_impl(__gm__ Tensor* pij, __gm__ Tensor* vj, __gm__ Tensor* oi) { + __gm__ half* pij_addr = reinterpret_cast<__gm__ half*>(pij->buffer.addr); + __gm__ half* vj_addr = reinterpret_cast<__gm__ half*>(vj->buffer.addr); + __gm__ float* oi_addr = reinterpret_cast<__gm__ float*>(oi->buffer.addr); + + // pij (M, K) fp16, vj (K, N) fp16 in ND (row-major), oi_new (M, N) fp32 + using GlobalA = GlobalTensor, pto::Stride>; + using GlobalB = GlobalTensor, pto::Stride>; + using GlobalOut = GlobalTensor, pto::Stride>; + + GlobalA pijGlobal(pij_addr + pij->start_offset); + GlobalB vjGlobal(vj_addr + vj->start_offset); + GlobalOut oiGlobal(oi_addr + oi->start_offset); + + // L1 Mat tiles: standard ND pattern for both A and B + using TileMatA = Tile; + using TileMatB = Tile; + + // L0 tiles + using LeftTile = TileLeft; + using RightTile = TileRight; + using AccTile = TileAcc; + + TileMatA aMatTile; + TileMatB bMatTile; + TASSIGN(aMatTile, 0x0); + TASSIGN(bMatTile, 0x20000); + + LeftTile aTile; + RightTile bTile; + AccTile cTile; + TASSIGN(aTile, 0x0); + TASSIGN(bTile, 0x0); + TASSIGN(cTile, 0x0); + + // Load pij and vj to L1 + TLOAD(aMatTile, pijGlobal); + TLOAD(bMatTile, vjGlobal); + + set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); + + // Move to L0A/L0B + TMOV(aTile, aMatTile); + TMOV(bTile, bMatTile); + + set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); + wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); + + // Single matmul: (M,K) x (K,N) -> (M,N) + TMATMUL(cTile, aTile, bTile); + + set_flag(PIPE_M, PIPE_FIX, EVENT_ID0); + wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0); + + TSTORE(oiGlobal, cTile); +} + +extern "C" __aicore__ void kernel_entry(__gm__ int64_t* args) { + __gm__ Tensor* pij = reinterpret_cast<__gm__ Tensor*>(args[0]); + __gm__ Tensor* vj 
= reinterpret_cast<__gm__ Tensor*>(args[1]); + __gm__ Tensor* oi_new = reinterpret_cast<__gm__ Tensor*>(args[2]); + + pv_matmul_impl<16, 16, 16>(pij, vj, oi_new); +} diff --git a/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_qk_matmul.cpp b/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_qk_matmul.cpp new file mode 100644 index 00000000..3b9ef46f --- /dev/null +++ b/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_qk_matmul.cpp @@ -0,0 +1,90 @@ +// QK Matmul Kernel: qi(M, K) @ kj.T(K, N) -> sij(M, N) +// +// Fixed tile size: (16, 16) @ (16, 16).T -> (16, 16) +// +// kj is stored as (N, K) = (block_size, head_dim) in row-major memory. +// This is equivalent to (K, N) in column-major (DN) layout. +// Using DN GlobalB + RowMajor/ColMajor TileMatB to handle the transposed B pattern. + +#include +#include + +#include "tensor.h" + +using namespace pto; + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#define __aicore__ [aicore] +#endif + +template +static __aicore__ void qk_matmul_impl(__gm__ Tensor* qi, __gm__ Tensor* kj, __gm__ Tensor* sij) { + __gm__ half* qi_addr = reinterpret_cast<__gm__ half*>(qi->buffer.addr); + __gm__ half* kj_addr = reinterpret_cast<__gm__ half*>(kj->buffer.addr); + __gm__ float* sij_addr = reinterpret_cast<__gm__ float*>(sij->buffer.addr); + + // qi (M, K) fp16 in ND (row-major) layout + using GlobalA = GlobalTensor, pto::Stride>; + // kj stored as (N, K) row-major = (K, N) column-major -> DN layout + using GlobalB = GlobalTensor, pto::Stride, Layout::DN>; + using GlobalOut = GlobalTensor, pto::Stride>; + + GlobalA qiGlobal(qi_addr + qi->start_offset); + GlobalB kjGlobal(kj_addr + kj->start_offset); + GlobalOut sijGlobal(sij_addr + sij->start_offset); + + // L1 Mat tiles: A is standard ND, B uses transposed-B pattern (RowMajor/ColMajor) + using TileMatA = Tile; + using TileMatB = Tile; + + // L0 tiles + using LeftTile = TileLeft; + using RightTile = TileRight; + 
using AccTile = TileAcc; + + TileMatA aMatTile; + TileMatB bMatTile; + TASSIGN(aMatTile, 0x0); + TASSIGN(bMatTile, 0x20000); + + LeftTile aTile; + RightTile bTile; + AccTile cTile; + TASSIGN(aTile, 0x0); + TASSIGN(bTile, 0x0); + TASSIGN(cTile, 0x0); + + // Load A and B to L1 + TLOAD(aMatTile, qiGlobal); + TLOAD(bMatTile, kjGlobal); + + set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); + + // Move from L1 to L0A/L0B + TMOV(aTile, aMatTile); + TMOV(bTile, bMatTile); + + set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); + wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); + + // Matmul + TMATMUL(cTile, aTile, bTile); + + set_flag(PIPE_M, PIPE_FIX, EVENT_ID0); + wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0); + + TSTORE(sijGlobal, cTile); +} + +extern "C" __aicore__ void kernel_entry(__gm__ int64_t* args) { + __gm__ Tensor* qi = reinterpret_cast<__gm__ Tensor*>(args[0]); + __gm__ Tensor* kj = reinterpret_cast<__gm__ Tensor*>(args[1]); + __gm__ Tensor* sij = reinterpret_cast<__gm__ Tensor*>(args[2]); + + qk_matmul_impl<16, 16, 16>(qi, kj, sij); +} diff --git a/tests/device_tests/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_hub.cpp b/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_hub.cpp similarity index 100% rename from tests/device_tests/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_hub.cpp rename to examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_hub.cpp diff --git a/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_online_update.cpp b/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_online_update.cpp new file mode 100644 index 00000000..7351f73f --- /dev/null +++ b/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_online_update.cpp @@ -0,0 +1,224 @@ +// Online Softmax Update + Normalize Kernel (AIV) +// +// Fixed tile size: oi/oi_new are (16, 16), mij/lij/mi/li are 16-element vectors +// +// Scalar layout strategy: +// M scalar floats 
stored contiguously in GM can be loaded as either: +// - ND (kScalarRows, kScalarCols) RowMajor for element-wise ops (TMAX, TSUB, TEXP, TMUL, TADD) +// - DN (kAlignedRows, 1) ColMajor for row-broadcast ops (TROWEXPANDMUL, TROWEXPANDDIV) +// Conversion between layouts uses GM round-trip: ND TSTORE -> DN TLOAD. + +#include +#include + +#include "tensor.h" + +using namespace pto; + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#define __aicore__ [aicore] +#endif + +template +static __aicore__ void online_update_impl(__gm__ Tensor* mij, + __gm__ Tensor* lij, + __gm__ Tensor* oi_new, + __gm__ Tensor* mi, + __gm__ Tensor* li, + __gm__ Tensor* oi, + uint64_t is_first, + uint64_t is_last, + __gm__ Tensor* dst) { + __gm__ float* mij_ptr = reinterpret_cast<__gm__ float*>(mij->buffer.addr); + __gm__ float* lij_ptr = reinterpret_cast<__gm__ float*>(lij->buffer.addr); + __gm__ float* oi_new_ptr = reinterpret_cast<__gm__ float*>(oi_new->buffer.addr); + __gm__ float* mi_ptr = reinterpret_cast<__gm__ float*>(mi->buffer.addr); + __gm__ float* li_ptr = reinterpret_cast<__gm__ float*>(li->buffer.addr); + __gm__ float* oi_ptr = reinterpret_cast<__gm__ float*>(oi->buffer.addr); + __gm__ float* dst_ptr = reinterpret_cast<__gm__ float*>(dst->buffer.addr); + + // Scalar tile dimensions for RowMajor layout: + // kScalarCols = 32 bytes / 4 bytes per float = 8 floats per row (one 32-byte block) + // kScalarRows = M / 8 (M=16 -> 2 rows) + constexpr int kScalarCols = 32 / sizeof(float); + constexpr int kScalarRows = M / kScalarCols; + // Aligned rows for ColMajor DN tiles (32-byte alignment) + constexpr int kAlignedRows = ((M * sizeof(float) + 31) / 32) * (32 / sizeof(float)); + + // --- GlobalTensor types --- + + // Data (M, N) RowMajor + using GlobalDataMxN = GlobalTensor, pto::Stride<1, 1, 1, N, 1>>; + + // Scalar ND: M contiguous floats as (kScalarRows, kScalarCols) RowMajor + using GlobalScalarND = + GlobalTensor, pto::Stride<1, 1, 1, kScalarCols, 1>>; + + // Scalar DN: 
same M contiguous floats as (kAlignedRows, 1) ColMajor + using GlobalScalarDN = GlobalTensor, pto::Stride<1, 1, 1, 1, 1>, Layout::DN>; + + // --- GlobalTensor instances --- + + GlobalDataMxN oiNewGlobal(oi_new_ptr + oi_new->start_offset); + GlobalDataMxN oiGlobal(oi_ptr + oi->start_offset); + GlobalDataMxN dstGlobal(dst_ptr + dst->start_offset); + + // ND globals for scalar element-wise operations + GlobalScalarND mijGlobalND(mij_ptr + mij->start_offset); + GlobalScalarND lijGlobalND(lij_ptr + lij->start_offset); + GlobalScalarND miGlobalND(mi_ptr + mi->start_offset); + GlobalScalarND liGlobalND(li_ptr + li->start_offset); + + // DN globals aliased to same GM for ColMajor reload (used after ND TSTORE) + GlobalScalarDN mijGlobalDN(mij_ptr + mij->start_offset); + GlobalScalarDN lijGlobalDN(lij_ptr + lij->start_offset); + GlobalScalarDN liGlobalDN(li_ptr + li->start_offset); + + // --- Tile types --- + + using TileDataMxN = Tile; + using TileScalarND = + Tile; + using TileScalarDN = Tile; + + // --- UB memory layout --- + + constexpr int kDataBytes = M * N * sizeof(float); + constexpr int kScalarNDBytes = kScalarRows * kScalarCols * sizeof(float); + constexpr int kScalarDNBytes = kAlignedRows * sizeof(float); + + // Data tiles + TileDataMxN oiNewTile; + TileDataMxN oiTile; + + // Scalar ND tiles for element-wise arithmetic + TileScalarND mijND, lijND, miND, liND; + TileScalarND miNewND, alphaND, betaND, tmpND; + + // Scalar DN tiles for TROWEXPAND operations + TileScalarDN alphaDN, betaDN, liDN; + + TASSIGN(oiNewTile, 0); + TASSIGN(oiTile, kDataBytes); + TASSIGN(mijND, 2 * kDataBytes); + TASSIGN(lijND, 2 * kDataBytes + kScalarNDBytes); + TASSIGN(miND, 2 * kDataBytes + 2 * kScalarNDBytes); + TASSIGN(liND, 2 * kDataBytes + 3 * kScalarNDBytes); + TASSIGN(miNewND, 2 * kDataBytes + 4 * kScalarNDBytes); + TASSIGN(alphaND, 2 * kDataBytes + 5 * kScalarNDBytes); + TASSIGN(betaND, 2 * kDataBytes + 6 * kScalarNDBytes); + TASSIGN(tmpND, 2 * kDataBytes + 7 * kScalarNDBytes); + 
TASSIGN(alphaDN, 2 * kDataBytes + 8 * kScalarNDBytes); + TASSIGN(betaDN, 2 * kDataBytes + 8 * kScalarNDBytes + kScalarDNBytes); + TASSIGN(liDN, 2 * kDataBytes + 8 * kScalarNDBytes + 2 * kScalarDNBytes); + + if (is_first) { + // --- First block: copy inputs to accumulators --- + TLOAD(oiNewTile, oiNewGlobal); + TLOAD(mijND, mijGlobalND); + TLOAD(lijND, lijGlobalND); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + + // Passthrough to MTE3 (no V compute needed) + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(miGlobalND, mijND); // mi = mij + TSTORE(liGlobalND, lijND); // li = lij + TSTORE(oiGlobal, oiNewTile); // oi = oi_new + + if (is_last) { + // Single block: normalize dst = oi_new / lij + // lij stored to li buffer in ND format; reload as DN for TROWEXPANDDIV + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + TLOAD(liDN, liGlobalDN); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TROWEXPANDDIV(oiNewTile, oiNewTile, liDN); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); + TSTORE(dstGlobal, oiNewTile); + } + } else { + // --- Subsequent blocks: accumulate --- + + // Phase 1: Load all inputs + TLOAD(oiNewTile, oiNewGlobal); + TLOAD(oiTile, oiGlobal); + TLOAD(mijND, mijGlobalND); + TLOAD(lijND, lijGlobalND); + TLOAD(miND, miGlobalND); + TLOAD(liND, liGlobalND); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + + // Phase 2: Scalar arithmetic in RowMajor (kScalarRows, kScalarCols) + // to resolve RAW hazards on shared UB tiles. 
+ TMAX(miNewND, miND, mijND); // mi_new = max(mi, mij) + TSUB(alphaND, miND, miNewND); // alpha = mi - mi_new + TEXP(alphaND, alphaND); // alpha = exp(mi - mi_new) + TSUB(betaND, mijND, miNewND); // beta = mij - mi_new + TEXP(betaND, betaND); // beta = exp(mij - mi_new) + TMUL(liND, alphaND, liND); // li = alpha * li + TMUL(tmpND, betaND, lijND); // tmp = beta * lij + TADD(liND, liND, tmpND); // li = alpha * li + beta * lij (= li_new) + + // Phase 3: Store scalar results to GM (ND format) + // mi_new -> mi accumulator, li_new -> li accumulator + // alpha -> mij buffer (reuse), beta -> lij buffer (reuse) + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(miGlobalND, miNewND); // persist mi_new + TSTORE(liGlobalND, liND); // persist li_new + TSTORE(mijGlobalND, alphaND); // temp: alpha to mij buffer + TSTORE(lijGlobalND, betaND); // temp: beta to lij buffer + + // Phase 4: Reload alpha, beta (and li if last) as ColMajor DN + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + TLOAD(alphaDN, mijGlobalDN); // alpha from mij buffer as DN + TLOAD(betaDN, lijGlobalDN); // beta from lij buffer as DN + if (is_last) { + TLOAD(liDN, liGlobalDN); // li_new from li buffer as DN + } + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + + // Phase 5: Scale data tiles using row-broadcast multiply + TROWEXPANDMUL(oiTile, oiTile, alphaDN); // oi *= alpha + TROWEXPANDMUL(oiNewTile, oiNewTile, betaDN); // oi_new *= beta + TADD(oiTile, oiTile, oiNewTile); // oi = alpha*oi + beta*oi_new + + if (is_last) { + // Phase 6: Normalize and output + TROWEXPANDDIV(oiTile, oiTile, liDN); // dst = oi / li_new + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); + TSTORE(dstGlobal, oiTile); + } else { + // Phase 6: Store updated accumulators + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); + TSTORE(oiGlobal, oiTile); + 
} + } +} + +extern "C" __aicore__ void kernel_entry(__gm__ int64_t* args) { + __gm__ Tensor* mij = reinterpret_cast<__gm__ Tensor*>(args[0]); + __gm__ Tensor* lij = reinterpret_cast<__gm__ Tensor*>(args[1]); + __gm__ Tensor* oi_new = reinterpret_cast<__gm__ Tensor*>(args[2]); + __gm__ Tensor* mi = reinterpret_cast<__gm__ Tensor*>(args[3]); + __gm__ Tensor* li = reinterpret_cast<__gm__ Tensor*>(args[4]); + __gm__ Tensor* oi = reinterpret_cast<__gm__ Tensor*>(args[5]); + __gm__ Tensor* dst = reinterpret_cast<__gm__ Tensor*>(args[6]); + uint64_t is_first = static_cast(args[7]); + uint64_t is_last = static_cast(args[8]); + + online_update_impl<16, 16>(mij, lij, oi_new, mi, li, oi, is_first, is_last, dst); +} diff --git a/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp b/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp new file mode 100644 index 00000000..d0f97987 --- /dev/null +++ b/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp @@ -0,0 +1,147 @@ +// Softmax Preparation Kernel (AIV) with partial block masking +// +// Fixed tile size: sij is (16, 16) +// +// For partial blocks (valid_len < N), positions [valid_len, N) in sij are +// filled with -inf before softmax, ensuring exp(-inf)=0 so that invalid +// key positions contribute zero attention weight. +// +// Uses TFILLPAD_INPLACE for vector pipeline state setup, then patches with +// scalar SetValue writes to fix a hardware bug in TFILLPAD's vcopy broadcast +// path at small N (N=16). 
+// +// Computes: +// sij_masked = pad(sij, valid_len, -inf) +// sij_scale = sij_masked * scale +// mij = row_max(sij_scale) -> (M, 1) +// pij = exp(sij_scale - mij) -> (M, N) +// lij = row_sum(pij) -> (M, 1) + +#include +#include + +#include "tensor.h" + +using namespace pto; + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#define __aicore__ [aicore] +#endif + +template +static __aicore__ void softmax_prepare_impl(__gm__ Tensor* sij, + float scale_value, + __gm__ Tensor* pij, + __gm__ Tensor* mij, + __gm__ Tensor* lij) { + uint64_t valid_len = static_cast(sij->shapes[1]); + __gm__ float* sij_addr = reinterpret_cast<__gm__ float*>(sij->buffer.addr); + __gm__ half* pij_addr = reinterpret_cast<__gm__ half*>(pij->buffer.addr); + __gm__ float* mij_addr = reinterpret_cast<__gm__ float*>(mij->buffer.addr); + __gm__ float* lij_addr = reinterpret_cast<__gm__ float*>(lij->buffer.addr); + + constexpr int kAlignedRows = ((M * sizeof(float) + 31) / 32) * (32 / sizeof(float)); + + using GlobalDataMxN = GlobalTensor, pto::Stride<1, 1, 1, N, 1>>; + using GlobalDataMxN_f16 = GlobalTensor, pto::Stride<1, 1, 1, N, 1>>; + using GlobalScalarDN = GlobalTensor, pto::Stride<1, 1, 1, 1, 1>, Layout::DN>; + + GlobalDataMxN sijGlobal(sij_addr + sij->start_offset); + GlobalDataMxN_f16 pijGlobal(pij_addr + pij->start_offset); + GlobalScalarDN mijGlobal(mij_addr + mij->start_offset); + GlobalScalarDN lijGlobal(lij_addr + lij->start_offset); + + // Dynamic-cols tile: marks which columns are valid for TFILLPAD boundary + using TileSijDyn = Tile; + // Padded tile: TFILLPAD_INPLACE fills positions [valid_len, N) with -inf + using TileSijPad = Tile; + + using TileVecMxN = Tile; + using TileVecMxN_f16 = Tile; + using TileScalarDN = Tile; + + TileVecMxN sijTile; + TileSijDyn sijDynTile(static_cast(valid_len)); + TileSijPad sijPadTile; + TileVecMxN pijTile; + TileVecMxN tmpTile; + TileScalarDN maxTile; + TileScalarDN sumTile; + TileVecMxN_f16 pijF16Tile; + + TASSIGN(sijTile, 0x0); + 
TASSIGN(sijDynTile, 0x0); + TASSIGN(sijPadTile, 0x0); + TASSIGN(pijTile, M * N * sizeof(float)); + TASSIGN(tmpTile, 2 * M * N * sizeof(float)); + TASSIGN(maxTile, 3 * M * N * sizeof(float)); + TASSIGN(sumTile, 3 * M * N * sizeof(float) + kAlignedRows * sizeof(float)); + TASSIGN(pijF16Tile, 3 * M * N * sizeof(float) + 2 * kAlignedRows * sizeof(float)); + + // Load full sij (M, N) tile from GM - all N columns including garbage for partial blocks + TLOAD(sijTile, sijGlobal); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + + // Mask columns [valid_len, N) with -inf. + // Use TFILLPAD_INPLACE for the main fill, then patch with SetValue for + // cases where TFILLPAD's vcopy broadcast path fails at small N. + TFILLPAD_INPLACE(sijPadTile, sijDynTile); + // Patch: SetValue ensures correctness for valid_len <= N/2 where + // TFILLPAD's PadRightRemainingRows vcopy has a hardware issue. + if (valid_len < static_cast(N)) { + // Cross-pipeline sync: wait for PIPE_V vcopy in TFILLPAD to complete + // before PIPE_S scalar SetValue writes to the same UB addresses. + // Without this, PIPE_V vcopy and PIPE_S SetValue race on UB memory, + // causing sporadic FAIL when vcopy finishes after SetValue. + // Pattern from TFillPad.hpp Handle32BAlignedPad_Byte (PtoSetWaitFlag). + set_flag(PIPE_V, PIPE_S, EVENT_ID0); + wait_flag(PIPE_V, PIPE_S, EVENT_ID0); + constexpr float NEG_INF = -__builtin_huge_valf(); + for (int r = 0; r < M; r++) { + for (uint64_t c = valid_len; c < N; c++) { + sijTile.SetValue(static_cast(r * N + c), NEG_INF); + } + } + // Ensure PIPE_S scalar UB writes are visible to subsequent PIPE_V ops. + // dsb(DSB_UB) is a hardware-only intrinsic; in simulation there are no + // real pipelines so the barrier is unnecessary and DSB_UB is undefined. 
+#ifdef DSB_UB + dsb(DSB_UB); +#endif + } + + TMULS(sijTile, sijTile, scale_value); + TROWMAX(maxTile, sijTile, tmpTile); + TROWEXPANDSUB(pijTile, sijTile, maxTile); + TEXP(pijTile, pijTile); + // Truncate pij to fp16 first, then compute lij from truncated values (matches golden) + TCVT(pijF16Tile, pijTile, RoundMode::CAST_ROUND); + TCVT(pijTile, pijF16Tile, RoundMode::CAST_ROUND); + TROWSUM(sumTile, pijTile, tmpTile); + + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(mijGlobal, maxTile); + TSTORE(lijGlobal, sumTile); + TSTORE(pijGlobal, pijF16Tile); +} + +extern "C" __aicore__ void kernel_entry(__gm__ int64_t* args) { + __gm__ Tensor* sij = reinterpret_cast<__gm__ Tensor*>(args[0]); + union { + uint64_t u; + float f; + } scale_conv; + scale_conv.u = static_cast<uint64_t>(args[1]); + float scale_value = scale_conv.f; + __gm__ Tensor* pij = reinterpret_cast<__gm__ Tensor*>(args[2]); + __gm__ Tensor* mij = reinterpret_cast<__gm__ Tensor*>(args[3]); + __gm__ Tensor* lij = reinterpret_cast<__gm__ Tensor*>(args[4]); + + softmax_prepare_impl<16, 16>(sij, scale_value, pij, mij, lij); +} diff --git a/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/kernel_config.py b/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/kernel_config.py new file mode 100644 index 00000000..d7627cd0 --- /dev/null +++ b/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/kernel_config.py @@ -0,0 +1,46 @@ +""" +Paged Attention Kernel and Orchestration Configuration + +Defines the kernels and orchestration function for paged attention +with AIC/AIV subgraph splitting: + +AIC Kernels (Matrix Multiplication): + - aic_qk_matmul: Q @ K^T computation + - aic_pv_matmul: P @ V computation + +AIV Kernels (Vector Operations): + - aiv_softmax_prepare: scale, rowmax, exp, rowsum + - aiv_online_update: online softmax accumulation + fused normalization + +Note: aiv_normalize has been merged into aiv_online_update for efficiency. 
+""" + +from pathlib import Path + +_KERNELS_ROOT = Path(__file__).parent + +# Orchestration config +ORCHESTRATION = { + "source": str(_KERNELS_ROOT / "orchestration" / "paged_attention_orch.cpp"), + "function_name": "aicpu_orchestration_entry", +} + +# Kernel configs (aiv_normalize removed - merged into aiv_online_update) +KERNELS = [ + # AIC kernels (matrix multiplication using Cube unit) + {"func_id": 0, "name": "QK", "source": str(_KERNELS_ROOT / "aic" / "aic_qk_matmul.cpp"), "core_type": "aic"}, + {"func_id": 2, "name": "PV", "source": str(_KERNELS_ROOT / "aic" / "aic_pv_matmul.cpp"), "core_type": "aic"}, + {"func_id": 4, "name": "AIC_HUB", "source": str(_KERNELS_ROOT / "aic" / "aic_hub.cpp"), "core_type": "aic"}, + # AIV kernels (vector operations) + {"func_id": 1, "name": "SF", "source": str(_KERNELS_ROOT / "aiv" / "aiv_softmax_prepare.cpp"), "core_type": "aiv"}, + {"func_id": 3, "name": "UP", "source": str(_KERNELS_ROOT / "aiv" / "aiv_online_update.cpp"), "core_type": "aiv"}, + {"func_id": 5, "name": "AIV_HUB", "source": str(_KERNELS_ROOT / "aiv" / "aiv_hub.cpp"), "core_type": "aiv"}, +] + +# Runtime configuration +RUNTIME_CONFIG = { + "runtime": "tensormap_and_ringbuffer", + "aicpu_thread_num": 4, + "orch_thread_num": 2, + "block_dim": 24, +} diff --git a/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp b/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp new file mode 100644 index 00000000..9184031e --- /dev/null +++ b/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp @@ -0,0 +1,214 @@ +/** + * Paged Attention Orchestration Function - 16x16 Version + * + * Simplified for 16x16 framework-generated matmul kernels. + * Each block processes a single 16x16 matmul operation. 
+ * + * Memory Layout: + * Query: (batch, 16, 16) - one 16x16 tile per batch fp16 + * Key: (total_blocks, 16, 16) - stored as K^T for direct matmul fp16 + * Value: (total_blocks, 16, 16) - direct format fp16 + * + * This file compiles as a standalone .so with zero runtime link dependencies. + * All runtime calls go through the PTO2RuntimeOps function-pointer table. + */ + +#include +#include + +#include "pto_orchestration_api.h" + +#define FUNC_QK_MATMUL 0 +#define FUNC_SOFTMAX_PREPARE 1 +#define FUNC_PV_MATMUL 2 +#define FUNC_ONLINE_UPDATE 3 +#define FUNC_AIC_HUB 4 +#define FUNC_AIV_HUB 5 + +// Helper to encode float as uint64_t for scalar params +static uint64_t float_to_u64(float f) { + union { + float f32; + uint64_t u64; + } conv; + conv.u64 = 0; // Clear upper bits + conv.f32 = f; + return conv.u64; +} + +extern "C" { + +/** + * Orchestration config — the executor reads these values to set up + * shared memory and runtime before calling aicpu_orchestration_entry. + */ +__attribute__((visibility("default"))) +PTO2OrchestrationConfig aicpu_orchestration_config(uint64_t* args, int arg_count) { + (void)args; + (void)arg_count; + return PTO2OrchestrationConfig{ + .expected_arg_count = 10, + }; +} + +/** + * Orchestration entry — receives a PTO2Runtime* with ops table populated. + * The executor wraps this call in PTO2_SCOPE, so we are already inside + * the outer scope on entry. 
+ */ +__attribute__((visibility("default"))) +void aicpu_orchestration_entry(PTO2Runtime* rt, uint64_t* args, int arg_count, int orch_thread_num, int orch_thread_index) { + (void)arg_count; + + // Extract device pointers (first 7) + void* host_query = (void*)(uintptr_t)args[0]; // [batch, num_heads, head_dim] + void* host_key_cache = (void*)(uintptr_t)args[1]; // [batch, block_num, block_size, head_dim] + void* host_value_cache = (void*)(uintptr_t)args[2]; // [batch, block_num, block_size, head_dim] + int* host_block_table = (int*)(uintptr_t)args[3]; // [batch, block_num] + int* host_context_lens = (int*)(uintptr_t)args[4]; // [batch] + void* host_out = (void*)(uintptr_t)args[5]; // [batch, num_heads, head_dim] + int64_t* host_config = (int64_t*)(uintptr_t)args[6]; + + // Extract sizes (next 3 args after pointers) + size_t query_size = (size_t)args[7]; + size_t key_cache_size = (size_t)args[8]; + size_t value_cache_size = (size_t)args[9]; + + // Extract config parameters + uint64_t batch = (uint64_t)(int)host_config[0]; + uint64_t num_heads = (uint64_t)(int)host_config[1]; + int kv_head_num = (int)host_config[2]; + uint64_t head_dim = (uint64_t)(int)host_config[3]; + uint64_t block_size = (uint64_t)(int)host_config[4]; + uint64_t block_num = (uint64_t)(int)host_config[5]; + // Reinterpret scale_bits as float (golden.py packs float via struct.pack) + union { uint32_t u; float f; } scale_conv; + scale_conv.u = (uint32_t)host_config[6]; + float scale_value = scale_conv.f; + uint64_t q_head_num = num_heads; + uint64_t q_tile = 16; + uint64_t q_loop = (q_head_num + q_tile - 1) / q_tile; + DataType data_type = DataType::FLOAT16; + uint64_t elem_size = get_element_size(data_type); + + (void)kv_head_num; + + // Partition batch across orchestrators + uint64_t b_start = batch * orch_thread_index / orch_thread_num; + uint64_t b_end = batch * (orch_thread_index + 1) / orch_thread_num; + + LOG_INFO(rt, "orch_idx=%d/%d batch=%lu b_range=[%lu,%lu)", + orch_thread_index, 
orch_thread_num, + (unsigned long)batch, (unsigned long)b_start, (unsigned long)b_end); + + // Compute actual tensor shapes from buffer sizes (not from max block_num) + uint64_t query_shapes[2] = {batch * num_heads, head_dim}; + uint64_t kv_total_rows = key_cache_size / (head_dim * elem_size); + uint64_t key_cache_shapes[2] = {kv_total_rows, head_dim}; + uint64_t value_cache_shapes[2] = {kv_total_rows, head_dim}; + uint64_t out_shapes[2] = {batch * num_heads, head_dim}; + Tensor query = make_tensor_external(host_query, query_shapes, 2, data_type); + Tensor key_cache = make_tensor_external(host_key_cache, key_cache_shapes, 2, data_type); + Tensor value_cache = make_tensor_external(host_value_cache, value_cache_shapes, 2, data_type); + Tensor out = make_tensor_external(host_out, out_shapes, 2, DataType::FLOAT32); + LOG_DEBUG(rt, "query=%s", query.dump().c_str()); + LOG_DEBUG(rt, "key_cache=%s", key_cache.dump().c_str()); + LOG_DEBUG(rt, "value_cache=%s", value_cache.dump().c_str()); + LOG_DEBUG(rt, "out=%s", out.dump().c_str()); + + for (uint64_t b_idx = b_start; b_idx < b_end; b_idx++) { + uint64_t cur_seq = host_context_lens[b_idx]; + uint64_t bn_this_batch = (cur_seq + block_size - 1) / block_size; + for (uint64_t q_idx = 0; q_idx < q_loop; q_idx++) { + PTO2_SCOPE(rt) { + uint64_t cur_offset = b_idx * q_head_num + q_idx * q_tile; + uint64_t oi_shapes[2] = {q_tile, head_dim}; + uint64_t li_shapes[1] = {q_tile}; + uint64_t mi_shapes[1] = {q_tile}; + Tensor oi = make_tensor(oi_shapes, 2, DataType::FLOAT32); + Tensor li_update = make_tensor(li_shapes, 1, DataType::FLOAT32); + Tensor mi_update = make_tensor(mi_shapes, 1, DataType::FLOAT32); + + uint64_t qi_shapes[2] = {q_tile, head_dim}; + uint64_t qi_offsets[2] = {cur_offset, 0}; + Tensor qi = query.view(qi_shapes, qi_offsets); + uint64_t out_view_shapes[2] = {q_tile, head_dim}; + uint64_t out_view_offsets[2] = {cur_offset, 0}; + Tensor out_view = out.view(out_view_shapes, out_view_offsets); + + PTOParam 
params_inplace[] = { + make_output_param(oi), + make_output_param(li_update), + make_output_param(mi_update), + }; + pto2_rt_submit_task(rt, FUNC_AIV_HUB, PTO2_WORKER_VECTOR, params_inplace, 3); // create_inplace + + for (uint64_t bn = 0; bn < bn_this_batch; bn++) { + uint64_t cur_block_idx = host_block_table[b_idx * block_num + bn]; + uint64_t valid_len = block_size < (cur_seq - bn * block_size) ? block_size : (cur_seq - bn * block_size); + uint64_t kv_shapes[2] = {block_size, head_dim}; + uint64_t kv_offsets[2] = {cur_block_idx * block_size, 0}; + Tensor kj = key_cache.view(kv_shapes, kv_offsets); + Tensor vj = value_cache.view(kv_shapes, kv_offsets); + + uint64_t sij_shapes[2] = {q_tile, block_size}; + Tensor sij = make_tensor(sij_shapes, 2, DataType::FLOAT32); + Tensor pij_f16 = make_tensor(sij_shapes, 2, data_type); + + PTOParam params_qk[] = { + make_input_param(qi), + make_input_param(kj), + make_output_param(sij), + }; + pto2_rt_submit_task(rt, FUNC_QK_MATMUL, PTO2_WORKER_CUBE, params_qk, 3); // c1 + + uint64_t sij_valid_shapes[2] = {q_tile, valid_len}; + uint64_t sij_valid_offsets[2] = {0, 0}; + Tensor sij_valid = sij.view(sij_valid_shapes, sij_valid_offsets); + Tensor li = make_tensor(li_shapes, 1, DataType::FLOAT32); + Tensor mi = make_tensor(mi_shapes, 1, DataType::FLOAT32); + PTOParam params_sf[] = { + make_input_param(sij_valid), + make_scalar_param(float_to_u64(scale_value)), + make_output_param(pij_f16), + make_output_param(mi), + make_output_param(li), + }; + pto2_rt_submit_task(rt, FUNC_SOFTMAX_PREPARE, PTO2_WORKER_VECTOR, params_sf, 5); // v1 + + uint64_t oi_tmp_shapes[2] = {q_tile, head_dim}; + Tensor oi_tmp = make_tensor(oi_tmp_shapes, 2, DataType::FLOAT32); + + PTOParam params_pv[] = { + make_input_param(pij_f16), + make_input_param(vj), + make_output_param(oi_tmp), + }; + pto2_rt_submit_task(rt, FUNC_PV_MATMUL, PTO2_WORKER_CUBE, params_pv, 3); // c2 + + uint64_t is_first = (bn == 0) ? 1 : 0; + uint64_t is_last = (bn == bn_this_batch - 1) ? 
1 : 0; + + PTOParam params_up[] = { + make_input_param(mi), + make_input_param(li), + make_input_param(oi_tmp), + make_inout_param(mi_update), + make_inout_param(li_update), + make_inout_param(oi), + make_output_param(out_view), + make_scalar_param(is_first), + make_scalar_param(is_last), + }; + pto2_rt_submit_task(rt, FUNC_ONLINE_UPDATE, PTO2_WORKER_VECTOR, params_up, 9); // v2 + } + } + } + } + + LOG_INFO(rt, "orch_idx=%d: tasks submitted for batch=[%lu,%lu), num_heads=%lu", + orch_thread_index, (unsigned long)b_start, (unsigned long)b_end, + (unsigned long)num_heads); +} + +} // extern "C" diff --git a/tests/device_tests/aicpu_build_graph/paged_attention/README.md b/tests/device_tests/a2a3/aicpu_build_graph/paged_attention/README.md similarity index 100% rename from tests/device_tests/aicpu_build_graph/paged_attention/README.md rename to tests/device_tests/a2a3/aicpu_build_graph/paged_attention/README.md diff --git a/tests/device_tests/aicpu_build_graph/paged_attention/golden.py b/tests/device_tests/a2a3/aicpu_build_graph/paged_attention/golden.py similarity index 100% rename from tests/device_tests/aicpu_build_graph/paged_attention/golden.py rename to tests/device_tests/a2a3/aicpu_build_graph/paged_attention/golden.py diff --git a/tests/device_tests/aicpu_build_graph/paged_attention/kernels/aic/aic_pv_matmul.cpp b/tests/device_tests/a2a3/aicpu_build_graph/paged_attention/kernels/aic/aic_pv_matmul.cpp similarity index 100% rename from tests/device_tests/aicpu_build_graph/paged_attention/kernels/aic/aic_pv_matmul.cpp rename to tests/device_tests/a2a3/aicpu_build_graph/paged_attention/kernels/aic/aic_pv_matmul.cpp diff --git a/tests/device_tests/aicpu_build_graph/paged_attention/kernels/aic/aic_qk_matmul.cpp b/tests/device_tests/a2a3/aicpu_build_graph/paged_attention/kernels/aic/aic_qk_matmul.cpp similarity index 100% rename from tests/device_tests/aicpu_build_graph/paged_attention/kernels/aic/aic_qk_matmul.cpp rename to 
tests/device_tests/a2a3/aicpu_build_graph/paged_attention/kernels/aic/aic_qk_matmul.cpp diff --git a/tests/device_tests/aicpu_build_graph/paged_attention/kernels/aiv/aiv_online_update.cpp b/tests/device_tests/a2a3/aicpu_build_graph/paged_attention/kernels/aiv/aiv_online_update.cpp similarity index 100% rename from tests/device_tests/aicpu_build_graph/paged_attention/kernels/aiv/aiv_online_update.cpp rename to tests/device_tests/a2a3/aicpu_build_graph/paged_attention/kernels/aiv/aiv_online_update.cpp diff --git a/tests/device_tests/aicpu_build_graph/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp b/tests/device_tests/a2a3/aicpu_build_graph/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp similarity index 100% rename from tests/device_tests/aicpu_build_graph/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp rename to tests/device_tests/a2a3/aicpu_build_graph/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp diff --git a/tests/device_tests/aicpu_build_graph/paged_attention/kernels/kernel_config.py b/tests/device_tests/a2a3/aicpu_build_graph/paged_attention/kernels/kernel_config.py similarity index 100% rename from tests/device_tests/aicpu_build_graph/paged_attention/kernels/kernel_config.py rename to tests/device_tests/a2a3/aicpu_build_graph/paged_attention/kernels/kernel_config.py diff --git a/tests/device_tests/aicpu_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp b/tests/device_tests/a2a3/aicpu_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp similarity index 100% rename from tests/device_tests/aicpu_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp rename to tests/device_tests/a2a3/aicpu_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp diff --git a/tests/device_tests/host_build_graph/paged_attention/README.md b/tests/device_tests/a2a3/host_build_graph/paged_attention/README.md similarity index 100% rename from 
tests/device_tests/host_build_graph/paged_attention/README.md rename to tests/device_tests/a2a3/host_build_graph/paged_attention/README.md diff --git a/tests/device_tests/host_build_graph/paged_attention/golden.py b/tests/device_tests/a2a3/host_build_graph/paged_attention/golden.py similarity index 100% rename from tests/device_tests/host_build_graph/paged_attention/golden.py rename to tests/device_tests/a2a3/host_build_graph/paged_attention/golden.py diff --git a/tests/device_tests/host_build_graph/paged_attention/kernels/aic/aic_pv_matmul.cpp b/tests/device_tests/a2a3/host_build_graph/paged_attention/kernels/aic/aic_pv_matmul.cpp similarity index 100% rename from tests/device_tests/host_build_graph/paged_attention/kernels/aic/aic_pv_matmul.cpp rename to tests/device_tests/a2a3/host_build_graph/paged_attention/kernels/aic/aic_pv_matmul.cpp diff --git a/tests/device_tests/host_build_graph/paged_attention/kernels/aic/aic_qk_matmul.cpp b/tests/device_tests/a2a3/host_build_graph/paged_attention/kernels/aic/aic_qk_matmul.cpp similarity index 100% rename from tests/device_tests/host_build_graph/paged_attention/kernels/aic/aic_qk_matmul.cpp rename to tests/device_tests/a2a3/host_build_graph/paged_attention/kernels/aic/aic_qk_matmul.cpp diff --git a/tests/device_tests/host_build_graph/paged_attention/kernels/aiv/aiv_online_update.cpp b/tests/device_tests/a2a3/host_build_graph/paged_attention/kernels/aiv/aiv_online_update.cpp similarity index 100% rename from tests/device_tests/host_build_graph/paged_attention/kernels/aiv/aiv_online_update.cpp rename to tests/device_tests/a2a3/host_build_graph/paged_attention/kernels/aiv/aiv_online_update.cpp diff --git a/tests/device_tests/host_build_graph/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp b/tests/device_tests/a2a3/host_build_graph/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp similarity index 100% rename from tests/device_tests/host_build_graph/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp rename to 
tests/device_tests/a2a3/host_build_graph/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp diff --git a/tests/device_tests/host_build_graph/paged_attention/kernels/kernel_config.py b/tests/device_tests/a2a3/host_build_graph/paged_attention/kernels/kernel_config.py similarity index 100% rename from tests/device_tests/host_build_graph/paged_attention/kernels/kernel_config.py rename to tests/device_tests/a2a3/host_build_graph/paged_attention/kernels/kernel_config.py diff --git a/tests/device_tests/host_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp b/tests/device_tests/a2a3/host_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp similarity index 100% rename from tests/device_tests/host_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp rename to tests/device_tests/a2a3/host_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp diff --git a/tests/device_tests/tensormap_and_ringbuffer/alternating_matmul_add/golden.py b/tests/device_tests/a2a3/tensormap_and_ringbuffer/alternating_matmul_add/golden.py similarity index 100% rename from tests/device_tests/tensormap_and_ringbuffer/alternating_matmul_add/golden.py rename to tests/device_tests/a2a3/tensormap_and_ringbuffer/alternating_matmul_add/golden.py diff --git a/tests/device_tests/tensormap_and_ringbuffer/alternating_matmul_add/kernels/aic/kernel_matmul.cpp b/tests/device_tests/a2a3/tensormap_and_ringbuffer/alternating_matmul_add/kernels/aic/kernel_matmul.cpp similarity index 100% rename from tests/device_tests/tensormap_and_ringbuffer/alternating_matmul_add/kernels/aic/kernel_matmul.cpp rename to tests/device_tests/a2a3/tensormap_and_ringbuffer/alternating_matmul_add/kernels/aic/kernel_matmul.cpp diff --git a/tests/device_tests/tensormap_and_ringbuffer/alternating_matmul_add/kernels/aiv/kernel_add.cpp b/tests/device_tests/a2a3/tensormap_and_ringbuffer/alternating_matmul_add/kernels/aiv/kernel_add.cpp similarity index 100% 
rename from tests/device_tests/tensormap_and_ringbuffer/alternating_matmul_add/kernels/aiv/kernel_add.cpp rename to tests/device_tests/a2a3/tensormap_and_ringbuffer/alternating_matmul_add/kernels/aiv/kernel_add.cpp diff --git a/tests/device_tests/tensormap_and_ringbuffer/alternating_matmul_add/kernels/kernel_config.py b/tests/device_tests/a2a3/tensormap_and_ringbuffer/alternating_matmul_add/kernels/kernel_config.py similarity index 100% rename from tests/device_tests/tensormap_and_ringbuffer/alternating_matmul_add/kernels/kernel_config.py rename to tests/device_tests/a2a3/tensormap_and_ringbuffer/alternating_matmul_add/kernels/kernel_config.py diff --git a/tests/device_tests/tensormap_and_ringbuffer/alternating_matmul_add/kernels/orchestration/alternating_orch.cpp b/tests/device_tests/a2a3/tensormap_and_ringbuffer/alternating_matmul_add/kernels/orchestration/alternating_orch.cpp similarity index 100% rename from tests/device_tests/tensormap_and_ringbuffer/alternating_matmul_add/kernels/orchestration/alternating_orch.cpp rename to tests/device_tests/a2a3/tensormap_and_ringbuffer/alternating_matmul_add/kernels/orchestration/alternating_orch.cpp diff --git a/tests/device_tests/tensormap_and_ringbuffer/batch_paged_attention/golden.py b/tests/device_tests/a2a3/tensormap_and_ringbuffer/batch_paged_attention/golden.py similarity index 100% rename from tests/device_tests/tensormap_and_ringbuffer/batch_paged_attention/golden.py rename to tests/device_tests/a2a3/tensormap_and_ringbuffer/batch_paged_attention/golden.py diff --git a/tests/device_tests/tensormap_and_ringbuffer/batch_paged_attention/kernels/aic/aic_hub.cpp b/tests/device_tests/a2a3/tensormap_and_ringbuffer/batch_paged_attention/kernels/aic/aic_hub.cpp similarity index 100% rename from tests/device_tests/tensormap_and_ringbuffer/batch_paged_attention/kernels/aic/aic_hub.cpp rename to tests/device_tests/a2a3/tensormap_and_ringbuffer/batch_paged_attention/kernels/aic/aic_hub.cpp diff --git 
a/tests/device_tests/tensormap_and_ringbuffer/batch_paged_attention/kernels/aic/aic_pv_matmul.cpp b/tests/device_tests/a2a3/tensormap_and_ringbuffer/batch_paged_attention/kernels/aic/aic_pv_matmul.cpp similarity index 100% rename from tests/device_tests/tensormap_and_ringbuffer/batch_paged_attention/kernels/aic/aic_pv_matmul.cpp rename to tests/device_tests/a2a3/tensormap_and_ringbuffer/batch_paged_attention/kernels/aic/aic_pv_matmul.cpp diff --git a/tests/device_tests/tensormap_and_ringbuffer/batch_paged_attention/kernels/aic/aic_qk_matmul.cpp b/tests/device_tests/a2a3/tensormap_and_ringbuffer/batch_paged_attention/kernels/aic/aic_qk_matmul.cpp similarity index 100% rename from tests/device_tests/tensormap_and_ringbuffer/batch_paged_attention/kernels/aic/aic_qk_matmul.cpp rename to tests/device_tests/a2a3/tensormap_and_ringbuffer/batch_paged_attention/kernels/aic/aic_qk_matmul.cpp diff --git a/tests/device_tests/tensormap_and_ringbuffer/batch_paged_attention/kernels/aiv/aiv_hub.cpp b/tests/device_tests/a2a3/tensormap_and_ringbuffer/batch_paged_attention/kernels/aiv/aiv_hub.cpp similarity index 100% rename from tests/device_tests/tensormap_and_ringbuffer/batch_paged_attention/kernels/aiv/aiv_hub.cpp rename to tests/device_tests/a2a3/tensormap_and_ringbuffer/batch_paged_attention/kernels/aiv/aiv_hub.cpp diff --git a/tests/device_tests/tensormap_and_ringbuffer/batch_paged_attention/kernels/aiv/aiv_online_update.cpp b/tests/device_tests/a2a3/tensormap_and_ringbuffer/batch_paged_attention/kernels/aiv/aiv_online_update.cpp similarity index 100% rename from tests/device_tests/tensormap_and_ringbuffer/batch_paged_attention/kernels/aiv/aiv_online_update.cpp rename to tests/device_tests/a2a3/tensormap_and_ringbuffer/batch_paged_attention/kernels/aiv/aiv_online_update.cpp diff --git a/tests/device_tests/tensormap_and_ringbuffer/batch_paged_attention/kernels/aiv/aiv_softmax_prepare.cpp 
b/tests/device_tests/a2a3/tensormap_and_ringbuffer/batch_paged_attention/kernels/aiv/aiv_softmax_prepare.cpp similarity index 100% rename from tests/device_tests/tensormap_and_ringbuffer/batch_paged_attention/kernels/aiv/aiv_softmax_prepare.cpp rename to tests/device_tests/a2a3/tensormap_and_ringbuffer/batch_paged_attention/kernels/aiv/aiv_softmax_prepare.cpp diff --git a/tests/device_tests/tensormap_and_ringbuffer/batch_paged_attention/kernels/kernel_config.py b/tests/device_tests/a2a3/tensormap_and_ringbuffer/batch_paged_attention/kernels/kernel_config.py similarity index 100% rename from tests/device_tests/tensormap_and_ringbuffer/batch_paged_attention/kernels/kernel_config.py rename to tests/device_tests/a2a3/tensormap_and_ringbuffer/batch_paged_attention/kernels/kernel_config.py diff --git a/tests/device_tests/tensormap_and_ringbuffer/batch_paged_attention/kernels/orchestration/paged_attention_orch.cpp b/tests/device_tests/a2a3/tensormap_and_ringbuffer/batch_paged_attention/kernels/orchestration/paged_attention_orch.cpp similarity index 100% rename from tests/device_tests/tensormap_and_ringbuffer/batch_paged_attention/kernels/orchestration/paged_attention_orch.cpp rename to tests/device_tests/a2a3/tensormap_and_ringbuffer/batch_paged_attention/kernels/orchestration/paged_attention_orch.cpp diff --git a/tests/device_tests/tensormap_and_ringbuffer/benchmark_bgemm/golden.py b/tests/device_tests/a2a3/tensormap_and_ringbuffer/benchmark_bgemm/golden.py similarity index 100% rename from tests/device_tests/tensormap_and_ringbuffer/benchmark_bgemm/golden.py rename to tests/device_tests/a2a3/tensormap_and_ringbuffer/benchmark_bgemm/golden.py diff --git a/tests/device_tests/tensormap_and_ringbuffer/benchmark_bgemm/kernels/aic/kernel_gemm_tile.cpp b/tests/device_tests/a2a3/tensormap_and_ringbuffer/benchmark_bgemm/kernels/aic/kernel_gemm_tile.cpp similarity index 100% rename from tests/device_tests/tensormap_and_ringbuffer/benchmark_bgemm/kernels/aic/kernel_gemm_tile.cpp 
rename to tests/device_tests/a2a3/tensormap_and_ringbuffer/benchmark_bgemm/kernels/aic/kernel_gemm_tile.cpp diff --git a/tests/device_tests/tensormap_and_ringbuffer/benchmark_bgemm/kernels/aiv/kernel_tile_add.cpp b/tests/device_tests/a2a3/tensormap_and_ringbuffer/benchmark_bgemm/kernels/aiv/kernel_tile_add.cpp similarity index 100% rename from tests/device_tests/tensormap_and_ringbuffer/benchmark_bgemm/kernels/aiv/kernel_tile_add.cpp rename to tests/device_tests/a2a3/tensormap_and_ringbuffer/benchmark_bgemm/kernels/aiv/kernel_tile_add.cpp diff --git a/tests/device_tests/tensormap_and_ringbuffer/benchmark_bgemm/kernels/kernel_config.py b/tests/device_tests/a2a3/tensormap_and_ringbuffer/benchmark_bgemm/kernels/kernel_config.py similarity index 100% rename from tests/device_tests/tensormap_and_ringbuffer/benchmark_bgemm/kernels/kernel_config.py rename to tests/device_tests/a2a3/tensormap_and_ringbuffer/benchmark_bgemm/kernels/kernel_config.py diff --git a/tests/device_tests/tensormap_and_ringbuffer/benchmark_bgemm/kernels/orchestration/bgemm_orch.cpp b/tests/device_tests/a2a3/tensormap_and_ringbuffer/benchmark_bgemm/kernels/orchestration/bgemm_orch.cpp similarity index 100% rename from tests/device_tests/tensormap_and_ringbuffer/benchmark_bgemm/kernels/orchestration/bgemm_orch.cpp rename to tests/device_tests/a2a3/tensormap_and_ringbuffer/benchmark_bgemm/kernels/orchestration/bgemm_orch.cpp diff --git a/tests/device_tests/tensormap_and_ringbuffer/paged_attention/golden.py b/tests/device_tests/a2a3/tensormap_and_ringbuffer/paged_attention/golden.py similarity index 100% rename from tests/device_tests/tensormap_and_ringbuffer/paged_attention/golden.py rename to tests/device_tests/a2a3/tensormap_and_ringbuffer/paged_attention/golden.py diff --git a/tests/device_tests/tensormap_and_ringbuffer/paged_attention_unroll/kernels/aic/aic_hub.cpp b/tests/device_tests/a2a3/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_hub.cpp similarity index 100% rename from 
tests/device_tests/tensormap_and_ringbuffer/paged_attention_unroll/kernels/aic/aic_hub.cpp rename to tests/device_tests/a2a3/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_hub.cpp diff --git a/tests/device_tests/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_pv_matmul.cpp b/tests/device_tests/a2a3/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_pv_matmul.cpp similarity index 100% rename from tests/device_tests/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_pv_matmul.cpp rename to tests/device_tests/a2a3/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_pv_matmul.cpp diff --git a/tests/device_tests/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_qk_matmul.cpp b/tests/device_tests/a2a3/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_qk_matmul.cpp similarity index 100% rename from tests/device_tests/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_qk_matmul.cpp rename to tests/device_tests/a2a3/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_qk_matmul.cpp diff --git a/tests/device_tests/tensormap_and_ringbuffer/paged_attention_unroll/kernels/aiv/aiv_hub.cpp b/tests/device_tests/a2a3/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_hub.cpp similarity index 100% rename from tests/device_tests/tensormap_and_ringbuffer/paged_attention_unroll/kernels/aiv/aiv_hub.cpp rename to tests/device_tests/a2a3/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_hub.cpp diff --git a/tests/device_tests/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_online_update.cpp b/tests/device_tests/a2a3/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_online_update.cpp similarity index 100% rename from tests/device_tests/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_online_update.cpp rename to tests/device_tests/a2a3/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_online_update.cpp diff --git 
a/tests/device_tests/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp b/tests/device_tests/a2a3/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp similarity index 100% rename from tests/device_tests/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp rename to tests/device_tests/a2a3/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp diff --git a/tests/device_tests/tensormap_and_ringbuffer/paged_attention/kernels/kernel_config.py b/tests/device_tests/a2a3/tensormap_and_ringbuffer/paged_attention/kernels/kernel_config.py similarity index 100% rename from tests/device_tests/tensormap_and_ringbuffer/paged_attention/kernels/kernel_config.py rename to tests/device_tests/a2a3/tensormap_and_ringbuffer/paged_attention/kernels/kernel_config.py diff --git a/tests/device_tests/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp b/tests/device_tests/a2a3/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp similarity index 100% rename from tests/device_tests/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp rename to tests/device_tests/a2a3/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp diff --git a/tests/device_tests/tensormap_and_ringbuffer/paged_attention_unroll/golden.py b/tests/device_tests/a2a3/tensormap_and_ringbuffer/paged_attention_unroll/golden.py similarity index 100% rename from tests/device_tests/tensormap_and_ringbuffer/paged_attention_unroll/golden.py rename to tests/device_tests/a2a3/tensormap_and_ringbuffer/paged_attention_unroll/golden.py diff --git a/tests/device_tests/a2a3/tensormap_and_ringbuffer/paged_attention_unroll/kernels/aic/aic_hub.cpp b/tests/device_tests/a2a3/tensormap_and_ringbuffer/paged_attention_unroll/kernels/aic/aic_hub.cpp new file mode 100644 index 00000000..0974de37 --- /dev/null +++ 
b/tests/device_tests/a2a3/tensormap_and_ringbuffer/paged_attention_unroll/kernels/aic/aic_hub.cpp @@ -0,0 +1,18 @@ +#include +#include + +using namespace pto; + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#define __aicore__ [aicore] +#endif + +constexpr int M = 16; +constexpr int K = 16; +constexpr int N = 16; + +extern "C" __aicore__ void kernel_entry(__gm__ int64_t* args) {} diff --git a/tests/device_tests/tensormap_and_ringbuffer/paged_attention_unroll/kernels/aic/aic_pv_matmul.cpp b/tests/device_tests/a2a3/tensormap_and_ringbuffer/paged_attention_unroll/kernels/aic/aic_pv_matmul.cpp similarity index 100% rename from tests/device_tests/tensormap_and_ringbuffer/paged_attention_unroll/kernels/aic/aic_pv_matmul.cpp rename to tests/device_tests/a2a3/tensormap_and_ringbuffer/paged_attention_unroll/kernels/aic/aic_pv_matmul.cpp diff --git a/tests/device_tests/tensormap_and_ringbuffer/paged_attention_unroll/kernels/aic/aic_qk_matmul.cpp b/tests/device_tests/a2a3/tensormap_and_ringbuffer/paged_attention_unroll/kernels/aic/aic_qk_matmul.cpp similarity index 100% rename from tests/device_tests/tensormap_and_ringbuffer/paged_attention_unroll/kernels/aic/aic_qk_matmul.cpp rename to tests/device_tests/a2a3/tensormap_and_ringbuffer/paged_attention_unroll/kernels/aic/aic_qk_matmul.cpp diff --git a/tests/device_tests/a2a3/tensormap_and_ringbuffer/paged_attention_unroll/kernels/aiv/aiv_hub.cpp b/tests/device_tests/a2a3/tensormap_and_ringbuffer/paged_attention_unroll/kernels/aiv/aiv_hub.cpp new file mode 100644 index 00000000..0974de37 --- /dev/null +++ b/tests/device_tests/a2a3/tensormap_and_ringbuffer/paged_attention_unroll/kernels/aiv/aiv_hub.cpp @@ -0,0 +1,18 @@ +#include +#include + +using namespace pto; + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#define __aicore__ [aicore] +#endif + +constexpr int M = 16; +constexpr int K = 16; +constexpr int N = 16; + +extern "C" __aicore__ void kernel_entry(__gm__ int64_t* args) {} diff --git 
a/tests/device_tests/tensormap_and_ringbuffer/paged_attention_unroll/kernels/aiv/aiv_online_update.cpp b/tests/device_tests/a2a3/tensormap_and_ringbuffer/paged_attention_unroll/kernels/aiv/aiv_online_update.cpp similarity index 100% rename from tests/device_tests/tensormap_and_ringbuffer/paged_attention_unroll/kernels/aiv/aiv_online_update.cpp rename to tests/device_tests/a2a3/tensormap_and_ringbuffer/paged_attention_unroll/kernels/aiv/aiv_online_update.cpp diff --git a/tests/device_tests/tensormap_and_ringbuffer/paged_attention_unroll/kernels/aiv/aiv_softmax_prepare.cpp b/tests/device_tests/a2a3/tensormap_and_ringbuffer/paged_attention_unroll/kernels/aiv/aiv_softmax_prepare.cpp similarity index 100% rename from tests/device_tests/tensormap_and_ringbuffer/paged_attention_unroll/kernels/aiv/aiv_softmax_prepare.cpp rename to tests/device_tests/a2a3/tensormap_and_ringbuffer/paged_attention_unroll/kernels/aiv/aiv_softmax_prepare.cpp diff --git a/tests/device_tests/tensormap_and_ringbuffer/paged_attention_unroll/kernels/kernel_config.py b/tests/device_tests/a2a3/tensormap_and_ringbuffer/paged_attention_unroll/kernels/kernel_config.py similarity index 100% rename from tests/device_tests/tensormap_and_ringbuffer/paged_attention_unroll/kernels/kernel_config.py rename to tests/device_tests/a2a3/tensormap_and_ringbuffer/paged_attention_unroll/kernels/kernel_config.py diff --git a/tests/device_tests/tensormap_and_ringbuffer/paged_attention_unroll/kernels/orchestration/paged_attention_orch.cpp b/tests/device_tests/a2a3/tensormap_and_ringbuffer/paged_attention_unroll/kernels/orchestration/paged_attention_orch.cpp similarity index 100% rename from tests/device_tests/tensormap_and_ringbuffer/paged_attention_unroll/kernels/orchestration/paged_attention_orch.cpp rename to tests/device_tests/a2a3/tensormap_and_ringbuffer/paged_attention_unroll/kernels/orchestration/paged_attention_orch.cpp diff --git a/tests/device_tests/a5/host_build_graph/paged_attention/README.md 
b/tests/device_tests/a5/host_build_graph/paged_attention/README.md new file mode 100644 index 00000000..c0b6ebd0 --- /dev/null +++ b/tests/device_tests/a5/host_build_graph/paged_attention/README.md @@ -0,0 +1,192 @@ +# Paged Attention (Device Test) + +This example demonstrates Paged Attention implementation using CCE (Cube Core Engine) code generation, with AIC matmul kernels and AIV vector kernels using PTO Tile API. + +## Overview + +Paged Attention is an efficient attention mechanism that processes KV cache in fixed-size blocks, enabling memory-efficient inference for long sequences. This implementation uses: + +- **CCE-style codegen** for AIC kernels (Cube unit matmul) +- **PTO Tile API** for AIV kernels (Vector unit operations) +- **Online Softmax** algorithm for numerically stable incremental computation + +### Supported Platforms + +| Platform | Description | +|----------|-------------| +| a2a3 | Ascend hardware (requires device ID) | + +> This test uses bfloat16 data types and production-scale shapes that are not supported by the a2a3sim simulator. It only runs on real hardware. 
+ +### Algorithm + +For each query token, the attention is computed incrementally across KV cache blocks: + +``` +For each block j: + sij = Qi @ Kj^T # QK MatMul (AIC) + mij, lij, pij = softmax_prepare(sij) # Softmax (AIV) + oi_new = pij @ Vj # PV MatMul (AIC) + oi = online_update(oi, oi_new, mij, lij) # Accumulate (AIV) +``` + +### Kernel Design (AIC/AIV Split) + +| Kernel | Core Type | Operation | Key Instructions | +|--------|-----------|-----------|------------------| +| aic_qk_matmul | AIC (Cube) | Q @ K^T | TLOAD/TMOV/TMATMUL/TSTORE | +| aiv_softmax_prepare | AIV (Vector) | scale, rowmax, exp, rowsum | TMULS/TROWMAX/TROWEXPANDSUB/TEXP/TROWSUM | +| aic_pv_matmul | AIC (Cube) | P @ V | TLOAD/TMOV/TMATMUL/TSTORE | +| aiv_online_update | AIV (Vector) | Online Softmax + normalize | TMAX/TSUB/TEXP/TROWEXPANDMUL/TROWEXPANDDIV | + +### Memory Hierarchy (AIC Matmul) + +``` +GM -> L1 (Mat tiles) -> L0A/L0B -> L0C (Accumulator) -> GM +``` + +### Task Graph Structure + +For each batch, the task dependency pattern is: + +``` +Block 0: QK -> SF -> PV --+ +Block 1: QK -> SF -> PV --+--> UP[0] -> UP[1] -> ... 
-> UP[n] +Block n: QK -> SF -> PV --+ +``` + +- **QK/SF/PV chains**: Run in parallel across blocks +- **UP (Online Update)**: Serialized within batch due to accumulator dependency + +## Quick Start + +```bash +# Run on hardware (specify device ID) +python examples/scripts/run_example.py \ + -k tests/device_tests/host_build_graph/paged_attention/kernels \ + -g tests/device_tests/host_build_graph/paged_attention/golden.py \ + -p a2a3 -d 0 + +# Run multi-block test case +PA_CASE=Case2 python examples/scripts/run_example.py \ + -k tests/device_tests/host_build_graph/paged_attention/kernels \ + -g tests/device_tests/host_build_graph/paged_attention/golden.py \ + -p a2a3 -d 0 +``` + +## Directory Structure + +``` +paged_attention/ +├── README.md # This file +├── golden.py # Input generation and expected output +└── kernels/ + ├── kernel_config.py # Kernel registration config + ├── aic/ # AIC kernels (CCE codegen style) + │ ├── aic_qk_matmul.cpp # Q @ K^T matmul + │ └── aic_pv_matmul.cpp # P @ V matmul + ├── aiv/ # AIV kernels (PTO Tile API) + │ ├── aiv_softmax_prepare.cpp # Softmax preparation + │ └── aiv_online_update.cpp # Online Softmax update + normalize + └── orchestration/ + └── paged_attention_orch.cpp # Task graph builder +``` + +## Test Cases + +| Case | batch | num_heads | kv_head_num | head_dim | block_size | context_len | Description | +|------|-------|-----------|-------------|----------|------------|-------------|-------------| +| Case1 | 1 | 16 | 1 | 128 | 128 | 256 | Small scale (default) | +| Case2 | 8 | 64 | 1 | 128 | 64 | 8192 | Production scale | + +All test cases use **bfloat16** Q/K/V inputs with GQA (kv_head_num=1). 
+ +## Key Technical Details + +### AIC Kernels (CCE Codegen) + +```cpp +// L1 tiles: ColMajor + SLayout::RowMajor (required for matmul) +using TileMatA = Tile; +using TileMatB = Tile; + +// L0 tiles: Use standard TileLeft/TileRight/TileAcc aliases +using LeftTile = TileLeft; +using RightTile = TileRight; +using AccTile = TileAcc; + +// Pipeline: MTE2 -> MTE1 -> M -> FIX -> MTE3 +TLOAD(aMatTile, qiGlobal); // GM -> L1 +TMOV(aTile, aMatTile); // L1 -> L0A +TMATMUL(cTile, aTile, bTile); // L0A x L0B -> L0C +TSTORE(sijGlobal, cTile); // L0C -> GM +``` + +### AIV Kernels (PTO Tile API) + +**softmax_prepare**: Uses DN layout (ColMajor, 16x1) for row reduction results + +```cpp +using TileScalarDN = Tile; + +TMULS(sijTile, sijTile, scale_value); // Scale +TROWMAX(maxTile, sijTile, tmpTile); // Row max +TROWEXPANDSUB(pijTile, sijTile, maxTile); // Subtract max (broadcast) +TEXP(pijTile, pijTile); // Exp +TROWSUM(sumTile, pijTile, tmpTile); // Row sum +``` + +**online_update**: Uses ND/DN layout conversion for hardware compatibility + +```cpp +// ND (1x16, RowMajor) for scalar arithmetic - TSUB/TMUL/TADD require RowMajor +using TileScalarND = Tile; +// DN (16x1, ColMajor) for row broadcast - TROWEXPANDMUL/TROWEXPANDDIV require this +using TileScalarDN = Tile; + +// Arithmetic in ND layout +TMAX(miNewTileND, miTileND, mijTileND); +TSUB(alphaTileND, miTileND, miNewTileND); +TEXP(alphaTileND, alphaTileND); + +// Reshape ND -> DN for broadcast operations +TRESHAPE(alphaTileDN, alphaTileND); +TROWEXPANDMUL(oiTile, oiTile, alphaTileDN); +``` + +### Data Layout + +- **K stored as K^T**: (head_dim, block_size) for direct matmul compatibility +- **V stored normally**: (block_size, head_dim) + +## Expected Output + +``` +=== Compiling and Registering Kernels === +Compiling kernel: .../aic_qk_matmul.cpp (func_id=0) +Compiling kernel: .../aiv_softmax_prepare.cpp (func_id=1) +Compiling kernel: .../aic_pv_matmul.cpp (func_id=2) +Compiling kernel: .../aiv_online_update.cpp (func_id=3) 
+... +=== build_paged_attention_graph (16x16 framework version) === +batch=1, num_heads=16, kv_head_num=1, head_dim=16 +block_size=16, block_num=1 +... +Created 4 tasks +... +=== Comparing Results === +Comparing out: shape=(256,), dtype=float32 + out: PASS (256/256 elements matched) + +============================================================ +TEST PASSED +============================================================ +``` + +## Reference + +This implementation uses the Online Softmax algorithm for paged attention, with identical kernel structure to the PyPTO reference implementation. + +## See Also + +- [Test Framework Documentation](../../../../examples/scripts/README.md) diff --git a/tests/device_tests/a5/host_build_graph/paged_attention/golden.py b/tests/device_tests/a5/host_build_graph/paged_attention/golden.py new file mode 100644 index 00000000..b5dd811d --- /dev/null +++ b/tests/device_tests/a5/host_build_graph/paged_attention/golden.py @@ -0,0 +1,45 @@ +"""Paged Attention Golden - host_build_graph test (production scale, bfloat16).""" + +from paged_attention_golden import ( + generate_inputs as _generate_inputs, + compute_golden, + run_golden_test, +) + +__outputs__ = ["out"] + +RTOL = 1e-3 +ATOL = 1e-3 + +ALL_CASES = { + "Case1": { + "batch": 256, + "num_heads": 16, + "kv_head_num": 1, + "head_dim": 128, + "block_size": 128, + "context_len": 8100, + "max_model_len": 32768, + "dtype": "bfloat16", + }, + "Case2": { + "batch": 64, + "num_heads": 64, + "kv_head_num": 1, + "head_dim": 128, + "block_size": 64, + "context_len": 8150, + "max_model_len": 32768, + "dtype": "bfloat16", + }, +} + +DEFAULT_CASE = "Case1" + + +def generate_inputs(params: dict) -> list: + return _generate_inputs(params, return_all_sizes=True) + + +if __name__ == "__main__": + run_golden_test(ALL_CASES, DEFAULT_CASE, generate_inputs) diff --git a/tests/device_tests/a5/host_build_graph/paged_attention/kernels/aic/aic_pv_matmul.cpp 
b/tests/device_tests/a5/host_build_graph/paged_attention/kernels/aic/aic_pv_matmul.cpp new file mode 100644 index 00000000..55827067 --- /dev/null +++ b/tests/device_tests/a5/host_build_graph/paged_attention/kernels/aic/aic_pv_matmul.cpp @@ -0,0 +1,97 @@ +// PV Matmul Kernel: pij(M, K) @ vj(K, N) -> oi_new(M, N) +// +// Supports two tile configurations via runtime dispatch: +// Case1: (16, 128) @ (128, 128) -> (16, 128) +// Case2: (64, 64) @ ( 64, 128) -> (64, 128) +// +// pij is bfloat16 (converted from fp32 in softmax_prepare via TCVT). +// vj is stored as (K, N) = (block_size, head_dim) in row-major (ND) layout. +// Standard non-transposed B pattern: ND GlobalB + ColMajor/RowMajor TileMatB. + +#include +#include + +using namespace pto; + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#define __aicore__ [aicore] +#endif + +template +static __aicore__ void pv_matmul_impl(__gm__ uint8_t* pij_raw, __gm__ uint8_t* vj_raw, __gm__ uint8_t* oi_raw) +{ + __gm__ bfloat16_t* pij = reinterpret_cast<__gm__ bfloat16_t*>(pij_raw); + __gm__ bfloat16_t* vj = reinterpret_cast<__gm__ bfloat16_t*>(vj_raw); + __gm__ float* oi = reinterpret_cast<__gm__ float*>(oi_raw); + + // pij (M, K) bf16, vj (K, N) bf16 in ND (row-major), oi_new (M, N) fp32 + using GlobalA = GlobalTensor, pto::Stride>; + using GlobalB = GlobalTensor, pto::Stride>; + using GlobalOut = GlobalTensor, pto::Stride>; + + GlobalA pijGlobal(pij); + GlobalB vjGlobal(vj); + GlobalOut oiGlobal(oi); + + // L1 Mat tiles: standard ND pattern for both A and B + using TileMatA = Tile; + using TileMatB = Tile; + + // L0 tiles + using LeftTile = TileLeft; + using RightTile = TileRight; + using AccTile = TileAcc; + + TileMatA aMatTile; + TileMatB bMatTile; + TASSIGN(aMatTile, 0x0); + TASSIGN(bMatTile, 0x20000); + + LeftTile aTile; + RightTile bTile; + AccTile cTile; + TASSIGN(aTile, 0x0); + TASSIGN(bTile, 0x0); + TASSIGN(cTile, 0x0); + + // Load pij and vj to L1 + TLOAD(aMatTile, pijGlobal); + TLOAD(bMatTile, 
vjGlobal); + + set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); + + // Move to L0A/L0B + TMOV(aTile, aMatTile); + TMOV(bTile, bMatTile); + + set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); + wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); + + // Single matmul: (M,K) x (K,N) -> (M,N) + TMATMUL(cTile, aTile, bTile); + + set_flag(PIPE_M, PIPE_FIX, EVENT_ID0); + wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0); + + TSTORE(oiGlobal, cTile); +} + +extern "C" __aicore__ void kernel_entry(__gm__ int64_t* args) +{ + __gm__ uint8_t* pij = reinterpret_cast<__gm__ uint8_t*>(args[0]); + __gm__ uint8_t* vj = reinterpret_cast<__gm__ uint8_t*>(args[1]); + __gm__ uint8_t* oi_new = reinterpret_cast<__gm__ uint8_t*>(args[2]); + int q_tile_size = static_cast(args[3]); + // args[4] = block_size, args[5] = head_dim + + if (q_tile_size == 16) { + pv_matmul_impl<16, 128, 128>(pij, vj, oi_new); + } else { + pv_matmul_impl<64, 64, 128>(pij, vj, oi_new); + } +} diff --git a/tests/device_tests/a5/host_build_graph/paged_attention/kernels/aic/aic_qk_matmul.cpp b/tests/device_tests/a5/host_build_graph/paged_attention/kernels/aic/aic_qk_matmul.cpp new file mode 100644 index 00000000..608879f9 --- /dev/null +++ b/tests/device_tests/a5/host_build_graph/paged_attention/kernels/aic/aic_qk_matmul.cpp @@ -0,0 +1,98 @@ +// QK Matmul Kernel: qi(M, K) @ kj.T(K, N) -> sij(M, N) +// +// Supports two tile configurations via runtime dispatch: +// Case1: (16, 128) @ (128, 128).T -> (16, 128) +// Case2: (64, 128) @ (128, 64).T -> (64, 64) +// +// kj is stored as (N, K) = (block_size, head_dim) in row-major memory. +// This is equivalent to (K, N) in column-major (DN) layout. +// Using DN GlobalB + RowMajor/ColMajor TileMatB to handle the transposed B pattern. 
+ +#include +#include + +using namespace pto; + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#define __aicore__ [aicore] +#endif + +template +static __aicore__ void qk_matmul_impl(__gm__ uint8_t* qi_raw, __gm__ uint8_t* kj_raw, __gm__ uint8_t* sij_raw) +{ + __gm__ bfloat16_t* qi = reinterpret_cast<__gm__ bfloat16_t*>(qi_raw); + __gm__ bfloat16_t* kj = reinterpret_cast<__gm__ bfloat16_t*>(kj_raw); + __gm__ float* sij = reinterpret_cast<__gm__ float*>(sij_raw); + + // qi (M, K) bf16 in ND (row-major) layout + using GlobalA = GlobalTensor, pto::Stride>; + // kj stored as (N, K) row-major = (K, N) column-major -> DN layout + using GlobalB = GlobalTensor, pto::Stride, Layout::DN>; + using GlobalOut = GlobalTensor, pto::Stride>; + + GlobalA qiGlobal(qi); + GlobalB kjGlobal(kj); + GlobalOut sijGlobal(sij); + + // L1 Mat tiles: A is standard ND, B uses transposed-B pattern (RowMajor/ColMajor) + using TileMatA = Tile; + using TileMatB = Tile; + + // L0 tiles + using LeftTile = TileLeft; + using RightTile = TileRight; + using AccTile = TileAcc; + + TileMatA aMatTile; + TileMatB bMatTile; + TASSIGN(aMatTile, 0x0); + TASSIGN(bMatTile, 0x20000); + + LeftTile aTile; + RightTile bTile; + AccTile cTile; + TASSIGN(aTile, 0x0); + TASSIGN(bTile, 0x0); + TASSIGN(cTile, 0x0); + + // Load qi and kj to L1 + TLOAD(aMatTile, qiGlobal); + TLOAD(bMatTile, kjGlobal); + + set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); + + // Move to L0A/L0B + TMOV(aTile, aMatTile); + TMOV(bTile, bMatTile); + + set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); + wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); + + // Single matmul: (M,K) x (K,N) -> (M,N) + TMATMUL(cTile, aTile, bTile); + + set_flag(PIPE_M, PIPE_FIX, EVENT_ID0); + wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0); + + TSTORE(sijGlobal, cTile); +} + +extern "C" __aicore__ void kernel_entry(__gm__ int64_t* args) +{ + __gm__ uint8_t* qi = reinterpret_cast<__gm__ uint8_t*>(args[0]); + __gm__ uint8_t* kj = 
reinterpret_cast<__gm__ uint8_t*>(args[1]); + __gm__ uint8_t* sij = reinterpret_cast<__gm__ uint8_t*>(args[2]); + int q_tile_size = static_cast(args[3]); + // args[4] = head_dim (128), args[5] = block_size + + if (q_tile_size == 16) { + qk_matmul_impl<16, 128, 128>(qi, kj, sij); + } else { + qk_matmul_impl<64, 128, 64>(qi, kj, sij); + } +} diff --git a/tests/device_tests/a5/host_build_graph/paged_attention/kernels/aiv/aiv_online_update.cpp b/tests/device_tests/a5/host_build_graph/paged_attention/kernels/aiv/aiv_online_update.cpp new file mode 100644 index 00000000..71f28d2d --- /dev/null +++ b/tests/device_tests/a5/host_build_graph/paged_attention/kernels/aiv/aiv_online_update.cpp @@ -0,0 +1,227 @@ +// Online Softmax Update + Normalize Kernel (AIV) +// +// Operates on full tiles where M=q_tile_size, N=head_dim (128): +// Case1: oi/oi_new are (16, 128), mij/lij/mi/li are 16-element vectors +// Case2: oi/oi_new are (64, 128), mij/lij/mi/li are 64-element vectors +// +// Scalar layout strategy: +// M scalar floats stored contiguously in GM can be loaded as either: +// - ND (kScalarRows, kScalarCols) RowMajor for element-wise ops (TMAX, TSUB, TEXP, TMUL, TADD) +// - DN (kAlignedRows, 1) ColMajor for row-broadcast ops (TROWEXPANDMUL, TROWEXPANDDIV) +// Conversion between layouts uses GM round-trip: ND TSTORE → DN TLOAD. 
+ +#include +#include + +using namespace pto; + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#define __aicore__ [aicore] +#endif + +template +static __aicore__ void online_update_impl(__gm__ uint8_t* mij_raw, __gm__ uint8_t* lij_raw, + __gm__ uint8_t* oi_new_raw, __gm__ uint8_t* mi_raw, + __gm__ uint8_t* li_raw, __gm__ uint8_t* oi_raw, + int is_first, int is_last, __gm__ uint8_t* dst_raw) +{ + __gm__ float* mij_ptr = reinterpret_cast<__gm__ float*>(mij_raw); + __gm__ float* lij_ptr = reinterpret_cast<__gm__ float*>(lij_raw); + __gm__ float* oi_new_ptr = reinterpret_cast<__gm__ float*>(oi_new_raw); + __gm__ float* mi_ptr = reinterpret_cast<__gm__ float*>(mi_raw); + __gm__ float* li_ptr = reinterpret_cast<__gm__ float*>(li_raw); + __gm__ float* oi_ptr = reinterpret_cast<__gm__ float*>(oi_raw); + __gm__ float* dst_ptr = reinterpret_cast<__gm__ float*>(dst_raw); + + // Scalar tile dimensions for RowMajor layout: + // kScalarCols = 32 bytes / 4 bytes per float = 8 floats per row (one 32-byte block) + // kScalarRows = M / 8 (M=16 → 2 rows, M=64 → 8 rows) + constexpr int kScalarCols = 32 / sizeof(float); + constexpr int kScalarRows = M / kScalarCols; + // Aligned rows for ColMajor DN tiles (32-byte alignment) + constexpr int kAlignedRows = ((M * sizeof(float) + 31) / 32) * (32 / sizeof(float)); + + // --- GlobalTensor types --- + + // Data (M, N) RowMajor + using GlobalDataMxN = GlobalTensor, pto::Stride<1, 1, 1, N, 1>>; + + // Scalar ND: M contiguous floats as (kScalarRows, kScalarCols) RowMajor + using GlobalScalarND = GlobalTensor, + pto::Stride<1, 1, 1, kScalarCols, 1>>; + + // Scalar DN: same M contiguous floats as (kAlignedRows, 1) ColMajor + using GlobalScalarDN = GlobalTensor, + pto::Stride<1, 1, 1, 1, 1>, Layout::DN>; + + // --- GlobalTensor instances --- + + GlobalDataMxN oiNewGlobal(oi_new_ptr); + GlobalDataMxN oiGlobal(oi_ptr); + GlobalDataMxN dstGlobal(dst_ptr); + + // ND globals for scalar element-wise operations + GlobalScalarND 
mijGlobalND(mij_ptr); + GlobalScalarND lijGlobalND(lij_ptr); + GlobalScalarND miGlobalND(mi_ptr); + GlobalScalarND liGlobalND(li_ptr); + + // DN globals aliased to same GM for ColMajor reload (used after ND TSTORE) + GlobalScalarDN mijGlobalDN(mij_ptr); + GlobalScalarDN lijGlobalDN(lij_ptr); + GlobalScalarDN liGlobalDN(li_ptr); + + // --- Tile types --- + + using TileDataMxN = Tile; + using TileScalarND = Tile; + using TileScalarDN = Tile; + + // --- UB memory layout --- + + constexpr int kDataBytes = M * N * sizeof(float); + constexpr int kScalarNDBytes = kScalarRows * kScalarCols * sizeof(float); + constexpr int kScalarDNBytes = kAlignedRows * sizeof(float); + + // Data tiles + TileDataMxN oiNewTile; + TileDataMxN oiTile; + + // Scalar ND tiles for element-wise arithmetic + TileScalarND mijND, lijND, miND, liND; + TileScalarND miNewND, alphaND, betaND, tmpND; + + // Scalar DN tiles for TROWEXPAND operations + TileScalarDN alphaDN, betaDN, liDN; + + TASSIGN(oiNewTile, 0); + TASSIGN(oiTile, kDataBytes); + TASSIGN(mijND, 2 * kDataBytes); + TASSIGN(lijND, 2 * kDataBytes + kScalarNDBytes); + TASSIGN(miND, 2 * kDataBytes + 2 * kScalarNDBytes); + TASSIGN(liND, 2 * kDataBytes + 3 * kScalarNDBytes); + TASSIGN(miNewND, 2 * kDataBytes + 4 * kScalarNDBytes); + TASSIGN(alphaND, 2 * kDataBytes + 5 * kScalarNDBytes); + TASSIGN(betaND, 2 * kDataBytes + 6 * kScalarNDBytes); + TASSIGN(tmpND, 2 * kDataBytes + 7 * kScalarNDBytes); + TASSIGN(alphaDN, 2 * kDataBytes + 8 * kScalarNDBytes); + TASSIGN(betaDN, 2 * kDataBytes + 8 * kScalarNDBytes + kScalarDNBytes); + TASSIGN(liDN, 2 * kDataBytes + 8 * kScalarNDBytes + 2 * kScalarDNBytes); + + if (is_first) { + // --- First block: copy inputs to accumulators --- + TLOAD(oiNewTile, oiNewGlobal); + TLOAD(mijND, mijGlobalND); + TLOAD(lijND, lijGlobalND); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + + // Passthrough to MTE3 (no V compute needed) + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + 
wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(miGlobalND, mijND); // mi = mij + TSTORE(liGlobalND, lijND); // li = lij + TSTORE(oiGlobal, oiNewTile); // oi = oi_new + + if (is_last) { + // Single block: normalize dst = oi_new / lij + // lij stored to li buffer in ND format; reload as DN for TROWEXPANDDIV + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + TLOAD(liDN, liGlobalDN); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TROWEXPANDDIV(oiNewTile, oiNewTile, liDN); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); + TSTORE(dstGlobal, oiNewTile); + } + } else { + // --- Subsequent blocks: accumulate --- + + // Phase 1: Load all inputs + TLOAD(oiNewTile, oiNewGlobal); + TLOAD(oiTile, oiGlobal); + TLOAD(mijND, mijGlobalND); + TLOAD(lijND, lijGlobalND); + TLOAD(miND, miGlobalND); + TLOAD(liND, liGlobalND); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + + // Phase 2: Scalar arithmetic in RowMajor (kScalarRows, kScalarCols) + // to resolve RAW hazards on shared UB tiles. 
+ TMAX(miNewND, miND, mijND); // mi_new = max(mi, mij) + TSUB(alphaND, miND, miNewND); // alpha = mi - mi_new + TEXP(alphaND, alphaND); // alpha = exp(mi - mi_new) + TSUB(betaND, mijND, miNewND); // beta = mij - mi_new + TEXP(betaND, betaND); // beta = exp(mij - mi_new) + TMUL(liND, alphaND, liND); // li = alpha * li + TMUL(tmpND, betaND, lijND); // tmp = beta * lij + TADD(liND, liND, tmpND); // li = alpha * li + beta * lij (= li_new) + + // Phase 3: Store scalar results to GM (ND format) + // mi_new → mi accumulator, li_new → li accumulator + // alpha → mij buffer (reuse), beta → lij buffer (reuse) + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(miGlobalND, miNewND); // persist mi_new + TSTORE(liGlobalND, liND); // persist li_new + TSTORE(mijGlobalND, alphaND); // temp: alpha to mij buffer + TSTORE(lijGlobalND, betaND); // temp: beta to lij buffer + + // Phase 4: Reload alpha, beta (and li if last) as ColMajor DN + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + TLOAD(alphaDN, mijGlobalDN); // alpha from mij buffer as DN + TLOAD(betaDN, lijGlobalDN); // beta from lij buffer as DN + if (is_last) { + TLOAD(liDN, liGlobalDN); // li_new from li buffer as DN + } + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + + // Phase 5: Scale data tiles using row-broadcast multiply + TROWEXPANDMUL(oiTile, oiTile, alphaDN); // oi *= alpha + TROWEXPANDMUL(oiNewTile, oiNewTile, betaDN); // oi_new *= beta + TADD(oiTile, oiTile, oiNewTile); // oi = alpha*oi + beta*oi_new + + if (is_last) { + // Phase 6: Normalize and output + TROWEXPANDDIV(oiTile, oiTile, liDN); // dst = oi / li_new + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); + TSTORE(dstGlobal, oiTile); + } else { + // Phase 6: Store updated accumulators + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); + TSTORE(oiGlobal, oiTile); + } + 
} +} + +extern "C" __aicore__ void kernel_entry(__gm__ int64_t* args) { + __gm__ uint8_t* mij = reinterpret_cast<__gm__ uint8_t*>(args[0]); + __gm__ uint8_t* lij = reinterpret_cast<__gm__ uint8_t*>(args[1]); + __gm__ uint8_t* oi_new = reinterpret_cast<__gm__ uint8_t*>(args[2]); + __gm__ uint8_t* mi = reinterpret_cast<__gm__ uint8_t*>(args[3]); + __gm__ uint8_t* li = reinterpret_cast<__gm__ uint8_t*>(args[4]); + __gm__ uint8_t* oi = reinterpret_cast<__gm__ uint8_t*>(args[5]); + int is_first = static_cast(args[6]); + int is_last = static_cast(args[7]); + __gm__ uint8_t* dst = reinterpret_cast<__gm__ uint8_t*>(args[8]); + int q_tile_size = static_cast(args[9]); + // args[10] = head_dim (128) + + if (q_tile_size == 16) { + online_update_impl<16, 128>(mij, lij, oi_new, mi, li, oi, is_first, is_last, dst); + } else { + online_update_impl<64, 128>(mij, lij, oi_new, mi, li, oi, is_first, is_last, dst); + } +} diff --git a/tests/device_tests/a5/host_build_graph/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp b/tests/device_tests/a5/host_build_graph/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp new file mode 100644 index 00000000..dde7537c --- /dev/null +++ b/tests/device_tests/a5/host_build_graph/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp @@ -0,0 +1,123 @@ +// Softmax Preparation Kernel (AIV) with partial block masking +// +// Operates on (M, N) tile where M=q_tile_size, N=block_size: +// Case1: sij is (16, 128) +// Case2: sij is (64, 64) +// +// For partial blocks (valid_len < N), positions [valid_len, N) in sij are +// filled with -inf via TFILLPAD_INPLACE before softmax, ensuring exp(-inf)=0 +// so that invalid key positions contribute zero attention weight. 
+// +// Computes: +// sij_masked = TFILLPAD(sij, valid_len, pad=-inf) +// sij_scale = sij_masked * scale +// mij = row_max(sij_scale) -> (M, 1) +// pij = exp(sij_scale - mij) -> (M, N) +// lij = row_sum(pij) -> (M, 1) + +#include +#include + +using namespace pto; + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#define __aicore__ [aicore] +#endif + +template +static __aicore__ void softmax_prepare_impl(__gm__ uint8_t* sij_raw, float scale_value, + __gm__ uint8_t* pij_raw, __gm__ uint8_t* mij_raw, + __gm__ uint8_t* lij_raw, int valid_len) +{ + __gm__ float* sij = reinterpret_cast<__gm__ float*>(sij_raw); + __gm__ bfloat16_t* pij = reinterpret_cast<__gm__ bfloat16_t*>(pij_raw); + __gm__ float* mij = reinterpret_cast<__gm__ float*>(mij_raw); + __gm__ float* lij = reinterpret_cast<__gm__ float*>(lij_raw); + + constexpr int kAlignedRows = ((M * sizeof(float) + 31) / 32) * (32 / sizeof(float)); + + using GlobalDataMxN = GlobalTensor, pto::Stride<1, 1, 1, N, 1>>; + using GlobalDataMxN_bf16 = GlobalTensor, pto::Stride<1, 1, 1, N, 1>>; + using GlobalScalarDN = GlobalTensor, pto::Stride<1, 1, 1, 1, 1>, Layout::DN>; + + GlobalDataMxN sijGlobal(sij); + GlobalDataMxN_bf16 pijGlobal(pij); + GlobalScalarDN mijGlobal(mij); + GlobalScalarDN lijGlobal(lij); + + // Dynamic-cols tile: marks which columns are valid for TFILLPAD boundary + using TileSijDyn = Tile; + // Padded tile: TFILLPAD_INPLACE fills positions [valid_len, N) with -inf + using TileSijPad = Tile; + + using TileVecMxN = Tile; + using TileVecMxN_bf16 = Tile; + using TileScalarDN = Tile; + + TileVecMxN sijTile; + TileSijDyn sijDynTile(static_cast(valid_len)); + TileSijPad sijPadTile; + TileVecMxN pijTile; + TileVecMxN tmpTile; + TileScalarDN maxTile; + TileScalarDN sumTile; + TileVecMxN_bf16 pijBf16Tile; + + // All sij tiles share UB address 0x0 (in-place masking) + TASSIGN(sijTile, 0x0); + TASSIGN(sijDynTile, 0x0); + TASSIGN(sijPadTile, 0x0); + TASSIGN(pijTile, M * N * sizeof(float)); + TASSIGN(tmpTile, 
2 * M * N * sizeof(float)); + TASSIGN(maxTile, 3 * M * N * sizeof(float)); + TASSIGN(sumTile, 3 * M * N * sizeof(float) + kAlignedRows * sizeof(float)); + TASSIGN(pijBf16Tile, 3 * M * N * sizeof(float) + 2 * kAlignedRows * sizeof(float)); + + // Load full sij (M, N) tile from GM - all N columns including garbage for partial blocks + TLOAD(sijTile, sijGlobal); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + + // Mask columns [valid_len, N) with -inf. sijDynTile provides the valid boundary, + // sijPadTile provides PadValue::Min as the fill value. No-op when valid_len == N. + TFILLPAD_INPLACE(sijPadTile, sijDynTile); + + TMULS(sijTile, sijTile, scale_value); + TROWMAX(maxTile, sijTile, tmpTile); + TROWEXPANDSUB(pijTile, sijTile, maxTile); + TEXP(pijTile, pijTile); + // Truncate pij to bf16 first, then compute lij from truncated values (matches golden) + TCVT(pijBf16Tile, pijTile, RoundMode::CAST_ROUND); + TCVT(pijTile, pijBf16Tile, RoundMode::CAST_ROUND); + TROWSUM(sumTile, pijTile, tmpTile); + + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(mijGlobal, maxTile); + TSTORE(lijGlobal, sumTile); + TSTORE(pijGlobal, pijBf16Tile); +} + +extern "C" __aicore__ void kernel_entry(__gm__ int64_t* args) { + __gm__ uint8_t* sij = reinterpret_cast<__gm__ uint8_t*>(args[0]); + union { uint64_t u; float f; } scale_conv; + scale_conv.u = static_cast(args[1]); + float scale_value = scale_conv.f; + __gm__ uint8_t* pij = reinterpret_cast<__gm__ uint8_t*>(args[2]); + __gm__ uint8_t* mij = reinterpret_cast<__gm__ uint8_t*>(args[3]); + __gm__ uint8_t* lij = reinterpret_cast<__gm__ uint8_t*>(args[4]); + int q_tile_size = static_cast(args[5]); + // args[6] = block_size + int valid_len = static_cast(args[7]); + + if (q_tile_size == 16) { + softmax_prepare_impl<16, 128>(sij, scale_value, pij, mij, lij, valid_len); + } else { + softmax_prepare_impl<64, 64>(sij, scale_value, pij, mij, lij, valid_len); + } +} 
diff --git a/tests/device_tests/a5/host_build_graph/paged_attention/kernels/kernel_config.py b/tests/device_tests/a5/host_build_graph/paged_attention/kernels/kernel_config.py new file mode 100644 index 00000000..03f4a7c4 --- /dev/null +++ b/tests/device_tests/a5/host_build_graph/paged_attention/kernels/kernel_config.py @@ -0,0 +1,43 @@ +""" +Paged Attention Kernel and Orchestration Configuration + +Defines the kernels and orchestration function for paged attention +with AIC/AIV subgraph splitting: + +AIC Kernels (Matrix Multiplication): + - aic_qk_matmul: Q @ K^T computation + - aic_pv_matmul: P @ V computation + +AIV Kernels (Vector Operations): + - aiv_softmax_prepare: scale, rowmax, exp, rowsum + - aiv_online_update: online softmax accumulation + fused normalization + +Note: aiv_normalize has been merged into aiv_online_update for efficiency. +""" + +from pathlib import Path + +_KERNELS_ROOT = Path(__file__).parent + +# Orchestration config +ORCHESTRATION = { + "source": str(_KERNELS_ROOT / "orchestration" / "paged_attention_orch.cpp"), + "function_name": "build_paged_attention_graph", +} + +# Kernel configs (aiv_normalize removed - merged into aiv_online_update) +KERNELS = [ + # AIC kernels (matrix multiplication using Cube unit) + {"func_id": 0, "name": "QK", "source": str(_KERNELS_ROOT / "aic" / "aic_qk_matmul.cpp"), "core_type": "aic"}, + {"func_id": 2, "name": "PV", "source": str(_KERNELS_ROOT / "aic" / "aic_pv_matmul.cpp"), "core_type": "aic"}, + # AIV kernels (vector operations) + {"func_id": 1, "name": "SF", "source": str(_KERNELS_ROOT / "aiv" / "aiv_softmax_prepare.cpp"), "core_type": "aiv"}, + {"func_id": 3, "name": "UP", "source": str(_KERNELS_ROOT / "aiv" / "aiv_online_update.cpp"), "core_type": "aiv"}, +] + +# Runtime configuration +RUNTIME_CONFIG = { + "runtime": "host_build_graph", + "aicpu_thread_num": 3, + "block_dim": 24, +} diff --git a/tests/device_tests/a5/host_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp 
b/tests/device_tests/a5/host_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp new file mode 100644 index 00000000..2b2192dc --- /dev/null +++ b/tests/device_tests/a5/host_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp @@ -0,0 +1,261 @@ +/** + * Paged Attention Orchestration - Production Scale + * + * Supports production-scale paged attention with: + * Query: (batch, q_head_num, head_dim) bf16 + * Key: (total_blocks, block_size, kv_head_num, head_dim) bf16 (NOT transposed) + * Value: (total_blocks, block_size, kv_head_num, head_dim) bf16 + * Output: (batch * q_head_num, head_dim) float32 + * + * Head tiling: q_tile_size = min(num_heads, 128) + * GQA: kv_head_num can differ from q_head_num + */ + +#include "runtime.h" +#include +#include +#include + +#define FUNC_QK_MATMUL 0 +#define FUNC_SOFTMAX_PREPARE 1 +#define FUNC_PV_MATMUL 2 +#define FUNC_ONLINE_UPDATE 3 + +extern "C" { + +int build_paged_attention_graph(Runtime* runtime, uint64_t* args, int arg_count) { + if (arg_count < 14) { + std::cerr << "Expected at least 14 args, got " << arg_count << '\n'; + return -1; + } + + void* host_query = reinterpret_cast(args[0]); + void* host_key_cache = reinterpret_cast(args[1]); + void* host_value_cache = reinterpret_cast(args[2]); + int* host_block_table = reinterpret_cast(args[3]); + int* host_context_lens = reinterpret_cast(args[4]); + void* host_out = reinterpret_cast(args[5]); + int64_t* host_config = reinterpret_cast(args[6]); + + size_t query_size = static_cast(args[7]); + size_t key_cache_size = static_cast(args[8]); + size_t value_cache_size = static_cast(args[9]); + size_t block_table_size = static_cast(args[10]); + size_t context_lens_size = static_cast(args[11]); + size_t out_size = static_cast(args[12]); + size_t config_size = static_cast(args[13]); + + int batch = static_cast(host_config[0]); + int num_heads = static_cast(host_config[1]); + int kv_head_num = static_cast(host_config[2]); + int head_dim = 
static_cast(host_config[3]); + int block_size = static_cast(host_config[4]); + int max_num_blocks = static_cast(host_config[5]); + uint64_t scale_value_bits = static_cast(host_config[6]); + + int q_tile_size = std::min(num_heads, 128); + int num_head_tiles = (num_heads + q_tile_size - 1) / q_tile_size; + + std::cout << "\n=== build_paged_attention_graph ===" << '\n'; + std::cout << "batch=" << batch << ", num_heads=" << num_heads + << ", kv_head_num=" << kv_head_num << ", head_dim=" << head_dim << '\n'; + std::cout << "block_size=" << block_size << ", max_num_blocks=" << max_num_blocks << '\n'; + std::cout << "q_tile_size=" << q_tile_size << ", num_head_tiles=" << num_head_tiles << '\n'; + + // Allocate device memory for inputs/outputs + void* dev_query = runtime->host_api.device_malloc(query_size); + void* dev_key_cache = runtime->host_api.device_malloc(key_cache_size); + void* dev_value_cache = runtime->host_api.device_malloc(value_cache_size); + void* dev_out = runtime->host_api.device_malloc(out_size); + + if (!dev_query || !dev_key_cache || !dev_value_cache || !dev_out) { + std::cerr << "Error: Failed to allocate device memory\n"; + return -1; + } + + runtime->host_api.copy_to_device(dev_query, host_query, query_size); + runtime->host_api.copy_to_device(dev_key_cache, host_key_cache, key_cache_size); + runtime->host_api.copy_to_device(dev_value_cache, host_value_cache, value_cache_size); + runtime->record_tensor_pair(host_out, dev_out, out_size); + + // Buffer sizes depend on q_tile_size and block_size + size_t sij_size = static_cast(q_tile_size) * block_size * sizeof(float); + size_t pij_size = static_cast(q_tile_size) * block_size * sizeof(uint16_t); + size_t mij_size = static_cast(q_tile_size) * sizeof(float); + size_t lij_size = mij_size; + size_t oi_new_size = static_cast(q_tile_size) * head_dim * sizeof(float); + + // Per-batch-per-block intermediate buffers + int total_buffers = batch * max_num_blocks; + void** dev_sij_arr = new void*[total_buffers]; + 
void** dev_pij_arr = new void*[total_buffers]; + void** dev_mij_arr = new void*[total_buffers]; + void** dev_lij_arr = new void*[total_buffers]; + void** dev_oi_new_arr = new void*[total_buffers]; + + for (int i = 0; i < total_buffers; i++) { + dev_sij_arr[i] = runtime->host_api.device_malloc(sij_size); + dev_pij_arr[i] = runtime->host_api.device_malloc(pij_size); + dev_mij_arr[i] = runtime->host_api.device_malloc(mij_size); + dev_lij_arr[i] = runtime->host_api.device_malloc(lij_size); + dev_oi_new_arr[i] = runtime->host_api.device_malloc(oi_new_size); + } + + // Per-(batch, head_tile) accumulators + int total_accums = batch * num_head_tiles; + size_t mi_size = static_cast(q_tile_size) * sizeof(float); + size_t li_size = mi_size; + size_t oi_size = static_cast(q_tile_size) * head_dim * sizeof(float); + + void** dev_mi_arr = new void*[total_accums]; + void** dev_li_arr = new void*[total_accums]; + void** dev_oi_arr = new void*[total_accums]; + + for (int i = 0; i < total_accums; i++) { + dev_mi_arr[i] = runtime->host_api.device_malloc(mi_size); + dev_li_arr[i] = runtime->host_api.device_malloc(li_size); + dev_oi_arr[i] = runtime->host_api.device_malloc(oi_size); + } + + std::cout << "Allocated " << total_buffers << " per-block buffers\n"; + std::cout << "Allocated " << total_accums << " per-(batch,head_tile) accumulators\n"; + + int total_tasks = 0; + + for (int b_idx = 0; b_idx < batch; b_idx++) { + int cur_seq = host_context_lens[b_idx]; + int bn_this_batch = (cur_seq + block_size - 1) / block_size; + + for (int ht = 0; ht < num_head_tiles; ht++) { + int cur_offset = ht * q_tile_size; + + // Query: (batch, q_head_num, head_dim) bf16 + // qi points to heads [cur_offset .. 
cur_offset+q_tile_size) for batch b_idx + uint8_t* qi_ptr = reinterpret_cast(dev_query) + + static_cast(b_idx * num_heads + cur_offset) * head_dim * sizeof(uint16_t); + + // Output: (batch * q_head_num, head_dim) float32 + uint8_t* out_ptr = reinterpret_cast(dev_out) + + static_cast(b_idx * num_heads + cur_offset) * head_dim * sizeof(float); + + // GQA: which kv_head this head tile maps to + int kv_head_idx = cur_offset / (num_heads / kv_head_num); + + // Per-(batch, head_tile) accumulators + int accum_idx = b_idx * num_head_tiles + ht; + void* dev_mi = dev_mi_arr[accum_idx]; + void* dev_li = dev_li_arr[accum_idx]; + void* dev_oi = dev_oi_arr[accum_idx]; + + int t_up_prev = -1; + + for (int bn = 0; bn < bn_this_batch; bn++) { + int cur_block_idx = host_block_table[b_idx * max_num_blocks + bn]; + int valid_len = std::min(block_size, cur_seq - bn * block_size); + + // Key: (total_blocks, block_size, kv_head_num, head_dim) bf16 + // Stride to block: cur_block_idx * (block_size * kv_head_num * head_dim) + // Then offset to kv_head: kv_head_idx * head_dim (within each token row) + // But since we want contiguous (block_size, head_dim), and kv_head_num=1 makes it simple: + uint8_t* kj_ptr = reinterpret_cast(dev_key_cache) + + (static_cast(cur_block_idx) * block_size * kv_head_num + kv_head_idx) + * head_dim * sizeof(uint16_t); + + // Value: (total_blocks, block_size, kv_head_num, head_dim) bf16 - same layout as key + uint8_t* vj_ptr = reinterpret_cast(dev_value_cache) + + (static_cast(cur_block_idx) * block_size * kv_head_num + kv_head_idx) + * head_dim * sizeof(uint16_t); + + int buf_idx = b_idx * max_num_blocks + bn; + void* dev_sij = dev_sij_arr[buf_idx]; + void* dev_pij = dev_pij_arr[buf_idx]; + void* dev_mij = dev_mij_arr[buf_idx]; + void* dev_lij = dev_lij_arr[buf_idx]; + void* dev_oi_new = dev_oi_new_arr[buf_idx]; + + // QK: qi(M, K) @ kj.T(K, N) -> sij(M, N) + uint64_t qk_args[6] = { + reinterpret_cast(qi_ptr), + reinterpret_cast(kj_ptr), + 
reinterpret_cast(dev_sij), + static_cast(q_tile_size), + static_cast(head_dim), + static_cast(block_size) + }; + int t_qk = runtime->add_task(qk_args, 6, FUNC_QK_MATMUL, CoreType::AIC); + total_tasks++; + + // SF: scale, rowmax, exp, rowsum -> pij, mij, lij + uint64_t sf_args[8] = { + reinterpret_cast(dev_sij), + scale_value_bits, + reinterpret_cast(dev_pij), + reinterpret_cast(dev_mij), + reinterpret_cast(dev_lij), + static_cast(q_tile_size), + static_cast(block_size), + static_cast(valid_len) + }; + int t_sf = runtime->add_task(sf_args, 8, FUNC_SOFTMAX_PREPARE, CoreType::AIV); + total_tasks++; + + // PV: pij(M, K') @ vj(K', N') -> oi_new(M, N') + uint64_t pv_args[6] = { + reinterpret_cast(dev_pij), + reinterpret_cast(vj_ptr), + reinterpret_cast(dev_oi_new), + static_cast(q_tile_size), + static_cast(block_size), + static_cast(head_dim) + }; + int t_pv = runtime->add_task(pv_args, 6, FUNC_PV_MATMUL, CoreType::AIC); + total_tasks++; + + runtime->add_successor(t_qk, t_sf); + runtime->add_successor(t_sf, t_pv); + + // Online Update: serialized across blocks (each depends on previous) + int is_first = (bn == 0) ? 1 : 0; + int is_last = (bn == bn_this_batch - 1) ? 
1 : 0; + + uint64_t up_args[11] = { + reinterpret_cast(dev_mij), + reinterpret_cast(dev_lij), + reinterpret_cast(dev_oi_new), + reinterpret_cast(dev_mi), + reinterpret_cast(dev_li), + reinterpret_cast(dev_oi), + static_cast(is_first), + static_cast(is_last), + reinterpret_cast(out_ptr), + static_cast(q_tile_size), + static_cast(head_dim) + }; + int t_up = runtime->add_task(up_args, 11, FUNC_ONLINE_UPDATE, CoreType::AIV); + total_tasks++; + + runtime->add_successor(t_pv, t_up); + if (t_up_prev >= 0) { + runtime->add_successor(t_up_prev, t_up); + } + t_up_prev = t_up; + } + } + } + + delete[] dev_sij_arr; + delete[] dev_pij_arr; + delete[] dev_mij_arr; + delete[] dev_lij_arr; + delete[] dev_oi_new_arr; + delete[] dev_mi_arr; + delete[] dev_li_arr; + delete[] dev_oi_arr; + + std::cout << "Created " << total_tasks << " tasks\n"; + runtime->print_runtime(); + + return 0; +} + +} diff --git a/tests/device_tests/a5/tensormap_and_ringbuffer/paged_attention/golden.py b/tests/device_tests/a5/tensormap_and_ringbuffer/paged_attention/golden.py new file mode 100644 index 00000000..e6e1d9b8 --- /dev/null +++ b/tests/device_tests/a5/tensormap_and_ringbuffer/paged_attention/golden.py @@ -0,0 +1,55 @@ +"""Paged Attention Golden - tensormap_and_ringbuffer test (production scale, bfloat16).""" + +from paged_attention_golden import ( + generate_inputs as _generate_inputs, + compute_golden, + run_golden_test, +) + +__outputs__ = ["out"] + +RTOL = 1e-3 +ATOL = 1e-3 + +ALL_CASES = { + "Case1": { + "batch": 64, + "num_heads": 16, + "kv_head_num": 1, + "head_dim": 128, + "block_size": 128, + "context_len": 8192, + "max_model_len": 32768, + "dtype": "bfloat16", + }, + "Case2": { + "batch": 64, + "num_heads": 64, + "kv_head_num": 1, + "head_dim": 128, + "block_size": 64, + "context_len": 8192, + "max_model_len": 32768, + "dtype": "bfloat16", + }, + "Case3": { + "batch": 64, + "num_heads": 64, + "kv_head_num": 1, + "head_dim": 256, + "block_size": 64, + "context_len": 8192, + "max_model_len": 
32768, + "dtype": "bfloat16", + }, +} + +DEFAULT_CASE = "Case1" + + +def generate_inputs(params: dict) -> list: + return _generate_inputs(params, return_all_sizes=False) + + +if __name__ == "__main__": + run_golden_test(ALL_CASES, DEFAULT_CASE, generate_inputs) diff --git a/tests/device_tests/a5/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_hub.cpp b/tests/device_tests/a5/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_hub.cpp new file mode 100644 index 00000000..0974de37 --- /dev/null +++ b/tests/device_tests/a5/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_hub.cpp @@ -0,0 +1,18 @@ +#include +#include + +using namespace pto; + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#define __aicore__ [aicore] +#endif + +constexpr int M = 16; +constexpr int K = 16; +constexpr int N = 16; + +extern "C" __aicore__ void kernel_entry(__gm__ int64_t* args) {} diff --git a/tests/device_tests/a5/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_pv_matmul.cpp b/tests/device_tests/a5/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_pv_matmul.cpp new file mode 100644 index 00000000..dc9499cf --- /dev/null +++ b/tests/device_tests/a5/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_pv_matmul.cpp @@ -0,0 +1,97 @@ +// PV Matmul Kernel: pij(M, K) @ vj(K, N) -> oi_new(M, N) +// +// Supports two tile configurations via runtime dispatch: +// Case1: (16, 128) @ (128, 128) -> (16, 128) +// Case2: (64, 64) @ ( 64, 128) -> (64, 128) +// +// pij is bfloat16 (converted from fp32 in softmax_prepare via TCVT). +// vj is stored as (K, N) = (block_size, head_dim) in row-major (ND) layout. +// Standard non-transposed B pattern: ND GlobalB + ColMajor/RowMajor TileMatB. 
+ +#include +#include + +#include "tensor.h" + +using namespace pto; + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#define __aicore__ [aicore] +#endif + +template +static __aicore__ void pv_matmul_impl(__gm__ Tensor* pij, __gm__ Tensor* vj, __gm__ Tensor* oi) { + __gm__ bfloat16_t* pij_addr = reinterpret_cast<__gm__ bfloat16_t*>(pij->buffer.addr); + __gm__ bfloat16_t* vj_addr = reinterpret_cast<__gm__ bfloat16_t*>(vj->buffer.addr); + __gm__ float* oi_addr = reinterpret_cast<__gm__ float*>(oi->buffer.addr); + + // pij (M, K) bf16, vj (K, N) bf16 in ND (row-major), oi_new (M, N) fp32 + using GlobalA = GlobalTensor, pto::Stride>; + using GlobalB = GlobalTensor, pto::Stride>; + using GlobalOut = GlobalTensor, pto::Stride>; + + GlobalA pijGlobal(pij_addr + pij->start_offset); + GlobalB vjGlobal(vj_addr + vj->start_offset); + GlobalOut oiGlobal(oi_addr + oi->start_offset); + + // L1 Mat tiles: standard ND pattern for both A and B + using TileMatA = Tile; + using TileMatB = Tile; + + // L0 tiles + using LeftTile = TileLeft; + using RightTile = TileRight; + using AccTile = TileAcc; + + TileMatA aMatTile; + TileMatB bMatTile; + TASSIGN(aMatTile, 0x0); + TASSIGN(bMatTile, 0x20000); + + LeftTile aTile; + RightTile bTile; + AccTile cTile; + TASSIGN(aTile, 0x0); + TASSIGN(bTile, 0x0); + TASSIGN(cTile, 0x0); + + // Load pij and vj to L1 + TLOAD(aMatTile, pijGlobal); + TLOAD(bMatTile, vjGlobal); + + set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); + + // Move to L0A/L0B + TMOV(aTile, aMatTile); + TMOV(bTile, bMatTile); + + set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); + wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); + + // Single matmul: (M,K) x (K,N) -> (M,N) + TMATMUL(cTile, aTile, bTile); + + set_flag(PIPE_M, PIPE_FIX, EVENT_ID0); + wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0); + + TSTORE(oiGlobal, cTile); +} + +extern "C" __aicore__ void kernel_entry(__gm__ int64_t* args) { + __gm__ Tensor* pij = reinterpret_cast<__gm__ 
Tensor*>(args[0]); + __gm__ Tensor* vj = reinterpret_cast<__gm__ Tensor*>(args[1]); + __gm__ Tensor* oi_new = reinterpret_cast<__gm__ Tensor*>(args[2]); + uint64_t q_tile_size = static_cast(pij->shapes[0]); + // args[4] = block_size, args[5] = head_dim + + if (q_tile_size == 16) { + pv_matmul_impl<16, 128, 128>(pij, vj, oi_new); + } else { + pv_matmul_impl<64, 64, 128>(pij, vj, oi_new); + } +} diff --git a/tests/device_tests/a5/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_qk_matmul.cpp b/tests/device_tests/a5/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_qk_matmul.cpp new file mode 100644 index 00000000..b9f17ecb --- /dev/null +++ b/tests/device_tests/a5/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_qk_matmul.cpp @@ -0,0 +1,98 @@ +// QK Matmul Kernel: qi(M, K) @ kj.T(K, N) -> sij(M, N) +// +// Supports two tile configurations via runtime dispatch: +// Case1: (16, 128) @ (128, 128).T -> (16, 128) +// Case2: (64, 128) @ (128, 64).T -> (64, 64) +// +// kj is stored as (N, K) = (block_size, head_dim) in row-major memory. +// This is equivalent to (K, N) in column-major (DN) layout. +// Using DN GlobalB + RowMajor/ColMajor TileMatB to handle the transposed B pattern. 
+ +#include +#include + +#include "tensor.h" + +using namespace pto; + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#define __aicore__ [aicore] +#endif + +template +static __aicore__ void qk_matmul_impl(__gm__ Tensor* qi, __gm__ Tensor* kj, __gm__ Tensor* sij) { + __gm__ bfloat16_t* qi_addr = reinterpret_cast<__gm__ bfloat16_t*>(qi->buffer.addr); + __gm__ bfloat16_t* kj_addr = reinterpret_cast<__gm__ bfloat16_t*>(kj->buffer.addr); + __gm__ float* sij_addr = reinterpret_cast<__gm__ float*>(sij->buffer.addr); + + // qi (M, K) bf16 in ND (row-major) layout + using GlobalA = GlobalTensor, pto::Stride>; + // kj stored as (N, K) row-major = (K, N) column-major -> DN layout + using GlobalB = GlobalTensor, pto::Stride, Layout::DN>; + using GlobalOut = GlobalTensor, pto::Stride>; + + GlobalA qiGlobal(qi_addr + qi->start_offset); + GlobalB kjGlobal(kj_addr + kj->start_offset); + GlobalOut sijGlobal(sij_addr + sij->start_offset); + + // L1 Mat tiles: A is standard ND, B uses transposed-B pattern (RowMajor/ColMajor) + using TileMatA = Tile; + using TileMatB = Tile; + + // L0 tiles + using LeftTile = TileLeft; + using RightTile = TileRight; + using AccTile = TileAcc; + + TileMatA aMatTile; + TileMatB bMatTile; + TASSIGN(aMatTile, 0x0); + TASSIGN(bMatTile, 0x20000); + + LeftTile aTile; + RightTile bTile; + AccTile cTile; + TASSIGN(aTile, 0x0); + TASSIGN(bTile, 0x0); + TASSIGN(cTile, 0x0); + + // // Load A and B to L1 + TLOAD(aMatTile, qiGlobal); + TLOAD(bMatTile, kjGlobal); + + set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); + + // Move from L1 to L0A/L0B + TMOV(aTile, aMatTile); + TMOV(bTile, bMatTile); + + set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); + wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); + + // Matmul + TMATMUL(cTile, aTile, bTile); + + set_flag(PIPE_M, PIPE_FIX, EVENT_ID0); + wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0); + + TSTORE(sijGlobal, cTile); +} + +extern "C" __aicore__ void kernel_entry(__gm__ int64_t* args) { + 
__gm__ Tensor* qi = reinterpret_cast<__gm__ Tensor*>(args[0]); + __gm__ Tensor* kj = reinterpret_cast<__gm__ Tensor*>(args[1]); + __gm__ Tensor* sij = reinterpret_cast<__gm__ Tensor*>(args[2]); + uint64_t q_tile_size = static_cast(qi->shapes[0]); + // args[4] = head_dim (128), args[5] = block_size + + if (q_tile_size == 16) { + qk_matmul_impl<16, 128, 128>(qi, kj, sij); + } else { + qk_matmul_impl<64, 128, 64>(qi, kj, sij); + } +} diff --git a/tests/device_tests/a5/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_hub.cpp b/tests/device_tests/a5/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_hub.cpp new file mode 100644 index 00000000..0974de37 --- /dev/null +++ b/tests/device_tests/a5/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_hub.cpp @@ -0,0 +1,18 @@ +#include +#include + +using namespace pto; + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#define __aicore__ [aicore] +#endif + +constexpr int M = 16; +constexpr int K = 16; +constexpr int N = 16; + +extern "C" __aicore__ void kernel_entry(__gm__ int64_t* args) {} diff --git a/tests/device_tests/a5/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_online_update.cpp b/tests/device_tests/a5/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_online_update.cpp new file mode 100644 index 00000000..3c4d227f --- /dev/null +++ b/tests/device_tests/a5/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_online_update.cpp @@ -0,0 +1,232 @@ +// Online Softmax Update + Normalize Kernel (AIV) +// +// Operates on full tiles where M=q_tile_size, N=head_dim (128): +// Case1: oi/oi_new are (16, 128), mij/lij/mi/li are 16-element vectors +// Case2: oi/oi_new are (64, 128), mij/lij/mi/li are 64-element vectors +// +// Scalar layout strategy: +// M scalar floats stored contiguously in GM can be loaded as either: +// - ND (kScalarRows, kScalarCols) RowMajor for element-wise ops (TMAX, TSUB, TEXP, TMUL, TADD) +// - DN (kAlignedRows, 1) ColMajor for row-broadcast ops 
(TROWEXPANDMUL, TROWEXPANDDIV) +// Conversion between layouts uses GM round-trip: ND TSTORE → DN TLOAD. + +#include +#include + +#include "tensor.h" + +using namespace pto; + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#define __aicore__ [aicore] +#endif + +template +static __aicore__ void online_update_impl(__gm__ Tensor* mij, + __gm__ Tensor* lij, + __gm__ Tensor* oi_new, + __gm__ Tensor* mi, + __gm__ Tensor* li, + __gm__ Tensor* oi, + uint64_t is_first, + uint64_t is_last, + __gm__ Tensor* dst) { + __gm__ float* mij_ptr = reinterpret_cast<__gm__ float*>(mij->buffer.addr); + __gm__ float* lij_ptr = reinterpret_cast<__gm__ float*>(lij->buffer.addr); + __gm__ float* oi_new_ptr = reinterpret_cast<__gm__ float*>(oi_new->buffer.addr); + __gm__ float* mi_ptr = reinterpret_cast<__gm__ float*>(mi->buffer.addr); + __gm__ float* li_ptr = reinterpret_cast<__gm__ float*>(li->buffer.addr); + __gm__ float* oi_ptr = reinterpret_cast<__gm__ float*>(oi->buffer.addr); + __gm__ float* dst_ptr = reinterpret_cast<__gm__ float*>(dst->buffer.addr); + + // Scalar tile dimensions for RowMajor layout: + // kScalarCols = 32 bytes / 4 bytes per float = 8 floats per row (one 32-byte block) + // kScalarRows = M / 8 (M=16 → 2 rows, M=64 → 8 rows) + constexpr int kScalarCols = 32 / sizeof(float); + constexpr int kScalarRows = M / kScalarCols; + // Aligned rows for ColMajor DN tiles (32-byte alignment) + constexpr int kAlignedRows = ((M * sizeof(float) + 31) / 32) * (32 / sizeof(float)); + + // --- GlobalTensor types --- + + // Data (M, N) RowMajor + using GlobalDataMxN = GlobalTensor, pto::Stride<1, 1, 1, N, 1>>; + + // Scalar ND: M contiguous floats as (kScalarRows, kScalarCols) RowMajor + using GlobalScalarND = + GlobalTensor, pto::Stride<1, 1, 1, kScalarCols, 1>>; + + // Scalar DN: same M contiguous floats as (kAlignedRows, 1) ColMajor + using GlobalScalarDN = GlobalTensor, pto::Stride<1, 1, 1, 1, 1>, Layout::DN>; + + // --- GlobalTensor instances --- + + GlobalDataMxN 
oiNewGlobal(oi_new_ptr + oi_new->start_offset); + GlobalDataMxN oiGlobal(oi_ptr + oi->start_offset); + GlobalDataMxN dstGlobal(dst_ptr + dst->start_offset); + + // ND globals for scalar element-wise operations + GlobalScalarND mijGlobalND(mij_ptr + mij->start_offset); + GlobalScalarND lijGlobalND(lij_ptr + lij->start_offset); + GlobalScalarND miGlobalND(mi_ptr + mi->start_offset); + GlobalScalarND liGlobalND(li_ptr + li->start_offset); + + // DN globals aliased to same GM for ColMajor reload (used after ND TSTORE) + GlobalScalarDN mijGlobalDN(mij_ptr + mij->start_offset); + GlobalScalarDN lijGlobalDN(lij_ptr + lij->start_offset); + GlobalScalarDN liGlobalDN(li_ptr + li->start_offset); + + // --- Tile types --- + + using TileDataMxN = Tile; + using TileScalarND = + Tile; + using TileScalarDN = Tile; + + // --- UB memory layout --- + + constexpr int kDataBytes = M * N * sizeof(float); + constexpr int kScalarNDBytes = kScalarRows * kScalarCols * sizeof(float); + constexpr int kScalarDNBytes = kAlignedRows * sizeof(float); + + // Data tiles + TileDataMxN oiNewTile; + TileDataMxN oiTile; + + // Scalar ND tiles for element-wise arithmetic + TileScalarND mijND, lijND, miND, liND; + TileScalarND miNewND, alphaND, betaND, tmpND; + + // Scalar DN tiles for TROWEXPAND operations + TileScalarDN alphaDN, betaDN, liDN; + + TASSIGN(oiNewTile, 0); + TASSIGN(oiTile, kDataBytes); + TASSIGN(mijND, 2 * kDataBytes); + TASSIGN(lijND, 2 * kDataBytes + kScalarNDBytes); + TASSIGN(miND, 2 * kDataBytes + 2 * kScalarNDBytes); + TASSIGN(liND, 2 * kDataBytes + 3 * kScalarNDBytes); + TASSIGN(miNewND, 2 * kDataBytes + 4 * kScalarNDBytes); + TASSIGN(alphaND, 2 * kDataBytes + 5 * kScalarNDBytes); + TASSIGN(betaND, 2 * kDataBytes + 6 * kScalarNDBytes); + TASSIGN(tmpND, 2 * kDataBytes + 7 * kScalarNDBytes); + TASSIGN(alphaDN, 2 * kDataBytes + 8 * kScalarNDBytes); + TASSIGN(betaDN, 2 * kDataBytes + 8 * kScalarNDBytes + kScalarDNBytes); + TASSIGN(liDN, 2 * kDataBytes + 8 * kScalarNDBytes + 2 * 
kScalarDNBytes); + + if (is_first) { + // --- First block: copy inputs to accumulators --- + TLOAD(oiNewTile, oiNewGlobal); + TLOAD(mijND, mijGlobalND); + TLOAD(lijND, lijGlobalND); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + + // Passthrough to MTE3 (no V compute needed) + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(miGlobalND, mijND); // mi = mij + TSTORE(liGlobalND, lijND); // li = lij + TSTORE(oiGlobal, oiNewTile); // oi = oi_new + + if (is_last) { + // Single block: normalize dst = oi_new / lij + // lij stored to li buffer in ND format; reload as DN for TROWEXPANDDIV + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + TLOAD(liDN, liGlobalDN); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TROWEXPANDDIV(oiNewTile, oiNewTile, liDN); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); + TSTORE(dstGlobal, oiNewTile); + } + } else { + // --- Subsequent blocks: accumulate --- + + // Phase 1: Load all inputs + TLOAD(oiNewTile, oiNewGlobal); + TLOAD(oiTile, oiGlobal); + TLOAD(mijND, mijGlobalND); + TLOAD(lijND, lijGlobalND); + TLOAD(miND, miGlobalND); + TLOAD(liND, liGlobalND); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + + // Phase 2: Scalar arithmetic in RowMajor (kScalarRows, kScalarCols) + // to resolve RAW hazards on shared UB tiles. 
+ TMAX(miNewND, miND, mijND); // mi_new = max(mi, mij) + TSUB(alphaND, miND, miNewND); // alpha = mi - mi_new + TEXP(alphaND, alphaND); // alpha = exp(mi - mi_new) + TSUB(betaND, mijND, miNewND); // beta = mij - mi_new + TEXP(betaND, betaND); // beta = exp(mij - mi_new) + TMUL(liND, alphaND, liND); // li = alpha * li + TMUL(tmpND, betaND, lijND); // tmp = beta * lij + TADD(liND, liND, tmpND); // li = alpha * li + beta * lij (= li_new) + + // Phase 3: Store scalar results to GM (ND format) + // mi_new → mi accumulator, li_new → li accumulator + // alpha → mij buffer (reuse), beta → lij buffer (reuse) + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(miGlobalND, miNewND); // persist mi_new + TSTORE(liGlobalND, liND); // persist li_new + TSTORE(mijGlobalND, alphaND); // temp: alpha to mij buffer + TSTORE(lijGlobalND, betaND); // temp: beta to lij buffer + + // Phase 4: Reload alpha, beta (and li if last) as ColMajor DN + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + TLOAD(alphaDN, mijGlobalDN); // alpha from mij buffer as DN + TLOAD(betaDN, lijGlobalDN); // beta from lij buffer as DN + if (is_last) { + TLOAD(liDN, liGlobalDN); // li_new from li buffer as DN + } + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + + // Phase 5: Scale data tiles using row-broadcast multiply + TROWEXPANDMUL(oiTile, oiTile, alphaDN); // oi *= alpha + TROWEXPANDMUL(oiNewTile, oiNewTile, betaDN); // oi_new *= beta + TADD(oiTile, oiTile, oiNewTile); // oi = alpha*oi + beta*oi_new + + if (is_last) { + // Phase 6: Normalize and output + TROWEXPANDDIV(oiTile, oiTile, liDN); // dst = oi / li_new + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); + TSTORE(dstGlobal, oiTile); + } else { + // Phase 6: Store updated accumulators + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); + TSTORE(oiGlobal, oiTile); + } + 
} +} + +extern "C" __aicore__ void kernel_entry(__gm__ int64_t* args) { + __gm__ Tensor* mij = reinterpret_cast<__gm__ Tensor*>(args[0]); + __gm__ Tensor* lij = reinterpret_cast<__gm__ Tensor*>(args[1]); + __gm__ Tensor* oi_new = reinterpret_cast<__gm__ Tensor*>(args[2]); + __gm__ Tensor* mi = reinterpret_cast<__gm__ Tensor*>(args[3]); + __gm__ Tensor* li = reinterpret_cast<__gm__ Tensor*>(args[4]); + __gm__ Tensor* oi = reinterpret_cast<__gm__ Tensor*>(args[5]); + __gm__ Tensor* dst = reinterpret_cast<__gm__ Tensor*>(args[6]); + uint64_t is_first = static_cast(args[7]); + uint64_t is_last = static_cast(args[8]); + uint64_t q_tile_size = static_cast(mij->shapes[0]); + // args[10] = head_dim (128) + + if (q_tile_size == 16) { + online_update_impl<16, 128>(mij, lij, oi_new, mi, li, oi, is_first, is_last, dst); + } else { + online_update_impl<64, 128>(mij, lij, oi_new, mi, li, oi, is_first, is_last, dst); + } +} diff --git a/tests/device_tests/a5/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp b/tests/device_tests/a5/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp new file mode 100644 index 00000000..eec1d4dd --- /dev/null +++ b/tests/device_tests/a5/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp @@ -0,0 +1,128 @@ +// Softmax Preparation Kernel (AIV) with partial block masking +// +// Operates on (M, N) tile where M=q_tile_size, N=block_size: +// Case1: sij is (16, 128) +// Case2: sij is (64, 64) +// +// For partial blocks (valid_len < N), positions [valid_len, N) in sij are +// filled with -inf via TFILLPAD_INPLACE before softmax, ensuring exp(-inf)=0 +// so that invalid key positions contribute zero attention weight. 
+// +// Computes: +// sij_masked = TFILLPAD(sij, valid_len, pad=-inf) +// sij_scale = sij_masked * scale +// mij = row_max(sij_scale) -> (M, 1) +// pij = exp(sij_scale - mij) -> (M, N) +// lij = row_sum(pij) -> (M, 1) + +#include +#include + +#include "tensor.h" + +using namespace pto; + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#define __aicore__ [aicore] +#endif + +template +static __aicore__ void softmax_prepare_impl(__gm__ Tensor* sij, + float scale_value, + __gm__ Tensor* pij, + __gm__ Tensor* mij, + __gm__ Tensor* lij) { + uint64_t valid_len = static_cast(sij->shapes[1]); + __gm__ float* sij_addr = reinterpret_cast<__gm__ float*>(sij->buffer.addr); + __gm__ bfloat16_t* pij_addr = reinterpret_cast<__gm__ bfloat16_t*>(pij->buffer.addr); + __gm__ float* mij_addr = reinterpret_cast<__gm__ float*>(mij->buffer.addr); + __gm__ float* lij_addr = reinterpret_cast<__gm__ float*>(lij->buffer.addr); + + constexpr int kAlignedRows = ((M * sizeof(float) + 31) / 32) * (32 / sizeof(float)); + + using GlobalDataMxN = GlobalTensor, pto::Stride<1, 1, 1, N, 1>>; + using GlobalDataMxN_bf16 = GlobalTensor, pto::Stride<1, 1, 1, N, 1>>; + using GlobalScalarDN = GlobalTensor, pto::Stride<1, 1, 1, 1, 1>, Layout::DN>; + + GlobalDataMxN sijGlobal(sij_addr + sij->start_offset); + GlobalDataMxN_bf16 pijGlobal(pij_addr + pij->start_offset); + GlobalScalarDN mijGlobal(mij_addr + mij->start_offset); + GlobalScalarDN lijGlobal(lij_addr + lij->start_offset); + + // Dynamic-cols tile: marks which columns are valid for TFILLPAD boundary + using TileSijDyn = Tile; + // Padded tile: TFILLPAD_INPLACE fills positions [valid_len, N) with -inf + using TileSijPad = Tile; + + using TileVecMxN = Tile; + using TileVecMxN_bf16 = Tile; + using TileScalarDN = Tile; + + TileVecMxN sijTile; + TileSijDyn sijDynTile(static_cast(valid_len)); + TileSijPad sijPadTile; + TileVecMxN pijTile; + TileVecMxN tmpTile; + TileScalarDN maxTile; + TileScalarDN sumTile; + TileVecMxN_bf16 pijBf16Tile; + + 
// All sij tiles share UB address 0x0 (in-place masking) + TASSIGN(sijTile, 0x0); + TASSIGN(sijDynTile, 0x0); + TASSIGN(sijPadTile, 0x0); + TASSIGN(pijTile, M * N * sizeof(float)); + TASSIGN(tmpTile, 2 * M * N * sizeof(float)); + TASSIGN(maxTile, 3 * M * N * sizeof(float)); + TASSIGN(sumTile, 3 * M * N * sizeof(float) + kAlignedRows * sizeof(float)); + TASSIGN(pijBf16Tile, 3 * M * N * sizeof(float) + 2 * kAlignedRows * sizeof(float)); + + // Load full sij (M, N) tile from GM - all N columns including garbage for partial blocks + // printf("sij addr incore %x\n", sij->buffer.addr); + TLOAD(sijTile, sijGlobal); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + + // Mask columns [valid_len, N) with -inf. sijDynTile provides the valid boundary, + // sijPadTile provides PadValue::Min as the fill value. No-op when valid_len == N. + TFILLPAD_INPLACE(sijPadTile, sijDynTile); + + TMULS(sijTile, sijTile, scale_value); + TROWMAX(maxTile, sijTile, tmpTile); + TROWEXPANDSUB(pijTile, sijTile, maxTile); + TEXP(pijTile, pijTile); + // Truncate pij to bf16 first, then compute lij from truncated values (matches golden) + TCVT(pijBf16Tile, pijTile, RoundMode::CAST_ROUND); + TCVT(pijTile, pijBf16Tile, RoundMode::CAST_ROUND); + TROWSUM(sumTile, pijTile, tmpTile); + + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(mijGlobal, maxTile); + TSTORE(lijGlobal, sumTile); + TSTORE(pijGlobal, pijBf16Tile); +} + +extern "C" __aicore__ void kernel_entry(__gm__ int64_t* args) { + __gm__ Tensor* sij = reinterpret_cast<__gm__ Tensor*>(args[0]); + union { + uint64_t u; + float f; + } scale_conv; + scale_conv.u = static_cast(args[1]); + float scale_value = scale_conv.f; + __gm__ Tensor* pij = reinterpret_cast<__gm__ Tensor*>(args[2]); + __gm__ Tensor* mij = reinterpret_cast<__gm__ Tensor*>(args[3]); + __gm__ Tensor* lij = reinterpret_cast<__gm__ Tensor*>(args[4]); + uint64_t q_tile_size = static_cast(sij->shapes[0]); + + 
if (q_tile_size == 16) { + softmax_prepare_impl<16, 128>(sij, scale_value, pij, mij, lij); + } else { + softmax_prepare_impl<64, 64>(sij, scale_value, pij, mij, lij); + } +} diff --git a/tests/device_tests/a5/tensormap_and_ringbuffer/paged_attention/kernels/kernel_config.py b/tests/device_tests/a5/tensormap_and_ringbuffer/paged_attention/kernels/kernel_config.py new file mode 100644 index 00000000..dbd5064c --- /dev/null +++ b/tests/device_tests/a5/tensormap_and_ringbuffer/paged_attention/kernels/kernel_config.py @@ -0,0 +1,45 @@ +""" +Paged Attention Kernel and Orchestration Configuration + +Defines the kernels and orchestration function for paged attention +with AIC/AIV subgraph splitting: + +AIC Kernels (Matrix Multiplication): + - aic_qk_matmul: Q @ K^T computation + - aic_pv_matmul: P @ V computation + +AIV Kernels (Vector Operations): + - aiv_softmax_prepare: scale, rowmax, exp, rowsum + - aiv_online_update: online softmax accumulation + fused normalization + +Note: aiv_normalize has been merged into aiv_online_update for efficiency. 
+""" + +from pathlib import Path + +_KERNELS_ROOT = Path(__file__).parent + +# Orchestration config +ORCHESTRATION = { + "source": str(_KERNELS_ROOT / "orchestration" / "paged_attention_orch.cpp"), + "function_name": "build_paged_attention_graph", +} + +# Kernel configs (aiv_normalize removed - merged into aiv_online_update) +KERNELS = [ + # AIC kernels (matrix multiplication using Cube unit) + {"func_id": 0, "name": "QK", "source": str(_KERNELS_ROOT / "aic" / "aic_qk_matmul.cpp"), "core_type": "aic"}, + {"func_id": 2, "name": "PV", "source": str(_KERNELS_ROOT / "aic" / "aic_pv_matmul.cpp"), "core_type": "aic"}, + {"func_id": 4, "name": "AIC_HUB", "source": str(_KERNELS_ROOT / "aic" / "aic_hub.cpp"), "core_type": "aic"}, + # AIV kernels (vector operations) + {"func_id": 1, "name": "SF", "source": str(_KERNELS_ROOT / "aiv" / "aiv_softmax_prepare.cpp"), "core_type": "aiv"}, + {"func_id": 3, "name": "UP", "source": str(_KERNELS_ROOT / "aiv" / "aiv_online_update.cpp"), "core_type": "aiv"}, + {"func_id": 5, "name": "AIV_HUB", "source": str(_KERNELS_ROOT / "aiv" / "aiv_hub.cpp"), "core_type": "aiv"}, +] + +# Runtime configuration +RUNTIME_CONFIG = { + "runtime": "tensormap_and_ringbuffer", + "aicpu_thread_num": 4, + "block_dim": 24, +} diff --git a/tests/device_tests/a5/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp b/tests/device_tests/a5/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp new file mode 100644 index 00000000..a3417a8c --- /dev/null +++ b/tests/device_tests/a5/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp @@ -0,0 +1,294 @@ +/** + * Paged Attention Orchestration Function - 16x16 Version + * + * Simplified for 16x16 framework-generated matmul kernels. + * Each block processes a single 16x16 matmul operation. 
+ * + * Memory Layout: + * Query: (batch, 16, 16) - one 16x16 tile per batch + * Key: (total_blocks, 16, 16) - stored as K^T for direct matmul + * Value: (total_blocks, 16, 16) - direct format + */ + +#include +#include + +#include "pto_orchestration_api.h" + +#define FUNC_QK_MATMUL 0 +#define FUNC_SOFTMAX_PREPARE 1 +#define FUNC_PV_MATMUL 2 +#define FUNC_ONLINE_UPDATE 3 +#define FUNC_AIC_HUB 4 +#define FUNC_AIV_HUB 5 + +constexpr uint64_t PLATFORM_PROF_SYS_CNT_FREQ = 50000000; // 50 MHz + +inline double cycles_to_us(uint64_t cycles) { + return (static_cast(cycles) / PLATFORM_PROF_SYS_CNT_FREQ) * 1000000.0; +}; + +inline uint64_t get_sys_cnt_aicpu() { + uint64_t ticks; + asm volatile("mrs %0, cntvct_el0" : "=r"(ticks)); + return ticks; +} + +#define CYCLE_COUNT_START() uint64_t _t0 = get_sys_cnt_aicpu(), _t1 +#define CYCLE_COUNT_LAP(acc) do { _t1 = get_sys_cnt_aicpu(); acc += (_t1 - _t0); _t0 = _t1; } while(0) + +// Helper to encode float as uint64_t for scalar params +static uint64_t float_to_u64(float f) { + union { + float f32; + uint64_t u64; + } conv; + conv.u64 = 0; // Clear upper bits + conv.f32 = f; + return conv.u64; +} + +extern "C" { +/** + * Orchestration config — the executor reads these values to set up + * shared memory and runtime before calling aicpu_orchestration_entry. 
+ */ +__attribute__((visibility("default"))) PTO2OrchestrationConfig aicpu_orchestration_config( + uint64_t* args, int arg_count) { + (void)args; + (void)arg_count; + return PTO2OrchestrationConfig{ + .expected_arg_count = 10, + }; +} + +__attribute__((visibility("default"))) void aicpu_orchestration_entry(PTO2Runtime* rt, uint64_t* args, int arg_count, int orch_thread_num, int orch_thread_index) { + (void)orch_thread_num; + (void)orch_thread_index; + uint64_t prof_param_extract = 0; + uint64_t prof_ext_tensor = 0; + uint64_t prof_scope = 0; + uint64_t prof_make_tensor = 0; + uint64_t prof_tensor_view = 0; + uint64_t prof_param_setup = 0; + uint64_t prof_submit_task = 0; + int prof_submit_count = 0; + int prof_make_count = 0; + int prof_view_count = 0; + + CYCLE_COUNT_START(); + + // Extract device pointers + // Extract pointers (first 7) + void* host_query = reinterpret_cast(args[0]); // [batch, num_heads, head_dim] + void* host_key_cache = reinterpret_cast(args[1]); // [batch, block_num, block_size, head_dim] + void* host_value_cache = reinterpret_cast(args[2]); // [batch, block_num, block_size, head_dim] + int* host_block_table = reinterpret_cast(args[3]); // [batch, block_num] + int* host_context_lens = reinterpret_cast(args[4]); // [batch] + void* host_out = reinterpret_cast(args[5]); // [batch, num_heads, head_dim] + int64_t* host_config = reinterpret_cast(args[6]); + + // Extract sizes (next 3) + size_t query_size = static_cast(args[7]); + size_t key_cache_size = static_cast(args[8]); + size_t value_cache_size = static_cast(args[9]); + + // Extract config parameters + uint64_t batch = static_cast(static_cast(host_config[0])); + uint64_t num_heads = static_cast(static_cast(host_config[1])); + int kv_head_num = static_cast(host_config[2]); + uint64_t head_dim = static_cast(static_cast(host_config[3])); + uint64_t block_size = static_cast(static_cast(host_config[4])); + uint64_t block_num = static_cast(static_cast(host_config[5])); + union { + uint32_t u; + 
float f; + } scale_conv; + scale_conv.u = static_cast(host_config[6]); + float scale_value = scale_conv.f; + uint64_t q_head_num = num_heads; + uint64_t q_tile = std::min(num_heads, 128UL); + uint64_t q_loop = (q_head_num + q_tile - 1) / q_tile; + DataType data_type = DataType::BFLOAT16; // 用例是float32的,这个考虑要如何扩展成其他类型 + CYCLE_COUNT_LAP(prof_param_extract); + + LOG_ALWAYS(rt, ">>>>>> batch = %lu", (unsigned long)batch); + + // query_size = batch * num_heads * head_dim * data_type + // key_cache_size = batch * block_num * block_size * head_dim * data_type + // value_cache_size = batch * block_num * block_size * head_dim * data_type + // out = batch * num_heads * head_dim * data_type + uint64_t query_shapes[2] = {batch * num_heads, head_dim}; + uint64_t key_cache_shapes[2] = {batch * block_num * block_size, head_dim}; + uint64_t value_cache_shapes[2] = {batch * block_num * block_size, head_dim}; + uint64_t out_shapes[2] = {batch * num_heads, head_dim}; + Tensor query = make_tensor_external(host_query, query_shapes, 2, data_type); + Tensor key_cache = make_tensor_external(host_key_cache, key_cache_shapes, 2, data_type); + Tensor value_cache = make_tensor_external(host_value_cache, value_cache_shapes, 2, data_type); + // Tensor block_table = make_tensor_external(host_block_table, block_table_size); + // Tensor context_lens = make_tensor_external(host_context_lens, context_lens_size); + Tensor out = make_tensor_external(host_out, out_shapes, 2, DataType::FLOAT32); + CYCLE_COUNT_LAP(prof_ext_tensor); + // LOG_DEBUG(rt, "query=%s", query.dump().c_str()); + // LOG_DEBUG(rt, "key_cache=%s", key_cache.dump().c_str()); + // LOG_DEBUG(rt, "value_cache=%s", value_cache.dump().c_str()); + // LOG_DEBUG(rt, "out=%s", out.dump().c_str()); + + int total_tasks = 0; + + for (uint64_t b_idx = 0; b_idx < batch; b_idx++) { + uint64_t cur_seq = host_context_lens[b_idx]; + uint64_t bn_this_batch = (cur_seq + block_size - 1) / block_size; + for (uint64_t q_idx = 0; q_idx < q_loop; q_idx++) { 
+ PTO2_SCOPE(rt) { + CYCLE_COUNT_LAP(prof_scope); + uint64_t cur_offset = b_idx * q_head_num + q_idx * q_tile; + + uint64_t oi_shapes[2] = {q_tile, head_dim}; + uint64_t li_shapes[1] = {q_tile}; + uint64_t mi_shapes[1] = {q_tile}; + Tensor oi = make_tensor(oi_shapes, 2, DataType::FLOAT32); + Tensor li_update = make_tensor(li_shapes, 1, DataType::FLOAT32); + Tensor mi_update = make_tensor(mi_shapes, 1, DataType::FLOAT32); + prof_make_count += 3; + CYCLE_COUNT_LAP(prof_make_tensor); + uint64_t qi_shapes[2] = {q_tile, head_dim}; + uint64_t qi_offsets[2] = {cur_offset, 0}; + Tensor qi = query.view(qi_shapes, qi_offsets); + uint64_t out_view_shapes[2] = {q_tile, head_dim}; + uint64_t out_view_offsets[2] = {cur_offset, 0}; + Tensor out_view = out.view(out_view_shapes, out_view_offsets); + prof_view_count += 2; + CYCLE_COUNT_LAP(prof_tensor_view); + + PTOParam params_inplace[] = { + make_output_param(oi), + make_output_param(li_update), + make_output_param(mi_update), + }; + CYCLE_COUNT_LAP(prof_param_setup); + pto2_rt_submit_task(rt, FUNC_AIV_HUB, PTO2_WORKER_VECTOR, params_inplace, 3); + prof_submit_count++; + CYCLE_COUNT_LAP(prof_submit_task); + + for (uint64_t bn = 0; bn < bn_this_batch; bn++) { + uint64_t cur_block_idx = host_block_table[b_idx * block_num + bn]; + uint64_t valid_len = std::min(block_size, cur_seq - bn * block_size); + CYCLE_COUNT_LAP(prof_param_extract); + + uint64_t kv_shapes[2] = {block_size, head_dim}; + uint64_t kv_offsets[2] = {cur_block_idx * block_size, 0}; + Tensor kj = key_cache.view(kv_shapes, kv_offsets); + Tensor vj = value_cache.view(kv_shapes, kv_offsets); + prof_view_count += 2; + CYCLE_COUNT_LAP(prof_tensor_view); + + uint64_t sij_shapes[2] = {q_tile, block_size}; + Tensor sij = make_tensor(sij_shapes, 2, DataType::FLOAT32); + Tensor pij_f16 = make_tensor(sij_shapes, 2, data_type); + prof_make_count += 2; + CYCLE_COUNT_LAP(prof_make_tensor); + + PTOParam params_qk[] = { + make_input_param(qi), + make_input_param(kj), + 
make_output_param(sij), + }; + CYCLE_COUNT_LAP(prof_param_setup); + pto2_rt_submit_task(rt, FUNC_QK_MATMUL, PTO2_WORKER_CUBE, params_qk, 3); + prof_submit_count++; + CYCLE_COUNT_LAP(prof_submit_task); + + uint64_t sij_valid_shapes[2] = {q_tile, valid_len}; + uint64_t sij_valid_offsets[2] = {0, 0}; + Tensor sij_valid = sij.view(sij_valid_shapes, sij_valid_offsets); + prof_view_count += 1; + CYCLE_COUNT_LAP(prof_tensor_view); + + Tensor li = make_tensor(li_shapes, 1, DataType::FLOAT32); + Tensor mi = make_tensor(mi_shapes, 1, DataType::FLOAT32); + prof_make_count += 2; + CYCLE_COUNT_LAP(prof_make_tensor); + + PTOParam params_sf[] = { + make_input_param(sij_valid), + make_scalar_param(float_to_u64(scale_value)), + make_output_param(pij_f16), + make_output_param(mi), + make_output_param(li), + }; + CYCLE_COUNT_LAP(prof_param_setup); + pto2_rt_submit_task(rt, FUNC_SOFTMAX_PREPARE, PTO2_WORKER_VECTOR, params_sf, 5); + prof_submit_count++; + CYCLE_COUNT_LAP(prof_submit_task); + + uint64_t oi_tmp_shapes[2] = {q_tile, head_dim}; + Tensor oi_tmp = make_tensor(oi_tmp_shapes, 2, DataType::FLOAT32); + prof_make_count += 1; + CYCLE_COUNT_LAP(prof_make_tensor); + + PTOParam params_pv[] = { + make_input_param(pij_f16), + make_input_param(vj), + make_output_param(oi_tmp), + }; + CYCLE_COUNT_LAP(prof_param_setup); + pto2_rt_submit_task(rt, FUNC_PV_MATMUL, PTO2_WORKER_CUBE, params_pv, 3); + prof_submit_count++; + CYCLE_COUNT_LAP(prof_submit_task); + + uint64_t is_first = (bn == 0) ? 1 : 0; + uint64_t is_last = (bn == bn_this_batch - 1) ? 
1 : 0; + CYCLE_COUNT_LAP(prof_param_extract); + + PTOParam params_up[] = { + make_input_param(mi), + make_input_param(li), + make_input_param(oi_tmp), + make_inout_param(mi_update), + make_inout_param(li_update), + make_inout_param(oi), + make_output_param(out_view), + make_scalar_param(is_first), + make_scalar_param(is_last), + }; + CYCLE_COUNT_LAP(prof_param_setup); + pto2_rt_submit_task(rt, FUNC_ONLINE_UPDATE, PTO2_WORKER_VECTOR, params_up, 9); + prof_submit_count++; + CYCLE_COUNT_LAP(prof_submit_task); + } + } + CYCLE_COUNT_LAP(prof_scope); + } + } + + uint64_t total = prof_param_extract + prof_ext_tensor + prof_make_tensor + + prof_tensor_view + prof_param_setup + prof_submit_task + prof_scope; + LOG_ALWAYS(rt, "=== PagedAttn Orch Profiling: %d submits, %d makes, %d views, total=%.3fus ===", + prof_submit_count, prof_make_count, prof_view_count, cycles_to_us(total)); + if (total > 0) { + LOG_ALWAYS(rt, " param_extract : %7.3fus (%5.1f%%)", + cycles_to_us(prof_param_extract), prof_param_extract * 100.0 / total); + LOG_ALWAYS(rt, " ext_tensor(x4) : %7.3fus (%5.1f%%)", + cycles_to_us(prof_ext_tensor), prof_ext_tensor * 100.0 / total); + LOG_ALWAYS(rt, " make_tensor(x%d) : %7.3fus (%5.1f%%) avg=%.3fus", + prof_make_count, cycles_to_us(prof_make_tensor), prof_make_tensor * 100.0 / total, + prof_make_count > 0 ? cycles_to_us(prof_make_tensor) / prof_make_count : 0.0); + LOG_ALWAYS(rt, " tensor_view(x%d) : %7.3fus (%5.1f%%) avg=%.3fus", + prof_view_count, cycles_to_us(prof_tensor_view), prof_tensor_view * 100.0 / total, + prof_view_count > 0 ? 
cycles_to_us(prof_tensor_view) / prof_view_count : 0.0); + LOG_ALWAYS(rt, + " param_setup : %7.3fus (%5.1f%%)", + cycles_to_us(prof_param_setup), + prof_param_setup * 100.0 / total); + LOG_ALWAYS(rt, " scope : %7.3fus (%5.1f%%)", cycles_to_us(prof_scope), prof_scope * 100.0 / total); + LOG_ALWAYS(rt, " submit_task(x%d) : %7.3fus (%5.1f%%) avg=%.3fus", + prof_submit_count, cycles_to_us(prof_submit_task), prof_submit_task * 100.0 / total, + prof_submit_count > 0 ? cycles_to_us(prof_submit_task) / prof_submit_count : 0.0); + } + +#undef CYCLE_COUNT_START +#undef CYCLE_COUNT_LAP +} + +} // extern "C" \ No newline at end of file diff --git a/tools/benchmark_rounds.sh b/tools/benchmark_rounds.sh index 748bb400..5387beb7 100755 --- a/tools/benchmark_rounds.sh +++ b/tools/benchmark_rounds.sh @@ -1,9 +1,9 @@ #!/usr/bin/env bash -# Benchmark wrapper: run examples on a2a3 hardware, +# Benchmark wrapper: run examples on hardware, # then parse device-log timing lines to report per-round latency. # # Usage: -# ./tools/benchmark_rounds.sh [-d ] [-n ] +# ./tools/benchmark_rounds.sh [-p ] [-d ] [-n ] # # Runs all examples listed in EXAMPLES array and prints timing for each. @@ -12,10 +12,9 @@ set -euo pipefail SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" RUN_EXAMPLE="$PROJECT_ROOT/examples/scripts/run_example.py" -EXAMPLES_DIR="$PROJECT_ROOT/tests/device_tests/tensormap_and_ringbuffer" # --------------------------------------------------------------------------- -# Examples to benchmark (paths relative to examples/tensormap_and_ringbuffer/) +# Examples to benchmark (paths relative to tests/device_tests//tensormap_and_ringbuffer/) # Each entry is just the directory name; kernels/ and golden.py are implied. 
# --------------------------------------------------------------------------- EXAMPLES=( @@ -31,10 +30,15 @@ EXAMPLES=( # --------------------------------------------------------------------------- DEVICE_ID=0 ROUNDS=10 +PLATFORM=a2a3 EXTRA_ARGS=() while [[ $# -gt 0 ]]; do case "$1" in + -p|--platform) + PLATFORM="$2" + shift 2 + ;; -d|--device) DEVICE_ID="$2" shift 2 @@ -48,9 +52,10 @@ while [[ $# -gt 0 ]]; do benchmark_rounds.sh — run all examples and report per-round timing from device logs Usage: - ./tools/benchmark_rounds.sh [-d ] [-n ] + ./tools/benchmark_rounds.sh [-p ] [-d ] [-n ] Options: + -p, --platform Platform to run on (default: a2a3) -d, --device Device ID (default: 0) -n, --rounds Override number of rounds for each example (default: 10) -h, --help Show this help @@ -69,6 +74,11 @@ USAGE esac done +# --------------------------------------------------------------------------- +# Derive arch from platform and set examples directory +# --------------------------------------------------------------------------- +EXAMPLES_DIR="$PROJECT_ROOT/tests/device_tests/${PLATFORM}/tensormap_and_ringbuffer" + # --------------------------------------------------------------------------- # Resolve device log directory (mirrors run_example.py / device_log_resolver.py) # --------------------------------------------------------------------------- @@ -192,7 +202,7 @@ for example in "${EXAMPLES[@]}"; do # Run example if ! python3 "$RUN_EXAMPLE" \ -k "$KERNELS_DIR" -g "$GOLDEN" \ - -p a2a3 -d "$DEVICE_ID" \ + -p "$PLATFORM" -d "$DEVICE_ID" \ -n "$ROUNDS" \ "${EXTRA_ARGS[@]}" > /dev/null 2>&1; then echo " FAILED: run_example.py returned non-zero"