From 634255a1ba49a458545bdef62bfb14b6ef2712cc Mon Sep 17 00:00:00 2001 From: Chao Wang <26245345+ChaoWao@users.noreply.github.com> Date: Wed, 11 Mar 2026 15:30:54 +0800 Subject: [PATCH 1/3] Refactor: unify paged attention benchmark cases and dtype handling (#256) - Standardize ALL_CASES to 3 identical cases in paged_attention, batch_paged_attention, and paged_attention_unroll for fair comparison - Case1: QHEADS=16, HEADDIM=128, BLOCKSIZE=128, batch=256 - Case2: QHEADS=64, HEADDIM=128, BLOCKSIZE=64, batch=64 - Case3: QHEADS=64, HEADDIM=256, BLOCKSIZE=64, batch=64 - All cases: KVHEADS=1, context_len=8192, query_seqlen=1 - Remove CaseVarSeq from batch_paged_attention (not needed for benchmark) - Add dtype field to paged_attention_unroll cases and parameterize generate_inputs/paged_attention to read dtype from params --- .../batch_paged_attention/golden.py | 15 ++++---- .../paged_attention/golden.py | 14 ++++++-- .../paged_attention_unroll/golden.py | 34 +++++++++++++------ 3 files changed, 43 insertions(+), 20 deletions(-) diff --git a/tests/device_tests/tensormap_and_ringbuffer/batch_paged_attention/golden.py b/tests/device_tests/tensormap_and_ringbuffer/batch_paged_attention/golden.py index b07f0d53..8cce3b8a 100644 --- a/tests/device_tests/tensormap_and_ringbuffer/batch_paged_attention/golden.py +++ b/tests/device_tests/tensormap_and_ringbuffer/batch_paged_attention/golden.py @@ -13,12 +13,12 @@ ALL_CASES = { "Case1": { - "batch": 64, + "batch": 256, "num_heads": 16, "kv_head_num": 1, "head_dim": 128, "block_size": 128, - "context_len": 8193, + "context_len": 8192, "max_model_len": 32768, "dtype": "bfloat16", }, @@ -32,14 +32,13 @@ "max_model_len": 32768, "dtype": "bfloat16", }, - "CaseVarSeq": { + "Case3": { "batch": 64, - "num_heads": 16, + "num_heads": 64, "kv_head_num": 1, - "head_dim": 128, - "block_size": 128, - "context_len": 8193, - "context_lens_list": [8193, 4096, 1024, 256, 8000, 512, 2048, 7777], + "head_dim": 256, + "block_size": 64, + "context_len": 
8192, "max_model_len": 32768, "dtype": "bfloat16", }, diff --git a/tests/device_tests/tensormap_and_ringbuffer/paged_attention/golden.py b/tests/device_tests/tensormap_and_ringbuffer/paged_attention/golden.py index 787ad2c7..898c4fad 100644 --- a/tests/device_tests/tensormap_and_ringbuffer/paged_attention/golden.py +++ b/tests/device_tests/tensormap_and_ringbuffer/paged_attention/golden.py @@ -13,12 +13,12 @@ ALL_CASES = { "Case1": { - "batch": 64, + "batch": 256, "num_heads": 16, "kv_head_num": 1, "head_dim": 128, "block_size": 128, - "context_len": 8193, + "context_len": 8192, "max_model_len": 32768, "dtype": "bfloat16", }, @@ -32,6 +32,16 @@ "max_model_len": 32768, "dtype": "bfloat16", }, + "Case3": { + "batch": 64, + "num_heads": 64, + "kv_head_num": 1, + "head_dim": 256, + "block_size": 64, + "context_len": 8192, + "max_model_len": 32768, + "dtype": "bfloat16", + }, } DEFAULT_CASE = "Case1" diff --git a/tests/device_tests/tensormap_and_ringbuffer/paged_attention_unroll/golden.py b/tests/device_tests/tensormap_and_ringbuffer/paged_attention_unroll/golden.py index 6dc39319..a5d9089e 100644 --- a/tests/device_tests/tensormap_and_ringbuffer/paged_attention_unroll/golden.py +++ b/tests/device_tests/tensormap_and_ringbuffer/paged_attention_unroll/golden.py @@ -24,13 +24,14 @@ # All test cases - production scale ALL_CASES = { "Case1": { - "batch": 64, + "batch": 256, "num_heads": 16, "kv_head_num": 1, "head_dim": 128, "block_size": 128, - "context_len": 8193, + "context_len": 8192, "max_model_len": 32768, + "dtype": "bfloat16", }, "Case2": { "batch": 64, @@ -40,6 +41,17 @@ "block_size": 64, "context_len": 8192, "max_model_len": 32768, + "dtype": "bfloat16", + }, + "Case3": { + "batch": 64, + "num_heads": 64, + "kv_head_num": 1, + "head_dim": 256, + "block_size": 64, + "context_len": 8192, + "max_model_len": 32768, + "dtype": "bfloat16", }, } @@ -55,6 +67,7 @@ def generate_inputs(params: dict) -> list: block_size = params["block_size"] context_len = 
params["context_len"] max_model_len = params["max_model_len"] + dtype = getattr(torch, params.get("dtype", "bfloat16")) max_num_blocks_per_req = max_model_len // block_size cur_valid_blocks = (context_len + block_size - 1) // block_size @@ -77,15 +90,15 @@ def generate_inputs(params: dict) -> list: dtype=torch.int64, ) - query_bf16 = torch.empty(batch, 1, num_heads * head_dim).uniform_(-0.5, 0.5).to(torch.bfloat16) - query_bf16 = query_bf16.reshape(batch, num_heads, head_dim) + query_raw = torch.empty(batch, 1, num_heads * head_dim).uniform_(-0.5, 0.5).to(dtype) + query_raw = query_raw.reshape(batch, num_heads, head_dim) - key_bf16 = torch.empty(total_blocks, block_size, kv_head_num, head_dim).uniform_(-0.5, 0.5).to(torch.bfloat16) - value_bf16 = torch.empty(total_blocks, block_size, kv_head_num, head_dim).uniform_(-1, 1).to(torch.bfloat16) + key_raw = torch.empty(total_blocks, block_size, kv_head_num, head_dim).uniform_(-0.5, 0.5).to(dtype) + value_raw = torch.empty(total_blocks, block_size, kv_head_num, head_dim).uniform_(-1, 1).to(dtype) - query = query_bf16.flatten() - key_cache = key_bf16.flatten() - value_cache = value_bf16.flatten() + query = query_raw.flatten() + key_cache = key_raw.flatten() + value_cache = value_raw.flatten() block_table_flat = block_table.flatten() out = torch.zeros(batch * num_heads * head_dim, dtype=torch.float32) @@ -133,6 +146,7 @@ def paged_attention( out: (batch * num_heads, head_dim) float32 """ assert num_kv_heads == 1 + input_dtype = query.dtype batch, num_heads_dim, head_dim = query.shape _, block_size, _, _ = key_cache.shape @@ -189,7 +203,7 @@ def paged_attention( pij = torch.exp(sij - mij) pij = pij.masked_fill(~valid_mask, 0.0) pij = pij.masked_fill(~batch_mask, 0.0) - pij = pij.to(torch.bfloat16).to(torch.float32) + pij = pij.to(input_dtype).to(torch.float32) lij = pij.sum(dim=-1, keepdim=True) # (batch, q_tile_size, 1) # PV matmul: (batch, q_tile_size, head_dim) From e2e38b9f0b52cd557f825af287211577b7572ec1 Mon Sep 17 
00:00:00 2001 From: jvjhfhg Date: Wed, 11 Mar 2026 18:29:57 +0800 Subject: [PATCH 2/3] Refactor: cluster-based mixed-task dispatch for AICPU executor (#249) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Add: MixedKernels type and resource shape definitions - Add pto_submit_types.h with MixedKernels struct, PTO2ResourceShape enum, PTO2SubtaskSlot enum, and active_mask/shape conversion helpers - Remove PTO2WorkerType enum from pto_runtime2_types.h (superseded by resource shapes) * Refactor: submit API from (kernel_id, worker_type) to MixedKernels - Change submit_task signature to take MixedKernels& instead of (kernel_id, worker_type), enabling multi-kernel mixed-task submission - Add pto2_rt_submit_aic_task / pto2_rt_submit_aiv_task convenience wrappers for single-kernel tasks - Implement pto2_submit_mixed_task with active_mask computation, AIV normalization (aiv1-only → aiv0 slot), and shape-based queue routing - Add mixed_task_id and subslot fields to PTO2DispatchPayload - Migrate all orchestration call sites to new API * Refactor: two-stage completion and shape-based ready queues in scheduler - Change ready queues from worker-type indexed to shape-based indexed (PTO2_NUM_RESOURCE_SHAPES queues instead of PTO2_NUM_WORKER_TYPES) - Add on_subtask_complete() for per-core subtask done-bit tracking - Rename on_task_complete to on_mixed_task_complete (fires only when all subtasks of a mixed task finish) - Route release_fanin_and_check_ready enqueue through shape-based queue using pto2_active_mask_to_shape() - Remove stale extern declarations left from self-consumed check move * Refactor: cluster-based dispatch and core assignment in executor - Add Cluster struct (1 AIC + 2 AIV) and extend CoreStateTracker with clusters[], core_idle[], and find_cluster_for_shape() - Add shape_resource_count() constexpr lookup and get_dispatch_order() with even/odd thread differentiation for queue probe order - Extract pop_ready_task() and 
dispatch_subtask_to_core() helpers - Replace 5 duplicated dispatch blocks with unified table-driven loop - Adapt local-first dispatch to cluster model (find_cluster_for_shape instead of per-type idle pool, overflow to shape-based global queue) - Rewrite assign/reassign_cores_to_threads for cluster-aligned assignment - Wire completion path through on_subtask_complete/on_mixed_task_complete - Fix completed_tasks_ to increment only on mixed-task completion, not per-subtask, preventing early scheduler termination * Add: mixed_example covering all 5 resource shapes - AIC_AIV_X2 (matmul + add + mul), AIC_ONLY (matmul), AIV_X1 (add), AIV_X2 (add + mul), AIC_AIV_X1 (matmul + add) per iteration - 5 kernels: matmul, add, mul, add_standalone, mul_standalone - 9 output tensors with golden verification (4 iterations × 5 shapes) * Docs: submit by cluster docs * Fix review comment --- .../orchestration/paged_attention_orch.cpp | 10 +- .../kernels/orchestration/bgemm_orch.cpp | 4 +- .../docs/INCORE_ORCHESTRATION_GUIDE.md | 21 +- .../mixed_example/golden.py | 130 ++++ .../kernels/aic/kernel_matmul.cpp | 126 ++++ .../mixed_example/kernels/aiv/kernel_add.cpp | 89 +++ .../kernels/aiv/kernel_add_standalone.cpp | 74 ++ .../mixed_example/kernels/aiv/kernel_mul.cpp | 90 +++ .../kernels/aiv/kernel_mul_standalone.cpp | 74 ++ .../mixed_example/kernels/kernel_config.py | 58 ++ .../kernels/orchestration/mixed_orch.cpp | 221 ++++++ .../orchestration/paged_attention_orch.cpp | 10 +- .../orchestration/example_orchestration.cpp | 10 +- .../aicore/aicore_executor.cpp | 6 +- .../aicpu/aicpu_executor.cpp | 665 +++++++++++------- .../docs/RUNTIME_LOGIC.md | 65 +- .../docs/SUBMIT_BY_CLUSTER.md | 226 ++++++ .../docs/device_log_profiling.md | 4 +- .../orchestration/pto_orchestration_api.h | 39 +- .../runtime/pto2_dispatch_payload.h | 4 +- .../runtime/pto_orchestrator.cpp | 39 +- .../runtime/pto_orchestrator.h | 6 +- .../runtime/pto_ring_buffer.h | 2 +- .../runtime/pto_runtime2.cpp | 7 +- 
.../runtime/pto_runtime2.h | 4 +- .../runtime/pto_runtime2_types.h | 21 +- .../runtime/pto_scheduler.cpp | 12 +- .../runtime/pto_scheduler.h | 99 ++- .../runtime/pto_submit_types.h | 97 +++ .../orchestration/alternating_orch.cpp | 4 +- .../orchestration/paged_attention_orch.cpp | 10 +- .../kernels/orchestration/bgemm_orch.cpp | 4 +- .../orchestration/paged_attention_orch.cpp | 10 +- .../orchestration/paged_attention_orch.cpp | 10 +- 34 files changed, 1862 insertions(+), 389 deletions(-) create mode 100644 examples/tensormap_and_ringbuffer/mixed_example/golden.py create mode 100644 examples/tensormap_and_ringbuffer/mixed_example/kernels/aic/kernel_matmul.cpp create mode 100644 examples/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_add.cpp create mode 100644 examples/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_add_standalone.cpp create mode 100644 examples/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_mul.cpp create mode 100644 examples/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_mul_standalone.cpp create mode 100644 examples/tensormap_and_ringbuffer/mixed_example/kernels/kernel_config.py create mode 100644 examples/tensormap_and_ringbuffer/mixed_example/kernels/orchestration/mixed_orch.cpp create mode 100644 src/a2a3/runtime/tensormap_and_ringbuffer/docs/SUBMIT_BY_CLUSTER.md create mode 100644 src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_submit_types.h diff --git a/examples/tensormap_and_ringbuffer/batch_paged_attention/kernels/orchestration/paged_attention_orch.cpp b/examples/tensormap_and_ringbuffer/batch_paged_attention/kernels/orchestration/paged_attention_orch.cpp index 7e7b3b68..56ac566c 100644 --- a/examples/tensormap_and_ringbuffer/batch_paged_attention/kernels/orchestration/paged_attention_orch.cpp +++ b/examples/tensormap_and_ringbuffer/batch_paged_attention/kernels/orchestration/paged_attention_orch.cpp @@ -135,7 +135,7 @@ void aicpu_orchestration_entry(PTO2Runtime* rt, uint64_t* args, int 
arg_count, i make_output_param(li_batch), make_output_param(mi_batch), }; - pto2_rt_submit_task(rt, FUNC_AIV_HUB, PTO2_WORKER_VECTOR, params_hub, 3); + pto2_rt_submit_aiv_task(rt, FUNC_AIV_HUB, params_hub, 3); for (uint64_t bn = 0; bn < max_bn; bn++) { uint64_t sij_shapes[2] = {chunk_bc * q_tile, block_size}; @@ -160,7 +160,7 @@ void aicpu_orchestration_entry(PTO2Runtime* rt, uint64_t* args, int arg_count, i make_scalar_param(num_heads), make_scalar_param(batch_start), }; - pto2_rt_submit_task(rt, FUNC_QK_MATMUL, PTO2_WORKER_CUBE, params_qk, 10); + pto2_rt_submit_aic_task(rt, FUNC_QK_MATMUL, params_qk, 10); PTOParam params_sf[] = { make_input_param(sij_b), @@ -173,7 +173,7 @@ void aicpu_orchestration_entry(PTO2Runtime* rt, uint64_t* args, int arg_count, i make_scalar_param(bn), make_scalar_param(batch_start), }; - pto2_rt_submit_task(rt, FUNC_SOFTMAX_PREPARE, PTO2_WORKER_VECTOR, params_sf, 9); + pto2_rt_submit_aiv_task(rt, FUNC_SOFTMAX_PREPARE, params_sf, 9); PTOParam params_pv[] = { make_input_param(pij_b), @@ -185,7 +185,7 @@ void aicpu_orchestration_entry(PTO2Runtime* rt, uint64_t* args, int arg_count, i make_scalar_param(block_num), make_scalar_param(batch_start), }; - pto2_rt_submit_task(rt, FUNC_PV_MATMUL, PTO2_WORKER_CUBE, params_pv, 8); + pto2_rt_submit_aic_task(rt, FUNC_PV_MATMUL, params_pv, 8); uint64_t is_first = (bn == 0) ? 1 : 0; uint64_t is_last = (bn == max_bn - 1) ? 
1 : 0; @@ -204,7 +204,7 @@ void aicpu_orchestration_entry(PTO2Runtime* rt, uint64_t* args, int arg_count, i make_scalar_param(num_heads), make_scalar_param(batch_start), }; - pto2_rt_submit_task(rt, FUNC_ONLINE_UPDATE, PTO2_WORKER_VECTOR, params_up, 13); + pto2_rt_submit_aiv_task(rt, FUNC_ONLINE_UPDATE, params_up, 13); } } } diff --git a/examples/tensormap_and_ringbuffer/bgemm/kernels/orchestration/bgemm_orch.cpp b/examples/tensormap_and_ringbuffer/bgemm/kernels/orchestration/bgemm_orch.cpp index e3936359..6febf360 100644 --- a/examples/tensormap_and_ringbuffer/bgemm/kernels/orchestration/bgemm_orch.cpp +++ b/examples/tensormap_and_ringbuffer/bgemm/kernels/orchestration/bgemm_orch.cpp @@ -120,7 +120,7 @@ void aicpu_orchestration_entry(PTO2Runtime* rt, uint64_t* args, int arg_count, i make_input_param(B_view), make_output_param(P), }; - pto2_rt_submit_task(rt, FUNC_GEMM_TILE, PTO2_WORKER_CUBE, + pto2_rt_submit_aic_task(rt, FUNC_GEMM_TILE, params_gemm, 3); // gemm // C[m,n] += P @@ -128,7 +128,7 @@ void aicpu_orchestration_entry(PTO2Runtime* rt, uint64_t* args, int arg_count, i make_inout_param(C_view), make_input_param(P), }; - pto2_rt_submit_task(rt, FUNC_TILE_ADD, PTO2_WORKER_VECTOR, + pto2_rt_submit_aiv_task(rt, FUNC_TILE_ADD, params_add, 2); // add } } diff --git a/examples/tensormap_and_ringbuffer/docs/INCORE_ORCHESTRATION_GUIDE.md b/examples/tensormap_and_ringbuffer/docs/INCORE_ORCHESTRATION_GUIDE.md index fbb18761..86700292 100644 --- a/examples/tensormap_and_ringbuffer/docs/INCORE_ORCHESTRATION_GUIDE.md +++ b/examples/tensormap_and_ringbuffer/docs/INCORE_ORCHESTRATION_GUIDE.md @@ -31,17 +31,28 @@ Validate `arg_count` in `aicpu_orchestration_config` and interpret pointers as d 2. Wrap orchestration in scopes with `PTO2_SCOPE(rt)` to control tensor lifetimes. 3. Use `make_tensor_external` for input/output buffers and `make_tensor` for intermediates. 4. 
Build `PTOParam` arrays with `make_input_param`, `make_output_param`, `make_inout_param`, and `make_scalar_param`. -5. Submit tasks with `pto2_rt_submit_task(rt, func_id, worker_type, params, num_params)`. +5. Submit tasks with one of: + - `pto2_rt_submit_aic_task(rt, kernel_id, params, num_params)` — AIC (CUBE) task + - `pto2_rt_submit_aiv_task(rt, kernel_id, params, num_params)` — AIV (VECTOR) task + - `pto2_rt_submit_task(rt, mixed_kernels, params, num_params)` — mixed task with a `MixedKernels` struct Dependencies are inferred by TensorMap from input/inout/output tensors, so you do not add explicit edges. -## Worker Types And Kernel IDs -- Worker types come from `pto_orchestration_api.h` (`PTO2_WORKER_CUBE`, `PTO2_WORKER_VECTOR`, etc.). +## Submit API And Kernel IDs +- Submit helpers are defined in `pto_orchestration_api.h`. +- `pto2_rt_submit_aic_task` and `pto2_rt_submit_aiv_task` are convenience wrappers around `pto2_rt_submit_task` with a `MixedKernels` struct. +- For mixed AIC+AIV tasks, construct a `MixedKernels` struct directly: + ```cpp + MixedKernels mk; + mk.aic_kernel_id = FUNC_QK; + mk.aiv0_kernel_id = FUNC_SF; + pto2_rt_submit_task(rt, mk, params, num_params); + ``` - Kernel `func_id` values are defined in `kernels/kernel_config.py` under `KERNELS`. ## Completion Semantics Do not call `pto2_rt_orchestration_done` yourself in device mode. The executor wraps the entry call in an outer scope and signals completion after `aicpu_orchestration_entry` returns. 
## Examples -- `examples/tensormap_and_ringbuffer/vector_example/kernels/orchestration/example_orchestration.cpp` -- `examples/tensormap_and_ringbuffer/bgemm/kernels/orchestration/bgemm_orch.cpp` +- `examples/tensormap_and_ringbuffer/vector_example/kernels/orchestration/example_orchestration.cpp` (AIV-only tasks) +- `examples/tensormap_and_ringbuffer/bgemm/kernels/orchestration/bgemm_orch.cpp` (mixed AIC + AIV tasks) diff --git a/examples/tensormap_and_ringbuffer/mixed_example/golden.py b/examples/tensormap_and_ringbuffer/mixed_example/golden.py new file mode 100644 index 00000000..a6412a15 --- /dev/null +++ b/examples/tensormap_and_ringbuffer/mixed_example/golden.py @@ -0,0 +1,130 @@ +""" +Golden test specification for mixed AIC+AIV example. + +Covers all 5 resource shapes per iteration: + 1. AIC_AIV_X2: C = A@B, F = D+E, I = G*H + 2. AIC_ONLY: J = A@B + 3. AIV_X1: K = D+E + 4. AIV_X2: L = D+E, M = G*H + 5. AIC_AIV_X1: N = A@B, O = D+E + +All use 128x128 float32 tiles, repeated over num_iters slices. 
+ +Args layout (30 args): + [ptr_A..ptr_O, size_A..size_O] +""" + +import ctypes +import torch + +__outputs__ = ["C", "F", "I", "J", "K", "L", "M", "N", "O"] +RTOL = 1e-3 +ATOL = 1e-3 + +ALL_CASES = { + "case1": {"num_iters": 4}, + "case2": {"num_iters": 1}, +} + +DEFAULT_CASE = "case1" + +MATMUL_SIZE = 128 +TILE_ELEMS = 128 * 128 + + +def generate_inputs(params: dict) -> list: + num_iters = params["num_iters"] + + torch.manual_seed(42) + + # Matmul inputs (shared by AIC tasks) + A = torch.randn(MATMUL_SIZE, MATMUL_SIZE, dtype=torch.float32) * 0.01 + B = torch.randn(MATMUL_SIZE, MATMUL_SIZE, dtype=torch.float32) * 0.01 + + # Add inputs (shared by AIV add tasks) + D = torch.randn(TILE_ELEMS, dtype=torch.float32) * 0.01 + E = torch.randn(TILE_ELEMS, dtype=torch.float32) * 0.01 + + # Mul inputs (shared by AIV mul tasks) + G = torch.randn(TILE_ELEMS, dtype=torch.float32) * 0.01 + H = torch.randn(TILE_ELEMS, dtype=torch.float32) * 0.01 + + # Output buffers (num_iters slices each) + C = torch.zeros(num_iters, TILE_ELEMS, dtype=torch.float32) # AIC_AIV_X2 matmul + F = torch.zeros(num_iters, TILE_ELEMS, dtype=torch.float32) # AIC_AIV_X2 add + I = torch.zeros(num_iters, TILE_ELEMS, dtype=torch.float32) # AIC_AIV_X2 mul + J = torch.zeros(num_iters, TILE_ELEMS, dtype=torch.float32) # AIC_ONLY matmul + K = torch.zeros(num_iters, TILE_ELEMS, dtype=torch.float32) # AIV_X1 add + L = torch.zeros(num_iters, TILE_ELEMS, dtype=torch.float32) # AIV_X2 add + M = torch.zeros(num_iters, TILE_ELEMS, dtype=torch.float32) # AIV_X2 mul + N = torch.zeros(num_iters, TILE_ELEMS, dtype=torch.float32) # AIC_AIV_X1 matmul + O = torch.zeros(num_iters, TILE_ELEMS, dtype=torch.float32) # AIC_AIV_X1 add + + A_flat = A.flatten() + B_flat = B.flatten() + + return [ + ("A", A_flat), + ("B", B_flat), + ("C", C.flatten()), + ("D", D), + ("E", E), + ("F", F.flatten()), + ("G", G), + ("H", H), + ("I", I.flatten()), + ("J", J.flatten()), + ("K", K.flatten()), + ("L", L.flatten()), + ("M", M.flatten()), + 
("N", N.flatten()), + ("O", O.flatten()), + ("size_A", ctypes.c_int64(A_flat.nbytes)), + ("size_B", ctypes.c_int64(B_flat.nbytes)), + ("size_C", ctypes.c_int64(C.flatten().nbytes)), + ("size_D", ctypes.c_int64(D.nbytes)), + ("size_E", ctypes.c_int64(E.nbytes)), + ("size_F", ctypes.c_int64(F.flatten().nbytes)), + ("size_G", ctypes.c_int64(G.nbytes)), + ("size_H", ctypes.c_int64(H.nbytes)), + ("size_I", ctypes.c_int64(I.flatten().nbytes)), + ("size_J", ctypes.c_int64(J.flatten().nbytes)), + ("size_K", ctypes.c_int64(K.flatten().nbytes)), + ("size_L", ctypes.c_int64(L.flatten().nbytes)), + ("size_M", ctypes.c_int64(M.flatten().nbytes)), + ("size_N", ctypes.c_int64(N.flatten().nbytes)), + ("size_O", ctypes.c_int64(O.flatten().nbytes)), + ] + + +def compute_golden(tensors: dict, params: dict) -> None: + num_iters = params["num_iters"] + + A = torch.as_tensor(tensors["A"]).reshape(MATMUL_SIZE, MATMUL_SIZE) + B = torch.as_tensor(tensors["B"]).reshape(MATMUL_SIZE, MATMUL_SIZE) + D = torch.as_tensor(tensors["D"]) + E = torch.as_tensor(tensors["E"]) + G = torch.as_tensor(tensors["G"]) + H = torch.as_tensor(tensors["H"]) + + golden_matmul = torch.matmul(A, B).flatten() + golden_add = D + E + golden_mul = G * H + + for name in ["C", "J", "N"]: + out = torch.as_tensor(tensors[name]).reshape(num_iters, TILE_ELEMS) + for i in range(num_iters): + out[i] = golden_matmul + tensors[name][:] = out.flatten() + + for name in ["F", "K", "L", "O"]: + out = torch.as_tensor(tensors[name]).reshape(num_iters, TILE_ELEMS) + for i in range(num_iters): + out[i] = golden_add + tensors[name][:] = out.flatten() + + for name in ["I", "M"]: + out = torch.as_tensor(tensors[name]).reshape(num_iters, TILE_ELEMS) + for i in range(num_iters): + out[i] = golden_mul + tensors[name][:] = out.flatten() diff --git a/examples/tensormap_and_ringbuffer/mixed_example/kernels/aic/kernel_matmul.cpp b/examples/tensormap_and_ringbuffer/mixed_example/kernels/aic/kernel_matmul.cpp new file mode 100644 index 
00000000..186abb95 --- /dev/null +++ b/examples/tensormap_and_ringbuffer/mixed_example/kernels/aic/kernel_matmul.cpp @@ -0,0 +1,126 @@ +/** + * Matrix Multiplication Kernel (Cube Core) + * + * Computes: C = A @ B (TILE x TILE x TILE matmul) + * Uses TMATMUL instruction + * + * Args (Tensor*): + * args[0] = A (INPUT) - TILE x TILE + * args[1] = B (INPUT) - TILE x TILE + * args[2] = C (OUTPUT) - TILE x TILE + */ + +#include +#include +#include +#include + +#include "tensor.h" + +using namespace pto; + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#define __aicore__ [aicore] +#endif + +template +AICORE constexpr inline T CeilAlign(T num_1, T num_2) { + if (num_2 == 0) { + return 0; + } + return (num_1 + num_2 - 1) / num_2 * num_2; +} + +static __aicore__ inline int get_num_tiles(__gm__ Tensor* tensor, uint64_t tile_elems) { + uint64_t total_elems = tensor->shapes[0]; + return static_cast(total_elems / tile_elems); +} + +template +static __aicore__ void matmul_impl( + __gm__ float* input_a, + __gm__ float* input_b, + __gm__ float* output) { + + constexpr int blockAlign = C0_SIZE_BYTE / sizeof(float); + constexpr int M = CeilAlign(TILE, 16); + constexpr int K = CeilAlign(TILE, blockAlign); + constexpr int N = CeilAlign(TILE, blockAlign); + + using GlobalDataA = GlobalTensor, + Stride<1 * TILE * TILE, 1 * TILE * TILE, TILE * TILE, TILE, 1>>; + using GlobalDataB = GlobalTensor, + Stride<1 * TILE * TILE, 1 * TILE * TILE, TILE * TILE, TILE, 1>>; + using GlobalDataC = GlobalTensor, + Stride<1 * TILE * TILE, 1 * TILE * TILE, TILE * TILE, TILE, 1>>; + + GlobalDataA src0Global(input_a); + GlobalDataB src1Global(input_b); + GlobalDataC dstGlobal(output); + + using TileMatA = Tile; + using TileMatB = Tile; + + using LeftTile = TileLeft; + using RightTile = TileRight; + using AccTile = TileAcc; + + TileMatA aMatTile; + TileMatB bMatTile; + TASSIGN(aMatTile, 0x0); + TASSIGN(bMatTile, 0x20000); + + LeftTile aTile; + RightTile bTile; + AccTile cTile; + TASSIGN(aTile, 
0x0); + TASSIGN(bTile, 0x0); + TASSIGN(cTile, 0x0); + + TLOAD(aMatTile, src0Global); + TLOAD(bMatTile, src1Global); + + set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); + + TMOV(aTile, aMatTile); + TMOV(bTile, bMatTile); + + set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); + wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); + + TMATMUL(cTile, aTile, bTile); + + set_flag(PIPE_M, PIPE_FIX, EVENT_ID0); + wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0); + + TSTORE(dstGlobal, cTile); + + set_flag(PIPE_FIX, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_FIX, PIPE_MTE2, EVENT_ID0); +} + +extern "C" __aicore__ void kernel_entry(__gm__ int64_t* args) { + __gm__ Tensor* input_a = reinterpret_cast<__gm__ Tensor*>(args[0]); + __gm__ Tensor* input_b = reinterpret_cast<__gm__ Tensor*>(args[1]); + __gm__ Tensor* output = reinterpret_cast<__gm__ Tensor*>(args[2]); + + constexpr uint64_t TILE_ELEMS = 128 * 128; + int num_tiles = get_num_tiles(input_a, TILE_ELEMS); + + __gm__ float* base_a = reinterpret_cast<__gm__ float*>(input_a->buffer.addr) + input_a->start_offset; + __gm__ float* base_b = reinterpret_cast<__gm__ float*>(input_b->buffer.addr) + input_b->start_offset; + __gm__ float* base_c = reinterpret_cast<__gm__ float*>(output->buffer.addr) + output->start_offset; + + for (int tile_idx = 0; tile_idx < num_tiles; tile_idx++) { + __gm__ float* a_ptr = base_a + (tile_idx * TILE_ELEMS); + __gm__ float* b_ptr = base_b + (tile_idx * TILE_ELEMS); + __gm__ float* c_ptr = base_c + (tile_idx * TILE_ELEMS); + + matmul_impl<128>(a_ptr, b_ptr, c_ptr); + } +} diff --git a/examples/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_add.cpp b/examples/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_add.cpp new file mode 100644 index 00000000..be9b0ebc --- /dev/null +++ b/examples/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_add.cpp @@ -0,0 +1,89 @@ +/** + * Element-wise Tensor Addition Kernel (for mixed task) + * + * Implements: out[i] = src0[i] + src1[i] 
+ * Tile size: 128 x 128 + * + * In the mixed task, this kernel shares the param list with the matmul kernel. + * Matmul uses args[0..2], this kernel uses args[3..5]. + * + * Args (Tensor*): + * args[3] = src0 (INPUT) - 128 x 128 + * args[4] = src1 (INPUT) - 128 x 128 + * args[5] = out (OUTPUT) - 128 x 128 + */ + +#include +#include + +#include "tensor.h" + +using namespace pto; + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#define __aicore__ [aicore] +#endif + +static __aicore__ inline int get_num_tiles(__gm__ Tensor* tensor, uint64_t tile_elems) { + uint64_t total_elems = tensor->shapes[0]; + return static_cast(total_elems / tile_elems); +} + +template +static __aicore__ void add_impl( + __gm__ float* src0, + __gm__ float* src1, + __gm__ float* out) { + + using DynShapeDim5 = Shape<1, 1, 1, ROWS, COLS>; + using DynStridDim5 = Stride<1, 1, 1, COLS, 1>; + using GlobalData = GlobalTensor; + using TileData = Tile; + + TileData src0Tile(ROWS, COLS); + TileData src1Tile(ROWS, COLS); + TileData dstTile(ROWS, COLS); + TASSIGN(src0Tile, 0x0); + TASSIGN(src1Tile, 0x10000); + TASSIGN(dstTile, 0x20000); + + GlobalData src0Global(src0); + GlobalData src1Global(src1); + GlobalData dstGlobal(out); + + TLOAD(src0Tile, src0Global); + TLOAD(src1Tile, src1Global); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TADD(dstTile, src0Tile, src1Tile); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(dstGlobal, dstTile); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); +} + +extern "C" __aicore__ void kernel_entry(__gm__ int64_t* args) { + __gm__ Tensor* src0_tensor = reinterpret_cast<__gm__ Tensor*>(args[3]); + __gm__ Tensor* src1_tensor = reinterpret_cast<__gm__ Tensor*>(args[4]); + __gm__ Tensor* out_tensor = reinterpret_cast<__gm__ Tensor*>(args[5]); + + constexpr uint64_t TILE_ELEMS = 128 * 128; + int num_tiles = get_num_tiles(src0_tensor, 
TILE_ELEMS); + + __gm__ float* base_src0 = reinterpret_cast<__gm__ float*>(src0_tensor->buffer.addr) + src0_tensor->start_offset; + __gm__ float* base_src1 = reinterpret_cast<__gm__ float*>(src1_tensor->buffer.addr) + src1_tensor->start_offset; + __gm__ float* base_out = reinterpret_cast<__gm__ float*>(out_tensor->buffer.addr) + out_tensor->start_offset; + + for (int tile_idx = 0; tile_idx < num_tiles; tile_idx++) { + __gm__ float* src0_ptr = base_src0 + (tile_idx * TILE_ELEMS); + __gm__ float* src1_ptr = base_src1 + (tile_idx * TILE_ELEMS); + __gm__ float* out_ptr = base_out + (tile_idx * TILE_ELEMS); + + add_impl<128, 128>(src0_ptr, src1_ptr, out_ptr); + } +} diff --git a/examples/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_add_standalone.cpp b/examples/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_add_standalone.cpp new file mode 100644 index 00000000..4475907e --- /dev/null +++ b/examples/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_add_standalone.cpp @@ -0,0 +1,74 @@ +/** + * Standalone Element-wise Addition Kernel + * + * Implements: out[i] = src0[i] + src1[i] + * Tile size: 128 x 128 + * + * Reads args[0..2] — for standalone AIV_X1 tasks or AIV0 slot in AIV_X2. 
+ * + * Args (Tensor*): + * args[0] = src0 (INPUT) - 128 x 128 + * args[1] = src1 (INPUT) - 128 x 128 + * args[2] = out (OUTPUT) - 128 x 128 + */ + +#include +#include + +#include "tensor.h" + +using namespace pto; + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#define __aicore__ [aicore] +#endif + +template +static __aicore__ void add_impl( + __gm__ float* src0, + __gm__ float* src1, + __gm__ float* out) { + + using DynShapeDim5 = Shape<1, 1, 1, ROWS, COLS>; + using DynStridDim5 = Stride<1, 1, 1, COLS, 1>; + using GlobalData = GlobalTensor; + using TileData = Tile; + + TileData src0Tile(ROWS, COLS); + TileData src1Tile(ROWS, COLS); + TileData dstTile(ROWS, COLS); + TASSIGN(src0Tile, 0x0); + TASSIGN(src1Tile, 0x10000); + TASSIGN(dstTile, 0x20000); + + GlobalData src0Global(src0); + GlobalData src1Global(src1); + GlobalData dstGlobal(out); + + TLOAD(src0Tile, src0Global); + TLOAD(src1Tile, src1Global); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TADD(dstTile, src0Tile, src1Tile); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(dstGlobal, dstTile); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); +} + +extern "C" __aicore__ void kernel_entry(__gm__ int64_t* args) { + __gm__ Tensor* src0_tensor = reinterpret_cast<__gm__ Tensor*>(args[0]); + __gm__ Tensor* src1_tensor = reinterpret_cast<__gm__ Tensor*>(args[1]); + __gm__ Tensor* out_tensor = reinterpret_cast<__gm__ Tensor*>(args[2]); + + __gm__ float* src0 = reinterpret_cast<__gm__ float*>(src0_tensor->buffer.addr) + src0_tensor->start_offset; + __gm__ float* src1 = reinterpret_cast<__gm__ float*>(src1_tensor->buffer.addr) + src1_tensor->start_offset; + __gm__ float* out = reinterpret_cast<__gm__ float*>(out_tensor->buffer.addr) + out_tensor->start_offset; + + add_impl<128, 128>(src0, src1, out); +} diff --git 
a/examples/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_mul.cpp b/examples/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_mul.cpp new file mode 100644 index 00000000..d5117419 --- /dev/null +++ b/examples/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_mul.cpp @@ -0,0 +1,90 @@ +/** + * Element-wise Tensor Multiplication Kernel (for mixed task, AIV1 slot) + * + * Implements: out[i] = src0[i] * src1[i] + * Tile size: 128 x 128 + * + * In the mixed task, this kernel occupies the AIV1 slot and shares the param + * list with the matmul kernel (args[0..2]) and add kernel (args[3..5]). + * This kernel uses args[6..8]. + * + * Args (Tensor*): + * args[6] = src0 (INPUT) - 128 x 128 + * args[7] = src1 (INPUT) - 128 x 128 + * args[8] = out (OUTPUT) - 128 x 128 + */ + +#include +#include + +#include "tensor.h" + +using namespace pto; + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#define __aicore__ [aicore] +#endif + +static __aicore__ inline int get_num_tiles(__gm__ Tensor* tensor, uint64_t tile_elems) { + uint64_t total_elems = tensor->shapes[0]; + return static_cast(total_elems / tile_elems); +} + +template +static __aicore__ void mul_impl( + __gm__ float* src0, + __gm__ float* src1, + __gm__ float* out) { + + using DynShapeDim5 = Shape<1, 1, 1, ROWS, COLS>; + using DynStridDim5 = Stride<1, 1, 1, COLS, 1>; + using GlobalData = GlobalTensor; + using TileData = Tile; + + TileData src0Tile(ROWS, COLS); + TileData src1Tile(ROWS, COLS); + TileData dstTile(ROWS, COLS); + TASSIGN(src0Tile, 0x0); + TASSIGN(src1Tile, 0x10000); + TASSIGN(dstTile, 0x20000); + + GlobalData src0Global(src0); + GlobalData src1Global(src1); + GlobalData dstGlobal(out); + + TLOAD(src0Tile, src0Global); + TLOAD(src1Tile, src1Global); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TMUL(dstTile, src0Tile, src1Tile); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(dstGlobal, 
dstTile); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); +} + +extern "C" __aicore__ void kernel_entry(__gm__ int64_t* args) { + __gm__ Tensor* src0_tensor = reinterpret_cast<__gm__ Tensor*>(args[6]); + __gm__ Tensor* src1_tensor = reinterpret_cast<__gm__ Tensor*>(args[7]); + __gm__ Tensor* out_tensor = reinterpret_cast<__gm__ Tensor*>(args[8]); + + constexpr uint64_t TILE_ELEMS = 128 * 128; + int num_tiles = get_num_tiles(src0_tensor, TILE_ELEMS); + + __gm__ float* base_src0 = reinterpret_cast<__gm__ float*>(src0_tensor->buffer.addr) + src0_tensor->start_offset; + __gm__ float* base_src1 = reinterpret_cast<__gm__ float*>(src1_tensor->buffer.addr) + src1_tensor->start_offset; + __gm__ float* base_out = reinterpret_cast<__gm__ float*>(out_tensor->buffer.addr) + out_tensor->start_offset; + + for (int tile_idx = 0; tile_idx < num_tiles; tile_idx++) { + __gm__ float* src0_ptr = base_src0 + (tile_idx * TILE_ELEMS); + __gm__ float* src1_ptr = base_src1 + (tile_idx * TILE_ELEMS); + __gm__ float* out_ptr = base_out + (tile_idx * TILE_ELEMS); + + mul_impl<128, 128>(src0_ptr, src1_ptr, out_ptr); + } +} diff --git a/examples/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_mul_standalone.cpp b/examples/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_mul_standalone.cpp new file mode 100644 index 00000000..3b44b721 --- /dev/null +++ b/examples/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_mul_standalone.cpp @@ -0,0 +1,74 @@ +/** + * Standalone Element-wise Multiplication Kernel (AIV1 slot) + * + * Implements: out[i] = src0[i] * src1[i] + * Tile size: 128 x 128 + * + * Reads args[3..5] — for AIV1 slot in AIV_X2 tasks where AIV0 uses args[0..2]. 
+ * + * Args (Tensor*): + * args[3] = src0 (INPUT) - 128 x 128 + * args[4] = src1 (INPUT) - 128 x 128 + * args[5] = out (OUTPUT) - 128 x 128 + */ + +#include +#include + +#include "tensor.h" + +using namespace pto; + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#define __aicore__ [aicore] +#endif + +template +static __aicore__ void mul_impl( + __gm__ float* src0, + __gm__ float* src1, + __gm__ float* out) { + + using DynShapeDim5 = Shape<1, 1, 1, ROWS, COLS>; + using DynStridDim5 = Stride<1, 1, 1, COLS, 1>; + using GlobalData = GlobalTensor; + using TileData = Tile; + + TileData src0Tile(ROWS, COLS); + TileData src1Tile(ROWS, COLS); + TileData dstTile(ROWS, COLS); + TASSIGN(src0Tile, 0x0); + TASSIGN(src1Tile, 0x10000); + TASSIGN(dstTile, 0x20000); + + GlobalData src0Global(src0); + GlobalData src1Global(src1); + GlobalData dstGlobal(out); + + TLOAD(src0Tile, src0Global); + TLOAD(src1Tile, src1Global); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TMUL(dstTile, src0Tile, src1Tile); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(dstGlobal, dstTile); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); +} + +extern "C" __aicore__ void kernel_entry(__gm__ int64_t* args) { + __gm__ Tensor* src0_tensor = reinterpret_cast<__gm__ Tensor*>(args[3]); + __gm__ Tensor* src1_tensor = reinterpret_cast<__gm__ Tensor*>(args[4]); + __gm__ Tensor* out_tensor = reinterpret_cast<__gm__ Tensor*>(args[5]); + + __gm__ float* src0 = reinterpret_cast<__gm__ float*>(src0_tensor->buffer.addr) + src0_tensor->start_offset; + __gm__ float* src1 = reinterpret_cast<__gm__ float*>(src1_tensor->buffer.addr) + src1_tensor->start_offset; + __gm__ float* out = reinterpret_cast<__gm__ float*>(out_tensor->buffer.addr) + out_tensor->start_offset; + + mul_impl<128, 128>(src0, src1, out); +} diff --git 
a/examples/tensormap_and_ringbuffer/mixed_example/kernels/kernel_config.py b/examples/tensormap_and_ringbuffer/mixed_example/kernels/kernel_config.py new file mode 100644 index 00000000..4637f3ce --- /dev/null +++ b/examples/tensormap_and_ringbuffer/mixed_example/kernels/kernel_config.py @@ -0,0 +1,58 @@ +""" +Kernel configuration for mixed AIC+AIV example (tensormap_and_ringbuffer Runtime). + +Covers all 5 resource shapes: + - AIC_ONLY: standalone matmul + - AIV_X1: standalone add + - AIV_X2: add (AIV0) + mul (AIV1) + - AIC_AIV_X1: matmul (AIC) + add (AIV0) + - AIC_AIV_X2: matmul (AIC) + add (AIV0) + mul (AIV1) +""" + +from pathlib import Path + +_KERNELS_ROOT = Path(__file__).parent + +ORCHESTRATION = { + "source": str(_KERNELS_ROOT / "orchestration" / "mixed_orch.cpp"), + "function_name": "aicpu_orchestration_entry", +} + +KERNELS = [ + { + "func_id": 0, + "name": "MATMUL", + "source": str(_KERNELS_ROOT / "aic" / "kernel_matmul.cpp"), + "core_type": "aic", + }, + { + "func_id": 1, + "name": "ADD", + "source": str(_KERNELS_ROOT / "aiv" / "kernel_add.cpp"), + "core_type": "aiv", + }, + { + "func_id": 2, + "name": "MUL", + "source": str(_KERNELS_ROOT / "aiv" / "kernel_mul.cpp"), + "core_type": "aiv", + }, + { + "func_id": 3, + "name": "ADD_STANDALONE", + "source": str(_KERNELS_ROOT / "aiv" / "kernel_add_standalone.cpp"), + "core_type": "aiv", + }, + { + "func_id": 4, + "name": "MUL_STANDALONE", + "source": str(_KERNELS_ROOT / "aiv" / "kernel_mul_standalone.cpp"), + "core_type": "aiv", + }, +] + +RUNTIME_CONFIG = { + "runtime": "tensormap_and_ringbuffer", + "aicpu_thread_num": 4, + "block_dim": 3, +} diff --git a/examples/tensormap_and_ringbuffer/mixed_example/kernels/orchestration/mixed_orch.cpp b/examples/tensormap_and_ringbuffer/mixed_example/kernels/orchestration/mixed_orch.cpp new file mode 100644 index 00000000..a97753ec --- /dev/null +++ b/examples/tensormap_and_ringbuffer/mixed_example/kernels/orchestration/mixed_orch.cpp @@ -0,0 +1,221 @@ +/** + * Mixed 
AIC+AIV Orchestration Function (tensormap_and_ringbuffer Runtime) + * + * Covers all 5 resource shapes per iteration: + * 1. AIC_AIV_X2: AIC matmul(A,B->C) + AIV0 add(D,E->F) + AIV1 mul(G,H->I) + * 2. AIC_ONLY: matmul(A,B->J) + * 3. AIV_X1: add(D,E->K) + * 4. AIV_X2: AIV0 add(D,E->L) + AIV1 mul(G,H->M) + * 5. AIC_AIV_X1: AIC matmul(A,B->N) + AIV0 add(D,E->O) + * + * Args layout (30 args): + * [ptr_A, ptr_B, ptr_C, ptr_D, ptr_E, ptr_F, + * ptr_G, ptr_H, ptr_I, ptr_J, ptr_K, ptr_L, + * ptr_M, ptr_N, ptr_O, + * size_A, size_B, size_C, size_D, size_E, size_F, + * size_G, size_H, size_I, size_J, size_K, size_L, + * size_M, size_N, size_O] + */ + +#include +#include + +#include "pto_orchestration_api.h" + +// Mixed-task kernels (args offset matches param position in mixed param list) +#define FUNC_MATMUL 0 // AIC: reads args[0..2] +#define FUNC_ADD 1 // AIV0 in mixed: reads args[3..5] +#define FUNC_MUL 2 // AIV1 in mixed: reads args[6..8] +// Standalone kernels (read args[0..2] or args[3..5]) +#define FUNC_ADD_STANDALONE 3 // AIV: reads args[0..2] +#define FUNC_MUL_STANDALONE 4 // AIV1 in AIV_X2: reads args[3..5] + +#define ARG_PTR_A 0 +#define ARG_PTR_B 1 +#define ARG_PTR_C 2 +#define ARG_PTR_D 3 +#define ARG_PTR_E 4 +#define ARG_PTR_F 5 +#define ARG_PTR_G 6 +#define ARG_PTR_H 7 +#define ARG_PTR_I 8 +#define ARG_PTR_J 9 +#define ARG_PTR_K 10 +#define ARG_PTR_L 11 +#define ARG_PTR_M 12 +#define ARG_PTR_N 13 +#define ARG_PTR_O 14 +#define ARG_SIZE_A 15 +#define ARG_SIZE_B 16 +#define ARG_SIZE_C 17 +#define ARG_SIZE_D 18 +#define ARG_SIZE_E 19 +#define ARG_SIZE_F 20 +#define ARG_SIZE_G 21 +#define ARG_SIZE_H 22 +#define ARG_SIZE_I 23 +#define ARG_SIZE_J 24 +#define ARG_SIZE_K 25 +#define ARG_SIZE_L 26 +#define ARG_SIZE_M 27 +#define ARG_SIZE_N 28 +#define ARG_SIZE_O 29 + +static constexpr uint64_t TILE_ELEMS = 128 * 128; + +extern "C" { + +__attribute__((visibility("default"))) +PTO2OrchestrationConfig aicpu_orchestration_config(uint64_t* args, int arg_count) { + 
(void)args; + (void)arg_count; + return PTO2OrchestrationConfig{ + .expected_arg_count = 30, + }; +} + +__attribute__((visibility("default"))) +void aicpu_orchestration_entry(PTO2Runtime* rt, uint64_t* args, int arg_count, int orch_thread_num, int orch_thread_index) { + (void)arg_count; + (void)orch_thread_num; + (void)orch_thread_index; + + void* dev_A = (void*)(uintptr_t)args[ARG_PTR_A]; + void* dev_B = (void*)(uintptr_t)args[ARG_PTR_B]; + void* dev_C = (void*)(uintptr_t)args[ARG_PTR_C]; + void* dev_D = (void*)(uintptr_t)args[ARG_PTR_D]; + void* dev_E = (void*)(uintptr_t)args[ARG_PTR_E]; + void* dev_F = (void*)(uintptr_t)args[ARG_PTR_F]; + void* dev_G = (void*)(uintptr_t)args[ARG_PTR_G]; + void* dev_H = (void*)(uintptr_t)args[ARG_PTR_H]; + void* dev_I = (void*)(uintptr_t)args[ARG_PTR_I]; + void* dev_J = (void*)(uintptr_t)args[ARG_PTR_J]; + void* dev_K = (void*)(uintptr_t)args[ARG_PTR_K]; + void* dev_L = (void*)(uintptr_t)args[ARG_PTR_L]; + void* dev_M = (void*)(uintptr_t)args[ARG_PTR_M]; + void* dev_N = (void*)(uintptr_t)args[ARG_PTR_N]; + void* dev_O = (void*)(uintptr_t)args[ARG_PTR_O]; + size_t size_C = (size_t)args[ARG_SIZE_C]; + + int num_iters = (int)(size_C / (TILE_ELEMS * sizeof(float))); + + LOG_INFO(rt, "[mixed_orch] num_iters=%d", num_iters); + + // Input tensors (shared across all tasks) + uint64_t ab_shapes[1] = {TILE_ELEMS}; + Tensor ext_A = make_tensor_external(dev_A, ab_shapes, 1, DataType::FLOAT32); + Tensor ext_B = make_tensor_external(dev_B, ab_shapes, 1, DataType::FLOAT32); + + uint64_t de_shapes[1] = {TILE_ELEMS}; + Tensor ext_D = make_tensor_external(dev_D, de_shapes, 1, DataType::FLOAT32); + Tensor ext_E = make_tensor_external(dev_E, de_shapes, 1, DataType::FLOAT32); + + uint64_t gh_shapes[1] = {TILE_ELEMS}; + Tensor ext_G = make_tensor_external(dev_G, gh_shapes, 1, DataType::FLOAT32); + Tensor ext_H = make_tensor_external(dev_H, gh_shapes, 1, DataType::FLOAT32); + + // Output tensors (full buffers, one slice per iteration) + uint64_t 
out_shapes[1] = {(uint64_t)num_iters * TILE_ELEMS}; + Tensor ext_C = make_tensor_external(dev_C, out_shapes, 1, DataType::FLOAT32); + Tensor ext_F = make_tensor_external(dev_F, out_shapes, 1, DataType::FLOAT32); + Tensor ext_I = make_tensor_external(dev_I, out_shapes, 1, DataType::FLOAT32); + Tensor ext_J = make_tensor_external(dev_J, out_shapes, 1, DataType::FLOAT32); + Tensor ext_K = make_tensor_external(dev_K, out_shapes, 1, DataType::FLOAT32); + Tensor ext_L = make_tensor_external(dev_L, out_shapes, 1, DataType::FLOAT32); + Tensor ext_M = make_tensor_external(dev_M, out_shapes, 1, DataType::FLOAT32); + Tensor ext_N = make_tensor_external(dev_N, out_shapes, 1, DataType::FLOAT32); + Tensor ext_O = make_tensor_external(dev_O, out_shapes, 1, DataType::FLOAT32); + + for (int i = 0; i < num_iters; i++) { + PTO2_SCOPE(rt) { + uint64_t view_shapes[1] = {TILE_ELEMS}; + uint64_t view_offsets[1] = {(uint64_t)i * TILE_ELEMS}; + + Tensor C_view = ext_C.view(view_shapes, view_offsets); + Tensor F_view = ext_F.view(view_shapes, view_offsets); + Tensor I_view = ext_I.view(view_shapes, view_offsets); + Tensor J_view = ext_J.view(view_shapes, view_offsets); + Tensor K_view = ext_K.view(view_shapes, view_offsets); + Tensor L_view = ext_L.view(view_shapes, view_offsets); + Tensor M_view = ext_M.view(view_shapes, view_offsets); + Tensor N_view = ext_N.view(view_shapes, view_offsets); + Tensor O_view = ext_O.view(view_shapes, view_offsets); + + // 1. AIC_AIV_X2: matmul + add + mul + { + MixedKernels mk; + mk.aic_kernel_id = FUNC_MATMUL; + mk.aiv0_kernel_id = FUNC_ADD; + mk.aiv1_kernel_id = FUNC_MUL; + PTOParam params[9] = { + make_input_param(ext_A), + make_input_param(ext_B), + make_output_param(C_view), + make_input_param(ext_D), + make_input_param(ext_E), + make_output_param(F_view), + make_input_param(ext_G), + make_input_param(ext_H), + make_output_param(I_view), + }; + pto2_rt_submit_task(rt, mk, params, 9); + } + + // 2. 
AIC_ONLY: standalone matmul + { + PTOParam params[3] = { + make_input_param(ext_A), + make_input_param(ext_B), + make_output_param(J_view), + }; + pto2_rt_submit_aic_task(rt, FUNC_MATMUL, params, 3); + } + + // 3. AIV_X1: standalone add + { + PTOParam params[3] = { + make_input_param(ext_D), + make_input_param(ext_E), + make_output_param(K_view), + }; + pto2_rt_submit_aiv_task(rt, FUNC_ADD_STANDALONE, params, 3); + } + + // 4. AIV_X2: add (AIV0) + mul (AIV1) + { + MixedKernels mk; + mk.aiv0_kernel_id = FUNC_ADD_STANDALONE; + mk.aiv1_kernel_id = FUNC_MUL_STANDALONE; + PTOParam params[6] = { + make_input_param(ext_D), + make_input_param(ext_E), + make_output_param(L_view), + make_input_param(ext_G), + make_input_param(ext_H), + make_output_param(M_view), + }; + pto2_rt_submit_task(rt, mk, params, 6); + } + + // 5. AIC_AIV_X1: matmul (AIC) + add (AIV0) + { + MixedKernels mk; + mk.aic_kernel_id = FUNC_MATMUL; + mk.aiv0_kernel_id = FUNC_ADD; + PTOParam params[6] = { + make_input_param(ext_A), + make_input_param(ext_B), + make_output_param(N_view), + make_input_param(ext_D), + make_input_param(ext_E), + make_output_param(O_view), + }; + pto2_rt_submit_task(rt, mk, params, 6); + } + } + } + + LOG_INFO(rt, "[mixed_orch] Submitted %d iterations x 5 shapes = %d tasks", num_iters, num_iters * 5); +} + +} // extern "C" diff --git a/examples/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp b/examples/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp index 9184031e..9bc691a9 100644 --- a/examples/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp +++ b/examples/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp @@ -141,7 +141,7 @@ void aicpu_orchestration_entry(PTO2Runtime* rt, uint64_t* args, int arg_count, i make_output_param(li_update), make_output_param(mi_update), }; - pto2_rt_submit_task(rt, FUNC_AIV_HUB, PTO2_WORKER_VECTOR, 
params_inplace, 3); // create_inplace + pto2_rt_submit_aiv_task(rt, FUNC_AIV_HUB, params_inplace, 3); // create_inplace for (uint64_t bn = 0; bn < bn_this_batch; bn++) { uint64_t cur_block_idx = host_block_table[b_idx * block_num + bn]; @@ -160,7 +160,7 @@ void aicpu_orchestration_entry(PTO2Runtime* rt, uint64_t* args, int arg_count, i make_input_param(kj), make_output_param(sij), }; - pto2_rt_submit_task(rt, FUNC_QK_MATMUL, PTO2_WORKER_CUBE, params_qk, 3); // c1 + pto2_rt_submit_aic_task(rt, FUNC_QK_MATMUL, params_qk, 3); // c1 uint64_t sij_valid_shapes[2] = {q_tile, valid_len}; uint64_t sij_valid_offsets[2] = {0, 0}; @@ -174,7 +174,7 @@ void aicpu_orchestration_entry(PTO2Runtime* rt, uint64_t* args, int arg_count, i make_output_param(mi), make_output_param(li), }; - pto2_rt_submit_task(rt, FUNC_SOFTMAX_PREPARE, PTO2_WORKER_VECTOR, params_sf, 5); // v1 + pto2_rt_submit_aiv_task(rt, FUNC_SOFTMAX_PREPARE, params_sf, 5); // v1 uint64_t oi_tmp_shapes[2] = {q_tile, head_dim}; Tensor oi_tmp = make_tensor(oi_tmp_shapes, 2, DataType::FLOAT32); @@ -184,7 +184,7 @@ void aicpu_orchestration_entry(PTO2Runtime* rt, uint64_t* args, int arg_count, i make_input_param(vj), make_output_param(oi_tmp), }; - pto2_rt_submit_task(rt, FUNC_PV_MATMUL, PTO2_WORKER_CUBE, params_pv, 3); // c2 + pto2_rt_submit_aic_task(rt, FUNC_PV_MATMUL, params_pv, 3); // c2 uint64_t is_first = (bn == 0) ? 1 : 0; uint64_t is_last = (bn == bn_this_batch - 1) ? 
1 : 0; @@ -200,7 +200,7 @@ void aicpu_orchestration_entry(PTO2Runtime* rt, uint64_t* args, int arg_count, i make_scalar_param(is_first), make_scalar_param(is_last), }; - pto2_rt_submit_task(rt, FUNC_ONLINE_UPDATE, PTO2_WORKER_VECTOR, params_up, 9); // v2 + pto2_rt_submit_aiv_task(rt, FUNC_ONLINE_UPDATE, params_up, 9); // v2 } } } diff --git a/examples/tensormap_and_ringbuffer/vector_example/kernels/orchestration/example_orchestration.cpp b/examples/tensormap_and_ringbuffer/vector_example/kernels/orchestration/example_orchestration.cpp index 4e6df402..c55eccf5 100644 --- a/examples/tensormap_and_ringbuffer/vector_example/kernels/orchestration/example_orchestration.cpp +++ b/examples/tensormap_and_ringbuffer/vector_example/kernels/orchestration/example_orchestration.cpp @@ -107,7 +107,7 @@ void aicpu_orchestration_entry(PTO2Runtime* rt, uint64_t* args, int arg_count, i make_input_param(ext_b), make_output_param(c), }; - pto2_rt_submit_task(rt, 0, PTO2_WORKER_VECTOR, params_t0, 3); // kernel_add + pto2_rt_submit_aiv_task(rt, 0, params_t0, 3); // kernel_add // Inner scope: owns t1, t2, t3, t4; intermediates d, e, g release on scope end. // c flows in from outer scope (outer-scope tensors are visible to inner scopes). 
@@ -123,7 +123,7 @@ void aicpu_orchestration_entry(PTO2Runtime* rt, uint64_t* args, int arg_count, i make_output_param(d), make_scalar_param((uint64_t)3), }; - pto2_rt_submit_task(rt, 1, PTO2_WORKER_VECTOR, params_t1, 3); // kernel_add_scalar + pto2_rt_submit_aiv_task(rt, 1, params_t1, 3); // kernel_add_scalar // t2: e = c + 2 (kernel_id=1, kernel_add_scalar) PTOParam params_t2[] = { @@ -132,7 +132,7 @@ void aicpu_orchestration_entry(PTO2Runtime* rt, uint64_t* args, int arg_count, i make_output_param(e), make_scalar_param((uint64_t)3), }; - pto2_rt_submit_task(rt, 1, PTO2_WORKER_VECTOR, params_t2, 3); // kernel_add_scalar + pto2_rt_submit_aiv_task(rt, 1, params_t2, 3); // kernel_add_scalar // t3: g = d * e (kernel_id=2, kernel_mul) PTOParam params_t3[] = { @@ -141,7 +141,7 @@ void aicpu_orchestration_entry(PTO2Runtime* rt, uint64_t* args, int arg_count, i make_output_param(g), make_scalar_param((uint64_t)3), }; - pto2_rt_submit_task(rt, 2, PTO2_WORKER_VECTOR, params_t3, 3); // kernel_mul + pto2_rt_submit_aiv_task(rt, 2, params_t3, 3); // kernel_mul // t4: f = g + c (kernel_id=0, kernel_add) PTOParam params_t4[] = { @@ -149,7 +149,7 @@ void aicpu_orchestration_entry(PTO2Runtime* rt, uint64_t* args, int arg_count, i make_input_param(c), make_output_param(ext_f), }; - pto2_rt_submit_task(rt, 0, PTO2_WORKER_VECTOR, params_t4, 3); // kernel_add + pto2_rt_submit_aiv_task(rt, 0, params_t4, 3); // kernel_add } // inner scope ends: releases d, e, g } diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/aicore/aicore_executor.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/aicore/aicore_executor.cpp index c687086b..b9057a99 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/aicore/aicore_executor.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/aicore/aicore_executor.cpp @@ -106,7 +106,7 @@ __aicore__ __attribute__((weak)) void aicore_execute(__gm__ Runtime* runtime, in __gm__ PTO2DispatchPayload* payload = my_payload; - write_reg(RegId::COND, 
MAKE_ACK_VALUE(payload->task_id)); + write_reg(RegId::COND, MAKE_ACK_VALUE(payload->mixed_task_id)); // Performance profiling: record start time uint64_t start_time = 0; @@ -121,13 +121,13 @@ __aicore__ __attribute__((weak)) void aicore_execute(__gm__ Runtime* runtime, in if (profiling_enabled) { uint64_t end_time = get_sys_cnt_aicore(); __gm__ PerfBuffer* perf_buf = (__gm__ PerfBuffer*)my_hank->perf_records_addr; - perf_aicore_record_task(perf_buf, payload->task_id, payload->kernel_id, + perf_aicore_record_task(perf_buf, payload->mixed_task_id, payload->kernel_id, start_time, end_time, kernel_ready_time, core_type); } last_task_id = task_id; - write_reg(RegId::COND, MAKE_FIN_VALUE(payload->task_id)); + write_reg(RegId::COND, MAKE_FIN_VALUE(payload->mixed_task_id)); } } diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp index 89f13b47..69ddda44 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp @@ -98,16 +98,57 @@ struct CoreTypeTracker { idle[idle_count++] = running[idx]; running[idx] = running[--running_count]; } + + int32_t find_idle_index(int32_t core_id) { + for (int32_t i = 0; i < idle_count; i++) { + if (idle[i] == core_id) return i; + } + return -1; + } +}; + +struct Cluster { + int32_t aic_core_id; + int32_t aiv_core_ids[2]; }; struct CoreStateTracker { CoreTypeTracker by_type[2]; // indexed by static_cast(CoreType) + Cluster clusters[MAX_AIC_PER_THREAD]; + int32_t cluster_count; + bool core_idle[MAX_CORES_PER_THREAD]; CoreTypeTracker& aic() { return by_type[0]; } CoreTypeTracker& aiv() { return by_type[1]; } template CoreTypeTracker& get() { return by_type[static_cast(CT)]; } + + int32_t find_cluster_for_shape(PTO2ResourceShape shape) { + for (int32_t i = 0; i < cluster_count; i++) { + Cluster& c = clusters[i]; + switch (shape) { + case 
PTO2ResourceShape::AIC_ONLY: + if (core_idle[c.aic_core_id]) return i; + break; + case PTO2ResourceShape::AIV_X1: + if (core_idle[c.aiv_core_ids[0]] || core_idle[c.aiv_core_ids[1]]) return i; + break; + case PTO2ResourceShape::AIV_X2: + if (core_idle[c.aiv_core_ids[0]] && core_idle[c.aiv_core_ids[1]]) return i; + break; + case PTO2ResourceShape::AIC_AIV_X1: + if (core_idle[c.aic_core_id] && + (core_idle[c.aiv_core_ids[0]] || core_idle[c.aiv_core_ids[1]])) return i; + break; + case PTO2ResourceShape::AIC_AIV_X2: + if (core_idle[c.aic_core_id] && + core_idle[c.aiv_core_ids[0]] && core_idle[c.aiv_core_ids[1]]) return i; + break; + } + } + return -1; + } }; struct AicpuExecutor { @@ -188,16 +229,19 @@ struct AicpuExecutor { void diagnose_stuck_state( Runtime* runtime, int32_t thread_idx, const int32_t* cur_thread_cores, int32_t core_num, Handshake* hank); - // Build PTO2DispatchPayload from PTO2TaskDescriptor. - template + // Build PTO2DispatchPayload from PTO2TaskDescriptor for a specific subtask slot. 
void build_pto2_payload(PTO2DispatchPayload* out, Runtime* runtime, PTO2TaskDescriptor* task, - PTO2TaskPayload* task_payload) { - out->task_id = task->task_id; - out->kernel_id = task->kernel_id; - out->core_type = CT; - out->function_bin_addr = runtime->get_function_bin_addr(task->kernel_id); + PTO2TaskPayload* task_payload, + PTO2SubtaskSlot subslot, + CoreType core_type) { + int32_t slot_idx = static_cast(subslot); + out->mixed_task_id = task->mixed_task_id; + out->subslot = subslot; + out->kernel_id = task->kernel_id[slot_idx]; + out->core_type = core_type; + out->function_bin_addr = runtime->get_function_bin_addr(task->kernel_id[slot_idx]); int32_t n = 0; for (int32_t i = 0; i < task_payload->param_count; i++) { @@ -216,6 +260,7 @@ struct AicpuExecutor { template void check_running_cores_for_completion(int32_t thread_idx, CoreTypeTracker& ct, + bool* core_idle, Handshake* hank, int32_t* executing_task_ids, int32_t& completed_this_turn, @@ -261,43 +306,49 @@ struct AicpuExecutor { if (done) { executing_task_ids[core_id] = AICPU_TASK_INVALID; -#if PTO2_SCHED_PROFILING PTO2DispatchPayload* payload = &s_pto2_payload_per_core[core_id]; - PTO2CompletionStats cstats = rt->scheduler.on_task_complete(task_id, thread_idx, local_bufs); - notify_edges_total += cstats.fanout_edges; - if (cstats.fanout_edges > notify_max_degree) notify_max_degree = cstats.fanout_edges; - notify_tasks_enqueued += cstats.tasks_enqueued; - phase_complete_count++; + int32_t mixed_task_id = payload->mixed_task_id; + PTO2SubtaskSlot subslot = payload->subslot; + + // Two-stage completion: mark subtask done, then handle mixed-task completion + bool mixed_complete = rt->scheduler.on_subtask_complete(mixed_task_id, subslot); + if (mixed_complete) { +#if PTO2_SCHED_PROFILING + PTO2CompletionStats cstats = rt->scheduler.on_mixed_task_complete(mixed_task_id, thread_idx, local_bufs); + notify_edges_total += cstats.fanout_edges; + if (cstats.fanout_edges > notify_max_degree) notify_max_degree = 
cstats.fanout_edges; + notify_tasks_enqueued += cstats.tasks_enqueued; + phase_complete_count++; #elif PTO2_PROFILING - PTO2DispatchPayload* payload = &s_pto2_payload_per_core[core_id]; - PTO2CompletionStats cstats = rt->scheduler.on_task_complete(task_id, local_bufs); - notify_edges_total += cstats.fanout_edges; - if (cstats.fanout_edges > notify_max_degree) notify_max_degree = cstats.fanout_edges; - notify_tasks_enqueued += cstats.tasks_enqueued; - phase_complete_count++; + PTO2CompletionStats cstats = rt->scheduler.on_mixed_task_complete(mixed_task_id, local_bufs); + notify_edges_total += cstats.fanout_edges; + if (cstats.fanout_edges > notify_max_degree) notify_max_degree = cstats.fanout_edges; + notify_tasks_enqueued += cstats.tasks_enqueued; + phase_complete_count++; #else - rt->scheduler.on_task_complete(task_id, local_bufs); + rt->scheduler.on_mixed_task_complete(mixed_task_id, local_bufs); #endif - if (deferred_release_count < 64) { - deferred_release_ids[deferred_release_count++] = task_id; - } else { - DEV_ALWAYS("Thread %d: release", thread_idx); - while (deferred_release_count > 0) { + if (deferred_release_count < 64) { + deferred_release_ids[deferred_release_count++] = mixed_task_id; + } else { + DEV_ALWAYS("Thread %d: release", thread_idx); + while (deferred_release_count > 0) { #if PTO2_SCHED_PROFILING - int32_t fe = - rt->scheduler.on_task_release(deferred_release_ids[--deferred_release_count], thread_idx); + int32_t fe = + rt->scheduler.on_task_release(deferred_release_ids[--deferred_release_count], thread_idx); #else - int32_t fe = rt->scheduler.on_task_release(deferred_release_ids[--deferred_release_count]); + int32_t fe = rt->scheduler.on_task_release(deferred_release_ids[--deferred_release_count]); #endif - (void)fe; + (void)fe; #if PTO2_PROFILING - fanin_edges_total += fe; - if (fe > fanin_max_degree) fanin_max_degree = fe; + fanin_edges_total += fe; + if (fe > fanin_max_degree) fanin_max_degree = fe; #endif + } } } 
ct.move_running_to_idle(i); - + core_idle[core_id] = true; #if PTO2_PROFILING if (profiling_enabled) { #if PTO2_SCHED_PROFILING @@ -310,7 +361,7 @@ struct AicpuExecutor { uint32_t count = perf_buf->count; if (count > 0) { PerfRecord* record = &perf_buf->records[count - 1]; - if (record->task_id == static_cast(payload->task_id)) { + if (record->task_id == static_cast(payload->mixed_task_id)) { perf_aicpu_record_dispatch_and_finish_time( record, dispatch_timestamps_[core_id], finish_ts); } @@ -321,114 +372,132 @@ struct AicpuExecutor { } #endif - DEV_DEBUG("Thread %d: %s core %d completed PTO2 task %d", + DEV_DEBUG("Thread %d: %s core %d completed PTO2 task %d (mixed_complete=%d)", thread_idx, CT == CoreType::AIC ? "AIC" : "AIV", core_id, - task_id); + task_id, + mixed_complete ? 1 : 0); cur_thread_completed++; - completed_this_turn++; + if (mixed_complete) { + completed_this_turn++; + } made_progress = true; } } } - template - void dispatch_ready_tasks_to_idle_cores(Runtime* runtime, - int32_t thread_idx, - CoreTypeTracker& ct, - int32_t* executing_task_ids, - bool& made_progress, - PTO2TaskDescriptor* task_descriptors, - PTO2TaskPayload* task_payloads, - int32_t window_mask, - PTO2LocalReadyBuffer* local_bufs -#if PTO2_PROFILING - , - bool profiling_enabled, - uint64_t& pop_hit, - uint64_t& pop_miss, - uint32_t& phase_dispatch_count, - uint64_t& local_dispatch_count, - uint64_t& local_overflow_count -#endif -#if PTO2_SCHED_PROFILING - , - uint64_t& sched_dispatch_pop_cycle, - uint64_t& sched_dispatch_setup_cycle -#endif - ) { - constexpr int ct_idx = static_cast(CT); + static const char* shape_name(PTO2ResourceShape shape) { + switch (shape) { + case PTO2ResourceShape::AIC_ONLY: return "AIC_ONLY"; + case PTO2ResourceShape::AIV_X1: return "AIV_X1"; + case PTO2ResourceShape::AIV_X2: return "AIV_X2"; + case PTO2ResourceShape::AIC_AIV_X1: return "AIC_AIV_X1"; + case PTO2ResourceShape::AIC_AIV_X2: return "AIC_AIV_X2"; + } + return "UNKNOWN"; + } - for (int32_t i = 
ct.idle_count - 1; i >= 0; i--) { - int32_t core_id = ct.idle[i]; + struct ResourceCount { + int32_t aic; + int32_t aiv; + }; + + static constexpr ResourceCount shape_resource_count(PTO2ResourceShape shape) { + constexpr ResourceCount kTable[PTO2_NUM_RESOURCE_SHAPES] = { + {1, 0}, // AIC_ONLY = 0 + {0, 1}, // AIV_X1 = 1 + {0, 2}, // AIV_X2 = 2 + {1, 1}, // AIC_AIV_X1 = 3 + {1, 2}, // AIC_AIV_X2 = 4 + }; + return kTable[static_cast(shape)]; + } + + /** + * Returns the dispatch probe order for a given scheduler thread. + * Widest shapes first to avoid consuming cluster resources with narrow tasks. + * Even/odd threads use different fallback orders (AIC-first vs AIV-first) + * to reduce contention on the same ready queue across adjacent threads. + */ + static const PTO2ResourceShape* get_dispatch_order(int32_t thread_idx) { + // Even threads: AIC-first fallback after widest + static constexpr PTO2ResourceShape kEvenOrder[PTO2_NUM_RESOURCE_SHAPES] = { + PTO2ResourceShape::AIC_AIV_X2, + PTO2ResourceShape::AIC_AIV_X1, + PTO2ResourceShape::AIC_ONLY, + PTO2ResourceShape::AIV_X2, + PTO2ResourceShape::AIV_X1, + }; + // Odd threads: AIV-first fallback after widest + static constexpr PTO2ResourceShape kOddOrder[PTO2_NUM_RESOURCE_SHAPES] = { + PTO2ResourceShape::AIC_AIV_X2, + PTO2ResourceShape::AIV_X2, + PTO2ResourceShape::AIC_AIV_X1, + PTO2ResourceShape::AIV_X1, + PTO2ResourceShape::AIC_ONLY, + }; + return (thread_idx % 2 == 0) ? 
kEvenOrder : kOddOrder; + } + int32_t pop_ready_task(PTO2ResourceShape shape, int32_t thread_idx #if PTO2_PROFILING - int local_count_before = local_bufs[ct_idx].count; + , uint64_t& pop_hit, uint64_t& pop_miss #endif #if PTO2_SCHED_PROFILING - extern uint64_t g_sched_pop_atomic_count[], g_sched_pop_wait_cycle[]; - uint64_t t_pop_start = get_sys_cnt_aicpu(); - int32_t task_id = rt->scheduler.get_ready_task( - local_bufs, - g_sched_pop_atomic_count[thread_idx], g_sched_pop_wait_cycle[thread_idx]); - sched_dispatch_pop_cycle += (get_sys_cnt_aicpu() - t_pop_start); -#else - int32_t task_id = rt->scheduler.get_ready_task(local_bufs); -#endif - if (task_id >= 0) { -#if PTO2_PROFILING - pop_hit++; - phase_dispatch_count++; - if (local_bufs[ct_idx].count < local_count_before) { - local_dispatch_count++; - } + , uint64_t& sched_dispatch_pop_cycle #endif + ) { + (void)thread_idx; #if PTO2_SCHED_PROFILING - uint64_t t_setup_start = get_sys_cnt_aicpu(); + extern uint64_t g_sched_pop_atomic_count[], g_sched_pop_wait_cycle[]; + uint64_t t_pop_start = get_sys_cnt_aicpu(); + int32_t task_id = rt->scheduler.get_ready_task(shape, + g_sched_pop_atomic_count[thread_idx], g_sched_pop_wait_cycle[thread_idx]); + sched_dispatch_pop_cycle += (get_sys_cnt_aicpu() - t_pop_start); +#else + int32_t task_id = rt->scheduler.get_ready_task(shape); #endif - PTO2TaskDescriptor* task = &task_descriptors[task_id & window_mask]; - PTO2TaskPayload* task_pl = &task_payloads[task_id & window_mask]; - PTO2DispatchPayload* payload = &s_pto2_payload_per_core[core_id]; - build_pto2_payload(payload, runtime, task, task_pl); + if (task_id >= 0) { #if PTO2_PROFILING - if (profiling_enabled) { - dispatch_timestamps_[core_id] = get_sys_cnt_aicpu(); - if (core_dispatch_counts_[core_id] >= PLATFORM_PROF_BUFFER_SIZE) { - perf_aicpu_switch_buffer(runtime, core_id, thread_idx); - core_dispatch_counts_[core_id] = 0; - } - core_dispatch_counts_[core_id]++; - } -#endif - write_reg(core_id_to_reg_addr_[core_id], 
RegId::DATA_MAIN_BASE, static_cast(task_id + 1)); - ct.move_idle_to_running(i); - executing_task_ids[core_id] = task_id; - made_progress = true; -#if PTO2_SCHED_PROFILING - sched_dispatch_setup_cycle += (get_sys_cnt_aicpu() - t_setup_start); + pop_hit++; #endif - DEV_DEBUG("Thread %d: Dispatching PTO2 task %d to %s core %d", - thread_idx, - task_id, - CT == CoreType::AIC ? "AIC" : "AIV", - core_id); - } else { + } else { #if PTO2_PROFILING - pop_miss++; + pop_miss++; #endif - break; - } } + return task_id; + } - // Drain remaining local tasks to global queue (idle cores exhausted) - while (local_bufs[ct_idx].count > 0) { - int32_t task_id = local_bufs[ct_idx].pop(); - rt->scheduler.ready_queues[ct_idx].push(task_id); + void dispatch_subtask_to_core( + Runtime* runtime, CoreStateTracker& tracker, int32_t* executing_task_ids, + int32_t core_id, CoreType core_type, + int32_t task_id, PTO2TaskDescriptor* task, PTO2TaskPayload* task_pl, + PTO2SubtaskSlot subslot #if PTO2_PROFILING - local_overflow_count++; + , bool profiling_enabled, int32_t thread_idx #endif + ) { + PTO2DispatchPayload* payload = &s_pto2_payload_per_core[core_id]; + build_pto2_payload(payload, runtime, task, task_pl, subslot, core_type); +#if PTO2_PROFILING + if (profiling_enabled) { + dispatch_timestamps_[core_id] = get_sys_cnt_aicpu(); + if (core_dispatch_counts_[core_id] >= PLATFORM_PROF_BUFFER_SIZE) { + perf_aicpu_switch_buffer(runtime, core_id, thread_idx); + core_dispatch_counts_[core_id] = 0; + } + core_dispatch_counts_[core_id]++; } +#endif + write_reg(core_id_to_reg_addr_[core_id], RegId::DATA_MAIN_BASE, static_cast(task_id + 1)); + + CoreTypeTracker& ct = tracker.by_type[static_cast(core_type)]; + int32_t idle_idx = ct.find_idle_index(core_id); + ct.move_idle_to_running(idle_idx); + tracker.core_idle[core_id] = false; + executing_task_ids[core_id] = task_id; } }; @@ -525,14 +594,13 @@ int32_t AicpuExecutor::handshake_all_cores(Runtime* runtime) { * (Aligned with host_build_graph mechanism) */ 
void AicpuExecutor::assign_cores_to_threads() { - // Determine how many cores each thread gets initially: - // - Mixed mode: distribute among scheduler threads only - // - All-orchestrator mode: distribute among all threads (they all transition to schedulers) + // Cluster-aligned assignment: each cluster = 1 AIC + 2 AIV (adjacent pair) int32_t divisor = (sched_thread_num_ > 0) ? sched_thread_num_ : thread_num_; - int32_t aic_per_thread = aic_count_ / divisor; - int32_t aiv_per_thread = aiv_count_ / divisor; + int32_t cluster_count = aic_count_; + int32_t clusters_per_thread = cluster_count / divisor; - DEV_INFO("Assigning cores: %d AIC per thread, %d AIV per thread", aic_per_thread, aiv_per_thread); + DEV_INFO("Assigning cores: %d clusters, %d per thread (%d AIC, %d AIV)", + cluster_count, clusters_per_thread, aic_count_, aiv_count_); for (int32_t i = 0; i < thread_num_; i++) { for (int32_t j = 0; j < MAX_CORES_PER_THREAD; j++) { @@ -542,6 +610,8 @@ void AicpuExecutor::assign_cores_to_threads() { trackers_[i].aiv().running_count = 0; trackers_[i].aic().idle_count = 0; trackers_[i].aiv().idle_count = 0; + trackers_[i].cluster_count = 0; + memset(trackers_[i].core_idle, 0, sizeof(trackers_[i].core_idle)); } for (int32_t t = 0; t < thread_num_; t++) { @@ -553,31 +623,36 @@ void AicpuExecutor::assign_cores_to_threads() { } int32_t core_idx = 0; + CoreStateTracker& tracker = trackers_[t]; - // Assign AIC cores - int32_t aic_start = t * aic_per_thread; - for (int32_t i = 0; i < aic_per_thread; i++) { - int32_t worker_id = aic_cores_[aic_start + i].worker_id; - core_assignments_[t][core_idx++] = worker_id; - trackers_[t].aic().idle[trackers_[t].aic().idle_count++] = worker_id; - DEV_INFO("Thread %d: assigned AIC worker_id=%d", t, worker_id); - } + for (int32_t c = 0; c < clusters_per_thread; c++) { + int32_t ci = t * clusters_per_thread + c; + int32_t aic_wid = aic_cores_[ci].worker_id; + int32_t aiv0_wid = aiv_cores_[2 * ci].worker_id; + int32_t aiv1_wid = aiv_cores_[2 * 
ci + 1].worker_id; + + tracker.clusters[tracker.cluster_count++] = {aic_wid, {aiv0_wid, aiv1_wid}}; - // Assign AIV cores - int32_t aiv_start = t * aiv_per_thread; - for (int32_t i = 0; i < aiv_per_thread; i++) { - int32_t worker_id = aiv_cores_[aiv_start + i].worker_id; - core_assignments_[t][core_idx++] = worker_id; - trackers_[t].aiv().idle[trackers_[t].aiv().idle_count++] = worker_id; - DEV_INFO("Thread %d: assigned AIV worker_id=%d", t, worker_id); + core_assignments_[t][core_idx++] = aic_wid; + tracker.aic().idle[tracker.aic().idle_count++] = aic_wid; + tracker.core_idle[aic_wid] = true; + + core_assignments_[t][core_idx++] = aiv0_wid; + core_assignments_[t][core_idx++] = aiv1_wid; + tracker.aiv().idle[tracker.aiv().idle_count++] = aiv0_wid; + tracker.aiv().idle[tracker.aiv().idle_count++] = aiv1_wid; + tracker.core_idle[aiv0_wid] = true; + tracker.core_idle[aiv1_wid] = true; + + DEV_INFO("Thread %d: cluster %d (AIC=%d, AIV0=%d, AIV1=%d)", + t, ci, aic_wid, aiv0_wid, aiv1_wid); } core_count_per_thread_[t] = core_idx; - - DEV_INFO("Thread %d: total %d cores", t, core_idx); + DEV_INFO("Thread %d: total %d cores (%d clusters)", t, core_idx, clusters_per_thread); } - thread_cores_num_ = aic_per_thread + aiv_per_thread; + thread_cores_num_ = clusters_per_thread * 3; } /** @@ -586,100 +661,90 @@ void AicpuExecutor::assign_cores_to_threads() { * Writes into new_core_assignments_ / new_core_count_per_thread_. 
*/ void AicpuExecutor::reassign_cores_for_all_threads() { - // Calculate how many AIC/AIV each thread should have + DEV_INFO("Reassigning cores (cluster-aligned) for all %d threads: %d AIC, %d AIV", + thread_num_, aic_count_, aiv_count_); - DEV_INFO("Reassigning cores for all %d threads: %d AIC, %d AIV", thread_num_, aic_count_, aiv_count_); + // Collect running/idle state from all threads before reassignment + int32_t running_cores[128]; + int32_t running_task_ids[128]; + int32_t running_count = 0; - int32_t aic_running_cores[128]; - int32_t aic_running_task_ids[128]; - int32_t aic_idle_cores[128]; - int32_t aic_running_cores_num = 0; - int32_t aic_idle_cores_num = 0; - - int32_t aiv_running_cores[128]; - int32_t aiv_running_task_ids[128]; - int32_t aiv_idle_cores[128]; - int32_t aiv_running_cores_num = 0; - int32_t aiv_idle_cores_num = 0; + bool was_idle[MAX_CORES_PER_THREAD]; + memset(was_idle, 0, sizeof(was_idle)); for (int32_t i = 0; i < thread_num_; i++) { - core_count_per_thread_[i] = 0; for (int32_t j = 0; j < trackers_[i].aic().running_count; j++) { int32_t core_id = trackers_[i].aic().running[j]; - aic_running_cores[aic_running_cores_num] = core_id; - aic_running_task_ids[aic_running_cores_num] = executing_task_ids_[i][core_id]; - aic_running_cores_num++; + running_cores[running_count] = core_id; + running_task_ids[running_count] = executing_task_ids_[i][core_id]; + running_count++; } for (int32_t j = 0; j < trackers_[i].aic().idle_count; j++) { - aic_idle_cores[aic_idle_cores_num++] = trackers_[i].aic().idle[j]; + was_idle[trackers_[i].aic().idle[j]] = true; } for (int32_t j = 0; j < trackers_[i].aiv().running_count; j++) { int32_t core_id = trackers_[i].aiv().running[j]; - aiv_running_cores[aiv_running_cores_num] = core_id; - aiv_running_task_ids[aiv_running_cores_num] = executing_task_ids_[i][core_id]; - aiv_running_cores_num++; + running_cores[running_count] = core_id; + running_task_ids[running_count] = executing_task_ids_[i][core_id]; + 
running_count++; } for (int32_t j = 0; j < trackers_[i].aiv().idle_count; j++) { - aiv_idle_cores[aiv_idle_cores_num++] = trackers_[i].aiv().idle[j]; + was_idle[trackers_[i].aiv().idle[j]] = true; } + } + + // Reset all trackers + for (int32_t i = 0; i < thread_num_; i++) { + core_count_per_thread_[i] = 0; trackers_[i].aic().running_count = 0; trackers_[i].aic().idle_count = 0; trackers_[i].aiv().running_count = 0; trackers_[i].aiv().idle_count = 0; + trackers_[i].cluster_count = 0; + memset(trackers_[i].core_idle, 0, sizeof(trackers_[i].core_idle)); for (int32_t j = 0; j < MAX_CORES_PER_THREAD; j++) { executing_task_ids_[i][j] = AICPU_TASK_INVALID; } } - for (int32_t i = 0; i < aic_count_; i++) { - int32_t thread_idx = i % thread_num_; - int32_t core_id = aic_cores_[i].worker_id; - core_assignments_[thread_idx][core_count_per_thread_[thread_idx]++] = core_id; - bool found = false; - for (int32_t j = 0; j < aic_running_cores_num; j++) { - if (core_id == aic_running_cores[j]) { - trackers_[thread_idx].aic().running[trackers_[thread_idx].aic().running_count++] = core_id; - executing_task_ids_[thread_idx][core_id] = aic_running_task_ids[j]; - found = true; - break; - } - } - if (!found) { - for (int32_t j = 0; j < aic_idle_cores_num; j++) { - if (core_id == aic_idle_cores[j]) { - trackers_[thread_idx].aic().idle[trackers_[thread_idx].aic().idle_count++] = core_id; - break; - } - } - } - } - for (int32_t i = 0; i < aiv_count_; i++) { - int32_t thread_idx = i % thread_num_; - int32_t core_id = aiv_cores_[i].worker_id; - core_assignments_[thread_idx][core_count_per_thread_[thread_idx]++] = core_id; - bool found = false; - for (int32_t j = 0; j < aiv_running_cores_num; j++) { - if (core_id == aiv_running_cores[j]) { - trackers_[thread_idx].aiv().running[trackers_[thread_idx].aiv().running_count++] = core_id; - executing_task_ids_[thread_idx][core_id] = aiv_running_task_ids[j]; - found = true; - break; + + // Restore a single core's running/idle state into its new thread's 
tracker + auto reassign_core = [&](int32_t worker_id, CoreTypeTracker& type_tracker, + CoreStateTracker& tracker, int32_t thread_idx) { + core_assignments_[thread_idx][core_count_per_thread_[thread_idx]++] = worker_id; + for (int32_t j = 0; j < running_count; j++) { + if (running_cores[j] == worker_id) { + type_tracker.running[type_tracker.running_count++] = worker_id; + executing_task_ids_[thread_idx][worker_id] = running_task_ids[j]; + return; } } - if (!found) { - for (int32_t j = 0; j < aiv_idle_cores_num; j++) { - if (core_id == aiv_idle_cores[j]) { - trackers_[thread_idx].aiv().idle[trackers_[thread_idx].aiv().idle_count++] = core_id; - break; - } - } + if (was_idle[worker_id]) { + type_tracker.idle[type_tracker.idle_count++] = worker_id; + tracker.core_idle[worker_id] = true; } + }; + + // Assign whole clusters round-robin across all threads + for (int32_t ci = 0; ci < aic_count_; ci++) { + int32_t t = ci % thread_num_; + CoreStateTracker& tracker = trackers_[t]; + + int32_t aic_wid = aic_cores_[ci].worker_id; + int32_t aiv0_wid = aiv_cores_[2 * ci].worker_id; + int32_t aiv1_wid = aiv_cores_[2 * ci + 1].worker_id; + + tracker.clusters[tracker.cluster_count++] = {aic_wid, {aiv0_wid, aiv1_wid}}; + + reassign_core(aic_wid, tracker.aic(), tracker, t); + reassign_core(aiv0_wid, tracker.aiv(), tracker, t); + reassign_core(aiv1_wid, tracker.aiv(), tracker, t); } // Log final distribution for verification DEV_INFO("Core reassignment complete:"); for (int32_t t = 0; t < thread_num_; t++) { - DEV_INFO(" Thread %d: %d cores (AIC: running=%d idle=%d, AIV: running=%d idle=%d)", - t, core_count_per_thread_[t], + DEV_INFO(" Thread %d: %d cores, %d clusters (AIC: running=%d idle=%d, AIV: running=%d idle=%d)", + t, core_count_per_thread_[t], trackers_[t].cluster_count, trackers_[t].aic().running_count, trackers_[t].aic().idle_count, trackers_[t].aiv().running_count, trackers_[t].aiv().idle_count); } @@ -942,7 +1007,7 @@ int32_t 
AicpuExecutor::resolve_and_dispatch_pto2(Runtime* runtime, int32_t threa if (tracker.aic().running_count > 0) { try_completed = true; check_running_cores_for_completion( - thread_idx, tracker.aic(), hank, executing_task_ids, + thread_idx, tracker.aic(), tracker.core_idle, hank, executing_task_ids, completed_this_turn, cur_thread_completed, made_progress, deferred_release_ids, deferred_release_count, local_bufs @@ -961,7 +1026,7 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime* runtime, int32_t threa if (tracker.aiv().running_count > 0) { try_completed = true; check_running_cores_for_completion( - thread_idx, tracker.aiv(), hank, executing_task_ids, + thread_idx, tracker.aiv(), tracker.core_idle, hank, executing_task_ids, completed_this_turn, cur_thread_completed, made_progress, deferred_release_ids, deferred_release_count, local_bufs @@ -1003,45 +1068,153 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime* runtime, int32_t threa } #endif - // Dispatch: local queue first (zero MPMC operations), then global readyQ + // Phase 2: Local dispatch — drain local_bufs, match to idle clusters (zero MPMC operations) + // Phase 3: Global queue — push overflow to readyQ + fill remaining idle cores from readyQ bool try_pushed = false; - // Process AIC cores: local AIC buffer + global CUBE queue - // Enter when local buffer has tasks (even if no idle cores, to drain to global queue) - // or when idle cores can be filled from global queue - if (local_bufs[0].count > 0 || - (tracker.aic().idle_count > 0 && rt->scheduler.ready_queues[PTO2_WORKER_CUBE].size() > 0)) { - try_pushed = true; - dispatch_ready_tasks_to_idle_cores( - runtime, thread_idx, tracker.aic(), executing_task_ids, made_progress, - task_descriptors, task_payloads, window_mask, - local_bufs + // Local dispatch: drain both per-CoreType local_bufs, match to idle clusters by shape + int32_t overflow_ids[LOCAL_READY_CAP_PER_TYPE * PTO2_LOCAL_DISPATCH_TYPE_NUM]; + int overflow_count = 0; + for (int bi = 
0; bi < PTO2_LOCAL_DISPATCH_TYPE_NUM; bi++) { + while (local_bufs[bi].count > 0) { + int32_t task_id = local_bufs[bi].pop(); + PTO2TaskDescriptor* task = &task_descriptors[task_id & window_mask]; + PTO2ResourceShape shape = pto2_active_mask_to_shape(task->active_mask); + int32_t ci = tracker.find_cluster_for_shape(shape); + + if (ci >= 0) { + try_pushed = true; + PTO2TaskPayload* task_pl = &task_payloads[task_id & window_mask]; + Cluster& c = tracker.clusters[ci]; +#if PTO2_SCHED_PROFILING + uint64_t t_setup_start = get_sys_cnt_aicpu(); +#endif + ResourceCount rc = shape_resource_count(shape); + + if (rc.aic) { + dispatch_subtask_to_core(runtime, tracker, executing_task_ids, + c.aic_core_id, CoreType::AIC, + task_id, task, task_pl, PTO2SubtaskSlot::AIC +#if PTO2_PROFILING + , profiling_enabled, thread_idx +#endif + ); + } + if (rc.aiv >= 1) { + int32_t aiv0 = tracker.core_idle[c.aiv_core_ids[0]] ? c.aiv_core_ids[0] : c.aiv_core_ids[1]; + dispatch_subtask_to_core(runtime, tracker, executing_task_ids, + aiv0, CoreType::AIV, + task_id, task, task_pl, PTO2SubtaskSlot::AIV0 +#if PTO2_PROFILING + , profiling_enabled, thread_idx +#endif + ); + } + if (rc.aiv >= 2) { + dispatch_subtask_to_core(runtime, tracker, executing_task_ids, + c.aiv_core_ids[1], CoreType::AIV, + task_id, task, task_pl, PTO2SubtaskSlot::AIV1 +#if PTO2_PROFILING + , profiling_enabled, thread_idx +#endif + ); + } #if PTO2_PROFILING - , profiling_enabled, pop_hit, pop_miss, phase_dispatch_count - , local_dispatch_count, local_overflow_count + pop_hit++; + phase_dispatch_count++; + local_dispatch_count++; #endif #if PTO2_SCHED_PROFILING - , sched_dispatch_pop_cycle, sched_dispatch_setup_cycle + sched_dispatch_setup_cycle += (get_sys_cnt_aicpu() - t_setup_start); #endif - ); + made_progress = true; + DEV_DEBUG("Thread %d: Dispatching %s task %d to cluster %d (local)", + thread_idx, shape_name(shape), task_id, ci); + } else { + overflow_ids[overflow_count++] = task_id; +#if PTO2_PROFILING + 
local_overflow_count++; +#endif + } + } } - // Process AIV cores: local AIV buffer + global VECTOR queue - if (local_bufs[1].count > 0 || - (tracker.aiv().idle_count > 0 && rt->scheduler.ready_queues[PTO2_WORKER_VECTOR].size() > 0)) { - try_pushed = true; - dispatch_ready_tasks_to_idle_cores( - runtime, thread_idx, tracker.aiv(), executing_task_ids, made_progress, - task_descriptors, task_payloads, window_mask, - local_bufs + // Push overflow to global readyQ (shape-based) + for (int i = 0; i < overflow_count; i++) { + PTO2TaskDescriptor* task = &task_descriptors[overflow_ids[i] & window_mask]; + PTO2ResourceShape shape = pto2_active_mask_to_shape(task->active_mask); + rt->scheduler.ready_queues[static_cast(shape)].push(overflow_ids[i]); + } + + // Phase 3: Global dispatch — fill remaining idle cores from global readyQ (cluster-based) + const PTO2ResourceShape* dispatch_order = get_dispatch_order(thread_idx); + + for (int32_t si = 0; si < PTO2_NUM_RESOURCE_SHAPES; si++) { + PTO2ResourceShape shape = dispatch_order[si]; + if (rt->scheduler.ready_queues[static_cast(shape)].size() == 0) continue; + + while (true) { + int32_t ci = tracker.find_cluster_for_shape(shape); + if (ci < 0) break; + + int32_t task_id = pop_ready_task(shape, thread_idx #if PTO2_PROFILING - , profiling_enabled, pop_hit, pop_miss, phase_dispatch_count - , local_dispatch_count, local_overflow_count + , pop_hit, pop_miss #endif #if PTO2_SCHED_PROFILING - , sched_dispatch_pop_cycle, sched_dispatch_setup_cycle + , sched_dispatch_pop_cycle #endif - ); + ); + if (task_id < 0) break; + + try_pushed = true; +#if PTO2_PROFILING + phase_dispatch_count++; +#endif +#if PTO2_SCHED_PROFILING + uint64_t t_setup_start = get_sys_cnt_aicpu(); +#endif + Cluster& c = tracker.clusters[ci]; + PTO2TaskDescriptor* task = &task_descriptors[task_id & window_mask]; + PTO2TaskPayload* task_pl = &task_payloads[task_id & window_mask]; + ResourceCount rc = shape_resource_count(shape); + + if (rc.aic) { + 
dispatch_subtask_to_core(runtime, tracker, executing_task_ids, + c.aic_core_id, CoreType::AIC, task_id, task, task_pl, + PTO2SubtaskSlot::AIC +#if PTO2_PROFILING + , profiling_enabled, thread_idx +#endif + ); + } + if (rc.aiv >= 1) { + int32_t aiv_id = tracker.core_idle[c.aiv_core_ids[0]] + ? c.aiv_core_ids[0] : c.aiv_core_ids[1]; + dispatch_subtask_to_core(runtime, tracker, executing_task_ids, + aiv_id, CoreType::AIV, task_id, task, task_pl, + PTO2SubtaskSlot::AIV0 +#if PTO2_PROFILING + , profiling_enabled, thread_idx +#endif + ); + } + if (rc.aiv >= 2) { + dispatch_subtask_to_core(runtime, tracker, executing_task_ids, + c.aiv_core_ids[1], CoreType::AIV, task_id, task, task_pl, + PTO2SubtaskSlot::AIV1 +#if PTO2_PROFILING + , profiling_enabled, thread_idx +#endif + ); + } + made_progress = true; +#if PTO2_SCHED_PROFILING + sched_dispatch_setup_cycle += (get_sys_cnt_aicpu() - t_setup_start); +#endif + DEV_DEBUG("Thread %d: Dispatching %s task %d to cluster %d", + thread_idx, shape_name(shape), task_id, ci); + } } #if PTO2_PROFILING @@ -1089,7 +1262,7 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime* runtime, int32_t threa PTO2TaskState st = sched->task_state[slot].load(std::memory_order_relaxed); int32_t rc = sched->fanin_refcount[slot].load(std::memory_order_relaxed); int32_t fi = task_descriptors[slot].fanin_count; - int32_t kid = task_descriptors[slot].kernel_id; + int32_t kid = task_descriptors[slot].kernel_id[0]; if (st >= PTO2_TASK_COMPLETED) continue; // Already done if (st == PTO2_TASK_READY || st == PTO2_TASK_RUNNING) { cnt_inflight++; continue; } // PENDING @@ -1124,7 +1297,7 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime* runtime, int32_t threa int32_t hw_kernel = -1; if (hh->task != 0) { const PTO2DispatchPayload* pl = reinterpret_cast((uintptr_t)hh->task); - hw_task_id = pl->task_id; + hw_task_id = pl->mixed_task_id; hw_kernel = pl->kernel_id; } DEV_ALWAYS(" AIC core[%d] cid=%d sw_task=%d hw_task=%d hw_kernel=%d", @@ -1138,7 
+1311,7 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime* runtime, int32_t threa int32_t hw_kernel = -1; if (hh->task != 0) { const PTO2DispatchPayload* pl = reinterpret_cast((uintptr_t)hh->task); - hw_task_id = pl->task_id; + hw_task_id = pl->mixed_task_id; hw_kernel = pl->kernel_id; } uint64_t cond_reg = read_reg(core_id_to_reg_addr_[cid], RegId::COND); @@ -1147,6 +1320,14 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime* runtime, int32_t threa EXTRACT_TASK_STATE(cond_reg), EXTRACT_TASK_ID(cond_reg), executing_task_ids[cid], hw_task_id, hw_kernel); } + // Dump cluster state + for (int32_t cli = 0; cli < tracker.cluster_count && cli < STALL_DUMP_CORE_MAX; cli++) { + Cluster& cl = tracker.clusters[cli]; + DEV_ALWAYS(" cluster[%d] aic=%d(%s) aiv0=%d(%s) aiv1=%d(%s)", + cli, cl.aic_core_id, tracker.core_idle[cl.aic_core_id] ? "idle" : "busy", + cl.aiv_core_ids[0], tracker.core_idle[cl.aiv_core_ids[0]] ? "idle" : "busy", + cl.aiv_core_ids[1], tracker.core_idle[cl.aiv_core_ids[1]] ? "idle" : "busy"); + } } if (idle_iterations > MAX_IDLE_ITERATIONS) { DEV_ERROR("Thread %d: PTO2 timeout after %d idle iterations", thread_idx, idle_iterations); @@ -1761,13 +1942,17 @@ void AicpuExecutor::diagnose_stuck_state(Runtime* runtime, int32_t thread_idx, DEV_ALWAYS("Progress: %d/%d tasks (%.1f%%)", completed, total, total > 0 ? 
completed * 100.0 / total : 0.0); - uint64_t aic_ready = 0, aiv_ready = 0; + uint64_t aic_ready = 0, aiv_ready = 0, aiv_x2_ready = 0, mixed_x1_ready = 0, mixed_x2_ready = 0; if (rt) { PTO2SchedulerState* sched = &rt->scheduler; - aic_ready = sched->ready_queues[PTO2_WORKER_CUBE].size(); - aiv_ready = sched->ready_queues[PTO2_WORKER_VECTOR].size(); + aic_ready = sched->ready_queues[static_cast(PTO2ResourceShape::AIC_ONLY)].size(); + aiv_ready = sched->ready_queues[static_cast(PTO2ResourceShape::AIV_X1)].size(); + aiv_x2_ready = sched->ready_queues[static_cast(PTO2ResourceShape::AIV_X2)].size(); + mixed_x1_ready = sched->ready_queues[static_cast(PTO2ResourceShape::AIC_AIV_X1)].size(); + mixed_x2_ready = sched->ready_queues[static_cast(PTO2ResourceShape::AIC_AIV_X2)].size(); } - DEV_ALWAYS("Ready Queues: AIC=%lu, AIV=%lu", aic_ready, aiv_ready); + DEV_ALWAYS("Ready Queues: AIC=%lu, AIV=%lu, AIV_X2=%lu, AIC_AIV_X1=%lu, AIC_AIV_X2=%lu", + aic_ready, aiv_ready, aiv_x2_ready, mixed_x1_ready, mixed_x2_ready); int32_t busy_cores = 0; int32_t idle_cores = 0; @@ -1791,7 +1976,7 @@ void AicpuExecutor::diagnose_stuck_state(Runtime* runtime, int32_t thread_idx, DEV_ALWAYS(" Core %d [%s, BUSY]: COND=0x%lx (reg_task_id=%d, reg_state=%s), executing_task_id=%d, kernel_id=%d", core_id, core_type_str, reg_val, reg_task_id, reg_state == TASK_FIN_STATE ? 
"FIN" : "ACK", - payload->task_id, payload->kernel_id); + payload->mixed_task_id, payload->kernel_id); } else { DEV_ALWAYS(" Core %d [%s, BUSY]: COND=0x%lx (reg_task_id=%d, reg_state=%s) but task_id not tracked", core_id, core_type_str, reg_val, reg_task_id, diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/docs/RUNTIME_LOGIC.md b/src/a2a3/runtime/tensormap_and_ringbuffer/docs/RUNTIME_LOGIC.md index 7c302db7..a90ca050 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/docs/RUNTIME_LOGIC.md +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/docs/RUNTIME_LOGIC.md @@ -318,23 +318,31 @@ When `pto2_submit_task` processes parameters: ## 6. Task Descriptor and States -### 6.1 PTO2TaskDescriptor +### 6.1 PTO2TaskDescriptor (Hot Path) | Field | Description | |-------|-------------| -| `task_id` | Monotonically increasing ID | -| `kernel_id` | Function ID (maps to compiled kernel binary) | -| `worker_type` | CUBE (AIC), VECTOR (AIV), AI_CPU, or ACCELERATOR | -| `fanin_head` | Head of fanin dependency list (pointer into DepListPool) | +| `mixed_task_id` | Canonical mixed-task ID (monotonically increasing) | +| `kernel_id[3]` | Per-slot kernel IDs: `[AIC, AIV0, AIV1]`; `INVALID_KERNEL_ID` = inactive | +| `active_mask` | Bitmask of active subtask slots: `bit0=AIC`, `bit1=AIV0`, `bit2=AIV1` | +| `subtask_done_mask` | Atomic bitmask; each subtask sets its done bit on completion | | `fanin_count` | Number of producer dependencies | | `fanout_lock` | Per-task spinlock for concurrent fanout modification | | `fanout_head` | Head of fanout consumer list (pointer, protected by `fanout_lock`) | | `fanout_count` | 1 (scope ref) + number of consumers | | `packed_buffer_base` | Start of packed buffer in GM Heap | | `packed_buffer_end` | End of packed buffer (for heap reclamation) | -| `is_active` | Task slot is in use | -| `params[16]` | Tensor and scalar parameters (`PTOParam` array) | + +### 6.1b PTO2TaskPayload (Cold Path) + +| Field | Description | +|-------|-------------| +| 
`tensors[16]` | Tensor descriptors for parameters | +| `scalar_value[16]` | Scalar parameter values | +| `is_tensor[16]` | Whether each parameter is tensor or scalar | | `param_count` | Number of valid parameters | +| `fanin_tasks[]` | Producer task IDs (used by `on_task_release`) | +| `fanin_actual_count` | Actual fanin count | ### 6.2 Task State Machine @@ -406,8 +414,8 @@ Scopes control the lifetime of intermediate buffers. Each scope: ```cpp PTO2_SCOPE(rt) { // Tasks submitted here belong to this scope - pto2_rt_submit_task(rt, FUNC_QK, PTO2_WORKER_CUBE, params, n); - pto2_rt_submit_task(rt, FUNC_SF, PTO2_WORKER_VECTOR, params, n); + pto2_rt_submit_aic_task(rt, FUNC_QK, params, n); + pto2_rt_submit_aiv_task(rt, FUNC_SF, params, n); } // scope_end: scope reference released from all tasks above ``` @@ -435,11 +443,11 @@ Each scheduler thread runs a tight loop with two main phases: **Phase 1 — Completion Handling**: - Poll register `COND` on each managed core -- When `TASK_FIN_STATE` detected: record completion timestamps, mark `task_state[slot] = COMPLETED`, acquire fanout lock, traverse fanout list (incrementing consumers' `fanin_refcount`), mark `task_state[slot] = CONSUMED`, advance `last_task_alive` watermark +- When `TASK_FIN_STATE` detected: record completion timestamps, call `on_subtask_complete(mixed_task_id, subslot)` to set the done bit; when `subtask_done_mask == active_mask`, trigger `on_mixed_task_complete(mixed_task_id)` which marks `task_state[slot] = COMPLETED`, acquires fanout lock, traverses fanout list (incrementing consumers' `fanin_refcount`), marks `task_state[slot] = CONSUMED`, and advances `last_task_alive` watermark **Phase 2 — Dispatch**: -- For each idle core: pop a task from the ready queue (lock-free MPMC Vyukov queue, one per worker type) -- Build `PTO2DispatchPayload` from `TaskDescriptor` +- For each idle core: pop a task from the matching shape-based ready queue (lock-free MPMC Vyukov queue, one per resource shape) +- Build 
`PTO2DispatchPayload` from `TaskDescriptor` with `mixed_task_id`, `subslot`, `kernel_id`, and `core_type` - Write task pointer to `Handshake.task`, signal AICore via register `DATA_MAIN_BASE` After these phases, the scheduler updates profiling headers and checks for termination (all tasks completed and orchestrator done). @@ -448,9 +456,9 @@ After these phases, the scheduler updates profiling headers and checks for termi Ready queues use a lock-free bounded MPMC (Vyukov) design: -- One `PTO2ReadyQueue` per worker type (4 types: CUBE, VECTOR, AI_CPU, ACCELERATOR) -- **Push**: any thread (orchestrator via `init_task`, or scheduler on completion) pushes newly-ready tasks -- **Pop**: scheduler threads pop from the queue matching the idle core's worker type +- One `PTO2ReadyQueue` per resource shape (5 shapes: `AIC_ONLY`, `AIV_X1`, `AIV_X2`, `AIC_AIV_X1`, `AIC_AIV_X2`) +- **Push**: any thread (orchestrator via `init_task`, or scheduler on completion) pushes newly-ready tasks to the queue matching `pto2_active_mask_to_shape(task->active_mask)` +- **Pop**: scheduler threads pop from the queue matching the idle core's resource shape - Per-slot sequence counters prevent ABA problems - `enqueue_pos` and `dequeue_pos` are on separate cache lines to avoid false sharing @@ -505,8 +513,10 @@ Built by the scheduler from `PTO2TaskDescriptor`: | Field | Description | |-------|-------------| -| `task_id` | Task identifier | -| `kernel_id` | Function ID | +| `mixed_task_id` | Mixed-task identifier (for completion aggregation) | +| `subslot` | Which subtask slot this dispatch represents (`AIC`, `AIV0`, or `AIV1`) | +| `kernel_id` | Function ID for this subtask slot | +| `core_type` | AIC or AIV | | `function_bin_addr` | GM address of compiled kernel binary | | `num_args` | Number of arguments | | `args[]` | Tensor addresses and scalar values | @@ -557,7 +567,9 @@ The orchestration API is defined in `pto_orchestration_api.h`. 
Orchestration cod | Function/Macro | Purpose | |----------------|---------| -| `pto2_rt_submit_task(rt, kernel_id, worker_type, params, n)` | Submit a task with parameters | +| `pto2_rt_submit_task(rt, mixed_kernels, params, n)` | Submit a mixed task with `MixedKernels` struct | +| `pto2_rt_submit_aic_task(rt, kernel_id, params, n)` | Convenience: submit AIC-only task | +| `pto2_rt_submit_aiv_task(rt, kernel_id, params, n)` | Convenience: submit AIV-only task | | `PTO2_SCOPE(rt) { ... }` | RAII scope for buffer lifetime | | `pto2_rt_orchestration_done(rt)` | Signal orchestration complete | | `pto2_rt_init_tensor_pool(rt)` | Initialize tensor pool for `make_tensor()` | @@ -573,14 +585,17 @@ The orchestration API is defined in `pto_orchestration_api.h`. Orchestration cod | `make_inout_param(tensor)` | INOUT parameter — read then written | | `make_scalar_param(value)` | 64-bit scalar parameter | -### 11.3 Worker Types +### 11.3 Resource Shapes + +Tasks are queued by resource shape, which is derived from the `active_mask` in the `MixedKernels` struct: -| Type | Target | -|------|--------| -| `PTO2_WORKER_CUBE` | AIC cores (matrix multiplication) | -| `PTO2_WORKER_VECTOR` | AIV cores (vector operations) | -| `PTO2_WORKER_AI_CPU` | AICPU (scalar ops, control flow) | -| `PTO2_WORKER_ACCELERATOR` | Fixed-function accelerators (DMA, etc.) 
| +| Shape | Active Mask | Description | +|-------|-------------|-------------| +| `AIC_ONLY` | AIC only | AIC cores (matrix multiplication) | +| `AIV_X1` | AIV0 or AIV1 only | Single AIV core (vector operations) | +| `AIV_X2` | AIV0 + AIV1 | Two AIV cores | +| `AIC_AIV_X1` | AIC + one AIV | AIC + single AIV core | +| `AIC_AIV_X2` | AIC + AIV0 + AIV1 | Full cluster (AIC + two AIV cores) | ### 11.4 Orchestration Export Interface diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/docs/SUBMIT_BY_CLUSTER.md b/src/a2a3/runtime/tensormap_and_ringbuffer/docs/SUBMIT_BY_CLUSTER.md new file mode 100644 index 00000000..72619284 --- /dev/null +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/docs/SUBMIT_BY_CLUSTER.md @@ -0,0 +1,226 @@ +# Submit by Cluster - Requirements and Main-Branch-Aligned Design + +## 1. Goal + +Define a single, main-branch-aligned specification for PTO2 cluster submission that combines: + +1. Product requirements (what must be true). +2. Runtime design (how it is implemented on current main baseline). + +The target model is: one submitted graph node is one `MixedTask`, and dispatch/completion is mixed-task-granular. + +## 2. Background and Motivation + +Future Ascend hardware is expected to provide stronger locality within an AICore cluster (`1 AIC + 2 AIV`). +The runtime therefore needs a "submit together, run together" model for related AIC/AIV kernels. + +Legacy per-task submit (`kernel_id + worker_type`) cannot express atomic co-dispatch of multiple kernels to one cluster. + +## 3. Scope + +### In Scope + +1. New orchestration-facing submit API for cluster-aware mixed submission. +2. Runtime/backend scheduler and executor changes to treat a mixed submit as one atomic scheduling unit. +3. Dependency gating, readiness, dispatch, completion, and reclamation at mixed-task granularity. +4. AIV slot equivalence (`AIV0` and `AIV1` are equivalent execution targets). + +### Out of Scope + +1. 
User-facing cluster pinning (`allocate_cluster/free_cluster`-style APIs). +2. New worker types beyond AIC/AIV. +3. Cross-cluster user placement policies. +4. Hardware topology changes beyond `1 AIC + 2 AIV` per cluster. + +## 4. Main-Branch Baseline Constraints + +Design must preserve the current main runtime architecture: + +1. Multi-orchestrator runtime wiring (`orchestrators[]`, `orch_count`, thread-local `pto2_current_orch_idx`). +2. Executor threading split (orchestrator threads vs scheduler threads), and post-orchestrator transition (`transition_requested_` + `reassign_cores_for_all_threads()`). +3. Shared-memory hot/cold split (`PTO2TaskDescriptor` hot + `PTO2TaskPayload` cold). + +## 5. Terminology + +1. `cluster`: one physical unit with `1 AIC + 2 AIV`. +2. `MixedKernels`: 3 submit slots (`AIC`, `AIV0`, `AIV1`) with `INVALID_KERNEL_ID` for inactive slots. +3. `MixedTask`: one runtime graph node created by one submit call. +4. `active_mask`: bitmask of active subtask slots. +5. `resource shape`: normalized lane demand class of a mixed task. + +## 6. API Contract + +```cpp +inline constexpr int32_t INVALID_KERNEL_ID = -1; + +struct MixedKernels { + int32_t aic_kernel_id{INVALID_KERNEL_ID}; + int32_t aiv0_kernel_id{INVALID_KERNEL_ID}; + int32_t aiv1_kernel_id{INVALID_KERNEL_ID}; +}; + +static inline void pto2_rt_submit_task(PTO2Runtime* rt, + const MixedKernels& mixed_kernels, + PTOParam* params, + int32_t num_params); + +static inline void pto2_rt_submit_aic_task(PTO2Runtime* rt, + int32_t kernel_id, + PTOParam* params, + int32_t num_params); + +static inline void pto2_rt_submit_aiv_task(PTO2Runtime* rt, + int32_t kernel_id, + PTOParam* params, + int32_t num_params); +``` + +Rules: + +1. One submit call creates one `MixedTask`. +2. All active slots share the same `params` and `num_params`. +3. At least one slot must be active. +4. `aiv0_kernel_id` and `aiv1_kernel_id` are semantically equivalent. +5. 
Wrappers are orchestration sugar only (inline in orchestration API); no dedicated runtime ops entries. +6. Submit-contract types are defined once in a shared header-only submit-types surface consumed by orchestration and runtime headers. +7. Invalid submits follow existing PTO2 behavior (`always_assert`), not a new recoverable return-code API. + +## 7. Data Model (Requirements + Design) + +`PTO2TaskDescriptor` (hot path) carries mixed-task identity/state: + +1. `mixed_task_id` +2. `active_mask` +3. `subtask_done_mask` +4. `kernel_id[3]` for `(AIC, AIV0, AIV1)` +5. dependency heads/counters and packed-buffer metadata + +`PTO2TaskPayload` (cold path) carries: + +1. shared params/tensors/scalars copied once per mixed submit +2. fanin mixed-task IDs +3. other cold-path submit metadata + +Producer identity in TensorMap is mixed-task ID end-to-end. + +## 8. Scheduling Model + +### 8.1 Resource Shapes + +Runtime uses shape-based ready queues (not worker-type queues): + +1. `AIC_ONLY` +2. `AIV_X1` +3. `AIV_X2` +4. `AIC_AIV_X1` +5. `AIC_AIV_X2` + +Queueing key is normalized resource shape (not raw slot label). + +### 8.2 Atomic Cluster Dispatch + +1. Dispatch decision unit is one mixed task. +2. For multi-slot mixed tasks, partial launch is forbidden. +3. A mixed task is dispatchable only when one local owned cluster can satisfy all required lanes. +4. Compatible mixed tasks may co-reside over time if they use disjoint free lanes. + +### 8.3 Dependency and Completion + +1. Fanin release/readiness remains dependency-correct and graph-level. +2. Two-stage completion: + - `on_subtask_complete(mixed_task_id, subslot)` + - `on_mixed_task_complete(mixed_task_id)` only when `subtask_done_mask == active_mask` +3. Downstream release is triggered once per mixed task completion, not once per subslot. + +## 9. Executor Ownership and Numbering + +### 9.1 Canonical Flattened Numbering (Unchanged) + +Given `block_dim` clusters: + +1. AIC IDs: `[0, block_dim)` +2. 
AIV IDs: `[block_dim, 3 * block_dim)` +3. Cluster `i`: `{i, block_dim + i, 2 * block_dim + i}` + +This project-defined flattened numbering is kept unchanged. + +### 9.2 Cluster Ownership + +1. One cluster must be owned by one scheduler domain/thread at a time. +2. No split-cluster ownership in either: + - initial `assign_cores_to_threads()` + - post-orchestrator `reassign_cores_for_all_threads()` +3. Lane occupancy bookkeeping must remain consistent with ownership after reassignment. + +## 10. Functional Requirements + +### 10.1 Valid Mixed Shapes + +1. AIC only +2. AIV only (1 or 2 AIV lanes) +3. AIC + 1 AIV +4. AIC + 2 AIV + +### 10.2 Runtime Behavior per Submit + +1. Validate submit arguments. +2. Allocate mixed-task ID and initialize descriptor/payload once. +3. Build fanin/fanout at mixed-task granularity. +4. Enqueue by shape when ready. +5. Dispatch all active lanes atomically when resources allow. +6. Aggregate completion and release downstream once. + +## 11. Non-Functional Requirements + +1. Correctness: no dependency violation, no partial mixed-task dispatch. +2. Determinism: dependency-correct ordering preserved; AIV lane choice may vary but remains semantically equivalent. +3. Fairness: resource-aware polling heuristic is allowed; strict starvation-free guarantee across all shapes is not required. +4. Performance: no obvious regression for non-cluster workflows. +5. Observability: lifecycle visibility for submit/ready/dispatch/block/complete. + +## 12. Acceptance Criteria + +Feature is accepted when: + +1. Orchestration compiles and submits via `MixedKernels` API/wrappers. +2. Scheduler dispatches each mixed task as one cluster scheduling decision. +3. Dependencies gate mixed-task readiness correctly. +4. AIV execution remains cluster-local and semantically equivalent across lanes. +5. Existing non-cluster workflows continue to pass without behavior regression. +6. Cluster ownership is never split across scheduler domains before/after transition. 
+ +## 13. Verification Matrix + +Recommended validation coverage: + +1. Mapping correctness for cluster-to-core ID relation. +2. Atomic dispatch for multi-slot shapes. +3. Dependency gating and completion aggregation (`subtask_done_mask == active_mask`). +4. Lane-occupancy co-residency behavior for compatible shapes. +5. Multi-orchestrator and core-transition ownership stability. +6. Invalid submit handling (`always_assert` path). +7. Regression coverage for existing examples/tests. + +Milestone command (device): + +```bash +python examples/scripts/run_example.py \ + -k tests/device_tests/tensormap_and_ringbuffer/batch_paged_attention/kernels \ + -g tests/device_tests/tensormap_and_ringbuffer/batch_paged_attention/golden.py \ + -p a2a3 -d 9 +``` + +Final validation: + +```bash +./ci.sh +``` + +## 14. Resolved Decisions + +1. Legacy orchestration-facing single-task submit is replaced by mixed submit contract. +2. Invalid mixed submits fail with existing submit-time assert behavior. +3. Per-cluster concurrent capacity is lane-occupancy-driven, not a fixed constant. +4. Submit-contract types live in one shared header-only surface. +5. Resource-aware dispatch heuristics are allowed without a strict starvation-free guarantee. + diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/docs/device_log_profiling.md b/src/a2a3/runtime/tensormap_and_ringbuffer/docs/device_log_profiling.md index 3b23d7f7..c619f36a 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/docs/device_log_profiling.md +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/docs/device_log_profiling.md @@ -110,9 +110,9 @@ The scheduler loop runs four phases each iteration. 
Each phase's time is accumul | Phase | What it does | Inline stats | |-------|-------------|-------------| -| **complete** | Polls handshake on each managed core; when a core completes, traverses fanout list (notify consumers) and fanin list (release producers) via `on_task_complete` | `fanout`: edges/max_degree/avg for consumer notification; `fanin`: edges/max_degree/avg for producer release | +| **complete** | Polls handshake on each managed core; when a core completes, calls `on_subtask_complete(mixed_task_id, subslot)` to set the done bit; when `subtask_done_mask == active_mask`, triggers `on_mixed_task_complete` which traverses fanout list (notify consumers) and fanin list (release producers) | `fanout`: edges/max_degree/avg for consumer notification; `fanin`: edges/max_degree/avg for producer release | | **scan** | Updates the perf profiling header with latest scheduler state | — | -| **dispatch** | For each idle core, pops a task from the ready queue via `pto2_scheduler_get_ready_task`, builds the dispatch payload, and writes the task to the core's handshake register | `pop`: `hit` = successful pops (task dispatched), `miss` = empty queue pops, `hit_rate` = hit/(hit+miss) | +| **dispatch** | For each idle core, pops a task from the shape-based ready queue via `get_ready_task(shape)`, builds the dispatch payload, and writes the task to the core's handshake register | `pop`: `hit` = successful pops (task dispatched), `miss` = empty queue pops, `hit_rate` = hit/(hit+miss) | | **idle** | Scheduler loop iteration where no progress was made (no completions, no dispatches) | — | **Interpreting phase percentages:** diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/orchestration/pto_orchestration_api.h b/src/a2a3/runtime/tensormap_and_ringbuffer/orchestration/pto_orchestration_api.h index ee54cbd2..4cc39212 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/orchestration/pto_orchestration_api.h +++ 
b/src/a2a3/runtime/tensormap_and_ringbuffer/orchestration/pto_orchestration_api.h @@ -24,16 +24,7 @@ // Type headers needed by orchestration #include "pto_types.h" // PTOParam, make_input_param, make_output_param, etc. #include "tensor.h" // Tensor, make_tensor, make_tensor_external - -// Worker type constants (duplicated from pto_runtime2_types.h to avoid -// pulling in the full types header with its internal structures) -typedef enum { - PTO2_WORKER_CUBE = 0, - PTO2_WORKER_VECTOR = 1, - PTO2_WORKER_AI_CPU = 2, - PTO2_WORKER_ACCELERATOR = 3, - PTO2_NUM_WORKER_TYPES = 4 -} PTO2WorkerType; +#include "pto_submit_types.h" // MixedKernels, INVALID_KERNEL_ID, subtask slots // ============================================================================= // Ops Table and Opaque Runtime @@ -51,8 +42,7 @@ typedef struct PTO2Runtime PTO2Runtime; * Populated by the runtime; called by orchestration through inline wrappers. */ typedef struct PTO2RuntimeOps { - void (*submit_task)(PTO2Runtime* rt, int32_t kernel_id, - PTO2WorkerType worker_type, + void (*submit_task)(PTO2Runtime* rt, const MixedKernels& mixed_kernels, PTOParam* params, int32_t num_params); void (*scope_begin)(PTO2Runtime* rt); void (*scope_end)(PTO2Runtime* rt); @@ -81,10 +71,29 @@ struct PTO2Runtime { // Inline Convenience Wrappers (call through ops table) // ============================================================================= -static inline void pto2_rt_submit_task(PTO2Runtime* rt, int32_t kernel_id, - PTO2WorkerType worker_type, +static inline void pto2_rt_submit_task(PTO2Runtime* rt, const MixedKernels& mixed_kernels, PTOParam* params, int32_t num_params) { - rt->ops->submit_task(rt, kernel_id, worker_type, params, num_params); + rt->ops->submit_task(rt, mixed_kernels, params, num_params); +} + +/** + * Convenience wrapper: submit an AIC-only task. 
+ */ +static inline void pto2_rt_submit_aic_task(PTO2Runtime* rt, int32_t kernel_id, + PTOParam* params, int32_t num_params) { + MixedKernels mk; + mk.aic_kernel_id = kernel_id; + rt->ops->submit_task(rt, mk, params, num_params); +} + +/** + * Convenience wrapper: submit an AIV-only task (uses AIV0 slot). + */ +static inline void pto2_rt_submit_aiv_task(PTO2Runtime* rt, int32_t kernel_id, + PTOParam* params, int32_t num_params) { + MixedKernels mk; + mk.aiv0_kernel_id = kernel_id; + rt->ops->submit_task(rt, mk, params, num_params); } static inline void pto2_rt_scope_begin(PTO2Runtime* rt) { diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto2_dispatch_payload.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto2_dispatch_payload.h index 385d7bf0..3ad84225 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto2_dispatch_payload.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto2_dispatch_payload.h @@ -12,6 +12,7 @@ #include #include "common/core_type.h" +#include "pto_submit_types.h" /** Max arguments per task; must match RUNTIME_MAX_ARGS and PTO2_MAX_OUTPUTS */ #ifndef PTO2_DISPATCH_MAX_ARGS @@ -23,7 +24,8 @@ * AICPU packs this from PTO2TaskDescriptor; AICore unpacks to run kernel. 
*/ struct PTO2DispatchPayload { - int32_t task_id; /**< Task ID (for completion_queue) */ + int32_t mixed_task_id; /**< Mixed-task ID (for completion aggregation) */ + PTO2SubtaskSlot subslot; /**< Which subtask slot this dispatch represents */ int32_t kernel_id; /**< InCore function id (debug/trace) */ CoreType core_type; /**< AIC or AIV */ uint64_t function_bin_addr; /**< Kernel entry in GM: (UnifiedKernelFunc)function_bin_addr */ diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp index 1f6d9d67..09f909eb 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp @@ -218,10 +218,27 @@ void pto2_scope_end(PTO2OrchestratorState* orch) { // ============================================================================= // Task Submission // ============================================================================= -void pto2_submit_task( - PTO2OrchestratorState* orch, int32_t kernel_id, PTO2WorkerType worker_type, PTOParam* params, int32_t num_params) { +void pto2_submit_mixed_task( + PTO2OrchestratorState* orch, const MixedKernels& mixed_kernels, PTOParam* params, int32_t num_params) { CYCLE_COUNT_START(); + // === Validate submit inputs === + uint8_t active_mask = pto2_mixed_kernels_to_active_mask(mixed_kernels); + always_assert(active_mask != 0 && "MixedKernels must have at least one active slot"); + always_assert((params != nullptr || num_params == 0) && "params must not be null when num_params > 0"); + + // Normalize single-AIV tasks: if only aiv1 is set, move it to the aiv0 slot. + // This guarantees the dispatch path can always use PTO2SubtaskSlot::AIV0 for + // AIV_X1 and AIC_AIV_X1 shapes without inspecting active_mask. 
+ MixedKernels normalized = mixed_kernels; + bool has_aiv0 = (active_mask & PTO2_SUBTASK_MASK_AIV0) != 0; + bool has_aiv1 = (active_mask & PTO2_SUBTASK_MASK_AIV1) != 0; + if (has_aiv1 && !has_aiv0) { + normalized.aiv0_kernel_id = normalized.aiv1_kernel_id; + normalized.aiv1_kernel_id = INVALID_KERNEL_ID; + active_mask = pto2_mixed_kernels_to_active_mask(normalized); + } + // === STEP 0: Sync TensorMap validity and optional cleanup === orch->tensor_map.sync_tensormap(); @@ -238,10 +255,13 @@ void pto2_submit_task( PTO2TaskDescriptor& task = task_ring.get_task_by_slot(slot); PTO2TaskPayload* payload = &orch->sm_handle->task_payloads[slot]; - // Initialize task descriptor - task.task_id = task_id; - task.kernel_id = kernel_id; - task.worker_type = worker_type; + // Initialize mixed-task descriptor + task.mixed_task_id = task_id; + task.kernel_id[static_cast(PTO2SubtaskSlot::AIC)] = normalized.aic_kernel_id; + task.kernel_id[static_cast(PTO2SubtaskSlot::AIV0)] = normalized.aiv0_kernel_id; + task.kernel_id[static_cast(PTO2SubtaskSlot::AIV1)] = normalized.aiv1_kernel_id; + task.active_mask = active_mask; + task.subtask_done_mask.store(0, std::memory_order_relaxed); task.fanin_count = 0; task.fanout_head = nullptr; task.fanout_lock.store(0, std::memory_order_relaxed); @@ -364,7 +384,7 @@ void pto2_submit_task( for (int i = 0; i < num_params; i++) { PTOParamType ptype = params[i].type; if (ptype == PTOParamType::OUTPUT || ptype == PTOParamType::INOUT) { - // Register in TensorMap: this tensor is produced by task_id + // Register in TensorMap: this tensor is produced by task_id (mixed_task_id) orch->tensor_map.insert(payload->tensors[i], task_id, ptype == PTOParamType::OUTPUT); } } @@ -377,7 +397,7 @@ void pto2_submit_task( PTO2SchedulerState* sched = orch->scheduler; // Initialize scheduler state BEFORE adding to producer fanout lists, - // so concurrent on_task_complete can safely access task_state/fanout_refcount. 
+ // so concurrent on_mixed_task_complete can safely access task_state/fanout_refcount. sched->task_state[slot].store(PTO2_TASK_PENDING, std::memory_order_relaxed); sched->fanout_refcount[slot].store(0, std::memory_order_relaxed); @@ -425,7 +445,8 @@ void pto2_submit_task( int32_t new_rc = sched->fanin_refcount[slot].fetch_add(initial_refcount, std::memory_order_acq_rel) + initial_refcount; if (new_rc >= fanin_count + 1) { - sched->ready_queues[task.worker_type].push(task_id); + PTO2ResourceShape shape = pto2_active_mask_to_shape(active_mask); + sched->ready_queues[static_cast(shape)].push(task_id); } #if PTO2_ORCH_PROFILING || PTO2_SCHED_PROFILING // Per producer: fetch_add(fanout_count) + load(task_state) + store(unlock) = 3 atomics diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h index c4fa970c..4fdd1473 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h @@ -20,6 +20,7 @@ #include "pto_ring_buffer.h" #include "pto_runtime2_types.h" +#include "pto_submit_types.h" #include "pto_scheduler.h" #include "pto_shared_memory.h" #include "pto_tensormap.h" @@ -178,9 +179,8 @@ void pto2_scope_end(PTO2OrchestratorState* orch); * @param params Array of task parameters * @param num_params Number of parameters */ -void pto2_submit_task(PTO2OrchestratorState* orch, - int32_t kernel_id, - PTO2WorkerType worker_type, +void pto2_submit_mixed_task(PTO2OrchestratorState* orch, + const MixedKernels& mixed_kernels, PTOParam* params, int32_t num_params); diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h index ed760359..48e52c2f 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h 
@@ -358,7 +358,7 @@ struct PTO2TaskRing { if (active_count < window_size - 1) { int32_t slot = task_id & (window_size - 1); PTO2TaskDescriptor* task = &descriptors[slot]; - task->task_id = task_id; + task->mixed_task_id = task_id; return task_id; } diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp index 76f6ee4a..8ebb0033 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp @@ -26,11 +26,10 @@ void pto2_set_orch_thread_idx(int idx) { // Orchestration Ops Table (function-pointer dispatch for orchestration .so) // ============================================================================= -static void submit_task_impl(PTO2Runtime* rt, int32_t kernel_id, - PTO2WorkerType worker_type, +static void submit_task_impl(PTO2Runtime* rt, const MixedKernels& mixed_kernels, PTOParam* params, int32_t num_params) { - pto2_submit_task(&rt->orchestrators[pto2_current_orch_idx], kernel_id, worker_type, - params, num_params); + pto2_submit_mixed_task(&rt->orchestrators[pto2_current_orch_idx], mixed_kernels, + params, num_params); } void pto2_rt_scope_begin(PTO2Runtime* rt) { diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h index cc3dc170..e09521ce 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h @@ -26,6 +26,7 @@ #define PTO_RUNTIME2_H #include "pto_runtime2_types.h" +#include "pto_submit_types.h" #include "pto_shared_memory.h" #include "pto_ring_buffer.h" #include "pto_tensormap.h" @@ -58,8 +59,7 @@ enum PTO2RuntimeMode { typedef struct PTO2Runtime PTO2Runtime; // forward declare for ops signatures struct PTO2RuntimeOps { - void (*submit_task)(PTO2Runtime* rt, int32_t kernel_id, - 
PTO2WorkerType worker_type, + void (*submit_task)(PTO2Runtime* rt, const MixedKernels& mixed_kernels, PTOParam* params, int32_t num_params); void (*scope_begin)(PTO2Runtime* rt); void (*scope_end)(PTO2Runtime* rt); diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h index 33556475..60c19ecd 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h @@ -20,6 +20,7 @@ #include #include "pto_types.h" +#include "pto_submit_types.h" // ============================================================================= // Profiling Configuration @@ -79,7 +80,7 @@ #define PTO2_SCOPE_TASKS_INIT_CAP 65536 // Initial capacity for scope task buffer // Ready queue -#define PTO2_READY_QUEUE_SIZE 65536 // Per-worker-type queue size (16x larger to avoid queue full) +#define PTO2_READY_QUEUE_SIZE 65536 // Per-shape queue size (16x larger to avoid queue full) // Memory alignment #define PTO2_ALIGN_SIZE 64 // Cache line alignment @@ -282,13 +283,21 @@ struct PTO2DepListEntry { * - Other fields set by Orchestrator, read by Scheduler */ struct PTO2TaskDescriptor { - // Task identification - int32_t task_id; // Unique task identifier (absolute, not wrapped) - int32_t kernel_id; // InCore function to execute - int32_t worker_type; // Target: CUBE, VECTOR, AI_CPU, ACCELERATOR + // Mixed-task identification + int32_t mixed_task_id; // Canonical mixed-task ID + + // Per-slot kernel IDs (INVALID_KERNEL_ID = inactive) + int32_t kernel_id[PTO2_SUBTASK_SLOT_COUNT]; + + // Active subtask mask: bit0=AIC, bit1=AIV0, bit2=AIV1 + uint8_t active_mask; + + // Completion aggregation: each subtask sets its done bit atomically + std::atomic subtask_done_mask; + // Dependency lists (linked list heads - offsets into DepListPool) // Fanin: producers this task depends on (set once at submission) - int32_t 
fanin_count; // Number of producer dependencies + int32_t fanin_count; // Number of producer dependencies // Fanout: consumers that depend on this task (grows as consumers submit) // PROTECTED BY fanout_lock diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.cpp index 16c4ea7f..7cf8ab7f 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.cpp @@ -158,8 +158,8 @@ bool pto2_scheduler_init(PTO2SchedulerState* sched, sched->fanout_refcount[i].store(0, std::memory_order_relaxed); } - // Initialize ready queues - for (int i = 0; i < PTO2_NUM_WORKER_TYPES; i++) { + // Initialize ready queues (one per resource shape) + for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) { if (!pto2_ready_queue_init(&sched->ready_queues[i], PTO2_READY_QUEUE_SIZE)) { // Cleanup on failure for (int j = 0; j < i; j++) { @@ -194,7 +194,7 @@ void pto2_scheduler_destroy(PTO2SchedulerState* sched) { sched->fanout_refcount = nullptr; } - for (int i = 0; i < PTO2_NUM_WORKER_TYPES; i++) { + for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) { pto2_ready_queue_destroy(&sched->ready_queues[i]); } } @@ -217,10 +217,10 @@ void pto2_scheduler_print_stats(PTO2SchedulerState* sched) { void pto2_scheduler_print_queues(PTO2SchedulerState* sched) { LOG_INFO("=== Ready Queues ==="); - const char* worker_names[] = {"CUBE", "VECTOR", "AI_CPU", "ACCELERATOR"}; + const char* shape_names[] = {"AIC_ONLY", "AIV_X1", "AIV_X2", "AIC_AIV_X1", "AIC_AIV_X2"}; - for (int i = 0; i < PTO2_NUM_WORKER_TYPES; i++) { - LOG_INFO(" %s: count=%" PRIu64, worker_names[i], + for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) { + LOG_INFO(" %s: count=%" PRIu64, shape_names[i], sched->ready_queues[i].size()); } diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.h 
b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.h index f94e4466..5d9dd77d 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.h @@ -2,10 +2,11 @@ * PTO Runtime2 - Scheduler Interface * * The Scheduler is responsible for: - * 1. Maintaining per-worker-type ready queues + * 1. Maintaining per-resource-shape ready queues * 2. Tracking task state (PENDING -> READY -> RUNNING -> COMPLETED -> CONSUMED) * 3. Managing fanin/fanout refcounts for dependency resolution * 4. Advancing last_task_alive for heap reclamation + * 5. Two-stage mixed-task completion (subtask done bits → mixed-task complete) * * The Scheduler runs on Device AI_CPU and processes: * - Task state transitions based on fanin_refcount @@ -260,12 +261,13 @@ void pto2_ready_queue_destroy(PTO2ReadyQueue* queue); // ============================================================================= /** - * Statistics returned by on_task_complete + * Statistics returned by mixed-task completion processing */ struct PTO2CompletionStats { int32_t fanout_edges; // Number of fanout edges traversed (notify consumers) int32_t tasks_enqueued; // Number of consumers that became READY int32_t fanin_edges; // Number of fanin edges traversed (release producers) + bool mixed_task_completed; // True only when this callback completed a mixed task }; /** @@ -298,8 +300,8 @@ struct PTO2SchedulerState { std::atomic* fanin_refcount; // Dynamic: counts completed producers std::atomic* fanout_refcount; // Dynamic: counts released references - // Ready queues (one per worker type) - PTO2ReadyQueue ready_queues[PTO2_NUM_WORKER_TYPES]; + // Ready queues (one per resource shape) + PTO2ReadyQueue ready_queues[PTO2_NUM_RESOURCE_SHAPES]; // Statistics #if PTO2_PROFILING @@ -441,12 +443,15 @@ struct PTO2SchedulerState { if (new_refcount == task->fanin_count) { // Local-first: try per-CoreType thread-local buffer before global queue 
+ // Route by active_mask: AIC-containing tasks → buf[0], AIV-only → buf[1] + PTO2ResourceShape shape = pto2_active_mask_to_shape(task->active_mask); bool pushed_local = false; - if (local_bufs && task->worker_type >= 0 && task->worker_type < PTO2_LOCAL_DISPATCH_TYPE_NUM) { - pushed_local = local_bufs[task->worker_type].try_push(task_id); + if (local_bufs) { + int32_t buf_idx = (task->active_mask & 0x01) ? 0 : 1; + pushed_local = local_bufs[buf_idx].try_push(task_id); } if (!pushed_local) { - ready_queues[task->worker_type].push(task_id); + ready_queues[static_cast(shape)].push(task_id); } return true; } @@ -468,12 +473,14 @@ struct PTO2SchedulerState { expected, PTO2_TASK_READY, std::memory_order_acq_rel, std::memory_order_acquire)) { atomic_count += 1; // CAS(task_state PENDING→READY) // Local-first: try per-CoreType thread-local buffer before global queue + PTO2ResourceShape shape = pto2_active_mask_to_shape(task->active_mask); bool pushed_local = false; - if (local_bufs && task->worker_type >= 0 && task->worker_type < PTO2_LOCAL_DISPATCH_TYPE_NUM) { - pushed_local = local_bufs[task->worker_type].try_push(task_id); + if (local_bufs) { + int32_t buf_idx = (task->active_mask & 0x01) ? 0 : 1; + pushed_local = local_bufs[buf_idx].try_push(task_id); } if (!pushed_local) { - ready_queues[task->worker_type].push(task_id, atomic_count, push_wait); + ready_queues[static_cast(shape)].push(task_id, atomic_count, push_wait); } return true; } @@ -489,7 +496,7 @@ struct PTO2SchedulerState { // Reset fanout_refcount for new task lifecycle. // Do NOT reset fanin_refcount — it may have been incremented by - // concurrent on_task_complete between Step 5 and Step 6. + // concurrent on_mixed_task_complete between Step 5 and Step 6. 
fanout_refcount[slot].store(0, std::memory_order_relaxed); #if PTO2_ORCH_PROFILING || PTO2_SCHED_PROFILING @@ -502,9 +509,8 @@ struct PTO2SchedulerState { #endif } - template - int32_t get_ready_task() { - return ready_queues[static_cast(CT)].pop(); + int32_t get_ready_task(PTO2ResourceShape shape) { + return ready_queues[static_cast(shape)].pop(); } template @@ -517,9 +523,8 @@ struct PTO2SchedulerState { } #if PTO2_SCHED_PROFILING - template - int32_t get_ready_task(uint64_t& atomic_count, uint64_t& wait_cycle) { - return ready_queues[static_cast(CT)].pop(atomic_count, wait_cycle); + int32_t get_ready_task(PTO2ResourceShape shape, uint64_t& atomic_count, uint64_t& wait_cycle) { + return ready_queues[static_cast(shape)].pop(atomic_count, wait_cycle); } template @@ -533,6 +538,17 @@ struct PTO2SchedulerState { } #endif + /** + * Requeue a ready task that could not be dispatched (no suitable cluster). + * Pushes the task back into its shape-based queue. + */ + void requeue_ready_task(int32_t task_id) { + int32_t slot = pto2_task_slot(task_id); + PTO2TaskDescriptor& task = pto2_sm_get_task_by_slot(sm_handle, slot); + PTO2ResourceShape shape = pto2_active_mask_to_shape(task.active_mask); + ready_queues[static_cast(shape)].push(task_id); + } + void on_scope_end(const int32_t* task_ids, int32_t count) { #if PTO2_ORCH_PROFILING || PTO2_SCHED_PROFILING extern uint64_t g_orch_scope_end_atomic_count; @@ -546,19 +562,43 @@ struct PTO2SchedulerState { #endif } + /** + * Two-stage completion: first stage. + * Called when a single subtask (AIC, AIV0, or AIV1) finishes. + * Sets the corresponding done bit in subtask_done_mask. + * + * @return true if this subtask was the last one, completing the mixed task. 
+ */ + bool on_subtask_complete(int32_t mixed_task_id, PTO2SubtaskSlot subslot) { + int32_t slot = pto2_task_slot(mixed_task_id); + PTO2TaskDescriptor& task = pto2_sm_get_task_by_slot(sm_handle, slot); + + uint8_t done_bit = (1u << static_cast(subslot)); + uint8_t prev_mask = task.subtask_done_mask.fetch_or(done_bit, std::memory_order_acq_rel); + uint8_t new_mask = prev_mask | done_bit; + + return new_mask == task.active_mask; + } + + /** + * Two-stage completion: second stage. + * Called exactly once when all subtasks of a mixed task are done + * (i.e., on_subtask_complete returned true). + * Handles fanout notification, fanin release, and self-consumption check. + */ #if PTO2_SCHED_PROFILING - PTO2CompletionStats on_task_complete(int32_t task_id, int thread_idx, - PTO2LocalReadyBuffer* local_bufs = nullptr) { - PTO2CompletionStats stats = {0, 0, 0}; + PTO2CompletionStats on_mixed_task_complete(int32_t mixed_task_id, int thread_idx, + PTO2LocalReadyBuffer* local_bufs = nullptr) { + PTO2CompletionStats stats = {0, 0, 0, true}; #elif PTO2_PROFILING - PTO2CompletionStats on_task_complete(int32_t task_id, - PTO2LocalReadyBuffer* local_bufs = nullptr) { - PTO2CompletionStats stats = {0, 0, 0}; + PTO2CompletionStats on_mixed_task_complete(int32_t mixed_task_id, + PTO2LocalReadyBuffer* local_bufs = nullptr) { + PTO2CompletionStats stats = {0, 0, 0, true}; #else - void on_task_complete(int32_t task_id, - PTO2LocalReadyBuffer* local_bufs = nullptr) { + void on_mixed_task_complete(int32_t mixed_task_id, + PTO2LocalReadyBuffer* local_bufs = nullptr) { #endif - int32_t slot = pto2_task_slot(task_id); + int32_t slot = pto2_task_slot(mixed_task_id); PTO2TaskDescriptor& task = pto2_sm_get_task_by_slot(sm_handle, slot); #if PTO2_PROFILING @@ -567,11 +607,8 @@ struct PTO2SchedulerState { #if PTO2_SCHED_PROFILING extern uint64_t g_sched_lock_cycle[], g_sched_fanout_cycle[]; - extern uint64_t g_sched_self_consumed_cycle[]; extern uint64_t g_sched_lock_atomic_count[], 
g_sched_lock_wait_cycle[]; extern uint64_t g_sched_fanout_atomic_count[], g_sched_push_wait_cycle[]; - extern uint64_t g_sched_self_atomic_count[]; - extern uint64_t g_sched_complete_count[]; uint64_t lock_atomics = 0, lock_wait = 0; PTO2_SCHED_CYCLE_START(); #endif @@ -664,7 +701,7 @@ struct PTO2SchedulerState { // Self consumed check #if PTO2_SCHED_PROFILING uint64_t self_atomics = 0; - check_and_handle_consumed(slot, task, self_atomics); + check_and_handle_consumed(slot, pto2_sm_get_task_by_slot(sm_handle, slot), self_atomics); g_sched_self_atomic_count[thread_idx] += self_atomics; PTO2_SCHED_CYCLE_LAP(g_sched_self_consumed_cycle[thread_idx]); g_sched_complete_count[thread_idx]++; @@ -698,7 +735,7 @@ const char* pto2_task_state_name(PTO2TaskState state); #if PTO2_SCHED_PROFILING struct PTO2SchedProfilingData { - // Sub-phase cycle breakdown within on_task_complete + // Sub-phase cycle breakdown within on_mixed_task_complete uint64_t lock_cycle; // pto2_fanout_lock + state store + unlock uint64_t fanout_cycle; // fanout traversal uint64_t fanin_cycle; // fanin traversal diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_submit_types.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_submit_types.h new file mode 100644 index 00000000..177781a3 --- /dev/null +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_submit_types.h @@ -0,0 +1,97 @@ +/** + * PTO Submit Types - Shared submit-contract definitions + * + * Header-only definitions shared by orchestration-facing and runtime-facing + * headers. Keeps orchestration slim (no dependency on pto_runtime2_types.h). 
+ */ + +#ifndef PTO_SUBMIT_TYPES_H +#define PTO_SUBMIT_TYPES_H + +#include <cstdint> + +inline constexpr int32_t INVALID_KERNEL_ID = -1; + +/** + * Subtask slot count: AIC, AIV0, AIV1 + */ +inline constexpr int32_t PTO2_SUBTASK_SLOT_COUNT = 3; + +/** + * Subtask slot indices + */ +enum class PTO2SubtaskSlot : uint8_t { + AIC = 0, + AIV0 = 1, + AIV1 = 2, +}; + +/** + * Subtask mask bits (for active_mask / subtask_done_mask) + */ +inline constexpr uint8_t PTO2_SUBTASK_MASK_AIC = (1u << 0); // 0x1 +inline constexpr uint8_t PTO2_SUBTASK_MASK_AIV0 = (1u << 1); // 0x2 +inline constexpr uint8_t PTO2_SUBTASK_MASK_AIV1 = (1u << 2); // 0x4 + +/** + * Test whether a subtask slot is active in a given mask + */ +static inline bool pto2_subtask_active(uint8_t mask, PTO2SubtaskSlot slot) { + return (mask & (1u << static_cast<uint8_t>(slot))) != 0; +} + +/** + * Mixed-task submit contract. + * + * Each field holds either a valid kernel ID or INVALID_KERNEL_ID (inactive). + * At least one slot must be valid. + */ +struct MixedKernels { + int32_t aic_kernel_id{INVALID_KERNEL_ID}; + int32_t aiv0_kernel_id{INVALID_KERNEL_ID}; + int32_t aiv1_kernel_id{INVALID_KERNEL_ID}; +}; + +/** + * Resource shape — classifies a MixedKernels into one of 5 queue buckets. + */ +enum class PTO2ResourceShape : uint8_t { + AIC_ONLY = 0, // AIC only + AIV_X1 = 1, // One AIV slot + AIV_X2 = 2, // Both AIV slots + AIC_AIV_X1 = 3, // AIC + one AIV + AIC_AIV_X2 = 4, // AIC + both AIV +}; + +inline constexpr int32_t PTO2_NUM_RESOURCE_SHAPES = 5; + +/** + * Derive resource shape from active_mask. + * Caller must ensure active_mask is valid (at least one bit set). 
+ */ +static inline PTO2ResourceShape pto2_active_mask_to_shape(uint8_t active_mask) { + bool has_aic = (active_mask & PTO2_SUBTASK_MASK_AIC) != 0; + int aiv_count = ((active_mask & PTO2_SUBTASK_MASK_AIV0) != 0) + + ((active_mask & PTO2_SUBTASK_MASK_AIV1) != 0); + + if (has_aic) { + if (aiv_count == 0) return PTO2ResourceShape::AIC_ONLY; + if (aiv_count == 1) return PTO2ResourceShape::AIC_AIV_X1; + return PTO2ResourceShape::AIC_AIV_X2; + } + if (aiv_count == 1) return PTO2ResourceShape::AIV_X1; + return PTO2ResourceShape::AIV_X2; +} + +/** + * Compute active_mask from MixedKernels. + */ +static inline uint8_t pto2_mixed_kernels_to_active_mask(const MixedKernels& mk) { + uint8_t mask = 0; + if (mk.aic_kernel_id != INVALID_KERNEL_ID) mask |= PTO2_SUBTASK_MASK_AIC; + if (mk.aiv0_kernel_id != INVALID_KERNEL_ID) mask |= PTO2_SUBTASK_MASK_AIV0; + if (mk.aiv1_kernel_id != INVALID_KERNEL_ID) mask |= PTO2_SUBTASK_MASK_AIV1; + return mask; +} + +#endif // PTO_SUBMIT_TYPES_H diff --git a/tests/device_tests/tensormap_and_ringbuffer/alternating_matmul_add/kernels/orchestration/alternating_orch.cpp b/tests/device_tests/tensormap_and_ringbuffer/alternating_matmul_add/kernels/orchestration/alternating_orch.cpp index 2674005e..a522d153 100644 --- a/tests/device_tests/tensormap_and_ringbuffer/alternating_matmul_add/kernels/orchestration/alternating_orch.cpp +++ b/tests/device_tests/tensormap_and_ringbuffer/alternating_matmul_add/kernels/orchestration/alternating_orch.cpp @@ -121,7 +121,7 @@ void aicpu_orchestration_entry(PTO2Runtime* rt, uint64_t* args, int arg_count) { make_input_param(B_view), make_output_param(C_view), }; - pto2_rt_submit_task(rt, FUNC_MATMUL, PTO2_WORKER_CUBE, params_matmul, 3); + pto2_rt_submit_aic_task(rt, FUNC_MATMUL, params_matmul, 3); total_matmul++; } @@ -142,7 +142,7 @@ void aicpu_orchestration_entry(PTO2Runtime* rt, uint64_t* args, int arg_count) { make_input_param(Y_view), make_output_param(Z_view), }; - pto2_rt_submit_task(rt, FUNC_ADD, 
PTO2_WORKER_VECTOR, params_add, 3); + pto2_rt_submit_aiv_task(rt, FUNC_ADD, params_add, 3); total_add++; } } diff --git a/tests/device_tests/tensormap_and_ringbuffer/batch_paged_attention/kernels/orchestration/paged_attention_orch.cpp b/tests/device_tests/tensormap_and_ringbuffer/batch_paged_attention/kernels/orchestration/paged_attention_orch.cpp index 3c25e9f1..f841e272 100644 --- a/tests/device_tests/tensormap_and_ringbuffer/batch_paged_attention/kernels/orchestration/paged_attention_orch.cpp +++ b/tests/device_tests/tensormap_and_ringbuffer/batch_paged_attention/kernels/orchestration/paged_attention_orch.cpp @@ -135,7 +135,7 @@ void aicpu_orchestration_entry(PTO2Runtime* rt, uint64_t* args, int arg_count, i make_output_param(li_batch), make_output_param(mi_batch), }; - pto2_rt_submit_task(rt, FUNC_AIV_HUB, PTO2_WORKER_VECTOR, params_hub, 3); + pto2_rt_submit_aiv_task(rt, FUNC_AIV_HUB, params_hub, 3); for (uint64_t bn = 0; bn < max_bn; bn++) { PTO2_SCOPE(rt) { @@ -161,7 +161,7 @@ void aicpu_orchestration_entry(PTO2Runtime* rt, uint64_t* args, int arg_count, i make_scalar_param(num_heads), make_scalar_param(batch_start), }; - pto2_rt_submit_task(rt, FUNC_QK_MATMUL, PTO2_WORKER_CUBE, params_qk, 10); + pto2_rt_submit_aic_task(rt, FUNC_QK_MATMUL, params_qk, 10); PTOParam params_sf[] = { make_input_param(sij_b), @@ -174,7 +174,7 @@ void aicpu_orchestration_entry(PTO2Runtime* rt, uint64_t* args, int arg_count, i make_scalar_param(bn), make_scalar_param(batch_start), }; - pto2_rt_submit_task(rt, FUNC_SOFTMAX_PREPARE, PTO2_WORKER_VECTOR, params_sf, 9); + pto2_rt_submit_aiv_task(rt, FUNC_SOFTMAX_PREPARE, params_sf, 9); PTOParam params_pv[] = { make_input_param(pij_b), @@ -186,7 +186,7 @@ void aicpu_orchestration_entry(PTO2Runtime* rt, uint64_t* args, int arg_count, i make_scalar_param(block_num), make_scalar_param(batch_start), }; - pto2_rt_submit_task(rt, FUNC_PV_MATMUL, PTO2_WORKER_CUBE, params_pv, 8); + pto2_rt_submit_aic_task(rt, FUNC_PV_MATMUL, params_pv, 8); 
uint64_t is_first = (bn == 0) ? 1 : 0; uint64_t is_last = (bn == max_bn - 1) ? 1 : 0; @@ -205,7 +205,7 @@ void aicpu_orchestration_entry(PTO2Runtime* rt, uint64_t* args, int arg_count, i make_scalar_param(num_heads), make_scalar_param(batch_start), }; - pto2_rt_submit_task(rt, FUNC_ONLINE_UPDATE, PTO2_WORKER_VECTOR, params_up, 13); + pto2_rt_submit_aiv_task(rt, FUNC_ONLINE_UPDATE, params_up, 13); } } } diff --git a/tests/device_tests/tensormap_and_ringbuffer/benchmark_bgemm/kernels/orchestration/bgemm_orch.cpp b/tests/device_tests/tensormap_and_ringbuffer/benchmark_bgemm/kernels/orchestration/bgemm_orch.cpp index 563795a5..fb65329c 100644 --- a/tests/device_tests/tensormap_and_ringbuffer/benchmark_bgemm/kernels/orchestration/bgemm_orch.cpp +++ b/tests/device_tests/tensormap_and_ringbuffer/benchmark_bgemm/kernels/orchestration/bgemm_orch.cpp @@ -124,7 +124,7 @@ void aicpu_orchestration_entry(PTO2Runtime* rt, uint64_t* args, int arg_count) { make_output_param(P), make_input_param(ext_config), }; - pto2_rt_submit_task(rt, FUNC_GEMM_TILE, PTO2_WORKER_CUBE, + pto2_rt_submit_aic_task(rt, FUNC_GEMM_TILE, params_gemm, 4); total_gemm++; @@ -133,7 +133,7 @@ void aicpu_orchestration_entry(PTO2Runtime* rt, uint64_t* args, int arg_count) { make_input_param(P), make_input_param(ext_config), }; - pto2_rt_submit_task(rt, FUNC_TILE_ADD, PTO2_WORKER_VECTOR, + pto2_rt_submit_aiv_task(rt, FUNC_TILE_ADD, params_add, 3); total_add++; } diff --git a/tests/device_tests/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp b/tests/device_tests/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp index a3417a8c..3f061be8 100644 --- a/tests/device_tests/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp +++ b/tests/device_tests/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp @@ -167,7 +167,7 @@ __attribute__((visibility("default"))) void 
aicpu_orchestration_entry(PTO2Runtim make_output_param(mi_update), }; CYCLE_COUNT_LAP(prof_param_setup); - pto2_rt_submit_task(rt, FUNC_AIV_HUB, PTO2_WORKER_VECTOR, params_inplace, 3); + pto2_rt_submit_aiv_task(rt, FUNC_AIV_HUB, params_inplace, 3); prof_submit_count++; CYCLE_COUNT_LAP(prof_submit_task); @@ -195,7 +195,7 @@ __attribute__((visibility("default"))) void aicpu_orchestration_entry(PTO2Runtim make_output_param(sij), }; CYCLE_COUNT_LAP(prof_param_setup); - pto2_rt_submit_task(rt, FUNC_QK_MATMUL, PTO2_WORKER_CUBE, params_qk, 3); + pto2_rt_submit_aic_task(rt, FUNC_QK_MATMUL, params_qk, 3); prof_submit_count++; CYCLE_COUNT_LAP(prof_submit_task); @@ -218,7 +218,7 @@ __attribute__((visibility("default"))) void aicpu_orchestration_entry(PTO2Runtim make_output_param(li), }; CYCLE_COUNT_LAP(prof_param_setup); - pto2_rt_submit_task(rt, FUNC_SOFTMAX_PREPARE, PTO2_WORKER_VECTOR, params_sf, 5); + pto2_rt_submit_aiv_task(rt, FUNC_SOFTMAX_PREPARE, params_sf, 5); prof_submit_count++; CYCLE_COUNT_LAP(prof_submit_task); @@ -233,7 +233,7 @@ __attribute__((visibility("default"))) void aicpu_orchestration_entry(PTO2Runtim make_output_param(oi_tmp), }; CYCLE_COUNT_LAP(prof_param_setup); - pto2_rt_submit_task(rt, FUNC_PV_MATMUL, PTO2_WORKER_CUBE, params_pv, 3); + pto2_rt_submit_aic_task(rt, FUNC_PV_MATMUL, params_pv, 3); prof_submit_count++; CYCLE_COUNT_LAP(prof_submit_task); @@ -253,7 +253,7 @@ __attribute__((visibility("default"))) void aicpu_orchestration_entry(PTO2Runtim make_scalar_param(is_last), }; CYCLE_COUNT_LAP(prof_param_setup); - pto2_rt_submit_task(rt, FUNC_ONLINE_UPDATE, PTO2_WORKER_VECTOR, params_up, 9); + pto2_rt_submit_aiv_task(rt, FUNC_ONLINE_UPDATE, params_up, 9); prof_submit_count++; CYCLE_COUNT_LAP(prof_submit_task); } diff --git a/tests/device_tests/tensormap_and_ringbuffer/paged_attention_unroll/kernels/orchestration/paged_attention_orch.cpp b/tests/device_tests/tensormap_and_ringbuffer/paged_attention_unroll/kernels/orchestration/paged_attention_orch.cpp 
index cf028cb6..8e67888e 100644 --- a/tests/device_tests/tensormap_and_ringbuffer/paged_attention_unroll/kernels/orchestration/paged_attention_orch.cpp +++ b/tests/device_tests/tensormap_and_ringbuffer/paged_attention_unroll/kernels/orchestration/paged_attention_orch.cpp @@ -159,7 +159,7 @@ __attribute__((visibility("default"))) void aicpu_orchestration_entry(PTO2Runtim make_output_param(mi_update), }; CYCLE_COUNT_LAP(prof_param_setup); - pto2_rt_submit_task(rt, FUNC_AIV_HUB, PTO2_WORKER_VECTOR, params_inplace, 3); + pto2_rt_submit_aiv_task(rt, FUNC_AIV_HUB, params_inplace, 3); prof_submit_count++; CYCLE_COUNT_LAP(prof_submit_task); @@ -198,7 +198,7 @@ __attribute__((visibility("default"))) void aicpu_orchestration_entry(PTO2Runtim make_scalar_param(block_indices[7]), }; CYCLE_COUNT_LAP(prof_param_setup); - pto2_rt_submit_task(rt, FUNC_QK_MATMUL, PTO2_WORKER_CUBE, params_qk, 12); + pto2_rt_submit_aic_task(rt, FUNC_QK_MATMUL, params_qk, 12); prof_submit_count++; CYCLE_COUNT_LAP(prof_submit_task); @@ -220,7 +220,7 @@ __attribute__((visibility("default"))) void aicpu_orchestration_entry(PTO2Runtim make_scalar_param(valid_len_last), }; CYCLE_COUNT_LAP(prof_param_setup); - pto2_rt_submit_task(rt, FUNC_SOFTMAX_PREPARE, PTO2_WORKER_VECTOR, params_sf, 7); + pto2_rt_submit_aiv_task(rt, FUNC_SOFTMAX_PREPARE, params_sf, 7); prof_submit_count++; CYCLE_COUNT_LAP(prof_submit_task); @@ -245,7 +245,7 @@ __attribute__((visibility("default"))) void aicpu_orchestration_entry(PTO2Runtim make_scalar_param(block_indices[7]), }; CYCLE_COUNT_LAP(prof_param_setup); - pto2_rt_submit_task(rt, FUNC_PV_MATMUL, PTO2_WORKER_CUBE, params_pv, 12); + pto2_rt_submit_aic_task(rt, FUNC_PV_MATMUL, params_pv, 12); prof_submit_count++; CYCLE_COUNT_LAP(prof_submit_task); @@ -266,7 +266,7 @@ __attribute__((visibility("default"))) void aicpu_orchestration_entry(PTO2Runtim make_scalar_param(is_last), }; CYCLE_COUNT_LAP(prof_param_setup); - pto2_rt_submit_task(rt, FUNC_ONLINE_UPDATE, PTO2_WORKER_VECTOR, 
params_up, 9); + pto2_rt_submit_aiv_task(rt, FUNC_ONLINE_UPDATE, params_up, 9); prof_submit_count++; CYCLE_COUNT_LAP(prof_submit_task); } From 83537f90c388625d727a3a4993a496758fd7ef6d Mon Sep 17 00:00:00 2001 From: ChaoZheng109 Date: Wed, 11 Mar 2026 20:07:02 +0800 Subject: [PATCH 3/3] Refactor: reorganize examples and tests by architecture prefix - Move examples from runtime-first layout (host_build_graph/, aicpu_build_graph/, tensormap_and_ringbuffer/) to arch-first layout (a2a3//, a5//) - Move device tests to matching tests/device_tests// layout - Update ci.sh to extract arch from path and track per-task platforms, replacing global HW_PLATFORM/SIM_PLATFORM variables - Add print_log_on_fail param to run_task() and fix attempt number display (off-by-one) in summary output - Update benchmark_rounds.sh with -p/--platform flag to derive arch from platform name - Update CLAUDE.md example path to new layout --- CLAUDE.md | 4 +- ci.sh | 63 ++-- .../aicpu_build_graph/bgemm/README.md | 0 .../aicpu_build_graph/bgemm/golden.py | 0 .../bgemm/kernels/aic/kernel_gemm_tile.cpp | 0 .../bgemm/kernels/aiv/kernel_tile_add.cpp | 0 .../bgemm/kernels/kernel_config.py | 0 .../kernels/orchestration/bgemm_orch.cpp | 0 .../docs/INCORE_ORCHESTRATION_GUIDE.md | 0 .../vector_example/README.md | 0 .../vector_example/golden.py | 0 .../vector_example/kernels/aiv/kernel_add.cpp | 0 .../kernels/aiv/kernel_add_scalar.cpp | 0 .../vector_example/kernels/aiv/kernel_mul.cpp | 0 .../vector_example/kernels/kernel_config.py | 0 .../kernels/orchestration/orchestration.cpp | 0 .../host_build_graph/bgemm/README.md | 0 .../host_build_graph/bgemm/golden.py | 0 .../bgemm/kernels/aic/kernel_gemm_tile.cpp | 0 .../bgemm/kernels/aiv/kernel_tile_add.cpp | 0 .../bgemm/kernels/kernel_config.py | 0 .../kernels/orchestration/bgemm_orch.cpp | 0 .../docs/INCORE_ORCHESTRATION_GUIDE.md | 0 .../host_build_graph/matmul/golden.py | 0 .../matmul/kernels/aic/kernel_matmul.cpp | 0 .../matmul/kernels/aiv/kernel_add_exp.cpp | 0 
.../matmul/kernels/aiv/kernel_log_sqrt.cpp | 0 .../matmul/kernels/kernel_config.py | 0 .../kernels/orchestration/matmul_orch.cpp | 0 .../paged_attention/golden.py | 0 .../kernels/aic/aic_pv_matmul.cpp | 0 .../kernels/aic/aic_qk_matmul.cpp | 0 .../kernels/aiv/aiv_online_update.cpp | 0 .../kernels/aiv/aiv_softmax_prepare.cpp | 0 .../paged_attention/kernels/kernel_config.py | 0 .../orchestration/paged_attention_orch.cpp | 0 .../host_build_graph/vector_example/README.md | 0 .../host_build_graph/vector_example/golden.py | 0 .../vector_example/kernels/aiv/kernel_add.cpp | 0 .../kernels/aiv/kernel_add_scalar.cpp | 0 .../vector_example/kernels/aiv/kernel_mul.cpp | 0 .../vector_example/kernels/kernel_config.py | 0 .../kernels/orchestration/example_orch.cpp | 0 .../batch_paged_attention/golden.py | 0 .../kernels/aic/aic_hub.cpp | 0 .../kernels/aic/aic_pv_matmul.cpp | 0 .../kernels/aic/aic_qk_matmul.cpp | 0 .../kernels/aiv/aiv_hub.cpp | 0 .../kernels/aiv/aiv_online_update.cpp | 0 .../kernels/aiv/aiv_softmax_prepare.cpp | 0 .../kernels/kernel_config.py | 0 .../orchestration/paged_attention_orch.cpp | 0 .../tensormap_and_ringbuffer/bgemm/golden.py | 0 .../bgemm/kernels/aic/kernel_gemm_tile.cpp | 0 .../bgemm/kernels/aiv/kernel_tile_add.cpp | 0 .../bgemm/kernels/kernel_config.py | 0 .../kernels/orchestration/bgemm_orch.cpp | 0 .../docs/INCORE_ORCHESTRATION_GUIDE.md | 0 .../mixed_example/golden.py | 0 .../kernels/aic/kernel_matmul.cpp | 0 .../mixed_example/kernels/aiv/kernel_add.cpp | 0 .../kernels/aiv/kernel_add_standalone.cpp | 0 .../mixed_example/kernels/aiv/kernel_mul.cpp | 0 .../kernels/aiv/kernel_mul_standalone.cpp | 0 .../mixed_example/kernels/kernel_config.py | 0 .../kernels/orchestration/mixed_orch.cpp | 0 .../multi-round-paged-attention/golden.py | 0 .../kernels/kernel_config.py | 0 .../paged_attention/TFILLPAD_INPLACE_BUG.md | 0 .../paged_attention/golden.py | 0 .../paged_attention/kernels/aic/aic_hub.cpp | 0 .../kernels/aic/aic_pv_matmul.cpp | 0 
.../kernels/aic/aic_qk_matmul.cpp | 0 .../paged_attention/kernels/aiv/aiv_hub.cpp | 0 .../kernels/aiv/aiv_online_update.cpp | 0 .../kernels/aiv/aiv_softmax_prepare.cpp | 0 .../paged_attention/kernels/kernel_config.py | 0 .../orchestration/paged_attention_orch.cpp | 0 .../vector_example/golden.py | 0 .../vector_example/kernels/aiv/kernel_add.cpp | 0 .../kernels/aiv/kernel_add_scalar.cpp | 0 .../vector_example/kernels/aiv/kernel_mul.cpp | 0 .../vector_example/kernels/kernel_config.py | 0 .../orchestration/example_orchestration.cpp | 0 .../paged_attention/golden.py | 45 +++ .../kernels/aic/aic_pv_matmul.cpp | 90 ++++++ .../kernels/aic/aic_qk_matmul.cpp | 91 ++++++ .../kernels/aiv/aiv_online_update.cpp | 220 +++++++++++++ .../kernels/aiv/aiv_softmax_prepare.cpp | 94 ++++++ .../paged_attention/kernels/kernel_config.py | 42 +++ .../orchestration/paged_attention_orch.cpp | 256 +++++++++++++++ .../paged_attention/TFILLPAD_INPLACE_BUG.md | 205 ++++++++++++ .../paged_attention/golden.py | 67 ++++ .../paged_attention/kernels/aic/aic_hub.cpp | 0 .../kernels/aic/aic_pv_matmul.cpp | 89 ++++++ .../kernels/aic/aic_qk_matmul.cpp | 90 ++++++ .../paged_attention/kernels/aiv/aiv_hub.cpp | 0 .../kernels/aiv/aiv_online_update.cpp | 224 +++++++++++++ .../kernels/aiv/aiv_softmax_prepare.cpp | 147 +++++++++ .../paged_attention/kernels/kernel_config.py | 46 +++ .../orchestration/paged_attention_orch.cpp | 214 +++++++++++++ .../paged_attention/README.md | 0 .../paged_attention/golden.py | 0 .../kernels/aic/aic_pv_matmul.cpp | 0 .../kernels/aic/aic_qk_matmul.cpp | 0 .../kernels/aiv/aiv_online_update.cpp | 0 .../kernels/aiv/aiv_softmax_prepare.cpp | 0 .../paged_attention/kernels/kernel_config.py | 0 .../orchestration/paged_attention_orch.cpp | 0 .../paged_attention/README.md | 0 .../paged_attention/golden.py | 0 .../kernels/aic/aic_pv_matmul.cpp | 0 .../kernels/aic/aic_qk_matmul.cpp | 0 .../kernels/aiv/aiv_online_update.cpp | 0 .../kernels/aiv/aiv_softmax_prepare.cpp | 0 
.../paged_attention/kernels/kernel_config.py | 0 .../orchestration/paged_attention_orch.cpp | 0 .../alternating_matmul_add/golden.py | 0 .../kernels/aic/kernel_matmul.cpp | 0 .../kernels/aiv/kernel_add.cpp | 0 .../kernels/kernel_config.py | 0 .../orchestration/alternating_orch.cpp | 0 .../batch_paged_attention/golden.py | 0 .../kernels/aic/aic_hub.cpp | 0 .../kernels/aic/aic_pv_matmul.cpp | 0 .../kernels/aic/aic_qk_matmul.cpp | 0 .../kernels/aiv/aiv_hub.cpp | 0 .../kernels/aiv/aiv_online_update.cpp | 0 .../kernels/aiv/aiv_softmax_prepare.cpp | 0 .../kernels/kernel_config.py | 0 .../orchestration/paged_attention_orch.cpp | 0 .../benchmark_bgemm/golden.py | 0 .../kernels/aic/kernel_gemm_tile.cpp | 0 .../kernels/aiv/kernel_tile_add.cpp | 0 .../benchmark_bgemm/kernels/kernel_config.py | 0 .../kernels/orchestration/bgemm_orch.cpp | 0 .../paged_attention/golden.py | 0 .../paged_attention}/kernels/aic/aic_hub.cpp | 0 .../kernels/aic/aic_pv_matmul.cpp | 0 .../kernels/aic/aic_qk_matmul.cpp | 0 .../paged_attention}/kernels/aiv/aiv_hub.cpp | 0 .../kernels/aiv/aiv_online_update.cpp | 0 .../kernels/aiv/aiv_softmax_prepare.cpp | 0 .../paged_attention/kernels/kernel_config.py | 0 .../orchestration/paged_attention_orch.cpp | 0 .../paged_attention_unroll/golden.py | 0 .../kernels/aic/aic_hub.cpp | 18 ++ .../kernels/aic/aic_pv_matmul.cpp | 0 .../kernels/aic/aic_qk_matmul.cpp | 0 .../kernels/aiv/aiv_hub.cpp | 18 ++ .../kernels/aiv/aiv_online_update.cpp | 0 .../kernels/aiv/aiv_softmax_prepare.cpp | 0 .../kernels/kernel_config.py | 0 .../orchestration/paged_attention_orch.cpp | 0 .../paged_attention/README.md | 192 ++++++++++++ .../paged_attention/golden.py | 45 +++ .../kernels/aic/aic_pv_matmul.cpp | 97 ++++++ .../kernels/aic/aic_qk_matmul.cpp | 98 ++++++ .../kernels/aiv/aiv_online_update.cpp | 227 ++++++++++++++ .../kernels/aiv/aiv_softmax_prepare.cpp | 123 ++++++++ .../paged_attention/kernels/kernel_config.py | 43 +++ .../orchestration/paged_attention_orch.cpp | 261 ++++++++++++++++ 
.../paged_attention/golden.py | 55 ++++ .../paged_attention/kernels/aic/aic_hub.cpp | 18 ++ .../kernels/aic/aic_pv_matmul.cpp | 97 ++++++ .../kernels/aic/aic_qk_matmul.cpp | 98 ++++++ .../paged_attention/kernels/aiv/aiv_hub.cpp | 18 ++ .../kernels/aiv/aiv_online_update.cpp | 232 ++++++++++++++ .../kernels/aiv/aiv_softmax_prepare.cpp | 128 ++++++++ .../paged_attention/kernels/kernel_config.py | 45 +++ .../orchestration/paged_attention_orch.cpp | 294 ++++++++++++++++++ tools/benchmark_rounds.sh | 22 +- 172 files changed, 4088 insertions(+), 28 deletions(-) rename examples/{ => a2a3}/aicpu_build_graph/bgemm/README.md (100%) rename examples/{ => a2a3}/aicpu_build_graph/bgemm/golden.py (100%) rename examples/{ => a2a3}/aicpu_build_graph/bgemm/kernels/aic/kernel_gemm_tile.cpp (100%) rename examples/{ => a2a3}/aicpu_build_graph/bgemm/kernels/aiv/kernel_tile_add.cpp (100%) rename examples/{ => a2a3}/aicpu_build_graph/bgemm/kernels/kernel_config.py (100%) rename examples/{ => a2a3}/aicpu_build_graph/bgemm/kernels/orchestration/bgemm_orch.cpp (100%) rename examples/{ => a2a3}/aicpu_build_graph/docs/INCORE_ORCHESTRATION_GUIDE.md (100%) rename examples/{ => a2a3}/aicpu_build_graph/vector_example/README.md (100%) rename examples/{ => a2a3}/aicpu_build_graph/vector_example/golden.py (100%) rename examples/{ => a2a3}/aicpu_build_graph/vector_example/kernels/aiv/kernel_add.cpp (100%) rename examples/{ => a2a3}/aicpu_build_graph/vector_example/kernels/aiv/kernel_add_scalar.cpp (100%) rename examples/{ => a2a3}/aicpu_build_graph/vector_example/kernels/aiv/kernel_mul.cpp (100%) rename examples/{ => a2a3}/aicpu_build_graph/vector_example/kernels/kernel_config.py (100%) rename examples/{ => a2a3}/aicpu_build_graph/vector_example/kernels/orchestration/orchestration.cpp (100%) rename examples/{ => a2a3}/host_build_graph/bgemm/README.md (100%) rename examples/{ => a2a3}/host_build_graph/bgemm/golden.py (100%) rename examples/{ => 
a2a3}/host_build_graph/bgemm/kernels/aic/kernel_gemm_tile.cpp (100%) rename examples/{ => a2a3}/host_build_graph/bgemm/kernels/aiv/kernel_tile_add.cpp (100%) rename examples/{ => a2a3}/host_build_graph/bgemm/kernels/kernel_config.py (100%) rename examples/{ => a2a3}/host_build_graph/bgemm/kernels/orchestration/bgemm_orch.cpp (100%) rename examples/{ => a2a3}/host_build_graph/docs/INCORE_ORCHESTRATION_GUIDE.md (100%) rename examples/{ => a2a3}/host_build_graph/matmul/golden.py (100%) rename examples/{ => a2a3}/host_build_graph/matmul/kernels/aic/kernel_matmul.cpp (100%) rename examples/{ => a2a3}/host_build_graph/matmul/kernels/aiv/kernel_add_exp.cpp (100%) rename examples/{ => a2a3}/host_build_graph/matmul/kernels/aiv/kernel_log_sqrt.cpp (100%) rename examples/{ => a2a3}/host_build_graph/matmul/kernels/kernel_config.py (100%) rename examples/{ => a2a3}/host_build_graph/matmul/kernels/orchestration/matmul_orch.cpp (100%) rename examples/{ => a2a3}/host_build_graph/paged_attention/golden.py (100%) rename examples/{ => a2a3}/host_build_graph/paged_attention/kernels/aic/aic_pv_matmul.cpp (100%) rename examples/{ => a2a3}/host_build_graph/paged_attention/kernels/aic/aic_qk_matmul.cpp (100%) rename examples/{ => a2a3}/host_build_graph/paged_attention/kernels/aiv/aiv_online_update.cpp (100%) rename examples/{ => a2a3}/host_build_graph/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp (100%) rename examples/{ => a2a3}/host_build_graph/paged_attention/kernels/kernel_config.py (100%) rename examples/{ => a2a3}/host_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp (100%) rename examples/{ => a2a3}/host_build_graph/vector_example/README.md (100%) rename examples/{ => a2a3}/host_build_graph/vector_example/golden.py (100%) rename examples/{ => a2a3}/host_build_graph/vector_example/kernels/aiv/kernel_add.cpp (100%) rename examples/{ => a2a3}/host_build_graph/vector_example/kernels/aiv/kernel_add_scalar.cpp (100%) rename examples/{ => 
a2a3}/host_build_graph/vector_example/kernels/aiv/kernel_mul.cpp (100%) rename examples/{ => a2a3}/host_build_graph/vector_example/kernels/kernel_config.py (100%) rename examples/{ => a2a3}/host_build_graph/vector_example/kernels/orchestration/example_orch.cpp (100%) rename examples/{ => a2a3}/tensormap_and_ringbuffer/batch_paged_attention/golden.py (100%) rename examples/{ => a2a3}/tensormap_and_ringbuffer/batch_paged_attention/kernels/aic/aic_hub.cpp (100%) rename examples/{ => a2a3}/tensormap_and_ringbuffer/batch_paged_attention/kernels/aic/aic_pv_matmul.cpp (100%) rename examples/{ => a2a3}/tensormap_and_ringbuffer/batch_paged_attention/kernels/aic/aic_qk_matmul.cpp (100%) rename examples/{ => a2a3}/tensormap_and_ringbuffer/batch_paged_attention/kernels/aiv/aiv_hub.cpp (100%) rename examples/{ => a2a3}/tensormap_and_ringbuffer/batch_paged_attention/kernels/aiv/aiv_online_update.cpp (100%) rename examples/{ => a2a3}/tensormap_and_ringbuffer/batch_paged_attention/kernels/aiv/aiv_softmax_prepare.cpp (100%) rename examples/{ => a2a3}/tensormap_and_ringbuffer/batch_paged_attention/kernels/kernel_config.py (100%) rename examples/{ => a2a3}/tensormap_and_ringbuffer/batch_paged_attention/kernels/orchestration/paged_attention_orch.cpp (100%) rename examples/{ => a2a3}/tensormap_and_ringbuffer/bgemm/golden.py (100%) rename examples/{ => a2a3}/tensormap_and_ringbuffer/bgemm/kernels/aic/kernel_gemm_tile.cpp (100%) rename examples/{ => a2a3}/tensormap_and_ringbuffer/bgemm/kernels/aiv/kernel_tile_add.cpp (100%) rename examples/{ => a2a3}/tensormap_and_ringbuffer/bgemm/kernels/kernel_config.py (100%) rename examples/{ => a2a3}/tensormap_and_ringbuffer/bgemm/kernels/orchestration/bgemm_orch.cpp (100%) rename examples/{ => a2a3}/tensormap_and_ringbuffer/docs/INCORE_ORCHESTRATION_GUIDE.md (100%) rename examples/{ => a2a3}/tensormap_and_ringbuffer/mixed_example/golden.py (100%) rename examples/{ => a2a3}/tensormap_and_ringbuffer/mixed_example/kernels/aic/kernel_matmul.cpp (100%) 
rename examples/{ => a2a3}/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_add.cpp (100%) rename examples/{ => a2a3}/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_add_standalone.cpp (100%) rename examples/{ => a2a3}/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_mul.cpp (100%) rename examples/{ => a2a3}/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_mul_standalone.cpp (100%) rename examples/{ => a2a3}/tensormap_and_ringbuffer/mixed_example/kernels/kernel_config.py (100%) rename examples/{ => a2a3}/tensormap_and_ringbuffer/mixed_example/kernels/orchestration/mixed_orch.cpp (100%) rename examples/{ => a2a3}/tensormap_and_ringbuffer/multi-round-paged-attention/golden.py (100%) rename examples/{ => a2a3}/tensormap_and_ringbuffer/multi-round-paged-attention/kernels/kernel_config.py (100%) rename examples/{ => a2a3}/tensormap_and_ringbuffer/paged_attention/TFILLPAD_INPLACE_BUG.md (100%) rename examples/{ => a2a3}/tensormap_and_ringbuffer/paged_attention/golden.py (100%) rename examples/{ => a2a3}/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_hub.cpp (100%) rename examples/{ => a2a3}/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_pv_matmul.cpp (100%) rename examples/{ => a2a3}/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_qk_matmul.cpp (100%) rename examples/{ => a2a3}/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_hub.cpp (100%) rename examples/{ => a2a3}/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_online_update.cpp (100%) rename examples/{ => a2a3}/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp (100%) rename examples/{ => a2a3}/tensormap_and_ringbuffer/paged_attention/kernels/kernel_config.py (100%) rename examples/{ => a2a3}/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp (100%) rename examples/{ => a2a3}/tensormap_and_ringbuffer/vector_example/golden.py (100%) rename examples/{ => 
a2a3}/tensormap_and_ringbuffer/vector_example/kernels/aiv/kernel_add.cpp (100%) rename examples/{ => a2a3}/tensormap_and_ringbuffer/vector_example/kernels/aiv/kernel_add_scalar.cpp (100%) rename examples/{ => a2a3}/tensormap_and_ringbuffer/vector_example/kernels/aiv/kernel_mul.cpp (100%) rename examples/{ => a2a3}/tensormap_and_ringbuffer/vector_example/kernels/kernel_config.py (100%) rename examples/{ => a2a3}/tensormap_and_ringbuffer/vector_example/kernels/orchestration/example_orchestration.cpp (100%) create mode 100644 examples/a5/host_build_graph/paged_attention/golden.py create mode 100644 examples/a5/host_build_graph/paged_attention/kernels/aic/aic_pv_matmul.cpp create mode 100644 examples/a5/host_build_graph/paged_attention/kernels/aic/aic_qk_matmul.cpp create mode 100644 examples/a5/host_build_graph/paged_attention/kernels/aiv/aiv_online_update.cpp create mode 100644 examples/a5/host_build_graph/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp create mode 100644 examples/a5/host_build_graph/paged_attention/kernels/kernel_config.py create mode 100644 examples/a5/host_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp create mode 100644 examples/a5/tensormap_and_ringbuffer/paged_attention/TFILLPAD_INPLACE_BUG.md create mode 100644 examples/a5/tensormap_and_ringbuffer/paged_attention/golden.py rename {tests/device_tests => examples/a5}/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_hub.cpp (100%) create mode 100644 examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_pv_matmul.cpp create mode 100644 examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_qk_matmul.cpp rename {tests/device_tests => examples/a5}/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_hub.cpp (100%) create mode 100644 examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_online_update.cpp create mode 100644 examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp 
create mode 100644 examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/kernel_config.py create mode 100644 examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp rename tests/device_tests/{ => a2a3}/aicpu_build_graph/paged_attention/README.md (100%) rename tests/device_tests/{ => a2a3}/aicpu_build_graph/paged_attention/golden.py (100%) rename tests/device_tests/{ => a2a3}/aicpu_build_graph/paged_attention/kernels/aic/aic_pv_matmul.cpp (100%) rename tests/device_tests/{ => a2a3}/aicpu_build_graph/paged_attention/kernels/aic/aic_qk_matmul.cpp (100%) rename tests/device_tests/{ => a2a3}/aicpu_build_graph/paged_attention/kernels/aiv/aiv_online_update.cpp (100%) rename tests/device_tests/{ => a2a3}/aicpu_build_graph/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp (100%) rename tests/device_tests/{ => a2a3}/aicpu_build_graph/paged_attention/kernels/kernel_config.py (100%) rename tests/device_tests/{ => a2a3}/aicpu_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp (100%) rename tests/device_tests/{ => a2a3}/host_build_graph/paged_attention/README.md (100%) rename tests/device_tests/{ => a2a3}/host_build_graph/paged_attention/golden.py (100%) rename tests/device_tests/{ => a2a3}/host_build_graph/paged_attention/kernels/aic/aic_pv_matmul.cpp (100%) rename tests/device_tests/{ => a2a3}/host_build_graph/paged_attention/kernels/aic/aic_qk_matmul.cpp (100%) rename tests/device_tests/{ => a2a3}/host_build_graph/paged_attention/kernels/aiv/aiv_online_update.cpp (100%) rename tests/device_tests/{ => a2a3}/host_build_graph/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp (100%) rename tests/device_tests/{ => a2a3}/host_build_graph/paged_attention/kernels/kernel_config.py (100%) rename tests/device_tests/{ => a2a3}/host_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp (100%) rename tests/device_tests/{ => a2a3}/tensormap_and_ringbuffer/alternating_matmul_add/golden.py 
(100%) rename tests/device_tests/{ => a2a3}/tensormap_and_ringbuffer/alternating_matmul_add/kernels/aic/kernel_matmul.cpp (100%) rename tests/device_tests/{ => a2a3}/tensormap_and_ringbuffer/alternating_matmul_add/kernels/aiv/kernel_add.cpp (100%) rename tests/device_tests/{ => a2a3}/tensormap_and_ringbuffer/alternating_matmul_add/kernels/kernel_config.py (100%) rename tests/device_tests/{ => a2a3}/tensormap_and_ringbuffer/alternating_matmul_add/kernels/orchestration/alternating_orch.cpp (100%) rename tests/device_tests/{ => a2a3}/tensormap_and_ringbuffer/batch_paged_attention/golden.py (100%) rename tests/device_tests/{ => a2a3}/tensormap_and_ringbuffer/batch_paged_attention/kernels/aic/aic_hub.cpp (100%) rename tests/device_tests/{ => a2a3}/tensormap_and_ringbuffer/batch_paged_attention/kernels/aic/aic_pv_matmul.cpp (100%) rename tests/device_tests/{ => a2a3}/tensormap_and_ringbuffer/batch_paged_attention/kernels/aic/aic_qk_matmul.cpp (100%) rename tests/device_tests/{ => a2a3}/tensormap_and_ringbuffer/batch_paged_attention/kernels/aiv/aiv_hub.cpp (100%) rename tests/device_tests/{ => a2a3}/tensormap_and_ringbuffer/batch_paged_attention/kernels/aiv/aiv_online_update.cpp (100%) rename tests/device_tests/{ => a2a3}/tensormap_and_ringbuffer/batch_paged_attention/kernels/aiv/aiv_softmax_prepare.cpp (100%) rename tests/device_tests/{ => a2a3}/tensormap_and_ringbuffer/batch_paged_attention/kernels/kernel_config.py (100%) rename tests/device_tests/{ => a2a3}/tensormap_and_ringbuffer/batch_paged_attention/kernels/orchestration/paged_attention_orch.cpp (100%) rename tests/device_tests/{ => a2a3}/tensormap_and_ringbuffer/benchmark_bgemm/golden.py (100%) rename tests/device_tests/{ => a2a3}/tensormap_and_ringbuffer/benchmark_bgemm/kernels/aic/kernel_gemm_tile.cpp (100%) rename tests/device_tests/{ => a2a3}/tensormap_and_ringbuffer/benchmark_bgemm/kernels/aiv/kernel_tile_add.cpp (100%) rename tests/device_tests/{ => 
a2a3}/tensormap_and_ringbuffer/benchmark_bgemm/kernels/kernel_config.py (100%) rename tests/device_tests/{ => a2a3}/tensormap_and_ringbuffer/benchmark_bgemm/kernels/orchestration/bgemm_orch.cpp (100%) rename tests/device_tests/{ => a2a3}/tensormap_and_ringbuffer/paged_attention/golden.py (100%) rename tests/device_tests/{tensormap_and_ringbuffer/paged_attention_unroll => a2a3/tensormap_and_ringbuffer/paged_attention}/kernels/aic/aic_hub.cpp (100%) rename tests/device_tests/{ => a2a3}/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_pv_matmul.cpp (100%) rename tests/device_tests/{ => a2a3}/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_qk_matmul.cpp (100%) rename tests/device_tests/{tensormap_and_ringbuffer/paged_attention_unroll => a2a3/tensormap_and_ringbuffer/paged_attention}/kernels/aiv/aiv_hub.cpp (100%) rename tests/device_tests/{ => a2a3}/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_online_update.cpp (100%) rename tests/device_tests/{ => a2a3}/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp (100%) rename tests/device_tests/{ => a2a3}/tensormap_and_ringbuffer/paged_attention/kernels/kernel_config.py (100%) rename tests/device_tests/{ => a2a3}/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp (100%) rename tests/device_tests/{ => a2a3}/tensormap_and_ringbuffer/paged_attention_unroll/golden.py (100%) create mode 100644 tests/device_tests/a2a3/tensormap_and_ringbuffer/paged_attention_unroll/kernels/aic/aic_hub.cpp rename tests/device_tests/{ => a2a3}/tensormap_and_ringbuffer/paged_attention_unroll/kernels/aic/aic_pv_matmul.cpp (100%) rename tests/device_tests/{ => a2a3}/tensormap_and_ringbuffer/paged_attention_unroll/kernels/aic/aic_qk_matmul.cpp (100%) create mode 100644 tests/device_tests/a2a3/tensormap_and_ringbuffer/paged_attention_unroll/kernels/aiv/aiv_hub.cpp rename tests/device_tests/{ => 
a2a3}/tensormap_and_ringbuffer/paged_attention_unroll/kernels/aiv/aiv_online_update.cpp (100%) rename tests/device_tests/{ => a2a3}/tensormap_and_ringbuffer/paged_attention_unroll/kernels/aiv/aiv_softmax_prepare.cpp (100%) rename tests/device_tests/{ => a2a3}/tensormap_and_ringbuffer/paged_attention_unroll/kernels/kernel_config.py (100%) rename tests/device_tests/{ => a2a3}/tensormap_and_ringbuffer/paged_attention_unroll/kernels/orchestration/paged_attention_orch.cpp (100%) create mode 100644 tests/device_tests/a5/host_build_graph/paged_attention/README.md create mode 100644 tests/device_tests/a5/host_build_graph/paged_attention/golden.py create mode 100644 tests/device_tests/a5/host_build_graph/paged_attention/kernels/aic/aic_pv_matmul.cpp create mode 100644 tests/device_tests/a5/host_build_graph/paged_attention/kernels/aic/aic_qk_matmul.cpp create mode 100644 tests/device_tests/a5/host_build_graph/paged_attention/kernels/aiv/aiv_online_update.cpp create mode 100644 tests/device_tests/a5/host_build_graph/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp create mode 100644 tests/device_tests/a5/host_build_graph/paged_attention/kernels/kernel_config.py create mode 100644 tests/device_tests/a5/host_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp create mode 100644 tests/device_tests/a5/tensormap_and_ringbuffer/paged_attention/golden.py create mode 100644 tests/device_tests/a5/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_hub.cpp create mode 100644 tests/device_tests/a5/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_pv_matmul.cpp create mode 100644 tests/device_tests/a5/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_qk_matmul.cpp create mode 100644 tests/device_tests/a5/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_hub.cpp create mode 100644 tests/device_tests/a5/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_online_update.cpp create mode 100644 
tests/device_tests/a5/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp create mode 100644 tests/device_tests/a5/tensormap_and_ringbuffer/paged_attention/kernels/kernel_config.py create mode 100644 tests/device_tests/a5/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp diff --git a/CLAUDE.md b/CLAUDE.md index 330d8991..ea8e3e2e 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -35,8 +35,8 @@ PTO Runtime compiles three independent programs (Host `.so`, AICPU `.so`, AICore ### Run a single example ```bash python examples/scripts/run_example.py \ - -k examples/host_build_graph/vector_example/kernels \ - -g examples/host_build_graph/vector_example/golden.py \ + -k examples/a2a3/host_build_graph/vector_example/kernels \ + -g examples/a2a3/host_build_graph/vector_example/golden.py \ -p a2a3sim ``` diff --git a/ci.sh b/ci.sh index e4d9db03..17541d88 100755 --- a/ci.sh +++ b/ci.sh @@ -200,8 +200,10 @@ DEVICE_TESTS_DIR="tests/device_tests" declare -a HW_TASK_NAMES=() declare -a HW_TASK_DIRS=() +declare -a HW_TASK_PLATS=() declare -a SIM_TASK_NAMES=() declare -a SIM_TASK_DIRS=() +declare -a SIM_TASK_PLATS=() # Discover examples while IFS= read -r -d '' example_dir; do @@ -211,15 +213,21 @@ while IFS= read -r -d '' example_dir; do [[ -f "$kernel_config" && -f "$golden" ]] || continue example_name="${example_dir#$EXAMPLES_DIR/}" - example_runtime="${example_name%%/*}" # Extract runtime from path + example_arch="${example_name%%/*}" # Extract arch (a2a3/a5) from path + example_rest="${example_name#*/}" + example_runtime="${example_rest%%/*}" # Extract runtime from path # Filter by runtime if specified - if [[ -n "$RUNTIME" && "$example_name" != "$RUNTIME"/* ]]; then + if [[ -n "$RUNTIME" && "$example_runtime" != "$RUNTIME" ]]; then continue fi - # Filter by platform's supported runtimes + # Filter by platform's arch and supported runtimes if [[ -n "$PLATFORM" ]]; then + platform_base="${PLATFORM%sim}" + if [[ 
"$example_arch" != "$platform_base" ]]; then + continue # Skip examples not matching platform arch + fi platform_runtimes="$(get_platform_runtimes "$PLATFORM")" if [[ ! " $platform_runtimes " =~ " $example_runtime " ]]; then continue # Skip unsupported runtime for this platform @@ -230,18 +238,23 @@ while IFS= read -r -d '' example_dir; do if [[ "$PLATFORM" =~ sim$ ]]; then SIM_TASK_NAMES+=("example:${example_name}") SIM_TASK_DIRS+=("${example_dir}") + SIM_TASK_PLATS+=("${PLATFORM}") else HW_TASK_NAMES+=("example:${example_name}") HW_TASK_DIRS+=("${example_dir}") + HW_TASK_PLATS+=("${PLATFORM}") fi elif [[ "$OS" == "Darwin" ]]; then SIM_TASK_NAMES+=("example:${example_name}") SIM_TASK_DIRS+=("${example_dir}") + SIM_TASK_PLATS+=("${example_arch}sim") else HW_TASK_NAMES+=("example:${example_name}") HW_TASK_DIRS+=("${example_dir}") + HW_TASK_PLATS+=("${example_arch}") SIM_TASK_NAMES+=("example:${example_name}") SIM_TASK_DIRS+=("${example_dir}") + SIM_TASK_PLATS+=("${example_arch}sim") fi done < <(find "$EXAMPLES_DIR" -mindepth 1 -type d -print0 | sort -z) @@ -257,15 +270,21 @@ if [[ -d "$DEVICE_TESTS_DIR" ]]; then golden="${test_dir}/golden.py" [[ -f "$kernel_config" && -f "$golden" ]] || continue test_name="${test_dir#$DEVICE_TESTS_DIR/}" - test_runtime="${test_name%%/*}" # Extract runtime from path + test_arch="${test_name%%/*}" # Extract arch (a2a3/a5) from path + test_rest="${test_name#*/}" + test_runtime="${test_rest%%/*}" # Extract runtime from path # Filter by runtime if specified - if [[ -n "$RUNTIME" && "$test_name" != "$RUNTIME"/* ]]; then + if [[ -n "$RUNTIME" && "$test_runtime" != "$RUNTIME" ]]; then continue fi - # Filter by platform's supported runtimes + # Filter by platform's arch and supported runtimes if [[ -n "$PLATFORM" ]]; then + platform_base="${PLATFORM%sim}" + if [[ "$test_arch" != "$platform_base" ]]; then + continue # Skip tests not matching platform arch + fi platform_runtimes="$(get_platform_runtimes "$PLATFORM")" if [[ ! 
" $platform_runtimes " =~ " $test_runtime " ]]; then continue # Skip unsupported runtime for this platform @@ -274,6 +293,7 @@ if [[ -d "$DEVICE_TESTS_DIR" ]]; then HW_TASK_NAMES+=("device_test:${test_name}") HW_TASK_DIRS+=("${test_dir}") + HW_TASK_PLATS+=("${PLATFORM:-${test_arch}}") done < <(find "$DEVICE_TESTS_DIR" -mindepth 1 -type d -print0 | sort -z) else echo "Skipping device tests (hardware platforms only)" @@ -282,10 +302,6 @@ fi echo "Discovered ${#HW_TASK_NAMES[@]} hardware tasks, ${#SIM_TASK_NAMES[@]} simulation tasks" -# Determine platforms for execution -HW_PLATFORM="${PLATFORM:-a2a3}" -SIM_PLATFORM="${PLATFORM:-a2a3sim}" - MAX_RETRIES=3 # ---- Unified task runner ---- @@ -293,7 +309,7 @@ MAX_RETRIES=3 # Log naming: ${safe_name}_${platform}_attempt${attempt}.log # Result format: name|platform|PASS/FAIL|device:X|attempt:N|Xs run_task() { - local name="$1" dir="$2" platform="$3" attempt="$4" device_id="$5" + local name="$1" dir="$2" platform="$3" attempt="$4" device_id="$5" print_log_on_fail="${6:-true}" local safe_name="${name//[:\/]/_}" local task_log="${LOG_DIR}/${safe_name}_${platform}_attempt${attempt}.log" local start_time=$SECONDS @@ -319,9 +335,11 @@ run_task() { else status="FAIL" echo "[${platform}${device_id:+:dev${device_id}}] FAIL: $name (${elapsed}s)" - echo "--- LOG: $name (attempt $attempt) ---" - cat "$task_log" - echo "--- END ---" + if [[ "$print_log_on_fail" == "true" ]]; then + echo "--- LOG: $name (attempt $attempt) ---" + cat "$task_log" + echo "--- END ---" + fi fi echo "${name}|${platform}|${status}|device:${device_id:-sim}|attempt:${attempt}|${elapsed}s" \ >> "$RESULTS_FILE" @@ -348,7 +366,7 @@ run_sim_tasks() { local -a pids=() for idx in "${indices[@]}"; do ( - if run_task "${SIM_TASK_NAMES[$idx]}" "${SIM_TASK_DIRS[$idx]}" "$SIM_PLATFORM" "$attempt"; then + if run_task "${SIM_TASK_NAMES[$idx]}" "${SIM_TASK_DIRS[$idx]}" "${SIM_TASK_PLATS[$idx]}" "$attempt"; then echo "${idx}|PASS" >> "$sim_marker" else echo "${idx}|FAIL" >> 
"$sim_marker" @@ -359,7 +377,7 @@ run_sim_tasks() { for pid in "${pids[@]}"; do wait "$pid" 2>/dev/null || true; done else for idx in "${indices[@]}"; do - if run_task "${SIM_TASK_NAMES[$idx]}" "${SIM_TASK_DIRS[$idx]}" "$SIM_PLATFORM" "$attempt"; then + if run_task "${SIM_TASK_NAMES[$idx]}" "${SIM_TASK_DIRS[$idx]}" "${SIM_TASK_PLATS[$idx]}" "$attempt"; then echo "${idx}|PASS" >> "$sim_marker" else echo "${idx}|FAIL" >> "$sim_marker" @@ -406,7 +424,7 @@ run_hw_tasks() { IFS=':' read -r idx attempt <<< "$entry" - if run_task "${HW_TASK_NAMES[$idx]}" "${HW_TASK_DIRS[$idx]}" "$HW_PLATFORM" "$attempt" "$device_id"; then + if run_task "${HW_TASK_NAMES[$idx]}" "${HW_TASK_DIRS[$idx]}" "${HW_TASK_PLATS[$idx]}" "$attempt" "$device_id" "false"; then echo "${idx}|PASS" >> "$hw_marker" else next=$((attempt + 1)) @@ -414,9 +432,14 @@ run_hw_tasks() { flock "$lock" bash -c "echo '${idx}:${next}' >> \"$queue\"" else echo "${idx}|FAIL" >> "$hw_marker" + local safe_name="${HW_TASK_NAMES[$idx]//[:\/]/_}" + local last_log="${LOG_DIR}/${safe_name}_${HW_TASK_PLATS[$idx]}_attempt${attempt}.log" + echo "--- LOG: ${HW_TASK_NAMES[$idx]} (attempt $attempt) ---" + cat "$last_log" + echo "--- END ---" + echo "[${HW_TASK_PLATS[$idx]}:dev${device_id}] Device quarantined after exhausting retries" + break fi - echo "[${HW_PLATFORM}:dev${device_id}] Device quarantined after failure" - break fi done ) & @@ -606,7 +629,7 @@ for i in "${!TASK_ORDER[@]}"; do platform="${FINAL_PLATFORM[$i]}" device="${FINAL_DEVICE[$i]}" - attempt="${FINAL_ATTEMPT[$i]}" + attempt=$(( FINAL_ATTEMPT[$i] + 1 )) timing="${FINAL_TIMING[$i]}" if [[ "$result" == "FAIL" ]]; then diff --git a/examples/aicpu_build_graph/bgemm/README.md b/examples/a2a3/aicpu_build_graph/bgemm/README.md similarity index 100% rename from examples/aicpu_build_graph/bgemm/README.md rename to examples/a2a3/aicpu_build_graph/bgemm/README.md diff --git a/examples/aicpu_build_graph/bgemm/golden.py b/examples/a2a3/aicpu_build_graph/bgemm/golden.py 
similarity index 100% rename from examples/aicpu_build_graph/bgemm/golden.py rename to examples/a2a3/aicpu_build_graph/bgemm/golden.py diff --git a/examples/aicpu_build_graph/bgemm/kernels/aic/kernel_gemm_tile.cpp b/examples/a2a3/aicpu_build_graph/bgemm/kernels/aic/kernel_gemm_tile.cpp similarity index 100% rename from examples/aicpu_build_graph/bgemm/kernels/aic/kernel_gemm_tile.cpp rename to examples/a2a3/aicpu_build_graph/bgemm/kernels/aic/kernel_gemm_tile.cpp diff --git a/examples/aicpu_build_graph/bgemm/kernels/aiv/kernel_tile_add.cpp b/examples/a2a3/aicpu_build_graph/bgemm/kernels/aiv/kernel_tile_add.cpp similarity index 100% rename from examples/aicpu_build_graph/bgemm/kernels/aiv/kernel_tile_add.cpp rename to examples/a2a3/aicpu_build_graph/bgemm/kernels/aiv/kernel_tile_add.cpp diff --git a/examples/aicpu_build_graph/bgemm/kernels/kernel_config.py b/examples/a2a3/aicpu_build_graph/bgemm/kernels/kernel_config.py similarity index 100% rename from examples/aicpu_build_graph/bgemm/kernels/kernel_config.py rename to examples/a2a3/aicpu_build_graph/bgemm/kernels/kernel_config.py diff --git a/examples/aicpu_build_graph/bgemm/kernels/orchestration/bgemm_orch.cpp b/examples/a2a3/aicpu_build_graph/bgemm/kernels/orchestration/bgemm_orch.cpp similarity index 100% rename from examples/aicpu_build_graph/bgemm/kernels/orchestration/bgemm_orch.cpp rename to examples/a2a3/aicpu_build_graph/bgemm/kernels/orchestration/bgemm_orch.cpp diff --git a/examples/aicpu_build_graph/docs/INCORE_ORCHESTRATION_GUIDE.md b/examples/a2a3/aicpu_build_graph/docs/INCORE_ORCHESTRATION_GUIDE.md similarity index 100% rename from examples/aicpu_build_graph/docs/INCORE_ORCHESTRATION_GUIDE.md rename to examples/a2a3/aicpu_build_graph/docs/INCORE_ORCHESTRATION_GUIDE.md diff --git a/examples/aicpu_build_graph/vector_example/README.md b/examples/a2a3/aicpu_build_graph/vector_example/README.md similarity index 100% rename from examples/aicpu_build_graph/vector_example/README.md rename to 
examples/a2a3/aicpu_build_graph/vector_example/README.md diff --git a/examples/aicpu_build_graph/vector_example/golden.py b/examples/a2a3/aicpu_build_graph/vector_example/golden.py similarity index 100% rename from examples/aicpu_build_graph/vector_example/golden.py rename to examples/a2a3/aicpu_build_graph/vector_example/golden.py diff --git a/examples/aicpu_build_graph/vector_example/kernels/aiv/kernel_add.cpp b/examples/a2a3/aicpu_build_graph/vector_example/kernels/aiv/kernel_add.cpp similarity index 100% rename from examples/aicpu_build_graph/vector_example/kernels/aiv/kernel_add.cpp rename to examples/a2a3/aicpu_build_graph/vector_example/kernels/aiv/kernel_add.cpp diff --git a/examples/aicpu_build_graph/vector_example/kernels/aiv/kernel_add_scalar.cpp b/examples/a2a3/aicpu_build_graph/vector_example/kernels/aiv/kernel_add_scalar.cpp similarity index 100% rename from examples/aicpu_build_graph/vector_example/kernels/aiv/kernel_add_scalar.cpp rename to examples/a2a3/aicpu_build_graph/vector_example/kernels/aiv/kernel_add_scalar.cpp diff --git a/examples/aicpu_build_graph/vector_example/kernels/aiv/kernel_mul.cpp b/examples/a2a3/aicpu_build_graph/vector_example/kernels/aiv/kernel_mul.cpp similarity index 100% rename from examples/aicpu_build_graph/vector_example/kernels/aiv/kernel_mul.cpp rename to examples/a2a3/aicpu_build_graph/vector_example/kernels/aiv/kernel_mul.cpp diff --git a/examples/aicpu_build_graph/vector_example/kernels/kernel_config.py b/examples/a2a3/aicpu_build_graph/vector_example/kernels/kernel_config.py similarity index 100% rename from examples/aicpu_build_graph/vector_example/kernels/kernel_config.py rename to examples/a2a3/aicpu_build_graph/vector_example/kernels/kernel_config.py diff --git a/examples/aicpu_build_graph/vector_example/kernels/orchestration/orchestration.cpp b/examples/a2a3/aicpu_build_graph/vector_example/kernels/orchestration/orchestration.cpp similarity index 100% rename from 
examples/aicpu_build_graph/vector_example/kernels/orchestration/orchestration.cpp rename to examples/a2a3/aicpu_build_graph/vector_example/kernels/orchestration/orchestration.cpp diff --git a/examples/host_build_graph/bgemm/README.md b/examples/a2a3/host_build_graph/bgemm/README.md similarity index 100% rename from examples/host_build_graph/bgemm/README.md rename to examples/a2a3/host_build_graph/bgemm/README.md diff --git a/examples/host_build_graph/bgemm/golden.py b/examples/a2a3/host_build_graph/bgemm/golden.py similarity index 100% rename from examples/host_build_graph/bgemm/golden.py rename to examples/a2a3/host_build_graph/bgemm/golden.py diff --git a/examples/host_build_graph/bgemm/kernels/aic/kernel_gemm_tile.cpp b/examples/a2a3/host_build_graph/bgemm/kernels/aic/kernel_gemm_tile.cpp similarity index 100% rename from examples/host_build_graph/bgemm/kernels/aic/kernel_gemm_tile.cpp rename to examples/a2a3/host_build_graph/bgemm/kernels/aic/kernel_gemm_tile.cpp diff --git a/examples/host_build_graph/bgemm/kernels/aiv/kernel_tile_add.cpp b/examples/a2a3/host_build_graph/bgemm/kernels/aiv/kernel_tile_add.cpp similarity index 100% rename from examples/host_build_graph/bgemm/kernels/aiv/kernel_tile_add.cpp rename to examples/a2a3/host_build_graph/bgemm/kernels/aiv/kernel_tile_add.cpp diff --git a/examples/host_build_graph/bgemm/kernels/kernel_config.py b/examples/a2a3/host_build_graph/bgemm/kernels/kernel_config.py similarity index 100% rename from examples/host_build_graph/bgemm/kernels/kernel_config.py rename to examples/a2a3/host_build_graph/bgemm/kernels/kernel_config.py diff --git a/examples/host_build_graph/bgemm/kernels/orchestration/bgemm_orch.cpp b/examples/a2a3/host_build_graph/bgemm/kernels/orchestration/bgemm_orch.cpp similarity index 100% rename from examples/host_build_graph/bgemm/kernels/orchestration/bgemm_orch.cpp rename to examples/a2a3/host_build_graph/bgemm/kernels/orchestration/bgemm_orch.cpp diff --git 
a/examples/host_build_graph/docs/INCORE_ORCHESTRATION_GUIDE.md b/examples/a2a3/host_build_graph/docs/INCORE_ORCHESTRATION_GUIDE.md similarity index 100% rename from examples/host_build_graph/docs/INCORE_ORCHESTRATION_GUIDE.md rename to examples/a2a3/host_build_graph/docs/INCORE_ORCHESTRATION_GUIDE.md diff --git a/examples/host_build_graph/matmul/golden.py b/examples/a2a3/host_build_graph/matmul/golden.py similarity index 100% rename from examples/host_build_graph/matmul/golden.py rename to examples/a2a3/host_build_graph/matmul/golden.py diff --git a/examples/host_build_graph/matmul/kernels/aic/kernel_matmul.cpp b/examples/a2a3/host_build_graph/matmul/kernels/aic/kernel_matmul.cpp similarity index 100% rename from examples/host_build_graph/matmul/kernels/aic/kernel_matmul.cpp rename to examples/a2a3/host_build_graph/matmul/kernels/aic/kernel_matmul.cpp diff --git a/examples/host_build_graph/matmul/kernels/aiv/kernel_add_exp.cpp b/examples/a2a3/host_build_graph/matmul/kernels/aiv/kernel_add_exp.cpp similarity index 100% rename from examples/host_build_graph/matmul/kernels/aiv/kernel_add_exp.cpp rename to examples/a2a3/host_build_graph/matmul/kernels/aiv/kernel_add_exp.cpp diff --git a/examples/host_build_graph/matmul/kernels/aiv/kernel_log_sqrt.cpp b/examples/a2a3/host_build_graph/matmul/kernels/aiv/kernel_log_sqrt.cpp similarity index 100% rename from examples/host_build_graph/matmul/kernels/aiv/kernel_log_sqrt.cpp rename to examples/a2a3/host_build_graph/matmul/kernels/aiv/kernel_log_sqrt.cpp diff --git a/examples/host_build_graph/matmul/kernels/kernel_config.py b/examples/a2a3/host_build_graph/matmul/kernels/kernel_config.py similarity index 100% rename from examples/host_build_graph/matmul/kernels/kernel_config.py rename to examples/a2a3/host_build_graph/matmul/kernels/kernel_config.py diff --git a/examples/host_build_graph/matmul/kernels/orchestration/matmul_orch.cpp b/examples/a2a3/host_build_graph/matmul/kernels/orchestration/matmul_orch.cpp similarity index 
100% rename from examples/host_build_graph/matmul/kernels/orchestration/matmul_orch.cpp rename to examples/a2a3/host_build_graph/matmul/kernels/orchestration/matmul_orch.cpp diff --git a/examples/host_build_graph/paged_attention/golden.py b/examples/a2a3/host_build_graph/paged_attention/golden.py similarity index 100% rename from examples/host_build_graph/paged_attention/golden.py rename to examples/a2a3/host_build_graph/paged_attention/golden.py diff --git a/examples/host_build_graph/paged_attention/kernels/aic/aic_pv_matmul.cpp b/examples/a2a3/host_build_graph/paged_attention/kernels/aic/aic_pv_matmul.cpp similarity index 100% rename from examples/host_build_graph/paged_attention/kernels/aic/aic_pv_matmul.cpp rename to examples/a2a3/host_build_graph/paged_attention/kernels/aic/aic_pv_matmul.cpp diff --git a/examples/host_build_graph/paged_attention/kernels/aic/aic_qk_matmul.cpp b/examples/a2a3/host_build_graph/paged_attention/kernels/aic/aic_qk_matmul.cpp similarity index 100% rename from examples/host_build_graph/paged_attention/kernels/aic/aic_qk_matmul.cpp rename to examples/a2a3/host_build_graph/paged_attention/kernels/aic/aic_qk_matmul.cpp diff --git a/examples/host_build_graph/paged_attention/kernels/aiv/aiv_online_update.cpp b/examples/a2a3/host_build_graph/paged_attention/kernels/aiv/aiv_online_update.cpp similarity index 100% rename from examples/host_build_graph/paged_attention/kernels/aiv/aiv_online_update.cpp rename to examples/a2a3/host_build_graph/paged_attention/kernels/aiv/aiv_online_update.cpp diff --git a/examples/host_build_graph/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp b/examples/a2a3/host_build_graph/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp similarity index 100% rename from examples/host_build_graph/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp rename to examples/a2a3/host_build_graph/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp diff --git a/examples/host_build_graph/paged_attention/kernels/kernel_config.py 
b/examples/a2a3/host_build_graph/paged_attention/kernels/kernel_config.py similarity index 100% rename from examples/host_build_graph/paged_attention/kernels/kernel_config.py rename to examples/a2a3/host_build_graph/paged_attention/kernels/kernel_config.py diff --git a/examples/host_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp b/examples/a2a3/host_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp similarity index 100% rename from examples/host_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp rename to examples/a2a3/host_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp diff --git a/examples/host_build_graph/vector_example/README.md b/examples/a2a3/host_build_graph/vector_example/README.md similarity index 100% rename from examples/host_build_graph/vector_example/README.md rename to examples/a2a3/host_build_graph/vector_example/README.md diff --git a/examples/host_build_graph/vector_example/golden.py b/examples/a2a3/host_build_graph/vector_example/golden.py similarity index 100% rename from examples/host_build_graph/vector_example/golden.py rename to examples/a2a3/host_build_graph/vector_example/golden.py diff --git a/examples/host_build_graph/vector_example/kernels/aiv/kernel_add.cpp b/examples/a2a3/host_build_graph/vector_example/kernels/aiv/kernel_add.cpp similarity index 100% rename from examples/host_build_graph/vector_example/kernels/aiv/kernel_add.cpp rename to examples/a2a3/host_build_graph/vector_example/kernels/aiv/kernel_add.cpp diff --git a/examples/host_build_graph/vector_example/kernels/aiv/kernel_add_scalar.cpp b/examples/a2a3/host_build_graph/vector_example/kernels/aiv/kernel_add_scalar.cpp similarity index 100% rename from examples/host_build_graph/vector_example/kernels/aiv/kernel_add_scalar.cpp rename to examples/a2a3/host_build_graph/vector_example/kernels/aiv/kernel_add_scalar.cpp diff --git 
a/examples/host_build_graph/vector_example/kernels/aiv/kernel_mul.cpp b/examples/a2a3/host_build_graph/vector_example/kernels/aiv/kernel_mul.cpp similarity index 100% rename from examples/host_build_graph/vector_example/kernels/aiv/kernel_mul.cpp rename to examples/a2a3/host_build_graph/vector_example/kernels/aiv/kernel_mul.cpp diff --git a/examples/host_build_graph/vector_example/kernels/kernel_config.py b/examples/a2a3/host_build_graph/vector_example/kernels/kernel_config.py similarity index 100% rename from examples/host_build_graph/vector_example/kernels/kernel_config.py rename to examples/a2a3/host_build_graph/vector_example/kernels/kernel_config.py diff --git a/examples/host_build_graph/vector_example/kernels/orchestration/example_orch.cpp b/examples/a2a3/host_build_graph/vector_example/kernels/orchestration/example_orch.cpp similarity index 100% rename from examples/host_build_graph/vector_example/kernels/orchestration/example_orch.cpp rename to examples/a2a3/host_build_graph/vector_example/kernels/orchestration/example_orch.cpp diff --git a/examples/tensormap_and_ringbuffer/batch_paged_attention/golden.py b/examples/a2a3/tensormap_and_ringbuffer/batch_paged_attention/golden.py similarity index 100% rename from examples/tensormap_and_ringbuffer/batch_paged_attention/golden.py rename to examples/a2a3/tensormap_and_ringbuffer/batch_paged_attention/golden.py diff --git a/examples/tensormap_and_ringbuffer/batch_paged_attention/kernels/aic/aic_hub.cpp b/examples/a2a3/tensormap_and_ringbuffer/batch_paged_attention/kernels/aic/aic_hub.cpp similarity index 100% rename from examples/tensormap_and_ringbuffer/batch_paged_attention/kernels/aic/aic_hub.cpp rename to examples/a2a3/tensormap_and_ringbuffer/batch_paged_attention/kernels/aic/aic_hub.cpp diff --git a/examples/tensormap_and_ringbuffer/batch_paged_attention/kernels/aic/aic_pv_matmul.cpp b/examples/a2a3/tensormap_and_ringbuffer/batch_paged_attention/kernels/aic/aic_pv_matmul.cpp similarity index 100% rename from 
examples/tensormap_and_ringbuffer/batch_paged_attention/kernels/aic/aic_pv_matmul.cpp rename to examples/a2a3/tensormap_and_ringbuffer/batch_paged_attention/kernels/aic/aic_pv_matmul.cpp diff --git a/examples/tensormap_and_ringbuffer/batch_paged_attention/kernels/aic/aic_qk_matmul.cpp b/examples/a2a3/tensormap_and_ringbuffer/batch_paged_attention/kernels/aic/aic_qk_matmul.cpp similarity index 100% rename from examples/tensormap_and_ringbuffer/batch_paged_attention/kernels/aic/aic_qk_matmul.cpp rename to examples/a2a3/tensormap_and_ringbuffer/batch_paged_attention/kernels/aic/aic_qk_matmul.cpp diff --git a/examples/tensormap_and_ringbuffer/batch_paged_attention/kernels/aiv/aiv_hub.cpp b/examples/a2a3/tensormap_and_ringbuffer/batch_paged_attention/kernels/aiv/aiv_hub.cpp similarity index 100% rename from examples/tensormap_and_ringbuffer/batch_paged_attention/kernels/aiv/aiv_hub.cpp rename to examples/a2a3/tensormap_and_ringbuffer/batch_paged_attention/kernels/aiv/aiv_hub.cpp diff --git a/examples/tensormap_and_ringbuffer/batch_paged_attention/kernels/aiv/aiv_online_update.cpp b/examples/a2a3/tensormap_and_ringbuffer/batch_paged_attention/kernels/aiv/aiv_online_update.cpp similarity index 100% rename from examples/tensormap_and_ringbuffer/batch_paged_attention/kernels/aiv/aiv_online_update.cpp rename to examples/a2a3/tensormap_and_ringbuffer/batch_paged_attention/kernels/aiv/aiv_online_update.cpp diff --git a/examples/tensormap_and_ringbuffer/batch_paged_attention/kernels/aiv/aiv_softmax_prepare.cpp b/examples/a2a3/tensormap_and_ringbuffer/batch_paged_attention/kernels/aiv/aiv_softmax_prepare.cpp similarity index 100% rename from examples/tensormap_and_ringbuffer/batch_paged_attention/kernels/aiv/aiv_softmax_prepare.cpp rename to examples/a2a3/tensormap_and_ringbuffer/batch_paged_attention/kernels/aiv/aiv_softmax_prepare.cpp diff --git a/examples/tensormap_and_ringbuffer/batch_paged_attention/kernels/kernel_config.py 
b/examples/a2a3/tensormap_and_ringbuffer/batch_paged_attention/kernels/kernel_config.py similarity index 100% rename from examples/tensormap_and_ringbuffer/batch_paged_attention/kernels/kernel_config.py rename to examples/a2a3/tensormap_and_ringbuffer/batch_paged_attention/kernels/kernel_config.py diff --git a/examples/tensormap_and_ringbuffer/batch_paged_attention/kernels/orchestration/paged_attention_orch.cpp b/examples/a2a3/tensormap_and_ringbuffer/batch_paged_attention/kernels/orchestration/paged_attention_orch.cpp similarity index 100% rename from examples/tensormap_and_ringbuffer/batch_paged_attention/kernels/orchestration/paged_attention_orch.cpp rename to examples/a2a3/tensormap_and_ringbuffer/batch_paged_attention/kernels/orchestration/paged_attention_orch.cpp diff --git a/examples/tensormap_and_ringbuffer/bgemm/golden.py b/examples/a2a3/tensormap_and_ringbuffer/bgemm/golden.py similarity index 100% rename from examples/tensormap_and_ringbuffer/bgemm/golden.py rename to examples/a2a3/tensormap_and_ringbuffer/bgemm/golden.py diff --git a/examples/tensormap_and_ringbuffer/bgemm/kernels/aic/kernel_gemm_tile.cpp b/examples/a2a3/tensormap_and_ringbuffer/bgemm/kernels/aic/kernel_gemm_tile.cpp similarity index 100% rename from examples/tensormap_and_ringbuffer/bgemm/kernels/aic/kernel_gemm_tile.cpp rename to examples/a2a3/tensormap_and_ringbuffer/bgemm/kernels/aic/kernel_gemm_tile.cpp diff --git a/examples/tensormap_and_ringbuffer/bgemm/kernels/aiv/kernel_tile_add.cpp b/examples/a2a3/tensormap_and_ringbuffer/bgemm/kernels/aiv/kernel_tile_add.cpp similarity index 100% rename from examples/tensormap_and_ringbuffer/bgemm/kernels/aiv/kernel_tile_add.cpp rename to examples/a2a3/tensormap_and_ringbuffer/bgemm/kernels/aiv/kernel_tile_add.cpp diff --git a/examples/tensormap_and_ringbuffer/bgemm/kernels/kernel_config.py b/examples/a2a3/tensormap_and_ringbuffer/bgemm/kernels/kernel_config.py similarity index 100% rename from 
examples/tensormap_and_ringbuffer/bgemm/kernels/kernel_config.py rename to examples/a2a3/tensormap_and_ringbuffer/bgemm/kernels/kernel_config.py diff --git a/examples/tensormap_and_ringbuffer/bgemm/kernels/orchestration/bgemm_orch.cpp b/examples/a2a3/tensormap_and_ringbuffer/bgemm/kernels/orchestration/bgemm_orch.cpp similarity index 100% rename from examples/tensormap_and_ringbuffer/bgemm/kernels/orchestration/bgemm_orch.cpp rename to examples/a2a3/tensormap_and_ringbuffer/bgemm/kernels/orchestration/bgemm_orch.cpp diff --git a/examples/tensormap_and_ringbuffer/docs/INCORE_ORCHESTRATION_GUIDE.md b/examples/a2a3/tensormap_and_ringbuffer/docs/INCORE_ORCHESTRATION_GUIDE.md similarity index 100% rename from examples/tensormap_and_ringbuffer/docs/INCORE_ORCHESTRATION_GUIDE.md rename to examples/a2a3/tensormap_and_ringbuffer/docs/INCORE_ORCHESTRATION_GUIDE.md diff --git a/examples/tensormap_and_ringbuffer/mixed_example/golden.py b/examples/a2a3/tensormap_and_ringbuffer/mixed_example/golden.py similarity index 100% rename from examples/tensormap_and_ringbuffer/mixed_example/golden.py rename to examples/a2a3/tensormap_and_ringbuffer/mixed_example/golden.py diff --git a/examples/tensormap_and_ringbuffer/mixed_example/kernels/aic/kernel_matmul.cpp b/examples/a2a3/tensormap_and_ringbuffer/mixed_example/kernels/aic/kernel_matmul.cpp similarity index 100% rename from examples/tensormap_and_ringbuffer/mixed_example/kernels/aic/kernel_matmul.cpp rename to examples/a2a3/tensormap_and_ringbuffer/mixed_example/kernels/aic/kernel_matmul.cpp diff --git a/examples/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_add.cpp b/examples/a2a3/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_add.cpp similarity index 100% rename from examples/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_add.cpp rename to examples/a2a3/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_add.cpp diff --git 
a/examples/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_add_standalone.cpp b/examples/a2a3/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_add_standalone.cpp similarity index 100% rename from examples/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_add_standalone.cpp rename to examples/a2a3/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_add_standalone.cpp diff --git a/examples/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_mul.cpp b/examples/a2a3/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_mul.cpp similarity index 100% rename from examples/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_mul.cpp rename to examples/a2a3/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_mul.cpp diff --git a/examples/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_mul_standalone.cpp b/examples/a2a3/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_mul_standalone.cpp similarity index 100% rename from examples/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_mul_standalone.cpp rename to examples/a2a3/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_mul_standalone.cpp diff --git a/examples/tensormap_and_ringbuffer/mixed_example/kernels/kernel_config.py b/examples/a2a3/tensormap_and_ringbuffer/mixed_example/kernels/kernel_config.py similarity index 100% rename from examples/tensormap_and_ringbuffer/mixed_example/kernels/kernel_config.py rename to examples/a2a3/tensormap_and_ringbuffer/mixed_example/kernels/kernel_config.py diff --git a/examples/tensormap_and_ringbuffer/mixed_example/kernels/orchestration/mixed_orch.cpp b/examples/a2a3/tensormap_and_ringbuffer/mixed_example/kernels/orchestration/mixed_orch.cpp similarity index 100% rename from examples/tensormap_and_ringbuffer/mixed_example/kernels/orchestration/mixed_orch.cpp rename to examples/a2a3/tensormap_and_ringbuffer/mixed_example/kernels/orchestration/mixed_orch.cpp diff --git 
a/examples/tensormap_and_ringbuffer/multi-round-paged-attention/golden.py b/examples/a2a3/tensormap_and_ringbuffer/multi-round-paged-attention/golden.py similarity index 100% rename from examples/tensormap_and_ringbuffer/multi-round-paged-attention/golden.py rename to examples/a2a3/tensormap_and_ringbuffer/multi-round-paged-attention/golden.py diff --git a/examples/tensormap_and_ringbuffer/multi-round-paged-attention/kernels/kernel_config.py b/examples/a2a3/tensormap_and_ringbuffer/multi-round-paged-attention/kernels/kernel_config.py similarity index 100% rename from examples/tensormap_and_ringbuffer/multi-round-paged-attention/kernels/kernel_config.py rename to examples/a2a3/tensormap_and_ringbuffer/multi-round-paged-attention/kernels/kernel_config.py diff --git a/examples/tensormap_and_ringbuffer/paged_attention/TFILLPAD_INPLACE_BUG.md b/examples/a2a3/tensormap_and_ringbuffer/paged_attention/TFILLPAD_INPLACE_BUG.md similarity index 100% rename from examples/tensormap_and_ringbuffer/paged_attention/TFILLPAD_INPLACE_BUG.md rename to examples/a2a3/tensormap_and_ringbuffer/paged_attention/TFILLPAD_INPLACE_BUG.md diff --git a/examples/tensormap_and_ringbuffer/paged_attention/golden.py b/examples/a2a3/tensormap_and_ringbuffer/paged_attention/golden.py similarity index 100% rename from examples/tensormap_and_ringbuffer/paged_attention/golden.py rename to examples/a2a3/tensormap_and_ringbuffer/paged_attention/golden.py diff --git a/examples/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_hub.cpp b/examples/a2a3/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_hub.cpp similarity index 100% rename from examples/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_hub.cpp rename to examples/a2a3/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_hub.cpp diff --git a/examples/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_pv_matmul.cpp b/examples/a2a3/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_pv_matmul.cpp similarity index 
100% rename from examples/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_pv_matmul.cpp rename to examples/a2a3/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_pv_matmul.cpp diff --git a/examples/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_qk_matmul.cpp b/examples/a2a3/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_qk_matmul.cpp similarity index 100% rename from examples/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_qk_matmul.cpp rename to examples/a2a3/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_qk_matmul.cpp diff --git a/examples/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_hub.cpp b/examples/a2a3/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_hub.cpp similarity index 100% rename from examples/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_hub.cpp rename to examples/a2a3/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_hub.cpp diff --git a/examples/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_online_update.cpp b/examples/a2a3/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_online_update.cpp similarity index 100% rename from examples/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_online_update.cpp rename to examples/a2a3/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_online_update.cpp diff --git a/examples/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp b/examples/a2a3/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp similarity index 100% rename from examples/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp rename to examples/a2a3/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp diff --git a/examples/tensormap_and_ringbuffer/paged_attention/kernels/kernel_config.py b/examples/a2a3/tensormap_and_ringbuffer/paged_attention/kernels/kernel_config.py similarity index 100% rename from 
examples/tensormap_and_ringbuffer/paged_attention/kernels/kernel_config.py rename to examples/a2a3/tensormap_and_ringbuffer/paged_attention/kernels/kernel_config.py diff --git a/examples/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp b/examples/a2a3/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp similarity index 100% rename from examples/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp rename to examples/a2a3/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp diff --git a/examples/tensormap_and_ringbuffer/vector_example/golden.py b/examples/a2a3/tensormap_and_ringbuffer/vector_example/golden.py similarity index 100% rename from examples/tensormap_and_ringbuffer/vector_example/golden.py rename to examples/a2a3/tensormap_and_ringbuffer/vector_example/golden.py diff --git a/examples/tensormap_and_ringbuffer/vector_example/kernels/aiv/kernel_add.cpp b/examples/a2a3/tensormap_and_ringbuffer/vector_example/kernels/aiv/kernel_add.cpp similarity index 100% rename from examples/tensormap_and_ringbuffer/vector_example/kernels/aiv/kernel_add.cpp rename to examples/a2a3/tensormap_and_ringbuffer/vector_example/kernels/aiv/kernel_add.cpp diff --git a/examples/tensormap_and_ringbuffer/vector_example/kernels/aiv/kernel_add_scalar.cpp b/examples/a2a3/tensormap_and_ringbuffer/vector_example/kernels/aiv/kernel_add_scalar.cpp similarity index 100% rename from examples/tensormap_and_ringbuffer/vector_example/kernels/aiv/kernel_add_scalar.cpp rename to examples/a2a3/tensormap_and_ringbuffer/vector_example/kernels/aiv/kernel_add_scalar.cpp diff --git a/examples/tensormap_and_ringbuffer/vector_example/kernels/aiv/kernel_mul.cpp b/examples/a2a3/tensormap_and_ringbuffer/vector_example/kernels/aiv/kernel_mul.cpp similarity index 100% rename from examples/tensormap_and_ringbuffer/vector_example/kernels/aiv/kernel_mul.cpp rename to 
examples/a2a3/tensormap_and_ringbuffer/vector_example/kernels/aiv/kernel_mul.cpp diff --git a/examples/tensormap_and_ringbuffer/vector_example/kernels/kernel_config.py b/examples/a2a3/tensormap_and_ringbuffer/vector_example/kernels/kernel_config.py similarity index 100% rename from examples/tensormap_and_ringbuffer/vector_example/kernels/kernel_config.py rename to examples/a2a3/tensormap_and_ringbuffer/vector_example/kernels/kernel_config.py diff --git a/examples/tensormap_and_ringbuffer/vector_example/kernels/orchestration/example_orchestration.cpp b/examples/a2a3/tensormap_and_ringbuffer/vector_example/kernels/orchestration/example_orchestration.cpp similarity index 100% rename from examples/tensormap_and_ringbuffer/vector_example/kernels/orchestration/example_orchestration.cpp rename to examples/a2a3/tensormap_and_ringbuffer/vector_example/kernels/orchestration/example_orchestration.cpp diff --git a/examples/a5/host_build_graph/paged_attention/golden.py b/examples/a5/host_build_graph/paged_attention/golden.py new file mode 100644 index 00000000..17fafcdb --- /dev/null +++ b/examples/a5/host_build_graph/paged_attention/golden.py @@ -0,0 +1,45 @@ +"""Paged Attention Golden - host_build_graph example (small scale, float16).""" + +from paged_attention_golden import ( + generate_inputs as _generate_inputs, + compute_golden, + run_golden_test, +) + +__outputs__ = ["out"] + +RTOL = 1e-2 +ATOL = 1e-2 + +ALL_CASES = { + "Case1": { + "batch": 1, + "num_heads": 16, + "kv_head_num": 1, + "head_dim": 16, + "block_size": 16, + "context_len": 16, + "max_model_len": 256, + "dtype": "float16", + }, + "Case2": { + "batch": 1, + "num_heads": 16, + "kv_head_num": 1, + "head_dim": 16, + "block_size": 16, + "context_len": 64, + "max_model_len": 256, + "dtype": "float16", + }, +} + +DEFAULT_CASE = "Case1" + + +def generate_inputs(params: dict) -> list: + return _generate_inputs(params, return_all_sizes=True) + + +if __name__ == "__main__": + run_golden_test(ALL_CASES, DEFAULT_CASE, 
generate_inputs) diff --git a/examples/a5/host_build_graph/paged_attention/kernels/aic/aic_pv_matmul.cpp b/examples/a5/host_build_graph/paged_attention/kernels/aic/aic_pv_matmul.cpp new file mode 100644 index 00000000..a59b1243 --- /dev/null +++ b/examples/a5/host_build_graph/paged_attention/kernels/aic/aic_pv_matmul.cpp @@ -0,0 +1,90 @@ +// PV Matmul Kernel: pij(M, K) @ vj(K, N) -> oi_new(M, N) +// +// Fixed tile size: (16, 16) @ (16, 16) -> (16, 16) +// +// pij is float16 (converted from fp32 in softmax_prepare via TCVT). +// vj is stored as (K, N) = (block_size, head_dim) in row-major (ND) layout. +// Standard non-transposed B pattern: ND GlobalB + ColMajor/RowMajor TileMatB. + +#include +#include + +using namespace pto; + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#define __aicore__ [aicore] +#endif + +static __aicore__ void pv_matmul_impl(__gm__ uint8_t* pij_raw, __gm__ uint8_t* vj_raw, __gm__ uint8_t* oi_raw) +{ + constexpr int M = 16, K = 16, N = 16; + + __gm__ half* pij = reinterpret_cast<__gm__ half*>(pij_raw); + __gm__ half* vj = reinterpret_cast<__gm__ half*>(vj_raw); + __gm__ float* oi = reinterpret_cast<__gm__ float*>(oi_raw); + + // pij (M, K) fp16, vj (K, N) fp16 in ND (row-major), oi_new (M, N) fp32 + using GlobalA = GlobalTensor, pto::Stride>; + using GlobalB = GlobalTensor, pto::Stride>; + using GlobalOut = GlobalTensor, pto::Stride>; + + GlobalA pijGlobal(pij); + GlobalB vjGlobal(vj); + GlobalOut oiGlobal(oi); + + // L1 Mat tiles: standard ND pattern for both A and B + using TileMatA = Tile; + using TileMatB = Tile; + + // L0 tiles + using LeftTile = TileLeft; + using RightTile = TileRight; + using AccTile = TileAcc; + + TileMatA aMatTile; + TileMatB bMatTile; + TASSIGN(aMatTile, 0x0); + TASSIGN(bMatTile, 0x20000); + + LeftTile aTile; + RightTile bTile; + AccTile cTile; + TASSIGN(aTile, 0x0); + TASSIGN(bTile, 0x0); + TASSIGN(cTile, 0x0); + + // Load pij and vj to L1 + TLOAD(aMatTile, pijGlobal); + TLOAD(bMatTile, vjGlobal); + 
+ set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); + + // Move to L0A/L0B + TMOV(aTile, aMatTile); + TMOV(bTile, bMatTile); + + set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); + wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); + + // Single matmul: (M,K) x (K,N) -> (M,N) + TMATMUL(cTile, aTile, bTile); + + set_flag(PIPE_M, PIPE_FIX, EVENT_ID0); + wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0); + + TSTORE(oiGlobal, cTile); +} + +extern "C" __aicore__ void kernel_entry(__gm__ int64_t* args) +{ + __gm__ uint8_t* pij = reinterpret_cast<__gm__ uint8_t*>(args[0]); + __gm__ uint8_t* vj = reinterpret_cast<__gm__ uint8_t*>(args[1]); + __gm__ uint8_t* oi_new = reinterpret_cast<__gm__ uint8_t*>(args[2]); + + pv_matmul_impl(pij, vj, oi_new); +} diff --git a/examples/a5/host_build_graph/paged_attention/kernels/aic/aic_qk_matmul.cpp b/examples/a5/host_build_graph/paged_attention/kernels/aic/aic_qk_matmul.cpp new file mode 100644 index 00000000..a173def0 --- /dev/null +++ b/examples/a5/host_build_graph/paged_attention/kernels/aic/aic_qk_matmul.cpp @@ -0,0 +1,91 @@ +// QK Matmul Kernel: qi(M, K) @ kj.T(K, N) -> sij(M, N) +// +// Fixed tile size: (16, 16) @ (16, 16).T -> (16, 16) +// +// kj is stored as (N, K) = (block_size, head_dim) in row-major memory. +// This is equivalent to (K, N) in column-major (DN) layout. +// Using DN GlobalB + RowMajor/ColMajor TileMatB to handle the transposed B pattern. 
+ +#include +#include + +using namespace pto; + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#define __aicore__ [aicore] +#endif + +static __aicore__ void qk_matmul_impl(__gm__ uint8_t* qi_raw, __gm__ uint8_t* kj_raw, __gm__ uint8_t* sij_raw) +{ + constexpr int M = 16, K = 16, N = 16; + + __gm__ half* qi = reinterpret_cast<__gm__ half*>(qi_raw); + __gm__ half* kj = reinterpret_cast<__gm__ half*>(kj_raw); + __gm__ float* sij = reinterpret_cast<__gm__ float*>(sij_raw); + + // qi (M, K) fp16 in ND (row-major) layout + using GlobalA = GlobalTensor, pto::Stride>; + // kj stored as (N, K) row-major = (K, N) column-major -> DN layout + using GlobalB = GlobalTensor, pto::Stride, Layout::DN>; + using GlobalOut = GlobalTensor, pto::Stride>; + + GlobalA qiGlobal(qi); + GlobalB kjGlobal(kj); + GlobalOut sijGlobal(sij); + + // L1 Mat tiles: A is standard ND, B uses transposed-B pattern (RowMajor/ColMajor) + using TileMatA = Tile; + using TileMatB = Tile; + + // L0 tiles + using LeftTile = TileLeft; + using RightTile = TileRight; + using AccTile = TileAcc; + + TileMatA aMatTile; + TileMatB bMatTile; + TASSIGN(aMatTile, 0x0); + TASSIGN(bMatTile, 0x20000); + + LeftTile aTile; + RightTile bTile; + AccTile cTile; + TASSIGN(aTile, 0x0); + TASSIGN(bTile, 0x0); + TASSIGN(cTile, 0x0); + + // Load qi and kj to L1 + TLOAD(aMatTile, qiGlobal); + TLOAD(bMatTile, kjGlobal); + + set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); + + // Move to L0A/L0B + TMOV(aTile, aMatTile); + TMOV(bTile, bMatTile); + + set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); + wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); + + // Single matmul: (M,K) x (K,N) -> (M,N) + TMATMUL(cTile, aTile, bTile); + + set_flag(PIPE_M, PIPE_FIX, EVENT_ID0); + wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0); + + TSTORE(sijGlobal, cTile); +} + +extern "C" __aicore__ void kernel_entry(__gm__ int64_t* args) +{ + __gm__ uint8_t* qi = reinterpret_cast<__gm__ uint8_t*>(args[0]); + __gm__ uint8_t* kj = 
reinterpret_cast<__gm__ uint8_t*>(args[1]); + __gm__ uint8_t* sij = reinterpret_cast<__gm__ uint8_t*>(args[2]); + + qk_matmul_impl(qi, kj, sij); +} diff --git a/examples/a5/host_build_graph/paged_attention/kernels/aiv/aiv_online_update.cpp b/examples/a5/host_build_graph/paged_attention/kernels/aiv/aiv_online_update.cpp new file mode 100644 index 00000000..6f2ecd65 --- /dev/null +++ b/examples/a5/host_build_graph/paged_attention/kernels/aiv/aiv_online_update.cpp @@ -0,0 +1,220 @@ +// Online Softmax Update + Normalize Kernel (AIV) +// +// Fixed tile size: oi/oi_new are (16, 16), mij/lij/mi/li are 16-element vectors +// +// Scalar layout strategy: +// M scalar floats stored contiguously in GM can be loaded as either: +// - ND (kScalarRows, kScalarCols) RowMajor for element-wise ops (TMAX, TSUB, TEXP, TMUL, TADD) +// - DN (kAlignedRows, 1) ColMajor for row-broadcast ops (TROWEXPANDMUL, TROWEXPANDDIV) +// Conversion between layouts uses GM round-trip: ND TSTORE -> DN TLOAD. + +#include +#include + +using namespace pto; + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#define __aicore__ [aicore] +#endif + +static __aicore__ void online_update_impl(__gm__ uint8_t* mij_raw, __gm__ uint8_t* lij_raw, + __gm__ uint8_t* oi_new_raw, __gm__ uint8_t* mi_raw, + __gm__ uint8_t* li_raw, __gm__ uint8_t* oi_raw, + int is_first, int is_last, __gm__ uint8_t* dst_raw) +{ + constexpr int M = 16, N = 16; + + __gm__ float* mij_ptr = reinterpret_cast<__gm__ float*>(mij_raw); + __gm__ float* lij_ptr = reinterpret_cast<__gm__ float*>(lij_raw); + __gm__ float* oi_new_ptr = reinterpret_cast<__gm__ float*>(oi_new_raw); + __gm__ float* mi_ptr = reinterpret_cast<__gm__ float*>(mi_raw); + __gm__ float* li_ptr = reinterpret_cast<__gm__ float*>(li_raw); + __gm__ float* oi_ptr = reinterpret_cast<__gm__ float*>(oi_raw); + __gm__ float* dst_ptr = reinterpret_cast<__gm__ float*>(dst_raw); + + // Scalar tile dimensions for RowMajor layout: + // kScalarCols = 32 bytes / 4 bytes per float = 8 
floats per row (one 32-byte block) + // kScalarRows = M / 8 (M=16 -> 2 rows) + constexpr int kScalarCols = 32 / sizeof(float); + constexpr int kScalarRows = M / kScalarCols; + // Aligned rows for ColMajor DN tiles (32-byte alignment) + constexpr int kAlignedRows = ((M * sizeof(float) + 31) / 32) * (32 / sizeof(float)); + + // --- GlobalTensor types --- + + // Data (M, N) RowMajor + using GlobalDataMxN = GlobalTensor, pto::Stride<1, 1, 1, N, 1>>; + + // Scalar ND: M contiguous floats as (kScalarRows, kScalarCols) RowMajor + using GlobalScalarND = GlobalTensor, + pto::Stride<1, 1, 1, kScalarCols, 1>>; + + // Scalar DN: same M contiguous floats as (kAlignedRows, 1) ColMajor + using GlobalScalarDN = GlobalTensor, + pto::Stride<1, 1, 1, 1, 1>, Layout::DN>; + + // --- GlobalTensor instances --- + + GlobalDataMxN oiNewGlobal(oi_new_ptr); + GlobalDataMxN oiGlobal(oi_ptr); + GlobalDataMxN dstGlobal(dst_ptr); + + // ND globals for scalar element-wise operations + GlobalScalarND mijGlobalND(mij_ptr); + GlobalScalarND lijGlobalND(lij_ptr); + GlobalScalarND miGlobalND(mi_ptr); + GlobalScalarND liGlobalND(li_ptr); + + // DN globals aliased to same GM for ColMajor reload (used after ND TSTORE) + GlobalScalarDN mijGlobalDN(mij_ptr); + GlobalScalarDN lijGlobalDN(lij_ptr); + GlobalScalarDN liGlobalDN(li_ptr); + + // --- Tile types --- + + using TileDataMxN = Tile; + using TileScalarND = Tile; + using TileScalarDN = Tile; + + // --- UB memory layout --- + + constexpr int kDataBytes = M * N * sizeof(float); + constexpr int kScalarNDBytes = kScalarRows * kScalarCols * sizeof(float); + constexpr int kScalarDNBytes = kAlignedRows * sizeof(float); + + // Data tiles + TileDataMxN oiNewTile; + TileDataMxN oiTile; + + // Scalar ND tiles for element-wise arithmetic + TileScalarND mijND, lijND, miND, liND; + TileScalarND miNewND, alphaND, betaND, tmpND; + + // Scalar DN tiles for TROWEXPAND operations + TileScalarDN alphaDN, betaDN, liDN; + + TASSIGN(oiNewTile, 0); + TASSIGN(oiTile, 
kDataBytes); + TASSIGN(mijND, 2 * kDataBytes); + TASSIGN(lijND, 2 * kDataBytes + kScalarNDBytes); + TASSIGN(miND, 2 * kDataBytes + 2 * kScalarNDBytes); + TASSIGN(liND, 2 * kDataBytes + 3 * kScalarNDBytes); + TASSIGN(miNewND, 2 * kDataBytes + 4 * kScalarNDBytes); + TASSIGN(alphaND, 2 * kDataBytes + 5 * kScalarNDBytes); + TASSIGN(betaND, 2 * kDataBytes + 6 * kScalarNDBytes); + TASSIGN(tmpND, 2 * kDataBytes + 7 * kScalarNDBytes); + TASSIGN(alphaDN, 2 * kDataBytes + 8 * kScalarNDBytes); + TASSIGN(betaDN, 2 * kDataBytes + 8 * kScalarNDBytes + kScalarDNBytes); + TASSIGN(liDN, 2 * kDataBytes + 8 * kScalarNDBytes + 2 * kScalarDNBytes); + + if (is_first) { + // --- First block: copy inputs to accumulators --- + TLOAD(oiNewTile, oiNewGlobal); + TLOAD(mijND, mijGlobalND); + TLOAD(lijND, lijGlobalND); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + + // Passthrough to MTE3 (no V compute needed) + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(miGlobalND, mijND); // mi = mij + TSTORE(liGlobalND, lijND); // li = lij + TSTORE(oiGlobal, oiNewTile); // oi = oi_new + + if (is_last) { + // Single block: normalize dst = oi_new / lij + // lij stored to li buffer in ND format; reload as DN for TROWEXPANDDIV + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + TLOAD(liDN, liGlobalDN); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TROWEXPANDDIV(oiNewTile, oiNewTile, liDN); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); + TSTORE(dstGlobal, oiNewTile); + } + } else { + // --- Subsequent blocks: accumulate --- + + // Phase 1: Load all inputs + TLOAD(oiNewTile, oiNewGlobal); + TLOAD(oiTile, oiGlobal); + TLOAD(mijND, mijGlobalND); + TLOAD(lijND, lijGlobalND); + TLOAD(miND, miGlobalND); + TLOAD(liND, liGlobalND); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, 
EVENT_ID0); + + // Phase 2: Scalar arithmetic in RowMajor (kScalarRows, kScalarCols) + // to resolve RAW hazards on shared UB tiles. + TMAX(miNewND, miND, mijND); // mi_new = max(mi, mij) + TSUB(alphaND, miND, miNewND); // alpha = mi - mi_new + TEXP(alphaND, alphaND); // alpha = exp(mi - mi_new) + TSUB(betaND, mijND, miNewND); // beta = mij - mi_new + TEXP(betaND, betaND); // beta = exp(mij - mi_new) + TMUL(liND, alphaND, liND); // li = alpha * li + TMUL(tmpND, betaND, lijND); // tmp = beta * lij + TADD(liND, liND, tmpND); // li = alpha * li + beta * lij (= li_new) + + // Phase 3: Store scalar results to GM (ND format) + // mi_new -> mi accumulator, li_new -> li accumulator + // alpha -> mij buffer (reuse), beta -> lij buffer (reuse) + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(miGlobalND, miNewND); // persist mi_new + TSTORE(liGlobalND, liND); // persist li_new + TSTORE(mijGlobalND, alphaND); // temp: alpha to mij buffer + TSTORE(lijGlobalND, betaND); // temp: beta to lij buffer + + // Phase 4: Reload alpha, beta (and li if last) as ColMajor DN + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + TLOAD(alphaDN, mijGlobalDN); // alpha from mij buffer as DN + TLOAD(betaDN, lijGlobalDN); // beta from lij buffer as DN + if (is_last) { + TLOAD(liDN, liGlobalDN); // li_new from li buffer as DN + } + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + + // Phase 5: Scale data tiles using row-broadcast multiply + TROWEXPANDMUL(oiTile, oiTile, alphaDN); // oi *= alpha + TROWEXPANDMUL(oiNewTile, oiNewTile, betaDN); // oi_new *= beta + TADD(oiTile, oiTile, oiNewTile); // oi = alpha*oi + beta*oi_new + + if (is_last) { + // Phase 6: Normalize and output + TROWEXPANDDIV(oiTile, oiTile, liDN); // dst = oi / li_new + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); + TSTORE(dstGlobal, oiTile); + } else { + // Phase 6: Store 
updated accumulators + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); + TSTORE(oiGlobal, oiTile); + } + } +} + +extern "C" __aicore__ void kernel_entry(__gm__ int64_t* args) { + __gm__ uint8_t* mij = reinterpret_cast<__gm__ uint8_t*>(args[0]); + __gm__ uint8_t* lij = reinterpret_cast<__gm__ uint8_t*>(args[1]); + __gm__ uint8_t* oi_new = reinterpret_cast<__gm__ uint8_t*>(args[2]); + __gm__ uint8_t* mi = reinterpret_cast<__gm__ uint8_t*>(args[3]); + __gm__ uint8_t* li = reinterpret_cast<__gm__ uint8_t*>(args[4]); + __gm__ uint8_t* oi = reinterpret_cast<__gm__ uint8_t*>(args[5]); + int is_first = static_cast(args[6]); + int is_last = static_cast(args[7]); + __gm__ uint8_t* dst = reinterpret_cast<__gm__ uint8_t*>(args[8]); + + online_update_impl(mij, lij, oi_new, mi, li, oi, is_first, is_last, dst); +} diff --git a/examples/a5/host_build_graph/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp b/examples/a5/host_build_graph/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp new file mode 100644 index 00000000..7b168049 --- /dev/null +++ b/examples/a5/host_build_graph/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp @@ -0,0 +1,94 @@ +// Softmax Preparation Kernel (AIV) +// +// Fixed tile size: sij is (16, 16) +// +// Computes: +// sij_scale = sij * scale +// mij = row_max(sij_scale) -> (M, 1) +// pij = exp(sij_scale - mij) -> (M, N) +// lij = row_sum(pij) -> (M, 1) + +#include +#include + +using namespace pto; + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#define __aicore__ [aicore] +#endif + +static __aicore__ void softmax_prepare_impl(__gm__ uint8_t* sij_raw, float scale_value, + __gm__ uint8_t* pij_raw, __gm__ uint8_t* mij_raw, + __gm__ uint8_t* lij_raw) +{ + constexpr int M = 16, N = 16; + + __gm__ float* sij = reinterpret_cast<__gm__ float*>(sij_raw); + __gm__ half* pij = reinterpret_cast<__gm__ half*>(pij_raw); + __gm__ float* mij = reinterpret_cast<__gm__ float*>(mij_raw); + __gm__ float* lij = 
reinterpret_cast<__gm__ float*>(lij_raw); + + constexpr int kAlignedRows = ((M * sizeof(float) + 31) / 32) * (32 / sizeof(float)); + + using GlobalDataMxN = GlobalTensor, pto::Stride<1, 1, 1, N, 1>>; + using GlobalDataMxN_f16 = GlobalTensor, pto::Stride<1, 1, 1, N, 1>>; + using GlobalScalarDN = GlobalTensor, pto::Stride<1, 1, 1, 1, 1>, Layout::DN>; + + GlobalDataMxN sijGlobal(sij); + GlobalDataMxN_f16 pijGlobal(pij); + GlobalScalarDN mijGlobal(mij); + GlobalScalarDN lijGlobal(lij); + + using TileVecMxN = Tile; + using TileVecMxN_f16 = Tile; + using TileScalarDN = Tile; + + TileVecMxN sijTile; + TileVecMxN pijTile; + TileVecMxN tmpTile; + TileScalarDN maxTile; + TileScalarDN sumTile; + TileVecMxN_f16 pijF16Tile; + + TASSIGN(sijTile, 0x0); + TASSIGN(pijTile, M * N * sizeof(float)); + TASSIGN(tmpTile, 2 * M * N * sizeof(float)); + TASSIGN(maxTile, 3 * M * N * sizeof(float)); + TASSIGN(sumTile, 3 * M * N * sizeof(float) + kAlignedRows * sizeof(float)); + TASSIGN(pijF16Tile, 3 * M * N * sizeof(float) + 2 * kAlignedRows * sizeof(float)); + + TLOAD(sijTile, sijGlobal); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + + TMULS(sijTile, sijTile, scale_value); + TROWMAX(maxTile, sijTile, tmpTile); + TROWEXPANDSUB(pijTile, sijTile, maxTile); + TEXP(pijTile, pijTile); + // Truncate pij to fp16 first, then compute lij from truncated values (matches golden) + TCVT(pijF16Tile, pijTile, RoundMode::CAST_ROUND); + TCVT(pijTile, pijF16Tile, RoundMode::CAST_ROUND); + TROWSUM(sumTile, pijTile, tmpTile); + + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(mijGlobal, maxTile); + TSTORE(lijGlobal, sumTile); + TSTORE(pijGlobal, pijF16Tile); +} + +extern "C" __aicore__ void kernel_entry(__gm__ int64_t* args) { + __gm__ uint8_t* sij = reinterpret_cast<__gm__ uint8_t*>(args[0]); + union { uint64_t u; float f; } scale_conv; + scale_conv.u = static_cast(args[1]); + float scale_value = scale_conv.f; + __gm__ 
uint8_t* pij = reinterpret_cast<__gm__ uint8_t*>(args[2]); + __gm__ uint8_t* mij = reinterpret_cast<__gm__ uint8_t*>(args[3]); + __gm__ uint8_t* lij = reinterpret_cast<__gm__ uint8_t*>(args[4]); + + softmax_prepare_impl(sij, scale_value, pij, mij, lij); +} diff --git a/examples/a5/host_build_graph/paged_attention/kernels/kernel_config.py b/examples/a5/host_build_graph/paged_attention/kernels/kernel_config.py new file mode 100644 index 00000000..d826b9fc --- /dev/null +++ b/examples/a5/host_build_graph/paged_attention/kernels/kernel_config.py @@ -0,0 +1,42 @@ +""" +Paged Attention Kernel and Orchestration Configuration + +Defines the kernels and orchestration function for paged attention +with AIC/AIV subgraph splitting: + +AIC Kernels (Matrix Multiplication): + - aic_qk_matmul: Q @ K^T computation + - aic_pv_matmul: P @ V computation + +AIV Kernels (Vector Operations): + - aiv_softmax_prepare: scale, rowmax, exp, rowsum + - aiv_online_update: online softmax accumulation + fused normalization +""" + +from pathlib import Path + +_KERNELS_ROOT = Path(__file__).parent + +# Orchestration config +ORCHESTRATION = { + "source": str(_KERNELS_ROOT / "orchestration" / "paged_attention_orch.cpp"), + "function_name": "build_paged_attention_graph", +} + +# Kernel configs +KERNELS = [ + # AIC kernels (matrix multiplication using Cube unit) + {"func_id": 0, "name": "QK", "source": str(_KERNELS_ROOT / "aic" / "aic_qk_matmul.cpp"), "core_type": "aic"}, + {"func_id": 2, "name": "PV", "source": str(_KERNELS_ROOT / "aic" / "aic_pv_matmul.cpp"), "core_type": "aic"}, + # AIV kernels (vector operations) + {"func_id": 1, "name": "SF", "source": str(_KERNELS_ROOT / "aiv" / "aiv_softmax_prepare.cpp"), "core_type": "aiv"}, + {"func_id": 3, "name": "UP", "source": str(_KERNELS_ROOT / "aiv" / "aiv_online_update.cpp"), "core_type": "aiv"}, +] + +# Runtime configuration +RUNTIME_CONFIG = { + "runtime": "host_build_graph", + "aicpu_thread_num": 3, + "orch_thread_num": 0, + "block_dim": 3, +} diff 
--git a/examples/a5/host_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp b/examples/a5/host_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp new file mode 100644 index 00000000..5b29b587 --- /dev/null +++ b/examples/a5/host_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp @@ -0,0 +1,256 @@ +/** + * Paged Attention Orchestration - Small Scale (16x16) + * + * Supports small-scale paged attention with: + * Query: (batch, q_head_num, head_dim) fp16 + * Key: (total_blocks, block_size, kv_head_num, head_dim) fp16 (NOT transposed) + * Value: (total_blocks, block_size, kv_head_num, head_dim) fp16 + * Output: (batch * q_head_num, head_dim) float32 + * + * Head tiling: q_tile_size = min(num_heads, 128) + * GQA: kv_head_num can differ from q_head_num + */ + +#include "runtime.h" +#include +#include +#include + +#define FUNC_QK_MATMUL 0 +#define FUNC_SOFTMAX_PREPARE 1 +#define FUNC_PV_MATMUL 2 +#define FUNC_ONLINE_UPDATE 3 + +extern "C" { + +int build_paged_attention_graph(Runtime* runtime, uint64_t* args, int arg_count) { + if (arg_count < 14) { + std::cerr << "Expected at least 14 args, got " << arg_count << '\n'; + return -1; + } + + void* host_query = reinterpret_cast(args[0]); + void* host_key_cache = reinterpret_cast(args[1]); + void* host_value_cache = reinterpret_cast(args[2]); + int* host_block_table = reinterpret_cast(args[3]); + int* host_context_lens = reinterpret_cast(args[4]); + void* host_out = reinterpret_cast(args[5]); + int64_t* host_config = reinterpret_cast(args[6]); + + size_t query_size = static_cast(args[7]); + size_t key_cache_size = static_cast(args[8]); + size_t value_cache_size = static_cast(args[9]); + size_t block_table_size = static_cast(args[10]); + size_t context_lens_size = static_cast(args[11]); + size_t out_size = static_cast(args[12]); + size_t config_size = static_cast(args[13]); + + int batch = static_cast(host_config[0]); + int num_heads = 
static_cast(host_config[1]); + int kv_head_num = static_cast(host_config[2]); + int head_dim = static_cast(host_config[3]); + int block_size = static_cast(host_config[4]); + int max_num_blocks = static_cast(host_config[5]); + uint64_t scale_value_bits = static_cast(host_config[6]); + + int q_tile_size = std::min(num_heads, 128); + int num_head_tiles = (num_heads + q_tile_size - 1) / q_tile_size; + + std::cout << "\n=== build_paged_attention_graph ===" << '\n'; + std::cout << "batch=" << batch << ", num_heads=" << num_heads + << ", kv_head_num=" << kv_head_num << ", head_dim=" << head_dim << '\n'; + std::cout << "block_size=" << block_size << ", max_num_blocks=" << max_num_blocks << '\n'; + std::cout << "q_tile_size=" << q_tile_size << ", num_head_tiles=" << num_head_tiles << '\n'; + + // Allocate device memory for inputs/outputs + void* dev_query = runtime->host_api.device_malloc(query_size); + void* dev_key_cache = runtime->host_api.device_malloc(key_cache_size); + void* dev_value_cache = runtime->host_api.device_malloc(value_cache_size); + void* dev_out = runtime->host_api.device_malloc(out_size); + + if (!dev_query || !dev_key_cache || !dev_value_cache || !dev_out) { + std::cerr << "Error: Failed to allocate device memory\n"; + return -1; + } + + runtime->host_api.copy_to_device(dev_query, host_query, query_size); + runtime->host_api.copy_to_device(dev_key_cache, host_key_cache, key_cache_size); + runtime->host_api.copy_to_device(dev_value_cache, host_value_cache, value_cache_size); + runtime->record_tensor_pair(host_out, dev_out, out_size); + + // Buffer sizes depend on q_tile_size and block_size + size_t sij_size = static_cast(q_tile_size) * block_size * sizeof(float); + size_t pij_size = static_cast(q_tile_size) * block_size * sizeof(uint16_t); + size_t mij_size = static_cast(q_tile_size) * sizeof(float); + size_t lij_size = mij_size; + size_t oi_new_size = static_cast(q_tile_size) * head_dim * sizeof(float); + + // Per-batch-per-block intermediate buffers + 
int total_buffers = batch * max_num_blocks; + void** dev_sij_arr = new void*[total_buffers]; + void** dev_pij_arr = new void*[total_buffers]; + void** dev_mij_arr = new void*[total_buffers]; + void** dev_lij_arr = new void*[total_buffers]; + void** dev_oi_new_arr = new void*[total_buffers]; + + for (int i = 0; i < total_buffers; i++) { + dev_sij_arr[i] = runtime->host_api.device_malloc(sij_size); + dev_pij_arr[i] = runtime->host_api.device_malloc(pij_size); + dev_mij_arr[i] = runtime->host_api.device_malloc(mij_size); + dev_lij_arr[i] = runtime->host_api.device_malloc(lij_size); + dev_oi_new_arr[i] = runtime->host_api.device_malloc(oi_new_size); + } + + // Per-(batch, head_tile) accumulators + int total_accums = batch * num_head_tiles; + size_t mi_size = static_cast(q_tile_size) * sizeof(float); + size_t li_size = mi_size; + size_t oi_size = static_cast(q_tile_size) * head_dim * sizeof(float); + + void** dev_mi_arr = new void*[total_accums]; + void** dev_li_arr = new void*[total_accums]; + void** dev_oi_arr = new void*[total_accums]; + + for (int i = 0; i < total_accums; i++) { + dev_mi_arr[i] = runtime->host_api.device_malloc(mi_size); + dev_li_arr[i] = runtime->host_api.device_malloc(li_size); + dev_oi_arr[i] = runtime->host_api.device_malloc(oi_size); + } + + std::cout << "Allocated " << total_buffers << " per-block buffers\n"; + std::cout << "Allocated " << total_accums << " per-(batch,head_tile) accumulators\n"; + + int total_tasks = 0; + + for (int b_idx = 0; b_idx < batch; b_idx++) { + int cur_seq = host_context_lens[b_idx]; + int bn_this_batch = (cur_seq + block_size - 1) / block_size; + + for (int ht = 0; ht < num_head_tiles; ht++) { + int cur_offset = ht * q_tile_size; + + // Query: (batch, q_head_num, head_dim) fp16 + // qi points to heads [cur_offset .. 
cur_offset+q_tile_size) for batch b_idx + uint8_t* qi_ptr = reinterpret_cast(dev_query) + + static_cast(b_idx * num_heads + cur_offset) * head_dim * sizeof(uint16_t); + + // Output: (batch * q_head_num, head_dim) float32 + uint8_t* out_ptr = reinterpret_cast(dev_out) + + static_cast(b_idx * num_heads + cur_offset) * head_dim * sizeof(float); + + // GQA: which kv_head this head tile maps to + int kv_head_idx = cur_offset / (num_heads / kv_head_num); + + // Per-(batch, head_tile) accumulators + int accum_idx = b_idx * num_head_tiles + ht; + void* dev_mi = dev_mi_arr[accum_idx]; + void* dev_li = dev_li_arr[accum_idx]; + void* dev_oi = dev_oi_arr[accum_idx]; + + int t_up_prev = -1; + + for (int bn = 0; bn < bn_this_batch; bn++) { + int cur_block_idx = host_block_table[b_idx * max_num_blocks + bn]; + + // Key: (total_blocks, block_size, kv_head_num, head_dim) fp16 + uint8_t* kj_ptr = reinterpret_cast(dev_key_cache) + + (static_cast(cur_block_idx) * block_size * kv_head_num + kv_head_idx) + * head_dim * sizeof(uint16_t); + + // Value: (total_blocks, block_size, kv_head_num, head_dim) fp16 + uint8_t* vj_ptr = reinterpret_cast(dev_value_cache) + + (static_cast(cur_block_idx) * block_size * kv_head_num + kv_head_idx) + * head_dim * sizeof(uint16_t); + + int buf_idx = b_idx * max_num_blocks + bn; + void* dev_sij = dev_sij_arr[buf_idx]; + void* dev_pij = dev_pij_arr[buf_idx]; + void* dev_mij = dev_mij_arr[buf_idx]; + void* dev_lij = dev_lij_arr[buf_idx]; + void* dev_oi_new = dev_oi_new_arr[buf_idx]; + + // QK: qi(M, K) @ kj.T(K, N) -> sij(M, N) + uint64_t qk_args[6] = { + reinterpret_cast(qi_ptr), + reinterpret_cast(kj_ptr), + reinterpret_cast(dev_sij), + static_cast(q_tile_size), + static_cast(head_dim), + static_cast(block_size) + }; + int t_qk = runtime->add_task(qk_args, 6, FUNC_QK_MATMUL, CoreType::AIC); + total_tasks++; + + // SF: scale, rowmax, exp, rowsum -> pij, mij, lij + uint64_t sf_args[7] = { + reinterpret_cast(dev_sij), + scale_value_bits, + 
reinterpret_cast(dev_pij), + reinterpret_cast(dev_mij), + reinterpret_cast(dev_lij), + static_cast(q_tile_size), + static_cast(block_size) + }; + int t_sf = runtime->add_task(sf_args, 7, FUNC_SOFTMAX_PREPARE, CoreType::AIV); + total_tasks++; + + // PV: pij(M, K') @ vj(K', N') -> oi_new(M, N') + uint64_t pv_args[6] = { + reinterpret_cast(dev_pij), + reinterpret_cast(vj_ptr), + reinterpret_cast(dev_oi_new), + static_cast(q_tile_size), + static_cast(block_size), + static_cast(head_dim) + }; + int t_pv = runtime->add_task(pv_args, 6, FUNC_PV_MATMUL, CoreType::AIC); + total_tasks++; + + runtime->add_successor(t_qk, t_sf); + runtime->add_successor(t_sf, t_pv); + + // Online Update: serialized across blocks (each depends on previous) + int is_first = (bn == 0) ? 1 : 0; + int is_last = (bn == bn_this_batch - 1) ? 1 : 0; + + uint64_t up_args[11] = { + reinterpret_cast(dev_mij), + reinterpret_cast(dev_lij), + reinterpret_cast(dev_oi_new), + reinterpret_cast(dev_mi), + reinterpret_cast(dev_li), + reinterpret_cast(dev_oi), + static_cast(is_first), + static_cast(is_last), + reinterpret_cast(out_ptr), + static_cast(q_tile_size), + static_cast(head_dim) + }; + int t_up = runtime->add_task(up_args, 11, FUNC_ONLINE_UPDATE, CoreType::AIV); + total_tasks++; + + runtime->add_successor(t_pv, t_up); + if (t_up_prev >= 0) { + runtime->add_successor(t_up_prev, t_up); + } + t_up_prev = t_up; + } + } + } + + delete[] dev_sij_arr; + delete[] dev_pij_arr; + delete[] dev_mij_arr; + delete[] dev_lij_arr; + delete[] dev_oi_new_arr; + delete[] dev_mi_arr; + delete[] dev_li_arr; + delete[] dev_oi_arr; + + std::cout << "Created " << total_tasks << " tasks\n"; + runtime->print_runtime(); + + return 0; +} + +} diff --git a/examples/a5/tensormap_and_ringbuffer/paged_attention/TFILLPAD_INPLACE_BUG.md b/examples/a5/tensormap_and_ringbuffer/paged_attention/TFILLPAD_INPLACE_BUG.md new file mode 100644 index 00000000..5d83385a --- /dev/null +++ 
b/examples/a5/tensormap_and_ringbuffer/paged_attention/TFILLPAD_INPLACE_BUG.md @@ -0,0 +1,205 @@ +# TFILLPAD_INPLACE Bug at Small Tile Width (N ≤ 16) + +## Summary + +`TFILLPAD_INPLACE` produces incorrect padding results on Ascend A2/A3 hardware when +the tile column count `N` is small (e.g. N=16 for float32). The bug manifests as +corrupted data in the padded region for certain `valid_len` values, causing downstream +softmax and attention computations to produce wrong results. + +## Affected Configuration + +- **Platform**: Ascend A2/A3 (tested on hardware, also reproduces on simulator) +- **Data type**: float32 (sizeof=4) +- **Tile shape**: (M, N) = (16, 16) — i.e. 2 × 32-byte blocks per row +- **PTO source**: `include/pto/npu/a2a3/TFillPad.hpp` + +The bug does NOT reproduce at larger N values (N=32, 64, 128) where the same +`valid_len` values work correctly. + +## Reproduction + +In the paged attention example (`examples/tensormap_and_ringbuffer/paged_attention/`), +the softmax preparation kernel uses `TFILLPAD_INPLACE` to mask invalid key positions +with `-inf` before computing softmax: + +```cpp +// Tile types +using TileSijDyn = Tile; +using TileSijPad = Tile; + +TileSijDyn sijDynTile(valid_len); // valid_len = number of valid columns +TileSijPad sijPadTile; +// Both assigned to same UB address (in-place) +TASSIGN(sijDynTile, 0x0); +TASSIGN(sijPadTile, 0x0); + +// After loading sij from GM: +TFILLPAD_INPLACE(sijPadTile, sijDynTile); +// Expected: columns [valid_len, 16) filled with -inf (0xff800000) +// Actual: corrupted for certain valid_len values +``` + +### Test Matrix (N=16, float32, on hardware) + +| valid_len | context_len | blocks | TFILLPAD_INPLACE only | SetValue only | TFILLPAD + SetValue | +|-----------|-------------|--------|-----------------------|---------------|---------------------| +| 1 | 17 | 2 | FAIL (27/256) | PASS | PASS | +| 7 | 23 | 2 | FAIL (29/256) | PASS | PASS | +| 8 | 24 | 2 | FAIL (28/256) | FAIL (182/256)| PASS | +| 9 | 25 | 2 | 
PASS | PASS | PASS | +| 12 | 28 | 2 | PASS | PASS | PASS | +| 15 | 31 | 2 | PASS | PASS | PASS | +| 16 (full) | 32 | 2 | PASS | PASS | PASS | +| 1 | 33 | 3 | FAIL (25/256) | FAIL (88/256) | PASS | + +### Cross-dimension validation (confirming N=16 is the trigger) + +| num_heads | head_dim | block_size (=N) | context_len | valid_len | Result | +|-----------|----------|-----------------|-------------|-----------|--------| +| 16 | 16 | **16** | 33 | 1 | FAIL | +| 16 | 16 | **32** | 33 | 1 | PASS | +| 16 | **32** | **16** | 33 | 1 | FAIL | + +block_size determines N in the softmax tile (M, N). When block_size=32 (N=32), +the same valid_len=1 passes. When block_size=16 (N=16), it fails regardless of +head_dim. + +## Root Cause Analysis + +The bug is in the `TFillPad` function in `include/pto/npu/a2a3/TFillPad.hpp`. +The function has two internal code paths for filling padding: + +### Path A: `Handle32BAlignedPad_Other` (lines 103-134) + +Fills the **partial 32-byte block** at the boundary using `vector_dup` with a +norm-mode bitmask. This path is reliable. + +### Path B: `PadRightSingleRow` + `PadRightRemainingRows` (lines 136-167) + +Fills **complete 32-byte blocks** to the right of the boundary. Uses `vector_dup` +for row 0, then `vcopy` with `srcRepeatStride=0` (broadcast) to replicate to +remaining rows. 
**This path has the bug.** + +### Which path runs depends on `valid_len` + +The key variable is `srcValidCol32B` — the valid_len rounded up to the next +32-byte-aligned element count: + +``` +elements_per_block = 32 / sizeof(float) = 8 +srcValidCol32B = ceil(valid_len / 8) * 8 +padOffset = srcValidCol32B +padCols = N - srcValidCol32B // columns for Path B +pad_32B = srcValidCol32B - valid_len // columns for Path A +``` + +For N=16 (2 blocks of 8 elements each): + +``` +valid_len ∈ [1, 8]: + srcValidCol32B = 8 + padOffset = 8, padCols = 8 → Path B runs (fills block 1) + pad_32B = 8 - valid_len → Path A runs if valid_len < 8 + +valid_len ∈ [9, 15]: + srcValidCol32B = 16 + padOffset = 16, padCols = 0 → Path B is a NO-OP + pad_32B = 16 - valid_len → Path A runs (fills within block 1) + +valid_len = 16: + No padding needed (full block) +``` + +**Pattern: valid_len ≤ 8 → Path B runs → BUG. valid_len ≥ 9 → only Path A → OK.** + +### Path B code trace (the buggy path) + +```cpp +// PadRightSingleRow: fill row 0's right padding +set_mask_count(); +set_vector_mask(0, padCols); // padCols = 8 +vector_dup(dstPtr + padOffset, dupPadValue, 1, 1, 1, 8, 0); +// ^-- dstPtr + 8 (element 8 of row 0) +pipe_barrier(PIPE_V); + +// PadRightRemainingRows: broadcast row 0's pattern to rows 1..M-1 +dstRepeatStride = N * sizeof(float) / 32; // = 16 * 4 / 32 = 2 +_dstPtr = dstPtr + padOffset + copyDstCols; // = dstPtr + 8 + 16 = dstPtr + 24 +fillRow = M - 1; // = 15 + +vcopy(_dstPtr, dstPtr + padOffset, 15, 1, 0, 2, 0); +// dst src rep dB sB dR sR +// row1:8 row0:8 15 1 0 2 0 +// +// dstRepeatStride=2 (64 bytes = 1 row), srcRepeatStride=0 (broadcast) +// mask: counter mode, 8 elements (inherited from PadRightSingleRow) +``` + +The `vcopy` with `srcRepeatStride=0` and `dstRepeatStride=2` at N=16 appears to +produce incorrect results on hardware. The exact hardware failure mode is unclear, +but it consistently corrupts the padding data. 
+ +### Why valid_len=8 is special + +When `valid_len=8`: +- `pad_32B = 8 - 8 = 0` → Path A computes `mask = 0xff >> 8 << 8 = 0` +- `set_vector_mask(0, 0)` is called, then `vector_dup` with zero mask +- This is effectively a no-op, but may have undefined behavior on hardware +- Path B still runs and produces incorrect results +- Additionally, `SetValue`-only workaround also fails for valid_len=8, + suggesting the zero-mask `vector_dup` in Path A corrupts pipeline state + +## Workaround + +The working fix uses **both** `TFILLPAD_INPLACE` and scalar `SetValue` writes: + +```cpp +// Step 1: TFILLPAD_INPLACE sets up vector pipeline state correctly +// (mask modes, barriers, etc.) even though its data output is buggy +TFILLPAD_INPLACE(sijPadTile, sijDynTile); + +// Step 2: SetValue patches the actual data with correct -inf values +if (valid_len < static_cast(N)) { + constexpr float NEG_INF = -__builtin_huge_valf(); + for (int r = 0; r < M; r++) { + for (uint64_t c = valid_len; c < N; c++) { + sijTile.SetValue(static_cast(r * N + c), NEG_INF); + } + } +} +``` + +**Why both are needed:** + +| Approach | valid_len=1 | valid_len=7 | valid_len=8 | +|------------------------|-------------|-------------|-------------| +| TFILLPAD_INPLACE only | FAIL | FAIL | FAIL | +| SetValue only | PASS | PASS | FAIL | +| TFILLPAD + SetValue | PASS | PASS | PASS | + +- `TFILLPAD_INPLACE` alone: Path B produces wrong data +- `SetValue` alone: works for most cases, but valid_len=8 fails because + Path A's zero-mask `vector_dup` (which runs before SetValue in the + TFILLPAD-only case) apparently sets up necessary pipeline state that + subsequent vector operations depend on +- Both together: TFILLPAD handles pipeline state, SetValue fixes the data + +## Scope + +- **Affected**: Any `TFILLPAD_INPLACE` call with float32 tiles where + `N ≤ 16` and `valid_len ≤ N/2` (i.e. 
valid data fits within the first + 32-byte block of each row) +- **Not affected**: N ≥ 32 (tested with N=32, 64, 128 — all pass) +- **Not affected**: Full tiles (valid_len == N) +- **Likely affected**: float16/bfloat16 tiles with N ≤ 32 (untested, but + the same code path would be triggered since elements_per_block=16 for + 16-bit types, and the same vcopy broadcast pattern is used) + +## Files + +- Bug location: `include/pto/npu/a2a3/TFillPad.hpp`, functions + `PadRightSingleRow` (line 136) and `PadRightRemainingRows` (line 146) +- Workaround applied in: `examples/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp` +- Test configuration: `examples/tensormap_and_ringbuffer/paged_attention/golden.py` diff --git a/examples/a5/tensormap_and_ringbuffer/paged_attention/golden.py b/examples/a5/tensormap_and_ringbuffer/paged_attention/golden.py new file mode 100644 index 00000000..6eeb936e --- /dev/null +++ b/examples/a5/tensormap_and_ringbuffer/paged_attention/golden.py @@ -0,0 +1,67 @@ +"""Paged Attention Golden - tensormap_and_ringbuffer example (small scale, float16).""" + +from paged_attention_golden import ( + generate_inputs as _generate_inputs, + compute_golden, + run_golden_test, +) + +__outputs__ = ["out"] + +RTOL = 1e-2 +ATOL = 1e-2 + +ALL_CASES = { + "Case1": { + "batch": 1, + "num_heads": 16, + "kv_head_num": 1, + "head_dim": 16, + "block_size": 16, + "context_len": 33, + "max_model_len": 256, + "dtype": "float16", + }, + "Case2": { + "batch": 1, + "num_heads": 16, + "kv_head_num": 1, + "head_dim": 16, + "block_size": 16, + "context_len": 128, + "max_model_len": 256, + "dtype": "float16", + }, + "CaseVarSeq2": { + "batch": 2, + "num_heads": 16, + "kv_head_num": 1, + "head_dim": 16, + "block_size": 16, + "context_len": 33, + "context_lens_list": [33, 17], + "max_model_len": 256, + "dtype": "float16", + }, + "CaseVarSeq4": { + "batch": 4, + "num_heads": 16, + "kv_head_num": 1, + "head_dim": 16, + "block_size": 16, + "context_len": 128, 
+ "context_lens_list": [33, 64, 128, 15], + "max_model_len": 256, + "dtype": "float16", + }, +} + +DEFAULT_CASE = "Case1" + + +def generate_inputs(params: dict) -> list: + return _generate_inputs(params, return_all_sizes=False) + + +if __name__ == "__main__": + run_golden_test(ALL_CASES, DEFAULT_CASE, generate_inputs) diff --git a/tests/device_tests/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_hub.cpp b/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_hub.cpp similarity index 100% rename from tests/device_tests/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_hub.cpp rename to examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_hub.cpp diff --git a/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_pv_matmul.cpp b/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_pv_matmul.cpp new file mode 100644 index 00000000..0f9b0ae5 --- /dev/null +++ b/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_pv_matmul.cpp @@ -0,0 +1,89 @@ +// PV Matmul Kernel: pij(M, K) @ vj(K, N) -> oi_new(M, N) +// +// Fixed tile size: (16, 16) @ (16, 16) -> (16, 16) +// +// pij is float16 (converted from fp32 in softmax_prepare via TCVT). +// vj is stored as (K, N) = (block_size, head_dim) in row-major (ND) layout. +// Standard non-transposed B pattern: ND GlobalB + ColMajor/RowMajor TileMatB. 
+ +#include +#include + +#include "tensor.h" + +using namespace pto; + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#define __aicore__ [aicore] +#endif + +template +static __aicore__ void pv_matmul_impl(__gm__ Tensor* pij, __gm__ Tensor* vj, __gm__ Tensor* oi) { + __gm__ half* pij_addr = reinterpret_cast<__gm__ half*>(pij->buffer.addr); + __gm__ half* vj_addr = reinterpret_cast<__gm__ half*>(vj->buffer.addr); + __gm__ float* oi_addr = reinterpret_cast<__gm__ float*>(oi->buffer.addr); + + // pij (M, K) fp16, vj (K, N) fp16 in ND (row-major), oi_new (M, N) fp32 + using GlobalA = GlobalTensor, pto::Stride>; + using GlobalB = GlobalTensor, pto::Stride>; + using GlobalOut = GlobalTensor, pto::Stride>; + + GlobalA pijGlobal(pij_addr + pij->start_offset); + GlobalB vjGlobal(vj_addr + vj->start_offset); + GlobalOut oiGlobal(oi_addr + oi->start_offset); + + // L1 Mat tiles: standard ND pattern for both A and B + using TileMatA = Tile; + using TileMatB = Tile; + + // L0 tiles + using LeftTile = TileLeft; + using RightTile = TileRight; + using AccTile = TileAcc; + + TileMatA aMatTile; + TileMatB bMatTile; + TASSIGN(aMatTile, 0x0); + TASSIGN(bMatTile, 0x20000); + + LeftTile aTile; + RightTile bTile; + AccTile cTile; + TASSIGN(aTile, 0x0); + TASSIGN(bTile, 0x0); + TASSIGN(cTile, 0x0); + + // Load pij and vj to L1 + TLOAD(aMatTile, pijGlobal); + TLOAD(bMatTile, vjGlobal); + + set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); + + // Move to L0A/L0B + TMOV(aTile, aMatTile); + TMOV(bTile, bMatTile); + + set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); + wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); + + // Single matmul: (M,K) x (K,N) -> (M,N) + TMATMUL(cTile, aTile, bTile); + + set_flag(PIPE_M, PIPE_FIX, EVENT_ID0); + wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0); + + TSTORE(oiGlobal, cTile); +} + +extern "C" __aicore__ void kernel_entry(__gm__ int64_t* args) { + __gm__ Tensor* pij = reinterpret_cast<__gm__ Tensor*>(args[0]); + __gm__ Tensor* vj 
= reinterpret_cast<__gm__ Tensor*>(args[1]); + __gm__ Tensor* oi_new = reinterpret_cast<__gm__ Tensor*>(args[2]); + + pv_matmul_impl<16, 16, 16>(pij, vj, oi_new); +} diff --git a/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_qk_matmul.cpp b/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_qk_matmul.cpp new file mode 100644 index 00000000..3b9ef46f --- /dev/null +++ b/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_qk_matmul.cpp @@ -0,0 +1,90 @@ +// QK Matmul Kernel: qi(M, K) @ kj.T(K, N) -> sij(M, N) +// +// Fixed tile size: (16, 16) @ (16, 16).T -> (16, 16) +// +// kj is stored as (N, K) = (block_size, head_dim) in row-major memory. +// This is equivalent to (K, N) in column-major (DN) layout. +// Using DN GlobalB + RowMajor/ColMajor TileMatB to handle the transposed B pattern. + +#include +#include + +#include "tensor.h" + +using namespace pto; + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#define __aicore__ [aicore] +#endif + +template +static __aicore__ void qk_matmul_impl(__gm__ Tensor* qi, __gm__ Tensor* kj, __gm__ Tensor* sij) { + __gm__ half* qi_addr = reinterpret_cast<__gm__ half*>(qi->buffer.addr); + __gm__ half* kj_addr = reinterpret_cast<__gm__ half*>(kj->buffer.addr); + __gm__ float* sij_addr = reinterpret_cast<__gm__ float*>(sij->buffer.addr); + + // qi (M, K) fp16 in ND (row-major) layout + using GlobalA = GlobalTensor, pto::Stride>; + // kj stored as (N, K) row-major = (K, N) column-major -> DN layout + using GlobalB = GlobalTensor, pto::Stride, Layout::DN>; + using GlobalOut = GlobalTensor, pto::Stride>; + + GlobalA qiGlobal(qi_addr + qi->start_offset); + GlobalB kjGlobal(kj_addr + kj->start_offset); + GlobalOut sijGlobal(sij_addr + sij->start_offset); + + // L1 Mat tiles: A is standard ND, B uses transposed-B pattern (RowMajor/ColMajor) + using TileMatA = Tile; + using TileMatB = Tile; + + // L0 tiles + using LeftTile = TileLeft; + using RightTile = TileRight; + 
using AccTile = TileAcc; + + TileMatA aMatTile; + TileMatB bMatTile; + TASSIGN(aMatTile, 0x0); + TASSIGN(bMatTile, 0x20000); + + LeftTile aTile; + RightTile bTile; + AccTile cTile; + TASSIGN(aTile, 0x0); + TASSIGN(bTile, 0x0); + TASSIGN(cTile, 0x0); + + // Load A and B to L1 + TLOAD(aMatTile, qiGlobal); + TLOAD(bMatTile, kjGlobal); + + set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); + + // Move from L1 to L0A/L0B + TMOV(aTile, aMatTile); + TMOV(bTile, bMatTile); + + set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); + wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); + + // Matmul + TMATMUL(cTile, aTile, bTile); + + set_flag(PIPE_M, PIPE_FIX, EVENT_ID0); + wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0); + + TSTORE(sijGlobal, cTile); +} + +extern "C" __aicore__ void kernel_entry(__gm__ int64_t* args) { + __gm__ Tensor* qi = reinterpret_cast<__gm__ Tensor*>(args[0]); + __gm__ Tensor* kj = reinterpret_cast<__gm__ Tensor*>(args[1]); + __gm__ Tensor* sij = reinterpret_cast<__gm__ Tensor*>(args[2]); + + qk_matmul_impl<16, 16, 16>(qi, kj, sij); +} diff --git a/tests/device_tests/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_hub.cpp b/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_hub.cpp similarity index 100% rename from tests/device_tests/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_hub.cpp rename to examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_hub.cpp diff --git a/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_online_update.cpp b/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_online_update.cpp new file mode 100644 index 00000000..7351f73f --- /dev/null +++ b/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_online_update.cpp @@ -0,0 +1,224 @@ +// Online Softmax Update + Normalize Kernel (AIV) +// +// Fixed tile size: oi/oi_new are (16, 16), mij/lij/mi/li are 16-element vectors +// +// Scalar layout strategy: +// M scalar floats 
stored contiguously in GM can be loaded as either: +// - ND (kScalarRows, kScalarCols) RowMajor for element-wise ops (TMAX, TSUB, TEXP, TMUL, TADD) +// - DN (kAlignedRows, 1) ColMajor for row-broadcast ops (TROWEXPANDMUL, TROWEXPANDDIV) +// Conversion between layouts uses GM round-trip: ND TSTORE -> DN TLOAD. + +#include +#include + +#include "tensor.h" + +using namespace pto; + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#define __aicore__ [aicore] +#endif + +template +static __aicore__ void online_update_impl(__gm__ Tensor* mij, + __gm__ Tensor* lij, + __gm__ Tensor* oi_new, + __gm__ Tensor* mi, + __gm__ Tensor* li, + __gm__ Tensor* oi, + uint64_t is_first, + uint64_t is_last, + __gm__ Tensor* dst) { + __gm__ float* mij_ptr = reinterpret_cast<__gm__ float*>(mij->buffer.addr); + __gm__ float* lij_ptr = reinterpret_cast<__gm__ float*>(lij->buffer.addr); + __gm__ float* oi_new_ptr = reinterpret_cast<__gm__ float*>(oi_new->buffer.addr); + __gm__ float* mi_ptr = reinterpret_cast<__gm__ float*>(mi->buffer.addr); + __gm__ float* li_ptr = reinterpret_cast<__gm__ float*>(li->buffer.addr); + __gm__ float* oi_ptr = reinterpret_cast<__gm__ float*>(oi->buffer.addr); + __gm__ float* dst_ptr = reinterpret_cast<__gm__ float*>(dst->buffer.addr); + + // Scalar tile dimensions for RowMajor layout: + // kScalarCols = 32 bytes / 4 bytes per float = 8 floats per row (one 32-byte block) + // kScalarRows = M / 8 (M=16 -> 2 rows) + constexpr int kScalarCols = 32 / sizeof(float); + constexpr int kScalarRows = M / kScalarCols; + // Aligned rows for ColMajor DN tiles (32-byte alignment) + constexpr int kAlignedRows = ((M * sizeof(float) + 31) / 32) * (32 / sizeof(float)); + + // --- GlobalTensor types --- + + // Data (M, N) RowMajor + using GlobalDataMxN = GlobalTensor, pto::Stride<1, 1, 1, N, 1>>; + + // Scalar ND: M contiguous floats as (kScalarRows, kScalarCols) RowMajor + using GlobalScalarND = + GlobalTensor, pto::Stride<1, 1, 1, kScalarCols, 1>>; + + // Scalar DN: 
same M contiguous floats as (kAlignedRows, 1) ColMajor + using GlobalScalarDN = GlobalTensor, pto::Stride<1, 1, 1, 1, 1>, Layout::DN>; + + // --- GlobalTensor instances --- + + GlobalDataMxN oiNewGlobal(oi_new_ptr + oi_new->start_offset); + GlobalDataMxN oiGlobal(oi_ptr + oi->start_offset); + GlobalDataMxN dstGlobal(dst_ptr + dst->start_offset); + + // ND globals for scalar element-wise operations + GlobalScalarND mijGlobalND(mij_ptr + mij->start_offset); + GlobalScalarND lijGlobalND(lij_ptr + lij->start_offset); + GlobalScalarND miGlobalND(mi_ptr + mi->start_offset); + GlobalScalarND liGlobalND(li_ptr + li->start_offset); + + // DN globals aliased to same GM for ColMajor reload (used after ND TSTORE) + GlobalScalarDN mijGlobalDN(mij_ptr + mij->start_offset); + GlobalScalarDN lijGlobalDN(lij_ptr + lij->start_offset); + GlobalScalarDN liGlobalDN(li_ptr + li->start_offset); + + // --- Tile types --- + + using TileDataMxN = Tile; + using TileScalarND = + Tile; + using TileScalarDN = Tile; + + // --- UB memory layout --- + + constexpr int kDataBytes = M * N * sizeof(float); + constexpr int kScalarNDBytes = kScalarRows * kScalarCols * sizeof(float); + constexpr int kScalarDNBytes = kAlignedRows * sizeof(float); + + // Data tiles + TileDataMxN oiNewTile; + TileDataMxN oiTile; + + // Scalar ND tiles for element-wise arithmetic + TileScalarND mijND, lijND, miND, liND; + TileScalarND miNewND, alphaND, betaND, tmpND; + + // Scalar DN tiles for TROWEXPAND operations + TileScalarDN alphaDN, betaDN, liDN; + + TASSIGN(oiNewTile, 0); + TASSIGN(oiTile, kDataBytes); + TASSIGN(mijND, 2 * kDataBytes); + TASSIGN(lijND, 2 * kDataBytes + kScalarNDBytes); + TASSIGN(miND, 2 * kDataBytes + 2 * kScalarNDBytes); + TASSIGN(liND, 2 * kDataBytes + 3 * kScalarNDBytes); + TASSIGN(miNewND, 2 * kDataBytes + 4 * kScalarNDBytes); + TASSIGN(alphaND, 2 * kDataBytes + 5 * kScalarNDBytes); + TASSIGN(betaND, 2 * kDataBytes + 6 * kScalarNDBytes); + TASSIGN(tmpND, 2 * kDataBytes + 7 * kScalarNDBytes); + 
TASSIGN(alphaDN, 2 * kDataBytes + 8 * kScalarNDBytes); + TASSIGN(betaDN, 2 * kDataBytes + 8 * kScalarNDBytes + kScalarDNBytes); + TASSIGN(liDN, 2 * kDataBytes + 8 * kScalarNDBytes + 2 * kScalarDNBytes); + + if (is_first) { + // --- First block: copy inputs to accumulators --- + TLOAD(oiNewTile, oiNewGlobal); + TLOAD(mijND, mijGlobalND); + TLOAD(lijND, lijGlobalND); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + + // Passthrough to MTE3 (no V compute needed) + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(miGlobalND, mijND); // mi = mij + TSTORE(liGlobalND, lijND); // li = lij + TSTORE(oiGlobal, oiNewTile); // oi = oi_new + + if (is_last) { + // Single block: normalize dst = oi_new / lij + // lij stored to li buffer in ND format; reload as DN for TROWEXPANDDIV + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + TLOAD(liDN, liGlobalDN); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TROWEXPANDDIV(oiNewTile, oiNewTile, liDN); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); + TSTORE(dstGlobal, oiNewTile); + } + } else { + // --- Subsequent blocks: accumulate --- + + // Phase 1: Load all inputs + TLOAD(oiNewTile, oiNewGlobal); + TLOAD(oiTile, oiGlobal); + TLOAD(mijND, mijGlobalND); + TLOAD(lijND, lijGlobalND); + TLOAD(miND, miGlobalND); + TLOAD(liND, liGlobalND); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + + // Phase 2: Scalar arithmetic in RowMajor (kScalarRows, kScalarCols) + // to resolve RAW hazards on shared UB tiles. 
+ TMAX(miNewND, miND, mijND); // mi_new = max(mi, mij) + TSUB(alphaND, miND, miNewND); // alpha = mi - mi_new + TEXP(alphaND, alphaND); // alpha = exp(mi - mi_new) + TSUB(betaND, mijND, miNewND); // beta = mij - mi_new + TEXP(betaND, betaND); // beta = exp(mij - mi_new) + TMUL(liND, alphaND, liND); // li = alpha * li + TMUL(tmpND, betaND, lijND); // tmp = beta * lij + TADD(liND, liND, tmpND); // li = alpha * li + beta * lij (= li_new) + + // Phase 3: Store scalar results to GM (ND format) + // mi_new -> mi accumulator, li_new -> li accumulator + // alpha -> mij buffer (reuse), beta -> lij buffer (reuse) + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(miGlobalND, miNewND); // persist mi_new + TSTORE(liGlobalND, liND); // persist li_new + TSTORE(mijGlobalND, alphaND); // temp: alpha to mij buffer + TSTORE(lijGlobalND, betaND); // temp: beta to lij buffer + + // Phase 4: Reload alpha, beta (and li if last) as ColMajor DN + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + TLOAD(alphaDN, mijGlobalDN); // alpha from mij buffer as DN + TLOAD(betaDN, lijGlobalDN); // beta from lij buffer as DN + if (is_last) { + TLOAD(liDN, liGlobalDN); // li_new from li buffer as DN + } + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + + // Phase 5: Scale data tiles using row-broadcast multiply + TROWEXPANDMUL(oiTile, oiTile, alphaDN); // oi *= alpha + TROWEXPANDMUL(oiNewTile, oiNewTile, betaDN); // oi_new *= beta + TADD(oiTile, oiTile, oiNewTile); // oi = alpha*oi + beta*oi_new + + if (is_last) { + // Phase 6: Normalize and output + TROWEXPANDDIV(oiTile, oiTile, liDN); // dst = oi / li_new + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); + TSTORE(dstGlobal, oiTile); + } else { + // Phase 6: Store updated accumulators + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); + TSTORE(oiGlobal, oiTile); + 
} + } +} + +extern "C" __aicore__ void kernel_entry(__gm__ int64_t* args) { + __gm__ Tensor* mij = reinterpret_cast<__gm__ Tensor*>(args[0]); + __gm__ Tensor* lij = reinterpret_cast<__gm__ Tensor*>(args[1]); + __gm__ Tensor* oi_new = reinterpret_cast<__gm__ Tensor*>(args[2]); + __gm__ Tensor* mi = reinterpret_cast<__gm__ Tensor*>(args[3]); + __gm__ Tensor* li = reinterpret_cast<__gm__ Tensor*>(args[4]); + __gm__ Tensor* oi = reinterpret_cast<__gm__ Tensor*>(args[5]); + __gm__ Tensor* dst = reinterpret_cast<__gm__ Tensor*>(args[6]); + uint64_t is_first = static_cast(args[7]); + uint64_t is_last = static_cast(args[8]); + + online_update_impl<16, 16>(mij, lij, oi_new, mi, li, oi, is_first, is_last, dst); +} diff --git a/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp b/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp new file mode 100644 index 00000000..d0f97987 --- /dev/null +++ b/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp @@ -0,0 +1,147 @@ +// Softmax Preparation Kernel (AIV) with partial block masking +// +// Fixed tile size: sij is (16, 16) +// +// For partial blocks (valid_len < N), positions [valid_len, N) in sij are +// filled with -inf before softmax, ensuring exp(-inf)=0 so that invalid +// key positions contribute zero attention weight. +// +// Uses TFILLPAD_INPLACE for vector pipeline state setup, then patches with +// scalar SetValue writes to fix a hardware bug in TFILLPAD's vcopy broadcast +// path at small N (N=16). 
+// +// Computes: +// sij_masked = pad(sij, valid_len, -inf) +// sij_scale = sij_masked * scale +// mij = row_max(sij_scale) -> (M, 1) +// pij = exp(sij_scale - mij) -> (M, N) +// lij = row_sum(pij) -> (M, 1) + +#include +#include + +#include "tensor.h" + +using namespace pto; + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#define __aicore__ [aicore] +#endif + +template +static __aicore__ void softmax_prepare_impl(__gm__ Tensor* sij, + float scale_value, + __gm__ Tensor* pij, + __gm__ Tensor* mij, + __gm__ Tensor* lij) { + uint64_t valid_len = static_cast(sij->shapes[1]); + __gm__ float* sij_addr = reinterpret_cast<__gm__ float*>(sij->buffer.addr); + __gm__ half* pij_addr = reinterpret_cast<__gm__ half*>(pij->buffer.addr); + __gm__ float* mij_addr = reinterpret_cast<__gm__ float*>(mij->buffer.addr); + __gm__ float* lij_addr = reinterpret_cast<__gm__ float*>(lij->buffer.addr); + + constexpr int kAlignedRows = ((M * sizeof(float) + 31) / 32) * (32 / sizeof(float)); + + using GlobalDataMxN = GlobalTensor, pto::Stride<1, 1, 1, N, 1>>; + using GlobalDataMxN_f16 = GlobalTensor, pto::Stride<1, 1, 1, N, 1>>; + using GlobalScalarDN = GlobalTensor, pto::Stride<1, 1, 1, 1, 1>, Layout::DN>; + + GlobalDataMxN sijGlobal(sij_addr + sij->start_offset); + GlobalDataMxN_f16 pijGlobal(pij_addr + pij->start_offset); + GlobalScalarDN mijGlobal(mij_addr + mij->start_offset); + GlobalScalarDN lijGlobal(lij_addr + lij->start_offset); + + // Dynamic-cols tile: marks which columns are valid for TFILLPAD boundary + using TileSijDyn = Tile; + // Padded tile: TFILLPAD_INPLACE fills positions [valid_len, N) with -inf + using TileSijPad = Tile; + + using TileVecMxN = Tile; + using TileVecMxN_f16 = Tile; + using TileScalarDN = Tile; + + TileVecMxN sijTile; + TileSijDyn sijDynTile(static_cast(valid_len)); + TileSijPad sijPadTile; + TileVecMxN pijTile; + TileVecMxN tmpTile; + TileScalarDN maxTile; + TileScalarDN sumTile; + TileVecMxN_f16 pijF16Tile; + + TASSIGN(sijTile, 0x0); + 
TASSIGN(sijDynTile, 0x0); + TASSIGN(sijPadTile, 0x0); + TASSIGN(pijTile, M * N * sizeof(float)); + TASSIGN(tmpTile, 2 * M * N * sizeof(float)); + TASSIGN(maxTile, 3 * M * N * sizeof(float)); + TASSIGN(sumTile, 3 * M * N * sizeof(float) + kAlignedRows * sizeof(float)); + TASSIGN(pijF16Tile, 3 * M * N * sizeof(float) + 2 * kAlignedRows * sizeof(float)); + + // Load full sij (M, N) tile from GM - all N columns including garbage for partial blocks + TLOAD(sijTile, sijGlobal); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + + // Mask columns [valid_len, N) with -inf. + // Use TFILLPAD_INPLACE for the main fill, then patch with SetValue for + // cases where TFILLPAD's vcopy broadcast path fails at small N. + TFILLPAD_INPLACE(sijPadTile, sijDynTile); + // Patch: SetValue ensures correctness for valid_len <= N/2 where + // TFILLPAD's PadRightRemainingRows vcopy has a hardware issue. + if (valid_len < static_cast(N)) { + // Cross-pipeline sync: wait for PIPE_V vcopy in TFILLPAD to complete + // before PIPE_S scalar SetValue writes to the same UB addresses. + // Without this, PIPE_V vcopy and PIPE_S SetValue race on UB memory, + // causing sporadic FAIL when vcopy finishes after SetValue. + // Pattern from TFillPad.hpp Handle32BAlignedPad_Byte (PtoSetWaitFlag). + set_flag(PIPE_V, PIPE_S, EVENT_ID0); + wait_flag(PIPE_V, PIPE_S, EVENT_ID0); + constexpr float NEG_INF = -__builtin_huge_valf(); + for (int r = 0; r < M; r++) { + for (uint64_t c = valid_len; c < N; c++) { + sijTile.SetValue(static_cast(r * N + c), NEG_INF); + } + } + // Ensure PIPE_S scalar UB writes are visible to subsequent PIPE_V ops. + // dsb(DSB_UB) is a hardware-only intrinsic; in simulation there are no + // real pipelines so the barrier is unnecessary and DSB_UB is undefined. 
+#ifdef DSB_UB + dsb(DSB_UB); +#endif + } + + TMULS(sijTile, sijTile, scale_value); + TROWMAX(maxTile, sijTile, tmpTile); + TROWEXPANDSUB(pijTile, sijTile, maxTile); + TEXP(pijTile, pijTile); + // Truncate pij to fp16 first, then compute lij from truncated values (matches golden) + TCVT(pijF16Tile, pijTile, RoundMode::CAST_ROUND); + TCVT(pijTile, pijF16Tile, RoundMode::CAST_ROUND); + TROWSUM(sumTile, pijTile, tmpTile); + + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(mijGlobal, maxTile); + TSTORE(lijGlobal, sumTile); + TSTORE(pijGlobal, pijF16Tile); +} + +extern "C" __aicore__ void kernel_entry(__gm__ int64_t* args) { + __gm__ Tensor* sij = reinterpret_cast<__gm__ Tensor*>(args[0]); + union { + uint64_t u; + float f; + } scale_conv; + scale_conv.u = static_cast<uint64_t>(args[1]); + float scale_value = scale_conv.f; + __gm__ Tensor* pij = reinterpret_cast<__gm__ Tensor*>(args[2]); + __gm__ Tensor* mij = reinterpret_cast<__gm__ Tensor*>(args[3]); + __gm__ Tensor* lij = reinterpret_cast<__gm__ Tensor*>(args[4]); + + softmax_prepare_impl<16, 16>(sij, scale_value, pij, mij, lij); +} diff --git a/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/kernel_config.py b/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/kernel_config.py new file mode 100644 index 00000000..d7627cd0 --- /dev/null +++ b/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/kernel_config.py @@ -0,0 +1,46 @@ +""" +Paged Attention Kernel and Orchestration Configuration + +Defines the kernels and orchestration function for paged attention +with AIC/AIV subgraph splitting: + +AIC Kernels (Matrix Multiplication): + - aic_qk_matmul: Q @ K^T computation + - aic_pv_matmul: P @ V computation + +AIV Kernels (Vector Operations): + - aiv_softmax_prepare: scale, rowmax, exp, rowsum + - aiv_online_update: online softmax accumulation + fused normalization + +Note: aiv_normalize has been merged into aiv_online_update for efficiency. 
+""" + +from pathlib import Path + +_KERNELS_ROOT = Path(__file__).parent + +# Orchestration config +ORCHESTRATION = { + "source": str(_KERNELS_ROOT / "orchestration" / "paged_attention_orch.cpp"), + "function_name": "aicpu_orchestration_entry", +} + +# Kernel configs (aiv_normalize removed - merged into aiv_online_update) +KERNELS = [ + # AIC kernels (matrix multiplication using Cube unit) + {"func_id": 0, "name": "QK", "source": str(_KERNELS_ROOT / "aic" / "aic_qk_matmul.cpp"), "core_type": "aic"}, + {"func_id": 2, "name": "PV", "source": str(_KERNELS_ROOT / "aic" / "aic_pv_matmul.cpp"), "core_type": "aic"}, + {"func_id": 4, "name": "AIC_HUB", "source": str(_KERNELS_ROOT / "aic" / "aic_hub.cpp"), "core_type": "aic"}, + # AIV kernels (vector operations) + {"func_id": 1, "name": "SF", "source": str(_KERNELS_ROOT / "aiv" / "aiv_softmax_prepare.cpp"), "core_type": "aiv"}, + {"func_id": 3, "name": "UP", "source": str(_KERNELS_ROOT / "aiv" / "aiv_online_update.cpp"), "core_type": "aiv"}, + {"func_id": 5, "name": "AIV_HUB", "source": str(_KERNELS_ROOT / "aiv" / "aiv_hub.cpp"), "core_type": "aiv"}, +] + +# Runtime configuration +RUNTIME_CONFIG = { + "runtime": "tensormap_and_ringbuffer", + "aicpu_thread_num": 4, + "orch_thread_num": 2, + "block_dim": 24, +} diff --git a/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp b/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp new file mode 100644 index 00000000..9184031e --- /dev/null +++ b/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp @@ -0,0 +1,214 @@ +/** + * Paged Attention Orchestration Function - 16x16 Version + * + * Simplified for 16x16 framework-generated matmul kernels. + * Each block processes a single 16x16 matmul operation. 
+ * + * Memory Layout: + * Query: (batch, 16, 16) - one 16x16 tile per batch fp16 + * Key: (total_blocks, 16, 16) - stored as K^T for direct matmul fp16 + * Value: (total_blocks, 16, 16) - direct format fp16 + * + * This file compiles as a standalone .so with zero runtime link dependencies. + * All runtime calls go through the PTO2RuntimeOps function-pointer table. + */ + +#include +#include + +#include "pto_orchestration_api.h" + +#define FUNC_QK_MATMUL 0 +#define FUNC_SOFTMAX_PREPARE 1 +#define FUNC_PV_MATMUL 2 +#define FUNC_ONLINE_UPDATE 3 +#define FUNC_AIC_HUB 4 +#define FUNC_AIV_HUB 5 + +// Helper to encode float as uint64_t for scalar params +static uint64_t float_to_u64(float f) { + union { + float f32; + uint64_t u64; + } conv; + conv.u64 = 0; // Clear upper bits + conv.f32 = f; + return conv.u64; +} + +extern "C" { + +/** + * Orchestration config — the executor reads these values to set up + * shared memory and runtime before calling aicpu_orchestration_entry. + */ +__attribute__((visibility("default"))) +PTO2OrchestrationConfig aicpu_orchestration_config(uint64_t* args, int arg_count) { + (void)args; + (void)arg_count; + return PTO2OrchestrationConfig{ + .expected_arg_count = 10, + }; +} + +/** + * Orchestration entry — receives a PTO2Runtime* with ops table populated. + * The executor wraps this call in PTO2_SCOPE, so we are already inside + * the outer scope on entry. 
+ */ +__attribute__((visibility("default"))) +void aicpu_orchestration_entry(PTO2Runtime* rt, uint64_t* args, int arg_count, int orch_thread_num, int orch_thread_index) { + (void)arg_count; + + // Extract device pointers (first 7) + void* host_query = (void*)(uintptr_t)args[0]; // [batch, num_heads, head_dim] + void* host_key_cache = (void*)(uintptr_t)args[1]; // [batch, block_num, block_size, head_dim] + void* host_value_cache = (void*)(uintptr_t)args[2]; // [batch, block_num, block_size, head_dim] + int* host_block_table = (int*)(uintptr_t)args[3]; // [batch, block_num] + int* host_context_lens = (int*)(uintptr_t)args[4]; // [batch] + void* host_out = (void*)(uintptr_t)args[5]; // [batch, num_heads, head_dim] + int64_t* host_config = (int64_t*)(uintptr_t)args[6]; + + // Extract sizes (next 3 args after pointers) + size_t query_size = (size_t)args[7]; + size_t key_cache_size = (size_t)args[8]; + size_t value_cache_size = (size_t)args[9]; + + // Extract config parameters + uint64_t batch = (uint64_t)(int)host_config[0]; + uint64_t num_heads = (uint64_t)(int)host_config[1]; + int kv_head_num = (int)host_config[2]; + uint64_t head_dim = (uint64_t)(int)host_config[3]; + uint64_t block_size = (uint64_t)(int)host_config[4]; + uint64_t block_num = (uint64_t)(int)host_config[5]; + // Reinterpret scale_bits as float (golden.py packs float via struct.pack) + union { uint32_t u; float f; } scale_conv; + scale_conv.u = (uint32_t)host_config[6]; + float scale_value = scale_conv.f; + uint64_t q_head_num = num_heads; + uint64_t q_tile = 16; + uint64_t q_loop = (q_head_num + q_tile - 1) / q_tile; + DataType data_type = DataType::FLOAT16; + uint64_t elem_size = get_element_size(data_type); + + (void)kv_head_num; + + // Partition batch across orchestrators + uint64_t b_start = batch * orch_thread_index / orch_thread_num; + uint64_t b_end = batch * (orch_thread_index + 1) / orch_thread_num; + + LOG_INFO(rt, "orch_idx=%d/%d batch=%lu b_range=[%lu,%lu)", + orch_thread_index, 
orch_thread_num, + (unsigned long)batch, (unsigned long)b_start, (unsigned long)b_end); + + // Compute actual tensor shapes from buffer sizes (not from max block_num) + uint64_t query_shapes[2] = {batch * num_heads, head_dim}; + uint64_t kv_total_rows = key_cache_size / (head_dim * elem_size); + uint64_t key_cache_shapes[2] = {kv_total_rows, head_dim}; + uint64_t value_cache_shapes[2] = {kv_total_rows, head_dim}; + uint64_t out_shapes[2] = {batch * num_heads, head_dim}; + Tensor query = make_tensor_external(host_query, query_shapes, 2, data_type); + Tensor key_cache = make_tensor_external(host_key_cache, key_cache_shapes, 2, data_type); + Tensor value_cache = make_tensor_external(host_value_cache, value_cache_shapes, 2, data_type); + Tensor out = make_tensor_external(host_out, out_shapes, 2, DataType::FLOAT32); + LOG_DEBUG(rt, "query=%s", query.dump().c_str()); + LOG_DEBUG(rt, "key_cache=%s", key_cache.dump().c_str()); + LOG_DEBUG(rt, "value_cache=%s", value_cache.dump().c_str()); + LOG_DEBUG(rt, "out=%s", out.dump().c_str()); + + for (uint64_t b_idx = b_start; b_idx < b_end; b_idx++) { + uint64_t cur_seq = host_context_lens[b_idx]; + uint64_t bn_this_batch = (cur_seq + block_size - 1) / block_size; + for (uint64_t q_idx = 0; q_idx < q_loop; q_idx++) { + PTO2_SCOPE(rt) { + uint64_t cur_offset = b_idx * q_head_num + q_idx * q_tile; + uint64_t oi_shapes[2] = {q_tile, head_dim}; + uint64_t li_shapes[1] = {q_tile}; + uint64_t mi_shapes[1] = {q_tile}; + Tensor oi = make_tensor(oi_shapes, 2, DataType::FLOAT32); + Tensor li_update = make_tensor(li_shapes, 1, DataType::FLOAT32); + Tensor mi_update = make_tensor(mi_shapes, 1, DataType::FLOAT32); + + uint64_t qi_shapes[2] = {q_tile, head_dim}; + uint64_t qi_offsets[2] = {cur_offset, 0}; + Tensor qi = query.view(qi_shapes, qi_offsets); + uint64_t out_view_shapes[2] = {q_tile, head_dim}; + uint64_t out_view_offsets[2] = {cur_offset, 0}; + Tensor out_view = out.view(out_view_shapes, out_view_offsets); + + PTOParam 
params_inplace[] = { + make_output_param(oi), + make_output_param(li_update), + make_output_param(mi_update), + }; + pto2_rt_submit_task(rt, FUNC_AIV_HUB, PTO2_WORKER_VECTOR, params_inplace, 3); // create_inplace + + for (uint64_t bn = 0; bn < bn_this_batch; bn++) { + uint64_t cur_block_idx = host_block_table[b_idx * block_num + bn]; + uint64_t valid_len = block_size < (cur_seq - bn * block_size) ? block_size : (cur_seq - bn * block_size); + uint64_t kv_shapes[2] = {block_size, head_dim}; + uint64_t kv_offsets[2] = {cur_block_idx * block_size, 0}; + Tensor kj = key_cache.view(kv_shapes, kv_offsets); + Tensor vj = value_cache.view(kv_shapes, kv_offsets); + + uint64_t sij_shapes[2] = {q_tile, block_size}; + Tensor sij = make_tensor(sij_shapes, 2, DataType::FLOAT32); + Tensor pij_f16 = make_tensor(sij_shapes, 2, data_type); + + PTOParam params_qk[] = { + make_input_param(qi), + make_input_param(kj), + make_output_param(sij), + }; + pto2_rt_submit_task(rt, FUNC_QK_MATMUL, PTO2_WORKER_CUBE, params_qk, 3); // c1 + + uint64_t sij_valid_shapes[2] = {q_tile, valid_len}; + uint64_t sij_valid_offsets[2] = {0, 0}; + Tensor sij_valid = sij.view(sij_valid_shapes, sij_valid_offsets); + Tensor li = make_tensor(li_shapes, 1, DataType::FLOAT32); + Tensor mi = make_tensor(mi_shapes, 1, DataType::FLOAT32); + PTOParam params_sf[] = { + make_input_param(sij_valid), + make_scalar_param(float_to_u64(scale_value)), + make_output_param(pij_f16), + make_output_param(mi), + make_output_param(li), + }; + pto2_rt_submit_task(rt, FUNC_SOFTMAX_PREPARE, PTO2_WORKER_VECTOR, params_sf, 5); // v1 + + uint64_t oi_tmp_shapes[2] = {q_tile, head_dim}; + Tensor oi_tmp = make_tensor(oi_tmp_shapes, 2, DataType::FLOAT32); + + PTOParam params_pv[] = { + make_input_param(pij_f16), + make_input_param(vj), + make_output_param(oi_tmp), + }; + pto2_rt_submit_task(rt, FUNC_PV_MATMUL, PTO2_WORKER_CUBE, params_pv, 3); // c2 + + uint64_t is_first = (bn == 0) ? 1 : 0; + uint64_t is_last = (bn == bn_this_batch - 1) ? 
1 : 0; + + PTOParam params_up[] = { + make_input_param(mi), + make_input_param(li), + make_input_param(oi_tmp), + make_inout_param(mi_update), + make_inout_param(li_update), + make_inout_param(oi), + make_output_param(out_view), + make_scalar_param(is_first), + make_scalar_param(is_last), + }; + pto2_rt_submit_task(rt, FUNC_ONLINE_UPDATE, PTO2_WORKER_VECTOR, params_up, 9); // v2 + } + } + } + } + + LOG_INFO(rt, "orch_idx=%d: tasks submitted for batch=[%lu,%lu), num_heads=%lu", + orch_thread_index, (unsigned long)b_start, (unsigned long)b_end, + (unsigned long)num_heads); +} + +} // extern "C" diff --git a/tests/device_tests/aicpu_build_graph/paged_attention/README.md b/tests/device_tests/a2a3/aicpu_build_graph/paged_attention/README.md similarity index 100% rename from tests/device_tests/aicpu_build_graph/paged_attention/README.md rename to tests/device_tests/a2a3/aicpu_build_graph/paged_attention/README.md diff --git a/tests/device_tests/aicpu_build_graph/paged_attention/golden.py b/tests/device_tests/a2a3/aicpu_build_graph/paged_attention/golden.py similarity index 100% rename from tests/device_tests/aicpu_build_graph/paged_attention/golden.py rename to tests/device_tests/a2a3/aicpu_build_graph/paged_attention/golden.py diff --git a/tests/device_tests/aicpu_build_graph/paged_attention/kernels/aic/aic_pv_matmul.cpp b/tests/device_tests/a2a3/aicpu_build_graph/paged_attention/kernels/aic/aic_pv_matmul.cpp similarity index 100% rename from tests/device_tests/aicpu_build_graph/paged_attention/kernels/aic/aic_pv_matmul.cpp rename to tests/device_tests/a2a3/aicpu_build_graph/paged_attention/kernels/aic/aic_pv_matmul.cpp diff --git a/tests/device_tests/aicpu_build_graph/paged_attention/kernels/aic/aic_qk_matmul.cpp b/tests/device_tests/a2a3/aicpu_build_graph/paged_attention/kernels/aic/aic_qk_matmul.cpp similarity index 100% rename from tests/device_tests/aicpu_build_graph/paged_attention/kernels/aic/aic_qk_matmul.cpp rename to 
tests/device_tests/a2a3/aicpu_build_graph/paged_attention/kernels/aic/aic_qk_matmul.cpp diff --git a/tests/device_tests/aicpu_build_graph/paged_attention/kernels/aiv/aiv_online_update.cpp b/tests/device_tests/a2a3/aicpu_build_graph/paged_attention/kernels/aiv/aiv_online_update.cpp similarity index 100% rename from tests/device_tests/aicpu_build_graph/paged_attention/kernels/aiv/aiv_online_update.cpp rename to tests/device_tests/a2a3/aicpu_build_graph/paged_attention/kernels/aiv/aiv_online_update.cpp diff --git a/tests/device_tests/aicpu_build_graph/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp b/tests/device_tests/a2a3/aicpu_build_graph/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp similarity index 100% rename from tests/device_tests/aicpu_build_graph/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp rename to tests/device_tests/a2a3/aicpu_build_graph/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp diff --git a/tests/device_tests/aicpu_build_graph/paged_attention/kernels/kernel_config.py b/tests/device_tests/a2a3/aicpu_build_graph/paged_attention/kernels/kernel_config.py similarity index 100% rename from tests/device_tests/aicpu_build_graph/paged_attention/kernels/kernel_config.py rename to tests/device_tests/a2a3/aicpu_build_graph/paged_attention/kernels/kernel_config.py diff --git a/tests/device_tests/aicpu_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp b/tests/device_tests/a2a3/aicpu_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp similarity index 100% rename from tests/device_tests/aicpu_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp rename to tests/device_tests/a2a3/aicpu_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp diff --git a/tests/device_tests/host_build_graph/paged_attention/README.md b/tests/device_tests/a2a3/host_build_graph/paged_attention/README.md similarity index 100% rename from 
tests/device_tests/host_build_graph/paged_attention/README.md rename to tests/device_tests/a2a3/host_build_graph/paged_attention/README.md diff --git a/tests/device_tests/host_build_graph/paged_attention/golden.py b/tests/device_tests/a2a3/host_build_graph/paged_attention/golden.py similarity index 100% rename from tests/device_tests/host_build_graph/paged_attention/golden.py rename to tests/device_tests/a2a3/host_build_graph/paged_attention/golden.py diff --git a/tests/device_tests/host_build_graph/paged_attention/kernels/aic/aic_pv_matmul.cpp b/tests/device_tests/a2a3/host_build_graph/paged_attention/kernels/aic/aic_pv_matmul.cpp similarity index 100% rename from tests/device_tests/host_build_graph/paged_attention/kernels/aic/aic_pv_matmul.cpp rename to tests/device_tests/a2a3/host_build_graph/paged_attention/kernels/aic/aic_pv_matmul.cpp diff --git a/tests/device_tests/host_build_graph/paged_attention/kernels/aic/aic_qk_matmul.cpp b/tests/device_tests/a2a3/host_build_graph/paged_attention/kernels/aic/aic_qk_matmul.cpp similarity index 100% rename from tests/device_tests/host_build_graph/paged_attention/kernels/aic/aic_qk_matmul.cpp rename to tests/device_tests/a2a3/host_build_graph/paged_attention/kernels/aic/aic_qk_matmul.cpp diff --git a/tests/device_tests/host_build_graph/paged_attention/kernels/aiv/aiv_online_update.cpp b/tests/device_tests/a2a3/host_build_graph/paged_attention/kernels/aiv/aiv_online_update.cpp similarity index 100% rename from tests/device_tests/host_build_graph/paged_attention/kernels/aiv/aiv_online_update.cpp rename to tests/device_tests/a2a3/host_build_graph/paged_attention/kernels/aiv/aiv_online_update.cpp diff --git a/tests/device_tests/host_build_graph/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp b/tests/device_tests/a2a3/host_build_graph/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp similarity index 100% rename from tests/device_tests/host_build_graph/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp rename to 
tests/device_tests/a2a3/host_build_graph/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp diff --git a/tests/device_tests/host_build_graph/paged_attention/kernels/kernel_config.py b/tests/device_tests/a2a3/host_build_graph/paged_attention/kernels/kernel_config.py similarity index 100% rename from tests/device_tests/host_build_graph/paged_attention/kernels/kernel_config.py rename to tests/device_tests/a2a3/host_build_graph/paged_attention/kernels/kernel_config.py diff --git a/tests/device_tests/host_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp b/tests/device_tests/a2a3/host_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp similarity index 100% rename from tests/device_tests/host_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp rename to tests/device_tests/a2a3/host_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp diff --git a/tests/device_tests/tensormap_and_ringbuffer/alternating_matmul_add/golden.py b/tests/device_tests/a2a3/tensormap_and_ringbuffer/alternating_matmul_add/golden.py similarity index 100% rename from tests/device_tests/tensormap_and_ringbuffer/alternating_matmul_add/golden.py rename to tests/device_tests/a2a3/tensormap_and_ringbuffer/alternating_matmul_add/golden.py diff --git a/tests/device_tests/tensormap_and_ringbuffer/alternating_matmul_add/kernels/aic/kernel_matmul.cpp b/tests/device_tests/a2a3/tensormap_and_ringbuffer/alternating_matmul_add/kernels/aic/kernel_matmul.cpp similarity index 100% rename from tests/device_tests/tensormap_and_ringbuffer/alternating_matmul_add/kernels/aic/kernel_matmul.cpp rename to tests/device_tests/a2a3/tensormap_and_ringbuffer/alternating_matmul_add/kernels/aic/kernel_matmul.cpp diff --git a/tests/device_tests/tensormap_and_ringbuffer/alternating_matmul_add/kernels/aiv/kernel_add.cpp b/tests/device_tests/a2a3/tensormap_and_ringbuffer/alternating_matmul_add/kernels/aiv/kernel_add.cpp similarity index 100% 
rename from tests/device_tests/tensormap_and_ringbuffer/alternating_matmul_add/kernels/aiv/kernel_add.cpp rename to tests/device_tests/a2a3/tensormap_and_ringbuffer/alternating_matmul_add/kernels/aiv/kernel_add.cpp diff --git a/tests/device_tests/tensormap_and_ringbuffer/alternating_matmul_add/kernels/kernel_config.py b/tests/device_tests/a2a3/tensormap_and_ringbuffer/alternating_matmul_add/kernels/kernel_config.py similarity index 100% rename from tests/device_tests/tensormap_and_ringbuffer/alternating_matmul_add/kernels/kernel_config.py rename to tests/device_tests/a2a3/tensormap_and_ringbuffer/alternating_matmul_add/kernels/kernel_config.py diff --git a/tests/device_tests/tensormap_and_ringbuffer/alternating_matmul_add/kernels/orchestration/alternating_orch.cpp b/tests/device_tests/a2a3/tensormap_and_ringbuffer/alternating_matmul_add/kernels/orchestration/alternating_orch.cpp similarity index 100% rename from tests/device_tests/tensormap_and_ringbuffer/alternating_matmul_add/kernels/orchestration/alternating_orch.cpp rename to tests/device_tests/a2a3/tensormap_and_ringbuffer/alternating_matmul_add/kernels/orchestration/alternating_orch.cpp diff --git a/tests/device_tests/tensormap_and_ringbuffer/batch_paged_attention/golden.py b/tests/device_tests/a2a3/tensormap_and_ringbuffer/batch_paged_attention/golden.py similarity index 100% rename from tests/device_tests/tensormap_and_ringbuffer/batch_paged_attention/golden.py rename to tests/device_tests/a2a3/tensormap_and_ringbuffer/batch_paged_attention/golden.py diff --git a/tests/device_tests/tensormap_and_ringbuffer/batch_paged_attention/kernels/aic/aic_hub.cpp b/tests/device_tests/a2a3/tensormap_and_ringbuffer/batch_paged_attention/kernels/aic/aic_hub.cpp similarity index 100% rename from tests/device_tests/tensormap_and_ringbuffer/batch_paged_attention/kernels/aic/aic_hub.cpp rename to tests/device_tests/a2a3/tensormap_and_ringbuffer/batch_paged_attention/kernels/aic/aic_hub.cpp diff --git 
a/tests/device_tests/tensormap_and_ringbuffer/batch_paged_attention/kernels/aic/aic_pv_matmul.cpp b/tests/device_tests/a2a3/tensormap_and_ringbuffer/batch_paged_attention/kernels/aic/aic_pv_matmul.cpp similarity index 100% rename from tests/device_tests/tensormap_and_ringbuffer/batch_paged_attention/kernels/aic/aic_pv_matmul.cpp rename to tests/device_tests/a2a3/tensormap_and_ringbuffer/batch_paged_attention/kernels/aic/aic_pv_matmul.cpp diff --git a/tests/device_tests/tensormap_and_ringbuffer/batch_paged_attention/kernels/aic/aic_qk_matmul.cpp b/tests/device_tests/a2a3/tensormap_and_ringbuffer/batch_paged_attention/kernels/aic/aic_qk_matmul.cpp similarity index 100% rename from tests/device_tests/tensormap_and_ringbuffer/batch_paged_attention/kernels/aic/aic_qk_matmul.cpp rename to tests/device_tests/a2a3/tensormap_and_ringbuffer/batch_paged_attention/kernels/aic/aic_qk_matmul.cpp diff --git a/tests/device_tests/tensormap_and_ringbuffer/batch_paged_attention/kernels/aiv/aiv_hub.cpp b/tests/device_tests/a2a3/tensormap_and_ringbuffer/batch_paged_attention/kernels/aiv/aiv_hub.cpp similarity index 100% rename from tests/device_tests/tensormap_and_ringbuffer/batch_paged_attention/kernels/aiv/aiv_hub.cpp rename to tests/device_tests/a2a3/tensormap_and_ringbuffer/batch_paged_attention/kernels/aiv/aiv_hub.cpp diff --git a/tests/device_tests/tensormap_and_ringbuffer/batch_paged_attention/kernels/aiv/aiv_online_update.cpp b/tests/device_tests/a2a3/tensormap_and_ringbuffer/batch_paged_attention/kernels/aiv/aiv_online_update.cpp similarity index 100% rename from tests/device_tests/tensormap_and_ringbuffer/batch_paged_attention/kernels/aiv/aiv_online_update.cpp rename to tests/device_tests/a2a3/tensormap_and_ringbuffer/batch_paged_attention/kernels/aiv/aiv_online_update.cpp diff --git a/tests/device_tests/tensormap_and_ringbuffer/batch_paged_attention/kernels/aiv/aiv_softmax_prepare.cpp 
b/tests/device_tests/a2a3/tensormap_and_ringbuffer/batch_paged_attention/kernels/aiv/aiv_softmax_prepare.cpp similarity index 100% rename from tests/device_tests/tensormap_and_ringbuffer/batch_paged_attention/kernels/aiv/aiv_softmax_prepare.cpp rename to tests/device_tests/a2a3/tensormap_and_ringbuffer/batch_paged_attention/kernels/aiv/aiv_softmax_prepare.cpp diff --git a/tests/device_tests/tensormap_and_ringbuffer/batch_paged_attention/kernels/kernel_config.py b/tests/device_tests/a2a3/tensormap_and_ringbuffer/batch_paged_attention/kernels/kernel_config.py similarity index 100% rename from tests/device_tests/tensormap_and_ringbuffer/batch_paged_attention/kernels/kernel_config.py rename to tests/device_tests/a2a3/tensormap_and_ringbuffer/batch_paged_attention/kernels/kernel_config.py diff --git a/tests/device_tests/tensormap_and_ringbuffer/batch_paged_attention/kernels/orchestration/paged_attention_orch.cpp b/tests/device_tests/a2a3/tensormap_and_ringbuffer/batch_paged_attention/kernels/orchestration/paged_attention_orch.cpp similarity index 100% rename from tests/device_tests/tensormap_and_ringbuffer/batch_paged_attention/kernels/orchestration/paged_attention_orch.cpp rename to tests/device_tests/a2a3/tensormap_and_ringbuffer/batch_paged_attention/kernels/orchestration/paged_attention_orch.cpp diff --git a/tests/device_tests/tensormap_and_ringbuffer/benchmark_bgemm/golden.py b/tests/device_tests/a2a3/tensormap_and_ringbuffer/benchmark_bgemm/golden.py similarity index 100% rename from tests/device_tests/tensormap_and_ringbuffer/benchmark_bgemm/golden.py rename to tests/device_tests/a2a3/tensormap_and_ringbuffer/benchmark_bgemm/golden.py diff --git a/tests/device_tests/tensormap_and_ringbuffer/benchmark_bgemm/kernels/aic/kernel_gemm_tile.cpp b/tests/device_tests/a2a3/tensormap_and_ringbuffer/benchmark_bgemm/kernels/aic/kernel_gemm_tile.cpp similarity index 100% rename from tests/device_tests/tensormap_and_ringbuffer/benchmark_bgemm/kernels/aic/kernel_gemm_tile.cpp 
rename to tests/device_tests/a2a3/tensormap_and_ringbuffer/benchmark_bgemm/kernels/aic/kernel_gemm_tile.cpp diff --git a/tests/device_tests/tensormap_and_ringbuffer/benchmark_bgemm/kernels/aiv/kernel_tile_add.cpp b/tests/device_tests/a2a3/tensormap_and_ringbuffer/benchmark_bgemm/kernels/aiv/kernel_tile_add.cpp similarity index 100% rename from tests/device_tests/tensormap_and_ringbuffer/benchmark_bgemm/kernels/aiv/kernel_tile_add.cpp rename to tests/device_tests/a2a3/tensormap_and_ringbuffer/benchmark_bgemm/kernels/aiv/kernel_tile_add.cpp diff --git a/tests/device_tests/tensormap_and_ringbuffer/benchmark_bgemm/kernels/kernel_config.py b/tests/device_tests/a2a3/tensormap_and_ringbuffer/benchmark_bgemm/kernels/kernel_config.py similarity index 100% rename from tests/device_tests/tensormap_and_ringbuffer/benchmark_bgemm/kernels/kernel_config.py rename to tests/device_tests/a2a3/tensormap_and_ringbuffer/benchmark_bgemm/kernels/kernel_config.py diff --git a/tests/device_tests/tensormap_and_ringbuffer/benchmark_bgemm/kernels/orchestration/bgemm_orch.cpp b/tests/device_tests/a2a3/tensormap_and_ringbuffer/benchmark_bgemm/kernels/orchestration/bgemm_orch.cpp similarity index 100% rename from tests/device_tests/tensormap_and_ringbuffer/benchmark_bgemm/kernels/orchestration/bgemm_orch.cpp rename to tests/device_tests/a2a3/tensormap_and_ringbuffer/benchmark_bgemm/kernels/orchestration/bgemm_orch.cpp diff --git a/tests/device_tests/tensormap_and_ringbuffer/paged_attention/golden.py b/tests/device_tests/a2a3/tensormap_and_ringbuffer/paged_attention/golden.py similarity index 100% rename from tests/device_tests/tensormap_and_ringbuffer/paged_attention/golden.py rename to tests/device_tests/a2a3/tensormap_and_ringbuffer/paged_attention/golden.py diff --git a/tests/device_tests/tensormap_and_ringbuffer/paged_attention_unroll/kernels/aic/aic_hub.cpp b/tests/device_tests/a2a3/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_hub.cpp similarity index 100% rename from 
tests/device_tests/tensormap_and_ringbuffer/paged_attention_unroll/kernels/aic/aic_hub.cpp rename to tests/device_tests/a2a3/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_hub.cpp diff --git a/tests/device_tests/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_pv_matmul.cpp b/tests/device_tests/a2a3/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_pv_matmul.cpp similarity index 100% rename from tests/device_tests/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_pv_matmul.cpp rename to tests/device_tests/a2a3/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_pv_matmul.cpp diff --git a/tests/device_tests/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_qk_matmul.cpp b/tests/device_tests/a2a3/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_qk_matmul.cpp similarity index 100% rename from tests/device_tests/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_qk_matmul.cpp rename to tests/device_tests/a2a3/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_qk_matmul.cpp diff --git a/tests/device_tests/tensormap_and_ringbuffer/paged_attention_unroll/kernels/aiv/aiv_hub.cpp b/tests/device_tests/a2a3/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_hub.cpp similarity index 100% rename from tests/device_tests/tensormap_and_ringbuffer/paged_attention_unroll/kernels/aiv/aiv_hub.cpp rename to tests/device_tests/a2a3/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_hub.cpp diff --git a/tests/device_tests/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_online_update.cpp b/tests/device_tests/a2a3/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_online_update.cpp similarity index 100% rename from tests/device_tests/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_online_update.cpp rename to tests/device_tests/a2a3/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_online_update.cpp diff --git 
a/tests/device_tests/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp b/tests/device_tests/a2a3/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp similarity index 100% rename from tests/device_tests/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp rename to tests/device_tests/a2a3/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp diff --git a/tests/device_tests/tensormap_and_ringbuffer/paged_attention/kernels/kernel_config.py b/tests/device_tests/a2a3/tensormap_and_ringbuffer/paged_attention/kernels/kernel_config.py similarity index 100% rename from tests/device_tests/tensormap_and_ringbuffer/paged_attention/kernels/kernel_config.py rename to tests/device_tests/a2a3/tensormap_and_ringbuffer/paged_attention/kernels/kernel_config.py diff --git a/tests/device_tests/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp b/tests/device_tests/a2a3/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp similarity index 100% rename from tests/device_tests/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp rename to tests/device_tests/a2a3/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp diff --git a/tests/device_tests/tensormap_and_ringbuffer/paged_attention_unroll/golden.py b/tests/device_tests/a2a3/tensormap_and_ringbuffer/paged_attention_unroll/golden.py similarity index 100% rename from tests/device_tests/tensormap_and_ringbuffer/paged_attention_unroll/golden.py rename to tests/device_tests/a2a3/tensormap_and_ringbuffer/paged_attention_unroll/golden.py diff --git a/tests/device_tests/a2a3/tensormap_and_ringbuffer/paged_attention_unroll/kernels/aic/aic_hub.cpp b/tests/device_tests/a2a3/tensormap_and_ringbuffer/paged_attention_unroll/kernels/aic/aic_hub.cpp new file mode 100644 index 00000000..0974de37 --- /dev/null +++ 
b/tests/device_tests/a2a3/tensormap_and_ringbuffer/paged_attention_unroll/kernels/aic/aic_hub.cpp @@ -0,0 +1,18 @@ +#include +#include + +using namespace pto; + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#define __aicore__ [aicore] +#endif + +constexpr int M = 16; +constexpr int K = 16; +constexpr int N = 16; + +extern "C" __aicore__ void kernel_entry(__gm__ int64_t* args) {} diff --git a/tests/device_tests/tensormap_and_ringbuffer/paged_attention_unroll/kernels/aic/aic_pv_matmul.cpp b/tests/device_tests/a2a3/tensormap_and_ringbuffer/paged_attention_unroll/kernels/aic/aic_pv_matmul.cpp similarity index 100% rename from tests/device_tests/tensormap_and_ringbuffer/paged_attention_unroll/kernels/aic/aic_pv_matmul.cpp rename to tests/device_tests/a2a3/tensormap_and_ringbuffer/paged_attention_unroll/kernels/aic/aic_pv_matmul.cpp diff --git a/tests/device_tests/tensormap_and_ringbuffer/paged_attention_unroll/kernels/aic/aic_qk_matmul.cpp b/tests/device_tests/a2a3/tensormap_and_ringbuffer/paged_attention_unroll/kernels/aic/aic_qk_matmul.cpp similarity index 100% rename from tests/device_tests/tensormap_and_ringbuffer/paged_attention_unroll/kernels/aic/aic_qk_matmul.cpp rename to tests/device_tests/a2a3/tensormap_and_ringbuffer/paged_attention_unroll/kernels/aic/aic_qk_matmul.cpp diff --git a/tests/device_tests/a2a3/tensormap_and_ringbuffer/paged_attention_unroll/kernels/aiv/aiv_hub.cpp b/tests/device_tests/a2a3/tensormap_and_ringbuffer/paged_attention_unroll/kernels/aiv/aiv_hub.cpp new file mode 100644 index 00000000..0974de37 --- /dev/null +++ b/tests/device_tests/a2a3/tensormap_and_ringbuffer/paged_attention_unroll/kernels/aiv/aiv_hub.cpp @@ -0,0 +1,18 @@ +#include +#include + +using namespace pto; + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#define __aicore__ [aicore] +#endif + +constexpr int M = 16; +constexpr int K = 16; +constexpr int N = 16; + +extern "C" __aicore__ void kernel_entry(__gm__ int64_t* args) {} diff --git 
a/tests/device_tests/tensormap_and_ringbuffer/paged_attention_unroll/kernels/aiv/aiv_online_update.cpp b/tests/device_tests/a2a3/tensormap_and_ringbuffer/paged_attention_unroll/kernels/aiv/aiv_online_update.cpp similarity index 100% rename from tests/device_tests/tensormap_and_ringbuffer/paged_attention_unroll/kernels/aiv/aiv_online_update.cpp rename to tests/device_tests/a2a3/tensormap_and_ringbuffer/paged_attention_unroll/kernels/aiv/aiv_online_update.cpp diff --git a/tests/device_tests/tensormap_and_ringbuffer/paged_attention_unroll/kernels/aiv/aiv_softmax_prepare.cpp b/tests/device_tests/a2a3/tensormap_and_ringbuffer/paged_attention_unroll/kernels/aiv/aiv_softmax_prepare.cpp similarity index 100% rename from tests/device_tests/tensormap_and_ringbuffer/paged_attention_unroll/kernels/aiv/aiv_softmax_prepare.cpp rename to tests/device_tests/a2a3/tensormap_and_ringbuffer/paged_attention_unroll/kernels/aiv/aiv_softmax_prepare.cpp diff --git a/tests/device_tests/tensormap_and_ringbuffer/paged_attention_unroll/kernels/kernel_config.py b/tests/device_tests/a2a3/tensormap_and_ringbuffer/paged_attention_unroll/kernels/kernel_config.py similarity index 100% rename from tests/device_tests/tensormap_and_ringbuffer/paged_attention_unroll/kernels/kernel_config.py rename to tests/device_tests/a2a3/tensormap_and_ringbuffer/paged_attention_unroll/kernels/kernel_config.py diff --git a/tests/device_tests/tensormap_and_ringbuffer/paged_attention_unroll/kernels/orchestration/paged_attention_orch.cpp b/tests/device_tests/a2a3/tensormap_and_ringbuffer/paged_attention_unroll/kernels/orchestration/paged_attention_orch.cpp similarity index 100% rename from tests/device_tests/tensormap_and_ringbuffer/paged_attention_unroll/kernels/orchestration/paged_attention_orch.cpp rename to tests/device_tests/a2a3/tensormap_and_ringbuffer/paged_attention_unroll/kernels/orchestration/paged_attention_orch.cpp diff --git a/tests/device_tests/a5/host_build_graph/paged_attention/README.md 
b/tests/device_tests/a5/host_build_graph/paged_attention/README.md new file mode 100644 index 00000000..c0b6ebd0 --- /dev/null +++ b/tests/device_tests/a5/host_build_graph/paged_attention/README.md @@ -0,0 +1,192 @@ +# Paged Attention (Device Test) + +This example demonstrates Paged Attention implementation using CCE (Cube Core Engine) code generation, with AIC matmul kernels and AIV vector kernels using PTO Tile API. + +## Overview + +Paged Attention is an efficient attention mechanism that processes KV cache in fixed-size blocks, enabling memory-efficient inference for long sequences. This implementation uses: + +- **CCE-style codegen** for AIC kernels (Cube unit matmul) +- **PTO Tile API** for AIV kernels (Vector unit operations) +- **Online Softmax** algorithm for numerically stable incremental computation + +### Supported Platforms + +| Platform | Description | +|----------|-------------| +| a2a3 | Ascend hardware (requires device ID) | + +> This test uses bfloat16 data types and production-scale shapes that are not supported by the a2a3sim simulator. It only runs on real hardware. 
+ +### Algorithm + +For each query token, the attention is computed incrementally across KV cache blocks: + +``` +For each block j: + sij = Qi @ Kj^T # QK MatMul (AIC) + mij, lij, pij = softmax_prepare(sij) # Softmax (AIV) + oi_new = pij @ Vj # PV MatMul (AIC) + oi = online_update(oi, oi_new, mij, lij) # Accumulate (AIV) +``` + +### Kernel Design (AIC/AIV Split) + +| Kernel | Core Type | Operation | Key Instructions | +|--------|-----------|-----------|------------------| +| aic_qk_matmul | AIC (Cube) | Q @ K^T | TLOAD/TMOV/TMATMUL/TSTORE | +| aiv_softmax_prepare | AIV (Vector) | scale, rowmax, exp, rowsum | TMULS/TROWMAX/TROWEXPANDSUB/TEXP/TROWSUM | +| aic_pv_matmul | AIC (Cube) | P @ V | TLOAD/TMOV/TMATMUL/TSTORE | +| aiv_online_update | AIV (Vector) | Online Softmax + normalize | TMAX/TSUB/TEXP/TROWEXPANDMUL/TROWEXPANDDIV | + +### Memory Hierarchy (AIC Matmul) + +``` +GM -> L1 (Mat tiles) -> L0A/L0B -> L0C (Accumulator) -> GM +``` + +### Task Graph Structure + +For each batch, the task dependency pattern is: + +``` +Block 0: QK -> SF -> PV --+ +Block 1: QK -> SF -> PV --+--> UP[0] -> UP[1] -> ... 
-> UP[n] +Block n: QK -> SF -> PV --+ +``` + +- **QK/SF/PV chains**: Run in parallel across blocks +- **UP (Online Update)**: Serialized within batch due to accumulator dependency + +## Quick Start + +```bash +# Run on hardware (specify device ID) +python examples/scripts/run_example.py \ + -k tests/device_tests/host_build_graph/paged_attention/kernels \ + -g tests/device_tests/host_build_graph/paged_attention/golden.py \ + -p a2a3 -d 0 + +# Run multi-block test case +PA_CASE=Case2 python examples/scripts/run_example.py \ + -k tests/device_tests/host_build_graph/paged_attention/kernels \ + -g tests/device_tests/host_build_graph/paged_attention/golden.py \ + -p a2a3 -d 0 +``` + +## Directory Structure + +``` +paged_attention/ +├── README.md # This file +├── golden.py # Input generation and expected output +└── kernels/ + ├── kernel_config.py # Kernel registration config + ├── aic/ # AIC kernels (CCE codegen style) + │ ├── aic_qk_matmul.cpp # Q @ K^T matmul + │ └── aic_pv_matmul.cpp # P @ V matmul + ├── aiv/ # AIV kernels (PTO Tile API) + │ ├── aiv_softmax_prepare.cpp # Softmax preparation + │ └── aiv_online_update.cpp # Online Softmax update + normalize + └── orchestration/ + └── paged_attention_orch.cpp # Task graph builder +``` + +## Test Cases + +| Case | batch | num_heads | kv_head_num | head_dim | block_size | context_len | Description | +|------|-------|-----------|-------------|----------|------------|-------------|-------------| +| Case1 | 1 | 16 | 1 | 128 | 128 | 256 | Small scale (default) | +| Case2 | 8 | 64 | 1 | 128 | 64 | 8192 | Production scale | + +All test cases use **bfloat16** Q/K/V inputs with GQA (kv_head_num=1). 
+ +## Key Technical Details + +### AIC Kernels (CCE Codegen) + +```cpp +// L1 tiles: ColMajor + SLayout::RowMajor (required for matmul) +using TileMatA = Tile; +using TileMatB = Tile; + +// L0 tiles: Use standard TileLeft/TileRight/TileAcc aliases +using LeftTile = TileLeft; +using RightTile = TileRight; +using AccTile = TileAcc; + +// Pipeline: MTE2 -> MTE1 -> M -> FIX -> MTE3 +TLOAD(aMatTile, qiGlobal); // GM -> L1 +TMOV(aTile, aMatTile); // L1 -> L0A +TMATMUL(cTile, aTile, bTile); // L0A x L0B -> L0C +TSTORE(sijGlobal, cTile); // L0C -> GM +``` + +### AIV Kernels (PTO Tile API) + +**softmax_prepare**: Uses DN layout (ColMajor, 16x1) for row reduction results + +```cpp +using TileScalarDN = Tile; + +TMULS(sijTile, sijTile, scale_value); // Scale +TROWMAX(maxTile, sijTile, tmpTile); // Row max +TROWEXPANDSUB(pijTile, sijTile, maxTile); // Subtract max (broadcast) +TEXP(pijTile, pijTile); // Exp +TROWSUM(sumTile, pijTile, tmpTile); // Row sum +``` + +**online_update**: Uses ND/DN layout conversion for hardware compatibility + +```cpp +// ND (1x16, RowMajor) for scalar arithmetic - TSUB/TMUL/TADD require RowMajor +using TileScalarND = Tile; +// DN (16x1, ColMajor) for row broadcast - TROWEXPANDMUL/TROWEXPANDDIV require this +using TileScalarDN = Tile; + +// Arithmetic in ND layout +TMAX(miNewTileND, miTileND, mijTileND); +TSUB(alphaTileND, miTileND, miNewTileND); +TEXP(alphaTileND, alphaTileND); + +// Reshape ND -> DN for broadcast operations +TRESHAPE(alphaTileDN, alphaTileND); +TROWEXPANDMUL(oiTile, oiTile, alphaTileDN); +``` + +### Data Layout + +- **K stored as K^T**: (head_dim, block_size) for direct matmul compatibility +- **V stored normally**: (block_size, head_dim) + +## Expected Output + +``` +=== Compiling and Registering Kernels === +Compiling kernel: .../aic_qk_matmul.cpp (func_id=0) +Compiling kernel: .../aiv_softmax_prepare.cpp (func_id=1) +Compiling kernel: .../aic_pv_matmul.cpp (func_id=2) +Compiling kernel: .../aiv_online_update.cpp (func_id=3) 
+... +=== build_paged_attention_graph (16x16 framework version) === +batch=1, num_heads=16, kv_head_num=1, head_dim=16 +block_size=16, block_num=1 +... +Created 4 tasks +... +=== Comparing Results === +Comparing out: shape=(256,), dtype=float32 + out: PASS (256/256 elements matched) + +============================================================ +TEST PASSED +============================================================ +``` + +## Reference + +This implementation uses the Online Softmax algorithm for paged attention, with identical kernel structure to the PyPTO reference implementation. + +## See Also + +- [Test Framework Documentation](../../../../examples/scripts/README.md) diff --git a/tests/device_tests/a5/host_build_graph/paged_attention/golden.py b/tests/device_tests/a5/host_build_graph/paged_attention/golden.py new file mode 100644 index 00000000..b5dd811d --- /dev/null +++ b/tests/device_tests/a5/host_build_graph/paged_attention/golden.py @@ -0,0 +1,45 @@ +"""Paged Attention Golden - host_build_graph test (production scale, bfloat16).""" + +from paged_attention_golden import ( + generate_inputs as _generate_inputs, + compute_golden, + run_golden_test, +) + +__outputs__ = ["out"] + +RTOL = 1e-3 +ATOL = 1e-3 + +ALL_CASES = { + "Case1": { + "batch": 256, + "num_heads": 16, + "kv_head_num": 1, + "head_dim": 128, + "block_size": 128, + "context_len": 8100, + "max_model_len": 32768, + "dtype": "bfloat16", + }, + "Case2": { + "batch": 64, + "num_heads": 64, + "kv_head_num": 1, + "head_dim": 128, + "block_size": 64, + "context_len": 8150, + "max_model_len": 32768, + "dtype": "bfloat16", + }, +} + +DEFAULT_CASE = "Case1" + + +def generate_inputs(params: dict) -> list: + return _generate_inputs(params, return_all_sizes=True) + + +if __name__ == "__main__": + run_golden_test(ALL_CASES, DEFAULT_CASE, generate_inputs) diff --git a/tests/device_tests/a5/host_build_graph/paged_attention/kernels/aic/aic_pv_matmul.cpp 
b/tests/device_tests/a5/host_build_graph/paged_attention/kernels/aic/aic_pv_matmul.cpp new file mode 100644 index 00000000..55827067 --- /dev/null +++ b/tests/device_tests/a5/host_build_graph/paged_attention/kernels/aic/aic_pv_matmul.cpp @@ -0,0 +1,97 @@ +// PV Matmul Kernel: pij(M, K) @ vj(K, N) -> oi_new(M, N) +// +// Supports two tile configurations via runtime dispatch: +// Case1: (16, 128) @ (128, 128) -> (16, 128) +// Case2: (64, 64) @ ( 64, 128) -> (64, 128) +// +// pij is bfloat16 (converted from fp32 in softmax_prepare via TCVT). +// vj is stored as (K, N) = (block_size, head_dim) in row-major (ND) layout. +// Standard non-transposed B pattern: ND GlobalB + ColMajor/RowMajor TileMatB. + +#include +#include + +using namespace pto; + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#define __aicore__ [aicore] +#endif + +template +static __aicore__ void pv_matmul_impl(__gm__ uint8_t* pij_raw, __gm__ uint8_t* vj_raw, __gm__ uint8_t* oi_raw) +{ + __gm__ bfloat16_t* pij = reinterpret_cast<__gm__ bfloat16_t*>(pij_raw); + __gm__ bfloat16_t* vj = reinterpret_cast<__gm__ bfloat16_t*>(vj_raw); + __gm__ float* oi = reinterpret_cast<__gm__ float*>(oi_raw); + + // pij (M, K) bf16, vj (K, N) bf16 in ND (row-major), oi_new (M, N) fp32 + using GlobalA = GlobalTensor, pto::Stride>; + using GlobalB = GlobalTensor, pto::Stride>; + using GlobalOut = GlobalTensor, pto::Stride>; + + GlobalA pijGlobal(pij); + GlobalB vjGlobal(vj); + GlobalOut oiGlobal(oi); + + // L1 Mat tiles: standard ND pattern for both A and B + using TileMatA = Tile; + using TileMatB = Tile; + + // L0 tiles + using LeftTile = TileLeft; + using RightTile = TileRight; + using AccTile = TileAcc; + + TileMatA aMatTile; + TileMatB bMatTile; + TASSIGN(aMatTile, 0x0); + TASSIGN(bMatTile, 0x20000); + + LeftTile aTile; + RightTile bTile; + AccTile cTile; + TASSIGN(aTile, 0x0); + TASSIGN(bTile, 0x0); + TASSIGN(cTile, 0x0); + + // Load pij and vj to L1 + TLOAD(aMatTile, pijGlobal); + TLOAD(bMatTile, 
vjGlobal); + + set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); + + // Move to L0A/L0B + TMOV(aTile, aMatTile); + TMOV(bTile, bMatTile); + + set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); + wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); + + // Single matmul: (M,K) x (K,N) -> (M,N) + TMATMUL(cTile, aTile, bTile); + + set_flag(PIPE_M, PIPE_FIX, EVENT_ID0); + wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0); + + TSTORE(oiGlobal, cTile); +} + +extern "C" __aicore__ void kernel_entry(__gm__ int64_t* args) +{ + __gm__ uint8_t* pij = reinterpret_cast<__gm__ uint8_t*>(args[0]); + __gm__ uint8_t* vj = reinterpret_cast<__gm__ uint8_t*>(args[1]); + __gm__ uint8_t* oi_new = reinterpret_cast<__gm__ uint8_t*>(args[2]); + int q_tile_size = static_cast(args[3]); + // args[4] = block_size, args[5] = head_dim + + if (q_tile_size == 16) { + pv_matmul_impl<16, 128, 128>(pij, vj, oi_new); + } else { + pv_matmul_impl<64, 64, 128>(pij, vj, oi_new); + } +} diff --git a/tests/device_tests/a5/host_build_graph/paged_attention/kernels/aic/aic_qk_matmul.cpp b/tests/device_tests/a5/host_build_graph/paged_attention/kernels/aic/aic_qk_matmul.cpp new file mode 100644 index 00000000..608879f9 --- /dev/null +++ b/tests/device_tests/a5/host_build_graph/paged_attention/kernels/aic/aic_qk_matmul.cpp @@ -0,0 +1,98 @@ +// QK Matmul Kernel: qi(M, K) @ kj.T(K, N) -> sij(M, N) +// +// Supports two tile configurations via runtime dispatch: +// Case1: (16, 128) @ (128, 128).T -> (16, 128) +// Case2: (64, 128) @ (128, 64).T -> (64, 64) +// +// kj is stored as (N, K) = (block_size, head_dim) in row-major memory. +// This is equivalent to (K, N) in column-major (DN) layout. +// Using DN GlobalB + RowMajor/ColMajor TileMatB to handle the transposed B pattern. 
+ +#include +#include + +using namespace pto; + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#define __aicore__ [aicore] +#endif + +template +static __aicore__ void qk_matmul_impl(__gm__ uint8_t* qi_raw, __gm__ uint8_t* kj_raw, __gm__ uint8_t* sij_raw) +{ + __gm__ bfloat16_t* qi = reinterpret_cast<__gm__ bfloat16_t*>(qi_raw); + __gm__ bfloat16_t* kj = reinterpret_cast<__gm__ bfloat16_t*>(kj_raw); + __gm__ float* sij = reinterpret_cast<__gm__ float*>(sij_raw); + + // qi (M, K) bf16 in ND (row-major) layout + using GlobalA = GlobalTensor, pto::Stride>; + // kj stored as (N, K) row-major = (K, N) column-major -> DN layout + using GlobalB = GlobalTensor, pto::Stride, Layout::DN>; + using GlobalOut = GlobalTensor, pto::Stride>; + + GlobalA qiGlobal(qi); + GlobalB kjGlobal(kj); + GlobalOut sijGlobal(sij); + + // L1 Mat tiles: A is standard ND, B uses transposed-B pattern (RowMajor/ColMajor) + using TileMatA = Tile; + using TileMatB = Tile; + + // L0 tiles + using LeftTile = TileLeft; + using RightTile = TileRight; + using AccTile = TileAcc; + + TileMatA aMatTile; + TileMatB bMatTile; + TASSIGN(aMatTile, 0x0); + TASSIGN(bMatTile, 0x20000); + + LeftTile aTile; + RightTile bTile; + AccTile cTile; + TASSIGN(aTile, 0x0); + TASSIGN(bTile, 0x0); + TASSIGN(cTile, 0x0); + + // Load qi and kj to L1 + TLOAD(aMatTile, qiGlobal); + TLOAD(bMatTile, kjGlobal); + + set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); + + // Move to L0A/L0B + TMOV(aTile, aMatTile); + TMOV(bTile, bMatTile); + + set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); + wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); + + // Single matmul: (M,K) x (K,N) -> (M,N) + TMATMUL(cTile, aTile, bTile); + + set_flag(PIPE_M, PIPE_FIX, EVENT_ID0); + wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0); + + TSTORE(sijGlobal, cTile); +} + +extern "C" __aicore__ void kernel_entry(__gm__ int64_t* args) +{ + __gm__ uint8_t* qi = reinterpret_cast<__gm__ uint8_t*>(args[0]); + __gm__ uint8_t* kj = 
reinterpret_cast<__gm__ uint8_t*>(args[1]); + __gm__ uint8_t* sij = reinterpret_cast<__gm__ uint8_t*>(args[2]); + int q_tile_size = static_cast(args[3]); + // args[4] = head_dim (128), args[5] = block_size + + if (q_tile_size == 16) { + qk_matmul_impl<16, 128, 128>(qi, kj, sij); + } else { + qk_matmul_impl<64, 128, 64>(qi, kj, sij); + } +} diff --git a/tests/device_tests/a5/host_build_graph/paged_attention/kernels/aiv/aiv_online_update.cpp b/tests/device_tests/a5/host_build_graph/paged_attention/kernels/aiv/aiv_online_update.cpp new file mode 100644 index 00000000..71f28d2d --- /dev/null +++ b/tests/device_tests/a5/host_build_graph/paged_attention/kernels/aiv/aiv_online_update.cpp @@ -0,0 +1,227 @@ +// Online Softmax Update + Normalize Kernel (AIV) +// +// Operates on full tiles where M=q_tile_size, N=head_dim (128): +// Case1: oi/oi_new are (16, 128), mij/lij/mi/li are 16-element vectors +// Case2: oi/oi_new are (64, 128), mij/lij/mi/li are 64-element vectors +// +// Scalar layout strategy: +// M scalar floats stored contiguously in GM can be loaded as either: +// - ND (kScalarRows, kScalarCols) RowMajor for element-wise ops (TMAX, TSUB, TEXP, TMUL, TADD) +// - DN (kAlignedRows, 1) ColMajor for row-broadcast ops (TROWEXPANDMUL, TROWEXPANDDIV) +// Conversion between layouts uses GM round-trip: ND TSTORE → DN TLOAD. 
+ +#include +#include + +using namespace pto; + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#define __aicore__ [aicore] +#endif + +template +static __aicore__ void online_update_impl(__gm__ uint8_t* mij_raw, __gm__ uint8_t* lij_raw, + __gm__ uint8_t* oi_new_raw, __gm__ uint8_t* mi_raw, + __gm__ uint8_t* li_raw, __gm__ uint8_t* oi_raw, + int is_first, int is_last, __gm__ uint8_t* dst_raw) +{ + __gm__ float* mij_ptr = reinterpret_cast<__gm__ float*>(mij_raw); + __gm__ float* lij_ptr = reinterpret_cast<__gm__ float*>(lij_raw); + __gm__ float* oi_new_ptr = reinterpret_cast<__gm__ float*>(oi_new_raw); + __gm__ float* mi_ptr = reinterpret_cast<__gm__ float*>(mi_raw); + __gm__ float* li_ptr = reinterpret_cast<__gm__ float*>(li_raw); + __gm__ float* oi_ptr = reinterpret_cast<__gm__ float*>(oi_raw); + __gm__ float* dst_ptr = reinterpret_cast<__gm__ float*>(dst_raw); + + // Scalar tile dimensions for RowMajor layout: + // kScalarCols = 32 bytes / 4 bytes per float = 8 floats per row (one 32-byte block) + // kScalarRows = M / 8 (M=16 → 2 rows, M=64 → 8 rows) + constexpr int kScalarCols = 32 / sizeof(float); + constexpr int kScalarRows = M / kScalarCols; + // Aligned rows for ColMajor DN tiles (32-byte alignment) + constexpr int kAlignedRows = ((M * sizeof(float) + 31) / 32) * (32 / sizeof(float)); + + // --- GlobalTensor types --- + + // Data (M, N) RowMajor + using GlobalDataMxN = GlobalTensor, pto::Stride<1, 1, 1, N, 1>>; + + // Scalar ND: M contiguous floats as (kScalarRows, kScalarCols) RowMajor + using GlobalScalarND = GlobalTensor, + pto::Stride<1, 1, 1, kScalarCols, 1>>; + + // Scalar DN: same M contiguous floats as (kAlignedRows, 1) ColMajor + using GlobalScalarDN = GlobalTensor, + pto::Stride<1, 1, 1, 1, 1>, Layout::DN>; + + // --- GlobalTensor instances --- + + GlobalDataMxN oiNewGlobal(oi_new_ptr); + GlobalDataMxN oiGlobal(oi_ptr); + GlobalDataMxN dstGlobal(dst_ptr); + + // ND globals for scalar element-wise operations + GlobalScalarND 
mijGlobalND(mij_ptr); + GlobalScalarND lijGlobalND(lij_ptr); + GlobalScalarND miGlobalND(mi_ptr); + GlobalScalarND liGlobalND(li_ptr); + + // DN globals aliased to same GM for ColMajor reload (used after ND TSTORE) + GlobalScalarDN mijGlobalDN(mij_ptr); + GlobalScalarDN lijGlobalDN(lij_ptr); + GlobalScalarDN liGlobalDN(li_ptr); + + // --- Tile types --- + + using TileDataMxN = Tile; + using TileScalarND = Tile; + using TileScalarDN = Tile; + + // --- UB memory layout --- + + constexpr int kDataBytes = M * N * sizeof(float); + constexpr int kScalarNDBytes = kScalarRows * kScalarCols * sizeof(float); + constexpr int kScalarDNBytes = kAlignedRows * sizeof(float); + + // Data tiles + TileDataMxN oiNewTile; + TileDataMxN oiTile; + + // Scalar ND tiles for element-wise arithmetic + TileScalarND mijND, lijND, miND, liND; + TileScalarND miNewND, alphaND, betaND, tmpND; + + // Scalar DN tiles for TROWEXPAND operations + TileScalarDN alphaDN, betaDN, liDN; + + TASSIGN(oiNewTile, 0); + TASSIGN(oiTile, kDataBytes); + TASSIGN(mijND, 2 * kDataBytes); + TASSIGN(lijND, 2 * kDataBytes + kScalarNDBytes); + TASSIGN(miND, 2 * kDataBytes + 2 * kScalarNDBytes); + TASSIGN(liND, 2 * kDataBytes + 3 * kScalarNDBytes); + TASSIGN(miNewND, 2 * kDataBytes + 4 * kScalarNDBytes); + TASSIGN(alphaND, 2 * kDataBytes + 5 * kScalarNDBytes); + TASSIGN(betaND, 2 * kDataBytes + 6 * kScalarNDBytes); + TASSIGN(tmpND, 2 * kDataBytes + 7 * kScalarNDBytes); + TASSIGN(alphaDN, 2 * kDataBytes + 8 * kScalarNDBytes); + TASSIGN(betaDN, 2 * kDataBytes + 8 * kScalarNDBytes + kScalarDNBytes); + TASSIGN(liDN, 2 * kDataBytes + 8 * kScalarNDBytes + 2 * kScalarDNBytes); + + if (is_first) { + // --- First block: copy inputs to accumulators --- + TLOAD(oiNewTile, oiNewGlobal); + TLOAD(mijND, mijGlobalND); + TLOAD(lijND, lijGlobalND); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + + // Passthrough to MTE3 (no V compute needed) + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + 
wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(miGlobalND, mijND); // mi = mij + TSTORE(liGlobalND, lijND); // li = lij + TSTORE(oiGlobal, oiNewTile); // oi = oi_new + + if (is_last) { + // Single block: normalize dst = oi_new / lij + // lij stored to li buffer in ND format; reload as DN for TROWEXPANDDIV + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + TLOAD(liDN, liGlobalDN); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TROWEXPANDDIV(oiNewTile, oiNewTile, liDN); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); + TSTORE(dstGlobal, oiNewTile); + } + } else { + // --- Subsequent blocks: accumulate --- + + // Phase 1: Load all inputs + TLOAD(oiNewTile, oiNewGlobal); + TLOAD(oiTile, oiGlobal); + TLOAD(mijND, mijGlobalND); + TLOAD(lijND, lijGlobalND); + TLOAD(miND, miGlobalND); + TLOAD(liND, liGlobalND); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + + // Phase 2: Scalar arithmetic in RowMajor (kScalarRows, kScalarCols) + // to resolve RAW hazards on shared UB tiles. 
+ TMAX(miNewND, miND, mijND); // mi_new = max(mi, mij) + TSUB(alphaND, miND, miNewND); // alpha = mi - mi_new + TEXP(alphaND, alphaND); // alpha = exp(mi - mi_new) + TSUB(betaND, mijND, miNewND); // beta = mij - mi_new + TEXP(betaND, betaND); // beta = exp(mij - mi_new) + TMUL(liND, alphaND, liND); // li = alpha * li + TMUL(tmpND, betaND, lijND); // tmp = beta * lij + TADD(liND, liND, tmpND); // li = alpha * li + beta * lij (= li_new) + + // Phase 3: Store scalar results to GM (ND format) + // mi_new → mi accumulator, li_new → li accumulator + // alpha → mij buffer (reuse), beta → lij buffer (reuse) + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(miGlobalND, miNewND); // persist mi_new + TSTORE(liGlobalND, liND); // persist li_new + TSTORE(mijGlobalND, alphaND); // temp: alpha to mij buffer + TSTORE(lijGlobalND, betaND); // temp: beta to lij buffer + + // Phase 4: Reload alpha, beta (and li if last) as ColMajor DN + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + TLOAD(alphaDN, mijGlobalDN); // alpha from mij buffer as DN + TLOAD(betaDN, lijGlobalDN); // beta from lij buffer as DN + if (is_last) { + TLOAD(liDN, liGlobalDN); // li_new from li buffer as DN + } + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + + // Phase 5: Scale data tiles using row-broadcast multiply + TROWEXPANDMUL(oiTile, oiTile, alphaDN); // oi *= alpha + TROWEXPANDMUL(oiNewTile, oiNewTile, betaDN); // oi_new *= beta + TADD(oiTile, oiTile, oiNewTile); // oi = alpha*oi + beta*oi_new + + if (is_last) { + // Phase 6: Normalize and output + TROWEXPANDDIV(oiTile, oiTile, liDN); // dst = oi / li_new + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); + TSTORE(dstGlobal, oiTile); + } else { + // Phase 6: Store updated accumulators + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); + TSTORE(oiGlobal, oiTile); + } + 
} +} + +extern "C" __aicore__ void kernel_entry(__gm__ int64_t* args) { + __gm__ uint8_t* mij = reinterpret_cast<__gm__ uint8_t*>(args[0]); + __gm__ uint8_t* lij = reinterpret_cast<__gm__ uint8_t*>(args[1]); + __gm__ uint8_t* oi_new = reinterpret_cast<__gm__ uint8_t*>(args[2]); + __gm__ uint8_t* mi = reinterpret_cast<__gm__ uint8_t*>(args[3]); + __gm__ uint8_t* li = reinterpret_cast<__gm__ uint8_t*>(args[4]); + __gm__ uint8_t* oi = reinterpret_cast<__gm__ uint8_t*>(args[5]); + int is_first = static_cast(args[6]); + int is_last = static_cast(args[7]); + __gm__ uint8_t* dst = reinterpret_cast<__gm__ uint8_t*>(args[8]); + int q_tile_size = static_cast(args[9]); + // args[10] = head_dim (128) + + if (q_tile_size == 16) { + online_update_impl<16, 128>(mij, lij, oi_new, mi, li, oi, is_first, is_last, dst); + } else { + online_update_impl<64, 128>(mij, lij, oi_new, mi, li, oi, is_first, is_last, dst); + } +} diff --git a/tests/device_tests/a5/host_build_graph/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp b/tests/device_tests/a5/host_build_graph/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp new file mode 100644 index 00000000..dde7537c --- /dev/null +++ b/tests/device_tests/a5/host_build_graph/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp @@ -0,0 +1,123 @@ +// Softmax Preparation Kernel (AIV) with partial block masking +// +// Operates on (M, N) tile where M=q_tile_size, N=block_size: +// Case1: sij is (16, 128) +// Case2: sij is (64, 64) +// +// For partial blocks (valid_len < N), positions [valid_len, N) in sij are +// filled with -inf via TFILLPAD_INPLACE before softmax, ensuring exp(-inf)=0 +// so that invalid key positions contribute zero attention weight. 
+// +// Computes: +// sij_masked = TFILLPAD(sij, valid_len, pad=-inf) +// sij_scale = sij_masked * scale +// mij = row_max(sij_scale) -> (M, 1) +// pij = exp(sij_scale - mij) -> (M, N) +// lij = row_sum(pij) -> (M, 1) + +#include +#include + +using namespace pto; + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#define __aicore__ [aicore] +#endif + +template +static __aicore__ void softmax_prepare_impl(__gm__ uint8_t* sij_raw, float scale_value, + __gm__ uint8_t* pij_raw, __gm__ uint8_t* mij_raw, + __gm__ uint8_t* lij_raw, int valid_len) +{ + __gm__ float* sij = reinterpret_cast<__gm__ float*>(sij_raw); + __gm__ bfloat16_t* pij = reinterpret_cast<__gm__ bfloat16_t*>(pij_raw); + __gm__ float* mij = reinterpret_cast<__gm__ float*>(mij_raw); + __gm__ float* lij = reinterpret_cast<__gm__ float*>(lij_raw); + + constexpr int kAlignedRows = ((M * sizeof(float) + 31) / 32) * (32 / sizeof(float)); + + using GlobalDataMxN = GlobalTensor, pto::Stride<1, 1, 1, N, 1>>; + using GlobalDataMxN_bf16 = GlobalTensor, pto::Stride<1, 1, 1, N, 1>>; + using GlobalScalarDN = GlobalTensor, pto::Stride<1, 1, 1, 1, 1>, Layout::DN>; + + GlobalDataMxN sijGlobal(sij); + GlobalDataMxN_bf16 pijGlobal(pij); + GlobalScalarDN mijGlobal(mij); + GlobalScalarDN lijGlobal(lij); + + // Dynamic-cols tile: marks which columns are valid for TFILLPAD boundary + using TileSijDyn = Tile; + // Padded tile: TFILLPAD_INPLACE fills positions [valid_len, N) with -inf + using TileSijPad = Tile; + + using TileVecMxN = Tile; + using TileVecMxN_bf16 = Tile; + using TileScalarDN = Tile; + + TileVecMxN sijTile; + TileSijDyn sijDynTile(static_cast(valid_len)); + TileSijPad sijPadTile; + TileVecMxN pijTile; + TileVecMxN tmpTile; + TileScalarDN maxTile; + TileScalarDN sumTile; + TileVecMxN_bf16 pijBf16Tile; + + // All sij tiles share UB address 0x0 (in-place masking) + TASSIGN(sijTile, 0x0); + TASSIGN(sijDynTile, 0x0); + TASSIGN(sijPadTile, 0x0); + TASSIGN(pijTile, M * N * sizeof(float)); + TASSIGN(tmpTile, 
2 * M * N * sizeof(float)); + TASSIGN(maxTile, 3 * M * N * sizeof(float)); + TASSIGN(sumTile, 3 * M * N * sizeof(float) + kAlignedRows * sizeof(float)); + TASSIGN(pijBf16Tile, 3 * M * N * sizeof(float) + 2 * kAlignedRows * sizeof(float)); + + // Load full sij (M, N) tile from GM - all N columns including garbage for partial blocks + TLOAD(sijTile, sijGlobal); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + + // Mask columns [valid_len, N) with -inf. sijDynTile provides the valid boundary, + // sijPadTile provides PadValue::Min as the fill value. No-op when valid_len == N. + TFILLPAD_INPLACE(sijPadTile, sijDynTile); + + TMULS(sijTile, sijTile, scale_value); + TROWMAX(maxTile, sijTile, tmpTile); + TROWEXPANDSUB(pijTile, sijTile, maxTile); + TEXP(pijTile, pijTile); + // Truncate pij to bf16 first, then compute lij from truncated values (matches golden) + TCVT(pijBf16Tile, pijTile, RoundMode::CAST_ROUND); + TCVT(pijTile, pijBf16Tile, RoundMode::CAST_ROUND); + TROWSUM(sumTile, pijTile, tmpTile); + + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(mijGlobal, maxTile); + TSTORE(lijGlobal, sumTile); + TSTORE(pijGlobal, pijBf16Tile); +} + +extern "C" __aicore__ void kernel_entry(__gm__ int64_t* args) { + __gm__ uint8_t* sij = reinterpret_cast<__gm__ uint8_t*>(args[0]); + union { uint64_t u; float f; } scale_conv; + scale_conv.u = static_cast(args[1]); + float scale_value = scale_conv.f; + __gm__ uint8_t* pij = reinterpret_cast<__gm__ uint8_t*>(args[2]); + __gm__ uint8_t* mij = reinterpret_cast<__gm__ uint8_t*>(args[3]); + __gm__ uint8_t* lij = reinterpret_cast<__gm__ uint8_t*>(args[4]); + int q_tile_size = static_cast(args[5]); + // args[6] = block_size + int valid_len = static_cast(args[7]); + + if (q_tile_size == 16) { + softmax_prepare_impl<16, 128>(sij, scale_value, pij, mij, lij, valid_len); + } else { + softmax_prepare_impl<64, 64>(sij, scale_value, pij, mij, lij, valid_len); + } +} 
diff --git a/tests/device_tests/a5/host_build_graph/paged_attention/kernels/kernel_config.py b/tests/device_tests/a5/host_build_graph/paged_attention/kernels/kernel_config.py new file mode 100644 index 00000000..03f4a7c4 --- /dev/null +++ b/tests/device_tests/a5/host_build_graph/paged_attention/kernels/kernel_config.py @@ -0,0 +1,43 @@ +""" +Paged Attention Kernel and Orchestration Configuration + +Defines the kernels and orchestration function for paged attention +with AIC/AIV subgraph splitting: + +AIC Kernels (Matrix Multiplication): + - aic_qk_matmul: Q @ K^T computation + - aic_pv_matmul: P @ V computation + +AIV Kernels (Vector Operations): + - aiv_softmax_prepare: scale, rowmax, exp, rowsum + - aiv_online_update: online softmax accumulation + fused normalization + +Note: aiv_normalize has been merged into aiv_online_update for efficiency. +""" + +from pathlib import Path + +_KERNELS_ROOT = Path(__file__).parent + +# Orchestration config +ORCHESTRATION = { + "source": str(_KERNELS_ROOT / "orchestration" / "paged_attention_orch.cpp"), + "function_name": "build_paged_attention_graph", +} + +# Kernel configs (aiv_normalize removed - merged into aiv_online_update) +KERNELS = [ + # AIC kernels (matrix multiplication using Cube unit) + {"func_id": 0, "name": "QK", "source": str(_KERNELS_ROOT / "aic" / "aic_qk_matmul.cpp"), "core_type": "aic"}, + {"func_id": 2, "name": "PV", "source": str(_KERNELS_ROOT / "aic" / "aic_pv_matmul.cpp"), "core_type": "aic"}, + # AIV kernels (vector operations) + {"func_id": 1, "name": "SF", "source": str(_KERNELS_ROOT / "aiv" / "aiv_softmax_prepare.cpp"), "core_type": "aiv"}, + {"func_id": 3, "name": "UP", "source": str(_KERNELS_ROOT / "aiv" / "aiv_online_update.cpp"), "core_type": "aiv"}, +] + +# Runtime configuration +RUNTIME_CONFIG = { + "runtime": "host_build_graph", + "aicpu_thread_num": 3, + "block_dim": 24, +} diff --git a/tests/device_tests/a5/host_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp 
b/tests/device_tests/a5/host_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp new file mode 100644 index 00000000..2b2192dc --- /dev/null +++ b/tests/device_tests/a5/host_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp @@ -0,0 +1,261 @@ +/** + * Paged Attention Orchestration - Production Scale + * + * Supports production-scale paged attention with: + * Query: (batch, q_head_num, head_dim) bf16 + * Key: (total_blocks, block_size, kv_head_num, head_dim) bf16 (NOT transposed) + * Value: (total_blocks, block_size, kv_head_num, head_dim) bf16 + * Output: (batch * q_head_num, head_dim) float32 + * + * Head tiling: q_tile_size = min(num_heads, 128) + * GQA: kv_head_num can differ from q_head_num + */ + +#include "runtime.h" +#include +#include +#include + +#define FUNC_QK_MATMUL 0 +#define FUNC_SOFTMAX_PREPARE 1 +#define FUNC_PV_MATMUL 2 +#define FUNC_ONLINE_UPDATE 3 + +extern "C" { + +int build_paged_attention_graph(Runtime* runtime, uint64_t* args, int arg_count) { + if (arg_count < 14) { + std::cerr << "Expected at least 14 args, got " << arg_count << '\n'; + return -1; + } + + void* host_query = reinterpret_cast(args[0]); + void* host_key_cache = reinterpret_cast(args[1]); + void* host_value_cache = reinterpret_cast(args[2]); + int* host_block_table = reinterpret_cast(args[3]); + int* host_context_lens = reinterpret_cast(args[4]); + void* host_out = reinterpret_cast(args[5]); + int64_t* host_config = reinterpret_cast(args[6]); + + size_t query_size = static_cast(args[7]); + size_t key_cache_size = static_cast(args[8]); + size_t value_cache_size = static_cast(args[9]); + size_t block_table_size = static_cast(args[10]); + size_t context_lens_size = static_cast(args[11]); + size_t out_size = static_cast(args[12]); + size_t config_size = static_cast(args[13]); + + int batch = static_cast(host_config[0]); + int num_heads = static_cast(host_config[1]); + int kv_head_num = static_cast(host_config[2]); + int head_dim = 
static_cast(host_config[3]); + int block_size = static_cast(host_config[4]); + int max_num_blocks = static_cast(host_config[5]); + uint64_t scale_value_bits = static_cast(host_config[6]); + + int q_tile_size = std::min(num_heads, 128); + int num_head_tiles = (num_heads + q_tile_size - 1) / q_tile_size; + + std::cout << "\n=== build_paged_attention_graph ===" << '\n'; + std::cout << "batch=" << batch << ", num_heads=" << num_heads + << ", kv_head_num=" << kv_head_num << ", head_dim=" << head_dim << '\n'; + std::cout << "block_size=" << block_size << ", max_num_blocks=" << max_num_blocks << '\n'; + std::cout << "q_tile_size=" << q_tile_size << ", num_head_tiles=" << num_head_tiles << '\n'; + + // Allocate device memory for inputs/outputs + void* dev_query = runtime->host_api.device_malloc(query_size); + void* dev_key_cache = runtime->host_api.device_malloc(key_cache_size); + void* dev_value_cache = runtime->host_api.device_malloc(value_cache_size); + void* dev_out = runtime->host_api.device_malloc(out_size); + + if (!dev_query || !dev_key_cache || !dev_value_cache || !dev_out) { + std::cerr << "Error: Failed to allocate device memory\n"; + return -1; + } + + runtime->host_api.copy_to_device(dev_query, host_query, query_size); + runtime->host_api.copy_to_device(dev_key_cache, host_key_cache, key_cache_size); + runtime->host_api.copy_to_device(dev_value_cache, host_value_cache, value_cache_size); + runtime->record_tensor_pair(host_out, dev_out, out_size); + + // Buffer sizes depend on q_tile_size and block_size + size_t sij_size = static_cast(q_tile_size) * block_size * sizeof(float); + size_t pij_size = static_cast(q_tile_size) * block_size * sizeof(uint16_t); + size_t mij_size = static_cast(q_tile_size) * sizeof(float); + size_t lij_size = mij_size; + size_t oi_new_size = static_cast(q_tile_size) * head_dim * sizeof(float); + + // Per-batch-per-block intermediate buffers + int total_buffers = batch * max_num_blocks; + void** dev_sij_arr = new void*[total_buffers]; + 
void** dev_pij_arr = new void*[total_buffers]; + void** dev_mij_arr = new void*[total_buffers]; + void** dev_lij_arr = new void*[total_buffers]; + void** dev_oi_new_arr = new void*[total_buffers]; + + for (int i = 0; i < total_buffers; i++) { + dev_sij_arr[i] = runtime->host_api.device_malloc(sij_size); + dev_pij_arr[i] = runtime->host_api.device_malloc(pij_size); + dev_mij_arr[i] = runtime->host_api.device_malloc(mij_size); + dev_lij_arr[i] = runtime->host_api.device_malloc(lij_size); + dev_oi_new_arr[i] = runtime->host_api.device_malloc(oi_new_size); + } + + // Per-(batch, head_tile) accumulators + int total_accums = batch * num_head_tiles; + size_t mi_size = static_cast(q_tile_size) * sizeof(float); + size_t li_size = mi_size; + size_t oi_size = static_cast(q_tile_size) * head_dim * sizeof(float); + + void** dev_mi_arr = new void*[total_accums]; + void** dev_li_arr = new void*[total_accums]; + void** dev_oi_arr = new void*[total_accums]; + + for (int i = 0; i < total_accums; i++) { + dev_mi_arr[i] = runtime->host_api.device_malloc(mi_size); + dev_li_arr[i] = runtime->host_api.device_malloc(li_size); + dev_oi_arr[i] = runtime->host_api.device_malloc(oi_size); + } + + std::cout << "Allocated " << total_buffers << " per-block buffers\n"; + std::cout << "Allocated " << total_accums << " per-(batch,head_tile) accumulators\n"; + + int total_tasks = 0; + + for (int b_idx = 0; b_idx < batch; b_idx++) { + int cur_seq = host_context_lens[b_idx]; + int bn_this_batch = (cur_seq + block_size - 1) / block_size; + + for (int ht = 0; ht < num_head_tiles; ht++) { + int cur_offset = ht * q_tile_size; + + // Query: (batch, q_head_num, head_dim) bf16 + // qi points to heads [cur_offset .. 
cur_offset+q_tile_size) for batch b_idx + uint8_t* qi_ptr = reinterpret_cast(dev_query) + + static_cast(b_idx * num_heads + cur_offset) * head_dim * sizeof(uint16_t); + + // Output: (batch * q_head_num, head_dim) float32 + uint8_t* out_ptr = reinterpret_cast(dev_out) + + static_cast(b_idx * num_heads + cur_offset) * head_dim * sizeof(float); + + // GQA: which kv_head this head tile maps to + int kv_head_idx = cur_offset / (num_heads / kv_head_num); + + // Per-(batch, head_tile) accumulators + int accum_idx = b_idx * num_head_tiles + ht; + void* dev_mi = dev_mi_arr[accum_idx]; + void* dev_li = dev_li_arr[accum_idx]; + void* dev_oi = dev_oi_arr[accum_idx]; + + int t_up_prev = -1; + + for (int bn = 0; bn < bn_this_batch; bn++) { + int cur_block_idx = host_block_table[b_idx * max_num_blocks + bn]; + int valid_len = std::min(block_size, cur_seq - bn * block_size); + + // Key: (total_blocks, block_size, kv_head_num, head_dim) bf16 + // Stride to block: cur_block_idx * (block_size * kv_head_num * head_dim) + // Then offset to kv_head: kv_head_idx * head_dim (within each token row) + // But since we want contiguous (block_size, head_dim), and kv_head_num=1 makes it simple: + uint8_t* kj_ptr = reinterpret_cast(dev_key_cache) + + (static_cast(cur_block_idx) * block_size * kv_head_num + kv_head_idx) + * head_dim * sizeof(uint16_t); + + // Value: (total_blocks, block_size, kv_head_num, head_dim) bf16 - same layout as key + uint8_t* vj_ptr = reinterpret_cast(dev_value_cache) + + (static_cast(cur_block_idx) * block_size * kv_head_num + kv_head_idx) + * head_dim * sizeof(uint16_t); + + int buf_idx = b_idx * max_num_blocks + bn; + void* dev_sij = dev_sij_arr[buf_idx]; + void* dev_pij = dev_pij_arr[buf_idx]; + void* dev_mij = dev_mij_arr[buf_idx]; + void* dev_lij = dev_lij_arr[buf_idx]; + void* dev_oi_new = dev_oi_new_arr[buf_idx]; + + // QK: qi(M, K) @ kj.T(K, N) -> sij(M, N) + uint64_t qk_args[6] = { + reinterpret_cast(qi_ptr), + reinterpret_cast(kj_ptr), + 
reinterpret_cast(dev_sij), + static_cast(q_tile_size), + static_cast(head_dim), + static_cast(block_size) + }; + int t_qk = runtime->add_task(qk_args, 6, FUNC_QK_MATMUL, CoreType::AIC); + total_tasks++; + + // SF: scale, rowmax, exp, rowsum -> pij, mij, lij + uint64_t sf_args[8] = { + reinterpret_cast(dev_sij), + scale_value_bits, + reinterpret_cast(dev_pij), + reinterpret_cast(dev_mij), + reinterpret_cast(dev_lij), + static_cast(q_tile_size), + static_cast(block_size), + static_cast(valid_len) + }; + int t_sf = runtime->add_task(sf_args, 8, FUNC_SOFTMAX_PREPARE, CoreType::AIV); + total_tasks++; + + // PV: pij(M, K') @ vj(K', N') -> oi_new(M, N') + uint64_t pv_args[6] = { + reinterpret_cast(dev_pij), + reinterpret_cast(vj_ptr), + reinterpret_cast(dev_oi_new), + static_cast(q_tile_size), + static_cast(block_size), + static_cast(head_dim) + }; + int t_pv = runtime->add_task(pv_args, 6, FUNC_PV_MATMUL, CoreType::AIC); + total_tasks++; + + runtime->add_successor(t_qk, t_sf); + runtime->add_successor(t_sf, t_pv); + + // Online Update: serialized across blocks (each depends on previous) + int is_first = (bn == 0) ? 1 : 0; + int is_last = (bn == bn_this_batch - 1) ? 
1 : 0; + + uint64_t up_args[11] = { + reinterpret_cast(dev_mij), + reinterpret_cast(dev_lij), + reinterpret_cast(dev_oi_new), + reinterpret_cast(dev_mi), + reinterpret_cast(dev_li), + reinterpret_cast(dev_oi), + static_cast(is_first), + static_cast(is_last), + reinterpret_cast(out_ptr), + static_cast(q_tile_size), + static_cast(head_dim) + }; + int t_up = runtime->add_task(up_args, 11, FUNC_ONLINE_UPDATE, CoreType::AIV); + total_tasks++; + + runtime->add_successor(t_pv, t_up); + if (t_up_prev >= 0) { + runtime->add_successor(t_up_prev, t_up); + } + t_up_prev = t_up; + } + } + } + + delete[] dev_sij_arr; + delete[] dev_pij_arr; + delete[] dev_mij_arr; + delete[] dev_lij_arr; + delete[] dev_oi_new_arr; + delete[] dev_mi_arr; + delete[] dev_li_arr; + delete[] dev_oi_arr; + + std::cout << "Created " << total_tasks << " tasks\n"; + runtime->print_runtime(); + + return 0; +} + +} diff --git a/tests/device_tests/a5/tensormap_and_ringbuffer/paged_attention/golden.py b/tests/device_tests/a5/tensormap_and_ringbuffer/paged_attention/golden.py new file mode 100644 index 00000000..e6e1d9b8 --- /dev/null +++ b/tests/device_tests/a5/tensormap_and_ringbuffer/paged_attention/golden.py @@ -0,0 +1,55 @@ +"""Paged Attention Golden - tensormap_and_ringbuffer test (production scale, bfloat16).""" + +from paged_attention_golden import ( + generate_inputs as _generate_inputs, + compute_golden, + run_golden_test, +) + +__outputs__ = ["out"] + +RTOL = 1e-3 +ATOL = 1e-3 + +ALL_CASES = { + "Case1": { + "batch": 64, + "num_heads": 16, + "kv_head_num": 1, + "head_dim": 128, + "block_size": 128, + "context_len": 8192, + "max_model_len": 32768, + "dtype": "bfloat16", + }, + "Case2": { + "batch": 64, + "num_heads": 64, + "kv_head_num": 1, + "head_dim": 128, + "block_size": 64, + "context_len": 8192, + "max_model_len": 32768, + "dtype": "bfloat16", + }, + "Case3": { + "batch": 64, + "num_heads": 64, + "kv_head_num": 1, + "head_dim": 256, + "block_size": 64, + "context_len": 8192, + "max_model_len": 
32768, + "dtype": "bfloat16", + }, +} + +DEFAULT_CASE = "Case1" + + +def generate_inputs(params: dict) -> list: + return _generate_inputs(params, return_all_sizes=False) + + +if __name__ == "__main__": + run_golden_test(ALL_CASES, DEFAULT_CASE, generate_inputs) diff --git a/tests/device_tests/a5/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_hub.cpp b/tests/device_tests/a5/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_hub.cpp new file mode 100644 index 00000000..0974de37 --- /dev/null +++ b/tests/device_tests/a5/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_hub.cpp @@ -0,0 +1,18 @@ +#include +#include + +using namespace pto; + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#define __aicore__ [aicore] +#endif + +constexpr int M = 16; +constexpr int K = 16; +constexpr int N = 16; + +extern "C" __aicore__ void kernel_entry(__gm__ int64_t* args) {} diff --git a/tests/device_tests/a5/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_pv_matmul.cpp b/tests/device_tests/a5/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_pv_matmul.cpp new file mode 100644 index 00000000..dc9499cf --- /dev/null +++ b/tests/device_tests/a5/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_pv_matmul.cpp @@ -0,0 +1,97 @@ +// PV Matmul Kernel: pij(M, K) @ vj(K, N) -> oi_new(M, N) +// +// Supports two tile configurations via runtime dispatch: +// Case1: (16, 128) @ (128, 128) -> (16, 128) +// Case2: (64, 64) @ ( 64, 128) -> (64, 128) +// +// pij is bfloat16 (converted from fp32 in softmax_prepare via TCVT). +// vj is stored as (K, N) = (block_size, head_dim) in row-major (ND) layout. +// Standard non-transposed B pattern: ND GlobalB + ColMajor/RowMajor TileMatB. 
+ +#include +#include + +#include "tensor.h" + +using namespace pto; + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#define __aicore__ [aicore] +#endif + +template +static __aicore__ void pv_matmul_impl(__gm__ Tensor* pij, __gm__ Tensor* vj, __gm__ Tensor* oi) { + __gm__ bfloat16_t* pij_addr = reinterpret_cast<__gm__ bfloat16_t*>(pij->buffer.addr); + __gm__ bfloat16_t* vj_addr = reinterpret_cast<__gm__ bfloat16_t*>(vj->buffer.addr); + __gm__ float* oi_addr = reinterpret_cast<__gm__ float*>(oi->buffer.addr); + + // pij (M, K) bf16, vj (K, N) bf16 in ND (row-major), oi_new (M, N) fp32 + using GlobalA = GlobalTensor, pto::Stride>; + using GlobalB = GlobalTensor, pto::Stride>; + using GlobalOut = GlobalTensor, pto::Stride>; + + GlobalA pijGlobal(pij_addr + pij->start_offset); + GlobalB vjGlobal(vj_addr + vj->start_offset); + GlobalOut oiGlobal(oi_addr + oi->start_offset); + + // L1 Mat tiles: standard ND pattern for both A and B + using TileMatA = Tile; + using TileMatB = Tile; + + // L0 tiles + using LeftTile = TileLeft; + using RightTile = TileRight; + using AccTile = TileAcc; + + TileMatA aMatTile; + TileMatB bMatTile; + TASSIGN(aMatTile, 0x0); + TASSIGN(bMatTile, 0x20000); + + LeftTile aTile; + RightTile bTile; + AccTile cTile; + TASSIGN(aTile, 0x0); + TASSIGN(bTile, 0x0); + TASSIGN(cTile, 0x0); + + // Load pij and vj to L1 + TLOAD(aMatTile, pijGlobal); + TLOAD(bMatTile, vjGlobal); + + set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); + + // Move to L0A/L0B + TMOV(aTile, aMatTile); + TMOV(bTile, bMatTile); + + set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); + wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); + + // Single matmul: (M,K) x (K,N) -> (M,N) + TMATMUL(cTile, aTile, bTile); + + set_flag(PIPE_M, PIPE_FIX, EVENT_ID0); + wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0); + + TSTORE(oiGlobal, cTile); +} + +extern "C" __aicore__ void kernel_entry(__gm__ int64_t* args) { + __gm__ Tensor* pij = reinterpret_cast<__gm__ 
Tensor*>(args[0]); + __gm__ Tensor* vj = reinterpret_cast<__gm__ Tensor*>(args[1]); + __gm__ Tensor* oi_new = reinterpret_cast<__gm__ Tensor*>(args[2]); + uint64_t q_tile_size = static_cast(pij->shapes[0]); + // args[4] = block_size, args[5] = head_dim + + if (q_tile_size == 16) { + pv_matmul_impl<16, 128, 128>(pij, vj, oi_new); + } else { + pv_matmul_impl<64, 64, 128>(pij, vj, oi_new); + } +} diff --git a/tests/device_tests/a5/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_qk_matmul.cpp b/tests/device_tests/a5/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_qk_matmul.cpp new file mode 100644 index 00000000..b9f17ecb --- /dev/null +++ b/tests/device_tests/a5/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_qk_matmul.cpp @@ -0,0 +1,98 @@ +// QK Matmul Kernel: qi(M, K) @ kj.T(K, N) -> sij(M, N) +// +// Supports two tile configurations via runtime dispatch: +// Case1: (16, 128) @ (128, 128).T -> (16, 128) +// Case2: (64, 128) @ (128, 64).T -> (64, 64) +// +// kj is stored as (N, K) = (block_size, head_dim) in row-major memory. +// This is equivalent to (K, N) in column-major (DN) layout. +// Using DN GlobalB + RowMajor/ColMajor TileMatB to handle the transposed B pattern. 
+ +#include +#include + +#include "tensor.h" + +using namespace pto; + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#define __aicore__ [aicore] +#endif + +template +static __aicore__ void qk_matmul_impl(__gm__ Tensor* qi, __gm__ Tensor* kj, __gm__ Tensor* sij) { + __gm__ bfloat16_t* qi_addr = reinterpret_cast<__gm__ bfloat16_t*>(qi->buffer.addr); + __gm__ bfloat16_t* kj_addr = reinterpret_cast<__gm__ bfloat16_t*>(kj->buffer.addr); + __gm__ float* sij_addr = reinterpret_cast<__gm__ float*>(sij->buffer.addr); + + // qi (M, K) bf16 in ND (row-major) layout + using GlobalA = GlobalTensor, pto::Stride>; + // kj stored as (N, K) row-major = (K, N) column-major -> DN layout + using GlobalB = GlobalTensor, pto::Stride, Layout::DN>; + using GlobalOut = GlobalTensor, pto::Stride>; + + GlobalA qiGlobal(qi_addr + qi->start_offset); + GlobalB kjGlobal(kj_addr + kj->start_offset); + GlobalOut sijGlobal(sij_addr + sij->start_offset); + + // L1 Mat tiles: A is standard ND, B uses transposed-B pattern (RowMajor/ColMajor) + using TileMatA = Tile; + using TileMatB = Tile; + + // L0 tiles + using LeftTile = TileLeft; + using RightTile = TileRight; + using AccTile = TileAcc; + + TileMatA aMatTile; + TileMatB bMatTile; + TASSIGN(aMatTile, 0x0); + TASSIGN(bMatTile, 0x20000); + + LeftTile aTile; + RightTile bTile; + AccTile cTile; + TASSIGN(aTile, 0x0); + TASSIGN(bTile, 0x0); + TASSIGN(cTile, 0x0); + + // // Load A and B to L1 + TLOAD(aMatTile, qiGlobal); + TLOAD(bMatTile, kjGlobal); + + set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); + + // Move from L1 to L0A/L0B + TMOV(aTile, aMatTile); + TMOV(bTile, bMatTile); + + set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); + wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); + + // Matmul + TMATMUL(cTile, aTile, bTile); + + set_flag(PIPE_M, PIPE_FIX, EVENT_ID0); + wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0); + + TSTORE(sijGlobal, cTile); +} + +extern "C" __aicore__ void kernel_entry(__gm__ int64_t* args) { + 
__gm__ Tensor* qi = reinterpret_cast<__gm__ Tensor*>(args[0]); + __gm__ Tensor* kj = reinterpret_cast<__gm__ Tensor*>(args[1]); + __gm__ Tensor* sij = reinterpret_cast<__gm__ Tensor*>(args[2]); + uint64_t q_tile_size = static_cast(qi->shapes[0]); + // args[4] = head_dim (128), args[5] = block_size + + if (q_tile_size == 16) { + qk_matmul_impl<16, 128, 128>(qi, kj, sij); + } else { + qk_matmul_impl<64, 128, 64>(qi, kj, sij); + } +} diff --git a/tests/device_tests/a5/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_hub.cpp b/tests/device_tests/a5/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_hub.cpp new file mode 100644 index 00000000..0974de37 --- /dev/null +++ b/tests/device_tests/a5/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_hub.cpp @@ -0,0 +1,18 @@ +#include +#include + +using namespace pto; + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#define __aicore__ [aicore] +#endif + +constexpr int M = 16; +constexpr int K = 16; +constexpr int N = 16; + +extern "C" __aicore__ void kernel_entry(__gm__ int64_t* args) {} diff --git a/tests/device_tests/a5/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_online_update.cpp b/tests/device_tests/a5/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_online_update.cpp new file mode 100644 index 00000000..3c4d227f --- /dev/null +++ b/tests/device_tests/a5/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_online_update.cpp @@ -0,0 +1,232 @@ +// Online Softmax Update + Normalize Kernel (AIV) +// +// Operates on full tiles where M=q_tile_size, N=head_dim (128): +// Case1: oi/oi_new are (16, 128), mij/lij/mi/li are 16-element vectors +// Case2: oi/oi_new are (64, 128), mij/lij/mi/li are 64-element vectors +// +// Scalar layout strategy: +// M scalar floats stored contiguously in GM can be loaded as either: +// - ND (kScalarRows, kScalarCols) RowMajor for element-wise ops (TMAX, TSUB, TEXP, TMUL, TADD) +// - DN (kAlignedRows, 1) ColMajor for row-broadcast ops 
(TROWEXPANDMUL, TROWEXPANDDIV) +// Conversion between layouts uses GM round-trip: ND TSTORE → DN TLOAD. + +#include +#include + +#include "tensor.h" + +using namespace pto; + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#define __aicore__ [aicore] +#endif + +template +static __aicore__ void online_update_impl(__gm__ Tensor* mij, + __gm__ Tensor* lij, + __gm__ Tensor* oi_new, + __gm__ Tensor* mi, + __gm__ Tensor* li, + __gm__ Tensor* oi, + uint64_t is_first, + uint64_t is_last, + __gm__ Tensor* dst) { + __gm__ float* mij_ptr = reinterpret_cast<__gm__ float*>(mij->buffer.addr); + __gm__ float* lij_ptr = reinterpret_cast<__gm__ float*>(lij->buffer.addr); + __gm__ float* oi_new_ptr = reinterpret_cast<__gm__ float*>(oi_new->buffer.addr); + __gm__ float* mi_ptr = reinterpret_cast<__gm__ float*>(mi->buffer.addr); + __gm__ float* li_ptr = reinterpret_cast<__gm__ float*>(li->buffer.addr); + __gm__ float* oi_ptr = reinterpret_cast<__gm__ float*>(oi->buffer.addr); + __gm__ float* dst_ptr = reinterpret_cast<__gm__ float*>(dst->buffer.addr); + + // Scalar tile dimensions for RowMajor layout: + // kScalarCols = 32 bytes / 4 bytes per float = 8 floats per row (one 32-byte block) + // kScalarRows = M / 8 (M=16 → 2 rows, M=64 → 8 rows) + constexpr int kScalarCols = 32 / sizeof(float); + constexpr int kScalarRows = M / kScalarCols; + // Aligned rows for ColMajor DN tiles (32-byte alignment) + constexpr int kAlignedRows = ((M * sizeof(float) + 31) / 32) * (32 / sizeof(float)); + + // --- GlobalTensor types --- + + // Data (M, N) RowMajor + using GlobalDataMxN = GlobalTensor, pto::Stride<1, 1, 1, N, 1>>; + + // Scalar ND: M contiguous floats as (kScalarRows, kScalarCols) RowMajor + using GlobalScalarND = + GlobalTensor, pto::Stride<1, 1, 1, kScalarCols, 1>>; + + // Scalar DN: same M contiguous floats as (kAlignedRows, 1) ColMajor + using GlobalScalarDN = GlobalTensor, pto::Stride<1, 1, 1, 1, 1>, Layout::DN>; + + // --- GlobalTensor instances --- + + GlobalDataMxN 
oiNewGlobal(oi_new_ptr + oi_new->start_offset); + GlobalDataMxN oiGlobal(oi_ptr + oi->start_offset); + GlobalDataMxN dstGlobal(dst_ptr + dst->start_offset); + + // ND globals for scalar element-wise operations + GlobalScalarND mijGlobalND(mij_ptr + mij->start_offset); + GlobalScalarND lijGlobalND(lij_ptr + lij->start_offset); + GlobalScalarND miGlobalND(mi_ptr + mi->start_offset); + GlobalScalarND liGlobalND(li_ptr + li->start_offset); + + // DN globals aliased to same GM for ColMajor reload (used after ND TSTORE) + GlobalScalarDN mijGlobalDN(mij_ptr + mij->start_offset); + GlobalScalarDN lijGlobalDN(lij_ptr + lij->start_offset); + GlobalScalarDN liGlobalDN(li_ptr + li->start_offset); + + // --- Tile types --- + + using TileDataMxN = Tile; + using TileScalarND = + Tile; + using TileScalarDN = Tile; + + // --- UB memory layout --- + + constexpr int kDataBytes = M * N * sizeof(float); + constexpr int kScalarNDBytes = kScalarRows * kScalarCols * sizeof(float); + constexpr int kScalarDNBytes = kAlignedRows * sizeof(float); + + // Data tiles + TileDataMxN oiNewTile; + TileDataMxN oiTile; + + // Scalar ND tiles for element-wise arithmetic + TileScalarND mijND, lijND, miND, liND; + TileScalarND miNewND, alphaND, betaND, tmpND; + + // Scalar DN tiles for TROWEXPAND operations + TileScalarDN alphaDN, betaDN, liDN; + + TASSIGN(oiNewTile, 0); + TASSIGN(oiTile, kDataBytes); + TASSIGN(mijND, 2 * kDataBytes); + TASSIGN(lijND, 2 * kDataBytes + kScalarNDBytes); + TASSIGN(miND, 2 * kDataBytes + 2 * kScalarNDBytes); + TASSIGN(liND, 2 * kDataBytes + 3 * kScalarNDBytes); + TASSIGN(miNewND, 2 * kDataBytes + 4 * kScalarNDBytes); + TASSIGN(alphaND, 2 * kDataBytes + 5 * kScalarNDBytes); + TASSIGN(betaND, 2 * kDataBytes + 6 * kScalarNDBytes); + TASSIGN(tmpND, 2 * kDataBytes + 7 * kScalarNDBytes); + TASSIGN(alphaDN, 2 * kDataBytes + 8 * kScalarNDBytes); + TASSIGN(betaDN, 2 * kDataBytes + 8 * kScalarNDBytes + kScalarDNBytes); + TASSIGN(liDN, 2 * kDataBytes + 8 * kScalarNDBytes + 2 * 
kScalarDNBytes); + + if (is_first) { + // --- First block: copy inputs to accumulators --- + TLOAD(oiNewTile, oiNewGlobal); + TLOAD(mijND, mijGlobalND); + TLOAD(lijND, lijGlobalND); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + + // Passthrough to MTE3 (no V compute needed) + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(miGlobalND, mijND); // mi = mij + TSTORE(liGlobalND, lijND); // li = lij + TSTORE(oiGlobal, oiNewTile); // oi = oi_new + + if (is_last) { + // Single block: normalize dst = oi_new / lij + // lij stored to li buffer in ND format; reload as DN for TROWEXPANDDIV + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + TLOAD(liDN, liGlobalDN); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TROWEXPANDDIV(oiNewTile, oiNewTile, liDN); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); + TSTORE(dstGlobal, oiNewTile); + } + } else { + // --- Subsequent blocks: accumulate --- + + // Phase 1: Load all inputs + TLOAD(oiNewTile, oiNewGlobal); + TLOAD(oiTile, oiGlobal); + TLOAD(mijND, mijGlobalND); + TLOAD(lijND, lijGlobalND); + TLOAD(miND, miGlobalND); + TLOAD(liND, liGlobalND); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + + // Phase 2: Scalar arithmetic in RowMajor (kScalarRows, kScalarCols) + // to resolve RAW hazards on shared UB tiles. 
+ TMAX(miNewND, miND, mijND); // mi_new = max(mi, mij) + TSUB(alphaND, miND, miNewND); // alpha = mi - mi_new + TEXP(alphaND, alphaND); // alpha = exp(mi - mi_new) + TSUB(betaND, mijND, miNewND); // beta = mij - mi_new + TEXP(betaND, betaND); // beta = exp(mij - mi_new) + TMUL(liND, alphaND, liND); // li = alpha * li + TMUL(tmpND, betaND, lijND); // tmp = beta * lij + TADD(liND, liND, tmpND); // li = alpha * li + beta * lij (= li_new) + + // Phase 3: Store scalar results to GM (ND format) + // mi_new → mi accumulator, li_new → li accumulator + // alpha → mij buffer (reuse), beta → lij buffer (reuse) + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(miGlobalND, miNewND); // persist mi_new + TSTORE(liGlobalND, liND); // persist li_new + TSTORE(mijGlobalND, alphaND); // temp: alpha to mij buffer + TSTORE(lijGlobalND, betaND); // temp: beta to lij buffer + + // Phase 4: Reload alpha, beta (and li if last) as ColMajor DN + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + TLOAD(alphaDN, mijGlobalDN); // alpha from mij buffer as DN + TLOAD(betaDN, lijGlobalDN); // beta from lij buffer as DN + if (is_last) { + TLOAD(liDN, liGlobalDN); // li_new from li buffer as DN + } + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + + // Phase 5: Scale data tiles using row-broadcast multiply + TROWEXPANDMUL(oiTile, oiTile, alphaDN); // oi *= alpha + TROWEXPANDMUL(oiNewTile, oiNewTile, betaDN); // oi_new *= beta + TADD(oiTile, oiTile, oiNewTile); // oi = alpha*oi + beta*oi_new + + if (is_last) { + // Phase 6: Normalize and output + TROWEXPANDDIV(oiTile, oiTile, liDN); // dst = oi / li_new + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); + TSTORE(dstGlobal, oiTile); + } else { + // Phase 6: Store updated accumulators + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); + TSTORE(oiGlobal, oiTile); + } + 
} +} + +extern "C" __aicore__ void kernel_entry(__gm__ int64_t* args) { + __gm__ Tensor* mij = reinterpret_cast<__gm__ Tensor*>(args[0]); + __gm__ Tensor* lij = reinterpret_cast<__gm__ Tensor*>(args[1]); + __gm__ Tensor* oi_new = reinterpret_cast<__gm__ Tensor*>(args[2]); + __gm__ Tensor* mi = reinterpret_cast<__gm__ Tensor*>(args[3]); + __gm__ Tensor* li = reinterpret_cast<__gm__ Tensor*>(args[4]); + __gm__ Tensor* oi = reinterpret_cast<__gm__ Tensor*>(args[5]); + __gm__ Tensor* dst = reinterpret_cast<__gm__ Tensor*>(args[6]); + uint64_t is_first = static_cast(args[7]); + uint64_t is_last = static_cast(args[8]); + uint64_t q_tile_size = static_cast(mij->shapes[0]); + // args[10] = head_dim (128) + + if (q_tile_size == 16) { + online_update_impl<16, 128>(mij, lij, oi_new, mi, li, oi, is_first, is_last, dst); + } else { + online_update_impl<64, 128>(mij, lij, oi_new, mi, li, oi, is_first, is_last, dst); + } +} diff --git a/tests/device_tests/a5/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp b/tests/device_tests/a5/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp new file mode 100644 index 00000000..eec1d4dd --- /dev/null +++ b/tests/device_tests/a5/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp @@ -0,0 +1,128 @@ +// Softmax Preparation Kernel (AIV) with partial block masking +// +// Operates on (M, N) tile where M=q_tile_size, N=block_size: +// Case1: sij is (16, 128) +// Case2: sij is (64, 64) +// +// For partial blocks (valid_len < N), positions [valid_len, N) in sij are +// filled with -inf via TFILLPAD_INPLACE before softmax, ensuring exp(-inf)=0 +// so that invalid key positions contribute zero attention weight. 
+// +// Computes: +// sij_masked = TFILLPAD(sij, valid_len, pad=-inf) +// sij_scale = sij_masked * scale +// mij = row_max(sij_scale) -> (M, 1) +// pij = exp(sij_scale - mij) -> (M, N) +// lij = row_sum(pij) -> (M, 1) + +#include +#include + +#include "tensor.h" + +using namespace pto; + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#define __aicore__ [aicore] +#endif + +template +static __aicore__ void softmax_prepare_impl(__gm__ Tensor* sij, + float scale_value, + __gm__ Tensor* pij, + __gm__ Tensor* mij, + __gm__ Tensor* lij) { + uint64_t valid_len = static_cast(sij->shapes[1]); + __gm__ float* sij_addr = reinterpret_cast<__gm__ float*>(sij->buffer.addr); + __gm__ bfloat16_t* pij_addr = reinterpret_cast<__gm__ bfloat16_t*>(pij->buffer.addr); + __gm__ float* mij_addr = reinterpret_cast<__gm__ float*>(mij->buffer.addr); + __gm__ float* lij_addr = reinterpret_cast<__gm__ float*>(lij->buffer.addr); + + constexpr int kAlignedRows = ((M * sizeof(float) + 31) / 32) * (32 / sizeof(float)); + + using GlobalDataMxN = GlobalTensor, pto::Stride<1, 1, 1, N, 1>>; + using GlobalDataMxN_bf16 = GlobalTensor, pto::Stride<1, 1, 1, N, 1>>; + using GlobalScalarDN = GlobalTensor, pto::Stride<1, 1, 1, 1, 1>, Layout::DN>; + + GlobalDataMxN sijGlobal(sij_addr + sij->start_offset); + GlobalDataMxN_bf16 pijGlobal(pij_addr + pij->start_offset); + GlobalScalarDN mijGlobal(mij_addr + mij->start_offset); + GlobalScalarDN lijGlobal(lij_addr + lij->start_offset); + + // Dynamic-cols tile: marks which columns are valid for TFILLPAD boundary + using TileSijDyn = Tile; + // Padded tile: TFILLPAD_INPLACE fills positions [valid_len, N) with -inf + using TileSijPad = Tile; + + using TileVecMxN = Tile; + using TileVecMxN_bf16 = Tile; + using TileScalarDN = Tile; + + TileVecMxN sijTile; + TileSijDyn sijDynTile(static_cast(valid_len)); + TileSijPad sijPadTile; + TileVecMxN pijTile; + TileVecMxN tmpTile; + TileScalarDN maxTile; + TileScalarDN sumTile; + TileVecMxN_bf16 pijBf16Tile; + + 
// All sij tiles share UB address 0x0 (in-place masking) + TASSIGN(sijTile, 0x0); + TASSIGN(sijDynTile, 0x0); + TASSIGN(sijPadTile, 0x0); + TASSIGN(pijTile, M * N * sizeof(float)); + TASSIGN(tmpTile, 2 * M * N * sizeof(float)); + TASSIGN(maxTile, 3 * M * N * sizeof(float)); + TASSIGN(sumTile, 3 * M * N * sizeof(float) + kAlignedRows * sizeof(float)); + TASSIGN(pijBf16Tile, 3 * M * N * sizeof(float) + 2 * kAlignedRows * sizeof(float)); + + // Load full sij (M, N) tile from GM - all N columns including garbage for partial blocks + // printf("sij addr incore %x\n", sij->buffer.addr); + TLOAD(sijTile, sijGlobal); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + + // Mask columns [valid_len, N) with -inf. sijDynTile provides the valid boundary, + // sijPadTile provides PadValue::Min as the fill value. No-op when valid_len == N. + TFILLPAD_INPLACE(sijPadTile, sijDynTile); + + TMULS(sijTile, sijTile, scale_value); + TROWMAX(maxTile, sijTile, tmpTile); + TROWEXPANDSUB(pijTile, sijTile, maxTile); + TEXP(pijTile, pijTile); + // Truncate pij to bf16 first, then compute lij from truncated values (matches golden) + TCVT(pijBf16Tile, pijTile, RoundMode::CAST_ROUND); + TCVT(pijTile, pijBf16Tile, RoundMode::CAST_ROUND); + TROWSUM(sumTile, pijTile, tmpTile); + + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(mijGlobal, maxTile); + TSTORE(lijGlobal, sumTile); + TSTORE(pijGlobal, pijBf16Tile); +} + +extern "C" __aicore__ void kernel_entry(__gm__ int64_t* args) { + __gm__ Tensor* sij = reinterpret_cast<__gm__ Tensor*>(args[0]); + union { + uint64_t u; + float f; + } scale_conv; + scale_conv.u = static_cast(args[1]); + float scale_value = scale_conv.f; + __gm__ Tensor* pij = reinterpret_cast<__gm__ Tensor*>(args[2]); + __gm__ Tensor* mij = reinterpret_cast<__gm__ Tensor*>(args[3]); + __gm__ Tensor* lij = reinterpret_cast<__gm__ Tensor*>(args[4]); + uint64_t q_tile_size = static_cast(sij->shapes[0]); + + 
if (q_tile_size == 16) { + softmax_prepare_impl<16, 128>(sij, scale_value, pij, mij, lij); + } else { + softmax_prepare_impl<64, 64>(sij, scale_value, pij, mij, lij); + } +} diff --git a/tests/device_tests/a5/tensormap_and_ringbuffer/paged_attention/kernels/kernel_config.py b/tests/device_tests/a5/tensormap_and_ringbuffer/paged_attention/kernels/kernel_config.py new file mode 100644 index 00000000..dbd5064c --- /dev/null +++ b/tests/device_tests/a5/tensormap_and_ringbuffer/paged_attention/kernels/kernel_config.py @@ -0,0 +1,45 @@ +""" +Paged Attention Kernel and Orchestration Configuration + +Defines the kernels and orchestration function for paged attention +with AIC/AIV subgraph splitting: + +AIC Kernels (Matrix Multiplication): + - aic_qk_matmul: Q @ K^T computation + - aic_pv_matmul: P @ V computation + +AIV Kernels (Vector Operations): + - aiv_softmax_prepare: scale, rowmax, exp, rowsum + - aiv_online_update: online softmax accumulation + fused normalization + +Note: aiv_normalize has been merged into aiv_online_update for efficiency. 
+""" + +from pathlib import Path + +_KERNELS_ROOT = Path(__file__).parent + +# Orchestration config +ORCHESTRATION = { + "source": str(_KERNELS_ROOT / "orchestration" / "paged_attention_orch.cpp"), + "function_name": "build_paged_attention_graph", +} + +# Kernel configs (aiv_normalize removed - merged into aiv_online_update) +KERNELS = [ + # AIC kernels (matrix multiplication using Cube unit) + {"func_id": 0, "name": "QK", "source": str(_KERNELS_ROOT / "aic" / "aic_qk_matmul.cpp"), "core_type": "aic"}, + {"func_id": 2, "name": "PV", "source": str(_KERNELS_ROOT / "aic" / "aic_pv_matmul.cpp"), "core_type": "aic"}, + {"func_id": 4, "name": "AIC_HUB", "source": str(_KERNELS_ROOT / "aic" / "aic_hub.cpp"), "core_type": "aic"}, + # AIV kernels (vector operations) + {"func_id": 1, "name": "SF", "source": str(_KERNELS_ROOT / "aiv" / "aiv_softmax_prepare.cpp"), "core_type": "aiv"}, + {"func_id": 3, "name": "UP", "source": str(_KERNELS_ROOT / "aiv" / "aiv_online_update.cpp"), "core_type": "aiv"}, + {"func_id": 5, "name": "AIV_HUB", "source": str(_KERNELS_ROOT / "aiv" / "aiv_hub.cpp"), "core_type": "aiv"}, +] + +# Runtime configuration +RUNTIME_CONFIG = { + "runtime": "tensormap_and_ringbuffer", + "aicpu_thread_num": 4, + "block_dim": 24, +} diff --git a/tests/device_tests/a5/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp b/tests/device_tests/a5/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp new file mode 100644 index 00000000..a3417a8c --- /dev/null +++ b/tests/device_tests/a5/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp @@ -0,0 +1,294 @@ +/** + * Paged Attention Orchestration Function - 16x16 Version + * + * Simplified for 16x16 framework-generated matmul kernels. + * Each block processes a single 16x16 matmul operation. 
+ * + * Memory Layout: + * Query: (batch, 16, 16) - one 16x16 tile per batch + * Key: (total_blocks, 16, 16) - stored as K^T for direct matmul + * Value: (total_blocks, 16, 16) - direct format + */ + +#include +#include + +#include "pto_orchestration_api.h" + +#define FUNC_QK_MATMUL 0 +#define FUNC_SOFTMAX_PREPARE 1 +#define FUNC_PV_MATMUL 2 +#define FUNC_ONLINE_UPDATE 3 +#define FUNC_AIC_HUB 4 +#define FUNC_AIV_HUB 5 + +constexpr uint64_t PLATFORM_PROF_SYS_CNT_FREQ = 50000000; // 50 MHz + +inline double cycles_to_us(uint64_t cycles) { + return (static_cast(cycles) / PLATFORM_PROF_SYS_CNT_FREQ) * 1000000.0; +}; + +inline uint64_t get_sys_cnt_aicpu() { + uint64_t ticks; + asm volatile("mrs %0, cntvct_el0" : "=r"(ticks)); + return ticks; +} + +#define CYCLE_COUNT_START() uint64_t _t0 = get_sys_cnt_aicpu(), _t1 +#define CYCLE_COUNT_LAP(acc) do { _t1 = get_sys_cnt_aicpu(); acc += (_t1 - _t0); _t0 = _t1; } while(0) + +// Helper to encode float as uint64_t for scalar params +static uint64_t float_to_u64(float f) { + union { + float f32; + uint64_t u64; + } conv; + conv.u64 = 0; // Clear upper bits + conv.f32 = f; + return conv.u64; +} + +extern "C" { +/** + * Orchestration config — the executor reads these values to set up + * shared memory and runtime before calling aicpu_orchestration_entry. 
+ */ +__attribute__((visibility("default"))) PTO2OrchestrationConfig aicpu_orchestration_config( + uint64_t* args, int arg_count) { + (void)args; + (void)arg_count; + return PTO2OrchestrationConfig{ + .expected_arg_count = 10, + }; +} + +__attribute__((visibility("default"))) void aicpu_orchestration_entry(PTO2Runtime* rt, uint64_t* args, int arg_count, int orch_thread_num, int orch_thread_index) { + (void)orch_thread_num; + (void)orch_thread_index; + uint64_t prof_param_extract = 0; + uint64_t prof_ext_tensor = 0; + uint64_t prof_scope = 0; + uint64_t prof_make_tensor = 0; + uint64_t prof_tensor_view = 0; + uint64_t prof_param_setup = 0; + uint64_t prof_submit_task = 0; + int prof_submit_count = 0; + int prof_make_count = 0; + int prof_view_count = 0; + + CYCLE_COUNT_START(); + + // Extract device pointers + // Extract pointers (first 7) + void* host_query = reinterpret_cast(args[0]); // [batch, num_heads, head_dim] + void* host_key_cache = reinterpret_cast(args[1]); // [batch, block_num, block_size, head_dim] + void* host_value_cache = reinterpret_cast(args[2]); // [batch, block_num, block_size, head_dim] + int* host_block_table = reinterpret_cast(args[3]); // [batch, block_num] + int* host_context_lens = reinterpret_cast(args[4]); // [batch] + void* host_out = reinterpret_cast(args[5]); // [batch, num_heads, head_dim] + int64_t* host_config = reinterpret_cast(args[6]); + + // Extract sizes (next 3) + size_t query_size = static_cast(args[7]); + size_t key_cache_size = static_cast(args[8]); + size_t value_cache_size = static_cast(args[9]); + + // Extract config parameters + uint64_t batch = static_cast(static_cast(host_config[0])); + uint64_t num_heads = static_cast(static_cast(host_config[1])); + int kv_head_num = static_cast(host_config[2]); + uint64_t head_dim = static_cast(static_cast(host_config[3])); + uint64_t block_size = static_cast(static_cast(host_config[4])); + uint64_t block_num = static_cast(static_cast(host_config[5])); + union { + uint32_t u; + 
float f; + } scale_conv; + scale_conv.u = static_cast(host_config[6]); + float scale_value = scale_conv.f; + uint64_t q_head_num = num_heads; + uint64_t q_tile = std::min(num_heads, 128UL); + uint64_t q_loop = (q_head_num + q_tile - 1) / q_tile; + DataType data_type = DataType::BFLOAT16; // 用例是float32的,这个考虑要如何扩展成其他类型 + CYCLE_COUNT_LAP(prof_param_extract); + + LOG_ALWAYS(rt, ">>>>>> batch = %lu", (unsigned long)batch); + + // query_size = batch * num_heads * head_dim * data_type + // key_cache_size = batch * block_num * block_size * head_dim * data_type + // value_cache_size = batch * block_num * block_size * head_dim * data_type + // out = batch * num_heads * head_dim * data_type + uint64_t query_shapes[2] = {batch * num_heads, head_dim}; + uint64_t key_cache_shapes[2] = {batch * block_num * block_size, head_dim}; + uint64_t value_cache_shapes[2] = {batch * block_num * block_size, head_dim}; + uint64_t out_shapes[2] = {batch * num_heads, head_dim}; + Tensor query = make_tensor_external(host_query, query_shapes, 2, data_type); + Tensor key_cache = make_tensor_external(host_key_cache, key_cache_shapes, 2, data_type); + Tensor value_cache = make_tensor_external(host_value_cache, value_cache_shapes, 2, data_type); + // Tensor block_table = make_tensor_external(host_block_table, block_table_size); + // Tensor context_lens = make_tensor_external(host_context_lens, context_lens_size); + Tensor out = make_tensor_external(host_out, out_shapes, 2, DataType::FLOAT32); + CYCLE_COUNT_LAP(prof_ext_tensor); + // LOG_DEBUG(rt, "query=%s", query.dump().c_str()); + // LOG_DEBUG(rt, "key_cache=%s", key_cache.dump().c_str()); + // LOG_DEBUG(rt, "value_cache=%s", value_cache.dump().c_str()); + // LOG_DEBUG(rt, "out=%s", out.dump().c_str()); + + int total_tasks = 0; + + for (uint64_t b_idx = 0; b_idx < batch; b_idx++) { + uint64_t cur_seq = host_context_lens[b_idx]; + uint64_t bn_this_batch = (cur_seq + block_size - 1) / block_size; + for (uint64_t q_idx = 0; q_idx < q_loop; q_idx++) { 
+ PTO2_SCOPE(rt) { + CYCLE_COUNT_LAP(prof_scope); + uint64_t cur_offset = b_idx * q_head_num + q_idx * q_tile; + + uint64_t oi_shapes[2] = {q_tile, head_dim}; + uint64_t li_shapes[1] = {q_tile}; + uint64_t mi_shapes[1] = {q_tile}; + Tensor oi = make_tensor(oi_shapes, 2, DataType::FLOAT32); + Tensor li_update = make_tensor(li_shapes, 1, DataType::FLOAT32); + Tensor mi_update = make_tensor(mi_shapes, 1, DataType::FLOAT32); + prof_make_count += 3; + CYCLE_COUNT_LAP(prof_make_tensor); + uint64_t qi_shapes[2] = {q_tile, head_dim}; + uint64_t qi_offsets[2] = {cur_offset, 0}; + Tensor qi = query.view(qi_shapes, qi_offsets); + uint64_t out_view_shapes[2] = {q_tile, head_dim}; + uint64_t out_view_offsets[2] = {cur_offset, 0}; + Tensor out_view = out.view(out_view_shapes, out_view_offsets); + prof_view_count += 2; + CYCLE_COUNT_LAP(prof_tensor_view); + + PTOParam params_inplace[] = { + make_output_param(oi), + make_output_param(li_update), + make_output_param(mi_update), + }; + CYCLE_COUNT_LAP(prof_param_setup); + pto2_rt_submit_task(rt, FUNC_AIV_HUB, PTO2_WORKER_VECTOR, params_inplace, 3); + prof_submit_count++; + CYCLE_COUNT_LAP(prof_submit_task); + + for (uint64_t bn = 0; bn < bn_this_batch; bn++) { + uint64_t cur_block_idx = host_block_table[b_idx * block_num + bn]; + uint64_t valid_len = std::min(block_size, cur_seq - bn * block_size); + CYCLE_COUNT_LAP(prof_param_extract); + + uint64_t kv_shapes[2] = {block_size, head_dim}; + uint64_t kv_offsets[2] = {cur_block_idx * block_size, 0}; + Tensor kj = key_cache.view(kv_shapes, kv_offsets); + Tensor vj = value_cache.view(kv_shapes, kv_offsets); + prof_view_count += 2; + CYCLE_COUNT_LAP(prof_tensor_view); + + uint64_t sij_shapes[2] = {q_tile, block_size}; + Tensor sij = make_tensor(sij_shapes, 2, DataType::FLOAT32); + Tensor pij_f16 = make_tensor(sij_shapes, 2, data_type); + prof_make_count += 2; + CYCLE_COUNT_LAP(prof_make_tensor); + + PTOParam params_qk[] = { + make_input_param(qi), + make_input_param(kj), + 
make_output_param(sij), + }; + CYCLE_COUNT_LAP(prof_param_setup); + pto2_rt_submit_task(rt, FUNC_QK_MATMUL, PTO2_WORKER_CUBE, params_qk, 3); + prof_submit_count++; + CYCLE_COUNT_LAP(prof_submit_task); + + uint64_t sij_valid_shapes[2] = {q_tile, valid_len}; + uint64_t sij_valid_offsets[2] = {0, 0}; + Tensor sij_valid = sij.view(sij_valid_shapes, sij_valid_offsets); + prof_view_count += 1; + CYCLE_COUNT_LAP(prof_tensor_view); + + Tensor li = make_tensor(li_shapes, 1, DataType::FLOAT32); + Tensor mi = make_tensor(mi_shapes, 1, DataType::FLOAT32); + prof_make_count += 2; + CYCLE_COUNT_LAP(prof_make_tensor); + + PTOParam params_sf[] = { + make_input_param(sij_valid), + make_scalar_param(float_to_u64(scale_value)), + make_output_param(pij_f16), + make_output_param(mi), + make_output_param(li), + }; + CYCLE_COUNT_LAP(prof_param_setup); + pto2_rt_submit_task(rt, FUNC_SOFTMAX_PREPARE, PTO2_WORKER_VECTOR, params_sf, 5); + prof_submit_count++; + CYCLE_COUNT_LAP(prof_submit_task); + + uint64_t oi_tmp_shapes[2] = {q_tile, head_dim}; + Tensor oi_tmp = make_tensor(oi_tmp_shapes, 2, DataType::FLOAT32); + prof_make_count += 1; + CYCLE_COUNT_LAP(prof_make_tensor); + + PTOParam params_pv[] = { + make_input_param(pij_f16), + make_input_param(vj), + make_output_param(oi_tmp), + }; + CYCLE_COUNT_LAP(prof_param_setup); + pto2_rt_submit_task(rt, FUNC_PV_MATMUL, PTO2_WORKER_CUBE, params_pv, 3); + prof_submit_count++; + CYCLE_COUNT_LAP(prof_submit_task); + + uint64_t is_first = (bn == 0) ? 1 : 0; + uint64_t is_last = (bn == bn_this_batch - 1) ? 
1 : 0; + CYCLE_COUNT_LAP(prof_param_extract); + + PTOParam params_up[] = { + make_input_param(mi), + make_input_param(li), + make_input_param(oi_tmp), + make_inout_param(mi_update), + make_inout_param(li_update), + make_inout_param(oi), + make_output_param(out_view), + make_scalar_param(is_first), + make_scalar_param(is_last), + }; + CYCLE_COUNT_LAP(prof_param_setup); + pto2_rt_submit_task(rt, FUNC_ONLINE_UPDATE, PTO2_WORKER_VECTOR, params_up, 9); + prof_submit_count++; + CYCLE_COUNT_LAP(prof_submit_task); + } + } + CYCLE_COUNT_LAP(prof_scope); + } + } + + uint64_t total = prof_param_extract + prof_ext_tensor + prof_make_tensor + + prof_tensor_view + prof_param_setup + prof_submit_task + prof_scope; + LOG_ALWAYS(rt, "=== PagedAttn Orch Profiling: %d submits, %d makes, %d views, total=%.3fus ===", + prof_submit_count, prof_make_count, prof_view_count, cycles_to_us(total)); + if (total > 0) { + LOG_ALWAYS(rt, " param_extract : %7.3fus (%5.1f%%)", + cycles_to_us(prof_param_extract), prof_param_extract * 100.0 / total); + LOG_ALWAYS(rt, " ext_tensor(x4) : %7.3fus (%5.1f%%)", + cycles_to_us(prof_ext_tensor), prof_ext_tensor * 100.0 / total); + LOG_ALWAYS(rt, " make_tensor(x%d) : %7.3fus (%5.1f%%) avg=%.3fus", + prof_make_count, cycles_to_us(prof_make_tensor), prof_make_tensor * 100.0 / total, + prof_make_count > 0 ? cycles_to_us(prof_make_tensor) / prof_make_count : 0.0); + LOG_ALWAYS(rt, " tensor_view(x%d) : %7.3fus (%5.1f%%) avg=%.3fus", + prof_view_count, cycles_to_us(prof_tensor_view), prof_tensor_view * 100.0 / total, + prof_view_count > 0 ? 
cycles_to_us(prof_tensor_view) / prof_view_count : 0.0); + LOG_ALWAYS(rt, + " param_setup : %7.3fus (%5.1f%%)", + cycles_to_us(prof_param_setup), + prof_param_setup * 100.0 / total); + LOG_ALWAYS(rt, " scope : %7.3fus (%5.1f%%)", cycles_to_us(prof_scope), prof_scope * 100.0 / total); + LOG_ALWAYS(rt, " submit_task(x%d) : %7.3fus (%5.1f%%) avg=%.3fus", + prof_submit_count, cycles_to_us(prof_submit_task), prof_submit_task * 100.0 / total, + prof_submit_count > 0 ? cycles_to_us(prof_submit_task) / prof_submit_count : 0.0); + } + +#undef CYCLE_COUNT_START +#undef CYCLE_COUNT_LAP +} + +} // extern "C" \ No newline at end of file diff --git a/tools/benchmark_rounds.sh b/tools/benchmark_rounds.sh index 748bb400..5387beb7 100755 --- a/tools/benchmark_rounds.sh +++ b/tools/benchmark_rounds.sh @@ -1,9 +1,9 @@ #!/usr/bin/env bash -# Benchmark wrapper: run examples on a2a3 hardware, +# Benchmark wrapper: run examples on hardware, # then parse device-log timing lines to report per-round latency. # # Usage: -# ./tools/benchmark_rounds.sh [-d ] [-n ] +# ./tools/benchmark_rounds.sh [-p ] [-d ] [-n ] # # Runs all examples listed in EXAMPLES array and prints timing for each. @@ -12,10 +12,9 @@ set -euo pipefail SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" RUN_EXAMPLE="$PROJECT_ROOT/examples/scripts/run_example.py" -EXAMPLES_DIR="$PROJECT_ROOT/tests/device_tests/tensormap_and_ringbuffer" # --------------------------------------------------------------------------- -# Examples to benchmark (paths relative to examples/tensormap_and_ringbuffer/) +# Examples to benchmark (paths relative to tests/device_tests//tensormap_and_ringbuffer/) # Each entry is just the directory name; kernels/ and golden.py are implied. 
# --------------------------------------------------------------------------- EXAMPLES=( @@ -31,10 +30,15 @@ EXAMPLES=( # --------------------------------------------------------------------------- DEVICE_ID=0 ROUNDS=10 +PLATFORM=a2a3 EXTRA_ARGS=() while [[ $# -gt 0 ]]; do case "$1" in + -p|--platform) + PLATFORM="$2" + shift 2 + ;; -d|--device) DEVICE_ID="$2" shift 2 @@ -48,9 +52,10 @@ while [[ $# -gt 0 ]]; do benchmark_rounds.sh — run all examples and report per-round timing from device logs Usage: - ./tools/benchmark_rounds.sh [-d ] [-n ] + ./tools/benchmark_rounds.sh [-p ] [-d ] [-n ] Options: + -p, --platform Platform to run on (default: a2a3) -d, --device Device ID (default: 0) -n, --rounds Override number of rounds for each example (default: 10) -h, --help Show this help @@ -69,6 +74,11 @@ USAGE esac done +# --------------------------------------------------------------------------- +# Derive arch from platform and set examples directory +# --------------------------------------------------------------------------- +EXAMPLES_DIR="$PROJECT_ROOT/tests/device_tests/${PLATFORM}/tensormap_and_ringbuffer" + # --------------------------------------------------------------------------- # Resolve device log directory (mirrors run_example.py / device_log_resolver.py) # --------------------------------------------------------------------------- @@ -192,7 +202,7 @@ for example in "${EXAMPLES[@]}"; do # Run example if ! python3 "$RUN_EXAMPLE" \ -k "$KERNELS_DIR" -g "$GOLDEN" \ - -p a2a3 -d "$DEVICE_ID" \ + -p "$PLATFORM" -d "$DEVICE_ID" \ -n "$ROUNDS" \ "${EXTRA_ARGS[@]}" > /dev/null 2>&1; then echo " FAILED: run_example.py returned non-zero"