From 1cd87820236069103869a8f6c6d9b04e396ba178 Mon Sep 17 00:00:00 2001 From: chenshengxin Date: Wed, 18 Mar 2026 11:21:34 +0800 Subject: [PATCH] Eliminate AIV_HUB init task in paged_attention_unroll orchestration Remove the separate FUNC_AIV_HUB task that zero-initialized the online-update accumulators (oi, mi_update, li_update) before the inner loop. Instead, on the first block iteration (is_first), register these buffers as outputs rather than inouts in FUNC_ONLINE_UPDATE, letting the kernel handle initialization inline. This removes one AIV task submission per query-head group, reducing orchestration overhead and improving end-to-end pipeline throughput. --- .../orchestration/paged_attention_orch.cpp | 21 ++++++++----------- 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/tests/device_tests/a2a3/tensormap_and_ringbuffer/paged_attention_unroll/kernels/orchestration/paged_attention_orch.cpp b/tests/device_tests/a2a3/tensormap_and_ringbuffer/paged_attention_unroll/kernels/orchestration/paged_attention_orch.cpp index c4060460..4d8f1df9 100644 --- a/tests/device_tests/a2a3/tensormap_and_ringbuffer/paged_attention_unroll/kernels/orchestration/paged_attention_orch.cpp +++ b/tests/device_tests/a2a3/tensormap_and_ringbuffer/paged_attention_unroll/kernels/orchestration/paged_attention_orch.cpp @@ -170,15 +170,6 @@ __attribute__((visibility("default"))) void aicpu_orchestration_entry(PTO2Runtim prof_view_count += 2; CYCLE_COUNT_LAP(prof_tensor_view); - PTOParam params_inplace; - params_inplace.add_output(oi); - params_inplace.add_output(li_update); - params_inplace.add_output(mi_update); - CYCLE_COUNT_LAP(prof_param_setup); - pto2_rt_submit_aiv_task(rt, FUNC_AIV_HUB, params_inplace); - prof_submit_count++; - CYCLE_COUNT_LAP(prof_submit_task); - // Reusable PTOParam objects — reset() before each use avoids // repeated stack-frame construction in the inner loop. // params_qk must persist until params_pv.copy_scalars_from(). 
@@ -256,9 +247,15 @@ __attribute__((visibility("default"))) void aicpu_orchestration_entry(PTO2Runtim params_up.add_input(mi); params_up.add_input(li); params_up.add_input(oi_new); - params_up.add_inout(mi_update); - params_up.add_inout(li_update); - params_up.add_inout(oi); + if (is_first) { + params_up.add_output(mi_update); + params_up.add_output(li_update); + params_up.add_output(oi); + } else { + params_up.add_inout(mi_update); + params_up.add_inout(li_update); + params_up.add_inout(oi); + } params_up.add_output(out_view); params_up.add_scalar(is_first); params_up.add_scalar(is_last);