hw-native-sys-bot · hw-native-sys-bot · Mar 11, 2026 · Mar 11, 2026 · Mar 11, 2026
diff --git a/CLAUDE.md b/CLAUDE.md
@@ -35,8 +35,8 @@ PTO Runtime compiles three independent programs (Host `.so`, AICPU `.so`, AICore
 ### Run a single example
 ```bash
 python examples/scripts/run_example.py \
-    -k examples/host_build_graph/vector_example/kernels \
-    -g examples/host_build_graph/vector_example/golden.py \
+    -k examples/a2a3/host_build_graph/vector_example/kernels \
+    -g examples/a2a3/host_build_graph/vector_example/golden.py \
     -p a2a3sim
 ```
 

diff --git a/ci.sh b/ci.sh
@@ -200,8 +200,10 @@ DEVICE_TESTS_DIR="tests/device_tests"
 
 declare -a HW_TASK_NAMES=()
 declare -a HW_TASK_DIRS=()
+declare -a HW_TASK_PLATS=()
 declare -a SIM_TASK_NAMES=()
 declare -a SIM_TASK_DIRS=()
+declare -a SIM_TASK_PLATS=()
 
 # Discover examples
 while IFS= read -r -d '' example_dir; do
@@ -211,15 +213,21 @@ while IFS= read -r -d '' example_dir; do
     [[ -f "$kernel_config" && -f "$golden" ]] || continue
 
     example_name="${example_dir#$EXAMPLES_DIR/}"
-    example_runtime="${example_name%%/*}"  # Extract runtime from path
+    example_arch="${example_name%%/*}"  # Extract arch (a2a3/a5) from path
+    example_rest="${example_name#*/}"
+    example_runtime="${example_rest%%/*}"  # Extract runtime from path
 
     # Filter by runtime if specified
-    if [[ -n "$RUNTIME" && "$example_name" != "$RUNTIME"/* ]]; then
+    if [[ -n "$RUNTIME" && "$example_runtime" != "$RUNTIME" ]]; then
         continue
     fi
 
-    # Filter by platform's supported runtimes
+    # Filter by platform's arch and supported runtimes
     if [[ -n "$PLATFORM" ]]; then
+        platform_base="${PLATFORM%sim}"
+        if [[ "$example_arch" != "$platform_base" ]]; then
+            continue  # Skip examples not matching platform arch
+        fi
         platform_runtimes="$(get_platform_runtimes "$PLATFORM")"
         if [[ ! " $platform_runtimes " =~ " $example_runtime " ]]; then
             continue  # Skip unsupported runtime for this platform
@@ -230,18 +238,23 @@ while IFS= read -r -d '' example_dir; do
         if [[ "$PLATFORM" =~ sim$ ]]; then
             SIM_TASK_NAMES+=("example:${example_name}")
             SIM_TASK_DIRS+=("${example_dir}")
+            SIM_TASK_PLATS+=("${PLATFORM}")
         else
             HW_TASK_NAMES+=("example:${example_name}")
             HW_TASK_DIRS+=("${example_dir}")
+            HW_TASK_PLATS+=("${PLATFORM}")
         fi
     elif [[ "$OS" == "Darwin" ]]; then
         SIM_TASK_NAMES+=("example:${example_name}")
         SIM_TASK_DIRS+=("${example_dir}")
+        SIM_TASK_PLATS+=("${example_arch}sim")
     else
         HW_TASK_NAMES+=("example:${example_name}")
         HW_TASK_DIRS+=("${example_dir}")
+        HW_TASK_PLATS+=("${example_arch}")
         SIM_TASK_NAMES+=("example:${example_name}")
         SIM_TASK_DIRS+=("${example_dir}")
+        SIM_TASK_PLATS+=("${example_arch}sim")
     fi
 done < <(find "$EXAMPLES_DIR" -mindepth 1 -type d -print0 | sort -z)
 
@@ -257,15 +270,21 @@ if [[ -d "$DEVICE_TESTS_DIR" ]]; then
             golden="${test_dir}/golden.py"
             [[ -f "$kernel_config" && -f "$golden" ]] || continue
             test_name="${test_dir#$DEVICE_TESTS_DIR/}"
-            test_runtime="${test_name%%/*}"  # Extract runtime from path
+            test_arch="${test_name%%/*}"  # Extract arch (a2a3/a5) from path
+            test_rest="${test_name#*/}"
+            test_runtime="${test_rest%%/*}"  # Extract runtime from path
 
             # Filter by runtime if specified
-            if [[ -n "$RUNTIME" && "$test_name" != "$RUNTIME"/* ]]; then
+            if [[ -n "$RUNTIME" && "$test_runtime" != "$RUNTIME" ]]; then
                 continue
             fi
 
-            # Filter by platform's supported runtimes
+            # Filter by platform's arch and supported runtimes
             if [[ -n "$PLATFORM" ]]; then
+                platform_base="${PLATFORM%sim}"
+                if [[ "$test_arch" != "$platform_base" ]]; then
+                    continue  # Skip tests not matching platform arch
+                fi
                 platform_runtimes="$(get_platform_runtimes "$PLATFORM")"
                 if [[ ! " $platform_runtimes " =~ " $test_runtime " ]]; then
                     continue  # Skip unsupported runtime for this platform
@@ -274,6 +293,7 @@ if [[ -d "$DEVICE_TESTS_DIR" ]]; then
 
             HW_TASK_NAMES+=("device_test:${test_name}")
             HW_TASK_DIRS+=("${test_dir}")
+            HW_TASK_PLATS+=("${PLATFORM:-${test_arch}}")
         done < <(find "$DEVICE_TESTS_DIR" -mindepth 1 -type d -print0 | sort -z)
     else
         echo "Skipping device tests (hardware platforms only)"
@@ -282,18 +302,14 @@ fi
 
 echo "Discovered ${#HW_TASK_NAMES[@]} hardware tasks, ${#SIM_TASK_NAMES[@]} simulation tasks"
 
-# Determine platforms for execution
-HW_PLATFORM="${PLATFORM:-a2a3}"
-SIM_PLATFORM="${PLATFORM:-a2a3sim}"
-
 MAX_RETRIES=3
 
 # ---- Unified task runner ----
 # Runs a single task and records the result.
 # Log naming: ${safe_name}_${platform}_attempt${attempt}.log
 # Result format: name|platform|PASS/FAIL|device:X|attempt:N|Xs
 run_task() {
-    local name="$1" dir="$2" platform="$3" attempt="$4" device_id="$5"
+    local name="$1" dir="$2" platform="$3" attempt="$4" device_id="$5" print_log_on_fail="${6:-true}"
     local safe_name="${name//[:\/]/_}"
     local task_log="${LOG_DIR}/${safe_name}_${platform}_attempt${attempt}.log"
     local start_time=$SECONDS
@@ -319,9 +335,11 @@ run_task() {
     else
         status="FAIL"
         echo "[${platform}${device_id:+:dev${device_id}}] FAIL: $name (${elapsed}s)"
-        echo "--- LOG: $name (attempt $attempt) ---"
-        cat "$task_log"
-        echo "--- END ---"
+        if [[ "$print_log_on_fail" == "true" ]]; then
+            echo "--- LOG: $name (attempt $attempt) ---"
+            cat "$task_log"
+            echo "--- END ---"
+        fi
     fi
     echo "${name}|${platform}|${status}|device:${device_id:-sim}|attempt:${attempt}|${elapsed}s" \
         >> "$RESULTS_FILE"
@@ -348,7 +366,7 @@ run_sim_tasks() {
         local -a pids=()
         for idx in "${indices[@]}"; do
             (
-                if run_task "${SIM_TASK_NAMES[$idx]}" "${SIM_TASK_DIRS[$idx]}" "$SIM_PLATFORM" "$attempt"; then
+                if run_task "${SIM_TASK_NAMES[$idx]}" "${SIM_TASK_DIRS[$idx]}" "${SIM_TASK_PLATS[$idx]}" "$attempt"; then
                     echo "${idx}|PASS" >> "$sim_marker"
                 else
                     echo "${idx}|FAIL" >> "$sim_marker"
@@ -359,7 +377,7 @@ run_sim_tasks() {
         for pid in "${pids[@]}"; do wait "$pid" 2>/dev/null || true; done
     else
         for idx in "${indices[@]}"; do
-            if run_task "${SIM_TASK_NAMES[$idx]}" "${SIM_TASK_DIRS[$idx]}" "$SIM_PLATFORM" "$attempt"; then
+            if run_task "${SIM_TASK_NAMES[$idx]}" "${SIM_TASK_DIRS[$idx]}" "${SIM_TASK_PLATS[$idx]}" "$attempt"; then
                 echo "${idx}|PASS" >> "$sim_marker"
             else
                 echo "${idx}|FAIL" >> "$sim_marker"
@@ -406,17 +424,22 @@ run_hw_tasks() {
 
                 IFS=':' read -r idx attempt <<< "$entry"
 
-                if run_task "${HW_TASK_NAMES[$idx]}" "${HW_TASK_DIRS[$idx]}" "$HW_PLATFORM" "$attempt" "$device_id"; then
+                if run_task "${HW_TASK_NAMES[$idx]}" "${HW_TASK_DIRS[$idx]}" "${HW_TASK_PLATS[$idx]}" "$attempt" "$device_id" "false"; then
                     echo "${idx}|PASS" >> "$hw_marker"
                 else
                     next=$((attempt + 1))
                     if [[ $next -lt $MAX_RETRIES ]]; then
                         flock "$lock" bash -c "echo '${idx}:${next}' >> \"$queue\""
                     else
                         echo "${idx}|FAIL" >> "$hw_marker"
+                        local safe_name="${HW_TASK_NAMES[$idx]//[:\/]/_}"
+                        local last_log="${LOG_DIR}/${safe_name}_${HW_TASK_PLATS[$idx]}_attempt${attempt}.log"
+                        echo "--- LOG: ${HW_TASK_NAMES[$idx]} (attempt $attempt) ---"
+                        cat "$last_log"
+                        echo "--- END ---"
+                        echo "[${HW_TASK_PLATS[$idx]}:dev${device_id}] Device quarantined after exhausting retries"
+                        break
                     fi
-                    echo "[${HW_PLATFORM}:dev${device_id}] Device quarantined after failure"
-                    break
                 fi
             done
         ) &
@@ -606,7 +629,7 @@ for i in "${!TASK_ORDER[@]}"; do
 
     platform="${FINAL_PLATFORM[$i]}"
     device="${FINAL_DEVICE[$i]}"
-    attempt="${FINAL_ATTEMPT[$i]}"
+    attempt=$(( FINAL_ATTEMPT[$i] + 1 ))
     timing="${FINAL_TIMING[$i]}"
 
     if [[ "$result" == "FAIL" ]]; then

diff --git a/examples/aicpu_build_graph/bgemm/README.md → ...es/a2a3/aicpu_build_graph/bgemm/README.md b/examples/aicpu_build_graph/bgemm/README.md → ...es/a2a3/aicpu_build_graph/bgemm/README.md
diff --git a/examples/aicpu_build_graph/bgemm/golden.py → ...es/a2a3/aicpu_build_graph/bgemm/golden.py b/examples/aicpu_build_graph/bgemm/golden.py → ...es/a2a3/aicpu_build_graph/bgemm/golden.py
diff --git a/...ph/bgemm/kernels/aic/kernel_gemm_tile.cpp → ...ph/bgemm/kernels/aic/kernel_gemm_tile.cpp b/...ph/bgemm/kernels/aic/kernel_gemm_tile.cpp → ...ph/bgemm/kernels/aic/kernel_gemm_tile.cpp
diff --git a/...aph/bgemm/kernels/aiv/kernel_tile_add.cpp → ...aph/bgemm/kernels/aiv/kernel_tile_add.cpp b/...aph/bgemm/kernels/aiv/kernel_tile_add.cpp → ...aph/bgemm/kernels/aiv/kernel_tile_add.cpp
diff --git a/...uild_graph/bgemm/kernels/kernel_config.py → ...uild_graph/bgemm/kernels/kernel_config.py b/...uild_graph/bgemm/kernels/kernel_config.py → ...uild_graph/bgemm/kernels/kernel_config.py
diff --git a/...gemm/kernels/orchestration/bgemm_orch.cpp → ...gemm/kernels/orchestration/bgemm_orch.cpp b/...gemm/kernels/orchestration/bgemm_orch.cpp → ...gemm/kernels/orchestration/bgemm_orch.cpp
diff --git a/..._graph/docs/INCORE_ORCHESTRATION_GUIDE.md → ..._graph/docs/INCORE_ORCHESTRATION_GUIDE.md b/..._graph/docs/INCORE_ORCHESTRATION_GUIDE.md → ..._graph/docs/INCORE_ORCHESTRATION_GUIDE.md
diff --git a/...icpu_build_graph/vector_example/README.md → ...icpu_build_graph/vector_example/README.md b/...icpu_build_graph/vector_example/README.md → ...icpu_build_graph/vector_example/README.md
diff --git a/...icpu_build_graph/vector_example/golden.py → ...icpu_build_graph/vector_example/golden.py b/...icpu_build_graph/vector_example/golden.py → ...icpu_build_graph/vector_example/golden.py
diff --git a/...vector_example/kernels/aiv/kernel_add.cpp → ...vector_example/kernels/aiv/kernel_add.cpp b/...vector_example/kernels/aiv/kernel_add.cpp → ...vector_example/kernels/aiv/kernel_add.cpp
diff --git a/...example/kernels/aiv/kernel_add_scalar.cpp → ...example/kernels/aiv/kernel_add_scalar.cpp b/...example/kernels/aiv/kernel_add_scalar.cpp → ...example/kernels/aiv/kernel_add_scalar.cpp
diff --git a/...vector_example/kernels/aiv/kernel_mul.cpp → ...vector_example/kernels/aiv/kernel_mul.cpp b/...vector_example/kernels/aiv/kernel_mul.cpp → ...vector_example/kernels/aiv/kernel_mul.cpp
diff --git a/...h/vector_example/kernels/kernel_config.py → ...h/vector_example/kernels/kernel_config.py b/...h/vector_example/kernels/kernel_config.py → ...h/vector_example/kernels/kernel_config.py
diff --git a/...e/kernels/orchestration/orchestration.cpp → ...e/kernels/orchestration/orchestration.cpp b/...e/kernels/orchestration/orchestration.cpp → ...e/kernels/orchestration/orchestration.cpp
diff --git a/examples/host_build_graph/bgemm/README.md → ...les/a2a3/host_build_graph/bgemm/README.md b/examples/host_build_graph/bgemm/README.md → ...les/a2a3/host_build_graph/bgemm/README.md
diff --git a/examples/host_build_graph/bgemm/golden.py → ...les/a2a3/host_build_graph/bgemm/golden.py b/examples/host_build_graph/bgemm/golden.py → ...les/a2a3/host_build_graph/bgemm/golden.py
diff --git a/...ph/bgemm/kernels/aic/kernel_gemm_tile.cpp → ...ph/bgemm/kernels/aic/kernel_gemm_tile.cpp b/...ph/bgemm/kernels/aic/kernel_gemm_tile.cpp → ...ph/bgemm/kernels/aic/kernel_gemm_tile.cpp
diff --git a/...aph/bgemm/kernels/aiv/kernel_tile_add.cpp → ...aph/bgemm/kernels/aiv/kernel_tile_add.cpp b/...aph/bgemm/kernels/aiv/kernel_tile_add.cpp → ...aph/bgemm/kernels/aiv/kernel_tile_add.cpp
diff --git a/...uild_graph/bgemm/kernels/kernel_config.py → ...uild_graph/bgemm/kernels/kernel_config.py b/...uild_graph/bgemm/kernels/kernel_config.py → ...uild_graph/bgemm/kernels/kernel_config.py
diff --git a/...gemm/kernels/orchestration/bgemm_orch.cpp → ...gemm/kernels/orchestration/bgemm_orch.cpp b/...gemm/kernels/orchestration/bgemm_orch.cpp → ...gemm/kernels/orchestration/bgemm_orch.cpp
diff --git a/..._graph/docs/INCORE_ORCHESTRATION_GUIDE.md → ..._graph/docs/INCORE_ORCHESTRATION_GUIDE.md b/..._graph/docs/INCORE_ORCHESTRATION_GUIDE.md → ..._graph/docs/INCORE_ORCHESTRATION_GUIDE.md
diff --git a/examples/host_build_graph/matmul/golden.py → ...es/a2a3/host_build_graph/matmul/golden.py b/examples/host_build_graph/matmul/golden.py → ...es/a2a3/host_build_graph/matmul/golden.py
diff --git a/...raph/matmul/kernels/aic/kernel_matmul.cpp → ...raph/matmul/kernels/aic/kernel_matmul.cpp b/...raph/matmul/kernels/aic/kernel_matmul.cpp → ...raph/matmul/kernels/aic/kernel_matmul.cpp
diff --git a/...aph/matmul/kernels/aiv/kernel_add_exp.cpp → ...aph/matmul/kernels/aiv/kernel_add_exp.cpp b/...aph/matmul/kernels/aiv/kernel_add_exp.cpp → ...aph/matmul/kernels/aiv/kernel_add_exp.cpp
diff --git a/...ph/matmul/kernels/aiv/kernel_log_sqrt.cpp → ...ph/matmul/kernels/aiv/kernel_log_sqrt.cpp b/...ph/matmul/kernels/aiv/kernel_log_sqrt.cpp → ...ph/matmul/kernels/aiv/kernel_log_sqrt.cpp
diff --git a/...ild_graph/matmul/kernels/kernel_config.py → ...ild_graph/matmul/kernels/kernel_config.py b/...ild_graph/matmul/kernels/kernel_config.py → ...ild_graph/matmul/kernels/kernel_config.py
diff --git a/...mul/kernels/orchestration/matmul_orch.cpp → ...mul/kernels/orchestration/matmul_orch.cpp b/...mul/kernels/orchestration/matmul_orch.cpp → ...mul/kernels/orchestration/matmul_orch.cpp
diff --git a/...ost_build_graph/paged_attention/golden.py → ...ost_build_graph/paged_attention/golden.py b/...ost_build_graph/paged_attention/golden.py → ...ost_build_graph/paged_attention/golden.py
diff --git a/...d_attention/kernels/aic/aic_pv_matmul.cpp → ...d_attention/kernels/aic/aic_pv_matmul.cpp b/...d_attention/kernels/aic/aic_pv_matmul.cpp → ...d_attention/kernels/aic/aic_pv_matmul.cpp
diff --git a/...d_attention/kernels/aic/aic_qk_matmul.cpp → ...d_attention/kernels/aic/aic_qk_matmul.cpp b/...d_attention/kernels/aic/aic_qk_matmul.cpp → ...d_attention/kernels/aic/aic_qk_matmul.cpp
diff --git a/...tention/kernels/aiv/aiv_online_update.cpp → ...tention/kernels/aiv/aiv_online_update.cpp b/...tention/kernels/aiv/aiv_online_update.cpp → ...tention/kernels/aiv/aiv_online_update.cpp
diff --git a/...ntion/kernels/aiv/aiv_softmax_prepare.cpp → ...ntion/kernels/aiv/aiv_softmax_prepare.cpp b/...ntion/kernels/aiv/aiv_softmax_prepare.cpp → ...ntion/kernels/aiv/aiv_softmax_prepare.cpp
diff --git a/.../paged_attention/kernels/kernel_config.py → .../paged_attention/kernels/kernel_config.py b/.../paged_attention/kernels/kernel_config.py → .../paged_attention/kernels/kernel_config.py
diff --git a/...ls/orchestration/paged_attention_orch.cpp → ...ls/orchestration/paged_attention_orch.cpp b/...ls/orchestration/paged_attention_orch.cpp → ...ls/orchestration/paged_attention_orch.cpp
diff --git a/...host_build_graph/vector_example/README.md → ...host_build_graph/vector_example/README.md b/...host_build_graph/vector_example/README.md → ...host_build_graph/vector_example/README.md
diff --git a/...host_build_graph/vector_example/golden.py → ...host_build_graph/vector_example/golden.py b/...host_build_graph/vector_example/golden.py → ...host_build_graph/vector_example/golden.py
diff --git a/...vector_example/kernels/aiv/kernel_add.cpp → ...vector_example/kernels/aiv/kernel_add.cpp b/...vector_example/kernels/aiv/kernel_add.cpp → ...vector_example/kernels/aiv/kernel_add.cpp
diff --git a/...example/kernels/aiv/kernel_add_scalar.cpp → ...example/kernels/aiv/kernel_add_scalar.cpp b/...example/kernels/aiv/kernel_add_scalar.cpp → ...example/kernels/aiv/kernel_add_scalar.cpp
diff --git a/...vector_example/kernels/aiv/kernel_mul.cpp → ...vector_example/kernels/aiv/kernel_mul.cpp b/...vector_example/kernels/aiv/kernel_mul.cpp → ...vector_example/kernels/aiv/kernel_mul.cpp
diff --git a/...h/vector_example/kernels/kernel_config.py → ...h/vector_example/kernels/kernel_config.py b/...h/vector_example/kernels/kernel_config.py → ...h/vector_example/kernels/kernel_config.py
diff --git a/...le/kernels/orchestration/example_orch.cpp → ...le/kernels/orchestration/example_orch.cpp b/...le/kernels/orchestration/example_orch.cpp → ...le/kernels/orchestration/example_orch.cpp
diff --git a/...ingbuffer/batch_paged_attention/golden.py → ...ingbuffer/batch_paged_attention/golden.py b/...ingbuffer/batch_paged_attention/golden.py → ...ingbuffer/batch_paged_attention/golden.py
diff --git a/...h_paged_attention/kernels/aic/aic_hub.cpp → ...h_paged_attention/kernels/aic/aic_hub.cpp b/...h_paged_attention/kernels/aic/aic_hub.cpp → ...h_paged_attention/kernels/aic/aic_hub.cpp
diff --git a/...d_attention/kernels/aic/aic_pv_matmul.cpp → ...d_attention/kernels/aic/aic_pv_matmul.cpp b/...d_attention/kernels/aic/aic_pv_matmul.cpp → ...d_attention/kernels/aic/aic_pv_matmul.cpp
diff --git a/...d_attention/kernels/aic/aic_qk_matmul.cpp → ...d_attention/kernels/aic/aic_qk_matmul.cpp b/...d_attention/kernels/aic/aic_qk_matmul.cpp → ...d_attention/kernels/aic/aic_qk_matmul.cpp
diff --git a/...h_paged_attention/kernels/aiv/aiv_hub.cpp → ...h_paged_attention/kernels/aiv/aiv_hub.cpp b/...h_paged_attention/kernels/aiv/aiv_hub.cpp → ...h_paged_attention/kernels/aiv/aiv_hub.cpp
diff --git a/...tention/kernels/aiv/aiv_online_update.cpp → ...tention/kernels/aiv/aiv_online_update.cpp b/...tention/kernels/aiv/aiv_online_update.cpp → ...tention/kernels/aiv/aiv_online_update.cpp
diff --git a/...ntion/kernels/aiv/aiv_softmax_prepare.cpp → ...ntion/kernels/aiv/aiv_softmax_prepare.cpp b/...ntion/kernels/aiv/aiv_softmax_prepare.cpp → ...ntion/kernels/aiv/aiv_softmax_prepare.cpp
diff --git a/..._paged_attention/kernels/kernel_config.py → ..._paged_attention/kernels/kernel_config.py b/..._paged_attention/kernels/kernel_config.py → ..._paged_attention/kernels/kernel_config.py
diff --git a/...ls/orchestration/paged_attention_orch.cpp → ...ls/orchestration/paged_attention_orch.cpp b/...ls/orchestration/paged_attention_orch.cpp → ...ls/orchestration/paged_attention_orch.cpp
@@ -135,7 +135,7 @@ void aicpu_orchestration_entry(PTO2Runtime* rt, uint64_t* args, int arg_count, i
                     make_output_param(li_batch),
                     make_output_param(mi_batch),
                 };
-                pto2_rt_submit_task(rt, FUNC_AIV_HUB, PTO2_WORKER_VECTOR, params_hub, 3);
+                pto2_rt_submit_aiv_task(rt, FUNC_AIV_HUB, params_hub, 3);
 
                 for (uint64_t bn = 0; bn < max_bn; bn++) {
                     uint64_t sij_shapes[2] = {chunk_bc * q_tile, block_size};
@@ -160,7 +160,7 @@ void aicpu_orchestration_entry(PTO2Runtime* rt, uint64_t* args, int arg_count, i
                         make_scalar_param(num_heads),
                         make_scalar_param(batch_start),
                     };
-                    pto2_rt_submit_task(rt, FUNC_QK_MATMUL, PTO2_WORKER_CUBE, params_qk, 10);
+                    pto2_rt_submit_aic_task(rt, FUNC_QK_MATMUL, params_qk, 10);
 
                     PTOParam params_sf[] = {
                         make_input_param(sij_b),
@@ -173,7 +173,7 @@ void aicpu_orchestration_entry(PTO2Runtime* rt, uint64_t* args, int arg_count, i
                         make_scalar_param(bn),
                         make_scalar_param(batch_start),
                     };
-                    pto2_rt_submit_task(rt, FUNC_SOFTMAX_PREPARE, PTO2_WORKER_VECTOR, params_sf, 9);
+                    pto2_rt_submit_aiv_task(rt, FUNC_SOFTMAX_PREPARE, params_sf, 9);
 
                     PTOParam params_pv[] = {
                         make_input_param(pij_b),
@@ -185,7 +185,7 @@ void aicpu_orchestration_entry(PTO2Runtime* rt, uint64_t* args, int arg_count, i
                         make_scalar_param(block_num),
                         make_scalar_param(batch_start),
                     };
-                    pto2_rt_submit_task(rt, FUNC_PV_MATMUL, PTO2_WORKER_CUBE, params_pv, 8);
+                    pto2_rt_submit_aic_task(rt, FUNC_PV_MATMUL, params_pv, 8);
 
                     uint64_t is_first = (bn == 0) ? 1 : 0;
                     uint64_t is_last = (bn == max_bn - 1) ? 1 : 0;
@@ -204,7 +204,7 @@ void aicpu_orchestration_entry(PTO2Runtime* rt, uint64_t* args, int arg_count, i
                         make_scalar_param(num_heads),
                         make_scalar_param(batch_start),
                     };
-                    pto2_rt_submit_task(rt, FUNC_ONLINE_UPDATE, PTO2_WORKER_VECTOR, params_up, 13);
+                    pto2_rt_submit_aiv_task(rt, FUNC_ONLINE_UPDATE, params_up, 13);
                 }
             }
         }

diff --git a/.../tensormap_and_ringbuffer/bgemm/golden.py → .../tensormap_and_ringbuffer/bgemm/golden.py b/.../tensormap_and_ringbuffer/bgemm/golden.py → .../tensormap_and_ringbuffer/bgemm/golden.py
diff --git a/...er/bgemm/kernels/aic/kernel_gemm_tile.cpp → ...er/bgemm/kernels/aic/kernel_gemm_tile.cpp b/...er/bgemm/kernels/aic/kernel_gemm_tile.cpp → ...er/bgemm/kernels/aic/kernel_gemm_tile.cpp
diff --git a/...fer/bgemm/kernels/aiv/kernel_tile_add.cpp → ...fer/bgemm/kernels/aiv/kernel_tile_add.cpp b/...fer/bgemm/kernels/aiv/kernel_tile_add.cpp → ...fer/bgemm/kernels/aiv/kernel_tile_add.cpp
diff --git a/...ringbuffer/bgemm/kernels/kernel_config.py → ...ringbuffer/bgemm/kernels/kernel_config.py b/...ringbuffer/bgemm/kernels/kernel_config.py → ...ringbuffer/bgemm/kernels/kernel_config.py
diff --git a/...gemm/kernels/orchestration/bgemm_orch.cpp → ...gemm/kernels/orchestration/bgemm_orch.cpp b/...gemm/kernels/orchestration/bgemm_orch.cpp → ...gemm/kernels/orchestration/bgemm_orch.cpp
@@ -120,15 +120,15 @@ void aicpu_orchestration_entry(PTO2Runtime* rt, uint64_t* args, int arg_count, i
                             make_input_param(B_view),
                             make_output_param(P),
                         };
-                        pto2_rt_submit_task(rt, FUNC_GEMM_TILE, PTO2_WORKER_CUBE,
+                        pto2_rt_submit_aic_task(rt, FUNC_GEMM_TILE,
                                            params_gemm, 3); // gemm
 
                         // C[m,n] += P
                         PTOParam params_add[] = {
                             make_inout_param(C_view),
                             make_input_param(P),
                         };
-                        pto2_rt_submit_task(rt, FUNC_TILE_ADD, PTO2_WORKER_VECTOR,
+                        pto2_rt_submit_aiv_task(rt, FUNC_TILE_ADD,
                                            params_add, 2); // add
                     }
                 }

diff --git a/...buffer/docs/INCORE_ORCHESTRATION_GUIDE.md → ...buffer/docs/INCORE_ORCHESTRATION_GUIDE.md b/...buffer/docs/INCORE_ORCHESTRATION_GUIDE.md → ...buffer/docs/INCORE_ORCHESTRATION_GUIDE.md
@@ -31,17 +31,28 @@ Validate `arg_count` in `aicpu_orchestration_config` and interpret pointers as d
 2. Wrap orchestration in scopes with `PTO2_SCOPE(rt)` to control tensor lifetimes.
 3. Use `make_tensor_external` for input/output buffers and `make_tensor` for intermediates.
 4. Build `PTOParam` arrays with `make_input_param`, `make_output_param`, `make_inout_param`, and `make_scalar_param`.
-5. Submit tasks with `pto2_rt_submit_task(rt, func_id, worker_type, params, num_params)`.
+5. Submit tasks with one of:
+   - `pto2_rt_submit_aic_task(rt, kernel_id, params, num_params)` — AIC (CUBE) task
+   - `pto2_rt_submit_aiv_task(rt, kernel_id, params, num_params)` — AIV (VECTOR) task
+   - `pto2_rt_submit_task(rt, mixed_kernels, params, num_params)` — mixed task with a `MixedKernels` struct
 
 Dependencies are inferred by TensorMap from input/inout/output tensors, so you do not add explicit edges.
 
-## Worker Types And Kernel IDs
-- Worker types come from `pto_orchestration_api.h` (`PTO2_WORKER_CUBE`, `PTO2_WORKER_VECTOR`, etc.).
+## Submit API And Kernel IDs
+- Submit helpers are defined in `pto_orchestration_api.h`.
+- `pto2_rt_submit_aic_task` and `pto2_rt_submit_aiv_task` are convenience wrappers around `pto2_rt_submit_task` with a `MixedKernels` struct.
+- For mixed AIC+AIV tasks, construct a `MixedKernels` struct directly:
+  ```cpp
+  MixedKernels mk;
+  mk.aic_kernel_id = FUNC_QK;
+  mk.aiv0_kernel_id = FUNC_SF;
+  pto2_rt_submit_task(rt, mk, params, num_params);
+  ```
 - Kernel `func_id` values are defined in `kernels/kernel_config.py` under `KERNELS`.
 
 ## Completion Semantics
 Do not call `pto2_rt_orchestration_done` yourself in device mode. The executor wraps the entry call in an outer scope and signals completion after `aicpu_orchestration_entry` returns.
 
 ## Examples
-- `examples/tensormap_and_ringbuffer/vector_example/kernels/orchestration/example_orchestration.cpp`
-- `examples/tensormap_and_ringbuffer/bgemm/kernels/orchestration/bgemm_orch.cpp`
+- `examples/a2a3/tensormap_and_ringbuffer/vector_example/kernels/orchestration/example_orchestration.cpp` (AIV-only tasks)
+- `examples/a2a3/tensormap_and_ringbuffer/bgemm/kernels/orchestration/bgemm_orch.cpp` (separate AIC and AIV tasks)