@@ -78,6 +78,9 @@ static __aicore__ void pv_matmul_impl(__gm__ Tensor* pij, __gm__ Tensor* vj, __g
wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0);

TSTORE(oiGlobal, cTile);

set_flag(PIPE_FIX, PIPE_S, EVENT_ID7);
wait_flag(PIPE_FIX, PIPE_S, EVENT_ID7);
}

extern "C" __aicore__ void kernel_entry(__gm__ int64_t* args) {
@@ -79,6 +79,9 @@ static __aicore__ void qk_matmul_impl(__gm__ Tensor* qi, __gm__ Tensor* kj, __gm
wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0);

TSTORE(sijGlobal, cTile);

set_flag(PIPE_FIX, PIPE_S, EVENT_ID7);
wait_flag(PIPE_FIX, PIPE_S, EVENT_ID7);
}

extern "C" __aicore__ void kernel_entry(__gm__ int64_t* args) {
@@ -207,6 +207,8 @@ static __aicore__ void online_update_impl(__gm__ Tensor* mij,
TSTORE(oiGlobal, oiTile);
}
}
set_flag(PIPE_MTE3, PIPE_S, EVENT_ID7);
wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID7);
}

extern "C" __aicore__ void kernel_entry(__gm__ int64_t* args) {
@@ -83,6 +83,7 @@ static __aicore__ void softmax_prepare_impl(__gm__ Tensor* sij,
set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);

// manually fill invalid columns with -inf as a workaround.
TFILLPAD_INPLACE(sijPadTile, sijDynTile);

TMULS(sijTile, sijTile, scale_value);
@@ -99,19 +100,22 @@
TSTORE(mijGlobal, maxTile);
TSTORE(lijGlobal, sumTile);
TSTORE(pijGlobal, pijF16Tile);

set_flag(PIPE_MTE3, PIPE_S, EVENT_ID7);
wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID7);
}

extern "C" __aicore__ void kernel_entry(__gm__ int64_t* args) {
__gm__ Tensor* sij = reinterpret_cast<__gm__ Tensor*>(args[0]);
__gm__ Tensor* pij = reinterpret_cast<__gm__ Tensor*>(args[1]);
__gm__ Tensor* mij = reinterpret_cast<__gm__ Tensor*>(args[2]);
__gm__ Tensor* lij = reinterpret_cast<__gm__ Tensor*>(args[3]);
union {
uint64_t u;
float f;
} scale_conv;
scale_conv.u = static_cast<uint64_t>(args[1]);
scale_conv.u = static_cast<uint64_t>(args[4]);
float scale_value = scale_conv.f;
__gm__ Tensor* pij = reinterpret_cast<__gm__ Tensor*>(args[2]);
__gm__ Tensor* mij = reinterpret_cast<__gm__ Tensor*>(args[3]);
__gm__ Tensor* lij = reinterpret_cast<__gm__ Tensor*>(args[4]);

softmax_prepare_impl<16, 16>(sij, scale_value, pij, mij, lij);
}
@@ -102,11 +102,11 @@ void aicpu_orchestration_entry(PTO2Runtime* rt, uint64_t* args, int arg_count, i
(unsigned long)batch, (unsigned long)b_start, (unsigned long)b_end);

// Compute actual tensor shapes from buffer sizes (not from max block_num)
uint64_t query_shapes[2] = {batch * num_heads, head_dim};
uint32_t query_shapes[2] = {(uint32_t)(batch * num_heads), (uint32_t)head_dim};
uint64_t kv_total_rows = key_cache_size / (head_dim * elem_size);
uint64_t key_cache_shapes[2] = {kv_total_rows, head_dim};
uint64_t value_cache_shapes[2] = {kv_total_rows, head_dim};
uint64_t out_shapes[2] = {batch * num_heads, head_dim};
uint32_t key_cache_shapes[2] = {(uint32_t)kv_total_rows, (uint32_t)head_dim};
uint32_t value_cache_shapes[2] = {(uint32_t)kv_total_rows, (uint32_t)head_dim};
uint32_t out_shapes[2] = {(uint32_t)(batch * num_heads), (uint32_t)head_dim};
Tensor query = make_tensor_external(host_query, query_shapes, 2, data_type);
Tensor key_cache = make_tensor_external(host_key_cache, key_cache_shapes, 2, data_type);
Tensor value_cache = make_tensor_external(host_value_cache, value_cache_shapes, 2, data_type);
@@ -121,86 +121,81 @@ void aicpu_orchestration_entry(PTO2Runtime* rt, uint64_t* args, int arg_count, i
uint64_t bn_this_batch = (cur_seq + block_size - 1) / block_size;
for (uint64_t q_idx = 0; q_idx < q_loop; q_idx++) {
PTO2_SCOPE(rt) {
uint64_t cur_offset = b_idx * q_head_num + q_idx * q_tile;
uint64_t oi_shapes[2] = {q_tile, head_dim};
uint64_t li_shapes[1] = {q_tile};
uint64_t mi_shapes[1] = {q_tile};
uint32_t cur_offset = (uint32_t)(b_idx * q_head_num + q_idx * q_tile);
uint32_t oi_shapes[2] = {(uint32_t)q_tile, (uint32_t)head_dim};
uint32_t li_shapes[1] = {(uint32_t)q_tile};
uint32_t mi_shapes[1] = {(uint32_t)q_tile};
Tensor oi = make_tensor(oi_shapes, 2, DataType::FLOAT32);
Tensor li_update = make_tensor(li_shapes, 1, DataType::FLOAT32);
Tensor mi_update = make_tensor(mi_shapes, 1, DataType::FLOAT32);

uint64_t qi_shapes[2] = {q_tile, head_dim};
uint64_t qi_offsets[2] = {cur_offset, 0};
uint32_t qi_shapes[2] = {(uint32_t)q_tile, (uint32_t)head_dim};
uint32_t qi_offsets[2] = {cur_offset, 0};
Tensor qi = query.view(qi_shapes, qi_offsets);
uint64_t out_view_shapes[2] = {q_tile, head_dim};
uint64_t out_view_offsets[2] = {cur_offset, 0};
uint32_t out_view_shapes[2] = {(uint32_t)q_tile, (uint32_t)head_dim};
uint32_t out_view_offsets[2] = {cur_offset, 0};
Tensor out_view = out.view(out_view_shapes, out_view_offsets);

PTOParam params_inplace[] = {
make_output_param(oi),
make_output_param(li_update),
make_output_param(mi_update),
};
pto2_rt_submit_task(rt, FUNC_AIV_HUB, PTO2_WORKER_VECTOR, params_inplace, 3); // create_inplace
PTOParam params_inplace;
params_inplace.add_output(oi);
params_inplace.add_output(li_update);
params_inplace.add_output(mi_update);
pto2_rt_submit_aiv_task(rt, FUNC_AIV_HUB, params_inplace); // create_inplace

for (uint64_t bn = 0; bn < bn_this_batch; bn++) {
uint64_t cur_block_idx = host_block_table[b_idx * block_num + bn];
uint64_t valid_len = block_size < (cur_seq - bn * block_size) ? block_size : (cur_seq - bn * block_size);
uint64_t kv_shapes[2] = {block_size, head_dim};
uint64_t kv_offsets[2] = {cur_block_idx * block_size, 0};
uint32_t kv_shapes[2] = {(uint32_t)block_size, (uint32_t)head_dim};
uint32_t kv_offsets[2] = {(uint32_t)(cur_block_idx * block_size), 0};
Tensor kj = key_cache.view(kv_shapes, kv_offsets);
Tensor vj = value_cache.view(kv_shapes, kv_offsets);

uint64_t sij_shapes[2] = {q_tile, block_size};
uint32_t sij_shapes[2] = {(uint32_t)q_tile, (uint32_t)block_size};
Tensor sij = make_tensor(sij_shapes, 2, DataType::FLOAT32);
Tensor pij_f16 = make_tensor(sij_shapes, 2, data_type);

PTOParam params_qk[] = {
make_input_param(qi),
make_input_param(kj),
make_output_param(sij),
};
pto2_rt_submit_task(rt, FUNC_QK_MATMUL, PTO2_WORKER_CUBE, params_qk, 3); // c1
PTOParam params_qk;
params_qk.add_input(qi);
params_qk.add_input(kj);
params_qk.add_output(sij);
pto2_rt_submit_aic_task(rt, FUNC_QK_MATMUL, params_qk); // c1

uint64_t sij_valid_shapes[2] = {q_tile, valid_len};
uint64_t sij_valid_offsets[2] = {0, 0};
uint32_t sij_valid_shapes[2] = {(uint32_t)q_tile, (uint32_t)valid_len};
uint32_t sij_valid_offsets[2] = {0, 0};
Tensor sij_valid = sij.view(sij_valid_shapes, sij_valid_offsets);
Tensor li = make_tensor(li_shapes, 1, DataType::FLOAT32);
Tensor mi = make_tensor(mi_shapes, 1, DataType::FLOAT32);
PTOParam params_sf[] = {
make_input_param(sij_valid),
make_scalar_param(float_to_u64(scale_value)),
make_output_param(pij_f16),
make_output_param(mi),
make_output_param(li),
};
pto2_rt_submit_task(rt, FUNC_SOFTMAX_PREPARE, PTO2_WORKER_VECTOR, params_sf, 5); // v1

uint64_t oi_tmp_shapes[2] = {q_tile, head_dim};
PTOParam params_sf;
params_sf.add_input(sij_valid);
params_sf.add_output(pij_f16);
params_sf.add_output(mi);
params_sf.add_output(li);
params_sf.add_scalar(float_to_u64(scale_value));
pto2_rt_submit_aiv_task(rt, FUNC_SOFTMAX_PREPARE, params_sf); // v1

uint32_t oi_tmp_shapes[2] = {(uint32_t)q_tile, (uint32_t)head_dim};
Tensor oi_tmp = make_tensor(oi_tmp_shapes, 2, DataType::FLOAT32);

PTOParam params_pv[] = {
make_input_param(pij_f16),
make_input_param(vj),
make_output_param(oi_tmp),
};
pto2_rt_submit_task(rt, FUNC_PV_MATMUL, PTO2_WORKER_CUBE, params_pv, 3); // c2
PTOParam params_pv;
params_pv.add_input(pij_f16);
params_pv.add_input(vj);
params_pv.add_output(oi_tmp);
pto2_rt_submit_aic_task(rt, FUNC_PV_MATMUL, params_pv); // c2

uint64_t is_first = (bn == 0) ? 1 : 0;
uint64_t is_last = (bn == bn_this_batch - 1) ? 1 : 0;

PTOParam params_up[] = {
make_input_param(mi),
make_input_param(li),
make_input_param(oi_tmp),
make_inout_param(mi_update),
make_inout_param(li_update),
make_inout_param(oi),
make_output_param(out_view),
make_scalar_param(is_first),
make_scalar_param(is_last),
};
pto2_rt_submit_task(rt, FUNC_ONLINE_UPDATE, PTO2_WORKER_VECTOR, params_up, 9); // v2
PTOParam params_up;
params_up.add_input(mi);
params_up.add_input(li);
params_up.add_input(oi_tmp);
params_up.add_inout(mi_update);
params_up.add_inout(li_update);
params_up.add_inout(oi);
params_up.add_output(out_view);
params_up.add_scalar(is_first);
params_up.add_scalar(is_last);
pto2_rt_submit_aiv_task(rt, FUNC_ONLINE_UPDATE, params_up); // v2
}
}
}
13 changes: 5 additions & 8 deletions src/a5/platform/include/aicore/performance_collector_aicore.h
@@ -27,23 +27,22 @@
* Writes performance metrics to the provided buffer. Buffer management
* and status tracking are handled by AICPU.
*
* AICore records task_id and timestamps only. AICPU fills func_id and
* core_type at completion time from TaskDescriptor.
*
* @param perf_buf Performance buffer pointer
* @param task_id Task ID
* @param func_id Function ID
* @param start_time Start timestamp
* @param end_time End timestamp
* @param kernel_ready_time Kernel ready timestamp
* @param core_type Core type (AIC/AIV)
*/
__aicore__ __attribute__((always_inline))
static inline void perf_aicore_record_task(
__gm__ PerfBuffer* perf_buf,
uint32_t task_id,
uint32_t func_id,
uint64_t start_time,
uint64_t end_time,
uint64_t kernel_ready_time,
CoreType core_type) {
uint64_t kernel_ready_time) {

// Read current buffer count
dcci(&perf_buf->count, SINGLE_CACHE_LINE);
@@ -55,13 +54,11 @@ static inline void perf_aicore_record_task(

__gm__ PerfRecord* record = &perf_buf->records[idx];

// Write record data
// Write record data (func_id and core_type filled by AICPU at completion)
record->start_time = start_time;
record->end_time = end_time;
record->kernel_ready_time = kernel_ready_time;
record->task_id = task_id;
record->func_id = func_id;
record->core_type = core_type;

perf_buf->count = idx + 1;

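A minimal caller-side sketch of the updated signature, shown only to illustrate the contract described above: the kernel wrapper and the timestamp helper read_sys_cnt() are hypothetical names, not taken from this PR. Note that func_id and core_type are intentionally absent from the call; AICPU fills them in from the TaskDescriptor when it observes completion.

// Hypothetical usage sketch (not part of this PR).
__aicore__ static inline void run_and_record(__gm__ PerfBuffer* perf_buf,
                                             uint32_t task_id,
                                             uint64_t kernel_ready_time) {
    uint64_t start_time = read_sys_cnt();   // hypothetical cycle-counter helper
    // ... execute the kernel body for this task ...
    uint64_t end_time = read_sys_cnt();
    // AICore records task_id and the three timestamps only.
    perf_aicore_record_task(perf_buf, task_id, start_time, end_time, kernel_ready_time);
}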
8 changes: 8 additions & 0 deletions src/a5/platform/include/aicpu/platform_aicpu_affinity.h
@@ -0,0 +1,8 @@
#pragma once
#include <cstdint>

// Returns true if this thread should call aicpu_execute().
// Returns false if this thread should exit (dropped).
// logical_count: desired active threads (from runtime.sche_cpu_num)
// total_launched: actual threads launched (PLATFORM_MAX_AICPU_THREADS_JUST_FOR_LAUNCH)
bool platform_aicpu_affinity_gate(int32_t logical_count, int32_t total_launched);
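One possible shape of this gate, given only to illustrate the declared contract: admit the first logical_count of total_launched threads and drop the rest. The atomic-counter admission scheme is an assumption, and the real implementation presumably also pins the admitted threads to CPU clusters, which is omitted here.

// Hypothetical sketch, not the actual implementation.
#include <algorithm>
#include <atomic>

bool platform_aicpu_affinity_gate(int32_t logical_count, int32_t total_launched) {
    static std::atomic<int32_t> admitted{0};
    int32_t active = std::min(logical_count, total_launched);
    int32_t slot = admitted.fetch_add(1, std::memory_order_relaxed);
    return slot < active;   // threads with slot >= active exit (are "dropped")
}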
4 changes: 3 additions & 1 deletion src/a5/platform/include/common/perf_profiling.h
@@ -75,9 +75,11 @@ struct PerfRecord {
uint64_t finish_time; // AICPU timestamp: when AICPU observed task completion (task_status back to 0)

// Task identification
uint32_t task_id; // Task unique identifier
uint32_t task_id; // Register dispatch id (per-core monotonic counter, NOT mixed_task_id).
// May collide across cores; use (ring_id, task_id, core_id) as unique key.
uint32_t func_id; // Kernel function identifier
CoreType core_type; // Core type (AIC/AIV)
uint8_t ring_id; // Ring layer (0 for single-ring / legacy)

// Dependency relationship (fanout only)
int32_t fanout[RUNTIME_MAX_FANOUT]; // Successor task ID array
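Since task_id alone can collide across cores, a consumer of these records could build the composite key the comment suggests. The packing below is only a sketch; core_id is assumed to be known from which core's buffer the record was read (it is not a PerfRecord field), and the bit widths are illustrative.

// Hypothetical helper: pack (ring_id, core_id, task_id) into one 64-bit key.
static inline uint64_t perf_record_key(uint8_t ring_id, uint32_t core_id, uint32_t task_id) {
    return (static_cast<uint64_t>(ring_id) << 56) |
           (static_cast<uint64_t>(core_id & 0xFFFFFFu) << 32) |
           static_cast<uint64_t>(task_id);
}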
8 changes: 8 additions & 0 deletions src/a5/platform/include/common/platform_config.h
@@ -40,6 +40,14 @@ constexpr int PLATFORM_AIV_CORES_PER_BLOCKDIM = 2;
*/
constexpr int PLATFORM_MAX_AICPU_THREADS = 7;

/**
* Maximum AICPU launch threads (physical)
* Upper bound for the number of AICPU threads that can be launched by Host.
* Can be larger than PLATFORM_MAX_AICPU_THREADS to allow threads to be dropped
* from scheduling while still participating in affinity (e.g. 6 launch, 4 active).
*/
constexpr int PLATFORM_MAX_AICPU_THREADS_JUST_FOR_LAUNCH = 6;

// =============================================================================
// Derived Platform Limits
// =============================================================================
13 changes: 10 additions & 3 deletions src/a5/platform/onboard/aicpu/kernel.cpp
@@ -2,11 +2,11 @@

#include "common/unified_log.h"
#include "common/kernel_args.h"
#include "common/platform_config.h"
#include "aicpu/device_log.h"
#include "aicpu/platform_regs.h"

// Forward declaration (no need for full runtime.h)
class Runtime;
#include "aicpu/platform_aicpu_affinity.h"
#include "runtime.h"

// Forward declaration of aicpu_execute (implemented in aicpu_executor.cpp)
extern "C" int aicpu_execute(Runtime *arg);
@@ -71,6 +71,13 @@ extern "C" __attribute__((visibility("default"))) int DynTileFwkBackendKernelSer
// Store platform regs before calling aicpu_execute
set_platform_regs(k_args->regs);

// Affinity gate: drop excess threads before entering runtime
if (!platform_aicpu_affinity_gate(runtime->sche_cpu_num,
PLATFORM_MAX_AICPU_THREADS_JUST_FOR_LAUNCH)) {
LOG_INFO("Thread dropped by cluster affinity");
return 0;
}

LOG_INFO("%s", "DynTileFwkBackendKernelServer: Calling aicpu_execute with Runtime");
int rc = aicpu_execute(runtime);
if (rc != 0) {