Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 41 additions & 13 deletions src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -351,18 +351,46 @@ struct AicpuExecutor {
Handshake* h = &hank[core_id];
uint64_t finish_ts = get_sys_cnt_aicpu();
PerfBuffer* perf_buf = (PerfBuffer*)h->perf_records_addr;
rmb();
uint32_t count = perf_buf->count;
if (count > 0) {
PerfRecord* record = &perf_buf->records[count - 1];
if (record->task_id == static_cast<uint32_t>(task_id)) {
// Fill metadata that AICore doesn't know
int32_t perf_slot_idx = static_cast<int32_t>(s_executing_subslot[core_id]);
record->func_id = slot_state.task->kernel_id[perf_slot_idx];
record->core_type = CT;
perf_aicpu_record_dispatch_and_finish_time(
record, dispatch_timestamps_[core_id], finish_ts);
PerfRecord* matched_record = nullptr;
uint32_t observed_count = 0;
constexpr int kPerfPatchRetries = 4;
constexpr int kPerfScanWindow = 4;
for (int retry = 0; retry < kPerfPatchRetries && matched_record == nullptr; retry++) {
rmb();
observed_count = perf_buf->count;
if (observed_count == 0) {
SPIN_WAIT_HINT();
continue;
}
if (observed_count > PLATFORM_PROF_BUFFER_SIZE) {
observed_count = PLATFORM_PROF_BUFFER_SIZE;
}
int32_t start_idx = static_cast<int32_t>(observed_count) - 1;
int32_t end_idx = start_idx - kPerfScanWindow + 1;
if (end_idx < 0) {
end_idx = 0;
}
Comment on lines +368 to +372

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The scan window's end_idx is currently computed in two steps: a subtraction followed by a conditional clamp to zero. The same bounds can be expressed in a single line with a ternary operator, which makes the intent easier to see at a glance:

                        const int32_t start_idx = static_cast<int32_t>(observed_count) - 1;
                        const int32_t end_idx = (observed_count > (uint32_t)kPerfScanWindow) ? (static_cast<int32_t>(observed_count) - kPerfScanWindow) : 0;

for (int32_t ridx = start_idx; ridx >= end_idx; ridx--) {
PerfRecord* candidate = &perf_buf->records[ridx];
if (candidate->task_id == static_cast<uint32_t>(task_id)) {
matched_record = candidate;
break;
}
}
if (matched_record == nullptr) {
SPIN_WAIT_HINT();
}
}
if (matched_record != nullptr) {
// Fill metadata that AICore doesn't know
int32_t perf_slot_idx = static_cast<int32_t>(s_executing_subslot[core_id]);
matched_record->func_id = slot_state.task->kernel_id[perf_slot_idx];
matched_record->core_type = CT;
perf_aicpu_record_dispatch_and_finish_time(
matched_record, dispatch_timestamps_[core_id], finish_ts);
} else {
DEV_WARN("Thread %d: failed to patch perf metadata for task %d on core %d (count=%u)",
thread_idx, task_id, core_id, observed_count);
}
#if PTO2_SCHED_PROFILING
sched_complete_perf_cycle += (get_sys_cnt_aicpu() - t_perf_start);
Expand Down Expand Up @@ -1789,8 +1817,8 @@ int32_t AicpuExecutor::run(Runtime* runtime) {
// Compute new core assignments for all threads and initialize donated slots
DEV_INFO("Thread %d: Set orchestrator_done=true, requesting core transition", thread_idx);
#if PTO2_PROFILING
// Benchmark: record orchestrator end timestamp before waiting for schedulers
DEV_ALWAYS("BENCHMARK: thread=%d end=%llu", thread_idx, (unsigned long long)get_sys_cnt_aicpu());
// Record orchestrator end timestamp before waiting for schedulers
DEV_ALWAYS("Thread=%d end=%llu", thread_idx, (unsigned long long)get_sys_cnt_aicpu());
#endif
transition_requested_.store(true, std::memory_order_release);

Expand Down
2 changes: 1 addition & 1 deletion tools/perf_to_mermaid.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,7 +208,7 @@ def main():
parser.add_argument('--style', choices=['detailed', 'compact'], default='detailed',
help='节点信息密度:detailed(详细,包含核心和时间)或 compact(紧凑,仅函数名)')
parser.add_argument('--direction', choices=['TD', 'LR'], default='TD',
help='流程图方向:TD(从上到下)或 LR(从左到右,默认)')
help='流程图方向:TD(从上到下,默认)或 LR(从左到右)')
parser.add_argument('-v', '--verbose', action='store_true', help='详细输出')

args = parser.parse_args()
Expand Down
Loading