From ca4151718c707ff5a05c805588156d11310ba8b9 Mon Sep 17 00:00:00 2001 From: zhusy54 Date: Wed, 18 Mar 2026 18:33:17 +0800 Subject: [PATCH 1/2] perf(rt2): precompute dispatch desc and use flat byte offset register encoding Replace structured (slot_in_ring, ring_id, slot_idx) register encoding with a flat byte offset scheme. The new layout uses 28 bits for offset_field, enabling ~2GB of addressable dispatch descriptors and eliminating independent bit-width constraints on ring depth and task window size. Add overflow safety check in dispatch_subtask_to_core that triggers emergency_shutdown with PTO2_ERROR_ENCODING_OVERFLOW on sentinel collision. --- .../aicore/aicore_executor.cpp | 85 ++++++--- .../aicpu/aicpu_executor.cpp | 172 ++++++++++-------- .../tensormap_and_ringbuffer/build_config.py | 2 +- .../runtime/pto2_dispatch_payload.h | 124 +++++++++++-- .../runtime/pto_orchestrator.cpp | 10 +- .../runtime/pto_orchestrator.h | 4 + .../runtime/pto_runtime2_types.h | 57 ++++-- .../runtime/pto_shared_memory.h | 5 + .../runtime/runtime.h | 10 +- 9 files changed, 339 insertions(+), 130 deletions(-) diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/aicore/aicore_executor.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/aicore/aicore_executor.cpp index 860c91c2..12c2e706 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/aicore/aicore_executor.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/aicore/aicore_executor.cpp @@ -14,21 +14,28 @@ typedef void (*UnifiedKernelFunc)(__gm__ int64_t*); /** - * Execute task from PTO2DispatchPayload. + * Execute task from PTO2DispatchDesc. * - * Reads function_bin_addr and args from the dispatch payload. + * Reads function_bin_addrs[slot_idx] and args from the dispatch descriptor. + * The descriptor is pre-built by the Orchestrator at submit time, so this + * function performs no address computation—just a function pointer call. 
* - * @param payload Pointer to PTO2DispatchPayload in global memory + * @param desc Pointer to PTO2DispatchDesc in global memory + * @param slot_idx Subtask slot index (0=AIC, 1=AIV0, 2=AIV1) */ __aicore__ __attribute__((always_inline)) static void execute_task( - __gm__ PTO2DispatchPayload* payload + __gm__ PTO2DispatchDesc* desc, uint32_t slot_idx ) { - if (payload == nullptr || payload->function_bin_addr == 0) { + if (desc == nullptr) { + return; + } + uint64_t func_addr = desc->function_bin_addrs[slot_idx]; + if (func_addr == 0) { return; } - UnifiedKernelFunc kernel = (UnifiedKernelFunc)payload->function_bin_addr; - kernel(reinterpret_cast<__gm__ int64_t*>(payload->args)); + UnifiedKernelFunc kernel = (UnifiedKernelFunc)func_addr; + kernel(reinterpret_cast<__gm__ int64_t*>(desc->args)); FULL_MEMORY_BARRIER(); } @@ -38,10 +45,15 @@ __aicore__ __attribute__((always_inline)) static void execute_task( * Implements the AICPU-AICore register-based dispatch protocol: * 1. Wait for AICPU ready signal via handshake buffer * 2. Report physical core ID and core type, signal AICore ready - * 3. Poll DATA_MAIN_BASE register for task dispatch until exit signal + * 3. Read PTO2DispatchInitInfo from hank->task (one-shot, wait for non-zero) + * 4. Poll DATA_MAIN_BASE register for task dispatch until exit signal * - * Task dispatch reads PTO2DispatchPayload address from Handshake.task. - * Task ID is derived from the register value (task_id + 1 encoding). 
+ * Register encoding (set by AICPU scheduler) — see pto2_dispatch_payload.h: + * bit [30] = toggle bit (alternates per core, ignored during decode) + * bits [29:2] = offset_field = (desc_byte_offset >> 3) + 1 (28 bits, 0 = idle) + * bits [1:0] = slot_idx (2 bits: 0=AIC, 1=AIV0, 2=AIV1) + * + * Dispatch desc address = dispatch_base + decoded byte offset * * @param runtime Pointer to Runtime in global memory * @param block_idx Block index (core ID) @@ -72,15 +84,32 @@ __aicore__ __attribute__((weak)) void aicore_execute(__gm__ Runtime* runtime, in dcci(my_hank, SINGLE_CACHE_LINE, CACHELINE_OUT); - // Cache payload address (set once by AICPU during initialization, never changes) - __gm__ PTO2DispatchPayload* payload = - reinterpret_cast<__gm__ PTO2DispatchPayload*>(my_hank->task); + // Phase 3.5: Cache dispatch init info from AICPU. + // + // Why this wait is necessary: + // hank->task is set to 0 during handshake (Phase 1) because + // PTO2DispatchInitInfo requires shared memory addresses that don't + // exist yet — PTO2Runtime is created AFTER handshake completes. + // AICPU writes &init_info to hank->task once PTO2Runtime is ready. + // dcci is needed because hank->task lives in GM; without cache + // invalidation, AICore would keep reading a stale cached zero. + while (my_hank->task == 0) { + dcci(my_hank, SINGLE_CACHE_LINE); + } + __gm__ PTO2DispatchInitInfo* init_info = + reinterpret_cast<__gm__ PTO2DispatchInitInfo*>(my_hank->task); + // init_info points to a separate GM object — invalidate its cache line + // so we read the values AICPU wrote, not stale data. 
+ dcci(init_info, SINGLE_CACHE_LINE); + + uint64_t dispatch_base = init_info->dispatch_base; + my_hank->task = 0; // Clear after reading (no longer needed) + dcci(my_hank, SINGLE_CACHE_LINE, CACHELINE_OUT); bool profiling_enabled = runtime->enable_profiling; uint64_t kernel_ready_time = get_sys_cnt_aicore(); // Phase 4: Main execution loop - poll register for tasks until exit signal - // Register encoding: AICPU_IDLE_TASK_ID=idle, task_id=task, AICORE_EXIT_SIGNAL=exit uint32_t reg_val = AICPU_IDLE_TASK_ID; uint32_t last_reg_val = AICPU_IDLE_TASK_ID; @@ -92,37 +121,45 @@ __aicore__ __attribute__((weak)) void aicore_execute(__gm__ Runtime* runtime, in break; } - // Execute task if new (reg_val encoding: AICPU_IDLE_TASK_ID=idle, task_id=task) - if (reg_val == AICPU_IDLE_TASK_ID || reg_val == last_reg_val) { + // Skip idle (0 or AICPU_IDLE_TASK_ID) or duplicate dispatch + if (reg_val == 0 || reg_val == AICPU_IDLE_TASK_ID || reg_val == last_reg_val) { SPIN_WAIT_HINT(); continue; } { - uint32_t task_id = reg_val; // Decode: register holds task_id directly + // Decode register value using named constants from pto2_dispatch_payload.h. + // Inline decode instead of calling pto2_reg_decode_*() because ccec does not + // allow [aicore] code to call [host]-annotated functions. 
+ uint32_t offset_field = (reg_val >> PTO2_REG_OFFSET_SHIFT) & PTO2_REG_OFFSET_MASK; + uint64_t desc_byte_offset = static_cast(offset_field - 1) << PTO2_REG_ALIGN_SHIFT; + uint32_t slot_idx = reg_val & PTO2_REG_SLOTIDX_MASK; + + // Compute dispatch descriptor address from cached base + decoded offset + __gm__ PTO2DispatchDesc* desc = reinterpret_cast<__gm__ PTO2DispatchDesc*>( + dispatch_base + desc_byte_offset); - // Invalidate payload buffer (AICPU updates its content each dispatch) - dcci(payload, ENTIRE_DATA_CACHE); + // Invalidate data cache to ensure fresh read of dispatch descriptor + dcci(desc, ENTIRE_DATA_CACHE); - write_reg(RegId::COND, MAKE_ACK_VALUE(task_id)); + write_reg(RegId::COND, MAKE_ACK_VALUE(reg_val)); // Performance profiling: record start time uint64_t start_time = get_sys_cnt_aicore(); // Execute the task - execute_task(payload); + execute_task(desc, slot_idx); // Performance profiling: record task execution - // (func_id and core_type are filled by AICPU at completion time) if (profiling_enabled) { uint64_t end_time = get_sys_cnt_aicore(); __gm__ PerfBuffer* perf_buf = (__gm__ PerfBuffer*)my_hank->perf_records_addr; - perf_aicore_record_task(perf_buf, task_id, + perf_aicore_record_task(perf_buf, reg_val, start_time, end_time, kernel_ready_time); } last_reg_val = reg_val; - write_reg(RegId::COND, MAKE_FIN_VALUE(task_id)); + write_reg(RegId::COND, MAKE_FIN_VALUE(reg_val)); } } diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp index ad2b098d..302e7787 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp @@ -1,5 +1,6 @@ #include #include +#include #include #include #include @@ -69,9 +70,6 @@ constexpr int32_t PROGRESS_LOG_INTERVAL = 250; // log every N completions a static PTO2Runtime *rt{nullptr}; -// Per-core dispatch payload storage (one per 
physical core) -static PTO2DispatchPayload s_pto2_payload_per_core[RUNTIME_MAX_WORKER]; - // Core information for discovery (with register address for fast dispatch) struct CoreInfo { int32_t worker_id; // Index in runtime.workers[] @@ -175,12 +173,6 @@ struct AicpuExecutor { uint64_t core_id_to_reg_addr_[MAX_CORES_PER_THREAD]; // Per-core monotonic dispatch counter for register protocol uniqueness. - // Multi-ring task_ids can collide in the lower 32 bits (e.g., ring 0 local 0 - // and ring 1 local 0 both truncate to 0), breaking the AICore's last_reg_val - // duplicate detection and causing false-positive COND completion. A per-core - // counter guarantees each dispatch writes a unique DATA_MAIN_BASE value. - uint32_t dispatch_seq_by_core_[RUNTIME_MAX_WORKER]{}; - // Per-core subtask slot tracking (which PTO2SubtaskSlot is running on each core) PTO2SubtaskSlot executing_subslot_by_core_[RUNTIME_MAX_WORKER]{}; @@ -190,10 +182,17 @@ struct AicpuExecutor { // Platform register base address array (set via get_platform_regs()) uint64_t regs_{0}; - // Track executing register task_id per core (AICPU_TASK_INVALID = idle). - // NOTE: this is NOT the mixed_task_id; it is the per-core dispatch id used by the - // register protocol (derived from dispatch_seq_by_core_ and masked by TASK_ID_MASK). + // Track executing register value per core (0 = idle). + // The register value encodes a flat byte offset + slot_idx + toggle bit. int32_t executing_reg_task_ids_[MAX_AICPU_THREADS][MAX_CORES_PER_THREAD]; + // Per-core toggle bit to guarantee consecutive dispatches differ in reg_val. + // Without this, ring buffer wrap-around could produce identical encodings. + uint32_t dispatch_toggle_by_core_[RUNTIME_MAX_WORKER]{}; + // Dispatch base address: address of ring 0's first PTO2DispatchDesc. + // Used to compute byte offsets for register encoding. + uint64_t dispatch_base_{0}; + // Shared memory header for error reporting during dispatch. 
+ PTO2SharedMemoryHeader* sm_header_{nullptr}; CoreStateTracker trackers_[MAX_AICPU_THREADS]; // ===== Task queue state (managed by scheduler ready queues) ===== @@ -247,24 +246,6 @@ struct AicpuExecutor { void diagnose_stuck_state( Runtime* runtime, int32_t thread_idx, const int32_t* cur_thread_cores, int32_t core_num, Handshake* hank); - // Build slim PTO2DispatchPayload: only function_bin_addr + args. - // Metadata (mixed_task_id, subslot, kernel_id, core_type) stays in TaskDescriptor. - // Dispatch order: tensor args first, then scalar args. - void build_pto2_payload(PTO2DispatchPayload& out, - int32_t kernel_id, - PTO2TaskPayload& task_pl) { - out.function_bin_addr = get_function_bin_addr(kernel_id); - int32_t n = 0; - for (int32_t i = 0; i < task_pl.tensor_count; i++) { - task_pl.tensors[i].update_start_offset(); - out.args[n++] = reinterpret_cast(&task_pl.tensors[i]); - } - for (int32_t i = 0; i < task_pl.scalar_count; i++) { - out.args[n++] = task_pl.scalars[i]; - } - } - - // Template methods for Phase 1 and Phase 2 template void check_running_cores_for_completion(int32_t thread_idx, CoreTypeTracker& ct, @@ -492,7 +473,11 @@ struct AicpuExecutor { return slot_state; } - void dispatch_subtask_to_core( + /** + * Dispatch a subtask to an AICore and encode the register value. + * Returns true on success, false on fatal encoding overflow (emergency shutdown initiated). 
+ */ + bool dispatch_subtask_to_core( Runtime* runtime, CoreStateTracker& tracker, int32_t* executing_reg_task_ids, int32_t core_id, CoreType core_type, PTO2TaskSlotState& slot_state, PTO2SubtaskSlot subslot @@ -500,10 +485,30 @@ struct AicpuExecutor { , bool profiling_enabled, int32_t thread_idx #endif ) { - PTO2DispatchPayload& payload = s_pto2_payload_per_core[core_id]; - PTO2TaskDescriptor& task = *slot_state.task; - int32_t slot_idx = static_cast(subslot); - build_pto2_payload(payload, task.kernel_id[slot_idx], *slot_state.payload); + // Compute flat byte offset of dispatch descriptor from global base. + uint64_t desc_addr = reinterpret_cast(&slot_state.payload->dispatch); + uint64_t desc_byte_offset = desc_addr - dispatch_base_; + uint32_t offset_field = static_cast((desc_byte_offset >> PTO2_REG_ALIGN_SHIFT) + 1); + + // Overflow check: offset_field must not exceed sentinel-safe upper bound. + // If it does, the encoded register value would collide with AICORE_EXIT_SIGNAL + // or AICORE_IDLE_TASK_ID, causing AICore to misinterpret the dispatch. + if (offset_field > PTO2_REG_MAX_OFFSET_FIELD) { + DEV_ERROR("PTO2 REG ENCODING OVERFLOW: offset_field=0x%x exceeds max 0x%x. " + "desc_byte_offset=0x%llx ring_id=%u slot_in_ring=%u core=%d subslot=%u. 
" + "Reduce task_window_size or shared memory size.", + offset_field, PTO2_REG_MAX_OFFSET_FIELD, + (unsigned long long)desc_byte_offset, + (unsigned)slot_state.ring_id, (unsigned)slot_state.slot_in_ring, + core_id, (unsigned)static_cast(subslot)); + if (sm_header_) { + sm_header_->sched_error_code.store(PTO2_ERROR_ENCODING_OVERFLOW, std::memory_order_release); + } + emergency_shutdown(runtime); + completed_.store(true, std::memory_order_release); + return false; + } + executing_subslot_by_core_[core_id] = subslot; executing_slot_state_by_core_[core_id] = &slot_state; #if PTO2_PROFILING @@ -516,28 +521,22 @@ struct AicpuExecutor { core_dispatch_counts_[core_id]++; } #endif - // Per-core monotonic counter for register protocol uniqueness. - // mixed_task_id encodes (ring_id << 32 | local_id); truncation to - // uint32 loses ring_id, so tasks from different rings with the same - // local_id would write identical DATA_MAIN_BASE values. The AICore - // uses last_reg_val to detect new dispatches and would skip the - // duplicate, while the stale COND register from the previous task - // (same local_id) would cause a false-positive completion. - dispatch_seq_by_core_[core_id]++; - uint32_t reg_task_id = dispatch_seq_by_core_[core_id] & TASK_ID_MASK; - // Skip reserved sentinel values - while (reg_task_id == AICORE_IDLE_TASK_ID || - (reg_task_id + 1) == AICORE_EXIT_SIGNAL) { - dispatch_seq_by_core_[core_id]++; - reg_task_id = dispatch_seq_by_core_[core_id] & TASK_ID_MASK; - } - write_reg(core_id_to_reg_addr_[core_id], RegId::DATA_MAIN_BASE, static_cast(reg_task_id)); + // Encode flat byte offset + slot_idx into register value. + // Toggle bit ensures consecutive dispatches to the same core always differ, + // preventing the AICore last_reg_val duplicate check from skipping a dispatch + // when the ring buffer wraps and the same slot is reused for the same core. 
+ uint32_t slot_idx = static_cast(subslot); + dispatch_toggle_by_core_[core_id] ^= (1u << PTO2_REG_TOGGLE_BIT); + uint32_t reg_val = pto2_reg_encode( + desc_byte_offset, slot_idx, dispatch_toggle_by_core_[core_id]); + write_reg(core_id_to_reg_addr_[core_id], RegId::DATA_MAIN_BASE, static_cast(reg_val)); + executing_reg_task_ids[core_id] = static_cast(reg_val); CoreTypeTracker& ct = tracker.by_type[static_cast(core_type)]; int32_t idle_idx = ct.find_idle_index(core_id); ct.move_idle_to_running(idle_idx); tracker.core_idle[core_id] = false; - executing_reg_task_ids[core_id] = reg_task_id; + return true; } }; @@ -564,10 +563,10 @@ int32_t AicpuExecutor::handshake_all_cores(Runtime* runtime) { DEV_INFO("Handshaking with %d cores", cores_total_num_); - // Step 1: Write per-core payload addresses and send handshake signal - // task must be written BEFORE aicpu_ready so AICore sees it after waking up + // Step 1: Send handshake signal (task=0 initially; dispatch init info is set later + // after PTO2Runtime is created, see pto2_runtime_create_from_sm path) for (int32_t i = 0; i < cores_total_num_; i++) { - all_handshakes[i].task = reinterpret_cast(&s_pto2_payload_per_core[i]); + all_handshakes[i].task = 0; all_handshakes[i].aicpu_ready = 1; } @@ -871,10 +870,11 @@ int32_t AicpuExecutor::init(Runtime* runtime) { } // Clear per-core dispatch payloads and subslot tracking - memset(s_pto2_payload_per_core, 0, sizeof(s_pto2_payload_per_core)); - memset(dispatch_seq_by_core_, 0, sizeof(dispatch_seq_by_core_)); memset(executing_subslot_by_core_, 0, sizeof(executing_subslot_by_core_)); memset(executing_slot_state_by_core_, 0, sizeof(executing_slot_state_by_core_)); + memset(dispatch_toggle_by_core_, 0, sizeof(dispatch_toggle_by_core_)); + dispatch_base_ = 0; + sm_header_ = nullptr; DEV_INFO("Init: PTO2 mode, task count from shared memory"); @@ -1154,29 +1154,29 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime* runtime, int32_t threa ResourceCount rc = 
shape_resource_count(shape); if (rc.aic) { - dispatch_subtask_to_core(runtime, tracker, executing_reg_task_ids, + if (!dispatch_subtask_to_core(runtime, tracker, executing_reg_task_ids, c.aic_core_id, CoreType::AIC, *slot_state, PTO2SubtaskSlot::AIC #if PTO2_PROFILING , profiling_enabled, thread_idx #endif - ); + )) break; } if (rc.aiv >= 1) { int32_t aiv0 = tracker.core_idle[c.aiv_core_ids[0]] ? c.aiv_core_ids[0] : c.aiv_core_ids[1]; - dispatch_subtask_to_core(runtime, tracker, executing_reg_task_ids, + if (!dispatch_subtask_to_core(runtime, tracker, executing_reg_task_ids, aiv0, CoreType::AIV, *slot_state, PTO2SubtaskSlot::AIV0 #if PTO2_PROFILING , profiling_enabled, thread_idx #endif - ); + )) break; } if (rc.aiv >= 2) { - dispatch_subtask_to_core(runtime, tracker, executing_reg_task_ids, + if (!dispatch_subtask_to_core(runtime, tracker, executing_reg_task_ids, c.aiv_core_ids[1], CoreType::AIV, *slot_state, PTO2SubtaskSlot::AIV1 #if PTO2_PROFILING , profiling_enabled, thread_idx #endif - ); + )) break; } #if PTO2_PROFILING phase_dispatch_count++; @@ -1236,30 +1236,30 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime* runtime, int32_t threa ResourceCount rc = shape_resource_count(shape); if (rc.aic) { - dispatch_subtask_to_core(runtime, tracker, executing_reg_task_ids, + if (!dispatch_subtask_to_core(runtime, tracker, executing_reg_task_ids, c.aic_core_id, CoreType::AIC, *slot_state, PTO2SubtaskSlot::AIC #if PTO2_PROFILING , profiling_enabled, thread_idx #endif - ); + )) break; } if (rc.aiv >= 1) { int32_t aiv_id = tracker.core_idle[c.aiv_core_ids[0]] ? 
c.aiv_core_ids[0] : c.aiv_core_ids[1]; - dispatch_subtask_to_core(runtime, tracker, executing_reg_task_ids, + if (!dispatch_subtask_to_core(runtime, tracker, executing_reg_task_ids, aiv_id, CoreType::AIV, *slot_state, PTO2SubtaskSlot::AIV0 #if PTO2_PROFILING , profiling_enabled, thread_idx #endif - ); + )) break; } if (rc.aiv >= 2) { - dispatch_subtask_to_core(runtime, tracker, executing_reg_task_ids, + if (!dispatch_subtask_to_core(runtime, tracker, executing_reg_task_ids, c.aiv_core_ids[1], CoreType::AIV, *slot_state, PTO2SubtaskSlot::AIV1 #if PTO2_PROFILING , profiling_enabled, thread_idx #endif - ); + )) break; } made_progress = true; #if PTO2_SCHED_PROFILING @@ -1716,6 +1716,33 @@ int32_t AicpuExecutor::run(Runtime* runtime) { } #endif + // Pass kernel address mapping to orchestrators for dispatch descriptor construction + for (int i = 0; i < orch_thread_num_; i++) { + rt->orchestrators[i].func_id_to_addr = func_id_to_addr_; + rt->orchestrators[i].func_id_count = RUNTIME_MAX_FUNC_ID; + } + + // Build dispatch init info in shared memory (GM) for AICore. + // IMPORTANT: dispatch_init_info MUST live in GM because AICore cannot + // access AICPU-local memory on hardware. The shared memory header is + // allocated in GM, so sm_handle->header->dispatch_init_info is accessible. + // AICore caches dispatch_base at startup, then computes dispatch desc + // address as: dispatch_base + decoded_byte_offset + PTO2DispatchInitInfo& init_info = sm_handle->header->dispatch_init_info; + dispatch_base_ = reinterpret_cast(sm_handle->task_payloads[0]) + + offsetof(PTO2TaskPayload, dispatch); + init_info.dispatch_base = dispatch_base_; + sm_header_ = sm_handle->header; + + // Publish dispatch init info to all AICore handshakes. + // AICore waits for hank->task != 0 before reading. 
+ { + Handshake* hank = static_cast(runtime->workers); + for (int32_t i = 0; i < cores_total_num_; i++) { + hank[i].task = reinterpret_cast(&init_info); + } + } + // With multi-ring, slot_states are per-ring inside the scheduler. // Fanout fill-in in complete_perf_records is disabled (slot_states_ptr = nullptr). runtime->set_pto2_slot_states_ptr(nullptr); @@ -2022,11 +2049,12 @@ void AicpuExecutor::deinit(Runtime* runtime) { core_dispatch_counts_[i] = 0; } - // Clear per-core dispatch payloads and subslot tracking - memset(s_pto2_payload_per_core, 0, sizeof(s_pto2_payload_per_core)); - memset(dispatch_seq_by_core_, 0, sizeof(dispatch_seq_by_core_)); + // Clear per-core subslot tracking memset(executing_subslot_by_core_, 0, sizeof(executing_subslot_by_core_)); memset(executing_slot_state_by_core_, 0, sizeof(executing_slot_state_by_core_)); + memset(dispatch_toggle_by_core_, 0, sizeof(dispatch_toggle_by_core_)); + dispatch_base_ = 0; + sm_header_ = nullptr; completed_tasks_.store(0, std::memory_order_release); total_tasks_ = 0; diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/build_config.py b/src/a2a3/runtime/tensormap_and_ringbuffer/build_config.py index 85a841ca..d5321cb3 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/build_config.py +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/build_config.py @@ -4,7 +4,7 @@ # This is a device-orchestration runtime where: # - AICPU thread 3 runs the orchestrator (builds task graph on device) # - AICPU threads 0/1/2 run schedulers (dispatch tasks to AICore) -# - AICore executes tasks via PTO2DispatchPayload +# - AICore executes tasks via PTO2DispatchDesc (precomputed by Orchestrator) # # The "orchestration" directory contains source files compiled into both # runtime targets AND the orchestration .so (e.g., tensor methods needed diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto2_dispatch_payload.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto2_dispatch_payload.h index 94f2da37..d8cd9f94 
100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto2_dispatch_payload.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto2_dispatch_payload.h @@ -1,13 +1,17 @@ /** * @file pto2_dispatch_payload.h - * @brief Minimal dispatch payload for AICore kernel execution + * @brief Dispatch descriptor for AICore kernel execution * - * Shared between AICPU (builds in-place) and AICore (reads to run kernel). - * Handshake.task points to PTO2DispatchPayload embedded in PTO2TaskPayload. + * PTO2DispatchDesc is embedded in PTO2TaskPayload and built by the Orchestrator + * at submit time. It contains per-slot function addresses and a unified args[] + * array (tensor pointers + scalar values). The Scheduler encodes a flat byte + * offset (from a single dispatch base address) into the DATA_MAIN_BASE register; + * AICore decodes the offset and adds it to a cached base to get the descriptor + * address, achieving zero GM access on the scheduler hot path. * - * Only contains fields AICore needs to execute: function address + arguments. - * Metadata (task_id, kernel_id, core_type) lives in PTO2TaskDescriptor and - * is accessed by AICPU when needed (profiling, diagnostics). + * PTO2DispatchInitInfo is a one-shot initialization struct passed to AICore + * via Handshake.task during startup. It provides the single dispatch base + * address needed for address computation. 
*/ #ifndef RT2_PTO2_DISPATCH_PAYLOAD_H_ @@ -15,19 +19,115 @@ #include +#include "pto_submit_types.h" + /** Max arguments per task; must match RUNTIME_MAX_ARGS and PTO2_MAX_OUTPUTS */ #ifndef PTO2_DISPATCH_MAX_ARGS #define PTO2_DISPATCH_MAX_ARGS 128 #endif +// ============================================================================= +// Register Encoding Constants (Flat Byte Offset Scheme) +// ============================================================================= + +/** + * DATA_MAIN_BASE register bit layout (AICPU → AICore): + * + * [31] unused (COND register uses bit 31 for ACK/FIN state) + * [30] toggle bit (alternates per core to guarantee uniqueness) + * [29:2] offset_field = (desc_byte_offset >> 3) + 1 (28 bits) + * +1 reserves 0 for idle; >>3 because PTO2DispatchDesc is 8-byte aligned + * [1:0] slot_idx (2 bits: 0=AIC, 1=AIV0, 2=AIV1) + * + * Max safe offset_field = 0x0FFFFFFB (avoid sentinel collision when toggle=1). + * Max encodable byte offset ≈ 2GB — well beyond any practical shared memory size. + * + * Sentinel values (must never be produced by encoding): + * AICORE_EXIT_SIGNAL = 0x7FFFFFF0 + * AICPU_IDLE_TASK_ID = 0x7FFFFFFD + * AICORE_IDLE_TASK_ID = 0x7FFFFFFF + */ +constexpr uint32_t PTO2_REG_SLOTIDX_MASK = 0x3; +constexpr uint32_t PTO2_REG_OFFSET_SHIFT = 2; +constexpr uint32_t PTO2_REG_OFFSET_BITS = 28; +constexpr uint32_t PTO2_REG_OFFSET_MASK = (1u << PTO2_REG_OFFSET_BITS) - 1; // 0x0FFFFFFF +constexpr uint32_t PTO2_REG_TOGGLE_BIT = 30; +constexpr uint32_t PTO2_REG_ALIGN_SHIFT = 3; // PTO2DispatchDesc is 8-byte aligned +constexpr uint32_t PTO2_REG_MAX_OFFSET_FIELD = 0x0FFFFFFBu; // sentinel-safe upper bound + /** - * Dispatch payload: minimal execution interface for AICore. - * Layout: function_bin_addr followed by args[]. - * AICore reads function_bin_addr, casts to UnifiedKernelFunc, calls with args. + * Encode dispatch info into a 32-bit register value. 
+ * + * @param desc_byte_offset Byte offset of PTO2DispatchDesc from dispatch_base (must be 8-byte aligned) + * @param slot_idx Subtask slot (0=AIC, 1=AIV0, 2=AIV1) + * @param toggle Toggle bit value (bit 30, alternated per dispatch per core) + */ +static inline uint32_t pto2_reg_encode(uint64_t desc_byte_offset, + uint32_t slot_idx, uint32_t toggle) { + uint32_t offset_field = static_cast((desc_byte_offset >> PTO2_REG_ALIGN_SHIFT) + 1); + return (offset_field << PTO2_REG_OFFSET_SHIFT) + | slot_idx + | toggle; +} + +/** + * Decode desc byte offset from register value. + * Returns the byte offset to add to dispatch_base to get the PTO2DispatchDesc address. + */ +static inline uint64_t pto2_reg_decode_offset(uint32_t reg_val) { + uint32_t offset_field = (reg_val >> PTO2_REG_OFFSET_SHIFT) & PTO2_REG_OFFSET_MASK; + return static_cast(offset_field - 1) << PTO2_REG_ALIGN_SHIFT; +} + +/** Decode slot_idx (subtask slot: 0=AIC, 1=AIV0, 2=AIV1) from register value. */ +static inline uint32_t pto2_reg_decode_slotidx(uint32_t reg_val) { + return reg_val & PTO2_REG_SLOTIDX_MASK; +} + +// ============================================================================= +// Dispatch Descriptor +// ============================================================================= + +/** + * Dispatch descriptor: execution interface for AICore. + * + * Layout: per-slot function_bin_addrs[] followed by unified args[]. + * AICore reads function_bin_addrs[slot_idx], casts to UnifiedKernelFunc, + * and calls with args (tensor GM pointers followed by scalar values). + * + * Built once by the Orchestrator during submit_mixed_task(); the Scheduler + * never touches it — it only writes a register-encoded value. 
+ */ +struct PTO2DispatchDesc { + /** Per-slot kernel entry addresses in GM (AIC, AIV0, AIV1); 0 = inactive */ + uint64_t function_bin_addrs[PTO2_SUBTASK_SLOT_COUNT]; + /** Kernel arguments: tensor GM pointers first, then scalar values */ + uint64_t args[PTO2_DISPATCH_MAX_ARGS]; +}; + +// ============================================================================= +// Dispatch Initialization Info +// ============================================================================= + +/** + * One-shot initialization info passed from AICPU to AICore. + * + * Lifecycle: + * 1. AICPU sets Handshake.task = 0 during handshake (AICore not yet ready) + * 2. AICPU creates PTO2Runtime, computes dispatch_base from shared memory + * 3. AICPU writes Handshake.task = &init_info for all cores + * 4. AICore waits for Handshake.task != 0, reads and caches dispatch_base + * 5. AICore clears Handshake.task = 0 (init_info no longer needed) + * + * During the main loop, AICore computes the dispatch desc address as: + * dispatch_base + pto2_reg_decode_offset(reg_val) + * + * This avoids any GM read on the scheduler hot path — the scheduler only + * writes a register-encoded byte offset value. 
*/ -struct PTO2DispatchPayload { - uint64_t function_bin_addr; /**< Kernel entry in GM: (UnifiedKernelFunc)function_bin_addr */ - uint64_t args[PTO2_DISPATCH_MAX_ARGS]; /**< Kernel arguments (GM pointers + scalars) */ +struct PTO2DispatchInitInfo { + /** Base address of the first PTO2DispatchDesc (ring 0, slot 0) */ + uint64_t dispatch_base; }; #endif // RT2_PTO2_DISPATCH_PAYLOAD_H_ diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp index d62e0f9a..c7bedfd9 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp @@ -353,8 +353,9 @@ void pto2_submit_mixed_task( __builtin_prefetch(&payload->tensors[i], 1, 3); __builtin_prefetch(reinterpret_cast(&payload->tensors[i]) + 64, 1, 3); } - for (int32_t i = 0; i < params.scalar_count; i += 8) { - __builtin_prefetch(&payload->scalars[i], 1, 3); + // Prefetch dispatch.args[] area (scalar values written directly here) + for (int32_t j = 0; j < params.scalar_count; j += 8) { + __builtin_prefetch(&payload->dispatch.args[params.tensor_count + j], 1, 3); } __builtin_prefetch(payload, 1, 3); __builtin_prefetch(reinterpret_cast(payload) + 64, 1, 3); @@ -375,7 +376,8 @@ void pto2_submit_mixed_task( slot_state.task = &task; slot_state.active_mask = active_mask; slot_state.subtask_done_mask.store(0, std::memory_order_relaxed); - slot_state.ring_id = ring_id; + slot_state.ring_id = static_cast(ring_id); + slot_state.slot_in_ring = static_cast(slot); scope_tasks_push(orch, &slot_state); } else { scope_tasks_push(orch, nullptr); @@ -519,7 +521,7 @@ void pto2_submit_mixed_task( } } - payload->init(params); + payload->init(params, orch->func_id_to_addr, orch->func_id_count, task.kernel_id); CYCLE_COUNT_LAP_RECORD(g_orch_params_cycle, AicpuPhaseId::ORCH_PARAMS, local_id); #if PTO2_ORCH_PROFILING diff --git 
a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h index a2d4898d..e778f7d1 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h @@ -69,6 +69,10 @@ struct PTO2OrchestratorState { void* gm_heap_base; // Base address of GM heap uint64_t gm_heap_size; // Total size of GM heap (all rings) + // === FUNCTION ADDRESS MAPPING (for dispatch descriptor construction) === + const uint64_t* func_id_to_addr; // Kernel ID → GM function address (points to Runtime::func_id_to_addr_[]) + int32_t func_id_count; // Number of entries in func_id_to_addr (for bounds check) + // === FATAL ERROR === // Fatal error flag (single-thread access by orchestrator, no atomic needed) // Cross-thread notification uses shared memory orch_error_code (atomic) diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h index 141be544..ce1c0b60 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h @@ -21,6 +21,7 @@ #include "pto_types.h" #include "pto_submit_types.h" +#include "pto2_dispatch_payload.h" // ============================================================================= // Profiling Configuration @@ -68,6 +69,7 @@ // Scheduler errors (100+): detected in scheduler threads #define PTO2_ERROR_SCHEDULER_TIMEOUT 100 +#define PTO2_ERROR_ENCODING_OVERFLOW 101 // ============================================================================= // Configuration Constants @@ -358,9 +360,14 @@ struct PTO2TaskDescriptor { * Task payload data (cold path - only accessed during orchestration and dispatch) * * Layout: metadata (counts, fanin pointers) packed in the first 3 cache lines, - * followed by bulk tensor and 
scalar data. This gives sequential write access - * during orchestration and groups scheduler-hot fields (fanin_actual_count + - * fanin_slot_states) together for on_task_release. + * followed by the dispatch descriptor and bulk tensor data. Scalar values are + * written directly into dispatch.args[] (after tensor pointers), eliminating the + * separate scalars[] array. + * + * The Scheduler never reads this struct — it only encodes a flat byte offset + * and slot_idx into the DATA_MAIN_BASE register. AICore computes the dispatch + * desc address from a cached base address and reads dispatch.function_bin_addrs[] + * + dispatch.args[] directly. */ struct PTO2TaskPayload { // === Cache line 0 (64B) — metadata === @@ -369,23 +376,48 @@ struct PTO2TaskPayload { int32_t fanin_actual_count{0}; // Actual fanin count (without the +1 redundance) int32_t _reserved{0}; // Reserved (dep_pool_mark moved to SlotState for local access) PTO2TaskSlotState* fanin_slot_states[PTO2_MAX_INPUTS]; // Producer slot states (used by on_task_release) - // === Cache lines 3-34 (2048B) — tensors (alignas(64) forces alignment) === + // === Dispatch descriptor (1048B) — built by Orchestrator, read by AICore === + PTO2DispatchDesc dispatch; + // === Tensors (2048B) — alignas(64) Tensor forces alignment === Tensor tensors[PTO2_MAX_TENSOR_PARAMS]; - // === Cache lines 35-50 (1024B) — scalars === - uint64_t scalars[PTO2_MAX_SCALAR_PARAMS]; - void init(const PTOParam& params) { + /** + * Initialize payload: copy tensors, build dispatch descriptor. 
+ * + * @param params Task parameters (tensors + scalars) + * @param func_id_to_addr Kernel ID → GM function address mapping + * @param func_id_count Number of entries in func_id_to_addr (for bounds check) + * @param kernel_ids Per-slot kernel IDs (AIC, AIV0, AIV1); <0 = inactive + */ + void init(const PTOParam& params, + const uint64_t* func_id_to_addr, + int32_t func_id_count, + const int32_t kernel_ids[PTO2_SUBTASK_SLOT_COUNT]) { tensor_count = params.tensor_count; scalar_count = params.scalar_count; + + // 1. Copy tensors from PTOParam auto src_tensors = params.tensors; for (int32_t i = 0; i < params.tensor_count; i++) { tensors[i].copy(*src_tensors[i]); } - static_assert(sizeof(scalars) == sizeof(params.scalars)); - // Round up to cache line boundary. Both arrays are 1024B so no overrun. - // Eliminates branches; extra bytes within the same CL have zero additional cost. - memcpy(scalars, params.scalars, - PTO2_ALIGN_UP(params.scalar_count * sizeof(uint64_t), 64)); + + // 2. Fill per-slot function addresses (0 for inactive or out-of-range slots) + for (int32_t s = 0; s < PTO2_SUBTASK_SLOT_COUNT; s++) { + int32_t kid = kernel_ids[s]; + dispatch.function_bin_addrs[s] = + (kid >= 0 && kid < func_id_count) ? func_id_to_addr[kid] : 0; + } + + // 3. 
Build dispatch.args[]: tensor pointers first, then scalar values + int32_t n = 0; + for (int32_t i = 0; i < params.tensor_count; i++) { + tensors[i].update_start_offset(); + dispatch.args[n++] = reinterpret_cast<uint64_t>(&tensors[i]); + } + for (int32_t i = 0; i < params.scalar_count; i++) { + dispatch.args[n++] = params.scalars[i]; + } } }; @@ -427,6 +459,7 @@ struct alignas(64) PTO2TaskSlotState { std::atomic<uint32_t> subtask_done_mask; // Each subtask sets its done bit on completion uint8_t ring_id; // Ring layer this task belongs to (for per-ring reclamation) int32_t dep_pool_mark{0}; // Dep pool top after this task's submission (orchestrator-only, local memory) + uint32_t slot_in_ring; // Index into task_payloads[ring_id][] (for register encoding) }; static_assert(sizeof(PTO2TaskSlotState) == 64); diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h index e3ada51f..3f44085a 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h @@ -96,6 +96,11 @@ struct alignas(PTO2_ALIGN_SIZE) PTO2SharedMemoryHeader { std::atomic<uint64_t> graph_output_ptr; // Address where final output was written (packed buffer) std::atomic<uint64_t> graph_output_size; // Size in bytes + // === DISPATCH INIT INFO (GM-resident, read by AICore at startup) === + // Must live in shared memory (GM) because AICore cannot access AICPU-local memory. + // Written by AICPU after PTO2Runtime creation; AICore caches the values once. 
+ PTO2DispatchInitInfo dispatch_init_info; + // === ERROR REPORTING === // Orchestrator fatal error code (Orchestrator → Scheduler, AICPU → Host) diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h index 62508b8b..0a6f83e3 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h @@ -10,7 +10,7 @@ * - Device orchestration state (pto2_gm_sm_ptr_, orch_args_) * - Function address mapping (func_id_to_addr_) * - * Task dispatch uses PTO2DispatchPayload from PTO2 shared memory. + * Task dispatch uses PTO2DispatchDesc from PTO2 shared memory (precomputed by Orchestrator). */ #ifndef RUNTIME_H @@ -71,7 +71,7 @@ constexpr int RUNTIME_DEFAULT_READY_QUEUE_SHARDS = PLATFORM_MAX_AICPU_THREADS - * Field Access Patterns: * - aicpu_ready: Written by AICPU, read by AICore * - aicore_done: Written by AICore, read by AICPU - * - task: Written by AICPU, read by AICore (0 = no task, non-zero = PTO2DispatchPayload*) + * - task: Written by AICPU, read by AICore (0 = not ready, non-zero = PTO2DispatchInitInfo*) * - task_status: Written by both (AICPU=1 on dispatch, AICore=0 on completion) * - control: Written by AICPU, read by AICore (0 = continue, 1 = quit) * - core_type: Written by AICPU, read by AICore (CoreType::AIC or CoreType::AIV) @@ -79,7 +79,7 @@ constexpr int RUNTIME_DEFAULT_READY_QUEUE_SHARDS = PLATFORM_MAX_AICPU_THREADS - struct Handshake { volatile uint32_t aicpu_ready; // AICPU ready signal: 0=not ready, 1=ready volatile uint32_t aicore_done; // AICore ready signal: 0=not ready, core_id+1=ready - volatile uint64_t task; // Task pointer: 0=no task, non-zero=PTO2DispatchPayload* + volatile uint64_t task; // Init: PTO2DispatchInitInfo* (0 until ready); runtime: unused volatile int32_t task_status; // Task execution status: 0=idle, 1=busy volatile int32_t control; // Control signal: 0=execute, 1=quit volatile 
CoreType core_type; // Core type: CoreType::AIC or CoreType::AIV @@ -116,7 +116,7 @@ struct HostApi { /** * Task structure - Compatibility stub for platform layer * - * RT2 uses PTO2DispatchPayload instead of Task for task dispatch. + * RT2 uses PTO2DispatchDesc instead of Task for task dispatch. * This stub exists only for API compatibility with device_runner.cpp. * Since get_task_count() returns 0, this struct is never actually used. */ @@ -262,7 +262,7 @@ class Runtime { /** @deprecated Task count is now in PTO2 shared memory */ int get_task_count() const { return 0; } - /** @deprecated RT2 uses PTO2DispatchPayload, not Task. Always returns nullptr. */ + /** @deprecated RT2 uses PTO2DispatchDesc, not Task. Always returns nullptr. */ Task* get_task(int) { return nullptr; } /** @deprecated Use PTO2 dispatch mode */ From eb92a017c12477ea6f0d42572b3153de63fb2f51 Mon Sep 17 00:00:00 2001 From: zhusy54 Date: Wed, 18 Mar 2026 19:45:50 +0800 Subject: [PATCH 2/2] fix(rt2): use uint64_t for offset_field overflow check before truncation Perform the overflow safety check on the full-width uint64_t value before narrowing to uint32_t, preventing silent truncation from bypassing the sentinel-collision guard. Co-Authored-By: Claude Opus 4.6 --- .../tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp | 8 ++++---- .../runtime/pto2_dispatch_payload.h | 3 ++- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp index 302e7787..3e7a8cbf 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp @@ -488,16 +488,16 @@ struct AicpuExecutor { // Compute flat byte offset of dispatch descriptor from global base. 
uint64_t desc_addr = reinterpret_cast<uint64_t>(&slot_state.payload->dispatch); uint64_t desc_byte_offset = desc_addr - dispatch_base_; - uint32_t offset_field = static_cast<uint32_t>((desc_byte_offset >> PTO2_REG_ALIGN_SHIFT) + 1); + uint64_t offset_field_64 = (desc_byte_offset >> PTO2_REG_ALIGN_SHIFT) + 1; // Overflow check: offset_field must not exceed sentinel-safe upper bound. // If it does, the encoded register value would collide with AICORE_EXIT_SIGNAL // or AICORE_IDLE_TASK_ID, causing AICore to misinterpret the dispatch. - if (offset_field > PTO2_REG_MAX_OFFSET_FIELD) { - DEV_ERROR("PTO2 REG ENCODING OVERFLOW: offset_field=0x%x exceeds max 0x%x. " + if (offset_field_64 > PTO2_REG_MAX_OFFSET_FIELD) { + DEV_ERROR("PTO2 REG ENCODING OVERFLOW: offset_field=0x%llx exceeds max 0x%x. " "desc_byte_offset=0x%llx ring_id=%u slot_in_ring=%u core=%d subslot=%u. " "Reduce task_window_size or shared memory size.", - offset_field, PTO2_REG_MAX_OFFSET_FIELD, + (unsigned long long)offset_field_64, PTO2_REG_MAX_OFFSET_FIELD, (unsigned long long)desc_byte_offset, (unsigned)slot_state.ring_id, (unsigned)slot_state.slot_in_ring, core_id, (unsigned)static_cast<uint32_t>(subslot)); diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto2_dispatch_payload.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto2_dispatch_payload.h index d8cd9f94..9fe51dfd 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto2_dispatch_payload.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto2_dispatch_payload.h @@ -64,7 +64,8 @@ constexpr uint32_t PTO2_REG_MAX_OFFSET_FIELD = 0x0FFFFFFBu; // sentinel-safe */ static inline uint32_t pto2_reg_encode(uint64_t desc_byte_offset, uint32_t slot_idx, uint32_t toggle) { - uint32_t offset_field = static_cast<uint32_t>((desc_byte_offset >> PTO2_REG_ALIGN_SHIFT) + 1); + uint64_t offset_field_64 = (desc_byte_offset >> PTO2_REG_ALIGN_SHIFT) + 1; + uint32_t offset_field = static_cast<uint32_t>(offset_field_64); return (offset_field << PTO2_REG_OFFSET_SHIFT) | slot_idx | 
toggle;