From 75e507b87348c1c1b314423a2f0758f8b5515bcb Mon Sep 17 00:00:00 2001 From: Joe Dluzen Date: Mon, 9 Mar 2026 21:16:57 -0400 Subject: [PATCH 1/3] perf: improve batch generation in step 1 by 3-12% --- tools/ace-qwen3.cpp | 128 +++++++++++++++++++++++++++----------------- 1 file changed, 79 insertions(+), 49 deletions(-) diff --git a/tools/ace-qwen3.cpp b/tools/ace-qwen3.cpp index eb357e6..6d75f75 100644 --- a/tools/ace-qwen3.cpp +++ b/tools/ace-qwen3.cpp @@ -529,22 +529,23 @@ static std::vector run_phase2_batch(Qwen3LM * } // Batched decode loop, partial LM head: only project [TOKEN_IM_END..V) - Timer t_decode; - int V_eff = V - TOKEN_IM_END; // 65559 vs 217204 - std::vector logits_cond((size_t) V_eff * N); - std::vector logits_uncond((size_t) V_eff * N); - std::vector tokens(N); + // --- START OF OPTIMIZED PHASE 2 DECODE --- + Timer t_decode; + int V_eff = V - TOKEN_IM_END; - // CFG: single forward with 2*N (cond + uncond) - int N2 = use_cfg ? 2 * N : N; - std::vector tokens_2n(N2), sets_2n(N2); - std::vector logits_2n((size_t) V_eff * N2); - if (use_cfg) { - for (int i = 0; i < N; i++) { - sets_2n[i] = cond_sets[i]; - sets_2n[N + i] = uncond_sets[i]; - } - } + // Pre-allocate batched arrays for the maximum possible size (N or 2*N for CFG) + int max_N2 = use_cfg ? 2 * N : N; + std::vector batch_tokens(max_N2); + std::vector batch_sets(max_N2); + std::vector batch_logits((size_t) V_eff * max_N2); + + // This array maps the compact "active" index back to the original sequence index (0 to N-1) + std::vector active_to_orig(N); + + // Tiny array for CPU sampling (EOS token + Audio Codes) to prevent sorting 150,000 text logits + int audio_code_offset = AUDIO_CODE_BASE - TOKEN_IM_END; + int compact_V = AUDIO_CODE_COUNT + 1; + std::vector compact_logits(compact_V); int n_active = N; for (int i = 0; i < N; i++) { @@ -554,65 +555,94 @@ static std::vector run_phase2_batch(Qwen3LM * } for (int step = 0; step < max_tokens && n_active > 0; step++) { - // Collect tokens (done sequences feed their last token, result ignored) - for (int i = 0; i < N; i++) { - tokens[i] = seqs[i].last_token; - } + int current_active = 0; - if (use_cfg) { - // Single batched forward: cond[0..N-1] + uncond[N..2N-1] - for (int i = 0; i < N; i++) { - tokens_2n[i] = tokens[i]; - tokens_2n[N + i] = tokens[i]; + // 1. DYNAMIC COMPACTION: Loop through all N sequences, but only gather the active ones! + for (int i = 0; i < N; i++) { + if (!seqs[i].done) { + active_to_orig[current_active] = i; // Remember that this slot belongs to sequence 'i' + + if (use_cfg) { + // Place the Cond token/set in the first half + batch_tokens[current_active] = seqs[i].last_token; + batch_sets[current_active] = cond_sets[i]; + + // Place the Uncond token/set exactly n_active elements later + batch_tokens[n_active + current_active] = seqs[i].last_token; + batch_sets[n_active + current_active] = uncond_sets[i]; + } else { + batch_tokens[current_active] = seqs[i].last_token; + batch_sets[current_active] = cond_sets[i]; + } + current_active++; } - qw3lm_forward_batch(m, tokens_2n.data(), sets_2n.data(), N2, logits_2n.data(), TOKEN_IM_END, V_eff); - memcpy(logits_cond.data(), logits_2n.data(), (size_t) V_eff * N * sizeof(float)); - memcpy(logits_uncond.data(), logits_2n.data() + (size_t) V_eff * N, (size_t) V_eff * N * sizeof(float)); - } else { - qw3lm_forward_batch(m, tokens.data(), cond_sets.data(), N, logits_cond.data(), TOKEN_IM_END, V_eff); } - // Per-sequence: CFG combine + sample (logits are [V_eff] starting at TOKEN_IM_END) - for (int i = 0; i < N; i++) { - if (seqs[i].done) { - continue; - } + // 2. FORWARD PASS: GPU only computes attention for n_active sequences (Massive speedup as sequences finish!) + int actual_batch_size = use_cfg ? (2 * n_active) : n_active; + qw3lm_forward_batch(m, batch_tokens.data(), batch_sets.data(), actual_batch_size, batch_logits.data(), + TOKEN_IM_END, V_eff); + + // 3. TARGETED CFG & LOGIT EXTRACTION + for (int a = 0; a < n_active; a++) { + int orig_i = active_to_orig[a]; // Map back to original sequence object + + // Pointer to the conditional logits for THIS active sequence + float * lc = batch_logits.data() + (size_t) a * V_eff; - float * lc = logits_cond.data() + (size_t) i * V_eff; if (use_cfg) { - float * lu = logits_uncond.data() + (size_t) i * V_eff; - for (int v = 0; v < V_eff; v++) { - lc[v] = lu[v] + cfg_scale * (lc[v] - lu[v]); + // Pointer to the unconditional logits (offset by n_active) + float * lu = batch_logits.data() + (size_t) (n_active + a) * V_eff; + + // Targeted CFG Math: Only apply it to EOS + Audio Codes. Skip the 150,000 text tokens! + lc[0] = lu[0] + cfg_scale * (lc[0] - lu[0]); // EOS token + for (int c = 0; c < AUDIO_CODE_COUNT; c++) { + int idx = audio_code_offset + c; + lc[idx] = lu[idx] + cfg_scale * (lc[idx] - lu[idx]); } } - // Mask the 24-token gap: indices 1..AUDIO_CODE_BASE-TOKEN_IM_END-1 - // (index 0 = TOKEN_IM_END = EOS, index 24+ = audio codes) - for (int v = 1; v < AUDIO_CODE_BASE - TOKEN_IM_END; v++) { - lc[v] = -1e9f; + // Extract ONLY the valid target tokens into the tiny compact array + compact_logits[0] = lc[0]; + for (int c = 0; c < AUDIO_CODE_COUNT; c++) { + compact_logits[c + 1] = lc[audio_code_offset + c]; } - int tok = sample_top_k_p(lc, V_eff, temperature, top_p, top_k, seqs[i].rng) + TOKEN_IM_END; - seqs[i].last_token = tok; + + // CPU samples instantly because it only has to sort ~2049 items instead of 150,000+ + int compact_tok = + sample_top_k_p(compact_logits.data(), compact_V, temperature, top_p, top_k, seqs[orig_i].rng); + + // Map the sampled index back to global vocabulary ID + int tok = (compact_tok == 0) ? TOKEN_IM_END : (AUDIO_CODE_BASE + compact_tok - 1); + + seqs[orig_i].last_token = tok; if (tok == TOKEN_IM_END) { - seqs[i].done = true; - n_active--; - } else if (tok >= AUDIO_CODE_BASE && tok < AUDIO_CODE_BASE + AUDIO_CODE_COUNT) { - seqs[i].audio_codes.push_back(tok - AUDIO_CODE_BASE); + seqs[orig_i].done = true; + } else { + seqs[orig_i].audio_codes.push_back(tok - AUDIO_CODE_BASE); } } - int total_codes = 0; + // 4. UPDATE ACTIVE COUNT for the next loop iteration + int next_active_count = 0; + int total_codes = 0; for (int i = 0; i < N; i++) { + if (!seqs[i].done) { + next_active_count++; + } total_codes += (int) seqs[i].audio_codes.size(); } + n_active = next_active_count; + // Debug output if ((step + 1) % 50 == 0) { double elapsed = t_decode.ms() / 1000.0; fprintf(stderr, "[Decode] step %d, %d active, %d total codes, %.1f tok/s\n", step + 1, n_active, total_codes, (double) (step + 1) * N / elapsed); } } + // --- END OF OPTIMIZED PHASE 2 DECODE --- double decode_ms = t_decode.ms(); fprintf(stderr, "[Phase2] Decode %.0fms\n", decode_ms); From 8125a7c3e4ef4f10a2b2c6f8d362fe4a9100aba3 Mon Sep 17 00:00:00 2001 From: Joe Dluzen Date: Mon, 9 Mar 2026 21:22:29 -0400 Subject: [PATCH 2/3] remove comments --- tools/ace-qwen3.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/tools/ace-qwen3.cpp b/tools/ace-qwen3.cpp index 6d75f75..5b63a2d 100644 --- a/tools/ace-qwen3.cpp +++ b/tools/ace-qwen3.cpp @@ -529,7 +529,6 @@ static std::vector run_phase2_batch(Qwen3LM * } // Batched decode loop, partial LM head: only project [TOKEN_IM_END..V) - // --- START OF OPTIMIZED PHASE 2 DECODE --- Timer t_decode; int V_eff = V - TOKEN_IM_END; @@ -635,14 +634,12 @@ static std::vector run_phase2_batch(Qwen3LM * } n_active = next_active_count; - // Debug output if ((step + 1) % 50 == 0) { double elapsed = t_decode.ms() / 1000.0; fprintf(stderr, "[Decode] step %d, %d active, %d total codes, %.1f tok/s\n", step + 1, n_active, total_codes, (double) (step + 1) * N / elapsed); } } - // --- END OF OPTIMIZED PHASE 2 DECODE --- double decode_ms = t_decode.ms(); fprintf(stderr, "[Phase2] Decode %.0fms\n", decode_ms); From 876fef1c911e128aecda7ca483febdf4a1ab4207 Mon Sep 17 00:00:00 2001 From: Joe Dluzen Date: Mon, 9 Mar 2026 21:25:05 -0400 Subject: [PATCH 3/3] remove comments --- tools/ace-qwen3.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/ace-qwen3.cpp b/tools/ace-qwen3.cpp index 5b63a2d..7420be5 100644 --- a/tools/ace-qwen3.cpp +++ b/tools/ace-qwen3.cpp @@ -577,7 +577,7 @@ static std::vector run_phase2_batch(Qwen3LM * } } - // 2. FORWARD PASS: GPU only computes attention for n_active sequences (Massive speedup as sequences finish!) + // 2. FORWARD PASS: GPU only computes attention for n_active sequences int actual_batch_size = use_cfg ? (2 * n_active) : n_active; qw3lm_forward_batch(m, batch_tokens.data(), batch_sets.data(), actual_batch_size, batch_logits.data(), TOKEN_IM_END, V_eff);