diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 5e8a93d4..59b38d53 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -148,6 +148,92 @@ jobs: - working-directory: transpile-tests/ffi-integration run: zig build test + benchmark-leak: + # Leak-mode benchmark CI compiles every CLEAR benchmark in debug/GPA + # mode and runs a reduced workload. It does not compare timings; it + # catches benchmark-only leaks and debug-mode runtime wiring issues + # that unit/transpile tests do not exercise. + name: Benchmark leak checks (shard ${{ matrix.shard }}/5) + runs-on: ubuntu-latest + timeout-minutes: 30 + permissions: + contents: read + checks: write + pull-requests: write + env: + BENCHER_PROJECT: ${{ vars.BENCHER_PROJECT }} + BENCHER_TESTBED: ${{ vars.BENCHER_TESTBED }} + strategy: + fail-fast: false + matrix: + shard: [0, 1, 2, 3, 4] + steps: + - uses: actions/checkout@v4 + - uses: ruby/setup-ruby@v1 + with: + ruby-version: ${{ env.RUBY_VERSION }} + bundler-cache: true + - uses: mlugg/setup-zig@v2 + with: + version: ${{ env.ZIG_VERSION }} + - uses: actions/cache@v4 + with: + path: | + zig/.clear-cache + zig/.clear-transpile-cache + key: clear-bench-leak-${{ runner.os }}-zig${{ env.ZIG_VERSION }}-${{ hashFiles('src/**', 'zig/runtime/**', 'zig/lib/**', 'benchmarks/**/*.cht', 'Gemfile.lock') }} + restore-keys: | + clear-bench-leak-${{ runner.os }}-zig${{ env.ZIG_VERSION }}- + - run: ruby benchmarks/runner.rb --leak --all --shard=${{ matrix.shard }}/5 --cores=2 --bencher-json tmp/benchmark-leak-${{ matrix.shard }}.json + - name: Validate Bencher JSON + run: | + ruby -rjson -e ' + path = "tmp/benchmark-leak-${{ matrix.shard }}.json" + data = JSON.parse(File.read(path)) + abort("Bencher JSON is empty") if data.empty? + data.each do |benchmark, measures| + abort("Bencher benchmark #{benchmark.inspect} has no measures") unless measures.is_a?(Hash) && !measures.empty? 
+ measures.each do |measure, payload| + value = payload["value"] + abort("Bencher measure #{benchmark}/#{measure} missing numeric value") unless value.is_a?(Numeric) + end + end + ' + - uses: bencherdev/bencher@main + if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} + - name: Upload leak benchmark shard to Bencher + if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} + env: + BENCHER_API_TOKEN: ${{ secrets.BENCHER_API_TOKEN }} + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + BENCHER_JSON: tmp/benchmark-leak-${{ matrix.shard }}.json + run: | + testbed="${BENCHER_TESTBED:-ubuntu-latest}" + project="${BENCHER_PROJECT:-clear}" + branch="${GITHUB_HEAD_REF:-$GITHUB_REF_NAME}" + + args=( + --project "$project" + --token "$BENCHER_API_TOKEN" + --branch "$branch" + --testbed "$testbed" + --adapter json + --file "$BENCHER_JSON" + --err + --github-actions "$GITHUB_TOKEN" + ) + + if [ "$GITHUB_EVENT_NAME" = "pull_request" ]; then + args+=( + --start-point "$GITHUB_BASE_REF" + --start-point-hash "${{ github.event.pull_request.base.sha }}" + --start-point-clone-thresholds + --start-point-reset + ) + fi + + bencher run "${args[@]}" + zig-unit: name: Zig unit tests (fast non-TSan) runs-on: ubuntu-latest diff --git a/CLAUDE.md b/CLAUDE.md index bcd3ffcd..36962114 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -471,6 +471,36 @@ If you ever encounter a compiler bug, stop everything you're doing, and fix the If you ever find a limitation in the language that you have to work around, stop, identify the problem, and suggest how the language needs to be improved to fix this limitation focing work arounds. +## Definition of Done + +Before concluding a task and declaring it complete, you must explicitly review and verify the following: + +### Transpilation Review Requirements + +If the transpiler in src/ is touched, make these checks: + +- **Memory Safety Invariants:** Verify that no existing MIRChecker invariants (INV-1 through INV-10) were bypassed or modified. +- **Escape Analysis Completeness:** Confirm that any frame-allocated values that survive their declaring frame are explicitly upgraded to the heap in `EscapeAnalysis`. If any new escape method is added, it must be considered *EVERYWHERE* that does any escape analysis. +- **Zero Transpiler Band-aids:** Ensure no special logic for intrinsic/standard library functions was added outside of `src/ast/std_lib.rb` or `src/ast/type.rb`. No new RawZig is allowed. Add Zig code in zig/ and thoroughly unit test it there. Do not shoe-horn it into the transpiler. +- **Zero Runtime Overhead:** The transpiler should never add runtime overhead. You need explicit permission to add any runtime overhead. Zig comptime should be used to achieve all abstractions unless explicitly permitted otherwise. + +### Concurrency Review Requirements + +If the runtime code in zig/ is touched, make these checks: + +- **Atomics Introduced:** You must write a **Loom test** to exhaust CPU instruction reordering and memory visibility permutations. +- **Locks, Threads, or FFI Introduced:** You must write a **Hammer test** (oversubscribed threads, saturated queues) and run it with TSan/ASan. For Zig, ensure execution via `std.testing.allocator` to catch leaks. +- **Retries, Timeouts, Network, or Disk I/O Introduced:** You must write a deterministic **VOPR (simulator) test** using a deterministic seed to catch combinatoric failures. Do not write real-time Chaos tests. 
+- **File Operations / General Concurrency:** Actively search for logic races, starvation, or priority inversion. If found, write a test proving the failure, then implement the fix. +- **Performance:** Code on critical, hot paths must be strictly non-blocking. This definitively prohibits any form of lock acquisition and any global heap allocations (which inherently rely on hidden locks) within these paths. + +### Other Review Requirements + +- **Changes to Tests:** Make sure there are no hacks in test changes. Any changes to tests should be because there was a bug before, or the code has changed such that the new expectations match a correct state. +- **Deletions to Tests:** No test should be deleted unless 1: the corresponding functionality was deleted and the test is no longer needed, or 2: it was a test-nothing test, or 3: it is redundant with other tests. +- **Test Additions:** Do not test nothing just to cover lines. Make sure that tests actually test that the code works correctly, not just that things "run" and don't fail. Avoid adding redundant tests when an existing test could be modestly expanded to test a new expectation. Avoid abstractions in tests as much as possible. Tests can repeat themselves. Production code should not repeat itself. Avoid adding production changes *specifically* for testing. To the extent possible use test code and mocks to test what you need to test. Production code ideally is readable and has no concerns or special cases for testing. + + ## Output - Answer is always line 1. Reasoning comes after, never before. - No preamble. No "Great question!", "Sure!", "Of course!", "Certainly!", "Absolutely!". diff --git a/benchmarks/README.md b/benchmarks/README.md index b9bf6811..a29eef0f 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -42,11 +42,21 @@ ruby benchmarks/runner.rb --smoke benchmarks/server/02_json_api/ # CLEAR only, ruby benchmarks/runner.rb --fast benchmarks/sequential/04_hashmap/ # All langs, 0.25x scale ruby benchmarks/runner.rb --release benchmarks/sequential/04_hashmap/ # 5x scale, best of 5 ruby benchmarks/runner.rb --smoke --all # Smoke test everything +ruby benchmarks/runner.rb --leak --all --bencher-json bencher.json # Leak checks + Bencher BMF JSON # Core count ruby benchmarks/runner.rb --cores=2 benchmarks/concurrent/09_kvstore/ ``` +## Bencher CI + +Leak-mode benchmark CI writes Bencher JSON for each shard and uploads it with `bencher run`. 
+GitHub Actions needs: + +- `secrets.BENCHER_API_TOKEN` -- Bencher API token +- `vars.BENCHER_PROJECT` -- optional Bencher project slug; CI defaults to `clear` when unset +- `vars.BENCHER_TESTBED` -- optional; defaults to `ubuntu-latest` + The runner automatically: - Transpiles `.cht` -> Zig -> binary (ReleaseFast) - Compiles C (`bench.c`), Rust (`bench.rs`/`Cargo.toml`), and Go (`bench.go`) baselines if present diff --git a/benchmarks/concurrent/01_socket_throughput/bench.cht b/benchmarks/concurrent/01_socket_throughput/bench.cht index e2f54558..d45c55bf 100644 --- a/benchmarks/concurrent/01_socket_throughput/bench.cht +++ b/benchmarks/concurrent/01_socket_throughput/bench.cht @@ -1,4 +1,5 @@ -- @leak: 25600000 -> 25600 +-- @leak: 100000 -> 100 -- Socket Throughput Benchmark -- -- WHAT WE ARE MEASURING: diff --git a/benchmarks/concurrent/06_dynamic_spawn/TIMEOUT b/benchmarks/concurrent/06_dynamic_spawn/TIMEOUT new file mode 100644 index 00000000..7ed6ff82 --- /dev/null +++ b/benchmarks/concurrent/06_dynamic_spawn/TIMEOUT @@ -0,0 +1 @@ +5 diff --git a/benchmarks/concurrent/09_kvstore/bench.cht b/benchmarks/concurrent/09_kvstore/bench.cht index 6b9e8466..27ec4639 100644 --- a/benchmarks/concurrent/09_kvstore/bench.cht +++ b/benchmarks/concurrent/09_kvstore/bench.cht @@ -79,7 +79,7 @@ FN main() RETURNS Void -> WHILE wi < workers DO start = wi * chunk; cnt = chunk; - set_futures.append(BG { + set_futures.append(BG { @parallel -> FOR idx IN (start ..< start + cnt) DO map["key:${idx.toString()}"] = "value-${idx.toString()}"; END @@ -96,7 +96,7 @@ FN main() RETURNS Void -> WHILE wi < workers DO start = wi * chunk; cnt = chunk; - get_futures.append(BG { + get_futures.append(BG { @parallel -> MUTABLE hits: Int64 = 0; FOR idx IN (start ..< start + cnt) DO got = map["key:${idx.toString()}"] OR ""; @@ -119,7 +119,7 @@ FN main() RETURNS Void -> WHILE wi < workers DO seed = wi + 42; cnt = chunk; - zipf_futures.append(BG { + zipf_futures.append(BG { @parallel -> MUTABLE hits: Int64 = 0; MUTABLE state: Int64 = seed; FOR idx IN (0_i64 ..< cnt) DO @@ -143,7 +143,7 @@ FN main() RETURNS Void -> WHILE wi < workers DO seed = wi + 99; cnt = chunk; - mix_futures.append(BG { + mix_futures.append(BG { @parallel -> MUTABLE hits: Int64 = 0; MUTABLE state: Int64 = seed; MUTABLE val: String = ""; diff --git a/benchmarks/concurrent/10_shard_vs_locked/bench b/benchmarks/concurrent/10_shard_vs_locked/bench deleted file mode 100755 index 8fb7cf77..00000000 Binary files a/benchmarks/concurrent/10_shard_vs_locked/bench and /dev/null differ diff --git a/benchmarks/concurrent/10_shard_vs_locked/bench.cht b/benchmarks/concurrent/10_shard_vs_locked/bench.cht index dd972e3c..30251524 100644 --- a/benchmarks/concurrent/10_shard_vs_locked/bench.cht +++ b/benchmarks/concurrent/10_shard_vs_locked/bench.cht @@ -1,4 +1,4 @@ --- @leak: n = 1000000 -> n = 1000 +-- @leak: n = 10000000 -> n = 10000 -- KV Store Benchmark — CLEAR (True Shared-Nothing via SHARD pipeline) -- -- Uses @sharded(32) with the SHARD pipeline operator. @@ -11,36 +11,36 @@ -- 3. Every map operation is LOCAL — no SPSC, no yield, no locks -- -- Workloads: --- 1. Uniform SET — 1M sequential keys (partitioned across 32 shards) --- 2. Uniform GET — 1M sequential keys (100% hit, all local) --- 3. Mixed 80/20 — 200K SET + 800K GET via separate SHARD pipelines +-- 1. Uniform SET — 10M sequential keys (partitioned across 32 shards) +-- 2. Uniform GET — 10M sequential keys (100% hit, all local) +-- 3. 
Mixed 80/20 — 2M SET + 8M GET via separate SHARD pipelines -- -- Compare to Go (sync.Map) and Rust (dashmap). FN main() RETURNS Void -> MUTABLE map: HashMap@sharded(32) = {}; - n = 1000000; + n = 10000000; -- Workload 1: Uniform SET (1M keys, partitioned) t0 = timestampMs(); - (0.. SHARD("key:" + toString(_), map) s> CONCURRENT EACH { + (0.. SHARD("key:" + toString(_), map) s> CONCURRENT(batch: 128) EACH { map[_] = "value"; }; setMs = timestampMs() - t0; -- Workload 2: Uniform GET (1M keys, all local, 100% hit) t1 = timestampMs(); - (0.. SHARD("key:" + toString(_), map) s> CONCURRENT EACH { + (0.. SHARD("key:" + toString(_), map) s> CONCURRENT(batch: 128) EACH { got = map[_] OR ""; }; getMs = timestampMs() - t1; -- Workload 3: Mixed — 200K SET (overwrite) + 800K GET t2 = timestampMs(); - (0..<(n / 5)) s> SHARD("key:" + toString(_), map) s> CONCURRENT EACH { + (0..<(n / 5)) s> SHARD("key:" + toString(_), map) s> CONCURRENT(batch: 128) EACH { map[_] = "updated"; }; - (0..<((n / 5) * 4)) s> SHARD("key:" + toString(_), map) s> CONCURRENT EACH { + (0..<((n / 5) * 4)) s> SHARD("key:" + toString(_), map) s> CONCURRENT(batch: 128) EACH { got = map[_] OR ""; }; mixMs = timestampMs() - t2; diff --git a/benchmarks/concurrent/10_shard_vs_locked/bench.go b/benchmarks/concurrent/10_shard_vs_locked/bench.go index 605881aa..a1747ef5 100644 --- a/benchmarks/concurrent/10_shard_vs_locked/bench.go +++ b/benchmarks/concurrent/10_shard_vs_locked/bench.go @@ -26,7 +26,7 @@ import ( ) const ( - numKeys = 1_000_000 + numKeys = 10_000_000 numShards = 32 chanBuf = 512 ) diff --git a/benchmarks/concurrent/10_shard_vs_locked/bench.rs b/benchmarks/concurrent/10_shard_vs_locked/bench.rs index 176f2dd0..71e8c861 100644 --- a/benchmarks/concurrent/10_shard_vs_locked/bench.rs +++ b/benchmarks/concurrent/10_shard_vs_locked/bench.rs @@ -22,7 +22,7 @@ use std::sync::Arc; use std::thread; use std::time::Instant; -const NUM_KEYS: usize = 1_000_000; +const NUM_KEYS: usize = 10_000_000; const NUM_SHARDS: usize = 32; const CHAN_BUF: usize = 512; diff --git a/benchmarks/concurrent/11_parallel_aggregation/README.md b/benchmarks/concurrent/11_parallel_aggregation/README.md index aa887983..49c4ce6f 100644 --- a/benchmarks/concurrent/11_parallel_aggregation/README.md +++ b/benchmarks/concurrent/11_parallel_aggregation/README.md @@ -1,6 +1,6 @@ # Benchmark 19: Parallel Aggregation (Histogram) -1M events bucketed into 1,000 categories via deterministic LCG. Two phases: +10M events bucketed into 1,000 categories via deterministic LCG. Two phases: 1. Build histogram (parallel, shared-nothing) 2. Compute sum/max/min/avg over histogram values (parallel reduce) @@ -13,40 +13,65 @@ All three use the same LCG with the same seed => identical results. - **Go**: Per-goroutine local maps + merge (~40 lines). Stats via goroutine partial reduce (~30 lines). - **Rust**: Rayon `par_iter().fold().reduce()` for histogram. Rayon `par_iter().sum()/reduce()` for stats (~20 lines). +This is intentionally documented as **not apples-to-apples** for the histogram +phase. Go and Rust use local fold/reduce: each worker owns a private local +histogram over a contiguous slice, then the program merges the 1,000 bucket +counts at the end. CLEAR currently uses `SHARD`, which routes every item to +the owning shard worker before mutating the map. That measures the general +shared-nothing routing primitive, not the ideal histogram algorithm. 
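
For readers unfamiliar with the fold/reduce shape described above, here is a minimal, illustrative Go sketch of the per-worker private-map-plus-merge pattern this paragraph attributes to the Go baseline. It is not the benchmark's actual `bench.go`; the worker count, the bucketing stand-in, and all names here are hypothetical.

```go
// Illustrative sketch only: local fold/reduce histogram, no routing.
package main

import (
	"fmt"
	"sync"
)

const (
	n       = 10_000_000
	buckets = 1_000
	workers = 8 // hypothetical worker count
)

func main() {
	partials := make([]map[int64]int64, workers)
	chunk := n / workers

	var wg sync.WaitGroup
	for w := 0; w < workers; w++ {
		w := w
		wg.Add(1)
		go func() {
			defer wg.Done()
			// Each worker owns a private map: no channels, no shard routing.
			local := make(map[int64]int64)
			for i := w * chunk; i < (w+1)*chunk; i++ {
				local[int64(i%buckets)]++ // stand-in for the LCG bucketing
			}
			partials[w] = local
		}()
	}
	wg.Wait()

	// Sequential merge: one pass over at most `buckets` keys per partial.
	counts := make(map[int64]int64, buckets)
	for _, local := range partials {
		for k, v := range local {
			counts[k] += v
		}
	}
	fmt.Println("buckets filled:", len(counts))
}
```

The contrast with the CLEAR `SHARD` version is the routing step: here every increment touches a worker-private map and the only cross-worker work is the final 1,000-entry merge, which is why the README treats the two as structurally different algorithms rather than a like-for-like comparison.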
+ ## Results ``` -Rust (rayon) 0.018 s RSS: 16 MB -Go (goroutines) 0.014 s RSS: 12 MB -CLEAR (fibers) 0.081 s RSS: 46 MB +Rust (rayon) 0.027 s RSS: 91 MB +Go (goroutines) 0.017 s RSS: 80 MB +CLEAR (fibers) 0.107 s RSS: 357 MB -CLEAR vs Go: +493% -CLEAR vs Rust: +342% +CLEAR vs Go: +529% +CLEAR vs Rust: +296% ``` -Previous result before switching to integer keys: CLEAR 0.111 s (+745% vs Go). -Integer keys eliminate ~1M string allocations per run, saving ~30ms. +Previous result before switching to integer keys: CLEAR 0.111 s at 1M events +(+745% vs Go). Integer keys eliminated ~1M string allocations per run at that +size. Later SHARD transport batching reduced routing overhead substantially, +but it does not change the algorithmic mismatch. ## Why CLEAR is slower -Same fiber runtime overhead documented in benchmark 18 (SHARD vs locked). -The SHARD routing pipeline (hash key, send to owning fiber via SPSC channel, -receive + process) costs ~60ns per item. With 1M items and trivially cheap -per-item work (one map increment, ~10ns), routing dominates. +The SHARD routing pipeline computes a key, hashes it, sends it to the owning +shard fiber, receives it, and then performs a tiny map increment. Even with +transport batching, every event still pays routing overhead that local +fold/reduce avoids. Go and Rust avoid routing entirely: goroutines/rayon threads each own a local slice of work and write their local map without coordination. The merge is a single sequential pass over 1,000 entries. -SHARD amortizes well when per-item work is expensive (parsing, -transformation, I/O). For simple counting it is over-engineered. +SHARD amortizes well when per-item work is expensive or when long-lived +per-key ownership matters (request routing, actor-like state, sharded services). +For simple counting over a fixed input, it is over-engineered. + +## Current profile note + +`SHARD(...) s> CONCURRENT EACH` now runs as real shard-parallel work: one +producer routes keys into per-shard bounded queues, and one worker fiber drains +each shard. `clear profile` shows the shard workers distributed across +schedulers, so the old failure mode (a single serial SHARD loop) is no longer +the limiting factor. + +The remaining cost is structural. SHARD is doing correct shared-nothing +routing, but the benchmark wants local worker-private aggregation. Batching +helps the SHARD path, but it cannot make per-item routing equivalent to no +routing. -## TODO: PARALLEL FOLD primitive +## TODO: PARALLEL FOLD / GROUP_BY primitive -The remaining gap is structural. SHARD routes each item to its owning fiber -via SPSC channel (~60ns/item); for trivially cheap per-item work (~10ns) the -routing cost dominates 6:1. Go/Rust win by giving each worker a private local -map and merging after the barrier — no routing, no channels. +The benchmark should ultimately use a CLEAR primitive that matches the Go/Rust +algorithm: each worker gets a private local accumulator, then a merge combines +partials after the barrier. If/when `GROUP_BY` exists, this benchmark should +prefer that over `SHARD` for the primary comparison. Until then, the SHARD +version is useful as a routing benchmark but should not be interpreted as +CLEAR's best possible histogram implementation. A future `PARALLEL FOLD ... MERGE` pipeline stage would close this gap: @@ -60,9 +85,8 @@ counts = (0.. PARALLEL FOLD HashMap { ``` Each worker owns a private `_acc`; a sequential merge combines partials after -`WaitGroup.wait()`. 
SHARD remains correct and preferable when per-item work -exceeds ~60ns. Not urgent for v0.1-pre; typical workloads are not bottlenecked -by scheduling overhead relative to real work. +`WaitGroup.wait()`. SHARD remains correct and preferable when ownership/routing +semantics matter; local fold/reduce is the right tool for pure aggregation. ## Ergonomics comparison diff --git a/benchmarks/concurrent/11_parallel_aggregation/bench.cht b/benchmarks/concurrent/11_parallel_aggregation/bench.cht index 011ee80a..79c4eac0 100644 --- a/benchmarks/concurrent/11_parallel_aggregation/bench.cht +++ b/benchmarks/concurrent/11_parallel_aggregation/bench.cht @@ -1,7 +1,7 @@ --- @leak: n = 1000000 -> n = 1000 +-- @leak: n = 10000000 -> n = 10000 -- Parallel Aggregation Benchmark — Distributed Histogram + Stats -- --- Deterministic LCG generates 100K events. Each event is bucketed into +-- Deterministic LCG generates 10M events. Each event is bucketed into -- one of 1,000 categories. Two phases: -- -- Phase 1 (SHARD): Build histogram via @sharded(32) map. @@ -24,7 +24,7 @@ FN absInt(x: Int64) RETURNS Int64 -> END FN main() RETURNS Void -> - n = 1000000; + n = 10000000; buckets = 1000; -- Pre-compute seeds (LCG is sequential so we must pre-compute). @@ -38,7 +38,7 @@ FN main() RETURNS Void -> -- Phase 1: Build histogram via SHARD with integer keys — zero string allocation. t0 = timestampMs(); MUTABLE counts: HashMap@sharded(32) = {}; - (0.. SHARD(absInt(seeds[_]) MOD buckets, counts) s> CONCURRENT EACH { + (0.. SHARD(absInt(seeds[_]) MOD buckets, counts) s> CONCURRENT(batch: 512) EACH { counts[_] = (counts[_] OR 0_i64) + 1_i64; }; shardMs = timestampMs() - t0; diff --git a/benchmarks/concurrent/11_parallel_aggregation/bench.go b/benchmarks/concurrent/11_parallel_aggregation/bench.go index e7af5e9f..d028a9a5 100644 --- a/benchmarks/concurrent/11_parallel_aggregation/bench.go +++ b/benchmarks/concurrent/11_parallel_aggregation/bench.go @@ -23,7 +23,7 @@ import ( ) const ( - n = 1_000_000 + n = 10_000_000 buckets = 1_000 ) diff --git a/benchmarks/concurrent/11_parallel_aggregation/bench.rs b/benchmarks/concurrent/11_parallel_aggregation/bench.rs index 341bf16b..fd7d84b3 100644 --- a/benchmarks/concurrent/11_parallel_aggregation/bench.rs +++ b/benchmarks/concurrent/11_parallel_aggregation/bench.rs @@ -16,7 +16,7 @@ use rayon::prelude::*; use std::collections::HashMap; use std::time::Instant; -const N: i64 = 1_000_000; +const N: usize = 10_000_000; const BUCKETS: i64 = 1_000; fn lcg(state: i64) -> i64 { @@ -26,7 +26,7 @@ fn lcg(state: i64) -> i64 { fn main() { // Pre-compute seeds (LCG is sequential) - let mut seeds = Vec::with_capacity(N as usize); + let mut seeds = Vec::with_capacity(N); let mut seed: i64 = 42; for _ in 0..N { seed = lcg(seed); @@ -66,7 +66,7 @@ fn main() { let average = total / values.len() as f64; let stats_time = t1.elapsed(); - assert_eq!(total as i64, N, "total mismatch"); + assert_eq!(total as i64, N as i64, "total mismatch"); println!("Events: {}", N); println!("Buckets: {}", BUCKETS); diff --git a/benchmarks/concurrent/12_false_sharing/README.md b/benchmarks/concurrent/12_false_sharing/README.md index e45f6ff3..4c95378b 100644 --- a/benchmarks/concurrent/12_false_sharing/README.md +++ b/benchmarks/concurrent/12_false_sharing/README.md @@ -24,6 +24,44 @@ Tests whether CLEAR's `@shared:locked` eliminates false sharing by construction. 
| Go heap-alloc (racy) | ~3ms | n/a - no mutex | | C padded (racy) | ~3ms | n/a - no mutex | +## CLEAR scheduler mode + +This benchmark should use stackful CLEAR workers, for example +`BG { @standard:@parallel -> ... }`. + +The worker task is deliberately tiny: + +```clear +FOR j IN (0_i64 ..< increments) DO + WITH EXCLUSIVE ref AS inner { + inner.value = inner.value + 1; + } +END +``` + +Each worker repeats an uncontended lock, one integer increment, and unlock roughly +1.25M times. There is no I/O, no meaningful blocking, and only `threadCount()` +long-running workers, so per-task memory is not the limiting factor. The hot cost +is per-iteration dispatch. + +FSM workers are correct here, but they are the wrong tradeoff for this shape. The +FSM lowering must preserve resumable lock semantics, so each `WITH EXCLUSIVE` +goes through the FSM lock protocol, state dispatch, body segment, unlock segment, +and cleanup bookkeeping. Stackful workers lower to a tight acquire/body/release +loop. On the 32-thread benchmark, the fixed FSM path was about 2x slower than the +stackful path for this specific workload. + +The memory tradeoff goes the other direction. FSM tasks avoid per-fiber stacks; +the runtime benchmark reports a compact `FsmTask` plus small state storage versus +a stackful task with `Task`, `Fiber`, and a reserved stack. That is the right trade +for huge numbers of parked, blocked, or lightly suspended tasks. It is not the +right trade for a small number of CPU-bound workers executing millions of tiny +critical sections. + +This benchmark is therefore an example of why CLEAR supports both models: +use FSMs when task count and memory footprint dominate, and use stackful fibers +when hot-loop compute throughput dominates. + ## Interpretation **CLEAR vs Rust Arc**: same mechanism (heap alloc + mutex), CLEAR is ~2x faster. diff --git a/benchmarks/concurrent/12_false_sharing/bench b/benchmarks/concurrent/12_false_sharing/bench deleted file mode 100755 index 716dc09e..00000000 Binary files a/benchmarks/concurrent/12_false_sharing/bench and /dev/null differ diff --git a/benchmarks/concurrent/12_false_sharing/bench.cht b/benchmarks/concurrent/12_false_sharing/bench.cht index 8537ed22..d8cc2d55 100644 --- a/benchmarks/concurrent/12_false_sharing/bench.cht +++ b/benchmarks/concurrent/12_false_sharing/bench.cht @@ -31,7 +31,7 @@ FN main() RETURNS Void -> MUTABLE wi = 0_i64; WHILE wi < workers DO ref = counters[wi]; - futures.append(BG { @parallel -> + futures.append(BG { @standard:@parallel -> FOR j IN (0_i64 ..< increments) DO WITH EXCLUSIVE ref AS inner { inner.value = inner.value + 1; diff --git a/benchmarks/concurrent/14_nested_lock/TIMEOUT b/benchmarks/concurrent/14_nested_lock/TIMEOUT index 7ed6ff82..f599e28b 100644 --- a/benchmarks/concurrent/14_nested_lock/TIMEOUT +++ b/benchmarks/concurrent/14_nested_lock/TIMEOUT @@ -1 +1 @@ -5 +10 diff --git a/benchmarks/concurrent/15_fsm_vs_stackful/bench_stackful.cht b/benchmarks/concurrent/15_fsm_vs_stackful/bench_stackful.cht deleted file mode 100644 index ced4f8e8..00000000 --- a/benchmarks/concurrent/15_fsm_vs_stackful/bench_stackful.cht +++ /dev/null @@ -1,25 +0,0 @@ --- Stackful baseline. Same body as bench_fsm.cht with @xl to --- force the stackful path (full per-fiber stack via spawnBest). 
- -FN bench_work(x: Int64) RETURNS Int64 -> - RETURN x * 2 + 1; -END - -FN main() RETURNS Void -> - count: Int64 = 200_i64; - - MUTABLE futures: ~Int64[]@list = []; - MUTABLE total: Int64 = 0_i64; - - FOR i IN (0 ..< count) DO - futures.append(BG { @xl -> bench_work(i); }); - END - - FOR j IN (0 ..< count) DO - r: Int64 = NEXT futures[j]; - total = total + r; - END - - ASSERT total > 0, "non-zero result"; - RETURN; -END diff --git a/benchmarks/concurrent/16_observables/README.md b/benchmarks/concurrent/16_observables/README.md index 55faff7a..fb321216 100644 --- a/benchmarks/concurrent/16_observables/README.md +++ b/benchmarks/concurrent/16_observables/README.md @@ -1,8 +1,7 @@ # Concurrent Observables — CLEAR vs Go vs Rust -Benchmarks the lock-free `@observable` runtime backing CLEAR's -`~T@observable` types against equivalents in Go and Rust, plus -`@locked` baselines in each. +Benchmarks the CLEAR-language `~T@observable` pipeline-terminal +form against matching Go and Rust stream/channel implementations. Two distinct measurements live in this directory: @@ -12,11 +11,10 @@ Two distinct measurements live in this directory: join via `NEXT`. Measures end-to-end cost of the language form, including stream yield/resume overhead. -2. **Cross-language atomic-counter comparison** (`bench_clear.zig`, - `bench.go`, `bench.rs`): 1 writer thread + K reader threads - hammer a shared atomic accumulator. Measures the underlying - lock-free runtime (`obs.AtomicSum` for CLEAR; `atomic.Int64` for - Go; `AtomicI64` for Rust) head-to-head with no language overhead. +2. **Runtime-level helper** (`bench_clear.zig`): hand-written Zig + for isolating `obs.AtomicSum` itself. This is not used by the + benchmark runner's CLEAR headline result because it is not `.cht` + code. ```clear -- bench.cht (the canonical CLEAR form) @@ -25,36 +23,29 @@ final = NEXT running; ``` The compiler heap-allocates an `*ObservableSum(i64)` plus a -WaitGroup, **spawns a CONSUMER fiber cross-scheduler** that pulls -from `gen` and calls `.add(item)` per emit, and `NEXT` parks main -on the WG until the consumer's `defer ctx.acc.finish()` issues -`wg.done()`. Producer (BG STREAM gen), consumer fiber, and main -all run on different worker threads in the default multi-threaded -runtime, so the fold genuinely overlaps with the joiner. +WaitGroup, spawns a consumer fiber that pulls from `gen` and calls +`.add(item)` per emit, and `NEXT` parks main on the WG until the +consumer's `defer ctx.acc.finish()` issues `wg.done()`. ## Workload - - 1 writer producing 5,000,000 increments - - K reader threads each calling `view()` until the writer finishes - (K ∈ {1, 4, 8}) + - 1 producer emitting `0..2,000,000` + - 1 consumer summing the stream + - 1 joiner waiting for the final sum + - deterministic checksum: `sum(0..N-1) + N * 131` ## Results (this box, ReleaseFast / `-O` / `--release`) ### CLEAR-language pipeline form (`bench.cht` → `./clear build --optimized`) -``` -CLEAR observable: 12499997500000 (sum 0..N-1) in 61 ms (~12 ns/item) -``` - -5M values produced by a `BG STREAM`, folded via `s> SUM _` (which -auto-produces a `~Int64@observable`), and joined via `NEXT`. The -producer (BG STREAM gen), consumer fiber (spawned by the SUM emit -cross-scheduler), and main (parked on the observable's WaitGroup) -all run on different worker threads concurrently: +2M values are produced by a `BG STREAM`, folded via `s> SUM _` +(which auto-produces a `~Int64@observable`), and joined via `NEXT`. 
+`bench.go` and `bench.rs` mirror this shape with bounded channels: +producer -> consumer sum -> join. ```clear FN main() RETURNS Void -> - n_writes: Int64 = 5_000_000_i64; + n_writes: Int64 = 2_000_000_i64; gen: ~?Int64[] = BG STREAM { MUTABLE i: Int64 = 0_i64; WHILE i < n_writes DO YIELD i; i = i + 1_i64; END @@ -70,25 +61,8 @@ FN main() RETURNS Void -> END ``` -The 12 ns/item is the language-form cost: BG STREAM yield/resume -overhead per item + cross-scheduler atomic add + WaitGroup join. -The pure atomic-add-only number (no stream fiber, no consumer -fiber spawn) lives in the cross-language reader-stress table below. - -### Concurrent reader stress (`bench_clear.zig` / `bench.go` / `bench.rs`) - -1 writer thread + K reader threads hammering the shared atomic -counter. Same workload across all four implementations. (Median of -3 runs; high variance on hot CPUs, especially at 8 readers.) - -| Variant | 1 reader | 4 readers | 8 readers | -|--------------------------------------|------------------------|------------------------|------------------------| -| **CLEAR `obs.AtomicSum`** | 37 ns/inc, **142 M r/s** | 67 ns/inc, **466 M r/s** | 88 ns/inc, **831 M r/s** | -| Go `atomic.Int64` | 30 ns/inc, 950 M r/s | 50 ns/inc, 2.97 G r/s | 53 ns/inc, 4.31 G r/s | -| Rust `AtomicI64` | 35 ns/inc, 400 M r/s | 50 ns/inc, 1.78 G r/s | 58 ns/inc, 2.40 G r/s | -| **CLEAR `compat.Mutex`** (@locked) | 130 ns/inc, 7.4 M r/s | 416 ns/inc, 9.5 M r/s | 797 ns/inc, 10.2 M r/s | -| Go `sync.Mutex` | 63 ns/inc, 25 M r/s | 384 ns/inc, 11 M r/s | 1220 ns/inc, 7.7 M r/s | -| Rust `Mutex` | 157 ns/inc, 6.9 M r/s | 288 ns/inc, 4.8 M r/s | 430 ns/inc, 6.6 M r/s | +The measured cost is the language-form cost: BG STREAM yield/resume +overhead per item + observable accumulator add + WaitGroup join. ### Perf optimization round (`obs.AtomicSum` vs raw `std.atomic.Value`) @@ -152,7 +126,7 @@ const __obs_acc = CheatLib.obs.ObservableSum(i64).new(rt.heapAlloc()) catch unre const __obs_wg = rt.heapAlloc().create(CheatHeader.WaitGroup) catch unreachable; __obs_wg.* = CheatHeader.WaitGroup.init(rt.getSched()); __obs_wg.add(1); __obs_acc.setCompletion(@ptrCast(__obs_wg), CheatHeader.obsWgDone, CheatHeader.obsWgWait, CheatHeader.obsWgDestroy); -// Spawn consumer fiber cross-scheduler: +// Spawn consumer fiber: const ConsumerCtx = struct { acc: *..., gen: ..., fn run(...) { defer ctx.acc.finish(); // wg.done() via callback while (try ctx.gen.next()) |it| ctx.acc.add(it); diff --git a/benchmarks/concurrent/16_observables/TIMEOUT b/benchmarks/concurrent/16_observables/TIMEOUT new file mode 100644 index 00000000..f599e28b --- /dev/null +++ b/benchmarks/concurrent/16_observables/TIMEOUT @@ -0,0 +1 @@ +10 diff --git a/benchmarks/concurrent/16_observables/bench.cht b/benchmarks/concurrent/16_observables/bench.cht index d3fe47ff..e0768923 100644 --- a/benchmarks/concurrent/16_observables/bench.cht +++ b/benchmarks/concurrent/16_observables/bench.cht @@ -2,24 +2,18 @@ -- -- The pipeline `gen s> SUM _` heap-allocates an `*ObservableSum(i64)` -- and spawns a CONSUMER fiber that pulls from `gen` and calls --- `.add(item)` per emit. The consumer fiber is unpinned, so in the --- default multi-threaded runtime it runs on a sibling worker --- thread concurrently with the BG STREAM gen producer fiber. +-- `.add(item)` per emit. The consumer stays on the source scheduler +-- so stream wakeups remain local; cross-scheduler stream wakeups are +-- covered separately by runtime tests. 
-- `NEXT running` parks main on the observable's WaitGroup and -- wakes when the consumer publishes `.finish()`. -- --- The metric is wall time for 5M concurrent producer→consumer→ --- accumulator emits, the same workload bench.go and bench.rs --- measure. The comparable read-side hot-poll (`WHILE current < --- expected DO WITH VIEW running AS s ... END`) is currently --- gated by a scheduler limitation -- pinned-vs-ready pickNext --- (scheduler.zig:826-830) starves an unpinned producer when the --- joiner is the pinned main fiber. The cross-language --- many-readers comparison lives in `bench_clear.zig` (real OS --- threads + obs.AtomicSum directly). +-- The metric is wall time for 2M producer->consumer->accumulator +-- emits. bench.go and bench.rs mirror this shape with channels: +-- one producer, one consumer sum, and a join. FN main() RETURNS Void -> - n_writes: Int64 = 5_000_000_i64; + n_writes: Int64 = 2_000_000_i64; expected: Int64 = (n_writes * (n_writes - 1_i64)) / 2_i64; gen: ~?Int64[] = BG STREAM { @@ -34,8 +28,13 @@ FN main() RETURNS Void -> running: ~Int64@observable = gen s> SUM _; final = NEXT running; elapsed = timestampMs() - t0; + checksum: Int64 = final + (n_writes * 131_i64); + expected_checksum: Int64 = expected + (n_writes * 131_i64); ASSERT final == expected, "final mismatch"; + ASSERT checksum == expected_checksum, "checksum mismatch"; print("CLEAR observable: ", final, " (sum 0..N-1) in ", elapsed, " ms"); + print("BENCH_INFO: CLEAR stream_sum final=", final, " checksum=", checksum, " n=", n_writes); + print("BENCH_RESULT: ", elapsed, " ms"); RETURN; END diff --git a/benchmarks/concurrent/16_observables/bench.go b/benchmarks/concurrent/16_observables/bench.go index 1b9fcc42..c5934787 100644 --- a/benchmarks/concurrent/16_observables/bench.go +++ b/benchmarks/concurrent/16_observables/bench.go @@ -1,165 +1,51 @@ -// Concurrent-readers benchmark — Go. -// Mirrors bench_clear.zig: 1 writer + K readers, observe by view(). +// Concurrent observable stream-sum benchmark — Go. +// Mirrors bench.cht: producer stream -> consumer sum -> join. 
package main import ( "fmt" - "sync" - "sync/atomic" "time" ) -const NWrites = 5_000_000 +const NWrites = 2_000_000 -var readerCounts = []int{1, 4, 8} - -// ---------------- atomic.Int64 (Go's "@observable" equivalent) ---------------- - -func runAtomic(nReaders int) { - var counter atomic.Int64 - var stop atomic.Uint32 - - readerN := make([]int, nReaders) +func expectedSum() int64 { + n := int64(NWrites) + return (n * (n - 1)) / 2 +} - var wg sync.WaitGroup - totalSink := make([]int64, nReaders) - for i := range readerN { - i := i - wg.Add(1) - go func() { - defer wg.Done() - n := 0 - var sink int64 = 0 - for stop.Load() == 0 { - sink ^= counter.Load() // data-dependent so compiler can't elide - n++ - } - readerN[i] = n - totalSink[i] = sink - }() - } +func main() { + ch := make(chan int64, 64) + done := make(chan int64, 1) t0 := time.Now() - wg.Add(1) go func() { - defer wg.Done() - for i := 0; i < NWrites; i++ { - counter.Add(1) + var sum int64 = 0 + for v := range ch { + sum += v } - stop.Store(1) + done <- sum }() - wg.Wait() - elapsed := time.Since(t0) - - totalReads := 0 - for _, n := range readerN { - totalReads += n - } - nsPerInc := elapsed.Nanoseconds() / NWrites - readsPerSec := int64(0) - if elapsed.Nanoseconds() > 0 { - readsPerSec = int64(totalReads) * 1_000_000_000 / elapsed.Nanoseconds() - } - fmt.Printf("[Go atomic.Int64] writer=%3d ns/inc readers=%d total_reads=%d reads/sec=%d\n", - nsPerInc, nReaders, totalReads, readsPerSec) - if counter.Load() != int64(NWrites) { - fmt.Printf(" !! counter view %d != expected %d\n", counter.Load(), NWrites) - } - // keep totalSink alive so the compiler can't elide the data-dependent reads - var sinkSum int64 = 0 - for _, s := range totalSink { - sinkSum ^= s - } - if sinkSum == 0xdeadbeef { - fmt.Println(" (sink check)") - } -} - -// ---------------- sync.Mutex (Go's "@locked Int64" equivalent) ---------------- - -type LockedI64 struct { - mu sync.Mutex - val int64 -} - -func (l *LockedI64) Add(n int64) { - l.mu.Lock() - l.val += n - l.mu.Unlock() -} - -func (l *LockedI64) View() int64 { - l.mu.Lock() - defer l.mu.Unlock() - return l.val -} - -func runLocked(nReaders int) { - var counter LockedI64 - var stop atomic.Uint32 - - readerN := make([]int, nReaders) - totalSink := make([]int64, nReaders) - - var wg sync.WaitGroup - for i := range readerN { - i := i - wg.Add(1) - go func() { - defer wg.Done() - n := 0 - var sink int64 = 0 - for stop.Load() == 0 { - sink ^= counter.View() - n++ - } - readerN[i] = n - totalSink[i] = sink - }() - } - - t0 := time.Now() - wg.Add(1) go func() { - defer wg.Done() - for i := 0; i < NWrites; i++ { - counter.Add(1) + for i := int64(0); i < int64(NWrites); i++ { + ch <- i } - stop.Store(1) + close(ch) }() - wg.Wait() - elapsed := time.Since(t0) - totalReads := 0 - for _, n := range readerN { - totalReads += n - } - nsPerInc := elapsed.Nanoseconds() / NWrites - readsPerSec := int64(0) - if elapsed.Nanoseconds() > 0 { - readsPerSec = int64(totalReads) * 1_000_000_000 / elapsed.Nanoseconds() - } - fmt.Printf("[Go sync.Mutex] writer=%3d ns/inc readers=%d total_reads=%d reads/sec=%d\n", - nsPerInc, nReaders, totalReads, readsPerSec) - if counter.View() != int64(NWrites) { - fmt.Printf(" !! 
counter view %d != expected %d\n", counter.View(), NWrites) - } - var sinkSum int64 = 0 - for _, s := range totalSink { - sinkSum ^= s + final := <-done + elapsed := time.Since(t0) + expected := expectedSum() + checksum := final + int64(NWrites)*131 + expectedChecksum := expected + int64(NWrites)*131 + if final != expected { + panic(fmt.Sprintf("final %d != expected %d", final, expected)) } - if sinkSum == 0xdeadbeef { - fmt.Println(" (sink check)") + if checksum != expectedChecksum { + panic(fmt.Sprintf("checksum %d != expected %d", checksum, expectedChecksum)) } -} -func main() { - fmt.Printf("Concurrent observable benchmark — Go — N=%d writes, readers=%v\n", NWrites, readerCounts) - for _, k := range readerCounts { - runAtomic(k) - } - fmt.Println() - for _, k := range readerCounts { - runLocked(k) - } + fmt.Printf("Go observable stream sum: %d (sum 0..N-1) in %.6f ms\n", final, float64(elapsed.Nanoseconds())/1_000_000.0) + fmt.Printf("BENCH_INFO: Go stream_sum final=%d checksum=%d n=%d\n", final, checksum, NWrites) + fmt.Printf("BENCH_RESULT: %.6f ms\n", float64(elapsed.Nanoseconds())/1_000_000.0) } diff --git a/benchmarks/concurrent/16_observables/bench.rs b/benchmarks/concurrent/16_observables/bench.rs index 04eabefa..e068c8f4 100644 --- a/benchmarks/concurrent/16_observables/bench.rs +++ b/benchmarks/concurrent/16_observables/bench.rs @@ -1,135 +1,56 @@ -//! Concurrent-readers benchmark — Rust. -//! Mirrors bench_clear.zig: 1 writer + K readers, observe by view(). +//! Concurrent observable stream-sum benchmark — Rust. +//! Mirrors bench.cht: producer stream -> consumer sum -> join. -use std::sync::atomic::{AtomicI64, AtomicU8, Ordering}; -use std::sync::{Arc, Mutex}; +use std::sync::mpsc; use std::thread; use std::time::Instant; -const N_WRITES: usize = 5_000_000; -const READER_COUNTS: &[usize] = &[1, 4, 8]; +const N_WRITES: usize = 2_000_000; -// ---------------- AtomicI64 (Rust's "@observable" equivalent) ---------------- - -fn run_atomic(n_readers: usize) { - let counter = Arc::new(AtomicI64::new(0)); - let stop = Arc::new(AtomicU8::new(0)); +fn expected_sum() -> i64 { + let n = N_WRITES as i64; + (n * (n - 1)) / 2 +} - let mut reader_handles = Vec::with_capacity(n_readers); - for _ in 0..n_readers { - let counter = Arc::clone(&counter); - let stop = Arc::clone(&stop); - reader_handles.push(thread::spawn(move || -> (usize, i64) { - let mut n: usize = 0; - let mut sink: i64 = 0; - while stop.load(Ordering::Acquire) == 0 { - sink ^= counter.load(Ordering::Acquire); // data-dependent - n += 1; - } - (n, sink) - })); - } +fn main() { + let (tx, rx) = mpsc::sync_channel::(64); let t0 = Instant::now(); - let writer = { - let counter = Arc::clone(&counter); - let stop = Arc::clone(&stop); + let consumer = thread::spawn(move || -> i64 { + let mut sum = 0i64; + for value in rx { + sum += value; + } + sum + }); + let producer = { thread::spawn(move || { - for _ in 0..N_WRITES { - counter.fetch_add(1, Ordering::Relaxed); + for i in 0..N_WRITES { + tx.send(i as i64).unwrap(); } - stop.store(1, Ordering::Release); }) }; - writer.join().unwrap(); - let mut total_reads: usize = 0; - let mut sink_sum: i64 = 0; - for h in reader_handles { - let (n, s) = h.join().unwrap(); - total_reads += n; - sink_sum ^= s; - } + producer.join().unwrap(); + let final_value = consumer.join().unwrap(); let elapsed = t0.elapsed(); - let ns_per_inc = elapsed.as_nanos() as usize / N_WRITES; - let reads_per_sec = if elapsed.as_nanos() == 0 { 0 } else { - (total_reads as u128 * 1_000_000_000 / elapsed.as_nanos()) as 
usize - }; - println!( - "[Rust AtomicI64] writer={:>3} ns/inc readers={} total_reads={} reads/sec={}", - ns_per_inc, n_readers, total_reads, reads_per_sec - ); - if counter.load(Ordering::Acquire) != N_WRITES as i64 { - println!(" !! counter view {} != expected {}", counter.load(Ordering::Acquire), N_WRITES); - } - if sink_sum == 0xdeadbeef { println!(" (sink check)"); } -} - -// ---------------- Mutex (Rust's "@locked Int64" equivalent) ---------------- - -fn run_locked(n_readers: usize) { - let counter = Arc::new(Mutex::new(0i64)); - let stop = Arc::new(AtomicU8::new(0)); - - let mut reader_handles = Vec::with_capacity(n_readers); - for _ in 0..n_readers { - let counter = Arc::clone(&counter); - let stop = Arc::clone(&stop); - reader_handles.push(thread::spawn(move || -> (usize, i64) { - let mut n: usize = 0; - let mut sink: i64 = 0; - while stop.load(Ordering::Acquire) == 0 { - sink ^= *counter.lock().unwrap(); - n += 1; - } - (n, sink) - })); + let expected = expected_sum(); + let checksum = final_value + (N_WRITES as i64) * 131; + let expected_checksum = expected + (N_WRITES as i64) * 131; + if final_value != expected { + panic!("final {} != expected {}", final_value, expected); } - - let t0 = Instant::now(); - let writer = { - let counter = Arc::clone(&counter); - let stop = Arc::clone(&stop); - thread::spawn(move || { - for _ in 0..N_WRITES { - let mut g = counter.lock().unwrap(); - *g += 1; - } - stop.store(1, Ordering::Release); - }) - }; - writer.join().unwrap(); - let mut total_reads: usize = 0; - let mut sink_sum: i64 = 0; - for h in reader_handles { - let (n, s) = h.join().unwrap(); - total_reads += n; - sink_sum ^= s; + if checksum != expected_checksum { + panic!("checksum {} != expected {}", checksum, expected_checksum); } - let elapsed = t0.elapsed(); - - let ns_per_inc = elapsed.as_nanos() as usize / N_WRITES; - let reads_per_sec = if elapsed.as_nanos() == 0 { 0 } else { - (total_reads as u128 * 1_000_000_000 / elapsed.as_nanos()) as usize - }; println!( - "[Rust Mutex] writer={:>3} ns/inc readers={} total_reads={} reads/sec={}", - ns_per_inc, n_readers, total_reads, reads_per_sec + "Rust observable stream sum: {} (sum 0..N-1) in {:.6} ms", + final_value, + elapsed.as_secs_f64() * 1000.0 ); - let final_v = *counter.lock().unwrap(); - if final_v != N_WRITES as i64 { - println!(" !! 
counter view {} != expected {}", final_v, N_WRITES); - } - if sink_sum == 0xdeadbeef { println!(" (sink check)"); } -} - -fn main() { - println!("Concurrent observable benchmark — Rust — N={} writes, readers={:?}", N_WRITES, READER_COUNTS); - for &k in READER_COUNTS { - run_atomic(k); - } - println!(); - for &k in READER_COUNTS { - run_locked(k); - } + println!( + "BENCH_INFO: Rust stream_sum final={} checksum={} n={}", + final_value, checksum, N_WRITES + ); + println!("BENCH_RESULT: {:.6} ms", elapsed.as_secs_f64() * 1000.0); } diff --git a/benchmarks/concurrent/18_atomic_counter/bench.cht b/benchmarks/concurrent/18_atomic_counter/bench.cht index 526fce61..14c8e2d4 100644 --- a/benchmarks/concurrent/18_atomic_counter/bench.cht +++ b/benchmarks/concurrent/18_atomic_counter/bench.cht @@ -1,4 +1,5 @@ -- @leak: 1000000 -> 1000 +-- @leak: 8000000_i64 -> 8000_i64 -- Atomic Counter Benchmark — CLEAR -- -- N workers each perform ITERATIONS atomic increments on a single shared diff --git a/benchmarks/sequential/12_recursion_yield_overhead/README.md b/benchmarks/inter-clear/01_sequential_recursion_yield_overhead/README.md similarity index 90% rename from benchmarks/sequential/12_recursion_yield_overhead/README.md rename to benchmarks/inter-clear/01_sequential_recursion_yield_overhead/README.md index 7fc3dbba..7f1cb012 100644 --- a/benchmarks/sequential/12_recursion_yield_overhead/README.md +++ b/benchmarks/inter-clear/01_sequential_recursion_yield_overhead/README.md @@ -33,10 +33,14 @@ DELTA reflects the actual yield-check cost. ## Run ```bash -./clear build benchmarks/sequential/12_recursion_yield_overhead/bench.cht --optimized -o /tmp/bench_yield +./clear build benchmarks/inter-clear/01_sequential_recursion_yield_overhead/bench.cht --optimized -o /tmp/bench_yield for i in 1 2 3 4 5; do /tmp/bench_yield; done ``` +The benchmark runner uses `TIMEOUT=20` for this directory. One full +process run executes both the TIGHT and DEFAULT variants, so wall +time is roughly twice the reported `BENCH_RESULT`. + ## Sample results (~1B iterations, optimized build) ``` diff --git a/benchmarks/sequential/12_recursion_yield_overhead/THREADS b/benchmarks/inter-clear/01_sequential_recursion_yield_overhead/THREADS similarity index 100% rename from benchmarks/sequential/12_recursion_yield_overhead/THREADS rename to benchmarks/inter-clear/01_sequential_recursion_yield_overhead/THREADS diff --git a/benchmarks/inter-clear/01_sequential_recursion_yield_overhead/TIMEOUT b/benchmarks/inter-clear/01_sequential_recursion_yield_overhead/TIMEOUT new file mode 100644 index 00000000..209e3ef4 --- /dev/null +++ b/benchmarks/inter-clear/01_sequential_recursion_yield_overhead/TIMEOUT @@ -0,0 +1 @@ +20 diff --git a/benchmarks/sequential/12_recursion_yield_overhead/bench.cht b/benchmarks/inter-clear/01_sequential_recursion_yield_overhead/bench.cht similarity index 100% rename from benchmarks/sequential/12_recursion_yield_overhead/bench.cht rename to benchmarks/inter-clear/01_sequential_recursion_yield_overhead/bench.cht diff --git a/benchmarks/inter-clear/02_concurrent_fsm_vs_stackful/README.md b/benchmarks/inter-clear/02_concurrent_fsm_vs_stackful/README.md new file mode 100644 index 00000000..053a1b16 --- /dev/null +++ b/benchmarks/inter-clear/02_concurrent_fsm_vs_stackful/README.md @@ -0,0 +1,30 @@ +# FSM vs stackful BG tasks + +Compares two CLEAR-only variants of the same short-lived BG workload: + +- `bench_fsm.cht`: default BG body, FSM-eligible. +- `bench_stackful.cht`: `@standard` BG body, forced stackful. 
+ +Both spawn and join 100,000 futures. The body does trivial arithmetic, +so this primarily measures task scheduling and pool overhead. + +The stackful variant calls `touchCurrentFiberStack(16_384, seed)` inside +each worker. That faults the full standard stack allocation so RSS reflects +stack-pool residency. Both variants also print `BENCH_INFO` with current +RSS, peak RSS, and peak virtual memory from `/proc/self/status`. + +Run manually: + +```bash +./clear build --optimized benchmarks/inter-clear/02_concurrent_fsm_vs_stackful/bench_fsm.cht -o /tmp/bench_fsm +./clear build --optimized benchmarks/inter-clear/02_concurrent_fsm_vs_stackful/bench_stackful.cht -o /tmp/bench_stackful +/usr/bin/time -f 'fsm elapsed=%e rss=%M' env CLEAR_THREADS=32 /tmp/bench_fsm +/usr/bin/time -f 'stackful elapsed=%e rss=%M' env CLEAR_THREADS=32 /tmp/bench_stackful +``` + +Recent 100k sample: + +```text +fsm median 0.30s RSS 38912 KB VmPeak ~2609608 KB +stackful median 0.23s RSS 58624 KB VmPeak ~2660632 KB +``` diff --git a/benchmarks/concurrent/15_fsm_vs_stackful/bench_fsm.cht b/benchmarks/inter-clear/02_concurrent_fsm_vs_stackful/bench_fsm.cht similarity index 80% rename from benchmarks/concurrent/15_fsm_vs_stackful/bench_fsm.cht rename to benchmarks/inter-clear/02_concurrent_fsm_vs_stackful/bench_fsm.cht index 63c2fca5..5a126125 100644 --- a/benchmarks/concurrent/15_fsm_vs_stackful/bench_fsm.cht +++ b/benchmarks/inter-clear/02_concurrent_fsm_vs_stackful/bench_fsm.cht @@ -6,13 +6,14 @@ -- rt / inner / alloc + the bound result, ~64-96 bytes. The -- stackful @xl variant allocates a 256+ KB stack per BG; the -- difference dominates peak RSS at scale. +-- @leak: 100_000_i64 -> 1_000_i64 FN bench_work(x: Int64) RETURNS Int64 -> RETURN x * 2 + 1; END FN main() RETURNS Void -> - count: Int64 = 200_i64; + count: Int64 = 100_000_i64; MUTABLE futures: ~Int64[]@list = []; MUTABLE total: Int64 = 0_i64; @@ -27,5 +28,6 @@ FN main() RETURNS Void -> END ASSERT total > 0, "non-zero result"; + print("BENCH_INFO: fsm current_rss_kb=", currentMemoryKb(), " peak_rss_kb=", peakMemoryKb(), " peak_vm_kb=", peakVirtualMemoryKb()); RETURN; END diff --git a/benchmarks/inter-clear/02_concurrent_fsm_vs_stackful/bench_stackful.cht b/benchmarks/inter-clear/02_concurrent_fsm_vs_stackful/bench_stackful.cht new file mode 100644 index 00000000..4e198e90 --- /dev/null +++ b/benchmarks/inter-clear/02_concurrent_fsm_vs_stackful/bench_stackful.cht @@ -0,0 +1,30 @@ +-- Stackful baseline. Same body shape as bench_fsm.cht with @standard to +-- force the stackful path. Each worker touches the full 16 KB standard +-- stack allocation so RSS reflects stack-pool residency, not just the +-- small subset of pages naturally reached by this tiny function. 
+-- @leak: 100_000_i64 -> 1_000_i64 + +FN bench_work(x: Int64) RETURNS Int64 -> + touched: Int64 = touchCurrentFiberStack(16_384_i64, x); + RETURN x * 2 + 1 + (touched MOD 97_i64); +END + +FN main() RETURNS Void -> + count: Int64 = 100_000_i64; + + MUTABLE futures: ~Int64[]@list = []; + MUTABLE total: Int64 = 0_i64; + + FOR i IN (0 ..< count) DO + futures.append(BG { @standard -> bench_work(i); }); + END + + FOR j IN (0 ..< count) DO + r: Int64 = NEXT futures[j]; + total = total + r; + END + + ASSERT total > 0, "non-zero result"; + print("BENCH_INFO: stackful current_rss_kb=", currentMemoryKb(), " peak_rss_kb=", peakMemoryKb(), " peak_vm_kb=", peakVirtualMemoryKb()); + RETURN; +END diff --git a/benchmarks/concurrent/17_mvcc_vs_rwlock/README.md b/benchmarks/inter-clear/03_concurrent_mvcc_vs_rwlock/README.md similarity index 77% rename from benchmarks/concurrent/17_mvcc_vs_rwlock/README.md rename to benchmarks/inter-clear/03_concurrent_mvcc_vs_rwlock/README.md index 85249367..73df6a9d 100644 --- a/benchmarks/concurrent/17_mvcc_vs_rwlock/README.md +++ b/benchmarks/inter-clear/03_concurrent_mvcc_vs_rwlock/README.md @@ -10,9 +10,9 @@ sparse writes. The read critical section is one cache-line load dominates and the speedup compresses to ~2x. For scenarios where MVCC pulls further ahead (3-10x), see: -- `17a_mvcc_fat_struct/` -- multi-field reads (CS amortization) -- `17b_mvcc_pure_read/` -- 32 cores, no writers (cache-coherence isolation) -- `17c_mvcc_writer_pressure/` -- heavy write rate (reader-pile-up amplification) +- `04_concurrent_mvcc_fat_struct/` -- multi-field reads (CS amortization) +- `05_concurrent_mvcc_pure_read/` -- 32 cores, no writers (cache-coherence isolation) +- `06_concurrent_mvcc_writer_pressure/` -- heavy write rate (reader-pile-up amplification) ## Workload 32 reader fibers each performing 100k reads of `counter.value`, @@ -30,7 +30,7 @@ CLEAR's two built-in choices for read-heavy concurrent state. Run via: - ruby benchmarks/runner.rb --release benchmarks/concurrent/17_mvcc_vs_rwlock/ + ruby benchmarks/runner.rb --release benchmarks/inter-clear/03_concurrent_mvcc_vs_rwlock/ The runner surfaces `BENCH_INFO:` lines that show RwLock vs MVCC times and the speedup ratio. diff --git a/benchmarks/concurrent/17_mvcc_vs_rwlock/bench.cht b/benchmarks/inter-clear/03_concurrent_mvcc_vs_rwlock/bench.cht similarity index 100% rename from benchmarks/concurrent/17_mvcc_vs_rwlock/bench.cht rename to benchmarks/inter-clear/03_concurrent_mvcc_vs_rwlock/bench.cht diff --git a/benchmarks/concurrent/17a_mvcc_fat_struct/README.md b/benchmarks/inter-clear/04_concurrent_mvcc_fat_struct/README.md similarity index 83% rename from benchmarks/concurrent/17a_mvcc_fat_struct/README.md rename to benchmarks/inter-clear/04_concurrent_mvcc_fat_struct/README.md index 56b07971..ee80ee22 100644 --- a/benchmarks/concurrent/17a_mvcc_fat_struct/README.md +++ b/benchmarks/inter-clear/04_concurrent_mvcc_fat_struct/README.md @@ -3,7 +3,7 @@ Scenario A from the MVCC scenario taxonomy. Reads an 8-field `Sample` struct (64 bytes, one cache line) inside the critical section — the read CS sums all 8 fields. This amortizes the lock-acquire / EBR-pin -cost across more useful work than `17_mvcc_vs_rwlock` (which has a +cost across more useful work than `03_concurrent_mvcc_vs_rwlock` (which has a ~1ns Int64 read). ## Workload @@ -22,7 +22,7 @@ lowering or the EBR pin is leaving cycles on the table. 
Run via: - ruby benchmarks/runner.rb --release benchmarks/concurrent/17a_mvcc_fat_struct/ + ruby benchmarks/runner.rb --release benchmarks/inter-clear/04_concurrent_mvcc_fat_struct/ The runner surfaces `BENCH_INFO:` lines that show RwLock vs MVCC times and the speedup ratio. diff --git a/benchmarks/concurrent/17a_mvcc_fat_struct/bench.cht b/benchmarks/inter-clear/04_concurrent_mvcc_fat_struct/bench.cht similarity index 98% rename from benchmarks/concurrent/17a_mvcc_fat_struct/bench.cht rename to benchmarks/inter-clear/04_concurrent_mvcc_fat_struct/bench.cht index b893b523..b10d2ead 100644 --- a/benchmarks/concurrent/17a_mvcc_fat_struct/bench.cht +++ b/benchmarks/inter-clear/04_concurrent_mvcc_fat_struct/bench.cht @@ -1,6 +1,6 @@ -- 17a: MVCC vs RwLock with a FAT STRUCT and multi-field reads. -- --- Why this scenario: 17_mvcc_vs_rwlock uses a single Int64 cell. The read +-- Why this scenario: 03_concurrent_mvcc_vs_rwlock uses a single Int64 cell. The read -- critical section is one cache-line load (~1ns of work), so the ~10-30ns -- lock-acquire / EBR-pin overhead dominates and the speedup compresses to -- ~2x. This bench reads 8 fields per CS so the relative cost of the diff --git a/benchmarks/concurrent/17b_mvcc_pure_read/README.md b/benchmarks/inter-clear/05_concurrent_mvcc_pure_read/README.md similarity index 92% rename from benchmarks/concurrent/17b_mvcc_pure_read/README.md rename to benchmarks/inter-clear/05_concurrent_mvcc_pure_read/README.md index 9aab3f43..7b5692a7 100644 --- a/benchmarks/concurrent/17b_mvcc_pure_read/README.md +++ b/benchmarks/inter-clear/05_concurrent_mvcc_pure_read/README.md @@ -29,4 +29,4 @@ than expected. Run via: - ruby benchmarks/runner.rb --release benchmarks/concurrent/17b_mvcc_pure_read/ + ruby benchmarks/runner.rb --release benchmarks/inter-clear/05_concurrent_mvcc_pure_read/ diff --git a/benchmarks/concurrent/17b_mvcc_pure_read/bench.cht b/benchmarks/inter-clear/05_concurrent_mvcc_pure_read/bench.cht similarity index 100% rename from benchmarks/concurrent/17b_mvcc_pure_read/bench.cht rename to benchmarks/inter-clear/05_concurrent_mvcc_pure_read/bench.cht diff --git a/benchmarks/concurrent/17c_mvcc_writer_pressure/README.md b/benchmarks/inter-clear/06_concurrent_mvcc_writer_pressure/README.md similarity index 89% rename from benchmarks/concurrent/17c_mvcc_writer_pressure/README.md rename to benchmarks/inter-clear/06_concurrent_mvcc_writer_pressure/README.md index dfec89b1..f04a6945 100644 --- a/benchmarks/concurrent/17c_mvcc_writer_pressure/README.md +++ b/benchmarks/inter-clear/06_concurrent_mvcc_writer_pressure/README.md @@ -8,7 +8,7 @@ RwLock writer-preferring fairness policy forces reader pile-ups. - 4 writers x 25k iters bumping the value. - Writes are ~3.5% of reads, sustained for the full run. -The original `17_mvcc_vs_rwlock` has writes at ~0.1% of reads and only +The original `03_concurrent_mvcc_vs_rwlock` has writes at ~0.1% of reads and only 4k total writes — writers barely show up. This bench keeps the readers similar but turns the writer dial up by 25x. @@ -35,4 +35,4 @@ in tens of milliseconds. 
Run via: - ruby benchmarks/runner.rb --release benchmarks/concurrent/17c_mvcc_writer_pressure/ + ruby benchmarks/runner.rb --release benchmarks/inter-clear/06_concurrent_mvcc_writer_pressure/ diff --git a/benchmarks/concurrent/17c_mvcc_writer_pressure/TIMEOUT b/benchmarks/inter-clear/06_concurrent_mvcc_writer_pressure/TIMEOUT similarity index 100% rename from benchmarks/concurrent/17c_mvcc_writer_pressure/TIMEOUT rename to benchmarks/inter-clear/06_concurrent_mvcc_writer_pressure/TIMEOUT diff --git a/benchmarks/concurrent/17c_mvcc_writer_pressure/bench.cht b/benchmarks/inter-clear/06_concurrent_mvcc_writer_pressure/bench.cht similarity index 96% rename from benchmarks/concurrent/17c_mvcc_writer_pressure/bench.cht rename to benchmarks/inter-clear/06_concurrent_mvcc_writer_pressure/bench.cht index 367d7bec..864fad68 100644 --- a/benchmarks/concurrent/17c_mvcc_writer_pressure/bench.cht +++ b/benchmarks/inter-clear/06_concurrent_mvcc_writer_pressure/bench.cht @@ -1,6 +1,6 @@ -- 17c: MVCC vs RwLock under WRITER PRESSURE. -- --- Why this scenario: 17_mvcc_vs_rwlock has 4 writers x 1k writes vs +-- Why this scenario: 03_concurrent_mvcc_vs_rwlock has 4 writers x 1k writes vs -- 32 readers x 100k reads = 4k writes vs 3.2M reads (~0.1%). At that -- ratio writers barely show up in the timeline. This bench drives -- writers hard so the RwLock's writer-preferring fairness policy @@ -14,6 +14,8 @@ -- Expected: 4-7x MVCC win on 32 cores. The mechanism is reader-pile-up -- amplification under writer-preferring fairness, which compounds the -- baseline ~2x lock-acquire-overhead win. +-- @leak: 100000_i64 -> 1000_i64 +-- @leak: 25000_i64 -> 250_i64 STRUCT Counter { value: Int64 } diff --git a/benchmarks/runner.rb b/benchmarks/runner.rb index 704c9e89..6db51156 100644 --- a/benchmarks/runner.rb +++ b/benchmarks/runner.rb @@ -2,6 +2,7 @@ require 'fileutils' require 'benchmark' +require 'json' # Find the Zig compiler (local or system). Resolve to absolute path # since the runner chdir's into zig/ for compilation. @@ -13,6 +14,27 @@ RUN_TIMEOUT = (ENV['BENCH_TIMEOUT'] || 2).to_i +$bencher_metrics = nil + +def bencher_benchmark_name(dir, label = nil) + return dir unless label + + "#{dir}/#{label}" +end + +def bencher_record(name, measure, value) + return unless $bencher_metrics + + $bencher_metrics[name] ||= {} + $bencher_metrics[name][measure] = { "value" => value.to_f } +end + +def write_bencher_json(path) + dir = File.dirname(path) + FileUtils.mkdir_p(dir) unless dir == "." + File.write(path, JSON.pretty_generate($bencher_metrics || {})) +end + # Per-benchmark timeout: read TIMEOUT file if present. # Returns nil for server benchmarks with no file (no timeout = original behavior). # Returns integer for standalone benchmarks with no file (falls back to RUN_TIMEOUT). @@ -29,6 +51,30 @@ def bench_threads(dir) File.exist?(f) ? 
File.read(f).strip : nil end +def leak_sources(dir) + if File.exist?("#{dir}/bench.cht") + ["#{dir}/bench.cht"] + elsif File.exist?("#{dir}/server.cht") + ["#{dir}/server.cht"] + else + Dir.glob("#{dir}/bench*.cht").sort + end +end + +def apply_shard(dirs, shard_spec) + return dirs unless shard_spec + unless shard_spec =~ /\A(\d+)\/(\d+)\z/ + abort "Invalid --shard=#{shard_spec.inspect}; expected INDEX/COUNT" + end + + index = $1.to_i + count = $2.to_i + abort "Invalid --shard=#{shard_spec.inspect}; COUNT must be > 0" if count <= 0 + abort "Invalid --shard=#{shard_spec.inspect}; INDEX must be in 0...COUNT" if index < 0 || index >= count + + dirs.each_with_index.select { |_, i| (i % count) == index }.map(&:first) +end + # Benchmark output protocol: # If a binary prints "BENCH_RESULT: ms" to stdout or stderr, that value # is used as the measurement instead of wall time. This allows benchmarks that @@ -60,6 +106,56 @@ def measure_min(command, runs = 5, timeout: RUN_TIMEOUT) [best_time, best_info] end +def server_benchmark?(dir) + File.exist?("#{dir}/client.go") && File.exist?("#{dir}/server.cht") +end + +def run_leak_check(dir, bench_bin, timeout_s: 60, timeout_ok: false) + scale = ENV['BENCH_SCALE'] || "1.0" + threads = bench_threads(dir) || ENV['BENCH_CORES'] || ENV['CLEAR_THREADS'] || `nproc 2>/dev/null`.strip + threads = "0" if threads.empty? + + cmd = "BENCH_SCALE=#{scale} CLEAR_THREADS=#{threads} timeout #{timeout_s} ./#{bench_bin}" + output = nil + elapsed_s = Benchmark.realtime { output = `#{cmd} 2>&1` } + exit_status = $?.exitstatus + leak_lines = output.lines.select { |l| l.include?("leaked:") } + leak_count = leak_lines.size + status = :clean + + if exit_status == 124 + if timeout_ok + puts " STARTED (#{timeout_s}s liveness check; server kept running)" + status = :started + else + puts " TIMEOUT (#{timeout_s}s in debug mode)" + status = :timeout + end + elsif exit_status != 0 && exit_status != 124 + puts " CRASH (exit #{exit_status}), leaks: #{leak_count}" + status = :crash + if leak_count > 0 + sources = output.scan(/in (\S+) \(/).flatten.uniq + sources.each { |s| puts " - #{s}" } + end + elsif leak_count > 0 + puts " LEAKS: #{leak_count}" + status = :leaks + sources = output.scan(/in (\S+) \(/).flatten + tallied = sources.tally.sort_by { |_, c| -c } + tallied.each { |fn, count| puts " - #{fn} (#{count}x)" } + else + puts " CLEAN" + end + + { + status: status, + elapsed_ms: elapsed_s * 1000.0, + exit_status: exit_status, + leak_count: leak_count, + } +end + # ------------------------------------------------------------------------- # Standard benchmark: self-contained binary, timed externally # ------------------------------------------------------------------------- @@ -69,13 +165,9 @@ def run_bench(dir) bto = bench_timeout(dir) # Detect server benchmarks (have client.go + server.cht) - if File.exist?("#{dir}/client.go") && File.exist?("#{dir}/server.cht") - if leak_mode - puts "=== LEAK CHECK: #{dir} === SKIP (server benchmark)" - return - end + if server_benchmark?(dir) # Server benchmarks: only apply timeout if TIMEOUT file exists. - return run_server_bench(dir, timeout: bto) + return run_server_bench(dir, timeout: bto) unless leak_mode end puts leak_mode ? 
"=== LEAK CHECK: #{dir} ===" : "=== BENCHMARK: #{dir} ===" @@ -83,6 +175,7 @@ def run_bench(dir) has_c = !leak_mode && File.exist?("#{dir}/bench.c") has_rust = !leak_mode && File.exist?("#{dir}/bench.rs") && system("command -v rustc > /dev/null 2>&1") has_go = !leak_mode && File.exist?("#{dir}/bench.go") && system("command -v go > /dev/null 2>&1") + variant_sources = !leak_mode && !File.exist?("#{dir}/bench.cht") ? Dir.glob("#{dir}/bench*.cht").sort : [] # Clean stale binaries before recompiling %w[bench_c bench_rust bench_go bench_clear].each { |b| FileUtils.rm_f("#{dir}/#{b}") } @@ -115,48 +208,118 @@ def run_bench(dir) end end + if variant_sources.any? + scale = ENV['BENCH_SCALE'] || "1.0" + runs = case ENV['BENCH_MODE'] + when 'fast' then 3 + when 'release' then 5 + when 'leak' then 1 + when 'smoke' then 1 + else 5 + end + threads = bench_threads(dir) || ENV['BENCH_CORES'] || ENV['CLEAR_THREADS'] || `nproc 2>/dev/null`.strip + threads = "0" if threads.empty? + jemalloc_lib = Dir.glob("/lib/x86_64-linux-gnu/libjemalloc.so*").first || + Dir.glob("/usr/lib/libjemalloc.so*").first || + Dir.glob("/usr/local/lib/libjemalloc.so*").first + jemalloc_preload = jemalloc_lib ? "LD_PRELOAD=#{jemalloc_lib} " : "" + jemalloc_note = jemalloc_lib ? ", jemalloc" : "" + + results = {} + peak_rss = {} + bench_info = [] + + variant_sources.each do |source_path| + label = File.basename(source_path, ".cht") + bin = "#{dir}/bench_clear_#{label}" + puts "Compiling CLEAR variant #{label}..." + output = `./clear build --optimized #{source_path} -o #{bin} 2>&1` + unless File.exist?(bin) + puts "WARNING: CLEAR variant #{label} failed: #{output.lines.last&.strip}" + next + end + puts "Running CLEAR variant #{label} (best of #{runs}, CLEAR_THREADS=#{threads}#{jemalloc_note}, scale=#{scale})..." + results[label], info = measure_min("#{jemalloc_preload}BENCH_SCALE=#{scale} CLEAR_THREADS=#{threads} ./#{bin}", runs, timeout: bto || RUN_TIMEOUT) + bench_info += info + rss_output = `timeout #{bto || RUN_TIMEOUT}s sh -c "#{jemalloc_preload}CLEAR_THREADS=#{threads} /usr/bin/time -v ./#{bin}" 2>&1` + peak_rss[label] = $1.to_i if rss_output =~ /Maximum resident set size.*?:\s*(\d+)/ + FileUtils.rm_f(bin) + end + + puts "\nRESULTS for #{dir}:" + results.each do |label, t| + rss_str = peak_rss[label] ? " RSS: #{peak_rss[label]} KB" : "" + if t.nil? + puts "#{'%-22s' % "CLEAR #{label}"} TIMEOUT (#{bto || RUN_TIMEOUT}s)#{rss_str}" + else + puts "#{'%-22s' % "CLEAR #{label}"} #{'%.4f' % t} s#{rss_str}" + end + end + bench_info.uniq.each { |line| puts line } + return + end + # 4. Compile CLEAR # bench.zt: pure Zig benchmark (runtime-level, no CLEAR transpilation needed). # bench.cht with "@use_zig": scheduler-dependent Zig (e.g. socket I/O, fiber benchmarks). has_clear = false if leak_mode # Leak mode: build with ./clear build (debug, GPA leak detection enabled) - if File.exist?("#{dir}/bench.cht") - src = File.read("#{dir}/bench.cht") - - # @leak_skip: benchmark has no heap allocations, leak check is pointless - if src.include?("@leak_skip") - puts " SKIP (no heap allocations)" - return - end + sources = leak_sources(dir) + if sources.any? 
+ sources.each do |source_path| + label = File.basename(source_path, ".cht") + puts " #{label}:" + src = File.read(source_path) + + # @leak_skip: benchmark has no heap allocations, leak check is pointless + if src.include?("@leak_skip") + puts " SKIP (no heap allocations)" + next + end - # @leak: old -> new (reduce iteration counts for debug mode) - build_src = "#{dir}/bench.cht" - subs = src.scan(/^--\s*@leak:\s*(.+?)\s*->\s*(.+?)\s*$/) - if subs.any? - # Split into comment and code lines so sub! doesn't match the @leak comment itself - comment_lines = [] - code_lines = [] - src.each_line { |l| (l.match?(/^\s*--/) ? comment_lines : code_lines) << l } - code_text = code_lines.join - # gsub!, not sub!: the same iteration count usually appears in BOTH - # the producer loop and the consumer loop (e.g. spawn N futures / - # await N futures), and replacing only the first leaves the - # consumer indexing past the producer's reduced length. - subs.each { |old, new_val| code_text.gsub!(old.strip, new_val.strip) } - patched = comment_lines.join + code_text - build_src = "/tmp/bench_leak_#{File.basename(dir)}.cht" - File.write(build_src, patched) - end + # @leak: old -> new (reduce iteration counts for debug mode) + build_src = source_path + subs = src.scan(/^--\s*@leak:\s*(.+?)\s*->\s*(.+?)\s*$/) + if subs.any? + # Split into comment and code lines so sub! doesn't match the @leak comment itself + comment_lines = [] + code_lines = [] + src.each_line { |l| (l.match?(/^\s*--/) ? comment_lines : code_lines) << l } + code_text = code_lines.join + # gsub!, not sub!: the same iteration count usually appears in BOTH + # the producer loop and the consumer loop (e.g. spawn N futures / + # await N futures), and replacing only the first leaves the + # consumer indexing past the producer's reduced length. + subs.each { |old, new_val| code_text.gsub!(old.strip, new_val.strip) } + patched = comment_lines.join + code_text + build_src = "/tmp/bench_leak_#{File.basename(dir)}_#{label}.cht" + File.write(build_src, patched) + end - puts "Compiling CLEAR (debug, leak detection)..." - output = `./clear build #{build_src} -o #{dir}/bench_clear 2>&1` - if File.exist?("#{dir}/bench_clear") - has_clear = true - else - puts "WARNING: debug build failed: #{output.lines.last&.strip}" + bench_bin = "#{dir}/bench_clear_#{label}" + puts " Compiling CLEAR (debug, leak detection)..." + output = nil + build_elapsed_s = Benchmark.realtime { output = `./clear build #{build_src} -o #{bench_bin} 2>&1` } + if File.exist?(bench_bin) + has_clear = true + is_server = server_benchmark?(dir) + metric_name = bencher_benchmark_name(dir, label) + bencher_record(metric_name, "leak-build-ms", build_elapsed_s * 1000.0) + leak_result = run_leak_check( + dir, + bench_bin, + timeout_s: is_server ? 1 : 60, + timeout_ok: is_server, + ) + bencher_record(metric_name, "leak-run-ms", leak_result[:elapsed_ms]) + bencher_record(metric_name, "leak-count", leak_result[:leak_count]) + else + puts " WARNING: debug build failed: #{output.lines.last&.strip}" + end + FileUtils.rm_f(build_src) if build_src != source_path + FileUtils.rm_f(bench_bin) end - FileUtils.rm_f(build_src) if build_src != "#{dir}/bench.cht" else puts "No CLEAR source found, skipping." 
end @@ -214,44 +377,12 @@ def run_bench(dir) when 'fast' then 3 when 'release' then 5 when 'leak' then 1 + when 'smoke' then 1 else 5 end # Leak mode: run CLEAR once with timeout, capture stderr for GPA leak reports - if leak_mode - if has_clear - threads = bench_threads(dir) || ENV['BENCH_CORES'] || ENV['CLEAR_THREADS'] || `nproc 2>/dev/null`.strip - threads = "0" if threads.empty? - cmd = "BENCH_SCALE=#{scale} CLEAR_THREADS=#{threads} timeout 60 ./#{dir}/bench_clear" - output = `#{cmd} 2>&1` - exit_status = $?.exitstatus - leak_lines = output.lines.select { |l| l.include?("leaked:") } - leak_count = leak_lines.size - - if exit_status == 124 - puts " TIMEOUT (60s in debug mode)" - elsif exit_status != 0 && exit_status != 124 - puts " CRASH (exit #{exit_status}), leaks: #{leak_count}" - # Show first leak source if any - if leak_count > 0 - sources = output.scan(/in (\S+) \(/).flatten.uniq - sources.each { |s| puts " - #{s}" } - end - elsif leak_count > 0 - puts " LEAKS: #{leak_count}" - sources = output.scan(/in (\S+) \(/).flatten - # Group by unique source function - tallied = sources.tally.sort_by { |_, c| -c } - tallied.each { |fn, count| puts " - #{fn} (#{count}x)" } - else - puts " CLEAN" - end - else - puts " SKIP (no CLEAR source or build failed)" - end - FileUtils.rm_f("#{dir}/bench_clear") - return - end + return if leak_mode bench_info = [] # BENCH_INFO: lines surfaced from any bench @@ -611,6 +742,8 @@ def run_server_bench(dir, timeout: RUN_TIMEOUT) mode = "normal" scale = "1.0" cores = `nproc 2>/dev/null`.strip + shard_spec = ENV['BENCH_SHARD'] + bencher_json_path = nil args = ARGV.dup while (arg = args.shift) @@ -626,6 +759,12 @@ def run_server_bench(dir, timeout: RUN_TIMEOUT) scale = "1.0" when /^--cores=(\d+)$/ cores = $1 + when /^--shard=(\d+\/\d+)$/ + shard_spec = $1 + when /^--bencher-json=(.+)$/ + bencher_json_path = $1 + when "--bencher-json" + bencher_json_path = args.shift || abort("Missing path after --bencher-json") when "--leak" mode = "leak" scale = "0.001" @@ -638,8 +777,10 @@ def run_server_bench(dir, timeout: RUN_TIMEOUT) dirs += Dir.glob("benchmarks/concurrent/[0-9]*").select { |d| File.directory?(d) }.sort when "--server" dirs += Dir.glob("benchmarks/server/[0-9]*").select { |d| File.directory?(d) }.sort + when "--inter-clear" + dirs += Dir.glob("benchmarks/inter-clear/[0-9]*").select { |d| File.directory?(d) }.sort when "--all" - dirs += Dir.glob("benchmarks/{sequential,concurrent,server}/[0-9]*").select { |d| File.directory?(d) }.sort + dirs += Dir.glob("benchmarks/{sequential,concurrent,server,inter-clear}/[0-9]*").select { |d| File.directory?(d) }.sort else dirs << arg end @@ -648,15 +789,22 @@ def run_server_bench(dir, timeout: RUN_TIMEOUT) if dirs.empty? 
if mode == "leak" # Leak mode: run ALL benchmarks by default - dirs = Dir.glob("benchmarks/{sequential,concurrent,server}/[0-9]*").select { |d| File.directory?(d) }.sort + dirs = Dir.glob("benchmarks/{sequential,concurrent,server,inter-clear}/[0-9]*").select { |d| File.directory?(d) }.sort else dirs = Dir.glob("benchmarks/sequential/0*").sort end end + dirs = apply_shard(dirs.uniq.sort, shard_spec) + if shard_spec + puts "Benchmark shard #{shard_spec}: #{dirs.length} benchmark(s)" + end + ENV['BENCH_MODE'] = mode ENV['BENCH_SCALE'] = scale ENV['BENCH_CORES'] = cores + $bencher_metrics = {} if bencher_json_path dirs.each { |d| run_bench(d); puts } + write_bencher_json(bencher_json_path) if bencher_json_path end diff --git a/benchmarks/sequential/01_call_overhead/README.md b/benchmarks/sequential/01_call_overhead/README.md index 96f9ae9b..f74c540d 100644 --- a/benchmarks/sequential/01_call_overhead/README.md +++ b/benchmarks/sequential/01_call_overhead/README.md @@ -3,6 +3,13 @@ Recursive Fibonacci(40): ~204M recursive calls, zero heap allocation. Measures pure function call overhead and stack frame cost. +`fib` is marked `EFFECTS REENTRANT:TIGHT` so the benchmark measures +direct recursive call overhead rather than cooperative scheduler fairness +polling. Plain `EFFECTS REENTRANT` injects `rt.checkYield()` at every +recursive entry; that is important for long-running recursive work in +production, but it is not part of an apples-to-apples call overhead test +against C and Rust. + `BENCH_RESULT` = elapsed ms ## Results @@ -10,18 +17,33 @@ Measures pure function call overhead and stack frame cost. | Language | Time | vs C | |----------|------|------| | C | ~154ms | baseline | -| CLEAR | ~266ms | +73% | +| CLEAR | ~224-230ms | +45-49% | | Rust | ~265ms | +72% | -CLEAR matches Rust, both ~73% slower than C. +CLEAR is faster than Rust here after removing the scheduler yield poll, +but still slower than C. ## Why CLEAR and Rust are slower than C C compiles `fib(n-1) + fib(n-2)` to a direct call with no overhead. -CLEAR emits `rt` (runtime context pointer) as the first parameter of every function. -This extra argument prevents some compiler optimizations and adds per-call overhead. -It also means stack frames are larger (one additional pointer slot). +CLEAR still emits `rt` (runtime context pointer) as the first parameter of +the function. This extra argument can inhibit some optimizer choices and +adds per-call register pressure. + +With plain `EFFECTS REENTRANT`, CLEAR also emits a cooperative yield poll +on every recursive entry: + +```zig +rt.checkYield(); +``` + +Profiling showed this dominated the old benchmark result: `checkYield` +updates the per-runtime yield counter, masks it against the 4096-call +budget, and checks scheduler state before deciding whether to yield. That +is the right default for fairness, but it is not comparable to C/Rust's +bare recursive calls. `EFFECTS REENTRANT:TIGHT` keeps real recursion and +only removes that entry poll. Rust similarly adds overhead due to its calling convention for small integer return values vs C's direct register passing. diff --git a/benchmarks/sequential/01_call_overhead/bench.cht b/benchmarks/sequential/01_call_overhead/bench.cht index 51564b99..f8b90733 100644 --- a/benchmarks/sequential/01_call_overhead/bench.cht +++ b/benchmarks/sequential/01_call_overhead/bench.cht @@ -2,7 +2,7 @@ -- Recursive Fibonacci: measures pure call overhead and stack frame cost. -- Goal: ~204M recursive calls with zero heap allocation. 
-FN fib(n: Int64) RETURNS Int64 EFFECTS REENTRANT -> +FN fib(n: Int64) RETURNS Int64 EFFECTS REENTRANT:TIGHT -> IF n <= 1 -> RETURN n; RETURN fib(n - 1) + fib(n - 2); END diff --git a/benchmarks/sequential/09_frame_vs_heap/bench.cht b/benchmarks/sequential/09_frame_vs_heap/bench.cht index 4269a7b6..f34e28d6 100644 --- a/benchmarks/sequential/09_frame_vs_heap/bench.cht +++ b/benchmarks/sequential/09_frame_vs_heap/bench.cht @@ -1,4 +1,4 @@ --- @leak: n = 1000000 -> n = 1000 +-- @leak: n = 1000000 -> n = 100000 -- Benchmark 21: Frame vs Heap Escape — The Memory Boundary Tax -- -- Measures the cost of frame allocation (CLEAR's fast path) vs heap diff --git a/clear b/clear index d5c1a80c..2fe3662b 100755 --- a/clear +++ b/clear @@ -1052,7 +1052,7 @@ when 'profile' # Build ReleaseFast with profiling enabled. Debug symbols retained (-fno-strip) # for addr2line resolution. - do_build(source, output: output, opt_level: 'ReleaseFast', profile: true, profile_max: profile_max) + do_build(source, output: output, opt_level: 'ReleaseFast', profile: true, profile_max: profile_max, default_stack: 'Large') build_note = profile_max ? " + alloc profiling, profile_max=#{profile_max}" : " + alloc profiling" puts "Built: #{output} (ReleaseFast#{build_note})" diff --git a/docs/agents/benchmarks.md b/docs/agents/benchmarks.md index b0631f8d..d65dc3c2 100644 --- a/docs/agents/benchmarks.md +++ b/docs/agents/benchmarks.md @@ -1,5 +1,30 @@ # Benchmarks +## Branch Quality Tracker - benchmark-fix + +Ruby coverage and static analysis findings introduced by this branch: + +- [x] Cover `CONCURRENT(batch: N)` validation branches in `PipeAnalysis`. +- [x] Cover `PipelineHost#substitute_placeholders` for hash, assert, and if nodes. +- [x] Cover SHARD+CONCURRENT Zig lowering error/cleanup branches. +- [x] Cover FSM profile dispatch fallback branches. +- [x] Cover `FsmWrapperEmitter` B1 wrapper emission branches. +- [x] Cover MIR BG task profile helper branches. +- [x] Cover Doctor `@parallel` recommendation metadata, source-line fallback, and local BG scanning. +- [x] Reduce local Reek/Flog pressure in `PipeAnalysis#analyze_concurrent_op` by sharing concurrent option validation helpers. +- [x] Reduce local Reek/Flog pressure in `Doctor#emit_parallel_bg_hint!` and `Doctor#section_fibers` by splitting recommendation helpers and caching repeated fields. +- [ ] Reduce new Reek pressure in `PipelineHost#lower_shard_concurrent_each_zig`; tracked for the runtime-lowering follow-up because this RawZig path should move out of `PipelineHost`. +- [x] Reduce repeated `batch` option handling in `PipelineGenerator#transpile_concurrent_*`. +- [ ] Add or intentionally defer RuboCop wiring; the current bundle has no `rubocop` executable. + +Verification after this pass: + +- `COVERAGE=1 bundle exec rspec spec`: 3596 examples, 0 failures; added-line coverage delta is clean (`uncovered_added=0`). +- `bundle exec reek src --format json`: branch has 83 normalized new findings versus `origin/master`, down from 95 before this pass; remaining findings are mostly large-method/design pressure in the branch's compiler/runtime lowering work. +- `bundle exec flay src`: branch total 26151 versus master 26099. +- `bundle exec flog src`: no changed method with a positive complexity delta above 10 after normalization; reported "new" rows are parser/name matching artifacts at score 0.0. +- `bundle exec rubocop`: not available in the current bundle. 
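The reek comparison above is reported as a normalized delta against `origin/master`. The exact normalization used by the branch is not shown here; the sketch below illustrates one way such a delta can be computed from two `reek --format json` runs, keyed so that line-number churn does not count as a new finding. The `tmp/*.json` paths are placeholders, not files produced by this branch:

```ruby
# Illustrative only: not the branch's actual normalization script.
# Counts findings that appear more often on the branch than on master,
# keyed by (source file, smell type, context).
require 'json'

def finding_counts(path)
  JSON.parse(File.read(path))
      .map { |smell| [smell["source"], smell["smell_type"], smell["context"]] }
      .tally
end

master = finding_counts("tmp/reek-master.json")
branch = finding_counts("tmp/reek-branch.json")

new_findings = branch.sum { |key, count| [count - master.fetch(key, 0), 0].max }
puts "normalized new findings vs master: #{new_findings}"
```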
+ ## Single-Core (Benchmark 05: HashMap, 1M keys) CLEAR's numeric HashMap outperforms hand-optimized C with FNV-1a hashing. CLEAR uses Zig's AutoHashMap with frame-arena allocation - zero GPA calls in the hot path. diff --git a/docs/agents/finite-state-machines.md b/docs/agents/finite-state-machines.md index 4a8eae8c..efe82ad9 100644 --- a/docs/agents/finite-state-machines.md +++ b/docs/agents/finite-state-machines.md @@ -232,6 +232,64 @@ convert "worked by accident" tests into hard crashes). correctness covered by `fsm-lock-test.zig`, `fsm-lock-vopr-test.zig` (32-seed PRNG mixed contention), and `fsm-lock-safety-test.zig`. +## Active Tracker + +### EBR Scheduler-Thread Wiring + +Recent MVCC/EBR work moved runtime use toward one `ThreadLocalEbr` per +scheduler thread, with tasks resolving the active participant through +runtime/scheduler TLS rather than owning a private participant per spawn. +The correctness contract is: + +- A task may be stolen or resumed on another scheduler. +- A `Versioned(T).Guard` must release the exact EBR participant it pinned. +- Nested pins must keep the participant active until the outermost guard + releases. +- Retired nodes must survive writer-task exit while any task still holds + a guard. + +Required tests: + +| Kind | Required coverage | +|---|---| +| Unit | `ThreadLocalEbr.enter` / `exit` nested pin depth: inner exit must not clear `is_active`; outer exit must clear it. | +| Unit | `Versioned(T).Guard` captures and releases the same `ThreadLocalEbr` pointer even if the active scheduler changes before guard release. | +| Fiber stress | Writer task retires a version and exits while a different task holds a guard; reclaim must not free the guarded value until release. | +| Scheduler stress | MVCC reads/writes under task stealing, proving `Runtime.currentEbr()` follows the executing scheduler and does not use per-task EBR state. | +| Loom | Exhaustive model for EBR `pin_depth`, `is_active`, `local_epoch`, retire, and reclaim interleavings. Any new atomic field or ordering in EBR must be represented in this model. | +| Loom | Guard migration model: pin on participant A, scheduler TLS changes before release, guard release still exits participant A and reclaim sees the correct active set. | +| VOPR | No EBR-specific VOPR is required unless the fix adds retries, IO, timers, or scheduler-yield loops. If scheduler migration/retry logic changes, add it to the scheduler/FSM VOPR model. | + +### FSM Context Allocation + +Current generated FSM contexts are heap allocated and carry one field per +promoted variable name. The planned allocation policy is: + +- **Now:** add 64 B, 128 B, and 256 B scheduler-local slabs for generated FSM + contexts, initialized with the scheduler/runtime allocator. +- **Now:** add explicit `@stack` so the compiler can select a stack tier + at compile time when an FSM context is too large or a stack is the + better execution model. +- **Future:** reuse context slots across disjoint live ranges instead of + one field per source variable name. +- **Future:** add `@fsm:heap` as an explicit opt-in for oversized heap + FSM contexts. + +Required tests: + +| Kind | Required coverage | +|---|---| +| Runtime unit | 64 B, 128 B, and 256 B slab allocate/free/reuse, exact-size boundaries, and alignment. | +| Runtime unit | Oversized context never enters a small slab class. | +| Runtime scheduler | FSM task allocated on one scheduler and completed/stolen/freed on another routes free correctly and does not touch a non-thread-safe foreign slab directly. 
| +| Runtime leak | Slab contexts are returned on success, error, cancellation, lock timeout, and IO wake error paths. | +| Runtime stress | Many schedulers spawn and complete small FSM contexts concurrently with kcov-compatible bounded hammer tests. | +| Compiler/transpile | Small context lowers to 64 B slab; medium context lowers to 128 B slab; larger common context lowers to 256 B slab; oversized context requires `@stack` or future `@fsm:heap`. | +| Compiler/transpile | Generated FSM context no longer stores unnecessary allocator fields for the slab path. | +| Compiler/transpile | Current no-slot-reuse behavior is covered with a fixture that has disjoint variables across suspend points; future slot reuse changes that expected shape deliberately. | +| Loom | If FSM context slabs add new atomics or remote-free queues, model allocate/free/reuse and cross-scheduler free routing. Existing runtime stress is not a substitute for this. | +| VOPR | If context free routing adds retries, timed polling, IO waits, or scheduler-yield loops, add those transitions to the scheduler/FSM VOPR model. Pure local slab allocate/free does not require VOPR by policy. | + ## Roadmap **Landed:** diff --git a/docs/agents/fsm.md b/docs/agents/fsm.md index f5307b79..45d22f91 100644 --- a/docs/agents/fsm.md +++ b/docs/agents/fsm.md @@ -3,12 +3,33 @@ CLEAR has two lowerings for `BG { ... }` blocks: - **Stackful** — the BG body runs in its own fiber with a real stack (16-64 KB depending on build mode). Fast to suspend (just `swapcontext`), but every BG pays the stack cost up front. -- **FSM** — the BG body is compiled to a state machine. The "stack" is a heap-allocated context struct (sized to the live cross-segment vars only, typically 32-200 bytes). Suspends become state transitions. Tens of thousands of in-flight FSM tasks fit in the memory of a few hundred fibers. +- **FSM** — the BG body is compiled to a state machine. The "stack" is a context struct sized to the live cross-segment vars only. Contexts use scheduler-local 64 B / 128 B / 256 B slabs for the common cases. Suspends become state transitions. Tens of thousands of in-flight FSM tasks fit in the memory of a few hundred fibers. The annotator decides per-BG which lowering to use. Eligible bodies go FSM; the rest stay stackful. A BG annotated with explicit `@local` (stack-size directive) always stays stackful; that opt-out is preserved. This document covers what FSM lowering supports today, what it doesn't, and why. +## Context allocation plan + +**Now: slab-allocate small FSM contexts.** The runtime should provide scheduler-local slabs for 64 B, 128 B, and 256 B FSM context payloads, using the scheduler/runtime allocator passed into initialization. This is for the generated FSM context, not the shared fiber stack pool. The compiler should route contexts that fit into the smallest usable slab class and keep oversized contexts off the slab path. + +**Now: add `@stack` for compiler-picked stackful fallback.** When the compiler knows an FSM context is too large for the small slab classes, the explicit escape hatch should be stackful `@stack`, with the compiler selecting the stack tier from compile-time frame analysis. This keeps short-lived large contexts from silently paying heap/FSM costs when a stack is the better model. + +**Future: reuse FSM state slots.** Current liveness promotes by source variable name. It does not allocate reusable storage slots for disjoint live ranges. 
A future pass should build an interference graph over cross-segment values and map non-overlapping values to shared fields, so ten steps that each carry two dead-before-next-step values need two slots, not twenty. + +**Future: add `@fsm:heap`.** Heap FSM contexts should be explicit. If a context does not fit the slab classes and the user still wants FSM semantics, `@fsm:heap` will opt into heap allocation. Until that exists, oversized short-lived work should prefer `@stack`. + +## Current state-field allocation + +The compiler does **not** reuse context slots today. + +- `Liveness.analyze` returns `cross_segment_vars` keyed by variable name. +- `Emit.build_recursive` emits one context field per promoted variable name. +- `SuspendResolvers.resolve_next` emits a distinct `sp_N` field for each `NEXT` suspend and a distinct result field for each result variable. +- `Emit.compute_sp_indices` assigns monotonically increasing `sp_1`, `sp_2`, ... to reachable `NEXT`/IO suspend points. + +That means reuse happens only when user code literally reuses the same variable name. Distinct variables with disjoint lifetimes still become distinct context fields. Distinct suspend points also get distinct `sp_N` fields. + ## How a BG body becomes an FSM The pipeline for an FSM-eligible BG body: diff --git a/spec/bg_profile_metadata_spec.rb b/spec/bg_profile_metadata_spec.rb new file mode 100644 index 00000000..e7b5633d --- /dev/null +++ b/spec/bg_profile_metadata_spec.rb @@ -0,0 +1,58 @@ +require_relative "../src/backends/transpiler" + +RSpec.describe 'BG profile metadata' do + def transpile(src) + ZigTranspiler.new.transpile(src) + end + + it 'emits a stable metadata comment and local dispatch fields for plain BG' do + zig = transpile(<<~CLEAR) + FN main() RETURNS Void -> + f = BG { + 1; + }; + ASSERT (NEXT f) == 1, "result"; + END + CLEAR + + expect(zig).to include('CLEAR_PROFILE_TASK_SITE') + expect(zig).to include('kind=BG') + expect(zig).to include('line=2') + expect(zig).to include('dispatch=local') + expect(zig).to include('form=fsm') + expect(zig).to include('.profile_site_id = 1') + expect(zig).to include('.profile_dispatch = 1') + expect(zig).to include('.profile_site_id = 1') + end + + it 'marks @parallel BG sites as parallel in metadata and task config' do + zig = transpile(<<~CLEAR) + FN main() RETURNS Void -> + f = BG { @parallel -> + 1; + }; + ASSERT (NEXT f) == 1, "result"; + END + CLEAR + + expect(zig).to include('CLEAR_PROFILE_TASK_SITE') + expect(zig).to include('dispatch=parallel') + expect(zig).to include('.profile_dispatch = 2') + end + + it 'emits stackful BG profile metadata when @stack forces a fiber' do + zig = transpile(<<~CLEAR) + FN main() RETURNS Void -> + f = BG { @stack -> + 1; + }; + ASSERT (NEXT f) == 1, "result"; + END + CLEAR + + expect(zig).to include('CLEAR_PROFILE_TASK_SITE') + expect(zig).to include('form=stack') + expect(zig).to include('.profile_site_id = 1') + expect(zig).to include('.profile_dispatch = 1') + end +end diff --git a/spec/concurrency_spec.rb b/spec/concurrency_spec.rb index 40237f1b..5f6471a5 100644 --- a/spec/concurrency_spec.rb +++ b/spec/concurrency_spec.rb @@ -302,11 +302,13 @@ def get_last_type(source) FLUX } - it "uses spawnBest by default (no auto-pin)" do + it "uses local FSM submit by default" do zig = ZigTranspiler.new.transpile(code) user_code = zig.split("// 3. Main Entry").first - # Pure-compute body becomes an FSM (Phase B1) — spawn dispatch is spawnFsmBest. 
- expect(user_code).to match(/spawn(Best|FsmBest)/) + # Pure-compute body becomes an FSM (Phase B1). Plain BG stays + # scheduler-local; explicit @parallel is the distributed path. + expect(user_code).to include("submitFsmSpawn") + expect(user_code).not_to match(/spawn(Best|FsmBest)/) end end @@ -686,6 +688,15 @@ def get_last_type(source) end end + context "Zig output: DO @stack branch emits computed stack tier exactly" do + let(:code) { preamble + "DO { @stack -> work() }" } + + it "emits .Micro rather than the default .Standard" do + zig = ZigTranspiler.new.transpile(code) + expect(zig).to include(".stack_size = .Micro") + end + end + context "Zig output: DO @large branch emits .Large task config" do let(:code) { preamble + "DO { @large -> work() }" } @@ -734,6 +745,16 @@ def get_last_type(source) end end + context "BG { @stack -> expr; }" do + let(:code) { work_fn + "FN f() RETURNS !Void -> p: ~Void = BG { @stack -> work(); }; NEXT p; RETURN; END" } + + it "sets stack_size :stack on the BgBlock" do + fn = ast.statements.last + bg = fn.body.first.value + expect(bg.stack_size).to eq(:stack) + end + end + context "BG { @large -> expr; }" do let(:code) { work_fn + "FN f() RETURNS !Void -> p: ~Void = BG { @large -> work(); }; NEXT p; RETURN; END" } @@ -780,6 +801,16 @@ def get_last_type(source) end end + context "Zig output: BG @stack emits computed stack tier exactly" do + let(:code) { work_fn + "FN f() RETURNS !Void -> p: ~Void = BG { @stack -> work(); }; NEXT p; RETURN; END" } + + it "emits .Micro rather than the default .Standard" do + zig = ZigTranspiler.new.transpile(code) + expect(zig).to include(".stack_size = .Micro") + expect(zig).not_to include("FsmTask") + end + end + context "Zig output: BG no prefix uses auto-sized tier" do let(:code) { work_fn + "FN f() RETURNS !Void -> p: ~Void = BG { work(); }; NEXT p; RETURN; END" } @@ -840,6 +871,24 @@ def get_last_type(source) end end + context "CONCURRENT(workers: 4, batch: 8) WHERE" do + let(:code) { + preamble + + "FN big(x: Float64) RETURNS Bool -> RETURN x > 1.0; END\n" \ + "FN f() RETURNS !Void -> items: Float64[] = [1.0, 2.0]; " \ + "r = items s> CONCURRENT(workers: 4, batch: 8) WHERE big(_); RETURN; END" + } + + it "parses workers and batch options" do + fn = ast.statements.last + pipe = fn.body[1].value + conc = pipe.right + expect(conc).to be_a(AST::ConcurrentOp) + expect(conc.options["workers"]).to be_a(AST::Literal) + expect(conc.options["batch"]).to be_a(AST::Literal) + end + end + context "CONCURRENT(size: STANDARD) EACH" do let(:code) { preamble + @@ -902,6 +951,19 @@ def get_last_type(source) end end + context "Zig output: default_stack Large upgrades unsized CONCURRENT" do + let(:code) { + preamble + + "FN f() RETURNS !Void -> items: Float64[] = [1.0, 2.0]; " \ + "r = items s> CONCURRENT SELECT double(_); RETURN; END" + } + + it "emits .stack_size = .Large" do + zig = ZigTranspiler.new.transpile(code, default_stack: "Large") + expect(zig).to include(".stack_size = .Large") + end + end + context "Zig output: CONCURRENT(size: LARGE) WHERE emits .Large" do let(:code) { preamble + @@ -1197,6 +1259,50 @@ def transpile_fn(src) expect { run(code) }.to raise_error(/Unknown CONCURRENT option/) end + it "rejects capacity for direct collection sources" do + code = <<~CLEAR + FN main() RETURNS Void -> + nums: Float64[] = [1.0, 2.0]; + result = nums s> CONCURRENT(workers: 2, capacity: 8) SELECT _ * 2.0; + RETURN; + END + CLEAR + expect { run(code) }.to raise_error(/capacity only applies to stream or sharded sources.*batch: N/) + end + 
+ it "rejects batch of 0" do + code = <<~CLEAR + FN main() RETURNS Void -> + nums: Float64[] = [1.0, 2.0]; + result = nums s> CONCURRENT(workers: 2, batch: 0) SELECT _ * 2.0; + RETURN; + END + CLEAR + expect { run(code) }.to raise_error(/batch must be greater than 0/) + end + + it "rejects negative batch values" do + code = <<~CLEAR + FN main() RETURNS Void -> + nums: Float64[] = [1.0, 2.0]; + result = nums s> CONCURRENT(workers: 2, batch: -1) SELECT _ * 2.0; + RETURN; + END + CLEAR + expect { run(code) }.to raise_error(/batch must be greater than 0/) + end + + it "rejects non-number batch values" do + code = <<~CLEAR + FN main() RETURNS Void -> + nums: Float64[] = [1.0, 2.0]; + result = nums s> CONCURRENT(workers: 2, batch: TRUE) SELECT _ * 2.0; + RETURN; + END + CLEAR + expect { run(code) }.to raise_error(/batch must be a number/) + end + it "rejects non-Bool pin value" do code = <<~CLEAR FN main() RETURNS Void -> @@ -1699,13 +1805,14 @@ def transpile_fn(clear_src) end describe "Transpiler" do - it "BgBlock emits a labeled block with Promise spawn and spawnBest (default)" do + it "BgBlock emits a labeled block with Promise spawn and local submit by default" do src = "FN f() RETURNS !Void -> p: ~Float64 = BG { 42.0; }; r: Float64 = NEXT p; RETURN; END" out = transpile_fn(src) expect(out).to include("CheatLib.Promise(f64).spawn(") - # Pure-compute body is Phase-B1 FSM-eligible — dispatch is spawnFsmBest - # (same call shape, same Promise wiring, FsmTask-backed execution). - expect(out).to match(/spawn(Best|FsmBest)\(/) + # Pure-compute body is Phase-B1 FSM-eligible; default dispatch is + # scheduler-local, while explicit @parallel uses spawnFsmBest. + expect(out).to include("submitFsmSpawn(") + expect(out).not_to match(/spawn(Best|FsmBest)\(/) expect(out).to include("break :") expect(out).to include("__ctx_0.inner.result = 42") end diff --git a/spec/doctor_parallel_bg_hint_spec.rb b/spec/doctor_parallel_bg_hint_spec.rb new file mode 100644 index 00000000..14794b35 --- /dev/null +++ b/spec/doctor_parallel_bg_hint_spec.rb @@ -0,0 +1,94 @@ +require 'tmpdir' +require 'fileutils' +require 'open3' + +RSpec.describe 'clear doctor — @parallel BG hint' do + let(:clear_bin) { File.expand_path('../../clear', __FILE__) } + + def run_doctor(dir) + out, _, status = Open3.capture3(clear_bin, 'doctor', dir) + expect(status.exitstatus).to eq(0) + out + end + + def write_profile(dir, source:, zig:, site_dispatch: 'local') + FileUtils.mkdir_p(dir) + File.write(File.join(dir, 'fibers.txt'), <<~FIBERS) + # fiber-profile v1 + total_fibers: 4 + short_fibers_under_1ms: 0 + vshort_fibers_under_10us: 0 + total_lifetime_ns: 4000000 + max_lifetime_ns: 1000000 + # per-scheduler fibers-run + # sched\tfibers + 0\t0 + 1\t0 + 2\t0 + 3\t4 + # per-site fibers + # site\tspawns\truns\texits\ttotal_lifetime_ns\tmax_lifetime_ns\tdispatch\tform\tschedulers + 1\t4\t4\t4\t4000000\t1000000\t#{site_dispatch}\tfsm\t3:4 + FIBERS + File.write(File.join(dir, 'source.cht'), source) + File.write(File.join(dir, 'transpiled.zig'), zig) + end + + it 'points local BG worker fanout at @parallel when all work runs on one scheduler' do + Dir.mktmpdir do |dir| + profile_dir = File.join(dir, 'p.profile') + write_profile( + profile_dir, + source: <<~CHT, + FN main() RETURNS Void -> + workers = threadCount(); + WHILE workers > 0 DO + BG { + doCpuWork(); + }; + END + END + CHT + zig: <<~ZIG + pub fn bg0() void {} + // CLEAR_PROFILE_TASK_SITE id=1 kind=BG line=4 column=9 dispatch=local form=fsm + try rt.getSched().submitFsmSpawn(__bg0_ctx.task); + ZIG 
+ ) + + out = run_doctor(profile_dir) + expect(out).to include('Scheduler imbalance') + expect(out).to include('Exact imbalanced local BG task sites') + expect(out).to include('BG { @parallel -> ... }') + expect(out).to include('line 4') + expect(out).to include('site=1 form=fsm runs=4') + end + end + + it 'does not suggest @parallel when the source already uses it' do + Dir.mktmpdir do |dir| + profile_dir = File.join(dir, 'p.profile') + write_profile( + profile_dir, + source: <<~CHT, + FN main() RETURNS Void -> + BG { @parallel -> + doCpuWork(); + }; + END + CHT + site_dispatch: 'parallel', + zig: <<~ZIG + pub fn bg0() void {} + // CLEAR_PROFILE_TASK_SITE id=1 kind=BG line=2 column=5 dispatch=parallel form=fsm + try CheatHeader.spawnFsmBest(__bg0_ctx.task); + ZIG + ) + + out = run_doctor(profile_dir) + expect(out).to include('Scheduler imbalance') + expect(out).not_to include('BG { @parallel -> ... }') + expect(out).not_to include('Candidate BG sites') + end + end +end diff --git a/spec/doctor_spec.rb b/spec/doctor_spec.rb index 1afdcd24..2716f263 100644 --- a/spec/doctor_spec.rb +++ b/spec/doctor_spec.rb @@ -77,4 +77,71 @@ def capture_stdout expect(out).to include("(heap rc) = @multiowned RC allocation tracked by rcCreate") end end + + it "prints exact @parallel recommendations for imbalanced local BG sites" do + Dir.mktmpdir do |dir| + File.write(File.join(dir, "source.cht"), "FN main() ->\n x = BG { 1 };\nEND\n") + File.write(File.join(dir, "transpiled.zig"), <<~ZIG) + // CLEAR_PROFILE_TASK_SITE id=7 kind=BG line=2 column=7 dispatch=local form=fsm + ZIG + rows = [{ + id: 7, + runs: 10, + exits: 5, + total_lifetime_ns: 15_000, + scheds: { 0 => 9, 1 => 1 }, + dispatch: "local", + form: "fsm", + }] + + out = capture_stdout { Doctor.emit_parallel_bg_hint!(dir, rows) } + + expect(out).to include("Exact imbalanced local BG task sites") + expect(out).to include("line 2: x = BG { 1 };") + expect(out).to include("site=7 form=fsm runs=10 sched=0 90% avg=3.0us") + expect(out).to include("Use `BG { @parallel -> ... 
}`") + end + end + + it "falls back to source scanning when profile metadata has local dispatches" do + Dir.mktmpdir do |dir| + File.write(File.join(dir, "source.cht"), <<~CLEAR) + FN main() -> + a = BG { 1 }; + b = BG { @parallel -> 2 }; + END + CLEAR + File.write(File.join(dir, "transpiled.zig"), <<~ZIG) + try rt.getSched().submitSpawn(...); + try rt.getSched().submitSpawn(...); + try CheatHeader.spawnBest(...); + ZIG + + out = capture_stdout { Doctor.emit_parallel_bg_hint!(dir, []) } + + expect(out).to include("Profile contains local BG dispatches") + expect(out).to include("Candidate BG sites:") + expect(out).to include("line 2: a = BG { 1 };") + expect(out).not_to include("line 3: b = BG") + end + end + + it "parses task metadata and handles missing source lines" do + Dir.mktmpdir do |dir| + File.write(File.join(dir, "transpiled.zig"), <<~ZIG) + // CLEAR_PROFILE_TASK_SITE id=0 kind=BG line=1 column=1 dispatch=local form=fsm + // CLEAR_PROFILE_TASK_SITE id=12 kind=BG line=4 column=9 dispatch=parallel form=stack + ZIG + + expect(Doctor.task_site_metadata(dir)[12]).to include( + kind: "BG", + line: 4, + column: 9, + dispatch: "parallel", + form: "stack", + ) + expect(Doctor.source_line(dir, "?")).to eq("") + expect(Doctor.source_line(dir, 4)).to eq("") + end + end end diff --git a/spec/fsm_classifier_spec.rb b/spec/fsm_classifier_spec.rb index 70540aa1..e6c2d2ed 100644 --- a/spec/fsm_classifier_spec.rb +++ b/spec/fsm_classifier_spec.rb @@ -1,4 +1,6 @@ require "rspec" +require "open3" +require "tempfile" require_relative "../src/backends/transpiler" require_relative "../src/ast/ast" require_relative "../src/annotator-helpers/effects" @@ -255,10 +257,19 @@ def transpile(source) expect(user_code).not_to include("submitSpawn(") end - it "emits spawnFsmBest for an unpinned pure-compute BG" do + it "emits submitFsmSpawn for a default pure-compute BG" do src = "FN main() RETURNS Void -> p: ~Int64 = BG { 42; }; _ = NEXT p; RETURN; END" user_code = transpile(src).split("// 3. Main Entry").first + expect(user_code).to include("submitFsmSpawn(__bg0_ctx.task)") + expect(user_code).not_to include("CheatHeader.spawnFsmBest(") + expect(user_code).not_to include("CheatHeader.spawnBest(") + end + + it "emits spawnFsmBest for an explicit @parallel pure-compute BG" do + src = "FN main() RETURNS Void -> p: ~Int64 = BG { @parallel -> 42; }; _ = NEXT p; RETURN; END" + user_code = transpile(src).split("// 3. Main Entry").first expect(user_code).to include("CheatHeader.spawnFsmBest(__bg0_ctx.task)") + expect(user_code).not_to include("submitFsmSpawn(__bg0_ctx.task)") expect(user_code).not_to include("CheatHeader.spawnBest(") end @@ -274,11 +285,11 @@ def transpile(source) expect(match[1].strip).to start_with("task: *CheatHeader.FsmTask") end - it "resumeFn returns Done, clears wg, and destroys the ctx" do + it "resumeFn returns Done, clears wg, and frees the ctx through the runtime helper" do src = "FN main() RETURNS Void -> p: ~Int64 = BG { 42; }; _ = NEXT p; RETURN; END" user_code = transpile(src).split("// 3. 
Main Entry").first expect(user_code).to include("__ctx_0.inner.wg.done()") - expect(user_code).to include("__ctx_0.alloc.destroy(__ctx_0)") + expect(user_code).to include("CheatHeader.freeFsmCtx(@This(), __fsm_task, __ctx_0)") expect(user_code).to include("return .{ .Done = {} }") end @@ -291,6 +302,33 @@ def transpile(source) expect(user_code).not_to include("spawnFsmBest") end + it "falls back to stackful for BG with @stack wildcard sizing" do + src = "FN main() RETURNS Void -> p: ~Int64 = BG { @stack -> 7; }; _ = NEXT p; RETURN; END" + user_code = transpile(src).split("// 3. Main Entry").first + expect(user_code).to include("submitSpawn").or include("spawnBest(") + expect(user_code).to include(".stack_size = .Micro") + expect(user_code).not_to include("FsmTask") + expect(user_code).not_to include("spawnFsmBest") + end + + it "emits a compile error telling oversized FSM ctx users to use @stack" do + vars = (0...34).map { |i| " a#{i}: Int64 = #{i}_i64;" }.join("\n") + sum = (0...34).map { |i| "a#{i}" }.join(" + ") + src = <<~CLEAR + FN main() RETURNS Void -> + #{vars} + p: ~Int64 = BG { #{sum}; }; + result: Int64 = NEXT p; + RETURN; + END + CLEAR + + user_code = transpile(src).split("// 3. Main Entry").first + expect(user_code).to include("@compileError") + expect(user_code).to include("FSM context is larger than 256 bytes") + expect(user_code).to include("use @stack") + end + it "falls back to stackful for BG that transitively calls @reentrant" do src = <<~CLEAR FN countDown(n: Int64) RETURNS Void @reentrant -> diff --git a/spec/fsm_liveness_spec.rb b/spec/fsm_liveness_spec.rb index 91ff76a6..d5e28304 100644 --- a/spec/fsm_liveness_spec.rb +++ b/spec/fsm_liveness_spec.rb @@ -69,6 +69,20 @@ def io_call(name, args, stdlib_def) expect(result.cross_segment_vars).to have_key("buf") end + it "flags a pre-decl referenced by the suspend call receiver" do + pre_file = bind_decl("file", AST::Literal.new("{}", :File), full_type: :File) + receiver = ident("file") + call = io_call("read", [], io_def) + call.receiver = receiver + seg0 = FsmTransform::Segments::Segment.new(0, [pre_file], + FsmTransform::Segments::IoSuspend.new(call, io_def, nil)) + seg1 = FsmTransform::Segments::Segment.new(1, [], + FsmTransform::Segments::Done.new(nil)) + + result = FsmTransform::Liveness.analyze([seg0, seg1], { captured: {} }) + expect(result.cross_segment_vars).to have_key("file") + end + it "does NOT flag a pre-decl used only within its own segment" do # x is declared and used in seg 0, never read in seg 1 nor in # the tail's args. 
diff --git a/spec/fsm_transform_emit_spec.rb b/spec/fsm_transform_emit_spec.rb new file mode 100644 index 00000000..34e05efc --- /dev/null +++ b/spec/fsm_transform_emit_spec.rb @@ -0,0 +1,15 @@ +require "rspec" +require_relative "../src/mir/fsm_transform/emit" + +RSpec.describe FsmTransform::Emit do + it "maps profile dispatch ids and emits task-site comments" do + expect(described_class.profile_dispatch_id(:local)).to eq(1) + expect(described_class.profile_dispatch_id(:parallel)).to eq(2) + expect(described_class.profile_dispatch_id(:shared)).to eq(3) + expect(described_class.profile_dispatch_id(:unexpected)).to eq(1) + + ctx = { profile_site_id: 11, profile_line: 22, profile_column: 5 } + expect(described_class.bg_profile_site_comment(ctx, :parallel, :fsm)) + .to eq("// CLEAR_PROFILE_TASK_SITE id=11 kind=BG line=22 column=5 dispatch=parallel form=fsm") + end +end diff --git a/spec/fsm_wrapper_emitter_spec.rb b/spec/fsm_wrapper_emitter_spec.rb index 444a16fe..6fd36823 100644 --- a/spec/fsm_wrapper_emitter_spec.rb +++ b/spec/fsm_wrapper_emitter_spec.rb @@ -67,6 +67,32 @@ def body(blk_label: "__bg0", **kw) MIR::FsmIoBody.new(blk_label, ctx_struct(**kw), spawn_setup) end + def b1_body + MIR::FsmB1Body.new( + "__bg_b1", + MIR::FsmB1CtxStruct.new( + "__BgB1Ctx", + "CheatLib.Promise(i64)", + "value: i64,", + MIR::FsmStep.new(0, 0, "__rt_b1", "_ = &__rt_b1;", [ + MIR::RawZig.new("__ctx_0.inner.result = __ctx_0.value;", :fsm_body, nil, nil), + ]), + ), + MIR::FsmSpawnSetup.new( + "__bg_b1_alloc", + "rt.heapAlloc()", + "__bg_b1_promise", + "CheatLib.Promise(i64)", + "", + "__bg_b1_ctx", + "__BgB1Ctx", + ".task = undefined,\n.rt = rt,\n.inner = __bg_b1_promise.inner,\n.alloc = __bg_b1_alloc,\n.value = 42,", + "try CheatHeader.spawnFsmBest(__bg_b1_ctx.task);", + "rt", + ), + ) + end + describe "outer block structure" do it "wraps the entire emission in a labeled block" do out = FsmWrapperEmitter.render(body) @@ -80,6 +106,19 @@ def body(blk_label: "__bg0", **kw) end end + describe "B1 pure-compute body" do + it "emits a single runBody and fixed resume function" do + out = FsmWrapperEmitter.render(b1_body) + + expect(out).to start_with("__bg_b1: {") + expect(out).to include("fn runBody(__ctx_0: *@This()) anyerror!void {") + expect(out).to include("__ctx_0.inner.result = __ctx_0.value;") + expect(out).to include("if (runBody(__ctx_0)) |_| {} else |err|") + expect(out).to include("return .{ .Done = {} };") + expect(out).to include("break :__bg_b1 __bg_b1_promise;") + end + end + describe "ctx struct decl" do it "emits the fixed-prefix fields (task, rt, inner, alloc)" do out = FsmWrapperEmitter.render(body) @@ -150,6 +189,7 @@ def body(blk_label: "__bg0", **kw) # is rendered separately on the ctx struct. 
expect(out).to include("fn destroyTask(") expect(out).to include(".destroy_fn = &__BgCtx0.destroyTask") + expect(out).to include("CheatHeader.freeFsmCtx(@This(), __fsm_task, __ctx_0)") end it "interpolates step-0 error cleanup into the catch arm" do @@ -201,12 +241,12 @@ def body(blk_label: "__bg0", **kw) out = FsmWrapperEmitter.render(body) expect(out).to include("const __bg0_alloc = rt.heapAlloc();") expect(out).to include("const __bg0_promise = try CheatLib.Promise(i64).spawn(__bg0_alloc, rt.getSched());") - expect(out).to include("const __bg0_ctx = try __bg0_alloc.create(__BgCtx0);") - expect(out).to include("errdefer __bg0_alloc.destroy(__bg0_ctx);") + expect(out).to include("const __bg0_ctx_task = try CheatHeader.allocFsmTask(rt, &__BgCtx0.resumeFn);") + expect(out).to include("const __bg0_ctx = try CheatHeader.allocFsmCtx(__BgCtx0, rt, __bg0_ctx_task);") + expect(out).to include("errdefer CheatHeader.freeFsmCtx(__BgCtx0, __bg0_ctx_task, __bg0_ctx);") expect(out).to include("__bg0_ctx.* = .{") expect(out).to include(".rt = rt,") expect(out).to include(".inner = __bg0_promise.inner,") - expect(out).to include("const __bg0_ctx_task = try CheatHeader.allocFsmTask(rt, &__BgCtx0.resumeFn);") expect(out).to include("__bg0_ctx_task.ctx = __bg0_ctx;") expect(out).to include("__bg0_ctx.task = __bg0_ctx_task;") expect(out).to include("try CheatHeader.spawnFsmBest(__bg0_ctx.task);") diff --git a/spec/mir_emitter_spec.rb b/spec/mir_emitter_spec.rb index 60bd3d7a..f0b1fc51 100644 --- a/spec/mir_emitter_spec.rb +++ b/spec/mir_emitter_spec.rb @@ -471,10 +471,27 @@ describe "EscapePromote" do it "emits list promotion" do - node = MIR::EscapePromote.new("items", "CheatLib.ArrayListUnmanaged(i64)", :list, nil, "rt") + node = MIR::EscapePromote.new("items", "CheatLib.ArrayListUnmanaged(i64)", :list, nil, "rt", "i64") expect(e.emit(node)).to eq("try CheatLib.promoteList(i64, rt, &items);") end + it "emits nested list promotion from explicit elem_type" do + node = MIR::EscapePromote.new( + "items", + "CheatLib.ArrayListUnmanaged(CheatLib.Promise(i64))", + :list, + nil, + "rt", + "CheatLib.Promise(i64)" + ) + expect(e.emit(node)).to eq("try CheatLib.promoteList(CheatLib.Promise(i64), rt, &items);") + end + + it "requires explicit elem_type for list promotion" do + node = MIR::EscapePromote.new("items", "CheatLib.ArrayListUnmanaged(i64)", :list, nil, "rt") + expect { e.emit(node) }.to raise_error(RuntimeError, /missing elem_type/) + end + it "emits string_map promotion" do node = MIR::EscapePromote.new("cache", nil, :string_map, nil, "rt") expect(e.emit(node)).to eq("cache.alloc = rt.heapAlloc();") diff --git a/spec/mir_lowering_spec.rb b/spec/mir_lowering_spec.rb index 3bfbd579..98559ee0 100644 --- a/spec/mir_lowering_spec.rb +++ b/spec/mir_lowering_spec.rb @@ -50,6 +50,34 @@ def make_binop(left, op, right) node end + describe "task profile helpers" do + it "injects profile fields into empty and non-empty task configs" do + low = lowering + + expect(low.send(:task_config_with_profile, ".{}", 9, :parallel)).to eq(".{ .profile_site_id = 9, .profile_dispatch = 2 }") + expect(low.send(:task_config_with_profile, ".{ .stack_size = .Large }", 4, :shared)) + .to eq(".{ .stack_size = .Large , .profile_site_id = 4, .profile_dispatch = 3 }") + end + + it "maps unknown dispatches to local and emits profile comments" do + low = lowering + + expect(low.send(:profile_dispatch_id, :unexpected)).to eq(1) + expect(low.send(:bg_profile_site_comment, 5, 12, 3, :unexpected, :stack)) + .to eq("// CLEAR_PROFILE_TASK_SITE id=5 
kind=BG line=12 column=3 dispatch=unexpected form=stack") + end + + it "routes parallel fiber spawn through spawnBest" do + low = lowering + + out = low.send(:fiber_spawn_call_zig, "__rt", "__Worker", "__worker", ".{}", :parallel) + + expect(out).to include("CheatHeader.spawnBest") + expect(out).to include("&__Worker.run") + expect(out).to include("__worker") + end + end + # ========================================================================= # Old MIR translation # ========================================================================= @@ -100,12 +128,13 @@ def make_binop(left, op, right) end it "translates MIR::Promote to MIR::EscapePromote" do - promote = MIR::Promote.new(tok, "items", "ArrayListUnmanaged(i64)", :list, nil) + promote = MIR::Promote.new(tok, "items", "ArrayListUnmanaged(i64)", :list, nil, "i64") result = lowering.lower(promote) expect(result).to be_a(MIR::EscapePromote) expect(result.name).to eq("items") expect(result.strategy).to eq(:list) expect(result.zig_type).to eq("ArrayListUnmanaged(i64)") + expect(result.elem_type).to eq("i64") zig = emit(result) expect(zig).to include("promoteList") end @@ -1648,7 +1677,9 @@ def make_fn(name, params: [], return_type: :Void, body: [], visibility: nil, expect(zig).to include("__BgCtx") expect(zig).to include(".spawn(") expect(zig).to include("fn run(") - expect(zig).to include("spawnBest") + expect(zig).to include("submitSpawn") + expect(zig).to include(".profile_dispatch = 1") + expect(zig).not_to include("spawnBest") end it "lowers BgBlock with captures" do diff --git a/spec/pipeline_backend_coverage_spec.rb b/spec/pipeline_backend_coverage_spec.rb index d40d50d8..7a898de3 100644 --- a/spec/pipeline_backend_coverage_spec.rb +++ b/spec/pipeline_backend_coverage_spec.rb @@ -247,6 +247,12 @@ def task_config_zig(_stack_size, _computed_tier = nil) expect(pipeline_host.send(:substitute_placeholders, AST::StructLit.new(tok, "Box", { "x" => id("_") })).fields["x"].name).to eq("__it") expect(pipeline_host.send(:substitute_placeholders, AST::HashLit.new(tok, { "k" => id("_") })).pairs["k"].name).to eq("__it") expect(pipeline_host.send(:substitute_placeholders, AST::Assert.new(tok, id("_"), nil)).condition.name).to eq("__it") + + if_stmt = AST::IfStatement.new(tok, id("_"), [AST::FuncCall.new(tok, "t", [id("_")])], [AST::FuncCall.new(tok, "e", [id("acc")])]) + replaced = pipeline_host.send(:substitute_placeholders, if_stmt) + expect(replaced.condition.name).to eq("__it") + expect(replaced.then_branch.first.args.first.name).to eq("__it") + expect(replaced.else_branch.first.args.first.name).to eq("__acc") end it "substitutes assignment and bind targets plus SOA EACH fields" do @@ -267,5 +273,42 @@ def task_config_zig(_stack_size, _computed_tier = nil) expect(pipeline_host.send(:ast_stmts_use_placeholder?, [stmt])).to be true expect(pipeline_host.send(:ast_stmts_use_placeholder?, [AST::FuncCall.new(tok, "f", [id("x")])])).to be false end + + it "lowers SHARD+CONCURRENT EACH to a sequential BC loop" do + lowering.instance_variable_set(:@target, :bc) + range = AST::RangeLit.new(tok, lit(0), lit(4), false) + key_expr = id("_") + map = id("counts") + conc = AST::ConcurrentOp.new(tok, AST::EachOp.new(tok, [AST::FuncCall.new(tok, "touch", [id("_")])]), {}) + conc.shard_context = { auto_detected: true, key_expr: key_expr, map_var: map } + + pipeline_host.define_singleton_method(:visit_mir) do |node| + node.is_a?(AST::Literal) ? 
MIR::Lit.new(node.value) : MIR::Ident.new(node.name) + end + pipeline_host.define_singleton_method(:visit_pipeline_body_mir) do |_body, placeholder:| + [MIR::ExprStmt.new(MIR::Ident.new("body_for_#{placeholder}"), nil)] + end + + result = pipeline_host.send(:lower_shard_concurrent_each, range, conc, OpenStruct.new) + + expect(result).to be_a(MIR::ForStmt) + expect(result.capture).to match(/__sh\d+_i/) + expect(result.body.first).to be_a(MIR::Let) + expect(result.body.last.expr.name).to match(/body_for___sh\d+_key/) + ensure + lowering.instance_variable_set(:@target, nil) + end + + it "wraps explicit concurrent batch options for Zig usize use" do + batch = lit(8) + conc = OpenStruct.new(options: { "batch" => batch }) + pipeline_host.define_singleton_method(:visit_mir) { |node| MIR::Lit.new(node.value) } + lowering.define_singleton_method(:emit_expr) { |node| node.respond_to?(:value) ? node.value.to_s : node.name.to_s } + + mir = pipeline_host.send(:bounded_concurrent_batch_mir, conc) + + expect(mir).to be_a(MIR::InlineZig) + expect(mir.code).to eq("@intCast(8)") + end end end diff --git a/spec/shard_concurrent_lowering_spec.rb b/spec/shard_concurrent_lowering_spec.rb new file mode 100644 index 00000000..53699cfb --- /dev/null +++ b/spec/shard_concurrent_lowering_spec.rb @@ -0,0 +1,22 @@ +require_relative "../src/backends/transpiler" + +RSpec.describe "SHARD + CONCURRENT EACH lowering" do + it "emits per-shard bounded channels and workers instead of the serial SHARD loop" do + src = <<~CLEAR + FN main() RETURNS Void -> + MUTABLE counts: HashMap@sharded(4) = {}; + (0..<16_i64) s> SHARD(_ MOD 4_i64, counts) s> CONCURRENT EACH { + counts[_] = (counts[_] OR 0_i64) + 1_i64; + }; + END + CLEAR + + zig = ZigTranspiler.new.transpile(src) + + expect(zig).to include("CheatLib.BoundedChannel(__ShWork") + expect(zig).to include("__ShWorker") + expect(zig).to include("CheatHeader.spawnBest") + expect(zig).to include("putDirect") + expect(zig).not_to include("while ((__sh1_i < __sh1_end)) : (__sh1_i = (__sh1_i + 1))") + end +end diff --git a/spec/stack_tier_spec.rb b/spec/stack_tier_spec.rb index 017c2cf7..389119db 100644 --- a/spec/stack_tier_spec.rb +++ b/spec/stack_tier_spec.rb @@ -331,6 +331,37 @@ def stack_bytes_for(source, fn_name) end describe "BG block auto-sizing" do + it "@stack warns with the computed concrete tier" do + src = <<~CLEAR + FN compute(n: Float64) RETURNS Float64 -> + RETURN n * 2.0; + END + FN main() RETURNS Void -> + p: ~Float64 = BG { @stack -> compute(21.0); }; + result: Float64 = NEXT p; + RETURN; + END + CLEAR + warnings = [] + allow($stderr).to receive(:puts) { |msg| warnings << msg } + analyze(src) + expect(warnings.any? 
{ |w| w.include?("@stack resolved to @micro") && w.include?("replace @stack with @micro") }).to be true + end + + it "@stack does not silently become @service for plain reentrant callees" do + src = <<~CLEAR + FN fib(n: Float64) RETURNS Float64 @reentrant -> + IF n < 2.0 THEN RETURN n; END + RETURN fib(n - 1.0) + fib(n - 2.0); + END + FN main() RETURNS Void -> + p: ~Float64 = BG { @stack -> fib(10.0); }; + result: Float64 = NEXT p; RETURN; + END + CLEAR + expect { analyze(src) }.to raise_error(CompilerError, /Declare `@service` explicitly/) + end + it "assigns computed_stack_tier to BG blocks" do src = <<~CLEAR FN compute(n: Float64) RETURNS Float64 -> diff --git a/src/annotator-helpers/pipe_analysis.rb b/src/annotator-helpers/pipe_analysis.rb index b59f93e0..b81e3611 100644 --- a/src/annotator-helpers/pipe_analysis.rb +++ b/src/annotator-helpers/pipe_analysis.rb @@ -1286,45 +1286,76 @@ def analyze_shard_op(node) node.storage = :stack end - VALID_CONCURRENT_OPTIONS = %w[workers capacity parallel size].freeze + VALID_CONCURRENT_OPTIONS = %w[workers capacity batch parallel size].freeze VALID_CONCURRENT_SIZES = %w[MICRO STANDARD LARGE XL].freeze + def validate_positive_numeric_concurrent_option!(name, expr) + visit(expr) + unless [:Float64, :Int64].include?(expr.resolved_type) + error!(expr, "CONCURRENT #{name} must be a number, got #{expr.resolved_type}") + end + + literal_val = numeric_literal_value(expr) + if literal_val && literal_val <= 0 + error!(expr, "CONCURRENT #{name} must be greater than 0, got #{literal_val.to_i}") + end + end + + def numeric_literal_value(expr) + if expr.is_a?(AST::Literal) + expr.value.to_f + elsif expr.is_a?(AST::UnaryOp) && expr.op == :SUB + right = expr.right + -right.value.to_f if right.is_a?(AST::Literal) + end + end + + def queue_backed_concurrent_source?(node) + lhs = node.left + lhs_type = lhs.type_info + shard_concurrent_source?(lhs) || bounded_stream_source?(lhs) || + lhs_type&.inf_stream? || lhs_type&.dynamic_stream? || + lhs_type&.open_stream? || lhs.is_a?(AST::RangeLit) + end + + def shard_concurrent_source?(lhs) + lhs.is_a?(AST::BinaryOp) && lhs.op == :SMOOTH && lhs.right.is_a?(AST::ShardOp) + end + def analyze_concurrent_op(node) conc = node.right # the ConcurrentOp node options = conc.options lhs_type = node.left.type_info + # Detect SHARD predecessor: (range) s> SHARD(key, map) s> CONCURRENT EACH { ... } + # node.left is BinaryOp(SMOOTH, range, ShardOp) when SHARD precedes CONCURRENT. 
+ shard_node = nil + if shard_concurrent_source?(node.left) + shard_node = node.left.right + target_info = shard_node.target_map.type_info + conc.shard_context = { + map_var: shard_node.target_map, + shard_count: target_info&.shard_count, + key_expr: shard_node.key_expr + } + end + # Validate workers option if present if (ps = options["workers"]) - visit(ps) - unless [:Float64, :Int64].include?(ps.resolved_type) - error!(ps, "CONCURRENT workers must be a number, got #{ps.resolved_type}") - end - # Validate workers > 0 for literal values (including negated literals like -1) - literal_val = if ps.is_a?(AST::Literal) - ps.value.to_f - elsif ps.is_a?(AST::UnaryOp) && ps.op == :SUB && ps.right.is_a?(AST::Literal) - -ps.right.value.to_f - end - if literal_val && literal_val <= 0 - error!(ps, "CONCURRENT workers must be greater than 0, got #{literal_val.to_i}") - end + validate_positive_numeric_concurrent_option!("workers", ps) end # Validate capacity option if present if (cap = options["capacity"]) - visit(cap) - unless [:Float64, :Int64].include?(cap.resolved_type) - error!(cap, "CONCURRENT capacity must be a number, got #{cap.resolved_type}") - end - literal_val = if cap.is_a?(AST::Literal) - cap.value.to_f - elsif cap.is_a?(AST::UnaryOp) && cap.op == :SUB && cap.right.is_a?(AST::Literal) - -cap.right.value.to_f - end - if literal_val && literal_val <= 0 - error!(cap, "CONCURRENT capacity must be greater than 0, got #{literal_val.to_i}") + unless queue_backed_concurrent_source?(node) + error!(cap, "CONCURRENT capacity only applies to stream or sharded sources; use batch: N to control work chunking for collections") end + validate_positive_numeric_concurrent_option!("capacity", cap) + end + + # Validate batch option if present + if (batch = options["batch"]) + validate_positive_numeric_concurrent_option!("batch", batch) end # Validate parallel option is Bool if present @@ -1352,19 +1383,6 @@ def analyze_concurrent_op(node) end end - # Detect SHARD predecessor: (range) s> SHARD(key, map) s> CONCURRENT EACH { ... } - # node.left is BinaryOp(SMOOTH, range, ShardOp) when SHARD precedes CONCURRENT. - shard_node = nil - if node.left.is_a?(AST::BinaryOp) && node.left.op == :SMOOTH && node.left.right.is_a?(AST::ShardOp) - shard_node = node.left.right - target_info = shard_node.target_map.type_info - conc.shard_context = { - map_var: shard_node.target_map, - shard_count: target_info&.shard_count, - key_expr: shard_node.key_expr - } - end - # Type analysis for concurrent ops is identical to synchronous versions. # Create a proxy BinaryOp(SMOOTH, left, inner_op) so we can reuse the existing analyze_* methods. proxy = AST::BinaryOp.new(node.token, node.left, :SMOOTH, conc.op) diff --git a/src/annotator.rb b/src/annotator.rb index 0ec07f39..0c6f2aaa 100644 --- a/src/annotator.rb +++ b/src/annotator.rb @@ -6118,6 +6118,13 @@ def validate_fiber_stack!(node, call_names, user_size, can_smash) end end + if user_size == :stack + loc = node.respond_to?(:line) ? " (line #{node.line})" : "" + $stderr.puts "\e[33m[Warning]\e[0m Stack sizing: @stack resolved to @#{computed}; " \ + "replace @stack with @#{computed}. 
In STRICT mode, @stack will be rejected.#{loc}" + return + end + # User-specified size too small if user_size && TIER_ORDER.fetch(user_size, 0) < TIER_ORDER.fetch(computed, 0) error!(node, "Stack safety: @#{user_size} (#{EffectTracker::STACK_TIER_BUDGET[user_size]} bytes) " \ diff --git a/src/ast/ast.rb b/src/ast/ast.rb index 2454fe01..7182c866 100644 --- a/src/ast/ast.rb +++ b/src/ast/ast.rb @@ -1175,13 +1175,14 @@ def needs_cleanup; true; end # Promote: escape promotion inserted before return statements. # Emits frame->heap copy/promotion code. Replaces PromotionClassifier lookups in transpiler. # - # strategy: :list — promoteList (dupe backing buffer to heap) + # strategy: :list — promoteList (dupe backing buffer to heap; elem_type required) # :string_map — swap allocator to heapAlloc # :fields — promoteFields (recursive field promotion) # :generic — promote (single value deep copy) - Promote = Struct.new(:token, :name, :zig_type, :strategy, :fields) do + Promote = Struct.new(:token, :name, :zig_type, :strategy, :fields, :elem_type) do include AST::Locatable # fields: Set of field names for :fields strategy (nil = all fields) + # elem_type: Zig element type for :list promotion. end # SuppressCleanup: move suppression marker inserted at consumption points diff --git a/src/ast/parser.rb b/src/ast/parser.rb index 48e6644d..c5136c05 100644 --- a/src/ast/parser.rb +++ b/src/ast/parser.rb @@ -3340,6 +3340,7 @@ def parse_lock_rank_arg!(sigil_tok, attrs, dims) # After `:` the next word is also looked up here (with `@` prepended if absent). DO_BRANCH_SIGILS = { '@micro' => { stack_size: :micro }, + '@stack' => { stack_size: :stack }, '@standard' => { stack_size: :standard }, '@large' => { stack_size: :large }, '@xl' => { stack_size: :xl }, @@ -3352,6 +3353,7 @@ def parse_lock_rank_arg!(sigil_tok, attrs, dims) # Sigils valid at the start of a BG body (stack size + pinned). BG_SIGILS = { '@micro' => { stack_size: :micro }, + '@stack' => { stack_size: :stack }, '@standard' => { stack_size: :standard }, '@large' => { stack_size: :large }, '@xl' => { stack_size: :xl }, @@ -3380,7 +3382,7 @@ def parse_branch_prefix cap_name = tok.value.start_with?('@') ? tok.value : "@#{tok.value}" attrs = DO_BRANCH_SIGILS[cap_name] error!(tok, "Unknown branch prefix #{tok.value.inspect}. " \ - "Expected @micro, @standard, @large, @xl, @pinned, @parallel, or @canSmash") unless attrs + "Expected @micro, @stack, @standard, @large, @xl, @service, @pinned, @parallel, or @canSmash") unless attrs if attrs[:stack_size] error!(tok, "Duplicate stack size in branch prefix") if stack_size diff --git a/src/ast/std_lib.rb b/src/ast/std_lib.rb index 19fc8f2a..7651d687 100644 --- a/src/ast/std_lib.rb +++ b/src/ast/std_lib.rb @@ -643,6 +643,14 @@ bc: true, }, + # Peak virtual memory size (VmPeak) in KB. + "peakVirtualMemoryKb" => { + args: [], + return: :Int64, + zig: "CheatLib.peakVirtualMemoryKb()", + bc: true, + }, + # Current resident set size (VmRSS) in KB — physical memory in use right now. "currentMemoryKb" => { args: [], @@ -651,6 +659,14 @@ bc: true, }, + # Benchmark helper: touch pages in the current fiber's allocated stack slice. + # No-op on root-stack/FSM execution. + "touchCurrentFiberStack" => { + args: [:Int64, :Int64], + return: :Int64, + zig: "CheatLib.touchCurrentFiberStack({0}, {1})", + }, + # Sleep the current fiber for N milliseconds. Cooperative — other fibers run. 
# Usage: sleep(100); "sleep" => { @@ -1131,49 +1147,49 @@ # {0}=elem_zig_type needsCleanup: { zig: "CheatLib.needsCleanup({0})", bc: true, borrows: :all }, concurrentBoundedSelect: { - zig: "try CheatLib.concurrentBoundedSelect({0}, {1}, {2}, {3}, {4}, {5}, {6}, {7}, {8}, {9}, {10})", + zig: "try CheatLib.concurrentBoundedSelect({0}, {1}, {2}, {3}, {4}, {5}, {6}, {7}, {8}, {9}, {10}, {11})", bc: true, allocates: true }, concurrentBoundedWhere: { - zig: "try CheatLib.concurrentBoundedWhere({0}, {1}, {2}, {3}, {4}, {5}, {6}, {7}, {8}, {9})", + zig: "try CheatLib.concurrentBoundedWhere({0}, {1}, {2}, {3}, {4}, {5}, {6}, {7}, {8}, {9}, {10})", bc: true, allocates: true }, concurrentBoundedEach: { - zig: "try CheatLib.concurrentBoundedEach({0}, {1}, {2}, {3}, {4}, {5}, {6}, {7}, {8})", + zig: "try CheatLib.concurrentBoundedEach({0}, {1}, {2}, {3}, {4}, {5}, {6}, {7}, {8}, {9})", bc: true, borrows: :all }, concurrentStreamSelect: { - zig: "try CheatLib.concurrentStreamSelect({0}, {1}, {2}, {3}, {4}, {5}, {6}, {7}, {8}, {9}, {10}, {11})", + zig: "try CheatLib.concurrentStreamSelect({0}, {1}, {2}, {3}, {4}, {5}, {6}, {7}, {8}, {9}, {10}, {11}, {12})", bc: true, allocates: true }, concurrentStreamWhere: { - zig: "try CheatLib.concurrentStreamWhere({0}, {1}, {2}, {3}, {4}, {5}, {6}, {7}, {8}, {9}, {10})", + zig: "try CheatLib.concurrentStreamWhere({0}, {1}, {2}, {3}, {4}, {5}, {6}, {7}, {8}, {9}, {10}, {11})", bc: true, allocates: true }, concurrentStreamEach: { - zig: "try CheatLib.concurrentStreamEach({0}, {1}, {2}, {3}, {4}, {5}, {6}, {7}, {8}, {9}, {10})", + zig: "try CheatLib.concurrentStreamEach({0}, {1}, {2}, {3}, {4}, {5}, {6}, {7}, {8}, {9}, {10}, {11})", bc: true, borrows: :all }, concurrentListSelect: { - zig: "try CheatLib.concurrentListSelect({0}, {1}, {2}, {3}, {4}, {5}, {6}, {7}, {8}, {9})", + zig: "try CheatLib.concurrentListSelect({0}, {1}, {2}, {3}, {4}, {5}, {6}, {7}, {8}, {9}, {10})", allocates: true }, concurrentListWhere: { - zig: "try CheatLib.concurrentListWhere({0}, {1}, {2}, {3}, {4}, {5}, {6}, {7}, {8})", + zig: "try CheatLib.concurrentListWhere({0}, {1}, {2}, {3}, {4}, {5}, {6}, {7}, {8}, {9})", allocates: true }, concurrentListEach: { - zig: "try CheatLib.concurrentListEach({0}, {1}, {2}, {3}, {4}, {5}, {6}, {7})", + zig: "try CheatLib.concurrentListEach({0}, {1}, {2}, {3}, {4}, {5}, {6}, {7}, {8})", borrows: :all }, concurrentListEachInPlace: { - zig: "try CheatLib.concurrentListEachInPlace({0}, {1}, {2}, {3}, {4}, {5}, {6}, {7})", + zig: "try CheatLib.concurrentListEachInPlace({0}, {1}, {2}, {3}, {4}, {5}, {6}, {7}, {8})", borrows: :all }, }.freeze diff --git a/src/backends/pipeline_generator.rb b/src/backends/pipeline_generator.rb index cc9e9019..7b8d3973 100644 --- a/src/backends/pipeline_generator.rb +++ b/src/backends/pipeline_generator.rb @@ -1737,6 +1737,11 @@ def concurrent_spawn_call(options, wg_var, ctx_type, ctx_var) end end + def concurrent_batch_code(options) + batch = options["batch"] + batch ? visit(batch) : "1" + end + # Inspect the expression for OR PRUNE / OR RAISE error policy # Returns [:prune, inner_expr], [:raise, inner_expr], or [:default, expr] def extract_concurrent_error_policy(expr) @@ -1794,6 +1799,7 @@ def transpile_concurrent_select(list_node, select_op, id, workers_code, rt_name, # Persistent worker pool: spawn N workers that each pull items # from a shared atomic index. Zero per-item heap allocation. 
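+      # Batching note: with `batch: N` each fetchAdd below claims N indices and the
+      # worker drains [__start, __end) before checkYield; the default claim size is 1.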
spawn_call = concurrent_spawn_call(options, "__ccs#{id}_wg", "__CcsWorker#{id}", "__ccs#{id}_workers[__w]") + batch_code = concurrent_batch_code(options) <<~ZIG.chomp #{@current_pipe_label}: { @@ -1808,10 +1814,12 @@ def transpile_concurrent_select(list_node, select_op, id, workers_code, rt_name, for (__ccs#{id}_results) |*__s| __s.* = null;#{err_decl} var __ccs#{id}_wg = CheatHeader.WaitGroup.init(#{rt_name}.getSched()); const __ccs#{id}_n_workers: usize = @intCast(#{workers_code}); + const __ccs#{id}_batch: usize = @max(@as(usize, @intCast(#{batch_code})), 1); const __CcsWorker#{id} = struct { wg: *CheatHeader.WaitGroup, items: []const #{item_zig}, results: []?#{result_zig}, + batch: usize, next: *std.atomic.Value(usize),#{err_field} fn run(raw_rt: *anyopaque, raw_args: ?*anyopaque) anyerror!void { const __rt = @as(*Runtime, @ptrCast(@alignCast(raw_rt))); @@ -1819,9 +1827,12 @@ def transpile_concurrent_select(list_node, select_op, id, workers_code, rt_name, const ctx = @as(*@This(), @ptrCast(@alignCast(raw_args.?))); defer ctx.wg.done(); while (true) { - const __idx = ctx.next.fetchAdd(1, .monotonic); - if (__idx >= ctx.items.len) break; - #{fiber_result_code} + const __start = ctx.next.fetchAdd(ctx.batch, .monotonic); + if (__start >= ctx.items.len) break; + const __end = @min(__start + ctx.batch, ctx.items.len); + for (__start..__end) |__idx| { + #{fiber_result_code} + } __rt.checkYield(); } } @@ -1835,6 +1846,7 @@ def transpile_concurrent_select(list_node, select_op, id, workers_code, rt_name, .wg = &__ccs#{id}_wg, .items = __ccs#{id}_items, .results = __ccs#{id}_results, + .batch = __ccs#{id}_batch, .next = &__ccs#{id}_next,#{err_ctx_init} }; #{spawn_call} @@ -1888,6 +1900,7 @@ def transpile_concurrent_where(list_node, where_op, id, workers_code, rt_name, o end spawn_call = concurrent_spawn_call(options, "__ccw#{id}_wg", "__CcwWorker#{id}", "__ccw#{id}_workers[__w]") + batch_code = concurrent_batch_code(options) <<~ZIG.chomp #{@current_pipe_label}: { @@ -1902,10 +1915,12 @@ def transpile_concurrent_where(list_node, where_op, id, workers_code, rt_name, o for (__ccw#{id}_results) |*__s| __s.* = null;#{err_decl} var __ccw#{id}_wg = CheatHeader.WaitGroup.init(#{rt_name}.getSched()); const __ccw#{id}_n_workers: usize = @intCast(#{workers_code}); + const __ccw#{id}_batch: usize = @max(@as(usize, @intCast(#{batch_code})), 1); const __CcwWorker#{id} = struct { wg: *CheatHeader.WaitGroup, items: []const #{item_zig}, results: []?#{item_zig}, + batch: usize, next: *std.atomic.Value(usize),#{err_field} fn run(raw_rt: *anyopaque, raw_args: ?*anyopaque) anyerror!void { const __rt = @as(*Runtime, @ptrCast(@alignCast(raw_rt))); @@ -1913,9 +1928,12 @@ def transpile_concurrent_where(list_node, where_op, id, workers_code, rt_name, o const ctx = @as(*@This(), @ptrCast(@alignCast(raw_args.?))); defer ctx.wg.done(); while (true) { - const __idx = ctx.next.fetchAdd(1, .monotonic); - if (__idx >= ctx.items.len) break; - #{pred_body} + const __start = ctx.next.fetchAdd(ctx.batch, .monotonic); + if (__start >= ctx.items.len) break; + const __end = @min(__start + ctx.batch, ctx.items.len); + for (__start..__end) |__idx| { + #{pred_body} + } __rt.checkYield(); } } @@ -1929,6 +1947,7 @@ def transpile_concurrent_where(list_node, where_op, id, workers_code, rt_name, o .wg = &__ccw#{id}_wg, .items = __ccw#{id}_items, .results = __ccw#{id}_results, + .batch = __ccw#{id}_batch, .next = &__ccw#{id}_next,#{err_ctx_init} }; #{spawn_call} @@ -1968,6 +1987,7 @@ def transpile_concurrent_each(list_node, each_op, id, 
workers_code, rt_name, opt end spawn_call = concurrent_spawn_call(options, "__cce#{id}_wg", "__CceWorker#{id}", "__cce#{id}_workers[__w]") + batch_code = concurrent_batch_code(options) <<~ZIG.chomp { @@ -1980,9 +2000,11 @@ def transpile_concurrent_each(list_node, each_op, id, workers_code, rt_name, opt if (__cce#{id}_len == 0) {} else { var __cce#{id}_wg = CheatHeader.WaitGroup.init(#{rt_name}.getSched()); const __cce#{id}_n_workers: usize = @intCast(#{workers_code}); + const __cce#{id}_batch: usize = @max(@as(usize, @intCast(#{batch_code})), 1); const __CceWorker#{id} = struct { wg: *CheatHeader.WaitGroup, items: []#{item_zig}, + batch: usize, next: *std.atomic.Value(usize), fn run(raw_rt: *anyopaque, raw_args: ?*anyopaque) anyerror!void { const __rt = @as(*Runtime, @ptrCast(@alignCast(raw_rt))); @@ -1990,11 +2012,14 @@ def transpile_concurrent_each(list_node, each_op, id, workers_code, rt_name, opt const ctx = @as(*@This(), @ptrCast(@alignCast(raw_args.?))); defer ctx.wg.done(); while (true) { - const __idx = ctx.next.fetchAdd(1, .monotonic); - if (__idx >= ctx.items.len) break; - const __each_item = &ctx.items[__idx]; - _ = __each_item; - #{body_code} + const __start = ctx.next.fetchAdd(ctx.batch, .monotonic); + if (__start >= ctx.items.len) break; + const __end = @min(__start + ctx.batch, ctx.items.len); + for (__start..__end) |__idx| { + const __each_item = &ctx.items[__idx]; + _ = __each_item; + #{body_code} + } __rt.checkYield(); } } @@ -2007,6 +2032,7 @@ def transpile_concurrent_each(list_node, each_op, id, workers_code, rt_name, opt __cce#{id}_workers[__w] = .{ .wg = &__cce#{id}_wg, .items = @constCast(__cce#{id}_items), + .batch = __cce#{id}_batch, .next = &__cce#{id}_next, }; #{spawn_call} diff --git a/src/backends/pipeline_host.rb b/src/backends/pipeline_host.rb index 5914da67..6ec05fa9 100644 --- a/src/backends/pipeline_host.rb +++ b/src/backends/pipeline_host.rb @@ -314,6 +314,18 @@ def substitute_placeholders(node) copy_type_info(node, new_assert) return new_assert end + when AST::IfStatement + new_cond = substitute_placeholders(node.condition) + new_then = node.then_branch.map { |stmt| substitute_placeholders(stmt) } + new_else = node.else_branch&.map { |stmt| substitute_placeholders(stmt) } + if new_cond != node.condition || new_then != node.then_branch || new_else != node.else_branch + new_if = AST::IfStatement.new(node.token, new_cond, new_then, new_else, node.then_drops, node.else_drops) + new_if.expr_mode = node.expr_mode if node.respond_to?(:expr_mode) + new_if.then_result_type = node.then_result_type if node.respond_to?(:then_result_type) + new_if.else_result_type = node.else_result_type if node.respond_to?(:else_result_type) + copy_type_info(node, new_if) + return new_if + end end node @@ -2317,15 +2329,11 @@ def lower_range_fold_observable(p, smooth_node, label, source_node, .gsub(/\b__obs_acc\b/, "ctx.acc") .gsub(/\b#{Regexp.escape(p[:source_name])}\b/, "ctx.gen") - # Spawn the consumer cross-scheduler (unpinned) so in - # multi-threaded mode it can run on a different worker thread - # than the joiner. This is critical for hot-poll patterns - # (`WHILE current < expected DO WITH VIEW running AS s ... END`) - # where the joiner is a pinned main: pinned-first pickNext - # (scheduler.zig:826-830) would starve a same-scheduler - # consumer. With the consumer on a sibling scheduler, the two - # OS threads make actual concurrent progress -- main can spin - # on `.view()` and the consumer publishes items in parallel. + # Spawn the consumer on the source scheduler. 
Observable terminal + # consumers are tightly coupled to their source Stream; keeping both + # sides local avoids cross-scheduler stream waiter handoff on the + # hot path and lets producer/consumer/main make cooperative progress + # through the scheduler's normal yield points. task_cfg = @lowering.send(:task_config_zig, nil, nil) spawn_zig = <<~ZIG.chomp @@ -2344,7 +2352,7 @@ def lower_range_fold_observable(p, smooth_node, label, source_node, const #{ctx_var} = #{rt_name}.heapAlloc().create(#{ctx_type}) catch unreachable; errdefer #{rt_name}.heapAlloc().destroy(#{ctx_var}); #{ctx_var}.* = .{ .acc = __obs_acc, .gen = #{p[:source_name]} }; - try CheatHeader.spawnBest( + try #{rt_name}.getSched().submitSpawn( @intFromPtr(&Runtime.entryWrapper), @as(CheatHeader.TaskFn, @ptrCast(&#{ctx_type}.run)), #{ctx_var}, @@ -3027,25 +3035,13 @@ def extract_concurrent_error_policy_for_bc(expr) [:default, expr] end - # SHARD + CONCURRENT EACH (both backends): produces structural MIR - # describing a fused single-fiber loop. The "CONCURRENT" name is - # misleading -- the original transpile_shard_concurrent_each emitted - # a serial while-loop with no real fibers; the body's map[k] = v - # used putDirect with the shard idx computed per iteration. - # - # Both backends produce the same shape: - # ScopeBlock { - # // (Zig only) Let map_ptr = ↦ map_ptr.ensureOwnership(); - # // (Zig only) ForStmt body has Let sh = @TypeOf(map.*).shardIndexWithHash(key) - # ForStmt(IterRange(start, end), idx_var, [ - # Let key_var = key_expr_with_idx - # ...body lowered with shard_context set so map[k]=v - # dispatches to ShardedMapPut(shard_direct)... - # ]) - # } + # SHARD + CONCURRENT EACH. # - # For BC, shard_idx/shard_key on the ShardedMapPut are ignored and - # the put compiles to plain MAP_PUT. + # BC remains sequential because the VM has no scheduler ownership model. + # Zig uses one bounded channel and one worker fiber per shard. The producer + # computes the routing key/hash and enqueues an owned WorkItem; each worker + # serially drains its shard and lowers the body in shard-direct mode so + # map[k]/map[k]=v compile to getDirect/putDirect. def lower_shard_concurrent_each(lhs, conc_op, smooth_node) ctx = conc_op.shard_context each_op = conc_op.op @@ -3067,16 +3063,17 @@ def lower_shard_concurrent_each(lhs, conc_op, smooth_node) map_node = ctx[:map_var] map_var_name = map_node.is_a?(AST::Identifier) ? map_node.name.to_s : nil + unless is_bc + return lower_shard_concurrent_each_zig( + id, range_node, conc_op, each_op, ctx, map_node, map_var_name, + idx_var, key_var, sh_var, map_ptr, start_mir, end_mir) + end + # Set mir_lowering's @shard_context so the body's map[k] = v dispatches - # to ShardedMapPut/Get with shard_direct mode (Zig) -- BC ignores these - # fields but the same path runs for both backends. + # to ShardedMapPut/Get with shard_direct mode. BC ignores those fields, so + # keep it nil and compile a correctness-equivalent serial loop. prev_ctx = @lowering.instance_variable_get(:@shard_context) - @lowering.instance_variable_set(:@shard_context, is_bc ? 
nil : { - map: map_var_name, - idx: "#{sh_var}.shard", - key: key_var, - hash: "#{sh_var}.hash" - }) + @lowering.instance_variable_set(:@shard_context, nil) key_mir, body_mir = nil, nil begin @@ -3100,45 +3097,239 @@ def lower_shard_concurrent_each(lhs, conc_op, smooth_node) inner << MIR::InlineZig.new("defer #{rt}.restoreLoopMark(#{lm_var});", "shard_loop_restore_mark") end inner << MIR::Let.new(key_var, key_mir, false, nil, nil) - unless is_bc - # __sh#_sh = @TypeOf(__sh#_map.*).shardIndexWithHash(__sh#_key) - # Used by the body's putDirect / getDirect (via ShardedMapPut shard_idx). - inner << MIR::Let.new(sh_var, - MIR::Call.new("@TypeOf(#{map_ptr}.*).shardIndexWithHash", - [MIR::Ident.new(key_var)], false), - false, nil, nil) - end inner.concat(body_mir) - if is_bc - # BC: ForStmt over IterRange (the VM iterates Int64 directly so - # the idx_var binds an Int64). No map ptr setup or ensureOwnership. - return MIR::ForStmt.new( - MIR::IterRange.new(start_mir, end_expr), - idx_var, inner, nil) + # BC: ForStmt over IterRange (the VM iterates Int64 directly so the idx_var + # binds an Int64). No map ptr setup, channels, or ensureOwnership. + MIR::ForStmt.new(MIR::IterRange.new(start_mir, end_expr), idx_var, inner, nil) + end + + def lower_shard_concurrent_each_zig(id, range_node, conc_op, each_op, ctx, + map_node, map_var_name, idx_var, key_var, + sh_var, map_ptr, start_mir, end_mir) + shard_count = ctx[:shard_count] || map_node.type_info&.shard_count + raise "SHARD target missing shard_count" unless shard_count + + map_t = map_node.type_info + key_t = if map_t&.numeric_map? && map_t&.key_type + map_t.key_type + else + Type.new(:String) + end + key_zig = key_t.zig_type + string_key = !map_t&.numeric_map? + + start_zig = @lowering.send(:emit_expr, start_mir) + end_zig = @lowering.send(:emit_expr, end_mir) + cap_mir = stream_concurrent_capacity_mir(conc_op, shard_count.to_s) + cap_zig = @lowering.send(:emit_expr, cap_mir) + batch_mir = bounded_concurrent_batch_mir(conc_op) + batch_zig = @lowering.send(:emit_expr, batch_mir) + task_cfg = task_config_zig(conc_op.options["size"]&.name&.downcase&.to_sym) + + key_mir = with_pipeline_context(placeholder: idx_var) { visit_mir(ctx[:key_expr]) } + key_zig_expr = @lowering.send(:emit_expr, key_mir) + + caps = FiberCtxBuilder.build(conc_op.capture_analysis, body_access_prefix: "ctx") + shard_map_field = "__shard_map" + map_capture_map = map_var_name ? { map_var_name => "ctx.#{shard_map_field}.*" } : {} + capture_fields_arr = [" #{shard_map_field}: *@TypeOf(#{map_ptr}.*),"] + capture_fields_arr.concat(caps.specs.map { |s| " #{s.name}: #{s.field_type_zig}," }) + capture_fields = capture_fields_arr.join("\n") + capture_inits = [".#{shard_map_field} = #{map_ptr}"] + caps.specs.map { |s| ".#{s.name} = #{s.init_value_zig}" } + capture_inits_str = capture_inits.empty? ? 
"" : ", #{capture_inits.join(", ")}" + + prev_ctx = @lowering.instance_variable_get(:@shard_context) + body_mir = nil + begin + @lowering.instance_variable_set(:@shard_context, { + map: map_var_name, + idx: "ctx.shard", + key: key_var, + hash: "0" + }) + body_mir = with_pipeline_context(placeholder: key_var) do + with_fiber_capture_map(map_capture_map.merge(caps.capture_map), capture_symbols: caps.capture_symbols, rt_override: "__rt") do + visit_pipeline_body_mir(each_op.body, placeholder: key_var) + end + end + ensure + @lowering.instance_variable_set(:@shard_context, prev_ctx) end - # Zig: WhileStmt with explicit i64 counter so the idx_var has the - # right type for @mod/arithmetic in the key expression. ForStmt's - # `for (0..n) |i|` would bind i as usize, mismatching with i64 - # operands in the typical key_expr pattern. + saved_low_rt = @lowering.instance_variable_get(:@rt_name) + @lowering.instance_variable_set(:@rt_name, "__rt") + body_zig = begin + body_mir.filter_map { |m| + code = @lowering.send(:emit_expr, m) + next nil if code.nil? || code.empty? + code.strip.end_with?("}", ";") ? code : "#{code};" + }.join("\n ") + ensure + @lowering.instance_variable_set(:@rt_name, saved_low_rt) + end + + key_loop_mark = if ctx[:key_allocates_frame] + "const __sh#{id}_key_mark = rt.saveLoopMark();\n defer rt.restoreLoopMark(__sh#{id}_key_mark);" + else + "" + end + body_loop_mark = if ctx[:body_allocates_frame] + "const __sh#{id}_body_mark = __rt.saveLoopMark();\n defer __rt.restoreLoopMark(__sh#{id}_body_mark);" + else + "" + end + key_store_expr = string_key ? "try rt.heapAlloc().dupe(u8, #{key_var})" : key_var + key_free_work = if string_key + "for (__work.keys) |__k| __rt.heapAlloc().free(__k);\n __rt.heapAlloc().free(__work.keys);" + else + "__rt.heapAlloc().free(__work.keys);" + end + key_free_success = string_key ? "__rt.heapAlloc().free(#{key_var});" : "" + key_free_remaining = string_key ? "errdefer for (__work.keys[__sh#{id}_ki..]) |__k| __rt.heapAlloc().free(__k);" : "" + key_slice_cleanup = string_key ? "for (__sh#{id}_keys) |__k| rt.heapAlloc().free(__k);" : "" + pending_batch_cleanup = string_key ? "for (__sh#{id}_batches[__s].items) |__k| rt.heapAlloc().free(__k);" : "" + channel_buffer_cleanup = if string_key + <<~ZIG.chomp + fn cleanupBuffered(chan: *CheatLib.BoundedChannel(__ShWork#{id}), __rt: *Runtime) void { + const inner = chan.inner; + inner.mutex.lock(); + while (inner.tail != inner.head) { + const __work = inner.buf[inner.tail & inner.mask]; + inner.tail += 1; + for (__work.keys) |__k| __rt.heapAlloc().free(__k); + __rt.heapAlloc().free(__work.keys); + } + inner.mutex.unlock(); + } + ZIG + else + <<~ZIG.chomp + fn cleanupBuffered(chan: *CheatLib.BoundedChannel(__ShWork#{id}), __rt: *Runtime) void { + const inner = chan.inner; + inner.mutex.lock(); + while (inner.tail != inner.head) { + const __work = inner.buf[inner.tail & inner.mask]; + inner.tail += 1; + __rt.heapAlloc().free(__work.keys); + } + inner.mutex.unlock(); + } + ZIG + end op_str = range_node.inclusive ? 
"<=" : "<" - end_var = "__sh#{id}_end" - MIR::ScopeBlock.new([ - MIR::Let.new(map_ptr, MIR::UnaryOp.new("&", visit_mir(map_node)), false, nil, nil), - MIR::ExprStmt.new( - MIR::MethodCall.new(MIR::Ident.new(map_ptr), "ensureOwnership", [], false), false), - MIR::ScopeBlock.new([ - MIR::Let.new(idx_var, start_mir, true, "i64", nil), - MIR::Let.new(end_var, end_mir, false, "i64", nil), - MIR::WhileStmt.new( - MIR::BinOp.new(op_str, MIR::Ident.new(idx_var), MIR::Ident.new(end_var)), - inner, nil, - MIR::Set.new(MIR::Ident.new(idx_var), - MIR::BinOp.new("+", MIR::Ident.new(idx_var), MIR::Lit.new("1"))), - nil, nil) - ]) - ]) + + code = <<~ZIG.chomp + { + const #{map_ptr} = &#{@lowering.send(:emit_expr, visit_mir(map_node))}; + #{map_ptr}.ensureOwnership(); + const __sh#{id}_cap: usize = #{cap_zig}; + const __sh#{id}_batch: usize = @max(@as(usize, #{batch_zig}), 1); + const __ShWork#{id} = struct { + keys: []#{key_zig}, + }; + const __ShCleanup#{id} = struct { + #{channel_buffer_cleanup} + }; + var __sh#{id}_chans: [#{shard_count}]CheatLib.BoundedChannel(__ShWork#{id}) = undefined; + for (0..#{shard_count}) |__s| { + __sh#{id}_chans[__s] = try CheatLib.BoundedChannel(__ShWork#{id}).init(rt.heapAlloc(), __sh#{id}_cap); + } + defer for (0..#{shard_count}) |__s| __sh#{id}_chans[__s].deinit(); + + var __sh#{id}_wg = CheatHeader.WaitGroup.init(rt.getSched()); + var __sh#{id}_err = std.atomic.Value(bool).init(false); + const __ShWorker#{id} = struct { + wg: *CheatHeader.WaitGroup, + chans: *[#{shard_count}]CheatLib.BoundedChannel(__ShWork#{id}), + err: *std.atomic.Value(bool), + shard: usize, + #{capture_fields.empty? ? "" : capture_fields + "\n"} fn run(raw_rt: *anyopaque, raw_args: ?*anyopaque) anyerror!void { + const __rt = @as(*Runtime, @ptrCast(@alignCast(raw_rt))); + const ctx = @as(*@This(), @ptrCast(@alignCast(raw_args.?))); + defer ctx.wg.done(); + errdefer { + ctx.err.store(true, .release); + for (0..#{shard_count}) |__s| ctx.chans[__s].setError(error.CheatError); + } + while (ctx.chans[ctx.shard].pop() catch |__err| { + ctx.err.store(true, .release); + for (0..#{shard_count}) |__s| ctx.chans[__s].setError(__err); + return __err; + }) |__work| { + errdefer { + #{key_free_work} + } + var __sh#{id}_ki: usize = 0; + while (__sh#{id}_ki < __work.keys.len) : (__sh#{id}_ki += 1) { + #{key_free_remaining} + const #{key_var}: #{key_zig} = __work.keys[__sh#{id}_ki]; + #{body_loop_mark} + #{body_zig} + #{key_free_success} + } + __rt.heapAlloc().free(__work.keys); + __rt.checkYield(); + } + } + }; + var __sh#{id}_workers: [#{shard_count}]__ShWorker#{id} = undefined; + __sh#{id}_wg.add(#{shard_count}); + for (0..#{shard_count}) |__s| { + __sh#{id}_workers[__s] = .{ .wg = &__sh#{id}_wg, .chans = &__sh#{id}_chans, .err = &__sh#{id}_err, .shard = __s#{capture_inits_str} }; + try CheatHeader.spawnBest( + @intFromPtr(&Runtime.entryWrapper), + @as(CheatHeader.TaskFn, @ptrCast(&__ShWorker#{id}.run)), + &__sh#{id}_workers[__s], + #{task_cfg}, + ); + } + + var __sh#{id}_batches: [#{shard_count}]std.ArrayListUnmanaged(#{key_zig}) = [_]std.ArrayListUnmanaged(#{key_zig}){.empty} ** #{shard_count}; + defer for (0..#{shard_count}) |__s| { + #{pending_batch_cleanup} + __sh#{id}_batches[__s].deinit(rt.heapAlloc()); + }; + + var #{idx_var}: i64 = #{start_zig}; + const __sh#{id}_end: i64 = #{end_zig}; + while ((#{idx_var} #{op_str} __sh#{id}_end) and !__sh#{id}_err.load(.acquire)) : (#{idx_var} += 1) { + #{key_loop_mark} + const #{key_var}: #{key_zig} = #{key_zig_expr}; + const #{sh_var} = 
@TypeOf(#{map_ptr}.*).shardIndexWithHash(#{key_var}); + try __sh#{id}_batches[#{sh_var}.shard].append(rt.heapAlloc(), #{key_store_expr}); + if (__sh#{id}_batches[#{sh_var}.shard].items.len >= __sh#{id}_batch) { + const __sh#{id}_keys = try __sh#{id}_batches[#{sh_var}.shard].toOwnedSlice(rt.heapAlloc()); + const __sh#{id}_work = __ShWork#{id}{ .keys = __sh#{id}_keys }; + __sh#{id}_chans[#{sh_var}.shard].push(__sh#{id}_work) catch |__err| { + #{key_slice_cleanup} + rt.heapAlloc().free(__sh#{id}_keys); + __sh#{id}_err.store(true, .release); + for (0..#{shard_count}) |__s| __sh#{id}_chans[__s].setError(__err); + break; + }; + } + } + for (0..#{shard_count}) |__s| { + if (__sh#{id}_batches[__s].items.len > 0 and !__sh#{id}_err.load(.acquire)) { + const __sh#{id}_keys = try __sh#{id}_batches[__s].toOwnedSlice(rt.heapAlloc()); + const __sh#{id}_work = __ShWork#{id}{ .keys = __sh#{id}_keys }; + __sh#{id}_chans[__s].push(__sh#{id}_work) catch |__err| { + #{key_slice_cleanup} + rt.heapAlloc().free(__sh#{id}_keys); + __sh#{id}_err.store(true, .release); + for (0..#{shard_count}) |__ss| __sh#{id}_chans[__ss].setError(__err); + break; + }; + } + } + for (0..#{shard_count}) |__s| __sh#{id}_chans[__s].close(); + __sh#{id}_wg.wait(); + for (0..#{shard_count}) |__s| __ShCleanup#{id}.cleanupBuffered(&__sh#{id}_chans[__s], rt); + if (__sh#{id}_err.load(.acquire)) return error.CheatError; + } + ZIG + MIR::InlineZig.new(code, "shard_concurrent_each") end # CONCURRENT SELECT ... OR PRUNE: build a structural for-loop that runs @@ -3238,6 +3429,16 @@ def bounded_concurrent_parallel_mir(conc_op) end end + def bounded_concurrent_batch_mir(conc_op) + if (batch = conc_op.options["batch"]) + raw = visit_mir(batch) + raw_zig = @lowering.send(:emit_expr, raw) + MIR::InlineZig.new("@intCast(#{raw_zig})", "bounded_batch_usize") + else + MIR::Lit.new("1") + end + end + # Resolve the bare struct name (if any) for a capture symbol, used to # stamp BC pre-decoded slots with `:struct_`. Returns a String or # nil. 
SymbolEntry#type may be a Type object, a sigil Symbol like @@ -3377,6 +3578,7 @@ def lower_concurrent_bounded_select(lhs, conc_op, inner) MIR::Ident.new("rt"), items_ptr, bounded_concurrent_worker_count_for_call_mir(conc_op), + bounded_concurrent_batch_mir(conc_op), bounded_concurrent_parallel_mir(conc_op), bounded_concurrent_task_cfg_mir(conc_op), MIR::AddressOf.new(MIR::Ident.new(cb[:ctx_var])), @@ -3404,6 +3606,7 @@ def lower_concurrent_bounded_where(lhs, conc_op, _inner) MIR::Ident.new("rt"), items_ptr, bounded_concurrent_worker_count_for_call_mir(conc_op), + bounded_concurrent_batch_mir(conc_op), bounded_concurrent_parallel_mir(conc_op), bounded_concurrent_task_cfg_mir(conc_op), MIR::AddressOf.new(MIR::Ident.new(cb[:ctx_var])), @@ -3430,6 +3633,7 @@ def lower_concurrent_bounded_each(lhs, conc_op, _inner) MIR::Ident.new("rt"), items_ptr, bounded_concurrent_worker_count_for_call_mir(conc_op), + bounded_concurrent_batch_mir(conc_op), bounded_concurrent_parallel_mir(conc_op), bounded_concurrent_task_cfg_mir(conc_op), MIR::AddressOf.new(MIR::Ident.new(cb[:ctx_var])), @@ -3505,6 +3709,7 @@ def lower_concurrent_stream_select(lhs, conc_op, inner) src_ptr, bounded_concurrent_worker_count_for_call_mir(conc_op), cap_mir, + bounded_concurrent_batch_mir(conc_op), bounded_concurrent_parallel_mir(conc_op), bounded_concurrent_task_cfg_mir(conc_op), MIR::AddressOf.new(MIR::Ident.new(cb[:ctx_var])), @@ -3539,6 +3744,7 @@ def lower_concurrent_stream_where(lhs, conc_op, inner) src_ptr, bounded_concurrent_worker_count_for_call_mir(conc_op), cap_mir, + bounded_concurrent_batch_mir(conc_op), bounded_concurrent_parallel_mir(conc_op), bounded_concurrent_task_cfg_mir(conc_op), MIR::AddressOf.new(MIR::Ident.new(cb[:ctx_var])), @@ -3573,6 +3779,7 @@ def lower_concurrent_stream_each(lhs, conc_op, inner) src_ptr, bounded_concurrent_worker_count_for_call_mir(conc_op), cap_mir, + bounded_concurrent_batch_mir(conc_op), bounded_concurrent_parallel_mir(conc_op), bounded_concurrent_task_cfg_mir(conc_op), MIR::AddressOf.new(MIR::Ident.new(cb[:ctx_var])), @@ -3619,6 +3826,7 @@ def lower_concurrent_list_select(lhs, conc_op, inner) MIR::Ident.new("rt"), MIR::Ident.new("pipe_items"), bounded_concurrent_worker_count_for_call_mir(conc_op), + bounded_concurrent_batch_mir(conc_op), bounded_concurrent_parallel_mir(conc_op), bounded_concurrent_task_cfg_mir(conc_op), MIR::AddressOf.new(MIR::Ident.new(cb[:ctx_var])), @@ -3645,6 +3853,7 @@ def lower_concurrent_list_where(lhs, conc_op, inner) MIR::Ident.new("rt"), MIR::Ident.new("pipe_items"), bounded_concurrent_worker_count_for_call_mir(conc_op), + bounded_concurrent_batch_mir(conc_op), bounded_concurrent_parallel_mir(conc_op), bounded_concurrent_task_cfg_mir(conc_op), MIR::AddressOf.new(MIR::Ident.new(cb[:ctx_var])), @@ -3670,6 +3879,7 @@ def lower_concurrent_list_each(lhs, conc_op, inner) MIR::Ident.new("rt"), MIR::Ident.new("pipe_items"), bounded_concurrent_worker_count_for_call_mir(conc_op), + bounded_concurrent_batch_mir(conc_op), bounded_concurrent_parallel_mir(conc_op), bounded_concurrent_task_cfg_mir(conc_op), MIR::AddressOf.new(MIR::Ident.new(cb[:ctx_var])), @@ -3744,6 +3954,7 @@ def lower_concurrent_list_each_in_place(lhs, conc_op, inner) # const so the in-place helper can write through the slice. 
MIR::InlineZig.new("@constCast(pipe_items)", "list_each_inplace_mut_items"), bounded_concurrent_worker_count_for_call_mir(conc_op), + bounded_concurrent_batch_mir(conc_op), bounded_concurrent_parallel_mir(conc_op), bounded_concurrent_task_cfg_mir(conc_op), MIR::AddressOf.new(MIR::Ident.new(cb[:ctx_var])), diff --git a/src/backends/transpiler.rb b/src/backends/transpiler.rb index ccd2e070..3deb9a80 100644 --- a/src/backends/transpiler.rb +++ b/src/backends/transpiler.rb @@ -49,17 +49,19 @@ def collect_bg_blocks(node, result) # Single-file entry point (used by the CLI and simple callers). # pkg_paths: { "name" => "/abs/path/to/lib.cht" } for REQUIRE "pkg:name" resolution. - def transpile(cheat_code, source_dir: @source_dir, pkg_paths: {}, use_c_allocator: false, use_debug_allocator: false, test_mode: false, strict_test: false, exact_tiers: nil, main_tier: nil) + def transpile(cheat_code, source_dir: @source_dir, pkg_paths: {}, use_c_allocator: false, use_debug_allocator: false, test_mode: false, strict_test: false, exact_tiers: nil, main_tier: nil, default_stack: nil) transpile_mir(cheat_code, source_dir: source_dir, pkg_paths: pkg_paths, use_c_allocator: use_c_allocator, use_debug_allocator: use_debug_allocator, test_mode: test_mode, strict_test: strict_test, - exact_tiers: exact_tiers, main_tier: main_tier) + exact_tiers: exact_tiers, main_tier: main_tier, + default_stack: default_stack) end # MIR pipeline: front-end -> MIRLowering -> MIREmitter -> Zig output. - def transpile_mir(cheat_code, source_dir: @source_dir, pkg_paths: {}, use_c_allocator: false, use_debug_allocator: false, test_mode: false, strict_test: false, exact_tiers: nil, main_tier: nil) + def transpile_mir(cheat_code, source_dir: @source_dir, pkg_paths: {}, use_c_allocator: false, use_debug_allocator: false, test_mode: false, strict_test: false, exact_tiers: nil, main_tier: nil, default_stack: nil) @source_dir = File.expand_path(source_dir) @test_mode = test_mode + @default_stack_size = default_stack unless default_stack.nil? 
@importer ||= ModuleImporter.new(base_dir: @source_dir, pkg_paths: pkg_paths, use_mir: true) result = CompilerFrontend.compile(cheat_code, importer: @importer, source_dir: @source_dir, strict_test: strict_test) @@ -200,7 +202,6 @@ def transpile_as_module(cheat_code, source_dir: @source_dir, pkg_paths: {}) test "cheat main" { const fp = CheatHeader.scheduler; - const fm = CheatHeader.fiber_memory; var da = std.heap.DebugAllocator(.{}){}; defer _ = da.deinit(); const allocator = da.allocator(); @@ -209,9 +210,7 @@ def transpile_as_module(cheat_code, source_dir: @source_dir, pkg_paths: {}) var rt = try Runtime.init(allocator, 128 * 1024 * 1024, &global_ctx); defer rt.deinit(); rt.wireAllocator(); - var stack_pool = fm.StackPool.init(allocator); - defer stack_pool.deinit(); - var sched = try fp.Scheduler.init(allocator, &global_ctx, &stack_pool); + var sched = try fp.Scheduler.init(allocator, &global_ctx, null); defer { sched.deinit(); fp.global_registry.deinit(allocator); @@ -312,13 +311,15 @@ def transpile_as_module(cheat_code, source_dir: @source_dir, pkg_paths: {}) puts transpiler.transpile_as_module(code, source_dir: source_dir, pkg_paths: options[:pkg_paths]) when :test puts transpiler.transpile(code, source_dir: source_dir, pkg_paths: options[:pkg_paths], - test_mode: true, strict_test: !!options[:strict]) + test_mode: true, strict_test: !!options[:strict], + default_stack: options[:default_stack]) else puts transpiler.transpile(code, source_dir: source_dir, pkg_paths: options[:pkg_paths], use_c_allocator: !!options[:use_c_allocator], use_debug_allocator: !!options[:use_debug_allocator], exact_tiers: options[:exact_tiers], - main_tier: options[:main_tier]) + main_tier: options[:main_tier], + default_stack: options[:default_stack]) end else $stderr.puts "Usage: ruby transpiler.rb [--module] [--pkg name=/path/to/lib.cht] " diff --git a/src/mir/fsm_transform/emit.rb b/src/mir/fsm_transform/emit.rb index 21fbf768..29676de5 100644 --- a/src/mir/fsm_transform/emit.rb +++ b/src/mir/fsm_transform/emit.rb @@ -363,6 +363,7 @@ def build_recursive(ctx, segments, liveness, lowering) end segments.each do |seg| next unless seg.tail.respond_to?(:result_var) + next if seg.tail.is_a?(Segments::IoSuspend) rv = seg.tail.result_var capture_map[rv] ||= "__ctx_#{id}.#{rv}" if rv && rv != "_" end @@ -879,13 +880,16 @@ def compute_sp_indices(segments) # body shapes. def build_spawn_setup(ctx, lowering) is_local_pin = (ctx[:pin_mode] == true || ctx[:pin_mode] == :local) + is_default_local = (ctx[:pin_mode].nil? || ctx[:pin_mode] == false) && !ctx[:parallel] + is_local_dispatch = is_local_pin || is_default_local + dispatch = is_local_dispatch ? :local : :parallel spawn_call_zig = - if is_local_pin + if is_local_dispatch "try #{ctx[:rt_name]}.getSched().submitFsmSpawn(#{ctx[:ctx_var]}.task);" else "try CheatHeader.spawnFsmBest(#{ctx[:ctx_var]}.task);" end - alloc_expr_zig = is_local_pin ? + alloc_expr_zig = is_local_dispatch ? 
"#{ctx[:rt_name]}.getSched().allocator" : "#{ctx[:rt_name]}.heapAlloc()" # `.task = undefined` and `.rt = undefined` here are rebound by @@ -909,7 +913,23 @@ def build_spawn_setup(ctx, lowering) ctx_init_zig, spawn_call_zig, ctx[:rt_name], + ctx[:profile_site_id], + profile_dispatch_id(dispatch), + bg_profile_site_comment(ctx, dispatch, :fsm), ) end + + def profile_dispatch_id(dispatch) + case dispatch + when :local, true then 1 + when :parallel then 2 + when :shared then 3 + else 1 + end + end + + def bg_profile_site_comment(ctx, dispatch, form) + "// CLEAR_PROFILE_TASK_SITE id=#{ctx[:profile_site_id]} kind=BG line=#{ctx[:profile_line]} column=#{ctx[:profile_column]} dispatch=#{dispatch} form=#{form}" + end end end diff --git a/src/mir/fsm_transform/liveness.rb b/src/mir/fsm_transform/liveness.rb index 96448cb9..e2435ef3 100644 --- a/src/mir/fsm_transform/liveness.rb +++ b/src/mir/fsm_transform/liveness.rb @@ -52,7 +52,8 @@ def analyze(segments, ctx) # NextSuspend with a binding); that var is "defined" at # the end of this segment AND consumed by the next, so # it counts as cross-segment by construction. - if seg.tail.respond_to?(:result_var) && seg.tail.result_var + if seg.tail.respond_to?(:result_var) && seg.tail.result_var && + !seg.tail.is_a?(Segments::IoSuspend) # Type info comes from the call's full_type; the # emitter resolves it via the AST node when emitting # the state field decl. @@ -168,7 +169,6 @@ def collect_tail_uses(seg, uses_by_seg) case tail when Segments::IoSuspend next_idx = seg.index + 1 - return unless uses_by_seg.key?(next_idx) || true bucket = (uses_by_seg[next_idx] ||= Set.new) if tail.call_node.respond_to?(:receiver) && tail.call_node.receiver walk_idents(tail.call_node.receiver) { |name| bucket << name } diff --git a/src/mir/fsm_wrapper_emitter.rb b/src/mir/fsm_wrapper_emitter.rb index 98a11729..5e0a7679 100644 --- a/src/mir/fsm_wrapper_emitter.rb +++ b/src/mir/fsm_wrapper_emitter.rb @@ -55,6 +55,7 @@ def render_io_body(body) parts = [] parts << "#{body.blk_label}: {" parts << render_ctx_struct(body.ctx_struct, mir_emitter) + parts << render_ctx_size_gate(body.ctx_struct.type_name) parts << render_spawn_setup(body.spawn_setup, body.blk_label) parts << "}" parts.join("\n") @@ -69,6 +70,7 @@ def render_b1_body(body) parts = [] parts << "#{body.blk_label}: {" parts << render_b1_ctx_struct(body.ctx_struct, mir_emitter) + parts << render_ctx_size_gate(body.ctx_struct.type_name) parts << render_spawn_setup(body.spawn_setup, body.blk_label) parts << "}" parts.join("\n") @@ -128,7 +130,7 @@ def render_b1_resume_fn(ctx_id) # embedded in it) lives long enough for the status write before # being freed here. # - # extra_zig is optional cleanup that runs BEFORE alloc.destroy + # extra_zig is optional cleanup that runs BEFORE freeFsmCtx # (e.g. WITH+suspend-in-CS releases any locks still held on the # err path). 
def render_destroy_task(ctx_id, extra_zig = nil) @@ -141,7 +143,17 @@ def render_destroy_task(ctx_id, extra_zig = nil) <<~ZIG.chomp.lines.map { |l| " #{l}" }.join.chomp fn destroyTask(__fsm_task: *CheatHeader.FsmTask) void { const __ctx_#{ctx_id}: *@This() = @ptrCast(@alignCast(__fsm_task.ctx.?)); - #{extra} __ctx_#{ctx_id}.alloc.destroy(__ctx_#{ctx_id}); + #{extra} CheatHeader.freeFsmCtx(@This(), __fsm_task, __ctx_#{ctx_id}); + } + ZIG + end + + def render_ctx_size_gate(type_name) + <<~ZIG.chomp.lines.map { |l| " #{l}" }.join.chomp + comptime { + if (@sizeOf(#{type_name}) > 256) { + @compileError("FSM context is larger than 256 bytes; use @stack on this BG block to force a compiler-sized stackful fiber."); + } } ZIG end @@ -222,6 +234,7 @@ def render_generic_body(body) parts = [] parts << "#{body.blk_label}: {" parts << render_generic_ctx_struct(body.ctx_struct, mir_emitter) + parts << render_ctx_size_gate(body.ctx_struct.type_name) parts << render_spawn_setup(body.spawn_setup, body.blk_label) parts << "}" parts.join("\n") @@ -478,17 +491,18 @@ def render_member_fn(fn, mir_emitter) def render_spawn_setup(s, blk_label) parts = [] + parts << " #{s.profile_site_comment}" if s.respond_to?(:profile_site_comment) && !empty?(s.profile_site_comment) parts << " const #{s.alloc_var} = #{s.alloc_expr_zig};" parts << " const #{s.promise_var} = try #{s.promise_zig}.spawn(#{s.alloc_var}, #{s.rt_name}.getSched());" parts << indent_block(s.promoted_decls_zig, 4) unless empty?(s.promoted_decls_zig) - parts << " const #{s.ctx_var} = try #{s.alloc_var}.create(#{s.ctx_type});" - parts << " errdefer #{s.alloc_var}.destroy(#{s.ctx_var});" # Allocate the FsmTask from the scheduler's fsm_task_slab so # detectCycleFsm can pin it during chain walks (mirrors stackful # Task slab + Option-(C) protocol). The task's `ctx` field is the # forward pointer used by resumeFn / destroyTask to recover *Ctx. parts << " const #{s.ctx_var}_task = try CheatHeader.allocFsmTask(#{s.rt_name}, &#{s.ctx_type}.resumeFn);" parts << " errdefer #{s.rt_name}.getSched().fsm_task_slab.destroy(#{s.ctx_var}_task);" + parts << " const #{s.ctx_var} = try CheatHeader.allocFsmCtx(#{s.ctx_type}, #{s.rt_name}, #{s.ctx_var}_task);" + parts << " errdefer CheatHeader.freeFsmCtx(#{s.ctx_type}, #{s.ctx_var}_task, #{s.ctx_var});" parts << " #{s.ctx_var}_task.ctx = #{s.ctx_var};" # Wire the destroy callback so the scheduler frees ctx after # dispatchOnce finishes writing task.status (the resume fn no @@ -496,20 +510,18 @@ def render_spawn_setup(s, blk_label) # scheduler returns the FsmTask slot to fsm_task_slab AFTER # destroy_fn runs. parts << " #{s.ctx_var}_task.destroy_fn = &#{s.ctx_type}.destroyTask;" + if s.respond_to?(:profile_site_id) && s.profile_site_id + parts << " #{s.ctx_var}_task.profile_site_id = #{s.profile_site_id};" + parts << " #{s.ctx_var}_task.profile_dispatch = #{s.profile_dispatch_id};" + end parts << " #{s.ctx_var}.* = .{" parts << indent_block(s.ctx_init_zig, 8) parts << " };" parts << " #{s.ctx_var}.task = #{s.ctx_var}_task;" - # Allocate a per-task Runtime backed by a per-task ThreadLocalEbr. - # The scheduler frees both on .Done (Scheduler.releaseFsmTaskEbr). 
- # Without this, FSM ctxs would share the spawning fiber's rt -- and - # when distributed across worker schedulers via spawnFsmBest, calls - # to rt.ebr.enter()/.exit()/.retire() would touch a non-thread-safe - # ThreadLocalEbr from a foreign OS thread, corrupting the limbo - # list and aborting in glibc's malloc with `realloc(): invalid old - # size` on the next allocation. See zig/runtime/versioned-fiber- - # stress-test.zig::"FSM Versioned: 4 BG-FSM writers via - # spawnFsmBest with per-task rt -- bench-17 fix verifier". + # Allocate a per-task Runtime shell. EBR is resolved at dispatch time + # through Runtime.currentEbr(), so FSMs running on worker schedulers use + # the active scheduler thread's registered EBR slot instead of the + # spawning runtime's fallback slot. parts << " #{s.ctx_var}.rt = try CheatHeader.allocFsmTaskRuntime(#{s.ctx_var}_task, #{s.rt_name});" parts << " #{s.spawn_call_zig}" parts << " break :#{blk_label} #{s.promise_var};" diff --git a/src/mir/mir.rb b/src/mir/mir.rb index ce21354c..e962862f 100644 --- a/src/mir/mir.rb +++ b/src/mir/mir.rb @@ -838,6 +838,9 @@ def kind; :cond_jump; end # capture inits the lowering decided) :spawn_call_zig, # "try ...spawn(&ctx.task);" :rt_name, # "rt" (the surrounding fn's runtime) + :profile_site_id, # integer id used by runtime fiber profile + :profile_dispatch_id, # fiber-profile.DispatchKind enum value + :profile_site_comment, # CLEAR_PROFILE_TASK_SITE metadata comment ) # Catch wrapper. Wraps raw Zig code for try/catch but exposes @@ -982,10 +985,11 @@ def kind; :cond_jump; end # `name` may be a dotted path (e.g. "__ret.field") for per-field # promotion in a return-with-promotion pattern; the emitter takes &name # verbatim. - EscapePromote = Struct.new(:name, :zig_type, :strategy, :data, :rt_expr) do + EscapePromote = Struct.new(:name, :zig_type, :strategy, :data, :rt_expr, :elem_type) do include Stmt # data: strategy-specific payload (field set, alloc symbol, etc.) # rt_expr: Zig expression for runtime (e.g. "rt", "do_rt") + # elem_type: Zig element type for :list promotion. 
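+        # e.g. elem_type "i64" when zig_type is "ArrayListUnmanaged(i64)".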
end # --- Deep Copy --- diff --git a/src/mir/mir_emitter.rb b/src/mir/mir_emitter.rb index a728a0a2..94d30831 100644 --- a/src/mir/mir_emitter.rb +++ b/src/mir/mir_emitter.rb @@ -843,7 +843,8 @@ def emit_escape_promote(node) rt = node.rt_expr || "rt" case node.strategy when :list - elem = node.zig_type[/ArrayListUnmanaged\((.+)\)/, 1] + elem = node.elem_type or + raise "MIREmitter#emit_escape_promote: :list promotion for '#{node.name}' missing elem_type" "try CheatLib.promoteList(#{elem}, #{rt}, &#{node.name});" when :string_map "#{node.name}.alloc = #{rt}.heapAlloc();" diff --git a/src/mir/mir_lowering.rb b/src/mir/mir_lowering.rb index 8c292772..ee937bf3 100644 --- a/src/mir/mir_lowering.rb +++ b/src/mir/mir_lowering.rb @@ -728,7 +728,8 @@ def lower_promote(node) node.zig_type, node.strategy, node.fields, - @rt_name + @rt_name, + node.elem_type, ) end @@ -2624,9 +2625,13 @@ def with_match_unwrap_value(zig_var) is_ptr = "@typeInfo(@TypeOf(#{zig_var})) == .pointer" inner_t = "@typeInfo(@TypeOf(#{zig_var})).pointer.child" "(if (comptime #{is_ptr}) " \ - "(if (comptime @hasField(#{inner_t}, \"ctrl\")) #{zig_var}.ctrl.data else #{zig_var}) " \ + "(if (comptime @typeInfo(#{inner_t}) == .@\"struct\") " \ + "(if (comptime @hasField(#{inner_t}, \"ctrl\")) #{zig_var}.ctrl.data else #{zig_var}) " \ + "else #{zig_var}) " \ "else " \ - "(if (comptime @hasField(@TypeOf(#{zig_var}), \"ctrl\")) #{zig_var}.ctrl.data else &#{zig_var}))" + "(if (comptime @typeInfo(@TypeOf(#{zig_var})) == .@\"struct\") " \ + "(if (comptime @hasField(@TypeOf(#{zig_var}), \"ctrl\")) #{zig_var}.ctrl.data else &#{zig_var}) " \ + "else &#{zig_var}))" end # Per-arm prelude that binds the user's alias (`va` in @@ -3329,6 +3334,9 @@ def lower_bg_block(node) task_cfg = task_config_zig(node.stack_size, node.computed_stack_tier) pin_mode = node.respond_to?(:pinned) ? node.pinned : nil + bg_site_id = id + 1 + bg_site_line = node.token&.line || 0 + bg_site_col = node.token&.column || 0 # Universal transform path (CLAUDE.md Invariant 13). The # transform inspects the AST body via Segments.split, produces @@ -3363,6 +3371,9 @@ def lower_bg_block(node) is_void: is_void, alloc_var: alloc_var, promise_var: promise_var, ctx_var: ctx_var, promoted_decls: promoted_decls, capture_inits: capture_inits, rt_name: rt_name, pin_mode: pin_mode, + parallel: !!node.parallel, + profile_site_id: bg_site_id, profile_line: bg_site_line, + profile_column: bg_site_col, inner_zig: inner_zig, arena_init_flag: !!node.arena_mode, } transform_result = FsmTransform.transform(node, transform_ctx, self) @@ -3389,10 +3400,14 @@ def lower_bg_block(node) # delegation works; Stage 4b inlines them into Emit.build_* # and deletes them. - spawn_call = fiber_spawn_call_zig(rt_name, ctx_type, ctx_var, task_cfg, pin_mode) + bg_dispatch = node.parallel ? :parallel : ((pin_mode == false || pin_mode.nil?) ? :local : pin_mode) + profiled_task_cfg = task_config_with_profile(task_cfg, bg_site_id, bg_dispatch) + spawn_call = fiber_spawn_call_zig(rt_name, ctx_type, ctx_var, profiled_task_cfg, bg_dispatch) + profile_comment = bg_profile_site_comment(bg_site_id, bg_site_line, bg_site_col, bg_dispatch, :stack) bg_code = <<~ZIG.chomp #{blk_label}: { + #{profile_comment} const #{ctx_type} = struct { inner: *#{promise_zig}.Inner, alloc: std.mem.Allocator, @@ -4124,7 +4139,11 @@ def filter_zig_blocks(source, names) def task_config_zig(stack_size, computed_tier) default = @debug_mode ? 
"Large" : "Standard" variant = if stack_size - STACK_SIZE_ZIG_VARIANT.fetch(stack_size, default) + if stack_size == :stack || stack_size == "stack" + STACK_SIZE_ZIG_VARIANT.fetch(computed_tier || :standard, default) + else + STACK_SIZE_ZIG_VARIANT.fetch(stack_size, default) + end elsif computed_tier computed = STACK_SIZE_ZIG_VARIANT.fetch(computed_tier, default) TIER_RANK.fetch(computed, 0) >= TIER_RANK.fetch(default, 0) ? computed : default @@ -4144,11 +4163,33 @@ def fiber_spawn_call_zig(rt_name, ctx_type, ctx_var, task_cfg, pin_mode) "try #{rt_name}.getSched().submitSpawn(\n #{spawn_args}\n);" when :shared "try CheatHeader.spawnPinned(\n #{spawn_args}\n);" + when :parallel + "try CheatHeader.spawnBest(\n #{spawn_args}\n);" else "try CheatHeader.spawnBest(\n #{spawn_args}\n);" end end + def task_config_with_profile(task_cfg, site_id, dispatch) + fields = ".profile_site_id = #{site_id}, .profile_dispatch = #{profile_dispatch_id(dispatch)}" + stripped = task_cfg.strip + return ".{ #{fields} }" if stripped == ".{}" + stripped.sub(/\}\s*\z/, ", #{fields} }") + end + + def profile_dispatch_id(dispatch) + case dispatch + when :local, true then 1 + when :parallel then 2 + when :shared then 3 + else 1 + end + end + + def bg_profile_site_comment(site_id, line, col, dispatch, form) + "// CLEAR_PROFILE_TASK_SITE id=#{site_id} kind=BG line=#{line} column=#{col} dispatch=#{dispatch} form=#{form}" + end + def fiber_string_promotes(node, id, prefix) promotes = node.capture_string_dupes || Set.new names = {} diff --git a/src/mir/mir_pass.rb b/src/mir/mir_pass.rb index 29c93fa7..2b7f2a4f 100644 --- a/src/mir/mir_pass.rb +++ b/src/mir/mir_pass.rb @@ -539,7 +539,7 @@ def insert_bg_escape_promote!(result, stmt) next if t.needs_pointer_passing? next if @bg_heap_upgraded&.include?(name) # Already heap from Phase 1.5b if t.list_collection? - result << MIR::Promote.new(bg.token, name, t.zig_type, :list, nil) + result << MIR::Promote.new(bg.token, name, t.zig_type, :list, nil, list_elem_zig_type(t)) else # :bg_string: annotate directly on BgBlock (no MIR::Promote needed) bg.capture_string_dupes ||= Set.new @@ -864,7 +864,7 @@ def insert_promotion!(result, ret_node, promo) # Per-variable promotions. (filtered[:var_promotes] || []).each do |vp| strategy = classify_promote_strategy(vp[:zig_type]) - result << MIR::Promote.new(ret_node.token, vp[:var], vp[:zig_type], strategy, nil) + result << MIR::Promote.new(ret_node.token, vp[:var], vp[:zig_type], strategy, nil, vp[:elem_zig_type]) end # Struct-level field promotion: annotate the ReturnNode directly so @@ -931,4 +931,10 @@ def classify_promote_strategy(zig_type) end end + def list_elem_zig_type(type_obj) + elem = type_obj&.element_type + return nil unless elem + Type.new(elem).zig_type + end + end diff --git a/src/mir/promotion_plan.rb b/src/mir/promotion_plan.rb index 8659cbd4..4ac0fe43 100644 --- a/src/mir/promotion_plan.rb +++ b/src/mir/promotion_plan.rb @@ -70,7 +70,11 @@ def self.classify(fn_node, schema_lookup:) next unless fval.is_a?(AST::Identifier) next unless fti&.needs_escape_promotion? && !fti&.string? && !fti&.heap_provenance? - var_promotes << { var: fval.name, zig_type: fti.zig_type } + var_promotes << { + var: fval.name, + zig_type: fti.zig_type, + elem_zig_type: elem_zig_type_for(fti), + } handled_fields << fname.to_s end @@ -100,7 +104,11 @@ def self.classify(fn_node, schema_lookup:) !ti&.string? && !ti&.heap_provenance? if needs_escape if ti.list_collection? || ti.map? 
- var_promotes << { var: val.name, zig_type: ti.zig_type } + var_promotes << { + var: val.name, + zig_type: ti.zig_type, + elem_zig_type: elem_zig_type_for(ti), + } else struct_promote ||= zig_type_for(ret_type) end @@ -213,6 +221,14 @@ def self.needs_promote?(plan, ret_node) Type.new(name).zig_type end + private_class_method def self.elem_zig_type_for(type) + elem = type&.element_type + return nil unless elem + Type.new(elem).zig_type + rescue + elem.to_s + end + private_class_method def self.referenced_vars(node) vars = Set.new return vars unless node diff --git a/src/tools/doctor.rb b/src/tools/doctor.rb index 3d0b6dd5..56b03438 100644 --- a/src/tools/doctor.rb +++ b/src/tools/doctor.rb @@ -272,18 +272,42 @@ def section_fibers(profile_dir) totals = {} sched_rows = [] - in_sched = false + site_rows = [] + mode = nil File.readlines(fiber_file).each do |l| s = l.strip - next if s.empty? || s.start_with?('#') && !s.include?('per-scheduler') + next if s.empty? if s.include?('per-scheduler') - in_sched = true + mode = :sched + next + elsif s.include?('per-site fibers') + mode = :site next end - if in_sched + next if s.start_with?('#') + if mode == :sched f = s.split next if f.size < 2 sched_rows << { idx: f[0].to_i, runs: f[1].to_i } if f[0] =~ /\A\d+\z/ + elsif mode == :site + f = s.split(/\t/) + next if f.size < 9 || f[0] !~ /\A\d+\z/ + scheds = {} + f[8].to_s.split(',').each do |pair| + sid, runs = pair.split(':', 2) + scheds[sid.to_i] = runs.to_i if sid && runs + end + site_rows << { + id: f[0].to_i, + spawns: f[1].to_i, + runs: f[2].to_i, + exits: f[3].to_i, + total_lifetime_ns: f[4].to_i, + max_lifetime_ns: f[5].to_i, + dispatch: f[6], + form: f[7], + scheds: scheds, + } else if s =~ /\A(\w+):\s*(\d+)/ totals[$1] = $2.to_i @@ -333,11 +357,134 @@ def section_fibers(profile_dir) puts " *** Scheduler imbalance: one scheduler ran #{imbalance_pct}%" puts " of all fibers. Other schedulers were idle. BG spawn" puts " policy may be routing too aggressively to a single target." + emit_parallel_bg_hint!(profile_dir, site_rows) end puts "" end end + def emit_parallel_bg_hint!(profile_dir, site_rows = []) + metadata = task_site_metadata(profile_dir) + imbalanced_sites = site_rows.select do |site| + next false unless site[:runs] && site[:runs] > 0 + site_scheduler_skew(site) >= 0.80 + end + local_sites = imbalanced_sites.select { |site| site[:dispatch] == 'local' } + + if local_sites.any? + emit_exact_local_bg_sites!(profile_dir, local_sites, metadata) + return + end + + dispatch_counts = task_dispatch_counts(profile_dir) + return unless local_dispatch_warning?(dispatch_counts) + + emit_generic_local_bg_hint!(local_bg_source_lines(File.join(profile_dir, 'source.cht'))) + end + + def emit_exact_local_bg_sites!(profile_dir, local_sites, metadata) + puts "" + puts " Exact imbalanced local BG task sites:" + local_sites.sort_by { |site| -site[:runs] }.first(8).each do |site| + site_id = site[:id] + runs = site[:runs] + exits = site[:exits] + line = (metadata[site_id] || {})[:line] || '?' + snippet = source_line(profile_dir, line) + max_sched, max_runs = site[:scheds].max_by { |_, runs| runs } || [nil, 0] + pct = runs > 0 ? (max_runs.to_f / runs * 100).round : 0 + avg_us = exits > 0 ? (site[:total_lifetime_ns] / exits / 1000.0).round(1) : 0 + puts " line #{line}: #{snippet}" + puts " site=#{site_id} form=#{site[:form]} runs=#{runs} sched=#{max_sched} #{pct}% avg=#{avg_us}us" + end + emit_parallel_bg_advice! 
+ end + + def emit_generic_local_bg_hint!(local_bg_lines) + puts "" + puts " Profile contains local BG dispatches (`BG {}` defaults to the" + puts " current scheduler)." + if local_bg_lines.any? + puts " Candidate BG sites:" + local_bg_lines.first(6).each do |site| + puts " line #{site[:line]}: #{site[:text]}" + end + end + emit_parallel_bg_advice! + end + + def emit_parallel_bg_advice! + puts " Use `BG { @parallel -> ... }` for CPU-parallel worker fanout." + puts " Keep plain `BG {}` for scheduler-affine, IO-affine, or" + puts " locality-sensitive work." + end + + def site_scheduler_skew(site) + max_runs = site[:scheds].values.max || 0 + max_runs.to_f / site[:runs] + end + + def task_dispatch_counts(profile_dir) + zig_source = File.join(profile_dir, 'transpiled.zig') + return { local: 0, parallel: 0 } unless File.exist?(zig_source) + + zig = File.read(zig_source) + { + local: zig.scan(/\bsubmitFsmSpawn\s*\(/).size + zig.scan(/\bsubmitSpawn\s*\(/).size, + parallel: zig.scan(/\bspawnFsmBest\s*\(/).size + zig.scan(/\bspawnBest\s*\(/).size, + } + end + + def local_dispatch_warning?(counts) + local = counts[:local] + parallel = counts[:parallel] + local > 0 && (parallel == 0 || local > parallel) + end + + def task_site_metadata(profile_dir) + zig_source = File.join(profile_dir, 'transpiled.zig') + return {} unless File.exist?(zig_source) + + sites = {} + File.foreach(zig_source) do |line| + next unless line.include?('CLEAR_PROFILE_TASK_SITE') + attrs = {} + line.scan(/(\w+)=([^\s]+)/) { |k, v| attrs[k.to_sym] = v } + id = attrs[:id].to_i + next if id <= 0 + sites[id] = { + kind: attrs[:kind], + line: attrs[:line]&.to_i, + column: attrs[:column]&.to_i, + dispatch: attrs[:dispatch], + form: attrs[:form], + } + end + sites + end + + def source_line(profile_dir, line) + return '' unless line && line != '?' + clear_source = File.join(profile_dir, 'source.cht') + return '' unless File.exist?(clear_source) + File.readlines(clear_source)[line.to_i - 1]&.strip.to_s[0, 90] + end + + def local_bg_source_lines(clear_source) + return [] unless File.exist?(clear_source) + + lines = File.readlines(clear_source) + sites = [] + lines.each_with_index do |line, idx| + next unless line.include?('BG') && line =~ /\bBG\s*\{/ + + next if line.include?('@parallel') + + sites << { line: idx + 1, text: line.strip[0, 80] } + end + sites + end + # ── Lock Hold & Contention ── # Per-mutex stats from ParkingMutex + ParkingRwLock (write path). # Each row = one unique lock instance. Diagnoses: diff --git a/testdata/write_test_output.txt b/testdata/write_test_output.txt index 78a49c79..fdb8a13e 100644 --- a/testdata/write_test_output.txt +++ b/testdata/write_test_output.txt @@ -1 +1 @@ -1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35 \ No newline at end of file +resource test \ No newline at end of file diff --git a/transpile-tests/351_shard_concurrent_frame_key.cht b/transpile-tests/351_shard_concurrent_frame_key.cht new file mode 100644 index 00000000..58aab094 --- /dev/null +++ b/transpile-tests/351_shard_concurrent_frame_key.cht @@ -0,0 +1,16 @@ +-- SHARD + CONCURRENT EACH with a frame-allocating string key expression. +-- The producer must enqueue an owned key so workers do not observe rewound +-- frame memory. + +FN main() RETURNS Void -> + n = 128_i64; + MUTABLE map: HashMap@sharded(4) = {}; + + (0.. 
SHARD("k:${toString(_)}", map) s> CONCURRENT EACH { + map[_] = 1_i64; + }; + + ASSERT map.count() == n, "all frame-built keys inserted"; + ASSERT (map["k:0"] OR 0_i64) == 1_i64, "first key present"; + ASSERT (map["k:127"] OR 0_i64) == 1_i64, "last key present"; +END diff --git a/transpile-tests/352_shard_concurrent_error.cht b/transpile-tests/352_shard_concurrent_error.cht new file mode 100644 index 00000000..26fe6087 --- /dev/null +++ b/transpile-tests/352_shard_concurrent_error.cht @@ -0,0 +1,22 @@ +-- SHARD worker errors must close all shard channels and propagate without +-- hanging the producer. + +FN failsInShard() RETURNS !Bool -> + MUTABLE counts: HashMap@sharded(4) = {}; + + (0..<1000_i64) s> SHARD(_ MOD 16_i64, counts) s> CONCURRENT(capacity: 4) EACH { + IF _ == 7_i64 THEN + RAISE Input, ShardBoom, "boom"; + END + counts[_] = (counts[_] OR 0_i64) + 1_i64; + }; + + RETURN FALSE; +CATCH Input WITH(ShardBoom) + RETURN TRUE; +END + +FN main() RETURNS !Void -> + caught = failsInShard() OR RAISE; + ASSERT caught, "worker error propagated"; +END diff --git a/transpile-tests/353_shard_concurrent_capacity.cht b/transpile-tests/353_shard_concurrent_capacity.cht new file mode 100644 index 00000000..247f817e --- /dev/null +++ b/transpile-tests/353_shard_concurrent_capacity.cht @@ -0,0 +1,16 @@ +-- Small per-shard capacity exercises BoundedChannel back pressure. + +FN main() RETURNS Void -> + n = 256_i64; + buckets = 8_i64; + MUTABLE counts: HashMap@sharded(4) = {}; + + (0.. SHARD(_ MOD buckets, counts) s> CONCURRENT(capacity: 2, batch: 3) EACH { + counts[_] = (counts[_] OR 0_i64) + 1_i64; + }; + + ASSERT counts.count() == buckets, "expected bucket count"; + vals = counts.values(); + total = vals s> CONCURRENT SUM toFloat(_); + ASSERT total == toFloat(n), "total mismatch with small shard channel capacity and batch"; +END diff --git a/transpile-tests/354_concurrent_batch.cht b/transpile-tests/354_concurrent_batch.cht new file mode 100644 index 00000000..427d83fb --- /dev/null +++ b/transpile-tests/354_concurrent_batch.cht @@ -0,0 +1,17 @@ +-- Test: CONCURRENT(batch: N) preserves item-wise semantics while chunking work claims. 
+ +FN main() RETURNS Void -> + nums = [1, 2, 3, 4, 5, 6, 7]; + + doubled = nums s> CONCURRENT(workers: 3, batch: 3) SELECT _ * 2; + ASSERT doubled.length() == 7, "batch SELECT length"; + ASSERT doubled[0] == 2, "batch SELECT order 0"; + ASSERT doubled[6] == 14, "batch SELECT partial final item"; + + evens = nums s> CONCURRENT(workers: 2, batch: 4) WHERE _ MOD 2 == 0; + ASSERT evens.length() == 3, "batch WHERE length"; + ASSERT evens[0] == 2, "batch WHERE order 0"; + ASSERT evens[2] == 6, "batch WHERE order 2"; + + RETURN; +END diff --git a/transpile-tests/gen.rb b/transpile-tests/gen.rb index c47e241a..2db500d8 100755 --- a/transpile-tests/gen.rb +++ b/transpile-tests/gen.rb @@ -67,11 +67,8 @@ def generate_test_block(filename, cheat_code, source_dir: Dir.pwd) execution_block = if needs_scheduler <<~ZIG - const fm = @import("runtime/fiber-memory.zig"); const fp = @import("runtime/scheduler.zig"); - var stack_pool = fm.StackPool.init(t_alloc); - defer stack_pool.deinit(); - var sched = try fp.Scheduler.init(t_alloc, &global_ctx, &stack_pool); + var sched = try fp.Scheduler.init(t_alloc, &global_ctx, null); defer { fp.scheduler_running = false; sched.deinit(); diff --git a/zig/build.zig b/zig/build.zig index c23ace2a..afe7d5f2 100644 --- a/zig/build.zig +++ b/zig/build.zig @@ -170,6 +170,7 @@ pub fn build(b: *std.Build) void { .{ .path = "epoll-steal-test.zig", .tsan = true }, .{ .path = "ffi-concurrency-test.zig", .tsan = true }, .{ .path = "fiber-test.zig", .tsan = true }, + .{ .path = "fiber-profile-test.zig", .tsan = true }, // FSM (stackless task) tests .{ .path = "fsm-benchmark-test.zig", .tsan = true }, .{ .path = "fsm-concurrent-test.zig", .tsan = true }, @@ -731,6 +732,25 @@ pub fn build(b: *std.Build) void { } loom_step.dependOn(&run_pl_loom.step); + const versioned_loom_exe = b.addExecutable(.{ + .name = "versioned-loom-test", + .root_module = b.createModule(.{ + .root_source_file = b.path("versioned-loom-test.zig"), + .target = target, + .optimize = optimize, + }), + }); + versioned_loom_exe.root_module.addAssemblyFile(switch_s); + versioned_loom_exe.root_module.addAssemblyFile(onroot_s); + versioned_loom_exe.root_module.link_libc = true; + const run_versioned_loom = b.addRunArtifact(versioned_loom_exe); + run_versioned_loom.has_side_effects = true; + run_versioned_loom.stdio = .inherit; + if (shard_index == 0) { + test_loom_vopr_step.dependOn(&run_versioned_loom.step); + } + loom_step.dependOn(&run_versioned_loom.step); + // ------------------------------------------------------------------------- // VERSIONED-EXHAUST -- Deterministic MVCC retry-exhaustion check // ------------------------------------------------------------------------- diff --git a/zig/fiber-profile-test.zig b/zig/fiber-profile-test.zig new file mode 100644 index 00000000..98a9a8eb --- /dev/null +++ b/zig/fiber-profile-test.zig @@ -0,0 +1,5 @@ +const fp = @import("runtime/fiber-profile.zig"); + +test { + _ = fp; +} diff --git a/zig/lib/atomic_ptr.zig b/zig/lib/atomic_ptr.zig index 4b4c9839..7f752f3f 100644 --- a/zig/lib/atomic_ptr.zig +++ b/zig/lib/atomic_ptr.zig @@ -80,7 +80,9 @@ else // Runtime, so we duck-type with @hasField. 
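// [Editor's sketch, not part of the patch] Minimal stand-alone illustration of
// the comptime duck-typing used by extractEbr below: prefer a currentEbr()
// decl, fall back to an `ebr` field, otherwise treat the argument itself as
// the handle. The toy types and u32 "handles" are assumptions for
// illustration only.
const std = @import("std");

fn handleOf(arg: anytype) u32 {
    const Child = @typeInfo(@TypeOf(arg)).pointer.child;
    return if (comptime @hasDecl(Child, "currentEbr"))
        arg.currentEbr()
    else if (comptime @hasField(Child, "ebr"))
        arg.ebr
    else
        arg.id;
}

test "comptime duck-typing picks the most specific accessor" {
    const WithDecl = struct {
        pub fn currentEbr(_: *const @This()) u32 {
            return 1;
        }
    };
    const WithField = struct { ebr: u32 = 2 };
    const Bare = struct { id: u32 = 3 };
    const a = WithDecl{};
    const b = WithField{};
    const c = Bare{};
    try std.testing.expectEqual(@as(u32, 1), handleOf(&a));
    try std.testing.expectEqual(@as(u32, 2), handleOf(&b));
    try std.testing.expectEqual(@as(u32, 3), handleOf(&c));
}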
inline fn extractEbr(arg: anytype) *ThreadLocalEbr { const T = @TypeOf(arg); - return if (comptime @hasField(@typeInfo(T).pointer.child, "ebr")) + return if (comptime @hasDecl(@typeInfo(T).pointer.child, "currentEbr")) + arg.currentEbr() + else if (comptime @hasField(@typeInfo(T).pointer.child, "ebr")) arg.ebr else arg; diff --git a/zig/lib/data-structures.zig b/zig/lib/data-structures.zig index 71221aa2..fbe681f4 100644 --- a/zig/lib/data-structures.zig +++ b/zig/lib/data-structures.zig @@ -712,7 +712,9 @@ pub fn bind(comptime deps: type) type { tail: std.atomic.Value(u32) = std.atomic.Value(u32).init(0), lock: std.atomic.Value(u32) = std.atomic.Value(u32).init(0), consumer_task: ?*Task = null, + consumer_sched: ?*fp.Scheduler = null, producer_task: ?*Task = null, + producer_sched: ?*fp.Scheduler = null, sched: *fp.Scheduler, /// Atomic so push/next can fast-path-read it without /// taking `lock`. Writers (close, deinit, setError) hold @@ -753,9 +755,11 @@ pub fn bind(comptime deps: type) type { if (h == t) { while (inner.lock.swap(1, .acquire) == 1) std.Thread.yield() catch {}; if (inner.consumer_task) |consumer| { + const consumer_sched = inner.consumer_sched orelse inner.sched; inner.consumer_task = null; + inner.consumer_sched = null; inner.lock.store(0, .release); - inner.sched.schedule(consumer); + consumer_sched.schedule(consumer); } else { inner.lock.store(0, .release); } @@ -773,9 +777,11 @@ pub fn bind(comptime deps: type) type { inner.lock.store(0, .release); continue; } - const task = inner.sched.getCurrent(); + const waiter_sched = fp.active_scheduler; + const task = waiter_sched.getCurrent(); task.status.store(.Blocked, .release); inner.producer_task = task; + inner.producer_sched = waiter_sched; inner.lock.store(0, .release); task.base.yield(); } @@ -789,9 +795,11 @@ pub fn bind(comptime deps: type) type { while (inner.lock.swap(1, .acquire) == 1) std.Thread.yield() catch {}; inner.closed.store(true, .release); if (inner.consumer_task) |consumer| { + const consumer_sched = inner.consumer_sched orelse inner.sched; inner.consumer_task = null; + inner.consumer_sched = null; inner.lock.store(0, .release); - inner.sched.schedule(consumer); + consumer_sched.schedule(consumer); } else { inner.lock.store(0, .release); } @@ -824,9 +832,11 @@ pub fn bind(comptime deps: type) type { if (h -% t == BUF_SIZE) { while (inner.lock.swap(1, .acquire) == 1) std.Thread.yield() catch {}; if (inner.producer_task) |producer| { + const producer_sched = inner.producer_sched orelse inner.sched; inner.producer_task = null; + inner.producer_sched = null; inner.lock.store(0, .release); - inner.sched.schedule(producer); + producer_sched.schedule(producer); } else { inner.lock.store(0, .release); } @@ -848,9 +858,11 @@ pub fn bind(comptime deps: type) type { if (inner.err) |err| return err; return null; } - const task = inner.sched.getCurrent(); + const waiter_sched = fp.active_scheduler; + const task = waiter_sched.getCurrent(); task.status.store(.Blocked, .release); inner.consumer_task = task; + inner.consumer_sched = waiter_sched; inner.lock.store(0, .release); task.base.yield(); } @@ -874,9 +886,11 @@ pub fn bind(comptime deps: type) type { inner.tail.store(h, .release); } if (inner.producer_task) |producer| { + const producer_sched = inner.producer_sched orelse inner.sched; inner.producer_task = null; + inner.producer_sched = null; inner.lock.store(0, .release); - inner.sched.schedule(producer); + producer_sched.schedule(producer); } else { inner.lock.store(0, .release); } diff --git 
a/zig/lib/ebr.zig b/zig/lib/ebr.zig index 67c58cb0..a10e70ca 100644 --- a/zig/lib/ebr.zig +++ b/zig/lib/ebr.zig @@ -148,6 +148,10 @@ pub const ThreadLocalEbr = struct { // is_active: "I am currently holding a pointer inside a critical section" is_active: Atomic(bool) = Atomic(bool).init(false), + // Nested EBR guards on the same participant. The first enter publishes + // the epoch; only the final exit clears is_active. + pin_depth: Atomic(u32) = Atomic(u32).init(0), + // link to the global world context: *EbrContext, @@ -215,6 +219,9 @@ pub const ThreadLocalEbr = struct { // now 1) and is the dominant cost in tight read loops like the // bench-17 200K-read fiber. pub fn enter(self: *ThreadLocalEbr) void { + const prev_depth = self.pin_depth.fetchAdd(1, .acq_rel); + if (prev_depth != 0) return; + const global = self.context.global_epoch.load(.acquire); self.local_epoch.store(global, .monotonic); // The .seq_cst here is doing two jobs: @@ -234,6 +241,10 @@ pub const ThreadLocalEbr = struct { // StoreLoad (no further loads in this thread depend on reclaim // observing is_active=false promptly). pub fn exit(self: *ThreadLocalEbr) void { + const prev_depth = self.pin_depth.fetchSub(1, .acq_rel); + std.debug.assert(prev_depth > 0); + if (prev_depth != 1) return; + self.is_active.store(false, .release); } }; diff --git a/zig/lib/parking-lot.zig b/zig/lib/parking-lot.zig index c8282c57..82c3e518 100644 --- a/zig/lib/parking-lot.zig +++ b/zig/lib/parking-lot.zig @@ -687,19 +687,22 @@ pub const ParkingMutex = struct { } } } - // Safety: cycle detection. Same gating: only treat fsm_owner - // as authoritative when the lock is actually held. + // Safety: cycle detection is only meaningful when the lock has an + // owner. Keep the uncontended path equivalent to stackful lock() so + // hot independent locks do not pay chain-walk setup on every acquire. const pre_task_owner = ownerOf(pre_state); const pre_fsm_owner: ?*fp.FsmTask = if ((pre_state & STATE_LOCKED) != 0) self.fsm_owner.load(.acquire) else null; - detectCycleFsm(fsm_task, pre_task_owner, pre_fsm_owner, self) catch |err| { - fsm_task.lock_error = switch (err) { - error.Deadlock => .Deadlock, - error.LockCycle => .LockCycle, - else => .Deadlock, + if ((pre_state & STATE_LOCKED) != 0) { + detectCycleFsm(fsm_task, pre_task_owner, pre_fsm_owner, self) catch |err| { + fsm_task.lock_error = switch (err) { + error.Deadlock => .Deadlock, + error.LockCycle => .LockCycle, + else => .Deadlock, + }; + return .Error; }; - return .Error; - }; + } // Fast path: uncontended CAS. 
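// [Editor's sketch, not part of the patch] Behavior of the new
// ThreadLocalEbr.pin_depth counter in zig/lib/ebr.zig above, reduced to a
// reentrant pin: only the outermost enter() publishes the pin and only the
// matching outermost exit() clears it. Epoch publication is omitted here;
// the type and test names are illustrative only.
const std = @import("std");

const ReentrantPin = struct {
    pin_depth: std.atomic.Value(u32) = std.atomic.Value(u32).init(0),
    is_active: std.atomic.Value(bool) = std.atomic.Value(bool).init(false),

    fn enter(self: *ReentrantPin) void {
        if (self.pin_depth.fetchAdd(1, .acq_rel) != 0) return; // nested guard
        self.is_active.store(true, .seq_cst); // outermost enter publishes
    }

    fn exit(self: *ReentrantPin) void {
        const prev = self.pin_depth.fetchSub(1, .acq_rel);
        std.debug.assert(prev > 0);
        if (prev != 1) return; // still nested
        self.is_active.store(false, .release); // final exit unpins
    }
};

test "nested guards keep the pin until the outermost exit" {
    var pin = ReentrantPin{};
    pin.enter();
    pin.enter();
    pin.exit();
    try std.testing.expect(pin.is_active.load(.acquire));
    pin.exit();
    try std.testing.expect(!pin.is_active.load(.acquire));
}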
const cur = self.state.load(.acquire); diff --git a/zig/lib/streams.zig b/zig/lib/streams.zig index 439064ec..80c34346 100644 --- a/zig/lib/streams.zig +++ b/zig/lib/streams.zig @@ -440,6 +440,7 @@ pub fn concurrentBoundedSelect( rt: anytype, items: anytype, workers: usize, + batch: usize, parallel: bool, task_cfg: anytype, user_ctx: ?*anyopaque, @@ -462,12 +463,14 @@ pub fn concurrentBoundedSelect( var err_code = std.atomic.Value(u16).init(0); var next_idx = std.atomic.Value(usize).init(0); var wg = WaitGroupT.init(rt.getSched()); + const batch_size = @max(batch, 1); const Worker = struct { wg: *WaitGroupT, items: *[N]PromiseT, slots: []Slot, next_idx: *std.atomic.Value(usize), + batch_size: usize, err_code: *std.atomic.Value(u16), user_ctx: ?*anyopaque, @@ -476,14 +479,17 @@ pub fn concurrentBoundedSelect( const ctx = @as(*@This(), @ptrCast(@alignCast(raw_args.?))); defer ctx.wg.done(); while (true) { - const idx = ctx.next_idx.fetchAdd(1, .monotonic); - if (idx >= N) break; - const item = try ctx.items[idx].next(); - const mapped = mapFn(worker_rt, ctx.user_ctx, item) catch |err| { - _ = ctx.err_code.cmpxchgStrong(0, @intFromError(err), .seq_cst, .seq_cst); - continue; - }; - ctx.slots[idx] = mapped; + const start = ctx.next_idx.fetchAdd(ctx.batch_size, .monotonic); + if (start >= N) break; + const end = @min(start + ctx.batch_size, N); + for (start..end) |idx| { + const item = try ctx.items[idx].next(); + const mapped = mapFn(worker_rt, ctx.user_ctx, item) catch |err| { + _ = ctx.err_code.cmpxchgStrong(0, @intFromError(err), .seq_cst, .seq_cst); + continue; + }; + ctx.slots[idx] = mapped; + } worker_rt.checkYield(); } } @@ -499,6 +505,7 @@ pub fn concurrentBoundedSelect( .items = items, .slots = slots, .next_idx = &next_idx, + .batch_size = batch_size, .err_code = &err_code, .user_ctx = user_ctx, }; @@ -543,6 +550,7 @@ pub fn concurrentBoundedWhere( rt: anytype, items: anytype, workers: usize, + batch: usize, parallel: bool, task_cfg: anytype, user_ctx: ?*anyopaque, @@ -564,6 +572,7 @@ pub fn concurrentBoundedWhere( var err_code = std.atomic.Value(u16).init(0); var next_idx = std.atomic.Value(usize).init(0); var wg = WaitGroupT.init(rt.getSched()); + const batch_size = @max(batch, 1); const Worker = struct { wg: *WaitGroupT, @@ -571,6 +580,7 @@ pub fn concurrentBoundedWhere( slots: []Slot, alloc: std.mem.Allocator, next_idx: *std.atomic.Value(usize), + batch_size: usize, err_code: *std.atomic.Value(u16), user_ctx: ?*anyopaque, @@ -579,17 +589,20 @@ pub fn concurrentBoundedWhere( const ctx = @as(*@This(), @ptrCast(@alignCast(raw_args.?))); defer ctx.wg.done(); while (true) { - const idx = ctx.next_idx.fetchAdd(1, .monotonic); - if (idx >= N) break; - var item = try ctx.items[idx].next(); - const keep = predFn(worker_rt, ctx.user_ctx, item) catch |err| { - _ = ctx.err_code.cmpxchgStrong(0, @intFromError(err), .seq_cst, .seq_cst); - continue; - }; - if (keep) { - ctx.slots[idx] = item; - } else { - cleanupItemFn(ctx.alloc, &item); + const start = ctx.next_idx.fetchAdd(ctx.batch_size, .monotonic); + if (start >= N) break; + const end = @min(start + ctx.batch_size, N); + for (start..end) |idx| { + var item = try ctx.items[idx].next(); + const keep = predFn(worker_rt, ctx.user_ctx, item) catch |err| { + _ = ctx.err_code.cmpxchgStrong(0, @intFromError(err), .seq_cst, .seq_cst); + continue; + }; + if (keep) { + ctx.slots[idx] = item; + } else { + cleanupItemFn(ctx.alloc, &item); + } } worker_rt.checkYield(); } @@ -607,6 +620,7 @@ pub fn concurrentBoundedWhere( .slots = slots, .alloc = alloc, 
.next_idx = &next_idx, + .batch_size = batch_size, .err_code = &err_code, .user_ctx = user_ctx, }; @@ -649,6 +663,7 @@ pub fn concurrentBoundedEach( rt: anytype, items: anytype, workers: usize, + batch: usize, parallel: bool, task_cfg: anytype, user_ctx: ?*anyopaque, @@ -661,11 +676,13 @@ pub fn concurrentBoundedEach( var err_code = std.atomic.Value(u16).init(0); var next_idx = std.atomic.Value(usize).init(0); var wg = WaitGroupT.init(rt.getSched()); + const batch_size = @max(batch, 1); const Worker = struct { wg: *WaitGroupT, items: *[N]PromiseT, next_idx: *std.atomic.Value(usize), + batch_size: usize, err_code: *std.atomic.Value(u16), user_ctx: ?*anyopaque, @@ -674,13 +691,16 @@ pub fn concurrentBoundedEach( const ctx = @as(*@This(), @ptrCast(@alignCast(raw_args.?))); defer ctx.wg.done(); while (true) { - const idx = ctx.next_idx.fetchAdd(1, .monotonic); - if (idx >= N) break; - const item = try ctx.items[idx].next(); - eachFn(worker_rt, ctx.user_ctx, item) catch |err| { - _ = ctx.err_code.cmpxchgStrong(0, @intFromError(err), .seq_cst, .seq_cst); - continue; - }; + const start = ctx.next_idx.fetchAdd(ctx.batch_size, .monotonic); + if (start >= N) break; + const end = @min(start + ctx.batch_size, N); + for (start..end) |idx| { + const item = try ctx.items[idx].next(); + eachFn(worker_rt, ctx.user_ctx, item) catch |err| { + _ = ctx.err_code.cmpxchgStrong(0, @intFromError(err), .seq_cst, .seq_cst); + continue; + }; + } worker_rt.checkYield(); } } @@ -695,6 +715,7 @@ pub fn concurrentBoundedEach( .wg = &wg, .items = items, .next_idx = &next_idx, + .batch_size = batch_size, .err_code = &err_code, .user_ctx = user_ctx, }; @@ -738,6 +759,7 @@ pub fn concurrentStreamSelect( src: anytype, workers: usize, capacity: usize, + batch: usize, parallel: bool, task_cfg: anytype, user_ctx: ?*anyopaque, @@ -750,6 +772,7 @@ pub fn concurrentStreamSelect( defer chan.deinit(); var err_code = std.atomic.Value(u16).init(0); var wg = WaitGroupT.init(rt.getSched()); + const batch_size = @max(batch, 1); const Feeder = struct { wg: *WaitGroupT, @@ -778,18 +801,25 @@ pub fn concurrentStreamSelect( local: std.ArrayListUnmanaged(R), alloc: std.mem.Allocator, err: *std.atomic.Value(u16), + batch_size: usize, user_ctx: ?*anyopaque, fn run(raw_rt: *anyopaque, raw_args: ?*anyopaque) anyerror!void { const worker_rt = @as(*RuntimeT, @ptrCast(@alignCast(raw_rt))); const ctx = @as(*@This(), @ptrCast(@alignCast(raw_args.?))); defer ctx.wg.done(); - while (try ctx.chan.pop()) |item| { - const mapped = mapFn(worker_rt, ctx.user_ctx, item) catch |e| { - _ = ctx.err.cmpxchgStrong(0, @intFromError(e), .seq_cst, .seq_cst); - continue; - }; - try ctx.local.append(ctx.alloc, mapped); + while (try ctx.chan.pop()) |first| { + var item = first; + var n: usize = 0; + while (true) : (n += 1) { + const mapped = mapFn(worker_rt, ctx.user_ctx, item) catch |e| { + _ = ctx.err.cmpxchgStrong(0, @intFromError(e), .seq_cst, .seq_cst); + break; + }; + try ctx.local.append(ctx.alloc, mapped); + if (n + 1 >= ctx.batch_size) break; + item = (try ctx.chan.pop()) orelse break; + } worker_rt.checkYield(); } } @@ -811,6 +841,7 @@ pub fn concurrentStreamSelect( .local = .empty, .alloc = alloc, .err = &err_code, + .batch_size = batch_size, .user_ctx = user_ctx, }; if (parallel) { @@ -859,6 +890,7 @@ pub fn concurrentStreamWhere( src: anytype, workers: usize, capacity: usize, + batch: usize, parallel: bool, task_cfg: anytype, user_ctx: ?*anyopaque, @@ -870,6 +902,7 @@ pub fn concurrentStreamWhere( defer chan.deinit(); var err_code = 
std.atomic.Value(u16).init(0); var wg = WaitGroupT.init(rt.getSched()); + const batch_size = @max(batch, 1); const Feeder = struct { wg: *WaitGroupT, @@ -898,23 +931,30 @@ pub fn concurrentStreamWhere( local: std.ArrayListUnmanaged(T), alloc: std.mem.Allocator, err: *std.atomic.Value(u16), + batch_size: usize, user_ctx: ?*anyopaque, fn run(raw_rt: *anyopaque, raw_args: ?*anyopaque) anyerror!void { const worker_rt = @as(*RuntimeT, @ptrCast(@alignCast(raw_rt))); const ctx = @as(*@This(), @ptrCast(@alignCast(raw_args.?))); defer ctx.wg.done(); - while (try ctx.chan.pop()) |item| { - var item_mut = item; - const keep = predFn(worker_rt, ctx.user_ctx, item) catch |e| { - _ = ctx.err.cmpxchgStrong(0, @intFromError(e), .seq_cst, .seq_cst); - cleanupItemFn(ctx.alloc, &item_mut); - continue; - }; - if (keep) { - try ctx.local.append(ctx.alloc, item); - } else { - cleanupItemFn(ctx.alloc, &item_mut); + while (try ctx.chan.pop()) |first| { + var item = first; + var n: usize = 0; + while (true) : (n += 1) { + var item_mut = item; + const keep = predFn(worker_rt, ctx.user_ctx, item) catch |e| { + _ = ctx.err.cmpxchgStrong(0, @intFromError(e), .seq_cst, .seq_cst); + cleanupItemFn(ctx.alloc, &item_mut); + break; + }; + if (keep) { + try ctx.local.append(ctx.alloc, item); + } else { + cleanupItemFn(ctx.alloc, &item_mut); + } + if (n + 1 >= ctx.batch_size) break; + item = (try ctx.chan.pop()) orelse break; } worker_rt.checkYield(); } @@ -936,6 +976,7 @@ pub fn concurrentStreamWhere( .local = .empty, .alloc = alloc, .err = &err_code, + .batch_size = batch_size, .user_ctx = user_ctx, }; if (parallel) { @@ -983,6 +1024,7 @@ pub fn concurrentStreamEach( src: anytype, workers: usize, capacity: usize, + batch: usize, parallel: bool, task_cfg: anytype, user_ctx: ?*anyopaque, @@ -995,6 +1037,7 @@ pub fn concurrentStreamEach( defer chan.deinit(); var err_code = std.atomic.Value(u16).init(0); var wg = WaitGroupT.init(rt.getSched()); + const batch_size = @max(batch, 1); const Feeder = struct { wg: *WaitGroupT, @@ -1021,17 +1064,24 @@ pub fn concurrentStreamEach( wg: *WaitGroupT, chan: *ChannelT, err: *std.atomic.Value(u16), + batch_size: usize, user_ctx: ?*anyopaque, fn run(raw_rt: *anyopaque, raw_args: ?*anyopaque) anyerror!void { const worker_rt = @as(*RuntimeT, @ptrCast(@alignCast(raw_rt))); const ctx = @as(*@This(), @ptrCast(@alignCast(raw_args.?))); defer ctx.wg.done(); - while (try ctx.chan.pop()) |item| { - eachFn(worker_rt, ctx.user_ctx, item) catch |e| { - _ = ctx.err.cmpxchgStrong(0, @intFromError(e), .seq_cst, .seq_cst); - continue; - }; + while (try ctx.chan.pop()) |first| { + var item = first; + var n: usize = 0; + while (true) : (n += 1) { + eachFn(worker_rt, ctx.user_ctx, item) catch |e| { + _ = ctx.err.cmpxchgStrong(0, @intFromError(e), .seq_cst, .seq_cst); + break; + }; + if (n + 1 >= ctx.batch_size) break; + item = (try ctx.chan.pop()) orelse break; + } worker_rt.checkYield(); } } @@ -1050,6 +1100,7 @@ pub fn concurrentStreamEach( .wg = &wg, .chan = &chan, .err = &err_code, + .batch_size = batch_size, .user_ctx = user_ctx, }; if (parallel) { @@ -1083,6 +1134,7 @@ pub fn concurrentListSelect( rt: anytype, items: []const T, workers: usize, + batch: usize, parallel: bool, task_cfg: anytype, user_ctx: ?*anyopaque, @@ -1102,12 +1154,14 @@ pub fn concurrentListSelect( var err_code = std.atomic.Value(u16).init(0); var next_idx = std.atomic.Value(usize).init(0); var wg = WaitGroupT.init(rt.getSched()); + const batch_size = @max(batch, 1); const Worker = struct { wg: *WaitGroupT, items: []const T, slots: 
[]Slot, next_idx: *std.atomic.Value(usize), + batch_size: usize, err_code: *std.atomic.Value(u16), user_ctx: ?*anyopaque, @@ -1116,14 +1170,17 @@ pub fn concurrentListSelect( const ctx = @as(*@This(), @ptrCast(@alignCast(raw_args.?))); defer ctx.wg.done(); while (true) { - const idx = ctx.next_idx.fetchAdd(1, .monotonic); - if (idx >= ctx.items.len) break; - const item = ctx.items[idx]; - const mapped = mapFn(worker_rt, ctx.user_ctx, item) catch |err| { - _ = ctx.err_code.cmpxchgStrong(0, @intFromError(err), .seq_cst, .seq_cst); - continue; - }; - ctx.slots[idx] = mapped; + const start = ctx.next_idx.fetchAdd(ctx.batch_size, .monotonic); + if (start >= ctx.items.len) break; + const end = @min(start + ctx.batch_size, ctx.items.len); + for (start..end) |idx| { + const item = ctx.items[idx]; + const mapped = mapFn(worker_rt, ctx.user_ctx, item) catch |err| { + _ = ctx.err_code.cmpxchgStrong(0, @intFromError(err), .seq_cst, .seq_cst); + continue; + }; + ctx.slots[idx] = mapped; + } worker_rt.checkYield(); } } @@ -1139,6 +1196,7 @@ pub fn concurrentListSelect( .items = items, .slots = slots, .next_idx = &next_idx, + .batch_size = batch_size, .err_code = &err_code, .user_ctx = user_ctx, }; @@ -1182,6 +1240,7 @@ pub fn concurrentListWhere( rt: anytype, items: []const T, workers: usize, + batch: usize, parallel: bool, task_cfg: anytype, user_ctx: ?*anyopaque, @@ -1201,12 +1260,14 @@ pub fn concurrentListWhere( var err_code = std.atomic.Value(u16).init(0); var next_idx = std.atomic.Value(usize).init(0); var wg = WaitGroupT.init(rt.getSched()); + const batch_size = @max(batch, 1); const Worker = struct { wg: *WaitGroupT, items: []const T, slots: []Slot, next_idx: *std.atomic.Value(usize), + batch_size: usize, err_code: *std.atomic.Value(u16), user_ctx: ?*anyopaque, @@ -1215,14 +1276,17 @@ pub fn concurrentListWhere( const ctx = @as(*@This(), @ptrCast(@alignCast(raw_args.?))); defer ctx.wg.done(); while (true) { - const idx = ctx.next_idx.fetchAdd(1, .monotonic); - if (idx >= ctx.items.len) break; - const item = ctx.items[idx]; - const keep = predFn(worker_rt, ctx.user_ctx, item) catch |err| { - _ = ctx.err_code.cmpxchgStrong(0, @intFromError(err), .seq_cst, .seq_cst); - continue; - }; - if (keep) ctx.slots[idx] = item; + const start = ctx.next_idx.fetchAdd(ctx.batch_size, .monotonic); + if (start >= ctx.items.len) break; + const end = @min(start + ctx.batch_size, ctx.items.len); + for (start..end) |idx| { + const item = ctx.items[idx]; + const keep = predFn(worker_rt, ctx.user_ctx, item) catch |err| { + _ = ctx.err_code.cmpxchgStrong(0, @intFromError(err), .seq_cst, .seq_cst); + continue; + }; + if (keep) ctx.slots[idx] = item; + } worker_rt.checkYield(); } } @@ -1238,6 +1302,7 @@ pub fn concurrentListWhere( .items = items, .slots = slots, .next_idx = &next_idx, + .batch_size = batch_size, .err_code = &err_code, .user_ctx = user_ctx, }; @@ -1279,6 +1344,7 @@ pub fn concurrentListEach( rt: anytype, items: []const T, workers: usize, + batch: usize, parallel: bool, task_cfg: anytype, user_ctx: ?*anyopaque, @@ -1288,11 +1354,13 @@ pub fn concurrentListEach( var err_code = std.atomic.Value(u16).init(0); var next_idx = std.atomic.Value(usize).init(0); var wg = WaitGroupT.init(rt.getSched()); + const batch_size = @max(batch, 1); const Worker = struct { wg: *WaitGroupT, items: []const T, next_idx: *std.atomic.Value(usize), + batch_size: usize, err_code: *std.atomic.Value(u16), user_ctx: ?*anyopaque, @@ -1301,13 +1369,16 @@ pub fn concurrentListEach( const ctx = @as(*@This(), 
@ptrCast(@alignCast(raw_args.?))); defer ctx.wg.done(); while (true) { - const idx = ctx.next_idx.fetchAdd(1, .monotonic); - if (idx >= ctx.items.len) break; - const item = ctx.items[idx]; - eachFn(worker_rt, ctx.user_ctx, item) catch |err| { - _ = ctx.err_code.cmpxchgStrong(0, @intFromError(err), .seq_cst, .seq_cst); - continue; - }; + const start = ctx.next_idx.fetchAdd(ctx.batch_size, .monotonic); + if (start >= ctx.items.len) break; + const end = @min(start + ctx.batch_size, ctx.items.len); + for (start..end) |idx| { + const item = ctx.items[idx]; + eachFn(worker_rt, ctx.user_ctx, item) catch |err| { + _ = ctx.err_code.cmpxchgStrong(0, @intFromError(err), .seq_cst, .seq_cst); + continue; + }; + } worker_rt.checkYield(); } } @@ -1322,6 +1393,7 @@ pub fn concurrentListEach( .wg = &wg, .items = items, .next_idx = &next_idx, + .batch_size = batch_size, .err_code = &err_code, .user_ctx = user_ctx, }; @@ -1350,6 +1422,7 @@ pub fn concurrentListEachInPlace( rt: anytype, items: []T, workers: usize, + batch: usize, parallel: bool, task_cfg: anytype, user_ctx: ?*anyopaque, @@ -1359,11 +1432,13 @@ pub fn concurrentListEachInPlace( var err_code = std.atomic.Value(u16).init(0); var next_idx = std.atomic.Value(usize).init(0); var wg = WaitGroupT.init(rt.getSched()); + const batch_size = @max(batch, 1); const Worker = struct { wg: *WaitGroupT, items: []T, next_idx: *std.atomic.Value(usize), + batch_size: usize, err_code: *std.atomic.Value(u16), user_ctx: ?*anyopaque, @@ -1372,13 +1447,16 @@ pub fn concurrentListEachInPlace( const ctx = @as(*@This(), @ptrCast(@alignCast(raw_args.?))); defer ctx.wg.done(); while (true) { - const idx = ctx.next_idx.fetchAdd(1, .monotonic); - if (idx >= ctx.items.len) break; - const item_ptr = &ctx.items[idx]; - eachFn(worker_rt, ctx.user_ctx, item_ptr) catch |err| { - _ = ctx.err_code.cmpxchgStrong(0, @intFromError(err), .seq_cst, .seq_cst); - continue; - }; + const start = ctx.next_idx.fetchAdd(ctx.batch_size, .monotonic); + if (start >= ctx.items.len) break; + const end = @min(start + ctx.batch_size, ctx.items.len); + for (start..end) |idx| { + const item_ptr = &ctx.items[idx]; + eachFn(worker_rt, ctx.user_ctx, item_ptr) catch |err| { + _ = ctx.err_code.cmpxchgStrong(0, @intFromError(err), .seq_cst, .seq_cst); + continue; + }; + } worker_rt.checkYield(); } } @@ -1393,6 +1471,7 @@ pub fn concurrentListEachInPlace( .wg = &wg, .items = items, .next_idx = &next_idx, + .batch_size = batch_size, .err_code = &err_code, .user_ctx = user_ctx, }; diff --git a/zig/runtime/alloc-profile.zig b/zig/runtime/alloc-profile.zig index 289baedc..42bca5c7 100644 --- a/zig/runtime/alloc-profile.zig +++ b/zig/runtime/alloc-profile.zig @@ -6,6 +6,7 @@ const std = @import("std"); const compat = @import("../lib/compat.zig"); +const SpinLock = @import("profile-lock.zig").SpinLock; // Profile-table size; shared default with lock-profile / mvcc-profile. // `clear profile --profile-max=N` injects the override into the @@ -26,6 +27,7 @@ const Site = struct { var sites: [MAX_SITES]Site = [_]Site{.{}} ** MAX_SITES; var total_allocs: u64 = 0; +var mu: SpinLock = .{}; // Counts findSlot() calls that hit the saturated table. 
Surfaced // in the dump as a `# WARNING:` header so doctor can advise the @@ -59,6 +61,8 @@ pub fn totalBytes() u64 { } pub fn recordAlloc(ret_addr: usize, size: usize) void { + mu.lock(); + defer mu.unlock(); if (findSlot(ret_addr)) |site| { site.alloc_count += 1; site.alloc_bytes += size; @@ -68,6 +72,8 @@ pub fn recordAlloc(ret_addr: usize, size: usize) void { } pub fn recordFree(ret_addr: usize, size: usize) void { + mu.lock(); + defer mu.unlock(); if (findSlot(ret_addr)) |site| { site.free_count += 1; site.free_bytes += size; @@ -81,6 +87,9 @@ pub fn dump() void { const fd = compat.createFileTruncate(path_ptr) catch return; defer compat.closeFd(fd); + mu.lock(); + defer mu.unlock(); + var buf: [256]u8 = undefined; compat.writeAllFd(fd, "# alloc-profile v1\n") catch return; diff --git a/zig/runtime/bounded-channel-test.zig b/zig/runtime/bounded-channel-test.zig index d84d2080..c9c13683 100644 --- a/zig/runtime/bounded-channel-test.zig +++ b/zig/runtime/bounded-channel-test.zig @@ -592,6 +592,133 @@ test "BoundedChannel: setError() unblocks consumers with the error" { try std.testing.expect(shared.got_error.load(.seq_cst)); } +test "BoundedChannel: setError() unblocks a producer waiting on full channel" { + const alloc = std.testing.allocator; + const Shared = struct { + ch: CheatLib.BoundedChannel(i64), + producer_filled: std.atomic.Value(bool) = std.atomic.Value(bool).init(false), + producer_saw_closed: std.atomic.Value(bool) = std.atomic.Value(bool).init(false), + }; + var shared = Shared{ + .ch = try CheatLib.BoundedChannel(i64).init(alloc, 4), + }; + defer shared.ch.deinit(); + + const Producer = struct { + fn run(s: *Shared) void { + s.ch.push(1) catch return; + s.ch.push(2) catch return; + s.ch.push(3) catch return; + s.ch.push(4) catch return; + s.producer_filled.store(true, .release); + s.ch.push(5) catch |err| { + if (err == error.StreamClosed) s.producer_saw_closed.store(true, .seq_cst); + return; + }; + } + }; + + var th = try std.Thread.spawn(.{}, Producer.run, .{&shared}); + while (!shared.producer_filled.load(.acquire)) std.Thread.yield() catch {}; + shared.ch.setError(error.ProducerFailed); + th.join(); + + try std.testing.expect(shared.producer_saw_closed.load(.seq_cst)); +} + +test "BoundedChannel: independent channels drain concurrently without deadlock" { + const alloc = std.testing.allocator; + var global_ctx: EbrContext = undefined; + var stack_pool: fm.StackPool = undefined; + var sched: fp.Scheduler = undefined; + var rt: Runtime = undefined; + try initSchedEnv(alloc, &global_ctx, &stack_pool, &sched, &rt); + defer deinitSchedEnv(&global_ctx, &stack_pool, &sched, &rt, alloc); + + const N_CHANS = 4; + const N_ITEMS = 64; + const Shared = struct { + chans: [N_CHANS]CheatLib.BoundedChannel(i64), + wg: CheatHeader.WaitGroup, + delivered: std.atomic.Value(usize) = std.atomic.Value(usize).init(0), + sum: std.atomic.Value(i64) = std.atomic.Value(i64).init(0), + }; + var shared = Shared{ + .chans = undefined, + .wg = CheatHeader.WaitGroup.init(&sched), + }; + for (0..N_CHANS) |i| shared.chans[i] = try CheatLib.BoundedChannel(i64).init(alloc, 8); + defer for (0..N_CHANS) |i| shared.chans[i].deinit(); + + const PairCtx = struct { s: *Shared, idx: usize }; + const Bundle = struct { + s: *Shared, + producers: *[N_CHANS]PairCtx, + consumers: *[N_CHANS]PairCtx, + }; + + const Producer = struct { + fn run(_: *Runtime, raw: ?*anyopaque) anyerror!void { + const ctx = @as(*PairCtx, @ptrCast(@alignCast(raw.?))); + defer ctx.s.wg.done(); + var i: i64 = 0; + while (i < N_ITEMS) : (i 
+= 1) { + try ctx.s.chans[ctx.idx].push(@as(i64, @intCast(ctx.idx * N_ITEMS)) + i); + } + ctx.s.chans[ctx.idx].close(); + } + }; + + const Consumer = struct { + fn run(_: *Runtime, raw: ?*anyopaque) anyerror!void { + const ctx = @as(*PairCtx, @ptrCast(@alignCast(raw.?))); + defer ctx.s.wg.done(); + while (try ctx.s.chans[ctx.idx].pop()) |val| { + _ = ctx.s.delivered.fetchAdd(1, .seq_cst); + _ = ctx.s.sum.fetchAdd(val, .seq_cst); + } + } + }; + + var producer_ctxs: [N_CHANS]PairCtx = undefined; + var consumer_ctxs: [N_CHANS]PairCtx = undefined; + + const Main = struct { + fn run(_: *Runtime, raw: ?*anyopaque) anyerror!void { + const bundle = @as(*Bundle, @ptrCast(@alignCast(raw.?))); + bundle.s.wg.add(N_CHANS * 2); + for (0..N_CHANS) |i| { + bundle.producers[i] = .{ .s = bundle.s, .idx = i }; + bundle.consumers[i] = .{ .s = bundle.s, .idx = i }; + try fp.active_scheduler.submitSpawn( + @intFromPtr(&Runtime.entryWrapper), + @as(CheatHeader.TaskFn, @ptrCast(&Producer.run)), + &bundle.producers[i], .{}, + ); + try fp.active_scheduler.submitSpawn( + @intFromPtr(&Runtime.entryWrapper), + @as(CheatHeader.TaskFn, @ptrCast(&Consumer.run)), + &bundle.consumers[i], .{}, + ); + } + bundle.s.wg.wait(); + } + }; + + var bundle = Bundle{ .s = &shared, .producers = &producer_ctxs, .consumers = &consumer_ctxs }; + try sched.submitSpawn( + @intFromPtr(&Runtime.entryWrapper), + @as(CheatHeader.TaskFn, @ptrCast(&Main.run)), + &bundle, .{}, + ); + sched.run(); + + const expected_count = N_CHANS * N_ITEMS; + const last: i64 = @intCast(expected_count - 1); + try std.testing.expectEqual(@as(usize, expected_count), shared.delivered.load(.seq_cst)); + try std.testing.expectEqual(@divExact(last * (last + 1), 2), shared.sum.load(.seq_cst)); +} + // --------------------------------------------------------------------------- // Test: back pressure ordering — large item count with small ring; verify all // items arrive exactly once and in-order (single consumer). diff --git a/zig/runtime/channel-profile.zig b/zig/runtime/channel-profile.zig index c2af1e01..0be960a3 100644 --- a/zig/runtime/channel-profile.zig +++ b/zig/runtime/channel-profile.zig @@ -15,6 +15,7 @@ const std = @import("std"); const compat = @import("../lib/compat.zig"); +const SpinLock = @import("profile-lock.zig").SpinLock; pub const MAX_CHANNELS: usize = 1024; @@ -41,7 +42,7 @@ pub const ProfStats = extern struct { // would read garbage. var stats: [MAX_CHANNELS]ProfStats = [_]ProfStats{.{}} ** MAX_CHANNELS; var count: usize = 0; -var mu: compat.Mutex = .{}; +var mu: SpinLock = .{}; // Returns an id in [0, MAX_CHANNELS) that the channel uses to index // into `stats`. 
When the registry is full, returns MAX_CHANNELS and @@ -58,6 +59,8 @@ pub fn register(initial_capacity: u64) usize { pub inline fn recordPush(id: usize, depth: u64, blocked: bool) void { if (id >= MAX_CHANNELS) return; + mu.lock(); + defer mu.unlock(); stats[id].pushes += 1; if (depth > stats[id].max_depth) stats[id].max_depth = depth; if (blocked) stats[id].push_blocked += 1; @@ -65,11 +68,15 @@ pub inline fn recordPush(id: usize, depth: u64, blocked: bool) void { pub inline fn recordPop(id: usize) void { if (id >= MAX_CHANNELS) return; + mu.lock(); + defer mu.unlock(); stats[id].pops += 1; } pub inline fn recordPopBlocked(id: usize) void { if (id >= MAX_CHANNELS) return; + mu.lock(); + defer mu.unlock(); stats[id].pop_blocked += 1; } @@ -82,6 +89,9 @@ pub fn dumpToEnvFile() void { const fd = compat.createFileTruncate(path_ptr) catch return; defer compat.closeFd(fd); + mu.lock(); + defer mu.unlock(); + var buf: [256]u8 = undefined; _ = compat.writeAllFd(fd, "# channel-profile v1\n") catch return; _ = compat.writeAllFd(fd, "# id\tpushes\tpops\tpush_blocked\tpop_blocked\tmax_depth\tcapacity\n") catch return; diff --git a/zig/runtime/fiber-core.zig b/zig/runtime/fiber-core.zig index cefeaa49..19c41ef4 100644 --- a/zig/runtime/fiber-core.zig +++ b/zig/runtime/fiber-core.zig @@ -167,6 +167,9 @@ pub const StackSize = enum { pub const Stack = struct { // The raw slice of memory we own memory: []u8, + // Scheduler that owns the stack pool this memory came from. Kept as an + // opaque pointer to avoid a fiber-core -> scheduler import cycle. + owner: ?*anyopaque = null, // Add this helper. Fiber.reset() relies on it. pub fn getStackTop(self: Stack) usize { @@ -185,6 +188,10 @@ pub const Fiber = struct { stack_guard_head: ?*safety.GuardNode = null, pub fn init(memory: []u8, entry_fn: usize, size: StackSize) Fiber { + return initWithOwner(memory, entry_fn, size, null); + } + + pub fn initWithOwner(memory: []u8, entry_fn: usize, size: StackSize, owner: ?*anyopaque) Fiber { //std.debug.print("\n=== Fiber.init ===\n", .{}); //std.debug.print("Memory: 0x{x} - 0x{x} ({} bytes)\n", .{ // @intFromPtr(memory.ptr), @@ -197,7 +204,7 @@ pub const Fiber = struct { // In release builds this is 16KB * N_fibers of wasted writes. if (builtin.mode == .Debug) @memset(memory, 0xCC); - const stack = Stack{ .memory = memory }; + const stack = Stack{ .memory = memory, .owner = owner }; const stack_top_addr = @intFromPtr(memory.ptr) + memory.len; const aligned_top = stack_top_addr & ~@as(usize, 15); diff --git a/zig/runtime/fiber-profile.zig b/zig/runtime/fiber-profile.zig index ea6f53e8..e57862e8 100644 --- a/zig/runtime/fiber-profile.zig +++ b/zig/runtime/fiber-profile.zig @@ -15,10 +15,12 @@ const std = @import("std"); const compat = @import("../lib/compat.zig"); +const SpinLock = @import("profile-lock.zig").SpinLock; // CLEAR defaults to 4 scheduler threads; scale if more are used. Fixed // upper bound avoids per-scheduler allocations in the profile module. pub const MAX_SCHEDULERS: usize = 32; +pub const MAX_SITES: usize = 256; // Buckets (in nanoseconds) used to classify lifetimes. pub const SHORT_NS: u64 = 1_000_000; // 1 ms @@ -30,30 +32,157 @@ var short_fibers: u64 = 0; // < 1ms var vshort_fibers: u64 = 0; // < 10us var total_lifetime_ns: u64 = 0; var max_lifetime_ns: u64 = 0; +var mu: SpinLock = .{}; // Per-scheduler fibers-run counter. Index = Scheduler.index. 
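// [Editor's sketch, not part of the patch] How the numeric dispatch ids the
// transpiler writes into TaskConfig (profile_dispatch_id in
// src/mir/mir_lowering.rb above: local=1, parallel=2, shared/pinned=3) line
// up with DispatchKind below. The helper and its use of std.meta.intToEnum
// are assumptions for illustration, not the runtime's actual decoding path.
const std = @import("std");

const DispatchKindSketch = enum(u8) { unknown = 0, local = 1, parallel = 2, pinned = 3 };

fn dispatchFromConfig(profile_dispatch: u8) DispatchKindSketch {
    // Out-of-range ids degrade to .unknown instead of tripping safety checks.
    return std.meta.intToEnum(DispatchKindSketch, profile_dispatch) catch .unknown;
}

test "task-config dispatch ids round-trip into the profile enum" {
    try std.testing.expectEqual(DispatchKindSketch.local, dispatchFromConfig(1));
    try std.testing.expectEqual(DispatchKindSketch.parallel, dispatchFromConfig(2));
    try std.testing.expectEqual(DispatchKindSketch.pinned, dispatchFromConfig(3));
    try std.testing.expectEqual(DispatchKindSketch.unknown, dispatchFromConfig(9));
}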
var sched_runs: [MAX_SCHEDULERS]u64 = [_]u64{0} ** MAX_SCHEDULERS; var sched_active: usize = 0; +pub const DispatchKind = enum(u8) { + unknown = 0, + local = 1, + parallel = 2, + pinned = 3, +}; + +pub const TaskForm = enum(u8) { + unknown = 0, + stack = 1, + fsm = 2, +}; + +const Site = struct { + id: u32 = 0, + dispatch: DispatchKind = .unknown, + form: TaskForm = .unknown, + spawns: u64 = 0, + runs: u64 = 0, + exits: u64 = 0, + total_lifetime_ns: u64 = 0, + max_lifetime_ns: u64 = 0, + sched_runs: [MAX_SCHEDULERS]u64 = [_]u64{0} ** MAX_SCHEDULERS, +}; + +var sites: [MAX_SITES]Site = [_]Site{.{}} ** MAX_SITES; +var site_dropped: u64 = 0; + +pub fn resetForTest() void { + mu.lock(); + defer mu.unlock(); + total_fibers = 0; + short_fibers = 0; + vshort_fibers = 0; + total_lifetime_ns = 0; + max_lifetime_ns = 0; + sched_runs = [_]u64{0} ** MAX_SCHEDULERS; + sched_active = 0; + sites = [_]Site{.{}} ** MAX_SITES; + site_dropped = 0; +} + +fn findSiteLocked(site_id: u32) ?*Site { + if (site_id == 0) return null; + var idx: usize = @as(usize, site_id) % MAX_SITES; + var probes: usize = 0; + while (probes < MAX_SITES) : (probes += 1) { + if (sites[idx].id == site_id) return &sites[idx]; + if (sites[idx].id == 0) { + sites[idx].id = site_id; + return &sites[idx]; + } + idx = (idx + 1) % MAX_SITES; + } + site_dropped += 1; + return null; +} + pub inline fn nowNs() u64 { return compat.nanoTimestamp(); } pub inline fn recordSchedulerRun(sched_idx: usize) void { + mu.lock(); + defer mu.unlock(); if (sched_idx >= MAX_SCHEDULERS) return; sched_runs[sched_idx] += 1; if (sched_idx + 1 > sched_active) sched_active = sched_idx + 1; } -pub inline fn recordFiberExit(spawn_ns: u64, now: u64) void { +pub inline fn recordSiteSpawn(site_id: u32, dispatch: DispatchKind, form: TaskForm) void { + if (site_id == 0) return; + mu.lock(); + defer mu.unlock(); + if (findSiteLocked(site_id)) |site| { + site.spawns += 1; + if (site.dispatch == .unknown) site.dispatch = dispatch; + if (site.form == .unknown) site.form = form; + } +} + +test "fiber profile records per-site scheduler attribution" { + resetForTest(); + defer resetForTest(); + + recordSiteSpawn(7, .local, .fsm); + recordSchedulerRun(3); + recordSiteRun(7, 3); + recordFiberExit(7, 100, 2100); + + try std.testing.expectEqual(@as(u64, 1), total_fibers); + try std.testing.expectEqual(@as(u64, 1), sched_runs[3]); + const site = findSiteLocked(7).?; + try std.testing.expectEqual(@as(u64, 1), site.spawns); + try std.testing.expectEqual(@as(u64, 1), site.runs); + try std.testing.expectEqual(@as(u64, 1), site.exits); + try std.testing.expectEqual(DispatchKind.local, site.dispatch); + try std.testing.expectEqual(TaskForm.fsm, site.form); + try std.testing.expectEqual(@as(u64, 1), site.sched_runs[3]); + try std.testing.expectEqual(@as(u64, 2000), site.total_lifetime_ns); +} + +test "fiber profile handles site collisions and saturation" { + resetForTest(); + defer resetForTest(); + + recordSiteSpawn(1, .local, .stack); + recordSiteSpawn(1 + MAX_SITES, .parallel, .fsm); + try std.testing.expect(findSiteLocked(1) != null); + try std.testing.expect(findSiteLocked(1 + MAX_SITES) != null); + + var i: u32 = 2; + while (i <= MAX_SITES) : (i += 1) { + recordSiteSpawn(i, .local, .stack); + } + recordSiteSpawn(MAX_SITES + 2, .parallel, .fsm); + try std.testing.expectEqual(@as(u64, 2), site_dropped); +} + +pub inline fn recordSiteRun(site_id: u32, sched_idx: usize) void { + if (site_id == 0) return; + mu.lock(); + defer mu.unlock(); + if (findSiteLocked(site_id)) |site| { + site.runs 
+= 1; + if (sched_idx < MAX_SCHEDULERS) site.sched_runs[sched_idx] += 1; + } +} + +pub inline fn recordFiberExit(site_id: u32, spawn_ns: u64, now: u64) void { if (spawn_ns == 0) return; // never recorded a spawn if (now <= spawn_ns) return; // clock went backwards const dur: u64 = now - spawn_ns; + mu.lock(); + defer mu.unlock(); total_fibers += 1; total_lifetime_ns += dur; if (dur > max_lifetime_ns) max_lifetime_ns = dur; if (dur < SHORT_NS) short_fibers += 1; if (dur < VSHORT_NS) vshort_fibers += 1; + if (findSiteLocked(site_id)) |site| { + site.exits += 1; + site.total_lifetime_ns += dur; + if (dur > site.max_lifetime_ns) site.max_lifetime_ns = dur; + } } pub fn dumpToEnvFile() void { @@ -61,7 +190,10 @@ pub fn dumpToEnvFile() void { const fd = compat.createFileTruncate(path_ptr) catch return; defer compat.closeFd(fd); - var buf: [256]u8 = undefined; + mu.lock(); + defer mu.unlock(); + + var buf: [1024]u8 = undefined; _ = compat.writeAllFd(fd, "# fiber-profile v1\n") catch return; @@ -77,4 +209,44 @@ pub fn dumpToEnvFile() void { const line = std.fmt.bufPrint(&buf, "{d}\t{d}\n", .{ i, sched_runs[i] }) catch continue; _ = compat.writeAllFd(fd, line) catch return; } + + _ = compat.writeAllFd(fd, "# per-site fibers\n# site\tspawns\truns\texits\ttotal_lifetime_ns\tmax_lifetime_ns\tdispatch\tform\tschedulers\n") catch return; + if (site_dropped > 0) { + const warn = std.fmt.bufPrint(&buf, + "# WARNING: {d} fiber-site samples dropped (cap={d}; rebuild runtime with larger MAX_SITES)\n", + .{ site_dropped, MAX_SITES }, + ) catch return; + _ = compat.writeAllFd(fd, warn) catch return; + } + for (&sites) |*site| { + if (site.id == 0) continue; + var sched_buf: [512]u8 = undefined; + var sched_len: usize = 0; + var first = true; + var si: usize = 0; + while (si < sched_active) : (si += 1) { + const runs = site.sched_runs[si]; + if (runs == 0) continue; + if (!first and sched_len < sched_buf.len) { + sched_buf[sched_len] = ','; + sched_len += 1; + } + first = false; + const part = std.fmt.bufPrint(sched_buf[sched_len..], "{d}:{d}", .{ si, runs }) catch break; + sched_len += part.len; + } + const scheds = sched_buf[0..sched_len]; + const line = std.fmt.bufPrint(&buf, "{d}\t{d}\t{d}\t{d}\t{d}\t{d}\t{s}\t{s}\t{s}\n", .{ + site.id, + site.spawns, + site.runs, + site.exits, + site.total_lifetime_ns, + site.max_lifetime_ns, + @tagName(site.dispatch), + @tagName(site.form), + scheds, + }) catch continue; + _ = compat.writeAllFd(fd, line) catch return; + } } diff --git a/zig/runtime/fiber-test.zig b/zig/runtime/fiber-test.zig index 1fe3109a..18854817 100644 --- a/zig/runtime/fiber-test.zig +++ b/zig/runtime/fiber-test.zig @@ -512,7 +512,7 @@ fn threadEntryPoint(allocator: std.mem.Allocator, global_ctx: *EbrContext, stack } test "Multi-Threaded Shared Nothing" { - if (build_options.coverage) return error.SkipZigTest; + if (build_options.coverage or build_options.tsan) return error.SkipZigTest; const allocator = std.heap.smp_allocator; diff --git a/zig/runtime/fsm-steal-test.zig b/zig/runtime/fsm-steal-test.zig index bdb4ab5d..7f42ba14 100644 --- a/zig/runtime/fsm-steal-test.zig +++ b/zig/runtime/fsm-steal-test.zig @@ -104,6 +104,38 @@ test "S2: stolen tasks complete correctly when dispatched by the stealer" { try std.testing.expectEqual(@as(u64, 0), sched_b.active_tasks.load(.monotonic)); } +test "S2b: stolen task completion returns FsmTask slot to allocating scheduler" { + var ebr_ctx: ebr.EbrContext = .{}; + defer ebr_ctx.deinit(alloc); + var pool_a = fm.StackPool.init(alloc); + defer pool_a.deinit(); + 
var pool_b = fm.StackPool.init(alloc); + defer pool_b.deinit(); + var sched_a = try fp.Scheduler.init(alloc, &ebr_ctx, &pool_a); + defer sched_a.deinit(); + var sched_b = try fp.Scheduler.init(alloc, &ebr_ctx, &pool_b); + defer sched_b.deinit(); + + var stolen_ctx: Counter = .{ .task = try sched_a.allocFsmTask(&Counter.doResume) }; + stolen_ctx.task.ctx = &stolen_ctx; + sched_a.enqueueFsm(stolen_ctx.task); + + const stolen = sched_b.fsm_ready_queue.tryStealFrom(&sched_a.fsm_ready_queue, alloc); + try std.testing.expectEqual(@as(usize, 1), stolen); + _ = sched_b.active_tasks.fetchAdd(stolen, .monotonic); + _ = sched_a.active_tasks.fetchSub(stolen, .monotonic); + + sched_b.drainFsmQueue(); + try std.testing.expect(stolen_ctx.completed); + + var fresh_b: Counter = .{ .task = try sched_b.allocFsmTask(&Counter.doResume) }; + fresh_b.task.ctx = &fresh_b; + defer sched_b.fsm_task_slab.destroy(fresh_b.task); + + try std.testing.expect(sched_b.fsm_task_slab.refFromPtr(fresh_b.task) != null); + try std.testing.expect(sched_a.fsm_task_slab.refFromPtr(fresh_b.task) == null); +} + test "S3: FSM queue is structurally distinct from stackful ready_queue" { var ebr_ctx: ebr.EbrContext = .{}; defer ebr_ctx.deinit(alloc); diff --git a/zig/runtime/fsm-vopr-test.zig b/zig/runtime/fsm-vopr-test.zig index 89df7187..b793bb4a 100644 --- a/zig/runtime/fsm-vopr-test.zig +++ b/zig/runtime/fsm-vopr-test.zig @@ -285,3 +285,83 @@ test "FSM VOPR: enqueue -> drain round-trip preserves active_tasks" { try std.testing.expectEqual(@as(u64, 0), sched.active_tasks.load(.monotonic)); try std.testing.expect(sched.fsm_ready_queue.len() == 0); } + +test "FSM VOPR: remote ctx slab frees drain through owner scheduler" { + const N_SEEDS = if (build_options.coverage) 2 else 32; + const OPS = if (build_options.coverage) 16 else 128; + const MAX_LIVE = 64; + + const SmallCtx = extern struct { bytes: [64]u8 }; + + var seed: u64 = 0; + while (seed < N_SEEDS) : (seed += 1) { + var global_ebr: ebr.EbrContext = .{}; + defer global_ebr.deinit(alloc); + var pool_owner = fm.StackPool.init(alloc); + defer pool_owner.deinit(); + var pool_current = fm.StackPool.init(alloc); + defer pool_current.deinit(); + + var owner = try fp.Scheduler.init(alloc, &global_ebr, &pool_owner); + defer owner.deinit(); + var current = try fp.Scheduler.init(alloc, &global_ebr, &pool_current); + defer current.deinit(); + owner.index = 0; + current.index = 1; + + var tasks: [MAX_LIVE]?*fsm.FsmTask = [_]?*fsm.FsmTask{null} ** MAX_LIVE; + var ctxs: [MAX_LIVE]?*SmallCtx = [_]?*SmallCtx{null} ** MAX_LIVE; + defer { + fp.scheduler_running = false; + for (tasks, ctxs) |task_opt, ctx_opt| { + if (task_opt) |task| { + if (ctx_opt) |ctx| owner.freeFsmCtx(SmallCtx, task, ctx); + owner.fsm_task_slab.destroy(task); + } + } + owner.drainChannels(); + } + + var prng = std.Random.DefaultPrng.init(seed); + const rng = prng.random(); + + var op: usize = 0; + while (op < OPS) : (op += 1) { + switch (rng.uintLessThan(u8, 4)) { + 0, 1 => { + var slot: ?usize = null; + for (tasks, 0..) 
|task_opt, i| { + if (task_opt == null) { + slot = i; + break; + } + } + if (slot) |i| { + const task = try owner.allocFsmTask(&Yieldy.doResume); + const ctx = try owner.allocFsmCtx(SmallCtx, task); + task.ctx = ctx; + tasks[i] = task; + ctxs[i] = ctx; + } + }, + 2 => { + const i = rng.uintLessThan(usize, MAX_LIVE); + if (tasks[i]) |task| { + if (ctxs[i]) |ctx| { + fp.active_scheduler = ¤t; + fp.scheduler_running = true; + current.freeFsmCtx(SmallCtx, task, ctx); + fp.scheduler_running = false; + ctxs[i] = null; + } + owner.fsm_task_slab.destroy(task); + tasks[i] = null; + } + }, + else => owner.drainChannels(), + } + } + + owner.drainChannels(); + } +} diff --git a/zig/runtime/fsm.zig b/zig/runtime/fsm.zig index 4a754082..4bf8edba 100644 --- a/zig/runtime/fsm.zig +++ b/zig/runtime/fsm.zig @@ -25,7 +25,6 @@ // - FSM <-> stackful await interop (handled by promise wake path) const std = @import("std"); -const ebr_mod = @import("../lib/ebr.zig"); // Comptime atomic type selection: SimAtomic in Loom mode, real atomics // otherwise. Mirrors queues.zig: loom harness exports SimAtomic so FsmTask @@ -90,6 +89,14 @@ pub const YieldReason = union(enum) { pub const ResumeFn = *const fn (*FsmTask) YieldReason; +pub const FsmCtxAllocClass = enum(u8) { + none, + slab64, + slab128, + slab256, + heap, +}; + // ----------------------------------------------------------------------------- // FsmTask // ----------------------------------------------------------------------------- @@ -120,6 +127,10 @@ pub const FsmTask = struct { status: FsmStatus = .Ready, /// Profile-only: spawn timestamp in ns. spawn_ns: u64 = 0, + /// Profile-only: generated BG/worker site id; 0 = unattributed. + profile_site_id: u32 = 0, + /// Profile-only: fiber-profile.DispatchKind enum value. + profile_dispatch: u8 = 0, /// Forward pointer to the user-owned ctx struct that holds resume /// state. FsmTask is slab-allocated separately from the ctx (mirrors /// stackful Task: slab pin protects detectCycleFsm chain walks from @@ -137,6 +148,10 @@ pub const FsmTask = struct { /// slot reused mid-walk = torn snapshot. Mirrors stackful /// Task.generation. generation: Atomic(u32) = Atomic(u32).init(0), + /// Scheduler that allocated this task's slab slot. FSM tasks may be + /// load-balanced to another scheduler, but completion must return the + /// slot to the allocating scheduler's slab. + owner_scheduler: ?*anyopaque = null, /// Non-null when blocked on IO. waiter: ?*FsmIoWaiter = null, /// Non-null when blocked on a parking-lot lock. Opaque `*WaiterNode`. @@ -182,27 +197,21 @@ pub const FsmTask = struct { /// The FSM emit synthesizes a per-ctx-type fn: /// fn destroyTask(t: *FsmTask) void { /// const c: *@This() = @ptrCast(@alignCast(t.ctx.?)); - /// c.alloc.destroy(c); + /// CheatHeader.freeFsmCtx(@This(), t, c); /// } /// and stores its pointer here at spawn time. destroy_fn: ?*const fn (*FsmTask) void = null, - /// Per-task ThreadLocalEbr slot. - /// - /// FSM tasks dispatched via spawnFsmBest can run on ANY scheduler - /// thread. If the task body calls into the EBR primitives (read, - /// update, retire) using a Runtime captured at spawn time, those - /// calls would touch the spawning thread's ThreadLocalEbr from a - /// foreign OS thread -- corrupting the non-thread-safe limbo - /// list. To prevent that, every FSM task gets its own ebr_slot, - /// registered with the global EbrContext, owned by the slot's - /// Runtime (see task_runtime), and freed on .Done dispatch. 
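// [Editor's sketch, not part of the patch] FsmCtxAllocClass above implies a
// size-class split for generated FSM ctx structs. CheatHeader.allocFsmCtx /
// freeFsmCtx are not part of this diff, so the classification below is an
// assumption based only on the enum member names.
const std = @import("std");

const CtxClassSketch = enum(u8) { none, slab64, slab128, slab256, heap };

fn classFor(comptime Ctx: type) CtxClassSketch {
    const size = @sizeOf(Ctx);
    return if (size <= 64) .slab64 else if (size <= 128) .slab128 else if (size <= 256) .slab256 else .heap;
}

test "ctx size picks the smallest fitting slab class (assumed policy)" {
    try std.testing.expectEqual(CtxClassSketch.slab64, classFor(extern struct { bytes: [64]u8 }));
    try std.testing.expectEqual(CtxClassSketch.slab256, classFor(extern struct { bytes: [200]u8 }));
    try std.testing.expectEqual(CtxClassSketch.heap, classFor(extern struct { bytes: [512]u8 }));
}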
- ebr_slot: ?*ebr_mod.ThreadLocalEbr = null, - - /// Per-task Runtime. Heap-allocated alongside ebr_slot at spawn - /// time and freed by the scheduler on .Done. The codegen binds - /// the FSM ctx's `rt` field to this pointer BEFORE spawning so - /// all EBR ops in the body route through the per-task slot. + /// Allocation class for the generated FSM ctx pointed to by `ctx`. + /// Set by CheatHeader.allocFsmCtx; consumed by freeFsmCtx. `none` + /// covers hand-written runtime tests that do not use generated ctx + /// allocation. + ctx_alloc_class: FsmCtxAllocClass = .none, + + /// Per-task Runtime shell. Heap-allocated at spawn time and freed by + /// the scheduler on .Done. The codegen binds the FSM ctx's `rt` field + /// to this pointer BEFORE spawning; MVCC uses Runtime.currentEbr() to + /// resolve the active scheduler thread's EBR slot at dispatch time. /// Lazy-imports runtime.zig in the field type so the /// runtime.zig -> scheduler.zig -> fsm.zig -> runtime.zig cycle /// resolves: only the pointer-to-Runtime is needed here, not the diff --git a/zig/runtime/lock-profile.zig b/zig/runtime/lock-profile.zig index d8e2429a..10d0230d 100644 --- a/zig/runtime/lock-profile.zig +++ b/zig/runtime/lock-profile.zig @@ -16,6 +16,7 @@ const std = @import("std"); const compat = @import("../lib/compat.zig"); +const SpinLock = @import("profile-lock.zig").SpinLock; // Profile-table size; shared default with alloc-profile / mvcc-profile. // `clear profile --profile-max=N` injects the override into the @@ -48,6 +49,7 @@ pub const LockStats = struct { }; var stats: [MAX_LOCKS]LockStats = [_]LockStats{.{}} ** MAX_LOCKS; +var mu: SpinLock = .{}; // Counts findSlot() calls that hit the saturated table. Surfaced // in the dump as a `# WARNING:` header so doctor can advise the @@ -76,6 +78,8 @@ fn findSlot(addr: usize) ?*LockStats { } pub inline fn recordAcquire(addr: usize, wait_ns: u64, contended: bool) void { + mu.lock(); + defer mu.unlock(); if (findSlot(addr)) |s| { s.acquires += 1; s.total_wait_ns += wait_ns; @@ -88,6 +92,8 @@ pub inline fn recordAcquire(addr: usize, wait_ns: u64, contended: bool) void { /// but stored in the read counters so doctor can compute read/write /// split (read-heavy → recommend @shared:versioned). pub inline fn recordReadAcquire(addr: usize, wait_ns: u64, contended: bool) void { + mu.lock(); + defer mu.unlock(); if (findSlot(addr)) |s| { s.read_acquires += 1; s.read_total_wait_ns += wait_ns; @@ -97,6 +103,8 @@ pub inline fn recordReadAcquire(addr: usize, wait_ns: u64, contended: bool) void } pub inline fn recordRelease(addr: usize, hold_ns: u64) void { + mu.lock(); + defer mu.unlock(); if (findSlot(addr)) |s| { s.total_hold_ns += hold_ns; if (hold_ns > s.max_hold_ns) s.max_hold_ns = hold_ns; @@ -108,6 +116,9 @@ pub fn dumpToEnvFile() void { const fd = compat.createFileTruncate(path_ptr) catch return; defer compat.closeFd(fd); + mu.lock(); + defer mu.unlock(); + var buf: [512]u8 = undefined; _ = compat.writeAllFd(fd, "# lock-profile v2\n") catch return; if (dropped_samples > 0) { diff --git a/zig/runtime/mvcc-profile.zig b/zig/runtime/mvcc-profile.zig index 147d4e9e..b222c80c 100644 --- a/zig/runtime/mvcc-profile.zig +++ b/zig/runtime/mvcc-profile.zig @@ -22,6 +22,7 @@ const std = @import("std"); const compat = @import("../lib/compat.zig"); +const SpinLock = @import("profile-lock.zig").SpinLock; // Profile-table size, shared across alloc-profile / lock-profile / // mvcc-profile via a single root-level override knob. 
Default 1024 @@ -58,6 +59,7 @@ pub const CellStats = struct { }; var stats: [MAX_CELLS]CellStats = [_]CellStats{.{}} ** MAX_CELLS; +var mu: SpinLock = .{}; // Counts findSlot() calls that hit the saturated table and had to // drop the sample. Surfaced in the dump as a `# WARNING:` header @@ -83,6 +85,8 @@ fn findSlot(addr: usize, struct_size: u32) ?*CellStats { } pub inline fn recordRead(addr: usize, struct_size: u32) void { + mu.lock(); + defer mu.unlock(); if (findSlot(addr, struct_size)) |s| { s.reads += 1; } @@ -93,6 +97,8 @@ pub inline fn recordRead(addr: usize, struct_size: u32) void { /// (0 for fast-path commits). `committed` distinguishes a winning /// commit from a bailed-out UpdateRetriesExhausted. pub inline fn recordUpdate(addr: usize, struct_size: u32, retries: u64, committed: bool) void { + mu.lock(); + defer mu.unlock(); if (findSlot(addr, struct_size)) |s| { s.retries += retries; if (committed) { @@ -111,6 +117,8 @@ pub inline fn recordUpdate(addr: usize, struct_size: u32, retries: u64, committe /// (multi-cell commits forbid the upgrade because AtomicPtr has /// no multi-pointer CAS). pub inline fn recordMultiCommit(addr: usize, struct_size: u32) void { + mu.lock(); + defer mu.unlock(); if (findSlot(addr, struct_size)) |s| { s.multi_commits += 1; } @@ -121,6 +129,9 @@ pub fn dumpToEnvFile() void { const fd = compat.createFileTruncate(path_ptr) catch return; defer compat.closeFd(fd); + mu.lock(); + defer mu.unlock(); + var buf: [256]u8 = undefined; _ = compat.writeAllFd(fd, "# mvcc-profile v1\n") catch return; if (dropped_samples > 0) { diff --git a/zig/runtime/profile-lock.zig b/zig/runtime/profile-lock.zig new file mode 100644 index 00000000..0c954c09 --- /dev/null +++ b/zig/runtime/profile-lock.zig @@ -0,0 +1,15 @@ +const std = @import("std"); + +pub const SpinLock = struct { + locked: std.atomic.Value(bool) = std.atomic.Value(bool).init(false), + + pub inline fn lock(self: *SpinLock) void { + while (self.locked.swap(true, .acquire)) { + std.atomic.spinLoopHint(); + } + } + + pub inline fn unlock(self: *SpinLock) void { + self.locked.store(false, .release); + } +}; diff --git a/zig/runtime/queues.zig b/zig/runtime/queues.zig index 91d8b235..74493b4b 100644 --- a/zig/runtime/queues.zig +++ b/zig/runtime/queues.zig @@ -1,7 +1,5 @@ const std = @import("std"); const fc = @import("fiber-core.zig"); -const ebr_mod = @import("../lib/ebr.zig"); -const ThreadLocalEbr = ebr_mod.ThreadLocalEbr; const Fiber = fc.Fiber; const StackSize = fc.StackSize; @@ -127,6 +125,7 @@ pub const RunQueue = struct { fn makeArray(alloc: std.mem.Allocator, log_size: u5) !*CircularArray { const size = @as(u32, 1) << log_size; const data = try alloc.alloc(Atomic(?*Task), size); + errdefer alloc.free(data); for (data) |*slot| slot.* = Atomic(?*Task).init(null); const arr = try alloc.create(CircularArray); arr.* = .{ .data = data, .mask = size - 1 }; @@ -352,6 +351,8 @@ pub const TaskConfig = struct { stack_size: StackSize = .Standard, // Default to Standard pinned: bool = false, // true = cannot be stolen by other schedulers use_arena: bool = false, // true = expose scheduler local_arena via __pinned_local_alloc (@arena BG blocks only) + profile_site_id: u32 = 0, // profile-only BG/worker site id; 0 = unattributed + profile_dispatch: u8 = 0, // profile-only: fiber-profile.DispatchKind enum value }; // ───────────────────────────────────────────────────────────────────────────── @@ -380,6 +381,7 @@ pub const Task = struct { context: ?*anyopaque = null, // 8 config: TaskConfig = .{}, // 16 (timeout u64 + 
3 bools padded) spawn_ns: u64 = 0, // 8 (profile-only) + profile_site_id: u32 = 0, // 4 (profile-only) wake_time: i64 = 0, // 8 (0 = not sleeping) // ── Group 2: cross-thread-touched atomics ─────────────────────────── @@ -455,13 +457,6 @@ pub const Task = struct { /// so future readers are not permanently blocked by a phantom writer. lock_counter_ptr: ?*u32 = null, - /// Heap-allocated ThreadLocalEbr (registered with EbrContext). - /// Allocated by the scheduler in drainChannels.Spawn on the OS thread - /// stack so that EbrContext.register's deep allocator path doesn't - /// run inside the fiber (which would overflow Standard 12 KB stacks). - /// Unregistered + freed by the scheduler in run() when task finishes. - /// entryWrapper hands this pointer to Runtime.initFromSliceWithEbr. - ebr_slot: ?*ThreadLocalEbr = null, }; pub const LOCK_KIND_NONE: u8 = 0; diff --git a/zig/runtime/runtime-footer.zig b/zig/runtime/runtime-footer.zig index b56b804b..26e51244 100644 --- a/zig/runtime/runtime-footer.zig +++ b/zig/runtime/runtime-footer.zig @@ -8,7 +8,6 @@ // Shared state (heap-allocated, outlives all threads): // - allocator // - EbrContext (thread-safe — has its own registry_lock) -// - StackPool (thread-safe — slab allocator with atomic free lists) // - shutdown (atomic bool — signals workers to exit after main) // // Per-thread: @@ -80,10 +79,7 @@ pub fn main() !void { rt.wireAllocator(); // 4. Shared infrastructure - const fm = @import("runtime/fiber-memory.zig"); const fp = @import("runtime/scheduler.zig"); - var stack_pool = fm.StackPool.init(allocator); - defer stack_pool.deinit(); // Global shutdown flag — workers check this each loop iteration. var shutdown = std.atomic.Value(bool).init(false); @@ -110,19 +106,17 @@ pub fn main() !void { const WorkerCtx = struct { allocator: std.mem.Allocator, global_ctx: *EbrContext, - stack_pool: *fm.StackPool, shutdown: *std.atomic.Value(bool), }; var worker_ctx = WorkerCtx{ .allocator = allocator, .global_ctx = &global_ctx, - .stack_pool = &stack_pool, .shutdown = &shutdown, }; const workerMain = struct { fn run(ctx: *WorkerCtx) void { - var worker_sched = fp.Scheduler.init(ctx.allocator, ctx.global_ctx, ctx.stack_pool) catch return; + var worker_sched = fp.Scheduler.init(ctx.allocator, ctx.global_ctx, null) catch return; defer worker_sched.deinit(); worker_sched.shutdown_on_idle = false; // stay alive until explicit shutdown worker_sched.global_shutdown = ctx.shutdown; @@ -147,7 +141,7 @@ pub fn main() !void { } // 7. Main scheduler (runs on the main thread). 
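+ // Each Scheduler now builds its own StackPool internally and ignores the
+ // pool argument (see Scheduler.init in scheduler.zig below), so the footer
+ // no longer creates a shared pool to pass in.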
- var sched = try fp.Scheduler.init(allocator, &global_ctx, &stack_pool); + var sched = try fp.Scheduler.init(allocator, &global_ctx, null); defer { sched.deinit(); fp.global_registry.deinit(allocator); diff --git a/zig/runtime/runtime-header-test.zig b/zig/runtime/runtime-header-test.zig index cbd51e98..7cebcc23 100644 --- a/zig/runtime/runtime-header-test.zig +++ b/zig/runtime/runtime-header-test.zig @@ -3,6 +3,7 @@ const rt_mod = @import("runtime.zig"); const fp = @import("scheduler.zig"); const qs = @import("queues.zig"); const fm = @import("fiber-memory.zig"); +const fsm = @import("fsm.zig"); const ebr = @import("../lib/ebr.zig"); const header = @import("runtime-header.zig"); const compat = @import("../lib/compat.zig"); @@ -21,6 +22,30 @@ var global_ebr_ctx: ebr.EbrContext = .{}; var global_stack_pool: fm.StackPool = undefined; var global_shutdown = std.atomic.Value(bool).init(false); +test "CheatLib.read returns immediately when fd already has bytes" { + var fds: [2]i32 = undefined; + switch (std.posix.errno(std.os.linux.socketpair(std.posix.AF.UNIX, std.posix.SOCK.STREAM, 0, &fds))) { + .SUCCESS => {}, + else => return error.Unexpected, + } + defer compat.closeFd(fds[0]); + defer compat.closeFd(fds[1]); + + const msg = "ready"; + const written = std.c.write(fds[1], msg.ptr, msg.len); + try std.testing.expect(written >= 0); + try std.testing.expectEqual(msg.len, @as(usize, @intCast(written))); + + var buf: [16]u8 = undefined; + const n = try CheatLib.read(fds[0], &buf); + try std.testing.expectEqual(msg.len, n); + try std.testing.expectEqualSlices(u8, msg, buf[0..n]); +} + +fn dummyFsmResume(_: *fsm.FsmTask) fsm.YieldReason { + return .{ .Done = {} }; +} + fn initWorkerGlobals() void { global_stack_pool = fm.StackPool.init(alloc); } @@ -57,6 +82,88 @@ fn stopWorkers(threads: []std.Thread, n: usize) void { global_shutdown.store(false, .release); } +test "FSM ctx allocation routes 64B, 128B, 256B, and oversized contexts" { + const allocator = std.testing.allocator; + + var global_ctx = ebr.EbrContext{}; + defer global_ctx.deinit(allocator); + + var stack_pool = fm.StackPool.init(allocator); + defer stack_pool.deinit(); + + var sched = try fp.Scheduler.init(allocator, &global_ctx, &stack_pool); + defer sched.deinit(); + defer fp.global_registry.deinit(allocator); + + const SmallCtx = extern struct { bytes: [64]u8 }; + const MediumCtx = extern struct { bytes: [128]u8 }; + const LargeCtx = extern struct { bytes: [256]u8 }; + const OversizedCtx = extern struct { bytes: [257]u8 }; + + const small_task = try sched.allocFsmTask(&dummyFsmResume); + defer sched.fsm_task_slab.destroy(small_task); + const small = try sched.allocFsmCtx(SmallCtx, small_task); + try std.testing.expectEqual(fsm.FsmCtxAllocClass.slab64, small_task.ctx_alloc_class); + sched.freeFsmCtx(SmallCtx, small_task, small); + try std.testing.expectEqual(fsm.FsmCtxAllocClass.none, small_task.ctx_alloc_class); + + const medium_task = try sched.allocFsmTask(&dummyFsmResume); + defer sched.fsm_task_slab.destroy(medium_task); + const medium = try sched.allocFsmCtx(MediumCtx, medium_task); + try std.testing.expectEqual(fsm.FsmCtxAllocClass.slab128, medium_task.ctx_alloc_class); + sched.freeFsmCtx(MediumCtx, medium_task, medium); + try std.testing.expectEqual(fsm.FsmCtxAllocClass.none, medium_task.ctx_alloc_class); + + const large_task = try sched.allocFsmTask(&dummyFsmResume); + defer sched.fsm_task_slab.destroy(large_task); + const large = try sched.allocFsmCtx(LargeCtx, large_task); + try 
std.testing.expectEqual(fsm.FsmCtxAllocClass.slab256, large_task.ctx_alloc_class);
+ sched.freeFsmCtx(LargeCtx, large_task, large);
+ try std.testing.expectEqual(fsm.FsmCtxAllocClass.none, large_task.ctx_alloc_class);
+
+ const oversized_task = try sched.allocFsmTask(&dummyFsmResume);
+ defer sched.fsm_task_slab.destroy(oversized_task);
+ const oversized = try sched.allocFsmCtx(OversizedCtx, oversized_task);
+ try std.testing.expectEqual(fsm.FsmCtxAllocClass.heap, oversized_task.ctx_alloc_class);
+ sched.freeFsmCtx(OversizedCtx, oversized_task, oversized);
+ try std.testing.expectEqual(fsm.FsmCtxAllocClass.none, oversized_task.ctx_alloc_class);
+}
+
+test "FSM ctx slab free routes back to owner scheduler" {
+ const allocator = std.testing.allocator;
+
+ var global_ctx = ebr.EbrContext{};
+ defer global_ctx.deinit(allocator);
+
+ var pool_a = fm.StackPool.init(allocator);
+ defer pool_a.deinit();
+ var pool_b = fm.StackPool.init(allocator);
+ defer pool_b.deinit();
+
+ var owner = try fp.Scheduler.init(allocator, &global_ctx, &pool_a);
+ defer owner.deinit();
+ var current = try fp.Scheduler.init(allocator, &global_ctx, &pool_b);
+ defer current.deinit();
+ defer fp.global_registry.deinit(allocator);
+ owner.index = 0;
+ current.index = 1;
+
+ const SmallCtx = extern struct { bytes: [256]u8 };
+ const task = try owner.allocFsmTask(&dummyFsmResume);
+ defer owner.fsm_task_slab.destroy(task);
+ const ctx = try owner.allocFsmCtx(SmallCtx, task);
+ task.ctx = ctx;
+ try std.testing.expectEqual(fsm.FsmCtxAllocClass.slab256, task.ctx_alloc_class);
+
+ fp.active_scheduler = &current;
+ fp.scheduler_running = true;
+ current.freeFsmCtx(SmallCtx, task, ctx);
+ fp.scheduler_running = false;
+
+ try std.testing.expectEqual(fsm.FsmCtxAllocClass.none, task.ctx_alloc_class);
+ owner.drainChannels();
+}
+
// This is the function the Fiber will run
fn fiberFfiTask(rt: *Runtime, _: ?*anyopaque) anyerror!void {
std.debug.print("\n[Fiber] Entering FFI Task. 
Current PID: {d}", .{c.getpid()}); diff --git a/zig/runtime/runtime-header.zig b/zig/runtime/runtime-header.zig index f9907cac..b31b03af 100644 --- a/zig/runtime/runtime-header.zig +++ b/zig/runtime/runtime-header.zig @@ -145,6 +145,7 @@ pub const CheatLib = struct { rt: *Runtime, items: *[N]Promise(T), workers: usize, + batch: usize, parallel: bool, task_cfg: fp.TaskConfig, user_ctx: ?*anyopaque, @@ -166,7 +167,7 @@ pub const CheatLib = struct { cleanup(R, alloc_, ptr); } }.cleanupResult, - alloc, rt, items, workers, parallel, task_cfg, user_ctx + alloc, rt, items, workers, batch, parallel, task_cfg, user_ctx ); } @@ -178,6 +179,7 @@ pub const CheatLib = struct { rt: *Runtime, items: *[N]Promise(T), workers: usize, + batch: usize, parallel: bool, task_cfg: fp.TaskConfig, user_ctx: ?*anyopaque, @@ -199,7 +201,7 @@ pub const CheatLib = struct { cleanup(T, alloc_, ptr); } }.cleanupItem, - alloc, rt, items, workers, parallel, task_cfg, user_ctx + alloc, rt, items, workers, batch, parallel, task_cfg, user_ctx ); } @@ -210,6 +212,7 @@ pub const CheatLib = struct { rt: *Runtime, items: *[N]Promise(T), workers: usize, + batch: usize, parallel: bool, task_cfg: fp.TaskConfig, user_ctx: ?*anyopaque, @@ -226,7 +229,7 @@ pub const CheatLib = struct { try CheatLib.spawnBest(@intFromPtr(&Runtime.entryWrapper), user_fn, args, config); } }.parallelSpawn, - rt, items, workers, parallel, task_cfg, user_ctx + rt, items, workers, batch, parallel, task_cfg, user_ctx ); } @@ -243,6 +246,7 @@ pub const CheatLib = struct { src: anytype, workers: usize, capacity: usize, + batch: usize, parallel: bool, task_cfg: fp.TaskConfig, user_ctx: ?*anyopaque, @@ -264,7 +268,7 @@ pub const CheatLib = struct { cleanup(R, alloc_, ptr); } }.cleanupResult, - is_inf, alloc, rt, src, workers, capacity, parallel, task_cfg, user_ctx + is_inf, alloc, rt, src, workers, capacity, batch, parallel, task_cfg, user_ctx ); } @@ -277,6 +281,7 @@ pub const CheatLib = struct { src: anytype, workers: usize, capacity: usize, + batch: usize, parallel: bool, task_cfg: fp.TaskConfig, user_ctx: ?*anyopaque, @@ -298,7 +303,7 @@ pub const CheatLib = struct { cleanup(T, alloc_, ptr); } }.cleanupItem, - is_inf, alloc, rt, src, workers, capacity, parallel, task_cfg, user_ctx + is_inf, alloc, rt, src, workers, capacity, batch, parallel, task_cfg, user_ctx ); } @@ -311,6 +316,7 @@ pub const CheatLib = struct { src: anytype, workers: usize, capacity: usize, + batch: usize, parallel: bool, task_cfg: fp.TaskConfig, user_ctx: ?*anyopaque, @@ -327,7 +333,7 @@ pub const CheatLib = struct { try CheatLib.spawnBest(@intFromPtr(&Runtime.entryWrapper), user_fn, args, config); } }.parallelSpawn, - is_inf, alloc, rt, src, workers, capacity, parallel, task_cfg, user_ctx + is_inf, alloc, rt, src, workers, capacity, batch, parallel, task_cfg, user_ctx ); } @@ -342,6 +348,7 @@ pub const CheatLib = struct { rt: *Runtime, items: []const T, workers: usize, + batch: usize, parallel: bool, task_cfg: fp.TaskConfig, user_ctx: ?*anyopaque, @@ -363,7 +370,7 @@ pub const CheatLib = struct { cleanup(R, alloc_, ptr); } }.cleanupResult, - alloc, rt, items, workers, parallel, task_cfg, user_ctx + alloc, rt, items, workers, batch, parallel, task_cfg, user_ctx ); } @@ -374,6 +381,7 @@ pub const CheatLib = struct { rt: *Runtime, items: []const T, workers: usize, + batch: usize, parallel: bool, task_cfg: fp.TaskConfig, user_ctx: ?*anyopaque, @@ -395,7 +403,7 @@ pub const CheatLib = struct { cleanup(T, alloc_, ptr); } }.cleanupItem, - alloc, rt, items, workers, parallel, task_cfg, user_ctx + 
alloc, rt, items, workers, batch, parallel, task_cfg, user_ctx ); } @@ -405,6 +413,7 @@ pub const CheatLib = struct { rt: *Runtime, items: []const T, workers: usize, + batch: usize, parallel: bool, task_cfg: fp.TaskConfig, user_ctx: ?*anyopaque, @@ -421,7 +430,7 @@ pub const CheatLib = struct { try CheatLib.spawnBest(@intFromPtr(&Runtime.entryWrapper), user_fn, args, config); } }.parallelSpawn, - rt, items, workers, parallel, task_cfg, user_ctx + rt, items, workers, batch, parallel, task_cfg, user_ctx ); } @@ -431,6 +440,7 @@ pub const CheatLib = struct { rt: *Runtime, items: []T, workers: usize, + batch: usize, parallel: bool, task_cfg: fp.TaskConfig, user_ctx: ?*anyopaque, @@ -447,7 +457,7 @@ pub const CheatLib = struct { try CheatLib.spawnBest(@intFromPtr(&Runtime.entryWrapper), user_fn, args, config); } }.parallelSpawn, - rt, items, workers, parallel, task_cfg, user_ctx + rt, items, workers, batch, parallel, task_cfg, user_ctx ); } @@ -493,9 +503,34 @@ pub const CheatLib = struct { }; } - // Read from a socket via io_uring IORING_OP_RECV. - // Submits a single recv and yields; CQE result is the byte count. + // Read from a non-blocking socket. + // + // Fast path: try a direct read first. Hot loopback/socket workloads often + // have bytes ready already, and paying an io_uring submission + yield for + // every ready read is far more expensive than the syscall itself. + // + // Slow path: if the fd would block, submit IORING_OP_RECV and yield. This + // preserves the completion-based path needed for streaming/parked fibers. pub noinline fn read(fd: i32, buffer: []u8) !usize { + const n = std.posix.read(fd, buffer) catch |err| { + if (err != error.WouldBlock) return err; + if (!fp.scheduler_running) return err; + + const sched = fp.active_scheduler; + const task = sched.getCurrent(); + var waiter = fp.Scheduler.IoWaiter{ .task = task }; + try sched.submitRecv(&waiter, fd, buffer); + task.base.yield(); + if (waiter.result < 0) return fp.Scheduler.ioError(waiter.result); + return @intCast(waiter.result); + }; + return n; + } + + // Force completion-based socket read. Kept separate so streaming code can + // opt into the one-SQE/one-yield path explicitly instead of penalizing + // ready-socket hot paths. + pub noinline fn readAsync(fd: i32, buffer: []u8) !usize { const sched = fp.active_scheduler; const task = sched.getCurrent(); var waiter = fp.Scheduler.IoWaiter{ .task = task }; @@ -1627,6 +1662,25 @@ pub const CheatLib = struct { return -1; } + /// Peak virtual memory size (VmPeak) in KB, from /proc/self/status. + pub fn peakVirtualMemoryKb() i64 { + const fd = openPathFd("/proc/self/status", .{ .ACCMODE = .RDONLY, .CLOEXEC = true }, 0) catch return -1; + defer compat.closeFd(fd); + var buf: [4096]u8 = undefined; + const n = std.posix.read(fd, &buf) catch return -1; + const content = buf[0..n]; + if (std.mem.indexOf(u8, content, "VmPeak:")) |pos| { + var i = pos + 7; + while (i < content.len and (content[i] == ' ' or content[i] == '\t')) : (i += 1) {} + var val: i64 = 0; + while (i < content.len and content[i] >= '0' and content[i] <= '9') : (i += 1) { + val = val * 10 + @as(i64, content[i] - '0'); + } + return val; + } + return -1; + } + /// Current resident set size (VmRSS) in KB, from /proc/self/status. pub fn currentMemoryKb() i64 { const fd = openPathFd("/proc/self/status", .{ .ACCMODE = .RDONLY, .CLOEXEC = true }, 0) catch return -1; @@ -1646,6 +1700,36 @@ pub const CheatLib = struct { return -1; } + /// Fault pages in the current fiber's allocated stack slice. 
+ /// This is benchmark-only plumbing: it makes resident memory reflect the + /// selected stack tier instead of only the pages naturally reached by a + /// tiny stack frame. It is a no-op on the root stack and for FSM tasks. + pub fn touchCurrentFiberStack(bytes_i64: i64, seed: i64) i64 { + const fiber = fc.__fiber orelse return seed; + if (bytes_i64 <= 0) return seed; + + const requested: usize = @intCast(bytes_i64); + const stack = fiber.stack.memory; + const bytes = @min(requested, stack.len); + if (bytes == 0) return seed; + + var acc: u64 = @bitCast(seed); + const start = stack.len - bytes; + var offset: usize = 0; + while (offset < bytes) : (offset += 4096) { + const idx = start + offset; + const value: u8 = @truncate(acc +% @as(u64, @intCast(offset))); + stack[idx] = value; + acc +%= stack[idx]; + } + + const last = stack.len - 1; + const last_value: u8 = @truncate(acc +% @as(u64, @intCast(bytes))); + stack[last] = last_value; + acc +%= stack[last]; + return @bitCast(acc); + } + // ----------------------------------------------------------------- // Random // ----------------------------------------------------------------- @@ -3547,21 +3631,35 @@ pub fn allocFsmTask(parent_rt: *Runtime, resume_fn: fp.ResumeFn) !*fp.FsmTask { return parent_rt.getSched().allocFsmTask(resume_fn); } -/// Allocate a per-FSM-task Runtime + ThreadLocalEbr. -/// -/// FSM tasks dispatched via spawnFsmBest can run on any scheduler thread. -/// If they share the spawning fiber's Runtime, EBR enter/exit/retire calls -/// in the body would touch the spawning thread's ThreadLocalEbr from a -/// foreign OS thread -- corrupting the non-thread-safe limbo list and -/// surfacing later as `realloc(): invalid old size` from glibc. +/// Allocate a generated FSM context using the scheduler-local ctx slabs +/// for the common small cases (64 B / 128 B), falling back to heap for +/// larger or over-aligned contexts. The allocation class is recorded on +/// the FsmTask so destroy can route the free back to the owning scheduler. +pub fn allocFsmCtx(comptime T: type, parent_rt: *Runtime, fsm_task: *fp.FsmTask) !*T { + return parent_rt.getSched().allocFsmCtx(T, fsm_task); +} + +/// Free a generated FSM context through the allocation class recorded by +/// allocFsmCtx. This is called by generated destroyTask after it has run +/// type-specific cleanup. +pub fn freeFsmCtx(comptime T: type, fsm_task: *fp.FsmTask, ctx: *T) void { + const owner: *fp.Scheduler = if (fsm_task.owner_scheduler) |raw| + @ptrCast(@alignCast(raw)) + else + fp.active_scheduler; + const current = if (fp.scheduler_running) fp.active_scheduler else owner; + current.freeFsmCtx(T, fsm_task, ctx); +} + +/// Allocate a per-FSM-task Runtime shell. /// -/// The fix: every FSM task gets its own Runtime backed by its own -/// ThreadLocalEbr slot (registered with the EbrContext), so all EBR -/// operations in the body route through a slot dedicated to that task. +/// FSM tasks still need their own Runtime pointer because generated FSM +/// contexts store `rt` directly, but EBR no longer lives on the task. MVCC +/// operations call Runtime.currentEbr(), which resolves to the active +/// scheduler thread's registered EBR slot at dispatch time. /// -/// Lifecycle: this fn allocates ebr_slot + Runtime, registers the slot -/// with EbrContext, and stashes both pointers on `task.ebr_slot` / -/// `task.task_runtime`. The scheduler frees them in `releaseFsmTaskEbr` +/// Lifecycle: this fn allocates Runtime and stashes it on +/// `task.task_runtime`. 
The scheduler frees it in `releaseFsmTaskEbr` /// after the task reaches .Done. /// /// Caller MUST invoke this BEFORE submitting the task (spawnFsmBest / @@ -3574,22 +3672,17 @@ pub fn allocFsmTask(parent_rt: *Runtime, resume_fn: fp.ResumeFn) !*fp.FsmTask { /// 6. try CheatHeader.spawnFsmBest(&ctx.task); pub fn allocFsmTaskRuntime(fsm_task: *fp.FsmTask, parent_rt: *Runtime) !*Runtime { const allocator = parent_rt.heap_allocator; - const sched = parent_rt.getSched(); - - const ebr_ptr = try sched.allocEbrSlot(); - errdefer sched.releaseEbrSlot(ebr_ptr); const rt_ptr = try allocator.create(Runtime); errdefer allocator.destroy(rt_ptr); - // Build a minimal Runtime backed by ebr_ptr. No frame slice -- the FSM - // body uses its own ctx for state; if it calls frameAlloc via a deep - // path, that lands in the lazy-heap arena (initFromSliceWithEbr with - // an empty frame slice). The codegen does NOT rely on the per-task - // Runtime owning frame memory; only its ebr matters for MVCC ops. - rt_ptr.* = try Runtime.initFromSliceWithEbr(&[_]u8{}, ebr_ptr, allocator, 0); + // Build a minimal Runtime. No frame slice -- the FSM body uses its own + // ctx for state; if it calls frameAlloc via a deep path, that lands in + // the lazy-heap arena. The ebr pointer is only the non-scheduler + // fallback; under scheduler dispatch Runtime.currentEbr() returns the + // active scheduler's thread_ebr. + rt_ptr.* = try Runtime.initFromSliceWithEbr(&[_]u8{}, parent_rt.ebr, allocator, 0); rt_ptr.wireAllocator(); - fsm_task.ebr_slot = ebr_ptr; fsm_task.task_runtime = rt_ptr; return rt_ptr; } diff --git a/zig/runtime/runtime.zig b/zig/runtime/runtime.zig index 7cfea6ad..ccd1ee7a 100644 --- a/zig/runtime/runtime.zig +++ b/zig/runtime/runtime.zig @@ -528,6 +528,16 @@ pub const Runtime = struct { return fp.active_scheduler; } + /// EBR participant for the currently executing thread. Scheduler tasks + /// use the active scheduler's registered slot; direct/non-scheduler code + /// keeps using the Runtime-owned slot. + pub inline fn currentEbr(self: *Runtime) *ThreadLocalEbr { + if (fp.scheduler_running) { + return fp.active_scheduler.thread_ebr; + } + return self.ebr; + } + // Cooperative yield check — injected at the back-edge of every non-TIGHT while loop. // Uses a power-of-two counter so the hot path is: wrapping-add + AND + compare-zero. // Yields to the scheduler only when another fiber is ready; single-fiber programs pay @@ -623,13 +633,12 @@ pub const Runtime = struct { else full_stack_memory[0..0]; // empty slice - arena will use heap lazily - // Use the ThreadLocalEbr the scheduler pre-allocated and - // registered with EbrContext on its OS thread stack. Doing the - // EbrContext.register() here would overflow Standard fiber - // stacks (testing.allocator's append chain costs ~2-3 KB). + // MVCC/AtomicPtr access uses Runtime.currentEbr(), which resolves to + // the active scheduler's per-thread EBR slot. The runtime's fallback + // ebr pointer is only used outside scheduler execution. 
var rt = Runtime.initFromSliceWithEbr( frame_slice, - task.ebr_slot.?, + sched.thread_ebr, sched.allocator, task.config.timeout_ms ) catch unreachable; diff --git a/zig/runtime/scheduler-direct-test.zig b/zig/runtime/scheduler-direct-test.zig index b3ff7e2a..5453aa5b 100644 --- a/zig/runtime/scheduler-direct-test.zig +++ b/zig/runtime/scheduler-direct-test.zig @@ -103,7 +103,7 @@ test "Scheduler.submitSpawn queues one task after drainChannels" { const task = sched.ready_queue.pop().?; _ = sched.active_tasks.fetchSub(1, .monotonic); sched.releaseTaskEbr(task); - stack_pool.free(task.base.stack.memory); + sched.freeStack(task.base.stack); alloc.destroy(task.base); sched.task_slab.destroy(task); } diff --git a/zig/runtime/scheduler.zig b/zig/runtime/scheduler.zig index c773c2ce..136829ae 100644 --- a/zig/runtime/scheduler.zig +++ b/zig/runtime/scheduler.zig @@ -22,6 +22,11 @@ const EbrContext = ebr_mod.EbrContext; const ThreadLocalEbr = ebr_mod.ThreadLocalEbr; const SlabAllocator = @import("slab-alloc.zig").SlabAllocator; +const Atomic = blk: { + const root = @import("root"); + break :blk if (@hasDecl(root, "SimAtomic")) root.SimAtomic else std.atomic.Value; +}; + fn milliTimestamp() i64 { return compat.milliTimestamp(); } @@ -153,9 +158,22 @@ pub const RemoteCompletion = struct { // A thread-safe wake-up signal pub const SmartEventFd = struct { + const WakeEmpty: u32 = 0; + const WakeParked: u32 = 1; + const WakeNotified: u32 = 2; + fd: i32, - // 0 = Awake (Busy processing), 1 = Sleeping (Waiting on io_uring) - state: std.atomic.Value(u32) = std.atomic.Value(u32).init(0), + + // Parker state for cross-scheduler wake coalescing: + // Empty -- scheduler is awake or no wake token is pending + // Parked -- scheduler is about to block / blocked in io_uring + // Notified -- one wake token is pending + // + // Producers swap in Notified after enqueueing work. Only the producer + // that observes Parked writes eventfd; producers that observe Empty or + // Notified rely on the scheduler's next prepareSleep() consuming the + // token or on the already-pending eventfd wake. + state: Atomic(u32) = Atomic(u32).init(WakeEmpty), pub fn init() !SmartEventFd { // EFD_SEMAPHORE: Reads decrement counter by 1. @@ -169,16 +187,18 @@ pub const SmartEventFd = struct { compat.closeFd(self.fd); } - // HOT PATH: This is what makes it fast! + /// Record a wake token. Returns true only when the target scheduler + /// was already parked and therefore needs a kernel eventfd write. + pub fn armNotify(self: *SmartEventFd) bool { + const old = self.state.swap(WakeNotified, .acq_rel); + return old == WakeParked; + } + + // HOT PATH: coalesces wakeups in userspace. Only the Empty -> Notified + // transition is recorded when the scheduler is awake; only Parked -> + // Notified performs the eventfd write needed to wake io_uring. pub fn notify(self: *SmartEventFd) void { - // Unconditionally write to eventfd. The previous optimization - // (skip write when target appears awake) raced with the target's - // markSleeping/hasChannelMessages/poll sequence, causing missed - // wakeups that deadlocked pinned fiber yield-poll loops. - // - // Cost: ~200ns write() syscall per notify. Acceptable because - // notify is called once per submitSpawn/submitResume/sendAndWait, - // each of which already costs 1-10us for SPSC push + channel drain. 
+ if (!self.armNotify()) return; const val: u64 = 1; const bytes = std.mem.asBytes(&val); _ = std.c.write(self.fd, bytes.ptr, bytes.len); @@ -192,14 +212,31 @@ pub const SmartEventFd = struct { _ = std.posix.read(self.fd, buf) catch {}; } - // Called before entering io_uring wait - pub fn markSleeping(self: *SmartEventFd) void { - self.state.store(1, .seq_cst); + /// Prepare to block in io_uring. Returns false when a producer already + /// left a wake token while we were awake; the scheduler must not sleep + /// and should loop back to drain queues. + pub fn prepareSleep(self: *SmartEventFd) bool { + while (true) { + const old = self.state.load(.acquire); + switch (old) { + WakeNotified => { + if (self.state.cmpxchgWeak(WakeNotified, WakeEmpty, .acq_rel, .acquire) == null) + return false; + }, + WakeEmpty => { + if (self.state.cmpxchgWeak(WakeEmpty, WakeParked, .acq_rel, .acquire) == null) + return true; + }, + WakeParked => return true, + else => unreachable, + } + } } - // Called immediately after exiting io_uring wait - pub fn markAwake(self: *SmartEventFd) void { - self.state.store(0, .seq_cst); + // Called when the scheduler decides not to sleep after prepareSleep() + // or immediately after io_uring returns. + pub fn finishSleep(self: *SmartEventFd) void { + _ = self.state.swap(WakeEmpty, .acq_rel); } }; @@ -246,7 +283,7 @@ pub const Scheduler = struct { dirty_mask: std.atomic.Value(u64) = std.atomic.Value(u64).init(0), /// Re-entrancy guard for drainChannels (prevents RemoteCall → map.put → sendAndWait → drainChannels) draining: bool = false, - stack_pool: *StackPool, // GLOBAL Stack Cache + stack_pool: StackPool, event_fd: SmartEventFd, load: std.atomic.Value(isize) = std.atomic.Value(isize).init(0), global_shutdown: ?*std.atomic.Value(bool) = null, @@ -254,6 +291,9 @@ pub const Scheduler = struct { // 3. IO & Memory allocator: std.mem.Allocator, global_ebr: *EbrContext, + /// One EBR participant per scheduler OS thread. Tasks borrow this + /// through Runtime.currentEbr(); they do not allocate EBR slots. + thread_ebr: *ThreadLocalEbr, /// Per-scheduler slab allocator for Task structs. Tasks live in /// page-aligned slabs, which (a) lets walkers compute the owning /// slab from a *Task via address arithmetic in Phase 3 (cycle-detect @@ -271,6 +311,13 @@ pub const Scheduler = struct { /// Slab size: 64 KB (power-of-two, ≈800 FsmTasks per slab at /// ~80 B each). fsm_task_slab: SlabAllocator(fsm_mod.FsmTask), + /// Per-scheduler slabs for generated FSM context payloads. The + /// compiler/runtime route <=64 B, <=128 B, and <=256 B contexts here; larger + /// contexts stay explicit heap until @fsm:heap / @stack policy is + /// fully surfaced in the language. + fsm_ctx_64_slab: SlabAllocator(FsmCtx64), + fsm_ctx_128_slab: SlabAllocator(FsmCtx128), + fsm_ctx_256_slab: SlabAllocator(FsmCtx256), // 4a. io_uring — unified I/O ring for poll-based socket I/O, async file // I/O, and eventfd wakeups. In Loom mode, this is SimRing. @@ -347,7 +394,12 @@ pub const Scheduler = struct { // Not used by default — only when the CLEAR programmer opts in with @arena. 
local_arena: std.heap.ArenaAllocator, - pub fn init(allocator: std.mem.Allocator, global_ebr: *EbrContext, stack_pool: *StackPool) !Scheduler { + pub const FsmCtx64 = extern struct { bytes: [64]u8 }; + pub const FsmCtx128 = extern struct { bytes: [128]u8 }; + pub const FsmCtx256 = extern struct { bytes: [256]u8 }; + + pub fn init(allocator: std.mem.Allocator, global_ebr: *EbrContext, unused_shared_stack_pool: anytype) !Scheduler { + _ = unused_shared_stack_pool; const efd = try SmartEventFd.init(); // io_uring ring for all I/O: poll-based socket I/O, async file reads, @@ -365,8 +417,14 @@ pub const Scheduler = struct { _ = try ring.submit(); } + const thread_ebr = try allocator.create(ThreadLocalEbr); + errdefer allocator.destroy(thread_ebr); + thread_ebr.* = .{ .context = global_ebr }; + try global_ebr.register(allocator, thread_ebr); + errdefer global_ebr.unregister(thread_ebr); + const sched = Scheduler{ - .stack_pool = stack_pool, + .stack_pool = StackPool.init(allocator), .fiber_pool = .empty, .ready_queue = try RunQueue.initWithAllocator(allocator), .fsm_ready_queue = try FsmRunQueue.initWithAllocator(allocator), @@ -378,11 +436,15 @@ pub const Scheduler = struct { .load = std.atomic.Value(isize).init(0), .allocator = allocator, .global_ebr = global_ebr, + .thread_ebr = thread_ebr, // Power-of-two slab size required by SlabAllocator; 64 KB hits // the sweet spot for current Task footprint (~192 B incl. // cache-line padding) — ≈330 tasks per slab. .task_slab = SlabAllocator(Task).init(allocator, 64 * 1024), .fsm_task_slab = SlabAllocator(fsm_mod.FsmTask).init(allocator, 64 * 1024), + .fsm_ctx_64_slab = SlabAllocator(FsmCtx64).init(allocator, 64 * 1024), + .fsm_ctx_128_slab = SlabAllocator(FsmCtx128).init(allocator, 64 * 1024), + .fsm_ctx_256_slab = SlabAllocator(FsmCtx256).init(allocator, 64 * 1024), .ring = ring, .io_helper_stack = try allocator.alloc(u8, IO_HELPER_STACK_SIZE), .main_ctx = undefined, @@ -407,7 +469,7 @@ pub const Scheduler = struct { for (q.items) |task| { self.releaseTaskEbr(task); if (task.base.stack.memory.len > 0) { - self.freeStack(task.base.stack.memory); + self.freeStack(task.base.stack); } self.allocator.destroy(task.base); // Free Fiber self.task_slab.destroy(task); // Free Task Struct @@ -438,7 +500,7 @@ pub const Scheduler = struct { const task_opt = self.ready_queue.getBuffer()[i & self.ready_queue.getMask()].load(.monotonic); if (task_opt) |task| { self.releaseTaskEbr(task); - self.freeStack(task.base.stack.memory); + self.freeStack(task.base.stack); self.allocator.destroy(task.base); self.task_slab.destroy(task); } @@ -446,7 +508,7 @@ pub const Scheduler = struct { self.ready_queue.deinit(); for (self.pinned_queue.items) |task| { self.releaseTaskEbr(task); - self.freeStack(task.base.stack.memory); + self.freeStack(task.base.stack); self.allocator.destroy(task.base); self.task_slab.destroy(task); } @@ -457,7 +519,9 @@ pub const Scheduler = struct { self.fsm_ready_queue.deinit(); self.fsm_deferred_queue.deinit(self.allocator); + self.drainChannels(); self.stack_pool.flushLocalCache(); + self.stack_pool.deinit(); self.stack_cache.deinit(self.allocator); for (&self.channels) |*ch| { if (ch.load(.acquire)) |ring| self.allocator.destroy(ring); @@ -472,59 +536,28 @@ pub const Scheduler = struct { // their memory back to the general allocator. 
self.task_slab.deinit(); self.fsm_task_slab.deinit(); + self.fsm_ctx_64_slab.deinit(); + self.fsm_ctx_128_slab.deinit(); + self.fsm_ctx_256_slab.deinit(); + self.global_ebr.unregister(self.thread_ebr); + self.thread_ebr.deinit(self.allocator); + self.allocator.destroy(self.thread_ebr); } - /// Allocate + register a per-task ThreadLocalEbr from the heap. - /// Used by both stackful Task spawn (drainChannels.Spawn) and FSM - /// task spawn (CheatHeader.allocFsmTaskRuntime). Caller owns the - /// returned slot until releaseEbrSlot is called. - pub fn allocEbrSlot(self: *Scheduler) !*ThreadLocalEbr { - const ebr_ptr = try self.allocator.create(ThreadLocalEbr); - errdefer self.allocator.destroy(ebr_ptr); - ebr_ptr.* = .{ .context = self.global_ebr }; - try self.global_ebr.register(self.allocator, ebr_ptr); - return ebr_ptr; - } - - /// Unregister + deinit + free a ThreadLocalEbr previously allocated - /// by allocEbrSlot. Shared by releaseTaskEbr (stackful) and - /// releaseFsmTaskEbr (FSM). - pub fn releaseEbrSlot(self: *Scheduler, ebr: *ThreadLocalEbr) void { - self.global_ebr.unregister(ebr); - ebr.deinit(self.allocator); - self.allocator.destroy(ebr); - } - - /// Unregister + free a task's heap-allocated ThreadLocalEbr if any. - /// Used both by the .Finished path during normal task completion AND - /// by deinit() to clean up tasks left behind in fiber_pool / - /// sleeping_queue / ready_queue / pinned_queue at shutdown. - /// Public so test runners with their own .Finished handling - /// (stream-test, steal-hammer-test) can call this before destroy. + /// Compatibility hook for test runners with custom .Finished handling. + /// Tasks no longer own EBR slots; the scheduler thread owns one slot. pub fn releaseTaskEbr(self: *Scheduler, task: *Task) void { - if (task.ebr_slot) |e| { - self.releaseEbrSlot(e); - task.ebr_slot = null; - } + _ = self; + _ = task; } - /// FSM analog of releaseTaskEbr. The FSM allocator helper - /// (CheatHeader.allocFsmTaskRuntime) heap-allocates a per-task - /// ThreadLocalEbr + Runtime and stashes the pointers as opaques on - /// the FsmTask. drainFsmQueue's .Done branch invokes this BEFORE - /// destroy_fn, so that even if destroy_fn frees the ctx struct (and - /// with it the FsmTask body), the per-task Runtime + ebr_slot are - /// cleanly torn down first. + /// Release the per-FSM Runtime shell before destroy_fn frees the ctx. 
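+ /// FSM tasks no longer own EBR slots; despite its name, this helper only
+ /// tears down the per-task Runtime shell stored in `task_runtime`.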
pub fn releaseFsmTaskEbr(self: *Scheduler, task: *fsm_mod.FsmTask) void { if (task.task_runtime) |rt_ptr| { rt_ptr.deinit(); self.allocator.destroy(rt_ptr); task.task_runtime = null; } - if (task.ebr_slot) |ebr_ptr| { - self.releaseEbrSlot(ebr_ptr); - task.ebr_slot = null; - } } /// Allocate a fresh FsmTask from `fsm_task_slab`, bump its @@ -554,12 +587,88 @@ pub const Scheduler = struct { t.lock_wait_start_ms.store(0, .release); t.fsm_wake_time = 0; t.destroy_fn = null; - t.ebr_slot = null; t.task_runtime = null; t.generation.store(prev_gen +% 1, .release); + t.owner_scheduler = self; return t; } + pub fn allocFsmCtx(self: *Scheduler, comptime T: type, task: *fsm_mod.FsmTask) !*T { + const size = @sizeOf(T); + const alignment = @alignOf(T); + const ptr = if (comptime size <= 64 and alignment <= 16) blk: { + const slot = try self.fsm_ctx_64_slab.create(); + task.ctx_alloc_class = .slab64; + break :blk @as(*T, @ptrCast(@alignCast(slot))); + } else if (comptime size <= 128 and alignment <= 16) blk: { + const slot = try self.fsm_ctx_128_slab.create(); + task.ctx_alloc_class = .slab128; + break :blk @as(*T, @ptrCast(@alignCast(slot))); + } else if (comptime size <= 256 and alignment <= 16) blk: { + const slot = try self.fsm_ctx_256_slab.create(); + task.ctx_alloc_class = .slab256; + break :blk @as(*T, @ptrCast(@alignCast(slot))); + } else blk: { + const heap_ptr = try self.allocator.create(T); + task.ctx_alloc_class = .heap; + break :blk heap_ptr; + }; + task.owner_scheduler = self; + return ptr; + } + + pub fn freeFsmCtx(self: *Scheduler, comptime T: type, task: *fsm_mod.FsmTask, ctx: *T) void { + const class = task.ctx_alloc_class; + const owner: *Scheduler = if (task.owner_scheduler) |raw| + @ptrCast(@alignCast(raw)) + else + self; + + if (owner != self and (class == .slab64 or class == .slab128 or class == .slab256)) { + self.submitRemoteFsmCtxFree(owner, class, @intFromPtr(ctx)); + task.ctx_alloc_class = .none; + task.ctx = null; + return; + } + + switch (class) { + .none => {}, + .slab64 => self.fsm_ctx_64_slab.destroy(@as(*FsmCtx64, @ptrCast(@alignCast(ctx)))), + .slab128 => self.fsm_ctx_128_slab.destroy(@as(*FsmCtx128, @ptrCast(@alignCast(ctx)))), + .slab256 => self.fsm_ctx_256_slab.destroy(@as(*FsmCtx256, @ptrCast(@alignCast(ctx)))), + .heap => owner.allocator.destroy(ctx), + } + task.ctx_alloc_class = .none; + task.ctx = null; + } + + fn submitRemoteFsmCtxFree( + self: *Scheduler, + owner: *Scheduler, + class: fsm_mod.FsmCtxAllocClass, + ptr: usize, + ) void { + const sender_idx = if (scheduler_running) active_scheduler.index else self.index; + std.debug.assert(sender_idx < owner.channels.len); + const ring = owner.ensureChannel(sender_idx) catch { + @panic("failed to allocate remote FSM ctx-free channel"); + }; + const msg = SpscMessage{ + .tag = .RemoteFsmCtxFree, + .fsm_ctx_ptr = ptr, + .fsm_ctx_class = @intFromEnum(class), + }; + while (!ring.push(msg)) { + if (scheduler_running) { + active_scheduler.drainChannels(); + } + std.Thread.yield() catch {}; + } + const bit = @as(u64, 1) << @intCast(sender_idx); + const old_dirty = owner.dirty_mask.fetchOr(bit, .release); + if ((old_dirty & bit) == 0) owner.event_fd.notify(); + } + // ------------------------------------------------------------ // Memory Management // ------------------------------------------------------------ @@ -576,7 +685,7 @@ pub const Scheduler = struct { // HOT PATH: Freeing a stack. // Standard-sized stacks are kept in the L1 cache for fast reuse. // All other sizes are returned directly to the pool slab. 
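+ // Renamed: the public freeStack() below is now owner-aware and routes
+ // frees of stacks allocated by another scheduler through that scheduler's
+ // SPSC channel; this helper only touches the local cache and pool.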
- fn freeStack(self: *Scheduler, stack: []u8) void { + fn freeLocalStackMemory(self: *Scheduler, stack: []u8) void { if (stack.len == STANDARD_STACK_SIZE and self.stack_cache.items.len < STACK_CACHE_LIMIT) { self.stack_cache.append(self.allocator, stack) catch { self.stack_pool.free(stack); @@ -586,6 +695,40 @@ pub const Scheduler = struct { } } + pub fn freeStack(self: *Scheduler, stack: Stack) void { + const owner = if (stack.owner) |raw| + @as(*Scheduler, @ptrCast(@alignCast(raw))) + else + self; + if (owner == self) { + self.freeLocalStackMemory(stack.memory); + return; + } + self.submitRemoteStackFree(owner, stack.memory); + } + + fn submitRemoteStackFree(self: *Scheduler, owner: *Scheduler, memory: []u8) void { + const sender_idx = if (scheduler_running) active_scheduler.index else self.index; + std.debug.assert(sender_idx < owner.channels.len); + const ring = owner.ensureChannel(sender_idx) catch { + @panic("failed to allocate remote stack-free channel"); + }; + const msg = SpscMessage{ + .tag = .RemoteStackFree, + .stack_ptr = @intFromPtr(memory.ptr), + .stack_len = memory.len, + }; + while (!ring.push(msg)) { + if (scheduler_running) { + active_scheduler.drainChannels(); + } + std.Thread.yield() catch {}; + } + const bit = @as(u64, 1) << @intCast(sender_idx); + const old_dirty = owner.dirty_mask.fetchOr(bit, .release); + if ((old_dirty & bit) == 0) owner.event_fd.notify(); + } + // IDLE PATH: Scavenge memory (The Cleanup) fn scavengeMemory(self: *Scheduler, draining: bool) void { // 1. Drain L1 Cache (Scheduler ArrayList) -> L2 Cache (Slab Magazine) @@ -635,6 +778,8 @@ pub const Scheduler = struct { .config_stack_size = @intFromEnum(config.stack_size), .config_pinned = config.pinned, .config_timeout_ms = config.timeout_ms, + .config_profile_site_id = config.profile_site_id, + .config_profile_dispatch = config.profile_dispatch, }; // Wait-and-work: if ring is full, drain our own channels + yield while (!ring.push(msg)) { @@ -645,8 +790,9 @@ pub const Scheduler = struct { std.Thread.yield() catch {}; } } - _ = self.dirty_mask.fetchOr(@as(u64, 1) << @intCast(sender_idx), .seq_cst); - self.event_fd.notify(); + const bit = @as(u64, 1) << @intCast(sender_idx); + const old_dirty = self.dirty_mask.fetchOr(bit, .release); + if ((old_dirty & bit) == 0) self.event_fd.notify(); } // ------------------------------------------------------------ @@ -679,8 +825,9 @@ pub const Scheduler = struct { std.Thread.yield() catch {}; } } - _ = self.dirty_mask.fetchOr(@as(u64, 1) << @intCast(sender_idx), .seq_cst); - self.event_fd.notify(); + const bit = @as(u64, 1) << @intCast(sender_idx); + const old_dirty = self.dirty_mask.fetchOr(bit, .release); + if ((old_dirty & bit) == 0) self.event_fd.notify(); } /// Wake a previously-parked FSM task. 
Same routing as submitFsmSpawn @@ -711,8 +858,9 @@ pub const Scheduler = struct { std.Thread.yield() catch {}; } } - _ = self.dirty_mask.fetchOr(@as(u64, 1) << @intCast(sender_idx), .seq_cst); - self.event_fd.notify(); + const bit = @as(u64, 1) << @intCast(sender_idx); + const old_dirty = self.dirty_mask.fetchOr(bit, .release); + if ((old_dirty & bit) == 0) self.event_fd.notify(); } // ------------------------------------------------------------ @@ -752,8 +900,9 @@ pub const Scheduler = struct { std.Thread.yield() catch {}; } } - _ = self.dirty_mask.fetchOr(@as(u64, 1) << @intCast(sender_idx), .seq_cst); - self.event_fd.notify(); + const bit = @as(u64, 1) << @intCast(sender_idx); + const old_dirty = self.dirty_mask.fetchOr(bit, .release); + if ((old_dirty & bit) == 0) self.event_fd.notify(); } /// Lightweight: only process RemoteCall messages. Spawn and Resume are /// left in the ring for the full drainChannels to handle. Safe to call @@ -805,6 +954,8 @@ pub const Scheduler = struct { .stack_size = @enumFromInt(msg.config_stack_size), .pinned = msg.config_pinned, .timeout_ms = msg.config_timeout_ms, + .profile_site_id = msg.config_profile_site_id, + .profile_dispatch = msg.config_profile_dispatch, }; const effective_size = cp.recommendSize( if (msg.user_fn) |f| @intFromPtr(f) else 0, @@ -813,12 +964,12 @@ pub const Scheduler = struct { const stack_mem = self.allocStack(effective_size) catch continue; const task = blk: { const fiber_ptr = self.allocator.create(Fiber) catch { - self.freeStack(stack_mem); + self.freeLocalStackMemory(stack_mem); continue; }; - fiber_ptr.* = Fiber.init(stack_mem, msg.trampoline_addr, effective_size); + fiber_ptr.* = Fiber.initWithOwner(stack_mem, msg.trampoline_addr, effective_size, self); const t = self.task_slab.create() catch { - self.freeStack(stack_mem); + self.freeLocalStackMemory(stack_mem); self.allocator.destroy(fiber_ptr); continue; }; @@ -840,17 +991,13 @@ pub const Scheduler = struct { t.generation.store(prev_gen +% 1, .release); if (rt_profile.CLEAR_PROFILE) { t.spawn_ns = fp_mod.nowNs(); + t.profile_site_id = config.profile_site_id; + fp_mod.recordSiteSpawn( + config.profile_site_id, + @as(fp_mod.DispatchKind, @enumFromInt(config.profile_dispatch)), + .stack, + ); } - // Allocate + register a per-task ThreadLocalEbr on - // the OS thread stack. Doing this here (instead of - // inside entryWrapper) keeps EbrContext.register's - // deep allocator path off the small fiber stack. 
- t.ebr_slot = self.allocEbrSlot() catch { - self.freeStack(stack_mem); - self.allocator.destroy(fiber_ptr); - self.allocator.destroy(t); - continue; - }; break :blk t; }; task.context = msg.args; @@ -858,15 +1005,19 @@ pub const Scheduler = struct { task.config = config; if (task.config.pinned) { self.pinned_queue.append(self.allocator, task) catch { - self.freeStack(stack_mem); + self.freeLocalStackMemory(stack_mem); + self.allocator.destroy(task.base); self.task_slab.destroy(task); continue; }; } else { self.ready_queue.push(self.allocator, task) catch { - self.freeStack(stack_mem); + self.freeLocalStackMemory(stack_mem); self.fiber_pool.append(self.allocator, task) catch - self.task_slab.destroy(task); + { + self.allocator.destroy(task.base); + self.task_slab.destroy(task); + }; continue; }; } @@ -890,6 +1041,19 @@ pub const Scheduler = struct { fsm_task.status = .Ready; self.fsm_ready_queue.push(self.allocator, fsm_task) catch unreachable; }, + .RemoteStackFree => { + const memory = @as([*]u8, @ptrFromInt(msg.stack_ptr))[0..msg.stack_len]; + self.freeLocalStackMemory(memory); + }, + .RemoteFsmCtxFree => { + const class: fsm_mod.FsmCtxAllocClass = @enumFromInt(msg.fsm_ctx_class); + switch (class) { + .slab64 => self.fsm_ctx_64_slab.destroy(@as(*FsmCtx64, @ptrCast(@alignCast(@as(*anyopaque, @ptrFromInt(msg.fsm_ctx_ptr)))))), + .slab128 => self.fsm_ctx_128_slab.destroy(@as(*FsmCtx128, @ptrCast(@alignCast(@as(*anyopaque, @ptrFromInt(msg.fsm_ctx_ptr)))))), + .slab256 => self.fsm_ctx_256_slab.destroy(@as(*FsmCtx256, @ptrCast(@alignCast(@as(*anyopaque, @ptrFromInt(msg.fsm_ctx_ptr)))))), + .none, .heap => {}, + } + }, .RemoteCall => { if (self.draining) { std.debug.print("RE-ENTRANT DRAIN: sched={d}\n", .{self.index}); @@ -959,7 +1123,11 @@ pub const Scheduler = struct { while (true) { if (self.global_shutdown) |flag| { - if (flag.load(.monotonic)) break; + if (flag.load(.monotonic)) { + self.drainChannels(); + self.scavengeMemory(true); + break; + } } // Scan lock_waiters every iteration (fast or slow path). Without @@ -1061,6 +1229,7 @@ pub const Scheduler = struct { if (rt_profile.CLEAR_PROFILE) { fp_mod.recordSchedulerRun(self.index); + fp_mod.recordSiteRun(task.profile_site_id, self.index); } // 1. Switch to the Task task.base.switchTo(&self.main_ctx); @@ -1071,7 +1240,7 @@ pub const Scheduler = struct { switch (task.status.load(.acquire)) { .Finished => { if (rt_profile.CLEAR_PROFILE) { - fp_mod.recordFiberExit(task.spawn_ns, fp_mod.nowNs()); + fp_mod.recordFiberExit(task.profile_site_id, task.spawn_ns, fp_mod.nowNs()); } _ = self.active_tasks.fetchSub(1, .monotonic); // Remove from lock_waiters before destroying to prevent stale pointer access. @@ -1084,11 +1253,9 @@ pub const Scheduler = struct { break; } } - // Unregister + free the per-task ThreadLocalEbr - // (deinit drains limbo into the global orphans list - // so reclaim() can free them once safe). + // Compatibility no-op: tasks no longer own EBR slots. self.releaseTaskEbr(task); - self.freeStack(task.base.stack.memory); + self.freeStack(task.base.stack); self.allocator.destroy(task.base); self.task_slab.destroy(task); }, @@ -1195,15 +1362,20 @@ pub const Scheduler = struct { // Flush any pending SQEs (e.g. the timeout above) before sleeping. self.flushRing(); - // A. Announce we are going to sleep - self.event_fd.markSleeping(); + // A. Park in userspace. If a producer already left a wake + // token while we were awake, consume it and loop instead of + // entering io_uring. 
+ if (!self.event_fd.prepareSleep()) { + continue; + } // B. The Double Check - // We must check for new work ONE LAST TIME after setting the flag. + // We must check for new work ONE LAST TIME after parking. // If we don't do this, a task could arrive between our last check - // and the 'markSleeping' call, and we would sleep forever. + // and the prepareSleep call. Producers that arrive after + // prepareSleep observe WakeParked and write eventfd. if (self.hasWork() or self.hasChannelMessages()) { - self.event_fd.markAwake(); + self.event_fd.finishSleep(); continue; // Restart loop to process the new work } @@ -1211,7 +1383,7 @@ pub const Scheduler = struct { const count = self.copyCqesOnIoStack(wait_nr); // D. We are awake - self.event_fd.markAwake(); + self.event_fd.finishSleep(); if (count > 0) { self.processCqes(self.uring_cqes[0..count]); @@ -1263,6 +1435,16 @@ pub const Scheduler = struct { /// Caller owns the backing state struct; scheduler only moves the handle. pub fn enqueueFsm(self: *Scheduler, task: *FsmTask) void { task.status = .Ready; + if (rt_profile.CLEAR_PROFILE) { + if (task.spawn_ns == 0) { + task.spawn_ns = fp_mod.nowNs(); + fp_mod.recordSiteSpawn( + task.profile_site_id, + @as(fp_mod.DispatchKind, @enumFromInt(task.profile_dispatch)), + .fsm, + ); + } + } self.fsm_ready_queue.push(self.allocator, task) catch unreachable; _ = self.active_tasks.fetchAdd(1, .monotonic); } @@ -1290,16 +1472,20 @@ pub const Scheduler = struct { var i: usize = 0; while (i < snapshot) : (i += 1) { const task = self.fsm_ready_queue.pop() orelse break; + if (rt_profile.CLEAR_PROFILE) { + fp_mod.recordSchedulerRun(self.index); + fp_mod.recordSiteRun(task.profile_site_id, self.index); + } const reason = fsm_mod.dispatchOnce(task); switch (reason) { .Done => { if (rt_profile.CLEAR_PROFILE) { - fp_mod.recordFiberExit(task.spawn_ns, fp_mod.nowNs()); + fp_mod.recordFiberExit(task.profile_site_id, task.spawn_ns, fp_mod.nowNs()); } _ = self.active_tasks.fetchSub(1, .monotonic); - // Per-task Runtime + ebr_slot teardown MUST happen - // before destroy_fn (destroy_fn reads task.ctx and - // frees the user ctx struct). + // Per-task Runtime shell teardown MUST happen before + // destroy_fn (destroy_fn reads task.ctx and frees the + // user ctx struct). self.releaseFsmTaskEbr(task); // The synthesized destroy_fn frees the user ctx // pointed to by task.ctx. It does NOT free the @@ -1309,7 +1495,11 @@ pub const Scheduler = struct { // happens AFTER destroy_fn runs so destroy_fn can // still read task.ctx. if (task.destroy_fn) |df| df(task); - self.fsm_task_slab.destroy(task); + const owner: *Scheduler = if (task.owner_scheduler) |raw| + @ptrCast(@alignCast(raw)) + else + self; + owner.fsm_task_slab.destroy(task); }, .Yielded => { // Stage for flush after the batch. Prevents LIFO starvation. 
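// Illustrative sketch of the consumer side of the parker protocol used above,
// assuming a scheduler `sched` like the one in this file; `runReadyTasks` and
// `blockInRing` are stand-ins for the real dispatch loop and io_uring wait:
//
//     while (running) {
//         if (sched.hasWork()) { runReadyTasks(); continue; }
//         // A producer may have left a Notified token while we were awake;
//         // prepareSleep() consumes it and tells us not to block.
//         if (!sched.event_fd.prepareSleep()) continue;
//         // Double-check after parking: producers that arrive from here on
//         // observe Parked and write the eventfd, so we cannot sleep forever.
//         if (sched.hasWork() or sched.hasChannelMessages()) {
//             sched.event_fd.finishSleep();
//             continue;
//         }
//         blockInRing(); // woken by the single Parked -> Notified eventfd write
//         sched.event_fd.finishSleep();
//     }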
@@ -2108,6 +2298,329 @@ pub fn unpinFsmTask(pin: FsmTaskPin) void { if (pin.allocator) |alloc| alloc.unpin(pin.slab); } +test "migrated stack free routes back to owning scheduler" { + const alloc = std.testing.allocator; + var global_ebr: EbrContext = .{}; + defer global_ebr.deinit(alloc); + + var owner = try Scheduler.init(alloc, &global_ebr, null); + defer owner.deinit(); + owner.index = 0; + + var thief = try Scheduler.init(alloc, &global_ebr, null); + defer thief.deinit(); + thief.index = 1; + + const memory = try owner.allocStack(.Standard); + const stack = Stack{ .memory = memory, .owner = &owner }; + + active_scheduler = &thief; + scheduler_running = true; + defer { + scheduler_running = false; + active_scheduler = undefined; + } + + thief.freeStack(stack); + try std.testing.expectEqual(@as(usize, 0), thief.stack_cache.items.len); + try std.testing.expect(owner.dirty_mask.load(.acquire) != 0); + + owner.drainChannels(); + try std.testing.expectEqual(@as(usize, 1), owner.stack_cache.items.len); +} + +fn makeDeinitCleanupTask(sched: *Scheduler, size: StackSize) !*Task { + const memory = try sched.allocStack(size); + const fiber = try sched.allocator.create(Fiber); + errdefer sched.allocator.destroy(fiber); + fiber.* = Fiber.initWithOwner(memory, @intFromPtr(&dummyTaskFn), size, sched); + const task = try sched.task_slab.create(); + task.* = Task{ .base = fiber, .user_fn = @ptrCast(&dummyTaskFn) }; + return task; +} + +test "Scheduler.deinit releases task stacks left in internal queues" { + const alloc = std.testing.allocator; + var global_ebr: EbrContext = .{}; + defer global_ebr.deinit(alloc); + + var sched = try Scheduler.init(alloc, &global_ebr, null); + sched.index = 0; + + try sched.fiber_pool.append(alloc, try makeDeinitCleanupTask(&sched, .Micro)); + try sched.ready_queue.push(alloc, try makeDeinitCleanupTask(&sched, .Micro)); + try sched.pinned_queue.append(alloc, try makeDeinitCleanupTask(&sched, .Micro)); + + sched.deinit(); +} + +test "remote FSM ctx free routes slab128 back to owning scheduler" { + const alloc = std.testing.allocator; + var global_ebr: EbrContext = .{}; + defer global_ebr.deinit(alloc); + + var owner = try Scheduler.init(alloc, &global_ebr, null); + defer owner.deinit(); + owner.index = 0; + + var thief = try Scheduler.init(alloc, &global_ebr, null); + defer thief.deinit(); + thief.index = 1; + + const task = try owner.allocFsmTask(&dummyFsmResume); + const ctx = try owner.allocFsmCtx(Scheduler.FsmCtx128, task); + + active_scheduler = &thief; + scheduler_running = true; + defer { + scheduler_running = false; + active_scheduler = undefined; + } + + thief.freeFsmCtx(Scheduler.FsmCtx128, task, ctx); + try std.testing.expect(owner.dirty_mask.load(.acquire) != 0); + owner.drainChannels(); + owner.fsm_task_slab.destroy(task); +} + +test "Scheduler.init cleans up thread EBR on later allocation failures" { + const backing = std.testing.allocator; + + var fail_after_thread_ebr = std.testing.FailingAllocator.init(backing, .{ .fail_index = 1 }); + var ebr_after_thread: EbrContext = .{}; + try std.testing.expectError( + error.OutOfMemory, + Scheduler.init(fail_after_thread_ebr.allocator(), &ebr_after_thread, null), + ); + defer ebr_after_thread.deinit(backing); + + var fail_after_register = std.testing.FailingAllocator.init(backing, .{ .fail_index = 2 }); + var ebr_after_register: EbrContext = .{}; + try std.testing.expectError( + error.OutOfMemory, + Scheduler.init(fail_after_register.allocator(), &ebr_after_register, null), + ); + defer 
ebr_after_register.deinit(backing); + try std.testing.expectEqual(@as(usize, 0), ebr_after_register.registry.items.len); +} + +test "drainChannels releases stack memory when spawn allocation steps fail" { + const backing = std.testing.allocator; + + var offset: usize = 0; + while (offset < 64) : (offset += 1) { + var failing = std.testing.FailingAllocator.init(backing, .{}); + const alloc = failing.allocator(); + var global_ebr: EbrContext = .{}; + defer global_ebr.deinit(alloc); + + var sched = try Scheduler.init(alloc, &global_ebr, null); + defer sched.deinit(); + sched.index = 0; + + try sched.submitSpawn(@intFromPtr(&dummyTaskFn), @ptrCast(&dummyTaskFn), null, .{}); + failing.fail_index = failing.alloc_index + offset; + sched.drainChannels(); + } + +} + +fn prewarmSpawnAllocations(sched: *Scheduler) !void { + const stack_mem = try sched.stack_pool.alloc(.Standard); + try sched.stack_cache.append(sched.allocator, stack_mem); + const task = try sched.task_slab.create(); + sched.task_slab.destroy(task); +} + +test "drainChannels releases stack memory when pinned queue append fails" { + var failing = std.testing.FailingAllocator.init(std.testing.allocator, .{}); + const alloc = failing.allocator(); + var global_ebr: EbrContext = .{}; + defer global_ebr.deinit(alloc); + + var sched = try Scheduler.init(alloc, &global_ebr, null); + defer sched.deinit(); + sched.index = 0; + try prewarmSpawnAllocations(&sched); + + try sched.submitSpawn( + @intFromPtr(&dummyTaskFn), + @ptrCast(&dummyTaskFn), + null, + .{ .pinned = true }, + ); + failing.fail_index = failing.alloc_index + 1; + sched.drainChannels(); +} + +test "drainChannels releases stack memory when ready queue growth fails" { + var failing = std.testing.FailingAllocator.init(std.testing.allocator, .{}); + const alloc = failing.allocator(); + var global_ebr: EbrContext = .{}; + defer global_ebr.deinit(alloc); + + var sched = try Scheduler.init(alloc, &global_ebr, null); + defer sched.deinit(); + sched.index = 0; + try prewarmSpawnAllocations(&sched); + + var i: usize = 0; + while (i < 64) : (i += 1) { + try sched.ready_queue.push(alloc, try makeDeinitCleanupTask(&sched, .Micro)); + } + + try sched.submitSpawn(@intFromPtr(&dummyTaskFn), @ptrCast(&dummyTaskFn), null, .{}); + failing.fail_index = failing.alloc_index + 1; + sched.drainChannels(); +} + +fn finishWhileRegisteredAsLockWaiter(_: *anyopaque, raw: ?*anyopaque) anyerror!void { + const sched: *Scheduler = @ptrCast(@alignCast(raw.?)); + sched.registerLockWaiter(sched.getCurrent()); +} + +test "Scheduler.run removes finished tasks from lock waiter scan list" { + const alloc = std.heap.smp_allocator; + var global_ebr: EbrContext = .{}; + defer global_ebr.deinit(alloc); + + var sched = try Scheduler.init(alloc, &global_ebr, null); + defer sched.deinit(); + sched.index = 0; + + try sched.submitSpawn( + @intFromPtr(&rt_profile.Runtime.entryWrapper), + @ptrCast(&finishWhileRegisteredAsLockWaiter), + &sched, + .{}, + ); + sched.run(); + + try std.testing.expectEqual(@as(usize, 0), sched.lock_waiters.items.len); +} + +const RemoteFreeDrainArgs = struct { + owner: *Scheduler, + start: *std.atomic.Value(bool), + waiting: *std.atomic.Value(bool), +}; + +fn drainRemoteFreeAfterStart(args: *RemoteFreeDrainArgs) void { + while (!args.start.load(.acquire)) { + args.waiting.store(true, .release); + std.Thread.yield() catch {}; + } + args.owner.drainChannels(); +} + +fn fillRemoteFsmCtxFreeRing(owner: *Scheduler, sender_idx: usize) !void { + const ring = try owner.ensureChannel(sender_idx); + var i: 
usize = 0; + while (i < 4096) : (i += 1) { + const task = try owner.allocFsmTask(&dummyFsmResume); + const ctx = try owner.allocFsmCtx(Scheduler.FsmCtx128, task); + task.ctx = ctx; + const msg = SpscMessage{ + .tag = .RemoteFsmCtxFree, + .fsm_ctx_ptr = @intFromPtr(ctx), + .fsm_ctx_class = @intFromEnum(fsm_mod.FsmCtxAllocClass.slab128), + }; + try std.testing.expect(ring.push(msg)); + owner.fsm_task_slab.destroy(task); + } + const bit = @as(u64, 1) << @intCast(sender_idx); + _ = owner.dirty_mask.fetchOr(bit, .release); +} + +const RemoteStackFreeArgs = struct { + thief: *Scheduler, + stack: Stack, + started: *std.atomic.Value(bool), +}; + +fn remoteStackFreeProducer(args: *RemoteStackFreeArgs) void { + active_scheduler = args.thief; + scheduler_running = true; + defer { + scheduler_running = false; + active_scheduler = undefined; + } + + args.started.store(true, .release); + args.thief.freeStack(args.stack); +} + +test "remote FSM ctx free backpressure drains while scheduler is running" { + const alloc = std.testing.allocator; + var global_ebr: EbrContext = .{}; + defer global_ebr.deinit(alloc); + + var owner = try Scheduler.init(alloc, &global_ebr, null); + defer owner.deinit(); + owner.index = 0; + + var thief = try Scheduler.init(alloc, &global_ebr, null); + defer thief.deinit(); + thief.index = 1; + + try fillRemoteFsmCtxFreeRing(&owner, thief.index); + const task = try owner.allocFsmTask(&dummyFsmResume); + const ctx = try owner.allocFsmCtx(Scheduler.FsmCtx128, task); + + var start = std.atomic.Value(bool).init(false); + var waiting = std.atomic.Value(bool).init(false); + var args = RemoteFreeDrainArgs{ .owner = &owner, .start = &start, .waiting = &waiting }; + var drainer = try std.Thread.spawn(.{}, drainRemoteFreeAfterStart, .{&args}); + while (!waiting.load(.acquire)) { + std.Thread.yield() catch {}; + } + + active_scheduler = &thief; + scheduler_running = true; + defer { + scheduler_running = false; + active_scheduler = undefined; + } + + start.store(true, .release); + thief.freeFsmCtx(Scheduler.FsmCtx128, task, ctx); + drainer.join(); + owner.drainChannels(); + owner.fsm_task_slab.destroy(task); +} + +test "remote stack free backpressure drains while scheduler is running" { + const alloc = std.testing.allocator; + var global_ebr: EbrContext = .{}; + defer global_ebr.deinit(alloc); + + var owner = try Scheduler.init(alloc, &global_ebr, null); + defer owner.deinit(); + owner.index = 0; + + var thief = try Scheduler.init(alloc, &global_ebr, null); + defer thief.deinit(); + thief.index = 1; + + try fillRemoteFsmCtxFreeRing(&owner, thief.index); + const memory = try owner.allocStack(.Micro); + const stack = Stack{ .memory = memory, .owner = &owner }; + + var started = std.atomic.Value(bool).init(false); + var args = RemoteStackFreeArgs{ .thief = &thief, .stack = stack, .started = &started }; + var producer = try std.Thread.spawn(.{}, remoteStackFreeProducer, .{&args}); + while (!started.load(.acquire)) { + std.Thread.yield() catch {}; + } + var spins: usize = 0; + while (spins < 1024) : (spins += 1) { + std.Thread.yield() catch {}; + } + owner.drainChannels(); + producer.join(); + owner.drainChannels(); +} + pub const WaitGroup = struct { // The counter must be atomic counter: std.atomic.Value(usize) = std.atomic.Value(usize).init(0), @@ -2357,6 +2870,40 @@ test "ioError: i32 min does not overflow" { try std.testing.expectEqual(error.Unexpected, err); } +test "SmartEventFd: awake notify is consumed without kernel wake" { + var efd = try SmartEventFd.init(); + defer efd.deinit(); + + 
efd.notify(); + try std.testing.expect(!efd.prepareSleep()); + efd.finishSleep(); + + try std.testing.expect(efd.prepareSleep()); + efd.finishSleep(); +} + +test "SmartEventFd: parked notify writes one wake token" { + var efd = try SmartEventFd.init(); + defer efd.deinit(); + + try std.testing.expect(efd.prepareSleep()); + efd.notify(); + efd.consume(); + efd.finishSleep(); + + try std.testing.expect(efd.prepareSleep()); + efd.finishSleep(); +} + +test "SmartEventFd: prepareSleep is idempotent while parked" { + var efd = try SmartEventFd.init(); + defer efd.deinit(); + + try std.testing.expect(efd.prepareSleep()); + try std.testing.expect(efd.prepareSleep()); + efd.finishSleep(); +} + test "IoWaiter: encode/decode roundtrip" { var dummy_fiber = fc.Fiber{ .stack = fc.Stack{ .memory = &[_]u8{} }, @@ -2405,3 +2952,7 @@ test "IoWaiter: encode is distinct from sentinels" { } fn dummyTaskFn(_: *anyopaque, _: ?*anyopaque) anyerror!void {} + +fn dummyFsmResume(_: *fsm_mod.FsmTask) YieldReason { + return .Done; +} diff --git a/zig/runtime/scheme.zig b/zig/runtime/scheme.zig index 57fd7ac9..52a4179a 100644 --- a/zig/runtime/scheme.zig +++ b/zig/runtime/scheme.zig @@ -875,15 +875,6 @@ pub fn main() !void { defer rt.deinit(); rt.wireAllocator(); - // Register the main runtime's ThreadLocalEbr with the global - // EbrContext so EbrContext.reclaim() observes its pinned epoch - // during Versioned.read() critical sections. Without this, reclaim - // can advance past a still-pinned reader's local_epoch -> UAF. - // BG fibers get their own per-task ThreadLocalEbr registered by the - // scheduler (see drainChannels.Spawn); main runtime registers here. - try global_ctx.register(allocator, rt.ebr); - defer global_ctx.unregister(rt.ebr); - // 4. Shared infrastructure const fm = @import("fiber-memory.zig"); const fp = @import("scheduler.zig"); @@ -988,4 +979,3 @@ pub fn main() !void { workers[i].join(); } } - diff --git a/zig/runtime/spsc.zig b/zig/runtime/spsc.zig index 7533dc6f..d3419b67 100644 --- a/zig/runtime/spsc.zig +++ b/zig/runtime/spsc.zig @@ -27,6 +27,13 @@ pub const MessageTag = enum(u8) { /// counted when originally enqueued). Used by ParkingMutex unlock /// to wake an FSM waiter on a different scheduler. FsmResume, + /// Return a stackful fiber stack to the scheduler that allocated it. + /// Stack pools are scheduler-local because their slab magazines are not + /// safe to mutate from arbitrary scheduler threads. + RemoteStackFree, + /// Return a generated FSM ctx slab slot to the scheduler that allocated + /// it. Same locality rule as RemoteStackFree. + RemoteFsmCtxFree, }; /// A value-type message copied into the ring buffer. @@ -40,6 +47,8 @@ pub const Message = struct { config_stack_size: u8 = 0, // StackSize enum as u8 config_pinned: bool = false, config_timeout_ms: u64 = 0, + config_profile_site_id: u32 = 0, + config_profile_dispatch: u8 = 0, // Resume fields task: ?*anyopaque = null, // *Task as opaque // RemoteCall fields @@ -49,6 +58,12 @@ pub const Message = struct { // FsmSpawn fields — reuses `task` would conflate with Resume, so // a dedicated pointer field keeps decoding branch-free. fsm_task: ?*anyopaque = null, // *FsmTask as opaque + // RemoteStackFree fields + stack_ptr: usize = 0, + stack_len: usize = 0, + // RemoteFsmCtxFree fields + fsm_ctx_ptr: usize = 0, + fsm_ctx_class: u8 = 0, }; /// SPSC ring buffer. Fixed capacity, power-of-two size. 
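The new RemoteStackFree / RemoteFsmCtxFree tags above only carry raw pointer plus length/class fields; the routing protocol itself lives in the scheduler. As a rough sketch of that protocol — inferred from the "migrated stack free" and backpressure tests earlier in this patch, not copied from the actual freeStack body, so the helper name and exact signatures are assumptions — a non-owning scheduler hands a stack back to its owner roughly like this:

fn remoteFreeStackSketch(thief: *Scheduler, owner: *Scheduler, stack: Stack) !void {
    // Only the owner may touch its scheduler-local slab magazines, so the
    // thief encodes the stack as a value-type SPSC message instead of
    // freeing it in place.
    const msg = SpscMessage{
        .tag = .RemoteStackFree,
        .stack_ptr = @intFromPtr(stack.memory.ptr),
        .stack_len = stack.memory.len,
    };
    // Push onto the ring keyed by the sending scheduler's index. A real
    // implementation must handle a full ring (see the backpressure tests);
    // error.RingFull here is only a placeholder for that path.
    const ring = try owner.ensureChannel(thief.index);
    if (!ring.push(msg)) return error.RingFull;
    // Set the sender's dirty bit so the owner's drainChannels() scans this
    // ring and returns the stack to its local stack_cache, as the
    // "migrated stack free routes back to owning scheduler" test asserts.
    _ = owner.dirty_mask.fetchOr(@as(u64, 1) << @intCast(thief.index), .release);
}
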
diff --git a/zig/runtime/steal-hammer-test.zig b/zig/runtime/steal-hammer-test.zig index 048ec72d..1d9db055 100644 --- a/zig/runtime/steal-hammer-test.zig +++ b/zig/runtime/steal-hammer-test.zig @@ -198,7 +198,7 @@ pub fn main() !void { .Finished => { _ = sched.active_tasks.fetchSub(1, .monotonic); sched.releaseTaskEbr(task); - sched.stack_pool.free(task.base.stack.memory); + sched.freeStack(task.base.stack); sched.allocator.destroy(task.base); sched.task_slab.destroy(task); }, diff --git a/zig/runtime/stream-test.zig b/zig/runtime/stream-test.zig index 04c5cdd0..34084b8f 100644 --- a/zig/runtime/stream-test.zig +++ b/zig/runtime/stream-test.zig @@ -79,27 +79,75 @@ const BoundedEachState = struct { total: std.atomic.Value(i64) = std.atomic.Value(i64).init(0), }; +const BoundedErrorState = struct { + items: [4]CheatLib.Promise(i64), + err: ?anyerror = null, +}; + fn boundedAccumulate(_: *Runtime, raw_args: ?*anyopaque, value: i64) anyerror!void { const state = @as(*BoundedEachState, @ptrCast(@alignCast(raw_args.?))); _ = state.total.fetchAdd(value, .seq_cst); } +fn boundedMapErrorOnThree(_: *Runtime, _: ?*anyopaque, value: i64) anyerror!i64 { + if (value == 3) return error.IntentionalBoundedSelect; + return value; +} + +fn boundedWhereErrorOnThree(_: *Runtime, _: ?*anyopaque, value: i64) anyerror!bool { + if (value == 3) return error.IntentionalBoundedWhere; + return true; +} + +fn boundedEachErrorOnThirty(_: *Runtime, _: ?*anyopaque, value: i64) anyerror!void { + if (value == 30) return error.IntentionalBoundedEach; +} + fn boundedSelectConsumer(rt: *Runtime, raw_args: ?*anyopaque) anyerror!void { const state = @as(*BoundedSelectState, @ptrCast(@alignCast(raw_args.?))); state.results = try CheatLib.concurrentBoundedSelect(i64, i64, 4, boundedMapDouble, - rt.heapAlloc(), rt, &state.items, 2, false, .{}, null); + rt.heapAlloc(), rt, &state.items, 2, 3, false, .{}, null); } fn boundedWhereConsumer(rt: *Runtime, raw_args: ?*anyopaque) anyerror!void { const state = @as(*BoundedSelectState, @ptrCast(@alignCast(raw_args.?))); state.results = try CheatLib.concurrentBoundedWhere(i64, 4, boundedKeepGtTwo, - rt.heapAlloc(), rt, &state.items, 2, false, .{}, null); + rt.heapAlloc(), rt, &state.items, 2, 3, false, .{}, null); } fn boundedEachConsumer(rt: *Runtime, raw_args: ?*anyopaque) anyerror!void { const state = @as(*BoundedEachState, @ptrCast(@alignCast(raw_args.?))); try CheatLib.concurrentBoundedEach(i64, 4, boundedAccumulate, - rt, &state.items, 2, false, .{}, state); + rt, &state.items, 2, 3, false, .{}, state); +} + +fn boundedSelectErrorConsumer(rt: *Runtime, raw_args: ?*anyopaque) anyerror!void { + const state = @as(*BoundedErrorState, @ptrCast(@alignCast(raw_args.?))); + var result = CheatLib.concurrentBoundedSelect(i64, i64, 4, boundedMapErrorOnThree, + rt.heapAlloc(), rt, &state.items, 2, 3, false, .{}, null) catch |err| { + state.err = err; + return; + }; + result.deinit(rt.heapAlloc()); +} + +fn boundedWhereErrorConsumer(rt: *Runtime, raw_args: ?*anyopaque) anyerror!void { + const state = @as(*BoundedErrorState, @ptrCast(@alignCast(raw_args.?))); + var result = CheatLib.concurrentBoundedWhere(i64, 4, boundedWhereErrorOnThree, + rt.heapAlloc(), rt, &state.items, 2, 3, false, .{}, null) catch |err| { + state.err = err; + return; + }; + result.deinit(rt.heapAlloc()); +} + +fn boundedEachErrorConsumer(rt: *Runtime, raw_args: ?*anyopaque) anyerror!void { + const state = @as(*BoundedErrorState, @ptrCast(@alignCast(raw_args.?))); + CheatLib.concurrentBoundedEach(i64, 4, 
boundedEachErrorOnThirty, + rt, &state.items, 2, 3, false, .{}, null) catch |err| { + state.err = err; + return; + }; } fn makeBoundedPromiseItems(rt: *Runtime, values: [4]i64) ![4]CheatLib.Promise(i64) { @@ -528,6 +576,102 @@ fn splitFiberProducer(rt: *Runtime, raw_args: ?*anyopaque) anyerror!void { try state.stream.push(103); } +const PlainStreamCrossSchedulerState = struct { + stream: CheatLib.Stream(i64), + ready: *std.atomic.Value(usize), + completed: *std.atomic.Value(usize), + count: usize = 0, + total: i64 = 0, +}; + +fn plainStreamCrossSchedulerConsumer(_: *Runtime, raw_args: ?*anyopaque) anyerror!void { + const state = @as(*PlainStreamCrossSchedulerState, @ptrCast(@alignCast(raw_args.?))); + _ = state.ready.fetchAdd(1, .acq_rel); + + while (try state.stream.next()) |value| { + state.total += value; + state.count += 1; + } + + _ = state.completed.fetchAdd(1, .acq_rel); +} + +const PlainStreamProducerWakeState = struct { + stream: CheatLib.Stream(i64), + attempted: usize, + completed: *std.atomic.Value(usize), + closed: *std.atomic.Value(usize), +}; + +fn plainStreamFillProducer(_: *Runtime, raw_args: ?*anyopaque) anyerror!void { + const state = @as(*PlainStreamProducerWakeState, @ptrCast(@alignCast(raw_args.?))); + defer state.stream.close(); + + var i: usize = 0; + while (i < state.attempted) : (i += 1) { + state.stream.push(@intCast(i)) catch |err| { + if (err == error.StreamClosed) { + _ = state.closed.fetchAdd(1, .acq_rel); + return; + } + return err; + }; + } + + _ = state.completed.fetchAdd(1, .acq_rel); +} + +const PlainStreamDeinitState = struct { + stream: CheatLib.Stream(i64), + completed: *std.atomic.Value(usize), +}; + +fn streamProducerParked(stream: CheatLib.Stream(i64)) bool { + while (stream.inner.lock.swap(1, .acquire) == 1) std.Thread.yield() catch {}; + const parked = stream.inner.producer_task != null; + stream.inner.lock.store(0, .release); + return parked; +} + +fn streamConsumerParked(stream: CheatLib.Stream(i64)) bool { + while (stream.inner.lock.swap(1, .acquire) == 1) std.Thread.yield() catch {}; + const parked = stream.inner.consumer_task != null; + stream.inner.lock.store(0, .release); + return parked; +} + +fn plainStreamDeinitTask(rt: *Runtime, raw_args: ?*anyopaque) anyerror!void { + const state = @as(*PlainStreamDeinitState, @ptrCast(@alignCast(raw_args.?))); + while (!streamProducerParked(state.stream)) rt.checkYield(); + state.stream.deinit(); + _ = state.completed.fetchAdd(1, .acq_rel); +} + +const PlainStreamCloseState = struct { + stream: CheatLib.Stream(i64), + completed: *std.atomic.Value(usize), +}; + +fn plainStreamCloseTask(rt: *Runtime, raw_args: ?*anyopaque) anyerror!void { + const state = @as(*PlainStreamCloseState, @ptrCast(@alignCast(raw_args.?))); + while (!streamConsumerParked(state.stream)) rt.checkYield(); + state.stream.close(); + _ = state.completed.fetchAdd(1, .acq_rel); +} + +const PlainStreamNextWakeState = struct { + stream: CheatLib.Stream(i64), + value: i64 = -1, + completed: *std.atomic.Value(usize), +}; + +fn plainStreamNextWakeTask(rt: *Runtime, raw_args: ?*anyopaque) anyerror!void { + const state = @as(*PlainStreamNextWakeState, @ptrCast(@alignCast(raw_args.?))); + while (!streamProducerParked(state.stream)) rt.checkYield(); + state.value = (try state.stream.next()).?; + _ = state.completed.fetchAdd(1, .acq_rel); +} + fn splitHammerMix(seed: i64) i64 { var x = seed; var i: usize = 0; @@ -661,6 +805,268 @@ test "SplitStream wakes multiple waiting fibers as items arrive" { try std.testing.expectEqualSlices(i64, 
right_values[0..right_state.count], &[_]i64{ 101, 102, 103 }); } +test "Stream wakes a consumer parked on a different scheduler" { + const allocator = std.testing.allocator; + + var global_ctx = EbrContext{}; + defer global_ctx.deinit(allocator); + var stack_pool = fm.StackPool.init(allocator); + defer stack_pool.deinit(); + var shutdown = std.atomic.Value(bool).init(false); + var worker_ready = std.atomic.Value(usize).init(0); + var worker_sched_ptr = std.atomic.Value(usize).init(0); + + const WorkerCtx = struct { + allocator: std.mem.Allocator, + global_ctx: *EbrContext, + stack_pool: *fm.StackPool, + shutdown: *std.atomic.Value(bool), + ready: *std.atomic.Value(usize), + sched_ptr: *std.atomic.Value(usize), + }; + + const workerMain = struct { + fn run(ctx: *WorkerCtx) void { + var worker_sched = fp.Scheduler.init(ctx.allocator, ctx.global_ctx, ctx.stack_pool) catch return; + defer worker_sched.deinit(); + worker_sched.shutdown_on_idle = false; + worker_sched.global_shutdown = ctx.shutdown; + fp.active_scheduler = &worker_sched; + fp.scheduler_running = true; + ctx.sched_ptr.store(@intFromPtr(&worker_sched), .release); + _ = ctx.ready.fetchAdd(1, .acq_rel); + worker_sched.run(); + fp.scheduler_running = false; + ctx.sched_ptr.store(0, .release); + } + }.run; + + var worker_ctx = WorkerCtx{ + .allocator = allocator, + .global_ctx = &global_ctx, + .stack_pool = &stack_pool, + .shutdown = &shutdown, + .ready = &worker_ready, + .sched_ptr = &worker_sched_ptr, + }; + + const worker = try std.Thread.spawn(.{}, workerMain, .{&worker_ctx}); + defer { + shutdown.store(true, .release); + fp.global_registry.notifyAll(); + worker.join(); + fp.global_registry.deinit(allocator); + } + + const deadline = compat.milliTimestamp() + 5_000; + while (worker_ready.load(.acquire) == 0 and compat.milliTimestamp() < deadline) { + compat.sleepNs(std.time.ns_per_ms); + } + try std.testing.expectEqual(@as(usize, 1), worker_ready.load(.acquire)); + + const worker_sched = @as(*fp.Scheduler, @ptrFromInt(worker_sched_ptr.load(.acquire))); + + var source_sched = try fp.Scheduler.init(allocator, &global_ctx, &stack_pool); + defer source_sched.deinit(); + + const S = CheatLib.Stream(i64); + var stream = try S.spawnNew(allocator, &source_sched); + defer stream.deinit(); + + var ready = std.atomic.Value(usize).init(0); + var completed = std.atomic.Value(usize).init(0); + var state = PlainStreamCrossSchedulerState{ + .stream = stream, + .ready = &ready, + .completed = &completed, + }; + + try worker_sched.submitSpawn( + @intFromPtr(&Runtime.entryWrapper), + @as(qs.TaskFn, @ptrCast(&plainStreamCrossSchedulerConsumer)), + &state, + .{ .stack_size = test_stack_size }, + ); + + while (ready.load(.acquire) == 0 and compat.milliTimestamp() < deadline) { + compat.sleepNs(std.time.ns_per_ms); + } + try std.testing.expectEqual(@as(usize, 1), ready.load(.acquire)); + + var parked = false; + while (compat.milliTimestamp() < deadline) { + while (stream.inner.lock.swap(1, .acquire) == 1) std.Thread.yield() catch {}; + parked = stream.inner.consumer_task != null; + stream.inner.lock.store(0, .release); + if (parked) break; + } + try std.testing.expect(parked); + + var producer = S{ .inner = stream.inner, .alloc = allocator }; + try producer.push(42); + producer.close(); + + while (completed.load(.acquire) == 0 and compat.milliTimestamp() < deadline) { + compat.sleepNs(std.time.ns_per_ms); + } + + try std.testing.expectEqual(@as(usize, 1), completed.load(.acquire)); + try std.testing.expectEqual(@as(usize, 1), state.count); + try 
std.testing.expectEqual(@as(i64, 42), state.total); +} + +test "Stream close wakes a consumer parked on empty ring" { + const allocator = std.testing.allocator; + + var global_ctx = EbrContext{}; + defer global_ctx.deinit(allocator); + var stack_pool = fm.StackPool.init(allocator); + defer stack_pool.deinit(); + var sched = try fp.Scheduler.init(allocator, &global_ctx, &stack_pool); + defer sched.deinit(); + fp.active_scheduler = &sched; + defer fp.global_registry.deinit(allocator); + + const S = CheatLib.Stream(i64); + var stream = try S.spawnNew(allocator, &sched); + + var ready = std.atomic.Value(usize).init(0); + var completed = std.atomic.Value(usize).init(0); + var close_completed = std.atomic.Value(usize).init(0); + var consumer_state = PlainStreamCrossSchedulerState{ + .stream = stream, + .ready = &ready, + .completed = &completed, + }; + var close_state = PlainStreamCloseState{ + .stream = stream, + .completed = &close_completed, + }; + + try sched.submitSpawn( + @intFromPtr(&Runtime.entryWrapper), + @as(qs.TaskFn, @ptrCast(&plainStreamCloseTask)), + &close_state, + .{ .stack_size = test_stack_size }, + ); + try sched.submitSpawn( + @intFromPtr(&Runtime.entryWrapper), + @as(qs.TaskFn, @ptrCast(&plainStreamCrossSchedulerConsumer)), + &consumer_state, + .{ .stack_size = test_stack_size }, + ); + sched.run(); + + try std.testing.expectEqual(@as(usize, 1), completed.load(.acquire)); + try std.testing.expectEqual(@as(usize, 1), close_completed.load(.acquire)); + try std.testing.expectEqual(@as(usize, 0), consumer_state.count); + stream.deinit(); +} + +test "Stream next wakes a producer parked on full ring" { + const allocator = std.testing.allocator; + + var global_ctx = EbrContext{}; + defer global_ctx.deinit(allocator); + var stack_pool = fm.StackPool.init(allocator); + defer stack_pool.deinit(); + var sched = try fp.Scheduler.init(allocator, &global_ctx, &stack_pool); + defer sched.deinit(); + fp.active_scheduler = &sched; + defer fp.global_registry.deinit(allocator); + + const S = CheatLib.Stream(i64); + var stream = try S.spawnNew(allocator, &sched); + defer stream.deinit(); + + var completed = std.atomic.Value(usize).init(0); + var closed = std.atomic.Value(usize).init(0); + var next_completed = std.atomic.Value(usize).init(0); + var state = PlainStreamProducerWakeState{ + .stream = stream, + .attempted = 65, + .completed = &completed, + .closed = &closed, + }; + var next_state = PlainStreamNextWakeState{ + .stream = stream, + .completed = &next_completed, + }; + + try sched.submitSpawn( + @intFromPtr(&Runtime.entryWrapper), + @as(qs.TaskFn, @ptrCast(&plainStreamNextWakeTask)), + &next_state, + .{ .stack_size = test_stack_size }, + ); + try sched.submitSpawn( + @intFromPtr(&Runtime.entryWrapper), + @as(qs.TaskFn, @ptrCast(&plainStreamFillProducer)), + &state, + .{ .stack_size = test_stack_size }, + ); + sched.run(); + + try std.testing.expectEqual(@as(usize, 1), completed.load(.acquire)); + try std.testing.expectEqual(@as(usize, 0), closed.load(.acquire)); + try std.testing.expectEqual(@as(usize, 1), next_completed.load(.acquire)); + try std.testing.expectEqual(@as(i64, 0), next_state.value); + + var expected: i64 = 1; + while (try stream.next()) |value| : (expected += 1) { + try std.testing.expectEqual(expected, value); + } + try std.testing.expectEqual(@as(i64, 65), expected); +} + +test "Stream deinit wakes a producer parked on full ring" { + const allocator = std.testing.allocator; + + var global_ctx = EbrContext{}; + defer global_ctx.deinit(allocator); + var stack_pool = 
fm.StackPool.init(allocator); + defer stack_pool.deinit(); + var sched = try fp.Scheduler.init(allocator, &global_ctx, &stack_pool); + defer sched.deinit(); + fp.active_scheduler = &sched; + defer fp.global_registry.deinit(allocator); + + const S = CheatLib.Stream(i64); + const stream = try S.spawnNew(allocator, &sched); + + var completed = std.atomic.Value(usize).init(0); + var closed = std.atomic.Value(usize).init(0); + var producer_state = PlainStreamProducerWakeState{ + .stream = stream, + .attempted = 65, + .completed = &completed, + .closed = &closed, + }; + var deinit_completed = std.atomic.Value(usize).init(0); + var deinit_state = PlainStreamDeinitState{ + .stream = stream, + .completed = &deinit_completed, + }; + + try sched.submitSpawn( + @intFromPtr(&Runtime.entryWrapper), + @as(qs.TaskFn, @ptrCast(&plainStreamDeinitTask)), + &deinit_state, + .{ .stack_size = test_stack_size }, + ); + try sched.submitSpawn( + @intFromPtr(&Runtime.entryWrapper), + @as(qs.TaskFn, @ptrCast(&plainStreamFillProducer)), + &producer_state, + .{ .stack_size = test_stack_size }, + ); + sched.run(); + + try std.testing.expectEqual(@as(usize, 0), completed.load(.acquire)); + try std.testing.expectEqual(@as(usize, 1), closed.load(.acquire)); + try std.testing.expectEqual(@as(usize, 1), deinit_completed.load(.acquire)); +} + test "SplitStream survives multithreaded spawnBest pubsub hammer" { if (!build_options.tsan and !build_options.coverage) return error.SkipZigTest; @@ -772,7 +1178,7 @@ test "SplitStream survives multithreaded spawnBest pubsub hammer" { .Finished => { _ = sched.active_tasks.fetchSub(1, .monotonic); sched.releaseTaskEbr(task); - sched.stack_pool.free(task.base.stack.memory); + sched.freeStack(task.base.stack); sched.allocator.destroy(task.base); sched.task_slab.destroy(task); }, @@ -900,3 +1306,48 @@ test "concurrentBoundedEach visits every item exactly once" { try std.testing.expectEqual(@as(i64, 100), state.total.load(.seq_cst)); } + +test "concurrentBounded callbacks propagate worker errors" { + const allocator = std.testing.allocator; + + var global_ctx = EbrContext{}; + defer global_ctx.deinit(allocator); + var stack_pool = fm.StackPool.init(allocator); + defer stack_pool.deinit(); + var sched = try fp.Scheduler.init(allocator, &global_ctx, &stack_pool); + defer sched.deinit(); + fp.active_scheduler = &sched; + defer fp.global_registry.deinit(allocator); + + var rt = try Runtime.init(allocator, 4 * 1024, &global_ctx); + defer rt.deinit(); + rt.wireAllocator(); + + var select_state = BoundedErrorState{ .items = try makeBoundedPromiseItems(&rt, .{ 1, 2, 3, 4 }) }; + var where_state = BoundedErrorState{ .items = try makeBoundedPromiseItems(&rt, .{ 1, 2, 3, 4 }) }; + var each_state = BoundedErrorState{ .items = try makeBoundedPromiseItems(&rt, .{ 10, 20, 30, 40 }) }; + + try sched.submitSpawn( + @intFromPtr(&Runtime.entryWrapper), + @as(qs.TaskFn, @ptrCast(&boundedSelectErrorConsumer)), + &select_state, + .{ .stack_size = test_stack_size }, + ); + try sched.submitSpawn( + @intFromPtr(&Runtime.entryWrapper), + @as(qs.TaskFn, @ptrCast(&boundedWhereErrorConsumer)), + &where_state, + .{ .stack_size = test_stack_size }, + ); + try sched.submitSpawn( + @intFromPtr(&Runtime.entryWrapper), + @as(qs.TaskFn, @ptrCast(&boundedEachErrorConsumer)), + &each_state, + .{ .stack_size = test_stack_size }, + ); + sched.run(); + + try std.testing.expectEqual(error.IntentionalBoundedSelect, select_state.err.?); + try std.testing.expectEqual(error.IntentionalBoundedWhere, where_state.err.?); + try 
std.testing.expectEqual(error.IntentionalBoundedEach, each_state.err.?); +} diff --git a/zig/runtime/versioned-fiber-stress-test.zig b/zig/runtime/versioned-fiber-stress-test.zig index 37a152f0..fcb92d00 100644 --- a/zig/runtime/versioned-fiber-stress-test.zig +++ b/zig/runtime/versioned-fiber-stress-test.zig @@ -279,6 +279,63 @@ const ReaderCtx = struct { } }; +const HeldGuardReaderCtx = struct { + inner: *CheatLib.Promise(i64).Inner, + bg_alloc: std.mem.Allocator, + cell: *versioned.Versioned(Sample), + reader_pinned: *std.atomic.Value(bool), + release_reader: *std.atomic.Value(bool), + violations: *std.atomic.Value(usize), + + fn run(rt_raw: *anyopaque, raw: ?*anyopaque) anyerror!void { + const ctx: *@This() = @ptrCast(@alignCast(raw.?)); + defer ctx.bg_alloc.destroy(ctx); + defer ctx.inner.wg.done(); + + const rt: *Runtime = @ptrCast(@alignCast(rt_raw)); + var g = ctx.cell.read(rt); + defer g.release(); + + const view = g.get().*; + ctx.reader_pinned.store(true, .release); + + while (!ctx.release_reader.load(.acquire)) { + rt.checkYield(); + } + + if (view.a != 1 or view.b != 2) { + _ = ctx.violations.fetchAdd(1, .seq_cst); + } + ctx.inner.result = view.a + view.b; + } +}; + +const RetireThenExitWriterCtx = struct { + inner: *CheatLib.Promise(i64).Inner, + bg_alloc: std.mem.Allocator, + cell: *versioned.Versioned(Sample), + reader_pinned: *std.atomic.Value(bool), + + fn setSample(p: *Sample, a: i64, b: i64) void { + p.a = a; + p.b = b; + } + + fn run(rt_raw: *anyopaque, raw: ?*anyopaque) anyerror!void { + const ctx: *@This() = @ptrCast(@alignCast(raw.?)); + defer ctx.bg_alloc.destroy(ctx); + defer ctx.inner.wg.done(); + + const rt: *Runtime = @ptrCast(@alignCast(rt_raw)); + while (!ctx.reader_pinned.load(.acquire)) { + rt.checkYield(); + } + + try ctx.cell.update(rt, ctx.bg_alloc, setSample, .{ 7, 14 }); + ctx.inner.result = 1; + } +}; + // ---------------------------------------------------------------------- // The actual test. 
Mirrors the user's CLEAR repro: // r1 = BG { N reads }; r2 = BG { N reads }; @@ -315,7 +372,7 @@ test "Versioned: 2 BG fibers x 200_000 reads via scheduler -- repro for processC var i: usize = 0; while (i < 6) : (i += 1) { global_ebr.reclaim(test_alloc); - rt.ebr.reclaimLocal(test_alloc); + rt.currentEbr().reclaimLocal(test_alloc); } } @@ -397,7 +454,7 @@ test "Versioned: 2 BG fibers, heap-allocated cell -- exact CLEAR @shared:version var i: usize = 0; while (i < 6) : (i += 1) { global_ebr.reclaim(test_alloc); - rt.ebr.reclaimLocal(test_alloc); + rt.currentEbr().reclaimLocal(test_alloc); } test_alloc.destroy(cell); } @@ -447,6 +504,80 @@ test "Versioned: 2 BG fibers, heap-allocated cell -- exact CLEAR @shared:version }.body); } +test "Versioned: retired version survives writer task exit while another task holds a guard" { + stack_pool = StackPool.init(test_alloc); + defer stack_pool.deinit(); + + try withMainRuntime(struct { + fn body(rt: *Runtime) !void { + const count = fp.global_registry.count(); + if (count < 2) return error.SkipZigTest; + + const cell = try test_alloc.create(versioned.Versioned(Sample)); + cell.* = try versioned.Versioned(Sample).init(test_alloc, .{ .a = 1, .b = 2 }); + defer { + cell.deinit(rt, test_alloc) catch unreachable; + var i: usize = 0; + while (i < 6) : (i += 1) { + global_ebr.reclaim(test_alloc); + rt.currentEbr().reclaimLocal(test_alloc); + } + test_alloc.destroy(cell); + } + + var reader_pinned = std.atomic.Value(bool).init(false); + var release_reader = std.atomic.Value(bool).init(false); + var violations = std.atomic.Value(usize).init(0); + + const sa = rt.getSched().allocator; + const reader_promise = try CheatLib.Promise(i64).spawn(sa, rt.getSched()); + const reader_ctx = try sa.create(HeldGuardReaderCtx); + reader_ctx.* = .{ + .inner = reader_promise.inner, + .bg_alloc = sa, + .cell = cell, + .reader_pinned = &reader_pinned, + .release_reader = &release_reader, + .violations = &violations, + }; + try CheatHeader.spawnBest( + @intFromPtr(&Runtime.entryWrapper), + @as(CheatHeader.TaskFn, @ptrCast(&HeldGuardReaderCtx.run)), + reader_ctx, + .{ .stack_size = .Large }, + ); + + const writer_promise = try CheatLib.Promise(i64).spawn(sa, rt.getSched()); + const writer_ctx = try sa.create(RetireThenExitWriterCtx); + writer_ctx.* = .{ + .inner = writer_promise.inner, + .bg_alloc = sa, + .cell = cell, + .reader_pinned = &reader_pinned, + }; + try CheatHeader.spawnBest( + @intFromPtr(&Runtime.entryWrapper), + @as(CheatHeader.TaskFn, @ptrCast(&RetireThenExitWriterCtx.run)), + writer_ctx, + .{ .stack_size = .Large }, + ); + + try testing.expectEqual(@as(i64, 1), try writer_promise.next()); + + var i: usize = 0; + while (i < 12) : (i += 1) { + global_ebr.reclaim(test_alloc); + rt.currentEbr().reclaimLocal(test_alloc); + rt.checkYield(); + } + + release_reader.store(true, .release); + try testing.expectEqual(@as(i64, 3), try reader_promise.next()); + try testing.expectEqual(@as(usize, 0), violations.load(.seq_cst)); + } + }.body); +} + // ---------------------------------------------------------------------- // Variant: 4 BG readers x 100_000 iters each. Strictly more concurrent // EBR enter/exit pressure across more fibers. 
If the bug is a per-fiber @@ -467,7 +598,7 @@ test "Versioned: 4 BG fibers x 100_000 reads via scheduler -- broader concurrenc var i: usize = 0; while (i < 6) : (i += 1) { global_ebr.reclaim(test_alloc); - rt.ebr.reclaimLocal(test_alloc); + rt.currentEbr().reclaimLocal(test_alloc); } } @@ -528,8 +659,7 @@ test "Versioned: 4 BG fibers x 100_000 reads via scheduler -- broader concurrenc // // Key features the previous (stackful spawnBest) tests miss: // 1. The FSM body shares the main fiber's Runtime (`rt`). Both BG -// fibers + main fiber call `rt.ebr.enter()`/`rt.ebr.exit()` on -// the SAME ThreadLocalEbr. +// fibers + main fiber enter/exit EBR through Runtime.currentEbr(). // 2. The FSM body calls `rt.checkYield()` -> `coopYield()` -> // `getCurrent()` which returns the SCHEDULER's current_task, // not the FSM's task. The yield mechanics for FSM-on-worker- @@ -612,7 +742,7 @@ test "FSM Versioned: 2 BG-FSM fibers x 200_000 reads, single scheduler -- DEFAUL var i: usize = 0; while (i < 6) : (i += 1) { global_ebr.reclaim(test_alloc); - rt.ebr.reclaimLocal(test_alloc); + rt.currentEbr().reclaimLocal(test_alloc); } test_alloc.destroy(cell); } @@ -737,7 +867,7 @@ test "FSM Versioned CONTROL: same shape WITHOUT checkYield -- isolates the bug" var i: usize = 0; while (i < 6) : (i += 1) { global_ebr.reclaim(test_alloc); - rt.ebr.reclaimLocal(test_alloc); + rt.currentEbr().reclaimLocal(test_alloc); } test_alloc.destroy(cell); } @@ -786,17 +916,9 @@ test "FSM Versioned CONTROL: same shape WITHOUT checkYield -- isolates the bug" } // ---------------------------------------------------------------------- -// Diagnostic: confirm BG fibers do NOT register their EBR with the -// shared EbrContext. This is a separate concern from the checkYield -// bug, but it's a real footgun for any future EBR-pressure scenario: -// entryWrapper (runtime.zig:564-613) creates a Runtime with a fresh -// ThreadLocalEbr but never calls global_ctx.register(). No retire is -// safe across BG fibers because reclaim() can advance the global -// epoch without waiting for the BG fiber's local_epoch. -// -// (Note: this is not what the user's CLEAR program crashes on -- that -// crash is checkYield-from-FSM. This diagnostic flags an adjacent -// concern that, as far as I can see, also has no test coverage.) +// Diagnostic: confirm BG fibers use a registered scheduler-thread EBR. +// This guards the core safety invariant: reclaim() must observe a pinned +// BG fiber even though the task itself does not own an EBR slot. const RegProbeCtx = struct { inner: *CheatLib.Promise(usize).Inner, bg_alloc: std.mem.Allocator, @@ -813,15 +935,15 @@ const RegProbeCtx = struct { var g = ctx.cell.read(rt); defer g.release(); - // Walk the registry under its lock and count how many entries - // are registered. If this BG fiber's ebr is NOT in the - // registry, the count won't include it. + // Walk the registry under its lock and confirm the scheduler + // thread's EBR participant is registered. 
global_ebr.registry_lock.lock(); defer global_ebr.registry_lock.unlock(); var seen_self: usize = 0; + const current_ebr = rt.currentEbr(); for (global_ebr.registry.items) |entry| { - if (entry == rt.ebr) seen_self = 1; + if (entry == current_ebr) seen_self = 1; } ctx.ebr_count_inside.store(seen_self, .seq_cst); ctx.inner.result = global_ebr.registry.items.len; @@ -901,7 +1023,7 @@ test "EBR registration: BG fiber's pin must block reclaim advance (UAF guard)" { var i: usize = 0; while (i < 6) : (i += 1) { global_ebr.reclaim(test_alloc); - rt.ebr.reclaimLocal(test_alloc); + rt.currentEbr().reclaimLocal(test_alloc); } } @@ -966,7 +1088,7 @@ test "DIAGNOSTIC: BG fiber's ThreadLocalEbr IS registered with EbrContext" { var i: usize = 0; while (i < 6) : (i += 1) { global_ebr.reclaim(test_alloc); - rt.ebr.reclaimLocal(test_alloc); + rt.currentEbr().reclaimLocal(test_alloc); } } @@ -1010,8 +1132,7 @@ var ebr_diag_seen_self: usize = 99; // `versioned-stress-test.zig` (std.Thread workers, no scheduler), which // does NOT exercise: // - Cross-scheduler @parallel BG distribution (spawnFsmBest) -// - Scheduler-managed task.ebr_slot lifecycle (alloc on submitter, -// destroy on completer; potentially different schedulers) +// - Scheduler-thread EBR lifecycle under migratable tasks // - Versioned.update's per-write heap-alloc + EBR-retire under many // concurrent writers all racing the same cell // @@ -1085,7 +1206,7 @@ test "Versioned: 4 BG-FSM writers race the same cell -- bench-17 heap-corruption var i: usize = 0; while (i < 6) : (i += 1) { global_ebr.reclaim(test_alloc); - rt.ebr.reclaimLocal(test_alloc); + rt.currentEbr().reclaimLocal(test_alloc); } } @@ -1147,7 +1268,7 @@ test "Versioned: 32 BG-FSM writers race the same cell -- bench-17 scale-up repro var i: usize = 0; while (i < 6) : (i += 1) { global_ebr.reclaim(test_alloc); - rt.ebr.reclaimLocal(test_alloc); + rt.currentEbr().reclaimLocal(test_alloc); } } @@ -1208,7 +1329,7 @@ test "Versioned: 32 readers + 4 writers on 5 schedulers -- bench-17 mixed-load r var i: usize = 0; while (i < 6) : (i += 1) { global_ebr.reclaim(test_alloc); - rt.ebr.reclaimLocal(test_alloc); + rt.currentEbr().reclaimLocal(test_alloc); } } @@ -1288,7 +1409,7 @@ test "Versioned: 4 readers + 4 writers on 5 schedulers -- writer-heavy repro" { var i: usize = 0; while (i < 6) : (i += 1) { global_ebr.reclaim(test_alloc); - rt.ebr.reclaimLocal(test_alloc); + rt.currentEbr().reclaimLocal(test_alloc); } } @@ -1368,12 +1489,9 @@ test "Versioned: 4 readers + 4 writers on 5 schedulers -- writer-heavy repro" { // }; // try CheatHeader.spawnFsmBest(ctx.task); <-- cross-scheduler distribution! // -// The bug: when these FSMs dispatch on a WORKER scheduler thread, calling -// `rt.ebr.enter() / .exit() / .retire()` accesses the MAIN scheduler's -// per-thread EBR data structures from a different OS thread. ThreadLocalEbr -// is non-thread-safe (limbo lists are plain ArrayList(usize), epoch is -// non-atomic in the slow paths), so this corrupts the limbo and surfaces -// as `realloc(): invalid old size` in glibc when reclaim() walks the list. +// The bug: when these FSMs dispatch on a WORKER scheduler thread, EBR must +// resolve to that worker scheduler's thread EBR, not to the spawning runtime's +// fallback slot. const FsmWriterCtx = struct { task: *CheatHeader.FsmTask, rt: *Runtime, @@ -1422,17 +1540,9 @@ const FsmWriterCtx = struct { // Bench-17 EXACT shape repro. 
The previous writer-stress tests used // stackful `spawnBest` and passed -- they didn't capture the FSM-on-worker -// Bench-17 EXACT shape repro using the per-task Runtime pattern: each -// FSM ctx gets its own Runtime backed by a per-task ThreadLocalEbr -// (allocFsmTaskRuntime). This is the shape the CLEAR codegen emits. -// -// Without the per-task rt, glibc heap aborts with `realloc(): invalid -// old size` after enough iterations: writer FSMs dispatch on worker -// threads but call `rt.ebr.retire(...)` on the main rt's ebr, corrupting -// the non-thread-safe limbo list. With the runtime-side fix -// (FsmTask.ebr_slot/task_runtime + Scheduler.releaseFsmTaskEbr in -// drainFsmQueue's .Done branch), this test passes deterministically. -test "FSM Versioned: 4 BG-FSM writers via spawnFsmBest with per-task rt -- bench-17 fix verifier" { +// shape. Each FSM ctx still gets its own Runtime shell because codegen stores +// an `rt` pointer, but EBR resolves through Runtime.currentEbr() at dispatch. +test "FSM Versioned: 4 BG-FSM writers via spawnFsmBest with Runtime.currentEbr -- bench-17 fix verifier" { stack_pool = StackPool.init(test_alloc); defer stack_pool.deinit(); @@ -1447,7 +1557,7 @@ test "FSM Versioned: 4 BG-FSM writers via spawnFsmBest with per-task rt -- bench var i: usize = 0; while (i < 6) : (i += 1) { global_ebr.reclaim(test_alloc); - rt.ebr.reclaimLocal(test_alloc); + rt.currentEbr().reclaimLocal(test_alloc); } } @@ -1462,7 +1572,7 @@ test "FSM Versioned: 4 BG-FSM writers via spawnFsmBest with per-task rt -- bench const ctx = try sa.create(FsmWriterCtx); ctx.* = .{ .task = undefined, - .rt = undefined, // rebound below to per-task rt + .rt = undefined, // rebound below to per-task Runtime shell .inner = promises[i].inner, .alloc = sa, .cell = &cell, @@ -1472,8 +1582,8 @@ test "FSM Versioned: 4 BG-FSM writers via spawnFsmBest with per-task rt -- bench }; ctx.task = try CheatHeader.allocFsmTask(rt, &FsmWriterCtx.resumeFn); ctx.task.ctx = ctx; ctx.task.destroy_fn = &FsmWriterCtx.destroyTask; - // The fix: allocate a per-task Runtime + ebr_slot before - // spawning. The scheduler frees both on .Done. + // Allocate a per-task Runtime shell before spawning. The + // scheduler frees it on .Done. const task_rt = try CheatHeader.allocFsmTaskRuntime(ctx.task, rt); ctx.rt = task_rt; try CheatHeader.spawnFsmBest(ctx.task); @@ -1485,11 +1595,9 @@ test "FSM Versioned: 4 BG-FSM writers via spawnFsmBest with per-task rt -- bench }.body); } -// Higher writer pressure on more schedulers using the per-task Runtime -// fix. Validates that the fix scales: more cross-thread dispatch + more -// retire pressure on more schedulers, and we still don't corrupt the -// heap because each task's EBR ops route through its own slot. -test "FSM Versioned: 8 BG-FSM writers via spawnFsmBest on 5 schedulers (per-task rt) -- bench-17 fix scale" { +// Higher writer pressure on more schedulers using per-task Runtime shells. +// Validates that Runtime.currentEbr() scales under cross-thread dispatch. 
+test "FSM Versioned: 8 BG-FSM writers via spawnFsmBest on 5 schedulers (Runtime.currentEbr) -- bench-17 fix scale" { stack_pool = StackPool.init(test_alloc); defer stack_pool.deinit(); @@ -1504,7 +1612,7 @@ test "FSM Versioned: 8 BG-FSM writers via spawnFsmBest on 5 schedulers (per-task var i: usize = 0; while (i < 6) : (i += 1) { global_ebr.reclaim(test_alloc); - rt.ebr.reclaimLocal(test_alloc); + rt.currentEbr().reclaimLocal(test_alloc); } } @@ -1519,7 +1627,7 @@ test "FSM Versioned: 8 BG-FSM writers via spawnFsmBest on 5 schedulers (per-task const ctx = try sa.create(FsmWriterCtx); ctx.* = .{ .task = undefined, - .rt = undefined, // rebound below to per-task rt + .rt = undefined, // rebound below to per-task Runtime shell .inner = promises[i].inner, .alloc = sa, .cell = &cell, @@ -1540,16 +1648,14 @@ test "FSM Versioned: 8 BG-FSM writers via spawnFsmBest on 5 schedulers (per-task }.body); } -// Gap 1: structural invariants for the per-task Runtime fix. +// Gap 1: structural invariants for the FSM Runtime shell + thread EBR fix. // // The bench-17 fix verifier above (and its 8-writer scale-up) prove // the fix at the *behavioral* level: the heap doesn't abort. This -// test pins the *structural* property that makes the fix work: each -// FSM task's Runtime is genuinely separate from the parent rt and -// from every other FSM task's rt. If a regression ever causes -// allocFsmTaskRuntime to be skipped (so ctx.rt aliases the parent) -// or aliased across tasks (shared per-task rt), this test fires -// before any concurrent work runs. +// test pins the *structural* properties that make the fix work: +// each FSM task has a separate Runtime shell, but none allocates a +// task-local EBR participant. MVCC operations resolve EBR through +// Runtime.currentEbr(), i.e. the active scheduler thread's registered slot. // // The behavioral test catches the bug only stochastically; the // glibc abort fires after enough iterations corrupt the limbo. This @@ -1558,22 +1664,17 @@ test "FSM Versioned: 8 BG-FSM writers via spawnFsmBest on 5 schedulers (per-task // // Invariants checked (all before any FSM task starts running): // I1 every per-task Runtime pointer is unique -// I2 every per-task ThreadLocalEbr pointer is unique +// I2 registry size does not grow with task count // I3 no per-task Runtime aliases the parent rt -// I4 no per-task ThreadLocalEbr aliases the parent rt's ebr -// I5 every per-task ebr is registered in the shared EbrContext -// (count grew by exactly N) +// I4 runtime shells keep only the non-scheduler fallback ebr // // And after all tasks complete: -// I6 the registry has shrunk back to its pre-spawn size (each -// per-task ebr was unregistered by releaseFsmTaskEbr) +// I5 the registry size is unchanged // -// I6 is the load-bearing invariant for the fix's cleanup half. The -// fix has two parts: (a) allocate per-task runtime, (b) release it -// when the FSM task is .Done. If (b) regresses, ebrs leak; the -// DebugAllocator catches the heap leak, but I6 catches the *logic* -// directly so the failure is interpretable. -test "FSM Versioned: per-task Runtime structural separation -- bench-17 fix invariants" { +// The fix has two parts: (a) allocate per-task runtime shell, (b) keep EBR +// scheduler-local. If either regresses, this catches it before relying on +// stochastic heap corruption. 
+test "FSM Versioned: Runtime shells use scheduler-thread EBR -- bench-17 fix invariants" { stack_pool = StackPool.init(test_alloc); defer stack_pool.deinit(); @@ -1588,12 +1689,11 @@ test "FSM Versioned: per-task Runtime structural separation -- bench-17 fix inva var i: usize = 0; while (i < 6) : (i += 1) { global_ebr.reclaim(test_alloc); - rt.ebr.reclaimLocal(test_alloc); + rt.currentEbr().reclaimLocal(test_alloc); } } - // Snapshot the registry size BEFORE we allocate any per-task - // ebrs. We compare back to this number after cleanup. + // Snapshot the registry size before allocating Runtime shells. global_ebr.registry_lock.lock(); const initial_registry: usize = global_ebr.registry.items.len; global_ebr.registry_lock.unlock(); @@ -1605,7 +1705,6 @@ test "FSM Versioned: per-task Runtime structural separation -- bench-17 fix inva const sa = rt.getSched().allocator; var promises: [N]CheatLib.Promise(usize) = undefined; var task_rts: [N]*Runtime = undefined; - var task_ebrs: [N]*@import("../lib/ebr.zig").ThreadLocalEbr = undefined; var ctxs: [N]*FsmWriterCtx = undefined; // Phase 1: allocate all per-task Runtimes BEFORE spawning any. @@ -1629,7 +1728,6 @@ test "FSM Versioned: per-task Runtime structural separation -- bench-17 fix inva const task_rt = try CheatHeader.allocFsmTaskRuntime(ctxs[i].task, rt); ctxs[i].rt = task_rt; task_rts[i] = task_rt; - task_ebrs[i] = task_rt.ebr; } // I1: per-task Runtimes are pairwise distinct. @@ -1638,32 +1736,23 @@ test "FSM Versioned: per-task Runtime structural separation -- bench-17 fix inva try testing.expect(task_rts[i] != task_rts[j]); } } - // I2: per-task ThreadLocalEbrs are pairwise distinct. - for (0..N) |i| { - for (i + 1..N) |j| { - try testing.expect(task_ebrs[i] != task_ebrs[j]); - } - } - // I3: no per-task Runtime aliases the parent. - for (0..N) |i| try testing.expect(task_rts[i] != rt); - // I4: no per-task ebr aliases the parent's ebr. - for (0..N) |i| try testing.expect(task_ebrs[i] != rt.ebr); - - // I5: every per-task ebr is registered with the shared - // EbrContext. Registry grew by exactly N from pre-spawn. + // I2: registry does not grow with task count. global_ebr.registry_lock.lock(); const after_alloc_registry = global_ebr.registry.items.len; global_ebr.registry_lock.unlock(); - try testing.expectEqual(initial_registry + N, after_alloc_registry); + try testing.expectEqual(initial_registry, after_alloc_registry); + // I3: no per-task Runtime aliases the parent. + for (0..N) |i| try testing.expect(task_rts[i] != rt); + // I4: runtime shells use the parent's fallback ebr pointer, but + // scheduler execution ignores it and uses currentEbr(). + for (0..N) |i| try testing.expect(task_rts[i].ebr == rt.ebr); // Phase 2: spawn the FSMs and wait for completion. for (0..N) |i| try CheatHeader.spawnFsmBest(ctxs[i].task); for (&promises) |*p| _ = try p.next(); _ = failed.load(.seq_cst); - // I6: every per-task ebr was unregistered by - // releaseFsmTaskEbr in drainFsmQueue's .Done branch. - // Registry has returned to its pre-spawn size. + // I5: task completion did not add or leak EBR participants. 
global_ebr.registry_lock.lock(); const after_done_registry = global_ebr.registry.items.len; global_ebr.registry_lock.unlock(); diff --git a/zig/runtime/versioned-loom-test.zig b/zig/runtime/versioned-loom-test.zig index d880e6cc..ed12e3d0 100644 --- a/zig/runtime/versioned-loom-test.zig +++ b/zig/runtime/versioned-loom-test.zig @@ -27,14 +27,154 @@ const std = @import("std"); const testing = std.testing; +const fc = @import("fiber-core.zig"); +const root = @import("root"); +const sim_atomic = if (@hasDecl(root, "SimAtomicState")) root.SimAtomicState else @import("vopr-atomic.zig"); const ebr_mod = @import("../lib/ebr.zig"); const versioned = @import("versioned.zig"); const Runtime = @import("runtime.zig").Runtime; +const scheduler_mod = @import("scheduler.zig"); const EbrContext = ebr_mod.EbrContext; const ThreadLocalEbr = ebr_mod.ThreadLocalEbr; +const Fiber = fc.Fiber; +const Context = fc.Context; +const STACK_SIZE = 64 * 1024; +const MAX_STEPS = 10_000; + +var pin_harness: *PinDepthLoomHarness = undefined; +var wake_harness: *WakeGateLoomHarness = undefined; + +const PinDepthLoomHarness = struct { + fibers: [2]Fiber = undefined, + stacks: [2][]u8 = [_][]u8{ &.{}, &.{} }, + main_ctx: Context = undefined, + done: [2]bool = [_]bool{ false, false }, + schedule: []const u8, + pos: usize = 0, + allocator: std.mem.Allocator, + ctx: EbrContext = .{}, + local: ThreadLocalEbr, + violation: bool = false, + observed_pinned: bool = false, + observed_inner_window: bool = false, + outer_hold_window: bool = false, + + fn init(allocator: std.mem.Allocator, schedule: []const u8) PinDepthLoomHarness { + var h = PinDepthLoomHarness{ + .schedule = schedule, + .allocator = allocator, + .local = ThreadLocalEbr{ .context = undefined }, + }; + h.local.context = &h.ctx; + return h; + } + + fn deinit(self: *PinDepthLoomHarness) void { + fc.__fiber = null; + fc.__fiber_parent_ctx = null; + fc.__fiber_stack_limit = null; + self.local.deinit(self.allocator); + self.ctx.deinit(self.allocator); + for (&self.stacks) |*stack| { + if (stack.len > 0) { + self.allocator.free(stack.*); + stack.* = &.{}; + } + } + } + + fn createThread(self: *PinDepthLoomHarness, id: usize, entry_fn: usize) !void { + self.stacks[id] = try self.allocator.alloc(u8, STACK_SIZE); + self.fibers[id] = Fiber.init(self.stacks[id], entry_fn, .Large); + self.done[id] = false; + } + + fn pickThread(self: *PinDepthLoomHarness) usize { + if (self.done[0]) return 1; + if (self.done[1]) return 0; + const bit = if (self.pos < self.schedule.len) self.schedule[self.pos] else 0; + self.pos += 1; + return bit & 1; + } + + fn run(self: *PinDepthLoomHarness) !void { + var steps: usize = 0; + while (steps < MAX_STEPS) : (steps += 1) { + if (self.done[0] and self.done[1]) break; + const chosen = self.pickThread(); + self.fibers[chosen].switchTo(&self.main_ctx); + } + fc.__fiber = null; + fc.__fiber_parent_ctx = null; + fc.__fiber_stack_limit = null; + if (steps >= MAX_STEPS) return error.StepLimitExceeded; + if (self.violation) return error.PinDepthInactiveWhilePinned; + } + + fn observe(self: *PinDepthLoomHarness) void { + const depth = self.local.pin_depth.load(.seq_cst); + const active = self.local.is_active.load(.seq_cst); + if (depth > 0) { + self.observed_pinned = true; + if (self.outer_hold_window) { + if (!active) self.violation = true; + if (depth == 1) self.observed_inner_window = true; + } + } + } +}; + +fn entryNestedPinReader() callconv(.c) void { + const h = pin_harness; + h.local.enter(); + h.local.enter(); + h.local.exit(); + h.outer_hold_window = 
true; + + // Keep the inner-exit/outer-still-held window open long enough for + // exhaustive schedules to observe it. These are explicit Loom thread + // yields, not production retries/timers/IO. + fc.__fiber.?.yield(); + fc.__fiber.?.yield(); + + h.outer_hold_window = false; + h.local.exit(); + h.done[0] = true; + fc.__fiber.?.yield(); + unreachable; +} + +fn entryPinObserver() callconv(.c) void { + const h = pin_harness; + var i: usize = 0; + while (i < 8) : (i += 1) { + h.observe(); + fc.__fiber.?.yield(); + } + h.done[1] = true; + fc.__fiber.?.yield(); + unreachable; +} + +fn runPinDepthSchedule(allocator: std.mem.Allocator, schedule: []const u8) !PinDepthLoomHarness { + var h = PinDepthLoomHarness.init(allocator, schedule); + errdefer h.deinit(); + pin_harness = &h; + try h.createThread(0, @intFromPtr(&entryNestedPinReader)); + try h.createThread(1, @intFromPtr(&entryPinObserver)); + try h.run(); + return h; +} + +fn fillBinarySchedule(buf: []u8, value: usize) void { + for (buf, 0..) |*slot, i| { + slot.* = @intCast((value >> @intCast(i)) & 1); + } +} + test "Loom-shim sanity: shared-memory.Atomic resolves to std.atomic.Value when SimAtomic absent" { // No `pub const SimAtomic = ...` at root, so the comptime // resolution should land on std.atomic.Value(*T). Verify by @@ -57,6 +197,178 @@ test "Loom-shim sanity: ebr.Atomic resolves to std.atomic.Value when SimAtomic a _ = x.cmpxchgWeak(@as(u32, 7), @as(u32, 8), .release, .monotonic); } +pub fn testNestedEbrPinDepthLoom(allocator: std.mem.Allocator, require_sim_atomic: bool) !void { + const before_ops = sim_atomic.sim_atomic_op_count; + var saw_pinned = false; + var saw_inner_window = false; + + var schedule: [12]u8 = undefined; + var n: usize = 0; + while (n < (1 << schedule.len)) : (n += 1) { + fillBinarySchedule(&schedule, n); + var h = try runPinDepthSchedule(allocator, &schedule); + saw_pinned = saw_pinned or h.observed_pinned; + saw_inner_window = saw_inner_window or h.observed_inner_window; + h.deinit(); + } + + if (require_sim_atomic and sim_atomic.sim_atomic_op_count == before_ops) { + return error.SimAtomicNotActive; + } + if (!saw_pinned) return error.PinDepthPinnedWindowNotObserved; + if (!saw_inner_window) return error.PinDepthInnerExitWindowNotObserved; +} + +test "loom: nested EBR pin keeps is_active true until final exit" { + // Under `zig test`, @import("root") is the generated test runner, + // so this is a structural fallback. The real SimAtomic-backed run is + // the `versioned-loom-test` executable wired in build.zig. 
+ try testNestedEbrPinDepthLoom(testing.allocator, false); +} + +const WakeGateLoomHarness = struct { + fibers: [2]Fiber = undefined, + stacks: [2][]u8 = [_][]u8{ &.{}, &.{} }, + main_ctx: Context = undefined, + done: [2]bool = [_]bool{ false, false }, + schedule: []const u8, + pos: usize = 0, + allocator: std.mem.Allocator, + event: scheduler_mod.SmartEventFd = .{ .fd = -1 }, + work_available: bool = false, + scheduler_blocked: bool = false, + writes: u32 = 0, + violation: bool = false, + consumed_prepark_notify: bool = false, + + fn init(allocator: std.mem.Allocator, schedule: []const u8) WakeGateLoomHarness { + return .{ .schedule = schedule, .allocator = allocator }; + } + + fn deinit(self: *WakeGateLoomHarness) void { + fc.__fiber = null; + fc.__fiber_parent_ctx = null; + fc.__fiber_stack_limit = null; + for (&self.stacks) |*stack| { + if (stack.len > 0) { + self.allocator.free(stack.*); + stack.* = &.{}; + } + } + } + + fn createThread(self: *WakeGateLoomHarness, id: usize, entry_fn: usize) !void { + self.stacks[id] = try self.allocator.alloc(u8, STACK_SIZE); + self.fibers[id] = Fiber.init(self.stacks[id], entry_fn, .Large); + self.done[id] = false; + } + + fn pickThread(self: *WakeGateLoomHarness) usize { + if (self.done[0]) return 1; + if (self.done[1]) return 0; + const bit = if (self.pos < self.schedule.len) self.schedule[self.pos] else 0; + self.pos += 1; + return bit & 1; + } + + fn run(self: *WakeGateLoomHarness) !void { + var steps: usize = 0; + while (steps < MAX_STEPS) : (steps += 1) { + if (self.done[0] and self.done[1]) break; + const chosen = self.pickThread(); + self.fibers[chosen].switchTo(&self.main_ctx); + } + fc.__fiber = null; + fc.__fiber_parent_ctx = null; + fc.__fiber_stack_limit = null; + if (steps >= MAX_STEPS) return error.StepLimitExceeded; + if (self.violation) return error.WakeGateMissedWake; + } +}; + +fn entryWakeScheduler() callconv(.c) void { + const h = wake_harness; + const should_sleep = h.event.prepareSleep(); + if (!should_sleep) { + h.consumed_prepark_notify = true; + h.done[0] = true; + fc.__fiber.?.yield(); + unreachable; + } + + // Window 1: producer may notify after prepareSleep and before the + // scheduler's last work check. + fc.__fiber.?.yield(); + if (h.work_available) { + h.event.finishSleep(); + h.done[0] = true; + fc.__fiber.?.yield(); + unreachable; + } + + // Window 2: producer may notify after the last work check but before + // the scheduler enters the blocking syscall. It must observe Parked + // and request an eventfd write. + fc.__fiber.?.yield(); + h.scheduler_blocked = true; + + // Window 3: producer may notify while the scheduler is logically + // blocked. It must request an eventfd write. 
+ fc.__fiber.?.yield(); + h.event.finishSleep(); + h.scheduler_blocked = false; + h.done[0] = true; + fc.__fiber.?.yield(); + unreachable; +} + +fn entryWakeProducer() callconv(.c) void { + const h = wake_harness; + fc.__fiber.?.yield(); + h.work_available = true; + if (h.event.armNotify()) h.writes += 1; + if (h.scheduler_blocked and h.writes == 0) h.violation = true; + h.done[1] = true; + fc.__fiber.?.yield(); + unreachable; +} + +fn runWakeGateSchedule(allocator: std.mem.Allocator, schedule: []const u8) !WakeGateLoomHarness { + var h = WakeGateLoomHarness.init(allocator, schedule); + errdefer h.deinit(); + wake_harness = &h; + try h.createThread(0, @intFromPtr(&entryWakeScheduler)); + try h.createThread(1, @intFromPtr(&entryWakeProducer)); + try h.run(); + return h; +} + +pub fn testSchedulerWakeGateLoom(allocator: std.mem.Allocator, require_sim_atomic: bool) !void { + const before_ops = sim_atomic.sim_atomic_op_count; + var saw_blocked_write = false; + var saw_prepark_token = false; + + var schedule: [10]u8 = undefined; + var n: usize = 0; + while (n < (1 << schedule.len)) : (n += 1) { + fillBinarySchedule(&schedule, n); + var h = try runWakeGateSchedule(allocator, &schedule); + saw_blocked_write = saw_blocked_write or h.writes > 0; + saw_prepark_token = saw_prepark_token or h.consumed_prepark_notify; + h.deinit(); + } + + if (require_sim_atomic and sim_atomic.sim_atomic_op_count == before_ops) { + return error.SimAtomicNotActive; + } + if (!saw_blocked_write) return error.WakeGateParkedNotifyWindowNotObserved; + if (!saw_prepark_token) return error.WakeGatePreparkNotifyWindowNotObserved; +} + +test "loom: scheduler wake gate does not miss notify around park" { + try testSchedulerWakeGateLoom(testing.allocator, false); +} + // Smoke test: full Versioned(T) + EBR lifecycle with the shim in place. // Same test as in T1 but routed through the shim — proves the shim // doesn't break the protocol under the default (real-atomic) build. 
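The three yield windows in `entryWakeScheduler` encode the park/notify contract between the scheduler and a producer. A minimal sketch of that contract, outside this patch: it assumes only the `prepareSleep`/`finishSleep`/`armNotify` calls already used by the harness, elides the blocking eventfd read as a comment because that call is not shown here, and the helper names `parkUntilWork`/`publishWork` are hypothetical, not functions in this change.

    // Sketch only: restates the wake-gate contract the schedules above exhaust.
    // `SmartEventFd` is the type the harness imports as scheduler_mod.SmartEventFd;
    // the helpers themselves are hypothetical and not part of this change.
    fn parkUntilWork(event: *SmartEventFd, hasWork: *const fn () bool) void {
        // Announce intent to park; `false` means a notify already landed pre-park.
        if (!event.prepareSleep()) return;
        // Re-check for work after announcing Parked: a producer that published
        // before this check is obliged to call armNotify(), so either the work
        // is visible here or the eventfd write ends the wait below.
        if (hasWork()) {
            event.finishSleep();
            return;
        }
        // ... block on the eventfd here; any armNotify() issued while Parked
        // must perform the write that ends the wait ...
        event.finishSleep();
    }

    fn publishWork(event: *SmartEventFd, push: *const fn () void) void {
        push(); // make the work visible first
        _ = event.armNotify(); // then wake a possibly-parked scheduler
    }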
diff --git a/zig/runtime/versioned-test.zig b/zig/runtime/versioned-test.zig index 56f7b1ee..52c507ee 100644 --- a/zig/runtime/versioned-test.zig +++ b/zig/runtime/versioned-test.zig @@ -374,6 +374,31 @@ test "Versioned(i64): read returns the current pointer + EBR is active during th try testing.expect(!rt.ebr.is_active.load(.seq_cst)); } +test "Versioned(i64): nested read guards keep EBR pinned until the outer guard releases" { + var ctx = EbrContext{}; + defer ctx.deinit(testing.allocator); + + var frame: [1024]u8 = undefined; + var rt = try makeRuntime(&ctx, &frame); + defer rt.deinit(); + + var s = try versioned.Versioned(i64).init(testing.allocator, 100); + defer s.deinit(&rt, testing.allocator) catch unreachable; + + var outer = s.read(&rt); + var inner = s.read(&rt); + try testing.expect(rt.ebr.is_active.load(.seq_cst)); + try testing.expectEqual(@as(u32, 2), rt.ebr.pin_depth.load(.seq_cst)); + + inner.release(); + try testing.expect(rt.ebr.is_active.load(.seq_cst)); + try testing.expectEqual(@as(u32, 1), rt.ebr.pin_depth.load(.seq_cst)); + + outer.release(); + try testing.expect(!rt.ebr.is_active.load(.seq_cst)); + try testing.expectEqual(@as(u32, 0), rt.ebr.pin_depth.load(.seq_cst)); +} + const incArgs = struct { delta: i64 }; fn incBy(p: *i64, delta: i64) void { p.* += delta; diff --git a/zig/runtime/versioned.zig b/zig/runtime/versioned.zig index 70af9e47..186e2174 100644 --- a/zig/runtime/versioned.zig +++ b/zig/runtime/versioned.zig @@ -1,5 +1,6 @@ const std = @import("std"); const Runtime = @import("runtime.zig").Runtime; +const ThreadLocalEbr = @import("../lib/ebr.zig").ThreadLocalEbr; const compat = @import("../lib/compat.zig"); const rt_profile = @import("runtime-header.zig"); const mvcc_profile = @import("mvcc-profile.zig"); @@ -173,7 +174,7 @@ pub fn Versioned(comptime T: type) type { // by the caller immediately after this call returns. pub fn deinit(self: *Self, trt: *Runtime, allocator: std.mem.Allocator) !void { const current_ptr: *T = @ptrFromInt(addrUntag(self.ptr.load(.acquire))); - try trt.ebr.retire(allocator, current_ptr); + try trt.currentEbr().retire(allocator, current_ptr); } // B1 fix (2026-04-30): cleanup variant for `Arc(Versioned(T))`. @@ -206,7 +207,8 @@ pub fn Versioned(comptime T: type) type { // through the returned Guard. pub fn read(self: *Self, trt: *Runtime) Guard { // A. Signal start - trt.ebr.enter(); + const ebr = trt.currentEbr(); + ebr.enter(); // B. Load pointer (Safe because we are in the epoch). // Acquire-paired with the cmpxchg .release in update(). @@ -218,7 +220,7 @@ pub fn Versioned(comptime T: type) type { if (rt_profile.CLEAR_PROFILE) { mvcc_profile.recordRead(@intFromPtr(self), @sizeOf(T)); } - return Guard{ .ptr = val, .rt = trt }; + return Guard{ .ptr = val, .ebr = ebr }; } // H3: closure-based read API that auto-releases via defer. @@ -240,7 +242,7 @@ pub fn Versioned(comptime T: type) type { // The Read Guard pub const Guard = struct { ptr: *T, - rt: *Runtime, + ebr: *ThreadLocalEbr, pub fn get(self: *Guard) *T { return self.ptr; @@ -248,7 +250,7 @@ pub fn Versioned(comptime T: type) type { pub fn release(self: *Guard) void { // C. Signal done - self.rt.ebr.exit(); + self.ebr.exit(); } }; @@ -290,8 +292,9 @@ pub fn Versioned(comptime T: type) type { // forces reclaim's global_epoch to stop at this thread's // local until update() returns, so any old_ptr we observe // via `self.ptr.load` is alive throughout the memcpy + CAS. 
- trt.ebr.enter(); - defer trt.ebr.exit(); + const ebr = trt.currentEbr(); + ebr.enter(); + defer ebr.exit(); var retries: usize = 0; while (retries < MAX_UPDATE_RETRIES) : (retries += 1) { @@ -337,7 +340,7 @@ pub fn Versioned(comptime T: type) type { // === SUCCESS PATH === — disarm the defer cleanup, // retire the old pointer for EBR-deferred free. success = true; - try trt.ebr.retire(allocator, old_ptr); + try ebr.retire(allocator, old_ptr); if (rt_profile.CLEAR_PROFILE) { mvcc_profile.recordUpdate(@intFromPtr(self), @sizeOf(T), retries, true); } @@ -522,8 +525,9 @@ pub fn updateMulti( // 5. Pin EBR. The user's txn body reads from the captured // snapshots (which are still valid because EBR retirement // only fires after every active epoch drains). - trt.ebr.enter(); - defer trt.ebr.exit(); + const ebr = trt.currentEbr(); + ebr.enter(); + defer ebr.exit(); // 6. Copy each captured snapshot into the new_node so the user // starts from the latest committed state for that cell. @@ -556,7 +560,7 @@ pub fn updateMulti( inline for (0..N) |i| { const T = @TypeOf(cells[i].*).Inner; const old_node: *T = @ptrFromInt(snap_addrs[i]); - try trt.ebr.retire(allocator, old_node); + try ebr.retire(allocator, old_node); // AtomicPtr M3.16: record per-cell that THIS commit was // multi-cell. The doctor uses `multi_commits > 0` to // disqualify the cell from the @shared:versioned -> diff --git a/zig/versioned-loom-test.zig b/zig/versioned-loom-test.zig index bc9a865a..912ff201 100644 --- a/zig/versioned-loom-test.zig +++ b/zig/versioned-loom-test.zig @@ -1,5 +1,21 @@ pub const CLEAR_FRAME_DEBUG = false; +pub const SimAtomicState = @import("runtime/vopr-atomic.zig"); +pub const SimAtomic = SimAtomicState.SimAtomic; +const std = @import("std"); +const versioned_loom = @import("runtime/versioned-loom-test.zig"); + +pub fn main() !void { + const before = SimAtomicState.sim_atomic_op_count; + try versioned_loom.testNestedEbrPinDepthLoom(std.heap.c_allocator, true); + try versioned_loom.testSchedulerWakeGateLoom(std.heap.c_allocator, true); + const delta = SimAtomicState.sim_atomic_op_count - before; + std.debug.print( + "versioned loom: nested EBR pin-depth + scheduler wake gate passed ({d} sim ops)\n", + .{delta}, + ); +} + test { _ = @import("runtime/versioned-loom-test.zig"); }
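The `Guard` now carries the `*ThreadLocalEbr` returned by `trt.currentEbr()`, whose definition is not part of this diff, and both the Loom schedules and the unit test above check the same invariant: `is_active` flips only when `pin_depth` crosses zero. A minimal sketch of nested pin counting that satisfies that invariant, assuming `ThreadLocalEbr` exposes `pin_depth` and `is_active` as atomics (as the unit test reads them); this is an illustration with simplified orderings, not the actual `ebr.zig` implementation.

    // Sketch only: nested pin counting consistent with the tests above,
    // not the actual ThreadLocalEbr implementation (orderings simplified).
    pub fn enter(self: *ThreadLocalEbr) void {
        // Only the outermost enter() publishes the pin.
        if (self.pin_depth.fetchAdd(1, .seq_cst) == 0) {
            self.is_active.store(true, .seq_cst);
        }
    }

    pub fn exit(self: *ThreadLocalEbr) void {
        // Only the matching outermost exit() clears it, so an inner
        // Guard.release() leaves the outer read still protected.
        if (self.pin_depth.fetchSub(1, .seq_cst) == 1) {
            self.is_active.store(false, .seq_cst);
        }
    }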