diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 4484741..ed0a5fc 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -13,7 +13,11 @@ jobs:
     strategy:
       matrix:
         include:
-          - { name: gcc-release, preset: release, cc: gcc, cxx: g++ }
+          # gcc-release builds the shipped CPU config: every ggml-cpu ISA variant
+          # (GGML_CPU_ALL_VARIANTS), runtime-dispatched -- so released binaries get
+          # AVX-512 without -march=native (which Nix strips and which mis-targets
+          # cross-host builds).
+          - { name: gcc-release, preset: release-portable, cc: gcc, cxx: g++ }
           - { name: clang-debug-san, preset: debug, cc: clang, cxx: clang++ }
     name: ${{ matrix.name }}
     runs-on: ubuntu-latest
@@ -32,13 +36,24 @@ jobs:
         run: cmake --preset ${{ matrix.preset }} -DGGML_NATIVE=OFF
         env: { CC: '${{ matrix.cc }}', CXX: '${{ matrix.cxx }}' }
       - name: build
-        run: cmake --build --preset ${{ matrix.preset }} -j
+        # -j4 (not unbounded -j): release-portable compiles 14 ggml-cpu ISA
+        # variants; uncapped parallelism OOMs the 16 GB runner.
+        run: cmake --build --preset ${{ matrix.preset }} -j4
       - name: test (model-independent)
         run: ctest --preset ${{ matrix.preset }} -LE model
       - name: unicode table regen check
         run: |
           python3 scripts/gen_unicode.py > /tmp/unicode_data.inc
           diff -u src/unicode_data.inc /tmp/unicode_data.inc
+      - name: assert SIMD compiled in (guard the SSE-only trap)
+        if: matrix.name == 'gcc-release'
+        # The AVX-512 variant must actually contain AVX-512 (catches a silent SIMD-less
+        # build, e.g. a stripped -march). `|| true`: grep -c exits 1 on zero matches,
+        run: |
+          so=build/release-portable/bin/libggml-cpu-skylakex.so
+          n=$(objdump -d "$so" | grep -c '%zmm' || true)  # ...so keep going and log the count
+          echo "skylakex %zmm instructions: $n"
+          test "$n" -gt 0
 
   # Tier 2 (nightly / dispatch): HF reference fixtures + parity + fuzz smoke.
   # Needs the model checkpoint; cached between runs.
@@ -92,9 +107,9 @@ jobs:
             --model ~/models/privacy-filter-multilingual \
             --outfile ~/ggufs/pf-f32.gguf --outtype f32
       - name: build
-        run: cmake --preset release -DGGML_NATIVE=OFF && cmake --build --preset release -j
+        run: cmake --preset release-portable && cmake --build --preset release-portable -j4
       - name: parity suite
-        run: PF_GGUF_DIR=~/ggufs ctest --preset release -L model
+        run: PF_GGUF_DIR=~/ggufs ctest --preset release-portable -L model
       - name: fuzz smoke (5 min/target)
         run: |
           cmake --preset fuzz && cmake --build --preset fuzz -j --target fuzz_tokenizer fuzz_gguf
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3225c26..64580ee 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -60,6 +60,17 @@ if (PF_BUILD_TOOLS)
     add_executable(pf-bench bench/pf-bench.cpp)
     target_link_libraries(pf-bench PRIVATE pf)
     target_include_directories(pf-bench PRIVATE src)
+
+    # GFLOP/s by dtype/shape -- diagnostic for the CPU matmul analysis
+    # (docs/cpu-perf.md). Links ggml directly; no pf/model deps.
+    add_executable(pf-gemm-bench bench/gemm_microbench.cpp)
+    target_link_libraries(pf-gemm-bench PRIVATE ggml)
+
+    # Prototype: O(n*band) block-local attention == full masked attention
+    # (bit-identical), with an O(n*B) mask instead of O(n^2). Proof-of-concept
+    # for de-windowing; see docs/cpu-perf.md. Args: [block] [tokens].
+    add_executable(pf-banded-proto bench/banded_attn_proto.cpp)
+    target_link_libraries(pf-banded-proto PRIVATE ggml)
 endif()
 
 if (PF_FUZZ)
diff --git a/CMakePresets.json b/CMakePresets.json
index b098df2..32e58e2 100644
--- a/CMakePresets.json
+++ b/CMakePresets.json
@@ -16,6 +16,18 @@
       "binaryDir": "${sourceDir}/build/release",
       "cacheVariables": { "CMAKE_BUILD_TYPE": "Release" }
     },
+    {
+      "name": "release-portable",
+      "binaryDir": "${sourceDir}/build/release-portable",
+      "description": "Portable + fast CPU: build every ggml-cpu ISA variant and pick the best at runtime. Avoids -march=native (fragile, and stripped by Nix's NIX_ENFORCE_NO_NATIVE), so binaries run anywhere yet still use AVX-512/VNNI where present.",
+      "cacheVariables": {
+        "CMAKE_BUILD_TYPE": "Release",
+        "GGML_NATIVE": "OFF",
+        "GGML_BACKEND_DL": "ON",
+        "GGML_CPU_ALL_VARIANTS": "ON",
+        "CMAKE_RUNTIME_OUTPUT_DIRECTORY": "${sourceDir}/build/release-portable/bin"
+      }
+    },
     {
       "name": "profile",
       "binaryDir": "${sourceDir}/build/profile",
@@ -35,11 +47,13 @@
   "buildPresets": [
     { "name": "debug",   "configurePreset": "debug" },
     { "name": "release", "configurePreset": "release" },
+    { "name": "release-portable", "configurePreset": "release-portable" },
     { "name": "profile", "configurePreset": "profile" },
     { "name": "fuzz",    "configurePreset": "fuzz" }
   ],
   "testPresets": [
     { "name": "debug",   "configurePreset": "debug",   "output": { "outputOnFailure": true } },
-    { "name": "release", "configurePreset": "release", "output": { "outputOnFailure": true } }
+    { "name": "release", "configurePreset": "release", "output": { "outputOnFailure": true } },
+    { "name": "release-portable", "configurePreset": "release-portable", "output": { "outputOnFailure": true } }
   ]
 }
diff --git a/README.md b/README.md
index 093a920..f4e49f3 100644
--- a/README.md
+++ b/README.md
@@ -130,18 +130,33 @@ PF_GGUF=model.gguf ./build/fuzz/fuzz_tokenizer corpus_tok/
 ## Bench
 
 ```sh
-build/release/pf-bench model.gguf [cpu|vulkan] [iters]
+cmake --preset release-portable && cmake --build --preset release-portable -j
+build/release-portable/bin/pf-bench model.gguf [cpu|vulkan] [iters] [lengths]
 ```
 
-Ryzen 9 7900 (12 threads) / RTX 5070 Ti, f16 GGUF, forward tok/s by length
-(one untimed warm-up per length; GPU pipelines compile lazily):
+Forward tok/s vs stock HF Transformers (transformers 5.9, eager), Ryzen 9 7900 (12
+threads) + RTX 5070 Ti, f16/fp16, matched token counts
+([scripts/bench_torch.py](scripts/bench_torch.py)):
 
-| tokens | cpu | vulkan |
-|-------:|----:|-------:|
-|    189 | 161 | 51 583 |
-|    756 | 178 | 99 756 |
-|  2 898 | 129 | 45 416 |
-| 11 403 |  68 | 20 085 |
-| 45 234 |  60 | 17 390 |
+GPU — ours (Vulkan) vs HF (CUDA):
 
-Weights stay in one zero-copy buffer: ~2.8 GiB RSS over baseline (f16).
+| tokens |      HF |    ours |    × |
+|-------:|--------:|--------:|-----:|
+|    512 |   5 526 | 100 503 |  18× |
+|  2 048 |  16 427 | 145 481 | 8.9× |
+|  8 192 |  14 154 | 105 034 | 7.4× |
+| 32 768 |     OOM |  83 519 |    — |
+| 131072 |     OOM |  81 105 |    — |
+
+CPU — ours vs HF (fp32):
+
+| tokens |    HF |  ours |    × |
+|-------:|------:|------:|-----:|
+|    512 | 2 171 | 3 564 | 1.6× |
+|  2 048 |   978 | 3 490 | 3.6× |
+|  8 192 |   304 | 2 332 | 7.7× |
+
+Memory is flat ~2.8 GiB VRAM / ~3 GiB RAM to 131k tokens; HF OOMs past ~16k on a 16
+GiB GPU. `release-portable` runtime-dispatches the best ggml-cpu ISA (AVX-512
+without `-march=native`); flash + banded attention default on. See
+[docs/cpu-perf.md](docs/cpu-perf.md).
diff --git a/bench/banded_attn_proto.cpp b/bench/banded_attn_proto.cpp
new file mode 100644
index 0000000..50e9e02
--- /dev/null
+++ b/bench/banded_attn_proto.cpp
@@ -0,0 +1,101 @@
+// Prototype: O(n*band) block-local sliding-window attention vs the full O(n^2)
+// masked attention, on random data, to validate correctness before wiring it
+// into the model. Single head for clarity.
+//
+//   Full:   scores[n_kv,n_q] = mul_mat(K,Q); mask |q-k|<=r; softmax; out=V^T@scores
+//   Banded: tokens grouped into blocks of B (>= r). Each query block attends only
+//           to blocks {i-1,i,i+1} (3B keys, since r<=B). A per-block mask
+//           [3B,B,n_blocks] (O(n*B) memory) carries the band + edge validity.
+#include <ggml.h>
+#include <ggml-cpu.h>
+#include <ggml-backend.h>
+
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <initializer_list>
+
+static ggml_backend_t cpu() {  // shared CPU backend handle, initialized on the first call
+    static ggml_backend_t handle = [] {
+        ggml_backend_load_all(); return ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr); }();
+    return handle;
+}
+static void run(ggml_context * ctx, ggml_tensor * out) {  // build the forward graph ending at `out`, compute it on cpu()
+    ggml_cgraph * gf = ggml_new_graph(ctx);
+    ggml_build_forward_expand(gf, out);
+    if (ggml_backend_graph_compute(cpu(), gf) != GGML_STATUS_SUCCESS) { std::fprintf(stderr, "graph compute failed\n"); std::exit(1); }
+}
+
+int main(int argc, char ** argv) {
+    const int d = 64;                        // head dimension (single head)
+    const int r = 128;                       // sliding-window radius
+    const int B = argc > 1 ? std::atoi(argv[1]) : 256;   // block size (>= r)
+    const int n = argc > 2 ? std::atoi(argv[2]) : 4096;  // tokens (multiple of B)
+    if (B < r || n <= 0 || n % B != 0) { std::fprintf(stderr, "usage: pf-banded-proto [block >= %d] [tokens: positive multiple of block]\n", r); return 2; }
+    const int nb = n / B;
+    const float scale = 1.0f / std::sqrt((float) d);
+
+    ggml_context * ctx = ggml_init({ (size_t) 2048 * 1024 * 1024, nullptr, false });  // no_alloc=false: tensor data lives in this arena
+
+    // shared random q,k,v  [d, n]
+    ggml_tensor * Q = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, d, n);
+    ggml_tensor * K = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, d, n);
+    ggml_tensor * V = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, d, n);
+    for (ggml_tensor * t : { Q, K, V })
+        for (int i = 0; i < d * n; i++) ((float *) t->data)[i] = (float) (rand() % 2000) / 1000.0f - 1.0f;
+
+    // ---- full reference ----
+    ggml_tensor * Fmask = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n, n);  // [n_kv, n_q]
+    for (int q = 0; q < n; q++)
+        for (int k = 0; k < n; k++)
+            ((float *) Fmask->data)[(size_t) q * n + k] = (std::abs(q - k) <= r) ? 0.0f : -INFINITY;
+    ggml_tensor * sc = ggml_mul_mat(ctx, K, Q);                          // [n_kv, n_q]
+    sc = ggml_soft_max_ext(ctx, sc, Fmask, scale, 0.0f);
+    ggml_tensor * Vt = ggml_cont(ctx, ggml_transpose(ctx, V));           // [n, d]
+    ggml_tensor * full = ggml_mul_mat(ctx, Vt, sc);                      // [d, n_q]
+    run(ctx, full);
+
+    // ---- banded ----
+    ggml_tensor * qb = ggml_reshape_3d(ctx, Q, d, B, nb);               // [d, B, nb]
+    ggml_tensor * kb = ggml_reshape_3d(ctx, K, d, B, nb);
+    ggml_tensor * vb = ggml_reshape_3d(ctx, V, d, B, nb);
+    // pad a zero block each side along the block axis, then 3 shifted views
+    auto ctx3 = [&](ggml_tensor * x) {
+        ggml_tensor * z = ggml_scale(ctx, ggml_view_3d(ctx, x, d, B, 1, x->nb[1], x->nb[2], 0), 0.0f);
+        ggml_tensor * pad = ggml_concat(ctx, ggml_concat(ctx, z, x, 2), z, 2);  // [d, B, nb+2]
+        ggml_tensor * prev = ggml_view_3d(ctx, pad, d, B, nb, pad->nb[1], pad->nb[2], 0 * pad->nb[2]);
+        ggml_tensor * self = ggml_view_3d(ctx, pad, d, B, nb, pad->nb[1], pad->nb[2], 1 * pad->nb[2]);
+        ggml_tensor * next = ggml_view_3d(ctx, pad, d, B, nb, pad->nb[1], pad->nb[2], 2 * pad->nb[2]);
+        return ggml_cont(ctx, ggml_concat(ctx, ggml_concat(ctx, prev, self, 1), next, 1)); // [d, 3B, nb]
+    };
+    ggml_tensor * kc = ctx3(kb);                                        // [d, 3B, nb]
+    ggml_tensor * vc = ctx3(vb);
+    ggml_tensor * Bmask = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 3 * B, B, nb);  // [3B, B, nb]
+    for (int bi = 0; bi < nb; bi++)
+        for (int j = 0; j < B; j++)
+            for (int pp = 0; pp < 3 * B; pp++) {
+                const int qpos = bi * B + j;
+                const int kpos = bi * B - B + pp;
+                const bool vis = std::abs(qpos - kpos) <= r && kpos >= 0 && kpos < n;
+                ((float *) Bmask->data)[((size_t) bi * B + j) * (3 * B) + pp] = vis ? 0.0f : -INFINITY;
+            }
+    ggml_tensor * scb = ggml_mul_mat(ctx, kc, qb);                      // [3B, B, nb]
+    scb = ggml_soft_max_ext(ctx, scb, Bmask, scale, 0.0f);
+    ggml_tensor * vct = ggml_cont(ctx, ggml_transpose(ctx, vc));        // [3B, d, nb]
+    ggml_tensor * outb = ggml_mul_mat(ctx, vct, scb);                  // [d, B, nb]
+    outb = ggml_cont_2d(ctx, outb, d, n);                              // [d, n]
+    run(ctx, outb);
+
+    // compare (std::fmax comes from <cmath>; std::max would need <algorithm>, which this file does not include)
+    double maxabs = 0, maxrel = 0;
+    for (int i = 0; i < d * n; i++) {
+        const double a = ((float *) full->data)[i], b = ((float *) outb->data)[i];
+        maxabs = std::fmax(maxabs, std::fabs(a - b));
+        maxrel = std::fmax(maxrel, std::fabs(a - b) / (std::fabs(a) + 1e-6));
+    }
+    const double full_mask_mib = (double) n * n * 4 / 1048576.0;
+    const double band_mask_mib = (double) 3 * B * B * nb * 4 / 1048576.0;
+    std::printf("n=%d B=%d r=%d | max|d|=%.2e maxrel=%.2e | mask: full %.1f MiB, band %.1f MiB (%.1fx)\n",
+                n, B, r, maxabs, maxrel, full_mask_mib, band_mask_mib, full_mask_mib / band_mask_mib);
+    return 0;
+}
diff --git a/bench/gemm_microbench.cpp b/bench/gemm_microbench.cpp
new file mode 100644
index 0000000..300cbfe
--- /dev/null
+++ b/bench/gemm_microbench.cpp
@@ -0,0 +1,78 @@
+// pf-gemm-bench — isolates ggml's CPU matmul throughput (GFLOP/s) by dtype and
+// shape, to explain where CPU time goes vs a BLAS-backed framework (numpy/torch
+// = MKL/oneDNN). result = mul_mat(a[K,M], b[K,N]) -> [M,N]; FLOPs = 2*M*N*K.
+//
+// Background: in an SSE-only build, ggml's float matmul measured ~40 GFLOP/s
+// flat at every size (q8_0 5-7x faster, MKL's blocked SGEMM 12-29x faster),
+// which looked like a missing cache-blocked SGEMM. With AVX-512 compiled in
+// it reaches ~500 GFLOP/s, within ~2x of MKL (run the MKL side separately,
+// e.g. torch.matmul). See docs/cpu-perf.md.
+#include <ggml.h>
+#include <ggml-cpu.h>
+#include <ggml-backend.h>
+
+#include <chrono>
+#include <cstdio>
+#include <cstdlib>
+#include <vector>
+
+// GFLOP/s of c = mul_mat(a[K,M] as type `ta`, b[K,N] as f32) on `be`: one untimed warm-up, then the mean of `iters` runs.
+static double gflops(ggml_backend_t be, ggml_type ta, int M, int K, int N, int iters) {
+    ggml_init_params meta = { (size_t) 64 * 1024 * 1024, nullptr, true };  // no_alloc: metadata only, data comes from gallocr
+    ggml_context * ctx = ggml_init(meta);
+    ggml_tensor * a = ggml_new_tensor_2d(ctx, ta, K, M);
+    ggml_tensor * b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, K, N);
+    ggml_tensor * c = ggml_mul_mat(ctx, a, b);
+    ggml_cgraph * gf = ggml_new_graph(ctx);
+    ggml_build_forward_expand(gf, c);
+    ggml_gallocr * ga = ggml_gallocr_new(ggml_backend_get_default_buffer_type(be));
+    if (!ggml_gallocr_alloc_graph(ga, gf)) { std::fprintf(stderr, "gflops: graph allocation failed\n"); std::exit(1); }
+
+    // Operand values are irrelevant for timing; size_t products avoid int overflow on large shapes.
+    auto noise = [](size_t cnt) -> std::vector<float> { std::vector<float> v(cnt); for (auto & x : v) x = (float) (rand() % 1000) / 1000.0f - 0.5f; return v; };
+    const std::vector<float> rb = noise((size_t) K * N), ra = noise((size_t) K * M);
+    ggml_backend_tensor_set(b, rb.data(), 0, rb.size() * sizeof(float));
+    if (ta == GGML_TYPE_F32) {
+        ggml_backend_tensor_set(a, ra.data(), 0, ra.size() * sizeof(float));
+    } else {
+        std::vector<char> buf(ggml_nbytes(a));
+        ggml_quantize_chunk(ta, ra.data(), buf.data(), 0, M, K, nullptr);
+        ggml_backend_tensor_set(a, buf.data(), 0, buf.size());
+    }
+
+    ggml_backend_graph_compute(be, gf);  // warm
+    const auto t0 = std::chrono::steady_clock::now();  // monotonic, unlike high_resolution_clock (may alias system_clock)
+    for (int i = 0; i < iters; i++) ggml_backend_graph_compute(be, gf);
+    const auto t1 = std::chrono::steady_clock::now();
+    const double s = std::chrono::duration<double>(t1 - t0).count() / iters;
+    ggml_gallocr_free(ga);
+    ggml_free(ctx);
+    return 2.0 * M * N * K / s / 1e9;
+}
+
+int main(int argc, char ** argv) {
+    const int nth = (argc > 1 && std::atoi(argv[1]) > 0) ? std::atoi(argv[1]) : 12;  // threads; missing or <= 0 -> 12
+    ggml_backend_load_all();  // pick the best CPU variant in a GGML_BACKEND_DL build
+    ggml_backend_t be = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
+    if (!be) { std::fprintf(stderr, "pf-gemm-bench: no CPU backend could be initialized\n"); return 1; }
+    auto set_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(
+        ggml_backend_dev_backend_reg(ggml_backend_get_device(be)), "ggml_backend_set_n_threads");
+    if (set_fn) set_fn(be, nth);  // looked up through the registry rather than called as a linked symbol
+    struct { const char * name; int M, K, N; } shp[] = {
+        { "expert gate_up N=16 ", 1280, 640, 16 },
+        { "expert gate_up N=64 ", 1280, 640, 64 },
+        { "expert gate_up N=512", 1280, 640, 512 },
+        { "expert down    N=16 ", 640, 640, 16 },
+        { "large 4096^2   N=512", 4096, 4096, 512 },
+    };
+    std::printf("ggml CPU mul_mat GFLOP/s, %d threads\n%-22s %10s %10s %10s\n", nth, "shape (M,K,N)", "f32", "f16", "q8_0");
+    for (auto & s : shp) {
+        const int it = (double) s.M * s.N > 1e6 ? 20 : 200;  // fewer timed runs for the large shape
+        std::printf("%-22s %10.1f %10.1f %10.1f\n", s.name,
+                    gflops(be, GGML_TYPE_F32, s.M, s.K, s.N, it),
+                    gflops(be, GGML_TYPE_F16, s.M, s.K, s.N, it),
+                    gflops(be, GGML_TYPE_Q8_0, s.M, s.K, s.N, it));
+    }
+    ggml_backend_free(be);
+    return 0;
+}
diff --git a/bench/pf-bench.cpp b/bench/pf-bench.cpp
index 6118f11..c3dfea2 100644
--- a/bench/pf-bench.cpp
+++ b/bench/pf-bench.cpp
@@ -1,7 +1,11 @@
 // pf-bench — tokens/s and per-stage latency at several document lengths.
-//   pf-bench <model.gguf> [device] [iters]
+//   pf-bench <model.gguf> [device] [iters] [lengths]
 // Synthesizes PII-shaped text, then per length: tokenize / forward (windowed)
 // / decode timings, plus RSS and cold-start (load -> first entity).
+// [lengths] is an optional comma-separated list of EXACT token counts (the
+// synthesized text is tokenized then truncated to each count) — use it to match
+// scripts/bench_torch.py for an apples-to-apples PyTorch comparison. Omitted, it
+// defaults to ~{128,512,2048,8192,32768}-token documents.
 #include "model.h"
 #include "ner.h"
 #include "tokenizer.h"
@@ -43,12 +47,29 @@ static std::string make_text(int approx_tokens) {
 
 int main(int argc, char ** argv) {
     if (argc < 2) {
-        std::fprintf(stderr, "usage: pf-bench <model.gguf> [cpu|vulkan] [iters]\n");
+        std::fprintf(stderr, "usage: pf-bench <model.gguf> [cpu|vulkan] [iters] [len1,len2,...]\n");
         return 2;
     }
     const char * device = argc > 2 ? argv[2] : "cpu";
     const int iters = argc > 3 ? std::atoi(argv[3]) : 3;
 
+    // Optional exact token-count list (4th arg): the synthesized text per length
+    // is truncated to exactly this many tokens, so the lengths match whatever is
+    // passed to scripts/bench_torch.py --lengths. Empty -> the approximate
+    // defaults below.
+    std::vector<int> lengths;
+    if (argc > 4) {  // comma-separated token counts; entries that do not parse to a positive int are dropped
+        for (char * s = std::strtok(argv[4], ","); s; s = std::strtok(nullptr, ","))
+            if (std::atoi(s) > 0) lengths.push_back(std::atoi(s));
+    }
+    const bool exact = !lengths.empty();
+    if (!exact) lengths = { 128, 512, 2048, 8192, 32768 };
+
+    // PF_WINDOW: tokens per forward pass (the pf_set_window knob). Longer inputs
+    // run as overlapping halo windows; larger W means fewer windows (less halo
+    // recompute, faster) but a bigger compute buffer (more RAM/VRAM). Default 4096.
+    const int W = std::getenv("PF_WINDOW") ? std::atoi(std::getenv("PF_WINDOW")) : 4096;
+
     const size_t rss0 = rss_kb("VmRSS:");
     const int64_t t_load0 = ggml_time_us();
     pf::model m;
@@ -72,18 +93,20 @@ int main(int argc, char ** argv) {
         for (size_t i = 0; i < toks.size(); i++) ids[i] = toks[i].id;
         std::vector<pf::ner::tok_span> spans;
         std::string err;
-        pf::ner::classify_tokens(m, ids.data(), (int) ids.size(), 4096, 0.5f, spans, err);
+        pf::ner::classify_tokens(m, ids.data(), (int) ids.size(), W, 0.5f, spans, err);
     }
     const int64_t t_first = ggml_time_us();
 
-    std::printf("device %s | load %.2fs (+%.0f MiB) | cold start %.2fs | %d iters\n\n",
+    // weights buffer: device memory (Vulkan VRAM) or the zero-copy CPU wrap.
+    const double wbuf_mib = m.weights_buf ? ggml_backend_buffer_get_size(m.weights_buf) / 1048576.0 : 0;
+    std::printf("device %s | load %.2fs (+%.0f MiB) | weights %.0f MiB | window %d | %d iters\n\n",
                 m.be.device.c_str(), (t_load1 - t_load0) / 1e6,
-                (rss1 - rss0) / 1024.0, (t_first - t_load0) / 1e6, iters);
-    std::printf("| %8s | %9s | %11s | %9s | %8s |\n",
-                "tokens", "tok ms", "forward ms", "decode ms", "tok/s");
-    std::printf("|---------:|----------:|------------:|----------:|---------:|\n");
+                (rss1 - rss0) / 1024.0, wbuf_mib, W, iters);
+    std::printf("| %8s | %11s | %8s | %9s | %9s |\n",
+                "tokens", "forward ms", "tok/s", "cmp MiB", "RSS MiB");
+    std::printf("|---------:|------------:|--------:|---------:|--------:|\n");
 
-    for (const int target : { 128, 512, 2048, 8192, 32768 }) {
+    for (const int target : lengths) {
         const std::string text = make_text(target);
         int64_t tok_us = 0, fwd_us = 0, dec_us = 0;
         size_t n_tok = 0;
@@ -95,11 +118,13 @@ int main(int argc, char ** argv) {
             int64_t t1 = ggml_time_us();
             std::vector<int32_t> ids(toks.size());
             for (size_t i = 0; i < toks.size(); i++) ids[i] = toks[i].id;
+            // exact mode: truncate to the requested count (make_text overshoots)
+            if (exact && (int) ids.size() > target) ids.resize(target);
             n_tok = ids.size();
 
             std::vector<float> emit;
             std::string err;
-            if (!pf::ner::emit_logprobs(m, ids.data(), (int) ids.size(), 4096, emit, err)) {
+            if (!pf::ner::emit_logprobs(m, ids.data(), (int) ids.size(), W, emit, err)) {
                 std::fprintf(stderr, "forward: %s\n", err.c_str());
                 return 1;
             }
@@ -116,10 +141,14 @@ int main(int argc, char ** argv) {
             (void) spans;
         }
         const double fwd_ms = fwd_us / 1e3 / iters;
-        std::printf("| %8zu | %9.1f | %11.1f | %9.1f | %8.0f |\n",
-                    n_tok, tok_us / 1e3 / iters, fwd_ms, dec_us / 1e3 / iters,
-                    n_tok / (fwd_ms / 1e3));
+        // compute buffer: per-forward activation memory (Vulkan VRAM / CPU RAM),
+        // sized to one window -> grows with min(n_tok, W). RSS is host resident.
+        const double cmp_mib = ggml_gallocr_get_buffer_size(m.be.galloc, 0) / 1048576.0;
+        std::printf("| %8zu | %11.1f | %8.0f | %9.0f | %8.0f |\n",
+                    n_tok, fwd_ms, n_tok / (fwd_ms / 1e3), cmp_mib, rss_kb("VmRSS:") / 1024.0);
+        (void) tok_us; (void) dec_us;
     }
-    std::printf("\npeak RSS %.0f MiB\n", rss_kb("VmHWM:") / 1024.0);
+    std::printf("\npeak RSS %.0f MiB | weights %.0f MiB\n",
+                rss_kb("VmHWM:") / 1024.0, wbuf_mib);
     return 0;
 }
diff --git a/docs/cpu-perf.md b/docs/cpu-perf.md
new file mode 100644
index 0000000..a4d3e4d
--- /dev/null
+++ b/docs/cpu-perf.md
@@ -0,0 +1,173 @@
+# CPU performance
+
+## TL;DR — the build was SSE-only
+
+The CPU slowness traced to a build trap, not the engine. Under Nix the gcc/clang
+wrapper strips `-march=native` (`NIX_ENFORCE_NO_NATIVE`), so a `GGML_NATIVE=ON`
+build silently compiles ggml-cpu with **no AVX2/AVX-512/FMA** — and the CI build
+(`-DGGML_NATIVE=OFF`) has no SIMD either. Confirmed by disassembly:
+
+```
+$ objdump -d libggml-cpu.so | grep -c zmm   # AVX-512   -> 0
+$ objdump -d libggml-cpu.so | grep -c ymm   # AVX2      -> 0
+$ objdump -d libggml-cpu.so | grep -c vfmadd # FMA      -> 0   (37k xmm/SSE only)
+```
+
+With SIMD actually enabled, ggml-f16 on CPU is **~10× faster and beats the
+PyTorch/transformers reference** — no quantization needed. The fix is to build the
+CPU backend for all ISAs and pick at runtime (`GGML_CPU_ALL_VARIANTS`), which also
+sidesteps the Nix `-march=native` stripping.
+
+| 512 tok, f16 | tok/s |
+|---|---:|
+| SSE-only (the trap: `GGML_NATIVE=OFF`, or `=ON` under Nix) | 280 |
+| AVX-512 (explicit `-mavx512*`, or the zen4 runtime variant) | **~3000** |
+| PyTorch CPU (fp32, MKL) | 1935 |
+
+## The fix: GGML_CPU_ALL_VARIANTS (runtime ISA dispatch)
+
+`-march=native` is fragile (stripped by Nix; wrong if you build on a different
+host than you run on). ggml's portable answer is to compile the CPU backend once
+per ISA level and score+load the best at run time:
+
+```sh
+cmake -B build -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON ...
+```
+
+This produces `libggml-cpu-{sse42,haswell,skylakex,icelake,zen4,…}.so`; on this
+Ryzen 9 7900 it loads `libggml-cpu-zen4.so` (AVX-512 + VNNI + BF16):
+
+```
+load_backend: loaded CPU backend from libggml-cpu-zen4.so
+```
+
+Engine support (`src/backend.cpp`): call `ggml_backend_load_all()` before
+`ggml_backend_init_by_type`, and set threads through the registry
+(`ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads")`) since the
+CPU-specific symbol now lives in the variant `.so`, not in the linked base library. Both calls
+are no-ops for a static build, so one code path serves both. Use the
+`release-portable` preset.
+
+## ggml vs MKL once SIMD is real
+
+`pf-gemm-bench` (ggml CPU `mul_mat` GFLOP/s) vs `torch.matmul` (MKL), 12 threads:
+
+| shape (M,K,N) | ggml SSE | ggml AVX-512 | MKL f32 |
+|---|---:|---:|---:|
+| expert gate_up N=16 (f32) | 39 | 426 | 463 |
+| expert gate_up N=512 (f32) | 41 | 528 | 1098 |
+| large 4096² N=512 (f32) | 39 | 442 | 1137 |
+
+ggml's f32 GEMM goes 40 → ~500 GFLOP/s — within ~2× of MKL, and at the actual
+model level (lower per-op overhead than HF's Python expert loop) ggml-f16 **wins**:
+512 tok f16 3006 vs PyTorch 1935. So there was never a missing blocked SGEMM — it
+just wasn't compiled with SIMD.
+
+## Profile (AVX-512) and the minor Q8 option
+
+`PF_PROF=noattn|nomoe` ablation (512 tok, AVX-512): **MoE 64%, attention 34%**,
+rest <1%. `PF_NTHREADS` sweep: near-linear to 12 physical cores, SMT regresses —
+the default is optimal.
+
+Quantization is now a *minor* lever, not a necessity: `scripts/requant_q8.py`
+(Q8_0 experts) adds ~15% over f16+AVX-512 (512 tok: 3006 → 3567) but is a strict
+precision drop — on the 3k-token case it falls below the f16 parity gate (cos
+0.9972, 1 argmax flip in 3053), so it would need its own tier. Given f16+AVX-512
+already beats the reference, Q8 is optional (e.g. for memory: 1.6 vs 2.8 GiB).
+
+## Flash attention (both backends)
+
+With SIMD fixed, attention became the dominant cost at length on *both* backends
+(`PF_PROF` ablation — CPU 8192 tok: attention 72%; Vulkan 2k–32k: ~69%), because
+the engine built the full `[n,n]` score matrix and masked it to the sliding
+window — O(n²) work for an O(n·256) receptive field.
+
+`ggml_flash_attn_ext` (default; `PF_NOFLASH` selects the explicit path) fuses
+QK·softmax·V with no materialized scores, carries the attention sinks
+(`ggml_flash_attn_ext_add_sinks`) and the sliding-window mask, and accumulates in
+F32. It is numerically exact here — passes the f32 `cos>=0.99999` gate and
+window-stitch — and faster where attention dominates:
+
+| forward tok/s | CPU 2048 | CPU 8192 | Vulkan 8192 | Vulkan 131072 |
+|---|---:|---:|---:|---:|
+| explicit (`PF_NOFLASH`) | 1881 | 798 | 11845 | 8992 |
+| flash (default) | 3319 | 1928 | 26918 | 20631 |
+| speedup | 1.8× | 2.4× | 2.3× | 2.3× |
+
+## Memory and the processing window (W)
+
+`PF_WINDOW` (the `pf_set_window` knob, default 4096) sets tokens per forward;
+longer inputs run as overlapping halo windows. At the default, GGML's footprint
+is **flat across document length** — the compute buffer is bounded by the window,
+not the input:
+
+| length | PyTorch VRAM (eager) | GGML Vulkan VRAM (flash, W=4096) |
+|---:|---:|---:|
+| 4 096 | 5 439 | 2 883 |
+| 8 192 | 13 637 | 2 883 |
+| 32 768 | OOM | 2 883 |
+| 131 072 | OOM | **2 883** |
+
+PyTorch (single-pass) grows O(n²) and OOMs by ~16k tokens; GGML holds ~2.9 GiB at
+131k. So the default W=4096 is a good fit for VRAM-constrained deployments.
+
+Raising W to cut the halo recompute is tempting but currently a **bad trade**: it
+OOMs by W=16384. Flash removed the O(n²) *scores*, but the sliding-window **mask
+is still a materialized `[n,n]` tensor** — the last O(n²) term.
+
+### Banded mask (prototype, `pf-banded-proto`)
+
+Grouping tokens into blocks of `B ≥ radius` and having each query block attend
+only to blocks `{i-1, i, i+1}` makes the mask **O(n·B)** (a `[3B, B, n_blocks]`
+band, constant per block) and the attention compute **O(n·band)** — while being
+**bit-identical** to full masked attention (same dot products, computed locally):
+
+```
+$ pf-banded-proto 256 8192
+n=8192 B=256 r=128 | max|d|=0.00e+00 maxrel=0.00e+00 | mask: full 256.0 MiB, band 24.0 MiB (10.7x)
+```
+
+Mask scaling (B=256): 21× smaller at 16k, 85× at 64k.
+
+**On by default for sequences >= 2048 tokens** (`src/model.cpp`; `PF_BANDED`
+forces it on/off): blocks of B=256, each query block flash-attends to blocks
+`{i-1,i,i+1}` with the F16 band mask + sinks; GQA broadcasts over heads;
+out-of-range tokens are padded and masked. Parity-exact — passes the f32
+`cos>=0.99999` gate and window-stitch on CPU and Vulkan. Speedups
+(flash → banded, default W):
+
+| tok/s | CPU 8192 | Vulkan 8192 | Vulkan 32768 |
+|---|---:|---:|---:|
+| flash | 2068 | 42407 | 33893 |
+| banded | 2325 | **105058** | **83664** |
+| | 1.1× | **2.5×** | **2.5×** |
+
+Big on Vulkan (the flash kernel computes the full window; banded only the band),
+modest on CPU. The measured crossover (banded/flash): 0.9× at 256–512 tok, 1.0×
+at 2048, then 1.1× (CPU) / 2.5× (Vulkan) at 4096+. Hence the 2048 default cutoff.
+
+### Dropping the window (`PF_MOE_CHUNK`)
+
+With banded attention the only remaining O(n) cap on a large single window was the
+MoE expert matmul's activation scratch (`mul_mat_id y_sz > maxStorageBufferRange`
+on Vulkan). The MoE is per-token, so `PF_MOE_CHUNK=C` runs it in C-token chunks
+(exact, no halo). It defaults to the forward window (4096), so it's inert at the
+default window (n <= W) but keeps a *larger* window from OOMing. Banded + chunking
+lets a **131072-token document run in one window** instead of windowing at W=4096:
+
+| 131072 tok, Vulkan | tok/s | compute buffer |
+|---|---:|---:|
+| banded, windowed W=4096 | 80 897 | 166 MiB |
+| banded + chunk, single window | **103 539** | 2 389 MiB |
+
+~1.28× faster (no halo recompute) for more memory -- the throughput/VRAM tradeoff
+the window now exposes, capped only by total VRAM. Passes the f32 parity gate.
+
+## Reproduce
+
+```sh
+cmake --preset release-portable && cmake --build --preset release-portable -j
+build/release-portable/bin/pf-gemm-bench 12      # GFLOP/s by dtype/shape
+build/release-portable/bin/pf-bench <f16.gguf> cpu 5 512
+objdump -d build/release-portable/bin/libggml-cpu-zen4.so | grep -c zmm   # > 0
+```
diff --git a/scripts/bench_torch.py b/scripts/bench_torch.py
new file mode 100644
index 0000000..41192c4
--- /dev/null
+++ b/scripts/bench_torch.py
@@ -0,0 +1,137 @@
+#!/usr/bin/env python3
+"""PyTorch reference throughput, comparable to tools/bench (pf-bench).
+
+Times the HF `openai_privacy_filter` model's forward pass at several document
+lengths, one untimed warm-up + N timed iters per length, and prints a markdown
+table of forward ms / tok/s -- the same shape pf-bench emits for the ggml
+engine, so the two tables line up column-for-column.
+
+  python scripts/bench_torch.py --model <hf-dir> --device cpu
+  python scripts/bench_torch.py --model <hf-dir> --device cuda --dtype fp16
+
+Only the model forward is timed (the comparable quantity): tokenization and
+BIOES decode are excluded on both sides. Inputs are real token ids -- a PII
+paragraph tokenized once and tiled/truncated to the exact target length -- so
+both engines see identical sequence lengths. Lengths that OOM or error are
+reported as such and skipped rather than aborting the run.
+"""
+from __future__ import annotations
+
+import argparse
+import time
+
+# A PII-shaped paragraph, mirroring tools/bench/pf-bench.cpp make_text(), so the
+# token stream is representative rather than degenerate (repeated single token).
+SEED_TEXT = (
+    "Case 0: Anna Kowalski reported an issue. Contact at anna.kowalski0@mail.example.com "
+    "or +48 123 456 789. Ships to 12 Elm Street, Lyon. "
+    "Refund to IBAN DE89 3704 0044 0532 0130 00.\n\n"
+)
+
+DTYPES = {"fp32": "float32", "fp16": "float16", "bf16": "bfloat16"}
+
+
+def build_ids(tok, n: int):
+    import torch  # imported inside the function, not at module import
+
+    base = tok(SEED_TEXT, add_special_tokens=False)["input_ids"]  # seed paragraph ids, no special tokens
+    reps = (n + len(base) - 1) // len(base)  # ceil(n / len(base)) repetitions
+    ids = (base * reps)[:n]  # tile the seed ids, then truncate to exactly n
+    return torch.tensor([ids], dtype=torch.long)  # int64 tensor of shape [1, n]
+
+
+def main() -> int:  # CLI entry point: prints one markdown table row per requested length; returns 0
+    ap = argparse.ArgumentParser(description=__doc__,
+                                 formatter_class=argparse.RawDescriptionHelpFormatter)
+    ap.add_argument("--model", required=True, help="HF checkpoint dir")
+    ap.add_argument("--device", default="cpu", help="cpu | cuda | cuda:N")
+    ap.add_argument("--dtype", default="auto", choices=["auto", *DTYPES],
+                    help="auto: fp32 on cpu, fp16 on cuda")
+    ap.add_argument("--attn", default="sdpa", choices=["sdpa", "eager"])
+    ap.add_argument("--lengths", default="189,756,2898,11403,45234",
+                    help="comma-separated token counts (match pf-bench output)")
+    ap.add_argument("--iters", type=int, default=3)
+    ap.add_argument("--threads", type=int, default=0, help="CPU threads (0 = torch default)")
+    args = ap.parse_args()
+
+    import torch  # heavy imports happen only after argument parsing (presumably so --help stays fast)
+    import transformers
+    from transformers import AutoModelForTokenClassification, AutoTokenizer
+
+    if args.threads > 0:
+        torch.set_num_threads(args.threads)
+    dtype_name = args.dtype
+    if dtype_name == "auto":
+        dtype_name = "fp16" if args.device.startswith("cuda") else "fp32"
+    dtype = getattr(torch, DTYPES[dtype_name])  # e.g. "fp16" -> torch.float16
+    dev = torch.device(args.device)
+    cuda = dev.type == "cuda"
+    lengths = [int(x) for x in args.lengths.split(",") if x]  # empty items (e.g. trailing comma) are dropped
+
+    tok = AutoTokenizer.from_pretrained(args.model)
+    t_load0 = time.perf_counter()  # load timer covers model load + device transfer only (tokenizer excluded)
+    model = AutoModelForTokenClassification.from_pretrained(
+        args.model, dtype=dtype, attn_implementation=args.attn).eval().to(dev)
+    if cuda:
+        torch.cuda.synchronize()
+    t_load1 = time.perf_counter()
+
+    def fwd(ids):  # one forward pass under inference_mode; the output is discarded
+        with torch.inference_mode():
+            model(input_ids=ids)
+
+    name = torch.cuda.get_device_name(dev) if cuda else f"cpu x{torch.get_num_threads()}"
+    print(f"torch {torch.__version__} | tf {transformers.__version__} | {name} | "
+          f"{dtype_name} | {args.attn} | load {t_load1 - t_load0:.2f}s | {args.iters} iters\n")
+    print(f"| {'tokens':>8} | {'forward ms':>11} | {'tok/s':>8} | {'peak MiB':>8} |")
+    print("|---------:|------------:|---------:|---------:|")
+
+    for n in lengths:
+        ids = build_ids(tok, n).to(dev)
+        try:
+            if cuda:
+                torch.cuda.reset_peak_memory_stats(dev)  # peak restarts per length, so it includes the warm-up pass
+                torch.cuda.synchronize()
+            fwd(ids)  # warm-up (lazy kernel/autotune, allocator growth)
+            if cuda:
+                torch.cuda.synchronize()
+                ev0, ev1 = torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True)
+                ev0.record()
+                for _ in range(args.iters):
+                    fwd(ids)
+                ev1.record()
+                torch.cuda.synchronize()  # wait for ev1 before reading the elapsed time
+                fwd_ms = ev0.elapsed_time(ev1) / args.iters  # mean ms per forward; --iters 0 would divide by zero
+                peak = torch.cuda.max_memory_allocated(dev) / 1024 / 1024  # bytes -> MiB
+            else:
+                t0 = time.perf_counter()
+                for _ in range(args.iters):
+                    fwd(ids)
+                fwd_ms = (time.perf_counter() - t0) * 1e3 / args.iters  # mean wall-clock ms per forward
+                peak = _rss_mib()  # VmHWM is a process-wide high-water mark: cumulative across lengths, not per row
+        except (torch.cuda.OutOfMemoryError, RuntimeError) as e:
+            if cuda:
+                torch.cuda.empty_cache()
+            msg = "OOM" if "out of memory" in str(e).lower() else f"err: {str(e)[:40]}"  # message cut to 40 chars
+            print(f"| {n:>8} | {msg:>11} | {'-':>8} | {'-':>8} |")
+            continue  # report the failed length and move on to the next one
+        print(f"| {n:>8} | {fwd_ms:>11.1f} | {n / (fwd_ms / 1e3):>8.0f} | {peak:>8.0f} |")
+        del ids
+        if cuda:
+            torch.cuda.empty_cache()
+    return 0
+
+
+def _rss_mib() -> float:  # peak resident set size of this process in MiB, or 0.0 if it cannot be read
+    try:
+        with open("/proc/self/status") as f:  # Linux procfs
+            for line in f:
+                if line.startswith("VmHWM:"):  # RSS high-water mark since process start
+                    return int(line.split()[1]) / 1024  # second field is in kB (per proc(5)) -> MiB
+    except OSError:
+        pass  # file missing or unreadable: fall through to 0.0
+    return 0.0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/scripts/requant_q8.py b/scripts/requant_q8.py
new file mode 100644
index 0000000..0a89c11
--- /dev/null
+++ b/scripts/requant_q8.py
@@ -0,0 +1,67 @@
+#!/usr/bin/env python3
+"""Experiment tool: requantize selected weights of an existing GGUF to Q8_0.
+
+Copies every KV field and tensor verbatim, except tensors whose name matches
+--match (default: the MoE expert weights), which are quantized to Q8_0. Used to
+test the hypothesis that ggml's int8 mul_mat_id kernel beats the f16 path on CPU.
+
+  python scripts/requant_q8.py --in f16.gguf --out q8.gguf [--match SUBSTR ...]
+"""
+from __future__ import annotations
+
+import argparse
+
+import numpy as np
+import gguf
+from gguf import GGMLQuantizationType as QT, GGUFValueType as VT
+
+
+def main() -> int:
+    """Copy --in to --out, quantizing tensors whose name contains any --match substring to Q8_0."""
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--in", dest="inp", required=True)
+    ap.add_argument("--out", required=True)
+    ap.add_argument("--match", nargs="*",
+                    default=["ffn_gate_exps.weight", "ffn_up_exps.weight", "ffn_down_exps.weight"])
+    args = ap.parse_args()
+    r = gguf.GGUFReader(args.inp)
+    arch = r.fields["general.architecture"].contents()
+    w = gguf.GGUFWriter(args.out, arch)
+
+    # copy metadata; skip general.architecture (writer sets it) and the reader's virtual GGUF.* header fields
+    scalar_add = {
+        VT.UINT8: w.add_uint8, VT.INT8: w.add_int8, VT.UINT16: w.add_uint16,
+        VT.INT16: w.add_int16, VT.UINT32: w.add_uint32, VT.INT32: w.add_int32,
+        VT.FLOAT32: w.add_float32, VT.UINT64: w.add_uint64, VT.INT64: w.add_int64,
+        VT.FLOAT64: w.add_float64, VT.BOOL: w.add_bool, VT.STRING: w.add_string,
+    }
+    for key, field in r.fields.items():
+        if key == "general.architecture" or key.startswith("GGUF."):
+            continue
+        val = field.contents()
+        if field.types and field.types[0] == VT.ARRAY:
+            w.add_array(key, val)
+        else:
+            scalar_add[field.types[0]](key, val)
+
+    # copy / quantize tensors
+    n_q = 0
+    for t in r.tensors:
+        if any(m in t.name for m in args.match):
+            x = t.data.astype(np.float32)                      # numpy order [.., ne0]
+            q = gguf.quants.quantize(x, QT.Q8_0)               # uint8, last dim -> bytes/row
+            w.add_tensor(t.name, q, raw_dtype=QT.Q8_0)         # writer derives logical shape
+            n_q += 1
+        else:
+            w.add_tensor(t.name, t.data, raw_dtype=t.tensor_type)  # verbatim; explicit type keeps non-F16/F32 tensors valid
+
+    w.write_header_to_file()
+    w.write_kv_data_to_file()
+    w.write_tensors_to_file()
+    w.close()
+    print(f"wrote {args.out}: {len(r.tensors)} tensors ({n_q} quantized to Q8_0), {len(r.fields)} fields")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/src/backend.cpp b/src/backend.cpp
index eb3f60b..b757b84 100644
--- a/src/backend.cpp
+++ b/src/backend.cpp
@@ -1,5 +1,6 @@
 #include "backend.h"
 
+#include <ggml-backend.h>
 #include <ggml-cpu.h>
 
 #include <algorithm>
@@ -26,8 +27,29 @@ void parse_device(const std::string & req, std::string & name, int & index) {
 
 } // namespace
 
+// Discover dynamically-loadable backends once. For a GGML_BACKEND_DL +
+// GGML_CPU_ALL_VARIANTS build this loads every libggml-cpu-<isa>.so and ggml
+// scores them, so the host's best ISA (e.g. zen4/AVX-512) is selected at run
+// time -- the portable way to ship SIMD without baking -march into one binary
+// (and without Nix's wrapper silently dropping -march=native). No-op / harmless
+// for a statically-linked build, where backends register at static-init.
+static void load_backends_once() {
+    static const bool done = [] { ggml_backend_load_all(); return true; }();  // function-local static: the lambda runs on the first call only
+    (void) done;  // value is never read; the cast silences the unused-variable warning
+}
+
+// Set threads through the backend registry rather than ggml_backend_cpu_set_n_threads:
+// in a DL build that symbol lives in the variant .so, not in the linked base.
+static void set_cpu_threads(ggml_backend_t be, int n_threads) {
+    ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(be));  // registry of the device behind `be`
+    auto set_fn = (ggml_backend_set_n_threads_t)
+        ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads");  // setter resolved by name at run time
+    if (set_fn) set_fn(be, n_threads);  // silent no-op when the registry does not export the setter
+}
+
 bool engine_backend::init(const std::string & device_req, int n_threads) {
     release();
+    load_backends_once();
 
     std::string name;
     int         want_idx = 0;
@@ -40,12 +62,16 @@ bool engine_backend::init(const std::string & device_req, int n_threads) {
             return false;
         }
         device = "cpu";
+        if (const char * env = std::getenv("PF_NTHREADS")) {
+            // explicit override (tuning / benchmarking); 0 falls through to auto
+            if (int v = std::atoi(env)) n_threads = v;
+        }
         if (n_threads <= 0) {
             // ggml's default is 4 threads; matmul-heavy work wants the
             // physical cores (SMT siblings only add contention here)
             n_threads = std::max(1u, std::thread::hardware_concurrency() / 2);
         }
-        ggml_backend_cpu_set_n_threads(be, n_threads);
+        set_cpu_threads(be, n_threads);
     } else if (name == "gpu" || name == "cuda" || name == "vulkan") {
         // "gpu" picks the first GPU of whichever backend was compiled in;
         // "cuda"/"vulkan" pin a specific backend when more than one is built.
diff --git a/src/model.cpp b/src/model.cpp
index b31db40..80afe56 100644
--- a/src/model.cpp
+++ b/src/model.cpp
@@ -178,8 +178,18 @@ bool model::forward(const int32_t * ids, int64_t n, std::vector<float> & logits,
     const hparams & h = file.hp;
     const int64_t n_embd = h.n_embd, n_head = h.n_head, n_head_kv = h.n_head_kv, n_rot = h.n_rot;
 
-    // ~45 nodes/layer * 8 layers + inputs/head; generous fixed bound.
-    const size_t  graph_nodes = 1024;
+    // PF_MOE_CHUNK: MoE FFN token-chunk size. The MoE is per-token, so chunking is
+    // exact (no halo) and bounds the mul_mat_id activation scratch (a Vulkan
+    // single-buffer limit). Default = the forward window, so it is inert at the
+    // default window (n <= W) yet keeps a larger window (single-pass long docs)
+    // from OOMing. 0 disables.
+    const int     moe_chunk = std::getenv("PF_MOE_CHUNK") ? std::atoi(std::getenv("PF_MOE_CHUNK")) : 4096;
+
+    // ~45 nodes/layer * 8 layers + inputs/head; generous fixed bound. MoE chunking
+    // multiplies the FFN node count by the number of chunks.
+    size_t        graph_nodes = 1024;
+    if (moe_chunk > 0 && n > moe_chunk)
+        graph_nodes += (size_t) ((n + moe_chunk - 1) / moe_chunk) * h.n_layer * 40;
     ggml_init_params gp = {
         ggml_tensor_overhead() * graph_nodes + ggml_graph_overhead_custom(graph_nodes, false),
         nullptr,
@@ -205,14 +215,36 @@ bool model::forward(const int32_t * ids, int64_t n, std::vector<float> & logits,
         return t;
     };
 
+    // PF_BANDED (experimental): block-local sliding-window attention. The mask is
+    // an O(n*B) per-block band [3B,B,1,nb] instead of the O(n^2) [n,n], and
+    // attention compute drops to O(n*band). Tokens group into blocks of B>=radius;
+    // each query block attends only to blocks {i-1,i,i+1}. Bit-identical to the
+    // full masked attention (see bench/banded_attn_proto.cpp).
+    // Default on once the sequence is long enough to win. The B=256 block padding
+    // makes it a slight loss on short inputs; measured crossover ~2048 tok (CPU
+    // neutral-to-faster, Vulkan ~1.1x rising to ~2.5x at length). PF_BANDED forces
+    // it on (non-zero) or off (0). n here is one window's worth (<= the window).
+    const char * banded_env = std::getenv("PF_BANDED");
+    const bool use_banded = banded_env ? (std::atoi(banded_env) != 0) : (n >= 2048);
+    const int  Bsz   = 256;                          // block size (>= swa_radius 128)
+    const int  nbk   = use_banded ? (int) ((n + Bsz - 1) / Bsz) : 0;
+    const int  n_pad = nbk * Bsz;
+
     // inputs (data written after alloc)
     ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n);
     ggml_tensor * inp_pos    = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n);
-    ggml_tensor * kq_mask    = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n, n);
     ggml_tensor * ff         = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_rot / 2);
+    ggml_tensor * kq_mask    = nullptr;   // [n,n]    -- full / flash paths
+    ggml_tensor * band_mask  = nullptr;   // [3B,B,1,nb] -- banded path
+    if (use_banded) {
+        band_mask = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 3 * Bsz, Bsz, 1, nbk);
+        ggml_set_input(band_mask);
+    } else {
+        kq_mask = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n, n);
+        ggml_set_input(kq_mask);
+    }
     ggml_set_input(inp_tokens);
     ggml_set_input(inp_pos);
-    ggml_set_input(kq_mask);
     ggml_set_input(ff);
 
     ggml_tensor * cur = ggml_get_rows(ctx, tok_embd, inp_tokens);  // [640, n]
@@ -231,6 +263,62 @@ bool model::forward(const int32_t * ids, int64_t n, std::vector<float> & logits,
                              0.0f, attn_factor, h.yarn_beta_fast, h.yarn_beta_slow);
     };
 
+    // Ablation profiling hooks (PF_PROF): skip a block to attribute wall-time.
+    //   noattn  -> skip self-attention; nomoe -> skip the MoE FFN.
+    // The residual still runs on the (cheap rms) input, so the delta vs the full
+    // forward is that block's cost. Build-time only; no effect unset.
+    const char * prof = std::getenv("PF_PROF");
+    const bool prof_noattn = prof && std::strstr(prof, "noattn");
+    const bool prof_nomoe  = prof && std::strstr(prof, "nomoe");
+
+    // Fused flash attention (default) instead of the explicit [n,n] score matrix:
+    // no materialized scores, the backend skips out-of-band KV under the
+    // sliding-window mask, sinks carried via add_sinks, F32 accumulate. Validated
+    // exact (passes the f32 cos>=0.99999 gate) and ~2-2.4x faster on CPU and
+    // Vulkan at length. PF_NOFLASH selects the explicit path (reference / debug).
+    const bool use_flash = !std::getenv("PF_NOFLASH");
+
+    // Banded block-local attention (PF_BANDED): q [d,n_head,n], k/v [d,n_head_kv,n]
+    // post-rope -> [n_head*d, n]. Each query block attends to blocks {i-1,i,i+1}
+    // (3B keys) via a constant-shape per-block band mask; same dot products as the
+    // full path, computed locally. GQA broadcasts over the head dim; sinks added
+    // per block; pad tokens are masked (and trimmed) -- the sink keeps their
+    // softmax finite.
+    auto banded_attn = [&](ggml_tensor * q, ggml_tensor * k, ggml_tensor * v, ggml_tensor * sinks) {
+        const float scale = 1.0f / std::sqrt((float) n_rot);
+        if (n_pad != n) {
+            q = ggml_pad(ctx, q, 0, 0, n_pad - n, 0);
+            k = ggml_pad(ctx, k, 0, 0, n_pad - n, 0);
+            v = ggml_pad(ctx, v, 0, 0, n_pad - n, 0);
+        }
+        auto to_blocks = [&](ggml_tensor * x, int64_t hh) {              // [d,hh,n_pad]->[d,B,hh,nb]
+            x = ggml_reshape_4d(ctx, x, n_rot, hh, Bsz, nbk);            // [d,hh,B,nb]
+            return ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3));    // [d,B,hh,nb]
+        };
+        auto neigh = [&](ggml_tensor * xb, int64_t hh) {                 // [d,B,hh,nb]->[d,3B,hh,nb]
+            ggml_tensor * z = ggml_scale(ctx,
+                ggml_view_4d(ctx, xb, n_rot, Bsz, hh, 1, xb->nb[1], xb->nb[2], xb->nb[3], 0), 0.0f);
+            ggml_tensor * pad = ggml_concat(ctx, ggml_concat(ctx, z, xb, 3), z, 3);          // [d,B,hh,nb+2]
+            ggml_tensor * pr = ggml_view_4d(ctx, pad, n_rot, Bsz, hh, nbk, pad->nb[1], pad->nb[2], pad->nb[3], 0);
+            ggml_tensor * se = ggml_view_4d(ctx, pad, n_rot, Bsz, hh, nbk, pad->nb[1], pad->nb[2], pad->nb[3], 1 * pad->nb[3]);
+            ggml_tensor * nx = ggml_view_4d(ctx, pad, n_rot, Bsz, hh, nbk, pad->nb[1], pad->nb[2], pad->nb[3], 2 * pad->nb[3]);
+            return ggml_cont(ctx, ggml_concat(ctx, ggml_concat(ctx, pr, se, 1), nx, 1));     // [d,3B,hh,nb]
+        };
+        ggml_tensor * qb = to_blocks(q, n_head);                        // [d,B,14,nb]
+        ggml_tensor * kc = neigh(to_blocks(k, n_head_kv), n_head_kv);   // [d,3B,2,nb]
+        ggml_tensor * vc = neigh(to_blocks(v, n_head_kv), n_head_kv);   // [d,3B,2,nb]
+        // flash over the 3-block neighborhoods: no materialized band scores, so
+        // memory is O(n*band). mask is the F16 per-block band; sinks per block.
+        ggml_tensor * m16 = ggml_cast(ctx, band_mask, GGML_TYPE_F16);   // [3B,B,1,nb]
+        ggml_tensor * o = ggml_flash_attn_ext(ctx, qb, kc, vc, m16, scale, 0.0f, 0.0f); // [d,14,B,nb]
+        ggml_flash_attn_ext_set_prec(o, GGML_PREC_F32);
+        ggml_flash_attn_ext_add_sinks(o, sinks);
+        o = ggml_reshape_3d(ctx, o, n_rot, n_head, n_pad);             // [d,14,n_pad]
+        if (n_pad != n)
+            o = ggml_view_3d(ctx, o, n_rot, n_head, n, o->nb[1], o->nb[2], 0);
+        return ggml_cont_2d(ctx, o, n_head * n_rot, n);                // [896, n]
+    };
+
     for (int il = 0; il < h.n_layer; il++) {
         const layer_weights & l = layers[il];
         const std::string L = "l" + std::to_string(il);
@@ -239,7 +327,7 @@ bool model::forward(const int32_t * ids, int64_t n, std::vector<float> & logits,
         cur = tap(rms(cur, l.attn_norm), L + ".attn_norm");
 
         // self-attention
-        {
+        if (!prof_noattn) {
             ggml_tensor * q = ggml_add(ctx, ggml_mul_mat(ctx, l.wq, cur), l.bq);  // [896, n]
             ggml_tensor * k = ggml_add(ctx, ggml_mul_mat(ctx, l.wk, cur), l.bk);  // [128, n]
             ggml_tensor * v = ggml_add(ctx, ggml_mul_mat(ctx, l.wv, cur), l.bv);  // [128, n]
@@ -251,20 +339,33 @@ bool model::forward(const int32_t * ids, int64_t n, std::vector<float> & logits,
             q = tap(rope(q), L + ".q_rope");  // [64, 14, n]
             k = tap(rope(k), L + ".k_rope");  // [64,  2, n]
 
+            const float kq_scale = 1.0f / std::sqrt((float) n_rot);
+            ggml_tensor * attn;                                                  // [896, n]
+            if (use_banded) {
+                attn = banded_attn(q, k, v, l.sinks);
+            } else {
             ggml_tensor * qp = ggml_permute(ctx, q, 0, 2, 1, 3);                 // [64, n, 14]
             ggml_tensor * kp = ggml_permute(ctx, k, 0, 2, 1, 3);                 // [64, n,  2]
-            ggml_tensor * vp = ggml_cont(ctx, ggml_permute(ctx, v, 1, 2, 0, 3)); // [n, 64,  2]
-
-            ggml_tensor * kq = ggml_mul_mat(ctx, kp, qp);                        // [n, n, 14] (GQA broadcast)
-            ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
-            kq = ggml_soft_max_ext(ctx, kq, kq_mask, 1.0f / std::sqrt((float) n_rot), 0.0f);
-            ggml_soft_max_add_sinks(kq, l.sinks);
-
-            ggml_tensor * kqv = ggml_mul_mat(ctx, vp, kq);                       // [64, n, 14]
-            kqv = ggml_permute(ctx, kqv, 0, 2, 1, 3);                            // [64, 14, n]
-            kqv = ggml_cont_2d(ctx, kqv, n_head * n_rot, n);                     // [896, n]
+            if (use_flash) {
+                ggml_tensor * vp = ggml_cont(ctx, ggml_permute(ctx, v, 0, 2, 1, 3)); // [64, n, 2]
+                ggml_tensor * m16 = ggml_cast(ctx, kq_mask, GGML_TYPE_F16);
+                ggml_tensor * fa = ggml_flash_attn_ext(ctx, qp, kp, vp, m16, kq_scale, 0.0f, 0.0f);
+                ggml_flash_attn_ext_set_prec(fa, GGML_PREC_F32);
+                ggml_flash_attn_ext_add_sinks(fa, l.sinks);
+                attn = ggml_reshape_2d(ctx, fa, n_head * n_rot, n);             // [896, n]
+            } else {
+                ggml_tensor * vp = ggml_cont(ctx, ggml_permute(ctx, v, 1, 2, 0, 3)); // [n, 64, 2]
+                ggml_tensor * kq = ggml_mul_mat(ctx, kp, qp);                        // [n, n, 14] (GQA)
+                ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
+                kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_scale, 0.0f);
+                ggml_soft_max_add_sinks(kq, l.sinks);
+                ggml_tensor * kqv = ggml_mul_mat(ctx, vp, kq);                       // [64, n, 14]
+                kqv = ggml_permute(ctx, kqv, 0, 2, 1, 3);                            // [64, 14, n]
+                attn = ggml_cont_2d(ctx, kqv, n_head * n_rot, n);                    // [896, n]
+            }
+            }
 
-            cur = ggml_add(ctx, ggml_mul_mat(ctx, l.wo, kqv), l.bo);             // [640, n]
+            cur = ggml_add(ctx, ggml_mul_mat(ctx, l.wo, attn), l.bo);            // [640, n]
             tap(cur, L + ".attn_out");
         }
 
@@ -276,37 +377,45 @@ bool model::forward(const int32_t * ids, int64_t n, std::vector<float> & logits,
 
         // MoE FFN (softmax-after-top-k gating; the HF reference's /top_k and
         // *top_k cancel, so plain softmax weights are the trained semantics)
-        {
+        if (!prof_nomoe) {
             const int64_t n_exp = h.n_expert, n_used = h.n_expert_used;
 
-            ggml_tensor * rl = ggml_add(ctx, ggml_mul_mat(ctx, l.router_w, cur), l.router_b);  // [128, n]
-            tap(rl, L + ".moe_logits");
-
-            ggml_tensor * sel = tap(ggml_argsort_top_k(ctx, rl, (int) n_used), L + ".moe_topk");  // i32 [4, n]
-
-            ggml_tensor * w = ggml_get_rows(ctx, ggml_reshape_3d(ctx, rl, 1, n_exp, n), sel);     // [1, 4, n]
-            w = ggml_soft_max(ctx, ggml_reshape_2d(ctx, w, n_used, n));
-            tap(w, L + ".moe_weights");
-            w = ggml_reshape_3d(ctx, w, 1, n_used, n);
-
-            ggml_tensor * x3   = ggml_reshape_3d(ctx, cur, n_embd, 1, n);
-            ggml_tensor * up   = ggml_mul_mat_id(ctx, l.up_exps, x3, sel);                        // [640, 4, n]
-            up                 = ggml_add_id(ctx, up, l.up_exps_b, sel);
-            ggml_tensor * gate = ggml_mul_mat_id(ctx, l.gate_exps, x3, sel);
-            gate               = ggml_add_id(ctx, gate, l.gate_exps_b, sel);
-
-            ggml_tensor * hms  = ggml_swiglu_oai(ctx, gate, up, 1.702f, 7.0f);                    // [640, 4, n]
-
-            ggml_tensor * out  = ggml_mul_mat_id(ctx, l.down_exps, hms, sel);                     // [640, 4, n]
-            out                = ggml_add_id(ctx, out, l.down_exps_b, sel);
-            out                = ggml_mul(ctx, out, w);
-
-            ggml_tensor * moe = nullptr;
-            for (int64_t e = 0; e < n_used; e++) {
-                ggml_tensor * slice = ggml_view_2d(ctx, out, n_embd, n, out->nb[2], e * out->nb[1]);
-                moe = moe ? ggml_add(ctx, moe, slice) : slice;
+            // one expert FFN over m tokens (x: [n_embd, m]) -> [n_embd, m]
+            auto moe_ffn = [&](ggml_tensor * x, int64_t m, bool do_taps) {
+                ggml_tensor * rl = ggml_add(ctx, ggml_mul_mat(ctx, l.router_w, x), l.router_b);   // [128, m]
+                if (do_taps) tap(rl, L + ".moe_logits");
+                ggml_tensor * sel = ggml_argsort_top_k(ctx, rl, (int) n_used);                    // i32 [4, m]
+                if (do_taps) tap(sel, L + ".moe_topk");
+                ggml_tensor * w = ggml_get_rows(ctx, ggml_reshape_3d(ctx, rl, 1, n_exp, m), sel); // [1, 4, m]
+                w = ggml_soft_max(ctx, ggml_reshape_2d(ctx, w, n_used, m));
+                if (do_taps) tap(w, L + ".moe_weights");
+                w = ggml_reshape_3d(ctx, w, 1, n_used, m);
+                ggml_tensor * x3   = ggml_reshape_3d(ctx, x, n_embd, 1, m);
+                ggml_tensor * up   = ggml_add_id(ctx, ggml_mul_mat_id(ctx, l.up_exps, x3, sel), l.up_exps_b, sel);
+                ggml_tensor * gate = ggml_add_id(ctx, ggml_mul_mat_id(ctx, l.gate_exps, x3, sel), l.gate_exps_b, sel);
+                ggml_tensor * hms  = ggml_swiglu_oai(ctx, gate, up, 1.702f, 7.0f);                // [640, 4, m]
+                ggml_tensor * out  = ggml_add_id(ctx, ggml_mul_mat_id(ctx, l.down_exps, hms, sel), l.down_exps_b, sel);
+                out                = ggml_mul(ctx, out, w);
+                ggml_tensor * moe = nullptr;
+                for (int64_t e = 0; e < n_used; e++) {
+                    ggml_tensor * sl = ggml_view_2d(ctx, out, n_embd, m, out->nb[2], e * out->nb[1]);
+                    moe = moe ? ggml_add(ctx, moe, sl) : sl;
+                }
+                return ggml_cont(ctx, moe);                                                       // [n_embd, m]
+            };
+
+            if (moe_chunk > 0 && n > moe_chunk) {
+                ggml_tensor * acc = nullptr;
+                for (int64_t c = 0; c < n; c += moe_chunk) {
+                    const int64_t m = std::min<int64_t>(moe_chunk, n - c);
+                    ggml_tensor * xs = ggml_cont(ctx, ggml_view_2d(ctx, cur, n_embd, m, cur->nb[1], c * cur->nb[1]));
+                    ggml_tensor * ys = moe_ffn(xs, m, false);
+                    acc = acc ? ggml_concat(ctx, acc, ys, 1) : ys;
+                }
+                cur = tap(acc, L + ".moe_out");
+            } else {
+                cur = tap(moe_ffn(cur, n, taps != nullptr), L + ".moe_out");
             }
-            cur = tap(ggml_cont(ctx, moe), L + ".moe_out");
         }
 
         cur = ggml_add(ctx, cur, resid);
@@ -327,17 +436,35 @@ bool model::forward(const int32_t * ids, int64_t n, std::vector<float> & logits,
         return false;
     }
 
-    // inputs AFTER alloc
+    // inputs AFTER alloc. Guard each set on ->buffer: a PF_PROF ablation can
+    // prune a block, orphaning its inputs (gallocr leaves them unallocated);
+    // unset, every input is live so this is a no-op.
     ggml_backend_tensor_set(inp_tokens, ids, 0, n * sizeof(int32_t));
     {
         std::vector<int32_t> pos(n);
         for (int64_t i = 0; i < n; i++) pos[i] = (int32_t) i;
-        ggml_backend_tensor_set(inp_pos, pos.data(), 0, n * sizeof(int32_t));
-
-        std::vector<float> mask(n * n);
-        fill_swa_mask(mask.data(), n, h.swa_radius);
-        ggml_backend_tensor_set(kq_mask, mask.data(), 0, mask.size() * sizeof(float));
-        ggml_backend_tensor_set(ff, freq_factors.data(), 0, freq_factors.size() * sizeof(float));
+        if (inp_pos->buffer) ggml_backend_tensor_set(inp_pos, pos.data(), 0, n * sizeof(int32_t));
+
+        if (use_banded) {
+            // per-block band mask [3B,B,1,nb]: query (bi*B+j) sees context key
+            // (bi*B-B+p) iff within radius and a real (unpadded) token.
+            const int64_t r = h.swa_radius;
+            std::vector<float> bm((size_t) 3 * Bsz * Bsz * nbk);
+            for (int bi = 0; bi < nbk; bi++)
+                for (int j = 0; j < Bsz; j++)
+                    for (int p = 0; p < 3 * Bsz; p++) {
+                        const int64_t qpos = (int64_t) bi * Bsz + j;
+                        const int64_t kpos = (int64_t) bi * Bsz - Bsz + p;
+                        const bool vis = std::llabs(qpos - kpos) <= r && kpos >= 0 && kpos < n;
+                        bm[((size_t) bi * Bsz + j) * (3 * Bsz) + p] = vis ? 0.0f : -INFINITY;
+                    }
+            if (band_mask->buffer) ggml_backend_tensor_set(band_mask, bm.data(), 0, bm.size() * sizeof(float));
+        } else {
+            std::vector<float> mask((size_t) n * n);
+            fill_swa_mask(mask.data(), n, h.swa_radius);
+            if (kq_mask->buffer) ggml_backend_tensor_set(kq_mask, mask.data(), 0, mask.size() * sizeof(float));
+        }
+        if (ff->buffer)      ggml_backend_tensor_set(ff, freq_factors.data(), 0, freq_factors.size() * sizeof(float));
     }
 
     if (ggml_backend_graph_compute(be.be, gf) != GGML_STATUS_SUCCESS) {
diff --git a/tests/test_graph_blocks.cpp b/tests/test_graph_blocks.cpp
index 90a72a2..f2636ad 100644
--- a/tests/test_graph_blocks.cpp
+++ b/tests/test_graph_blocks.cpp
@@ -4,7 +4,7 @@
 // (interleaved pairing, freq_factors division, attn_factor application).
 #include "model.h"
 
-#include <ggml-cpu.h>
+#include <ggml-backend.h>
 #include <ggml.h>
 
 #include <cmath>
@@ -80,13 +80,24 @@ static void test_swa_mask() {
     }
 }
 
-// tiny single-op CPU eval helper
+// tiny single-op CPU eval helper. Uses the backend API rather than ggml-cpu's
+// ggml_graph_compute_with_ctx so it links in a GGML_BACKEND_DL build (where the
+// CPU compute symbols live in the variant .so, loaded at runtime). Tensors are
+// in a no_alloc=false ctx, so their ->data is CPU-resident and computed in place.
+static ggml_backend_t cpu_backend() {
+    static ggml_backend_t be = [] {  // created once on first call and shared by every eval(); not freed in this helper
+        ggml_backend_load_all();  // load backends before looking up the CPU device
+        return ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);  // NOTE(review): result is not null-checked
+    }();
+    return be;
+}
+
 template <typename BUILD>
 static ggml_tensor * eval(ggml_context * ctx, BUILD build) {
     ggml_tensor * out = build();
     ggml_cgraph * gf = ggml_new_graph(ctx);
     ggml_build_forward_expand(gf, out);
-    ggml_graph_compute_with_ctx(ctx, gf, 2);
+    GGML_ASSERT(ggml_backend_graph_compute(cpu_backend(), gf) == GGML_STATUS_SUCCESS);  // abort on failure instead of returning uncomputed data
     return out;
 }