localai-org · richiejp · Jun 16, 2026 · Jun 16, 2026 · Jun 16, 2026 · Jun 16, 2026
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -13,7 +13,11 @@ jobs:
     strategy:
       matrix:
         include:
-          - { name: gcc-release, preset: release, cc: gcc, cxx: g++ }
+          # gcc-release builds the shipped CPU config: every ggml-cpu ISA variant
+          # (GGML_CPU_ALL_VARIANTS), runtime-dispatched -- so released binaries get
+          # AVX-512 without -march=native (which Nix strips and which mis-targets
+          # cross-host builds).
+          - { name: gcc-release, preset: release-portable, cc: gcc, cxx: g++ }
           - { name: clang-debug-san, preset: debug, cc: clang, cxx: clang++ }
     name: ${{ matrix.name }}
     runs-on: ubuntu-latest
@@ -32,13 +36,24 @@ jobs:
         run: cmake --preset ${{ matrix.preset }} -DGGML_NATIVE=OFF
         env: { CC: '${{ matrix.cc }}', CXX: '${{ matrix.cxx }}' }
       - name: build
-        run: cmake --build --preset ${{ matrix.preset }} -j
+        # -j4 (not unbounded -j): release-portable compiles 14 ggml-cpu ISA
+        # variants; uncapped parallelism OOMs the 16 GB runner.
+        run: cmake --build --preset ${{ matrix.preset }} -j4
       - name: test (model-independent)
         run: ctest --preset ${{ matrix.preset }} -LE model
       - name: unicode table regen check
         run: |
           python3 scripts/gen_unicode.py > /tmp/unicode_data.inc
           diff -u src/unicode_data.inc /tmp/unicode_data.inc
+      - name: assert SIMD compiled in (guard the SSE-only trap)
+        if: matrix.name == 'gcc-release'
+        # The AVX-512 ISA variant must actually contain AVX-512 -- catches a
+        # silent regression to a SIMD-less build (e.g. a stripped -march).
+        run: |
+          so=build/release-portable/bin/libggml-cpu-skylakex.so
+          n=$(objdump -d "$so" | grep -c '%zmm' || true)  # grep -c exits 1 on zero matches; keep going so the count is logged
+          echo "skylakex %zmm instructions: $n"
+          test "$n" -gt 0
 
   # Tier 2 (nightly / dispatch): HF reference fixtures + parity + fuzz smoke.
   # Needs the model checkpoint; cached between runs.
@@ -92,9 +107,9 @@ jobs:
             --model ~/models/privacy-filter-multilingual \
             --outfile ~/ggufs/pf-f32.gguf --outtype f32
       - name: build
-        run: cmake --preset release -DGGML_NATIVE=OFF && cmake --build --preset release -j
+        run: cmake --preset release-portable && cmake --build --preset release-portable -j4
       - name: parity suite
-        run: PF_GGUF_DIR=~/ggufs ctest --preset release -L model
+        run: PF_GGUF_DIR=~/ggufs ctest --preset release-portable -L model
       - name: fuzz smoke (5 min/target)
         run: |
           cmake --preset fuzz && cmake --build --preset fuzz -j --target fuzz_tokenizer fuzz_gguf

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -60,6 +60,17 @@ if (PF_BUILD_TOOLS)
     add_executable(pf-bench bench/pf-bench.cpp)
     target_link_libraries(pf-bench PRIVATE pf)
     target_include_directories(pf-bench PRIVATE src)
+
+    # GFLOP/s by dtype/shape -- diagnostic for the CPU matmul analysis
+    # (docs/cpu-perf.md). Links ggml directly; no pf/model deps.
+    add_executable(pf-gemm-bench bench/gemm_microbench.cpp)
+    target_link_libraries(pf-gemm-bench PRIVATE ggml)
+
+    # Prototype: O(n*band) block-local attention == full masked attention
+    # (bit-identical), with an O(n*B) mask instead of O(n^2). Proof-of-concept
+    # for de-windowing; see docs/cpu-perf.md. Args: [block] [tokens].
+    add_executable(pf-banded-proto bench/banded_attn_proto.cpp)
+    target_link_libraries(pf-banded-proto PRIVATE ggml)
 endif()
 
 if (PF_FUZZ)

diff --git a/CMakePresets.json b/CMakePresets.json
@@ -16,6 +16,18 @@
       "binaryDir": "${sourceDir}/build/release",
       "cacheVariables": { "CMAKE_BUILD_TYPE": "Release" }
     },
+    {
+      "name": "release-portable",
+      "binaryDir": "${sourceDir}/build/release-portable",
+      "description": "Portable + fast CPU: build every ggml-cpu ISA variant and pick the best at runtime. Avoids -march=native (fragile, and stripped by Nix's NIX_ENFORCE_NO_NATIVE), so binaries run anywhere yet still use AVX-512/VNNI where present.",
+      "cacheVariables": {
+        "CMAKE_BUILD_TYPE": "Release",
+        "GGML_NATIVE": "OFF",
+        "GGML_BACKEND_DL": "ON",
+        "GGML_CPU_ALL_VARIANTS": "ON",
+        "CMAKE_RUNTIME_OUTPUT_DIRECTORY": "${sourceDir}/build/release-portable/bin"
+      }
+    },
     {
       "name": "profile",
       "binaryDir": "${sourceDir}/build/profile",
@@ -35,11 +47,13 @@
   "buildPresets": [
     { "name": "debug",   "configurePreset": "debug" },
     { "name": "release", "configurePreset": "release" },
+    { "name": "release-portable", "configurePreset": "release-portable" },
     { "name": "profile", "configurePreset": "profile" },
     { "name": "fuzz",    "configurePreset": "fuzz" }
   ],
   "testPresets": [
     { "name": "debug",   "configurePreset": "debug",   "output": { "outputOnFailure": true } },
-    { "name": "release", "configurePreset": "release", "output": { "outputOnFailure": true } }
+    { "name": "release", "configurePreset": "release", "output": { "outputOnFailure": true } },
+    { "name": "release-portable", "configurePreset": "release-portable", "output": { "outputOnFailure": true } }
   ]
 }
diff --git a/README.md b/README.md
@@ -130,18 +130,33 @@ PF_GGUF=model.gguf ./build/fuzz/fuzz_tokenizer corpus_tok/
 ## Bench
 
 ```sh
-build/release/pf-bench model.gguf [cpu|vulkan] [iters]
+cmake --preset release-portable && cmake --build --preset release-portable -j
+build/release-portable/bin/pf-bench model.gguf [cpu|vulkan] [iters] [lengths]
 ```
 
-Ryzen 9 7900 (12 threads) / RTX 5070 Ti, f16 GGUF, forward tok/s by length
-(one untimed warm-up per length; GPU pipelines compile lazily):
+Forward tok/s vs stock HF Transformers (transformers 5.9, eager), Ryzen 9 7900 (12
+threads) + RTX 5070 Ti, f16/fp16, matched token counts
+([scripts/bench_torch.py](scripts/bench_torch.py)):
 
-| tokens | cpu | vulkan |
-|-------:|----:|-------:|
-|    189 | 161 | 51 583 |
-|    756 | 178 | 99 756 |
-|  2 898 | 129 | 45 416 |
-| 11 403 |  68 | 20 085 |
-| 45 234 |  60 | 17 390 |
+GPU — ours (Vulkan) vs HF (CUDA):
 
-Weights stay in one zero-copy buffer: ~2.8 GiB RSS over baseline (f16).
+| tokens |      HF |    ours |    × |
+|-------:|--------:|--------:|-----:|
+|    512 |   5 526 | 100 503 |  18× |
+|  2 048 |  16 427 | 145 481 | 8.9× |
+|  8 192 |  14 154 | 105 034 | 7.4× |
+| 32 768 |     OOM |  83 519 |    — |
+| 131 072 |     OOM |  81 105 |    — |
+
+CPU — ours vs HF (fp32):
+
+| tokens |    HF |  ours |    × |
+|-------:|------:|------:|-----:|
+|    512 | 2 171 | 3 564 | 1.6× |
+|  2 048 |   978 | 3 490 | 3.6× |
+|  8 192 |   304 | 2 332 | 7.7× |
+
+Memory is flat ~2.8 GiB VRAM / ~3 GiB RAM to 131k tokens; HF OOMs past ~16k on a 16
+GiB GPU. `release-portable` runtime-dispatches the best ggml-cpu ISA (AVX-512
+without `-march=native`); flash + banded attention default on. See
+[docs/cpu-perf.md](docs/cpu-perf.md).
diff --git a/bench/banded_attn_proto.cpp b/bench/banded_attn_proto.cpp
@@ -0,0 +1,101 @@
+// Prototype: O(n*band) block-local sliding-window attention vs the full O(n^2)
+// masked attention, on random data, to validate correctness before wiring it
+// into the model. Single head for clarity.
+//
+//   Full:   scores[n_kv,n_q] = mul_mat(K,Q); mask |q-k|<=r; softmax; out=V^T@scores
+//   Banded: tokens grouped into blocks of B (>= r). Each query block attends only
+//           to blocks {i-1,i,i+1} (3B keys, since r<=B). A per-block mask
+//           [3B,B,n_blocks] (O(n*B) memory) carries the band + edge validity.
+#include <ggml.h>
+#include <ggml-cpu.h>
+#include <ggml-backend.h>
+
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <initializer_list>
+
+static ggml_backend_t cpu() {  // shared CPU backend; the initializer below runs on the first call only
+    static const ggml_backend_t backend = (ggml_backend_load_all(),  // runs before the init call
+        ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr));
+    return backend;
+}
+static void run(ggml_context * ctx, ggml_tensor * out) {  // evaluate `out` on the CPU backend, aborting on failure
+    ggml_cgraph * gf = ggml_new_graph(ctx);
+    ggml_build_forward_expand(gf, out);
+    GGML_ASSERT(cpu() != nullptr && ggml_backend_graph_compute(cpu(), gf) == GGML_STATUS_SUCCESS);  // always-on check
+}
+
+int main(int argc, char ** argv) {
+    const int d = 64;
+    const int r = 128;                       // sliding-window radius
+    const int B = argc > 1 ? std::atoi(argv[1]) : 256;   // block size (>= r)
+    const int n = argc > 2 ? std::atoi(argv[2]) : 4096;  // tokens (multiple of B)
+    // Reject bad args up front: the {i-1,i,i+1} window only covers the band when B >= r, and n / B must be exact.
+    if (B < r || n <= 0 || n % B != 0) { std::fprintf(stderr, "usage: %s [block >= %d] [tokens = k * block]\n", argv[0], r); return 2; }
+    const int nb = n / B;
+    const float scale = 1.0f / std::sqrt((float) d);
+
+    ggml_context * ctx = ggml_init({ (size_t) 2048 * 1024 * 1024, nullptr, false });  // no_alloc = false: tensor data lives in this pool
+
+    // shared random q,k,v  [d, n]
+    ggml_tensor * Q = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, d, n);
+    ggml_tensor * K = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, d, n);
+    ggml_tensor * V = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, d, n);
+    for (ggml_tensor * t : { Q, K, V })
+        for (int i = 0; i < d * n; i++) ((float *) t->data)[i] = (float) (rand() % 2000) / 1000.0f - 1.0f;
+
+    // ---- full reference ----
+    ggml_tensor * Fmask = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n, n);  // [n_kv, n_q]
+    for (int q = 0; q < n; q++)
+        for (int k = 0; k < n; k++)
+            ((float *) Fmask->data)[(size_t) q * n + k] = (std::abs(q - k) <= r) ? 0.0f : -INFINITY;
+    ggml_tensor * sc = ggml_mul_mat(ctx, K, Q);                          // [n_kv, n_q]
+    sc = ggml_soft_max_ext(ctx, sc, Fmask, scale, 0.0f);
+    ggml_tensor * Vt = ggml_cont(ctx, ggml_transpose(ctx, V));           // [n, d]
+    ggml_tensor * full = ggml_mul_mat(ctx, Vt, sc);                      // [d, n_q]
+    run(ctx, full);
+
+    // ---- banded ----
+    ggml_tensor * qb = ggml_reshape_3d(ctx, Q, d, B, nb);               // [d, B, nb]
+    // pad a zero block each side along the block axis, then 3 shifted views
+    auto ctx3 = [&](ggml_tensor * x) {
+        ggml_tensor * z = ggml_scale(ctx, ggml_view_3d(ctx, x, d, B, 1, x->nb[1], x->nb[2], 0), 0.0f);
+        ggml_tensor * pad = ggml_concat(ctx, ggml_concat(ctx, z, x, 2), z, 2);  // [d, B, nb+2]
+        ggml_tensor * prev = ggml_view_3d(ctx, pad, d, B, nb, pad->nb[1], pad->nb[2], 0 * pad->nb[2]);
+        ggml_tensor * self = ggml_view_3d(ctx, pad, d, B, nb, pad->nb[1], pad->nb[2], 1 * pad->nb[2]);
+        ggml_tensor * next = ggml_view_3d(ctx, pad, d, B, nb, pad->nb[1], pad->nb[2], 2 * pad->nb[2]);
+        return ggml_cont(ctx, ggml_concat(ctx, ggml_concat(ctx, prev, self, 1), next, 1)); // [d, 3B, nb]
+    };
+    ggml_tensor * kc = ctx3(ggml_reshape_3d(ctx, K, d, B, nb));         // [d, 3B, nb]
+    ggml_tensor * vc = ctx3(ggml_reshape_3d(ctx, V, d, B, nb));
+    ggml_tensor * Bmask = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 3 * B, B, nb);  // [3B, B, nb]
+    for (int bi = 0; bi < nb; bi++)
+        for (int j = 0; j < B; j++)
+            for (int pp = 0; pp < 3 * B; pp++) {
+                const int qpos = bi * B + j;
+                const int kpos = bi * B - B + pp;
+                const bool vis = std::abs(qpos - kpos) <= r && kpos >= 0 && kpos < n;
+                ((float *) Bmask->data)[((size_t) bi * B + j) * (3 * B) + pp] = vis ? 0.0f : -INFINITY;
+            }
+    ggml_tensor * scb = ggml_mul_mat(ctx, kc, qb);                      // [3B, B, nb]
+    scb = ggml_soft_max_ext(ctx, scb, Bmask, scale, 0.0f);
+    ggml_tensor * vct = ggml_cont(ctx, ggml_transpose(ctx, vc));        // [3B, d, nb]
+    ggml_tensor * outb = ggml_mul_mat(ctx, vct, scb);                  // [d, B, nb]
+    outb = ggml_cont_2d(ctx, outb, d, n);                              // [d, n]
+    run(ctx, outb);
+
+    double maxabs = 0, maxrel = 0;  // compare; a NaN difference sticks instead of being dropped by max()
+    for (int i = 0; i < d * n; i++) {
+        const double a = ((float *) full->data)[i], b = ((float *) outb->data)[i];
+        const double diff = std::fabs(a - b), rel = diff / (std::fabs(a) + 1e-6);
+        if (std::isnan(diff) || diff > maxabs) maxabs = diff;
+        if (std::isnan(rel) || rel > maxrel) maxrel = rel;
+    }
+    const double full_mask_mib = (double) n * n * 4 / 1048576.0;
+    const double band_mask_mib = (double) 3 * B * B * nb * 4 / 1048576.0;
+    std::printf("n=%d B=%d r=%d | max|d|=%.2e maxrel=%.2e | mask: full %.1f MiB, band %.1f MiB (%.1fx)\n",
+                n, B, r, maxabs, maxrel, full_mask_mib, band_mask_mib, full_mask_mib / band_mask_mib);
+    ggml_free(ctx);
+    return 0;
+}
diff --git a/bench/gemm_microbench.cpp b/bench/gemm_microbench.cpp
@@ -0,0 +1,78 @@
+// pf-gemm-bench — isolates ggml's CPU matmul throughput (GFLOP/s) by dtype and
+// shape, to explain where CPU time goes vs a BLAS-backed framework (numpy/torch
+// = MKL/oneDNN). result = mul_mat(a[K,M], b[K,N]) -> [M,N]; FLOPs = 2*M*N*K.
+//
+// The finding it was written to demonstrate: ggml's float matmul is ~40 GFLOP/s flat
+// at every size (no cache-blocked SGEMM -- the kernels are per-row vec_dot, tuned
+// for quantized weights where bandwidth dominates), while its q8_0 path is
+// 5-7x faster. MKL's blocked SGEMM is 12-29x the ggml-f32 rate (run mkl side
+// separately, e.g. torch.matmul). See docs/cpu-perf.md.
+#include <ggml.h>
+#include <ggml-cpu.h>
+#include <ggml-backend.h>
+
+#include <chrono>
+#include <cstdio>
+#include <cstdlib>
+#include <vector>
+
+// Mean GFLOP/s of c = mul_mat(a, b) on `be` over `iters` timed runs (one untimed warm-up first);
+// a is [K, M] of type `ta`, b is [K, N] f32, and each run is counted as 2*M*N*K FLOPs.
+static double gflops(ggml_backend_t be, ggml_type ta, int M, int K, int N, int iters) {
+    ggml_context * ctx = ggml_init({ (size_t) 64 * 1024 * 1024, nullptr, true });  // no_alloc: the gallocr below owns the data
+    ggml_tensor * a = ggml_new_tensor_2d(ctx, ta, K, M);
+    ggml_tensor * b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, K, N);
+    ggml_tensor * c = ggml_mul_mat(ctx, a, b);
+    ggml_cgraph * gf = ggml_new_graph(ctx);
+    ggml_build_forward_expand(gf, c);
+    ggml_gallocr * ga = ggml_gallocr_new(ggml_backend_get_default_buffer_type(be));
+    GGML_ASSERT(ggml_gallocr_alloc_graph(ga, gf));  // always-on assert: fail loudly if buffers cannot be allocated
+
+    std::vector<float> rb((size_t) K * N);
+    for (auto & x : rb) x = (float) (rand() % 1000) / 1000.0f - 0.5f;  // values irrelevant for timing
+    ggml_backend_tensor_set(b, rb.data(), 0, rb.size() * sizeof(float));
+    std::vector<float> ra((size_t) K * M);
+    for (auto & x : ra) x = (float) (rand() % 1000) / 1000.0f - 0.5f;
+    if (ta == GGML_TYPE_F32) {
+        ggml_backend_tensor_set(a, ra.data(), 0, ra.size() * sizeof(float));
+    } else {
+        std::vector<char> buf(ggml_nbytes(a));
+        ggml_quantize_chunk(ta, ra.data(), buf.data(), 0, M, K, nullptr);
+        ggml_backend_tensor_set(a, buf.data(), 0, buf.size());
+    }
+
+    GGML_ASSERT(ggml_backend_graph_compute(be, gf) == GGML_STATUS_SUCCESS);  // warm-up; a failing graph would otherwise time as "fast"
+    const auto t0 = std::chrono::steady_clock::now();  // monotonic clock for interval timing
+    for (int i = 0; i < iters; i++) ggml_backend_graph_compute(be, gf);
+    const double s = std::chrono::duration<double>(std::chrono::steady_clock::now() - t0).count() / iters;
+    ggml_gallocr_free(ga);
+    ggml_free(ctx);
+    return 2.0 * M * N * K / s / 1e9;
+}
+
+int main(int argc, char ** argv) {  // usage: pf-gemm-bench [threads], default 12
+    const int nth = argc > 1 ? std::atoi(argv[1]) : 12;
+    ggml_backend_load_all();  // pick the best CPU variant in a GGML_BACKEND_DL build
+    ggml_backend_t be = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
+    if (!be) { std::fprintf(stderr, "pf-gemm-bench: no CPU backend could be initialised\n"); return 1; }
+    auto set_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(
+        ggml_backend_dev_backend_reg(ggml_backend_get_device(be)), "ggml_backend_set_n_threads");
+    if (set_fn) set_fn(be, nth);
+    struct { const char * name; int M, K, N; } shp[] = {
+        { "expert gate_up N=16 ", 1280, 640, 16 },
+        { "expert gate_up N=64 ", 1280, 640, 64 },
+        { "expert gate_up N=512", 1280, 640, 512 },
+        { "expert down    N=16 ", 640, 640, 16 },
+        { "large 4096^2   N=512", 4096, 4096, 512 },
+    };
+    std::printf("ggml CPU mul_mat GFLOP/s, %d threads\n", nth);
+    std::printf("%-22s %10s %10s %10s\n", "shape (M,K,N)", "f32", "f16", "q8_0");
+    for (auto & s : shp) {
+        const int it = (double) s.M * s.N > 1e6 ? 20 : 200;  // fewer timed runs for the large shape
+        std::printf("%-22s %10.1f %10.1f %10.1f\n", s.name,
+                    gflops(be, GGML_TYPE_F32, s.M, s.K, s.N, it), gflops(be, GGML_TYPE_F16, s.M, s.K, s.N, it),
+                    gflops(be, GGML_TYPE_Q8_0, s.M, s.K, s.N, it));
+    }
+    ggml_backend_free(be);
+    return 0;
+}