diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 4484741..ed0a5fc 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -13,7 +13,11 @@ jobs: strategy: matrix: include: - - { name: gcc-release, preset: release, cc: gcc, cxx: g++ } + # gcc-release builds the shipped CPU config: every ggml-cpu ISA variant + # (GGML_CPU_ALL_VARIANTS), runtime-dispatched -- so released binaries get + # AVX-512 without -march=native (which Nix strips and which mis-targets + # cross-host builds). + - { name: gcc-release, preset: release-portable, cc: gcc, cxx: g++ } - { name: clang-debug-san, preset: debug, cc: clang, cxx: clang++ } name: ${{ matrix.name }} runs-on: ubuntu-latest @@ -32,13 +36,24 @@ jobs: run: cmake --preset ${{ matrix.preset }} -DGGML_NATIVE=OFF env: { CC: '${{ matrix.cc }}', CXX: '${{ matrix.cxx }}' } - name: build - run: cmake --build --preset ${{ matrix.preset }} -j + # -j4 (not unbounded -j): release-portable compiles 14 ggml-cpu ISA + # variants; uncapped parallelism OOMs the 16 GB runner. + run: cmake --build --preset ${{ matrix.preset }} -j4 - name: test (model-independent) run: ctest --preset ${{ matrix.preset }} -LE model - name: unicode table regen check run: | python3 scripts/gen_unicode.py > /tmp/unicode_data.inc diff -u src/unicode_data.inc /tmp/unicode_data.inc + - name: assert SIMD compiled in (guard the SSE-only trap) + if: matrix.name == 'gcc-release' + # The AVX-512 ISA variant must actually contain AVX-512 -- catches a + # silent regression to a SIMD-less build (e.g. a stripped -march). + run: | + so=build/release-portable/bin/libggml-cpu-skylakex.so + n=$(objdump -d "$so" | grep -c '%zmm') + echo "skylakex %zmm instructions: $n" + test "$n" -gt 0 # Tier 2 (nightly / dispatch): HF reference fixtures + parity + fuzz smoke. # Needs the model checkpoint; cached between runs. 
@@ -92,9 +107,9 @@ jobs: --model ~/models/privacy-filter-multilingual \ --outfile ~/ggufs/pf-f32.gguf --outtype f32 - name: build - run: cmake --preset release -DGGML_NATIVE=OFF && cmake --build --preset release -j + run: cmake --preset release-portable && cmake --build --preset release-portable -j4 - name: parity suite - run: PF_GGUF_DIR=~/ggufs ctest --preset release -L model + run: PF_GGUF_DIR=~/ggufs ctest --preset release-portable -L model - name: fuzz smoke (5 min/target) run: | cmake --preset fuzz && cmake --build --preset fuzz -j --target fuzz_tokenizer fuzz_gguf diff --git a/CMakeLists.txt b/CMakeLists.txt index 3225c26..64580ee 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -60,6 +60,17 @@ if (PF_BUILD_TOOLS) add_executable(pf-bench bench/pf-bench.cpp) target_link_libraries(pf-bench PRIVATE pf) target_include_directories(pf-bench PRIVATE src) + + # GFLOP/s by dtype/shape -- diagnostic for the CPU matmul analysis + # (docs/cpu-perf.md). Links ggml directly; no pf/model deps. + add_executable(pf-gemm-bench bench/gemm_microbench.cpp) + target_link_libraries(pf-gemm-bench PRIVATE ggml) + + # Prototype: O(n*band) block-local attention == full masked attention + # (bit-identical), with an O(n*B) mask instead of O(n^2). Proof-of-concept + # for de-windowing; see docs/cpu-perf.md. Args: [block] [tokens]. + add_executable(pf-banded-proto bench/banded_attn_proto.cpp) + target_link_libraries(pf-banded-proto PRIVATE ggml) endif() if (PF_FUZZ) diff --git a/CMakePresets.json b/CMakePresets.json index b098df2..32e58e2 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -16,6 +16,18 @@ "binaryDir": "${sourceDir}/build/release", "cacheVariables": { "CMAKE_BUILD_TYPE": "Release" } }, + { + "name": "release-portable", + "binaryDir": "${sourceDir}/build/release-portable", + "description": "Portable + fast CPU: build every ggml-cpu ISA variant and pick the best at runtime. 
Avoids -march=native (fragile, and stripped by Nix's NIX_ENFORCE_NO_NATIVE), so binaries run anywhere yet still use AVX-512/VNNI where present.", + "cacheVariables": { + "CMAKE_BUILD_TYPE": "Release", + "GGML_NATIVE": "OFF", + "GGML_BACKEND_DL": "ON", + "GGML_CPU_ALL_VARIANTS": "ON", + "CMAKE_RUNTIME_OUTPUT_DIRECTORY": "${sourceDir}/build/release-portable/bin" + } + }, { "name": "profile", "binaryDir": "${sourceDir}/build/profile", @@ -35,11 +47,13 @@ "buildPresets": [ { "name": "debug", "configurePreset": "debug" }, { "name": "release", "configurePreset": "release" }, + { "name": "release-portable", "configurePreset": "release-portable" }, { "name": "profile", "configurePreset": "profile" }, { "name": "fuzz", "configurePreset": "fuzz" } ], "testPresets": [ { "name": "debug", "configurePreset": "debug", "output": { "outputOnFailure": true } }, - { "name": "release", "configurePreset": "release", "output": { "outputOnFailure": true } } + { "name": "release", "configurePreset": "release", "output": { "outputOnFailure": true } }, + { "name": "release-portable", "configurePreset": "release-portable", "output": { "outputOnFailure": true } } ] } diff --git a/README.md b/README.md index 093a920..f4e49f3 100644 --- a/README.md +++ b/README.md @@ -130,18 +130,33 @@ PF_GGUF=model.gguf ./build/fuzz/fuzz_tokenizer corpus_tok/ ## Bench ```sh -build/release/pf-bench model.gguf [cpu|vulkan] [iters] +cmake --preset release-portable && cmake --build --preset release-portable -j +build/release-portable/bin/pf-bench model.gguf [cpu|vulkan] [iters] [lengths] ``` -Ryzen 9 7900 (12 threads) / RTX 5070 Ti, f16 GGUF, forward tok/s by length -(one untimed warm-up per length; GPU pipelines compile lazily): +Forward tok/s vs stock HF Transformers (transformers 5.9, eager), Ryzen 9 7900 (12 +threads) + RTX 5070 Ti, f16/fp16, matched token counts +([scripts/bench_torch.py](scripts/bench_torch.py)): -| tokens | cpu | vulkan | -|-------:|----:|-------:| -| 189 | 161 | 51 583 | -| 756 | 178 | 99 
756 | -| 2 898 | 129 | 45 416 | -| 11 403 | 68 | 20 085 | -| 45 234 | 60 | 17 390 | +GPU — ours (Vulkan) vs HF (CUDA): -Weights stay in one zero-copy buffer: ~2.8 GiB RSS over baseline (f16). +| tokens | HF | ours | × | +|-------:|--------:|--------:|-----:| +| 512 | 5 526 | 100 503 | 18× | +| 2 048 | 16 427 | 145 481 | 8.9× | +| 8 192 | 14 154 | 105 034 | 7.4× | +| 32 768 | OOM | 83 519 | — | +| 131072 | OOM | 81 105 | — | + +CPU — ours vs HF (fp32): + +| tokens | HF | ours | × | +|-------:|------:|------:|-----:| +| 512 | 2 171 | 3 564 | 1.6× | +| 2 048 | 978 | 3 490 | 3.6× | +| 8 192 | 304 | 2 332 | 7.7× | + +Memory is flat ~2.8 GiB VRAM / ~3 GiB RAM to 131k tokens; HF OOMs past ~16k on a 16 +GiB GPU. `release-portable` runtime-dispatches the best ggml-cpu ISA (AVX-512 +without `-march=native`); flash + banded attention default on. See +[docs/cpu-perf.md](docs/cpu-perf.md). diff --git a/bench/banded_attn_proto.cpp b/bench/banded_attn_proto.cpp new file mode 100644 index 0000000..50e9e02 --- /dev/null +++ b/bench/banded_attn_proto.cpp @@ -0,0 +1,101 @@ +// Prototype: O(n*band) block-local sliding-window attention vs the full O(n^2) +// masked attention, on random data, to validate correctness before wiring it +// into the model. Single head for clarity. +// +// Full: scores[n_kv,n_q] = mul_mat(K,Q); mask |q-k|<=r; softmax; out=V^T@scores +// Banded: tokens grouped into blocks of B (>= r). Each query block attends only +// to blocks {i-1,i,i+1} (3B keys, since r<=B). A per-block mask +// [3B,B,n_blocks] (O(n*B) memory) carries the band + edge validity. 
+#include +#include +#include + +#include +#include +#include +#include + +static ggml_backend_t cpu() { + static ggml_backend_t be = [] { ggml_backend_load_all(); + return ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr); }(); + return be; +} +static void run(ggml_context * ctx, ggml_tensor * out) { + ggml_cgraph * gf = ggml_new_graph(ctx); + ggml_build_forward_expand(gf, out); + ggml_backend_graph_compute(cpu(), gf); +} + +int main(int argc, char ** argv) { + const int d = 64; + const int r = 128; // sliding-window radius + const int B = argc > 1 ? std::atoi(argv[1]) : 256; // block size (>= r) + const int n = argc > 2 ? std::atoi(argv[2]) : 4096; // tokens (multiple of B) + const int nb = n / B; + const float scale = 1.0f / std::sqrt((float) d); + + ggml_init_params p = { (size_t) 2048 * 1024 * 1024, nullptr, false }; + ggml_context * ctx = ggml_init(p); + + // shared random q,k,v [d, n] + ggml_tensor * Q = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, d, n); + ggml_tensor * K = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, d, n); + ggml_tensor * V = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, d, n); + for (ggml_tensor * t : { Q, K, V }) + for (int i = 0; i < d * n; i++) ((float *) t->data)[i] = (float) (rand() % 2000) / 1000.0f - 1.0f; + + // ---- full reference ---- + ggml_tensor * Fmask = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n, n); // [n_kv, n_q] + for (int q = 0; q < n; q++) + for (int k = 0; k < n; k++) + ((float *) Fmask->data)[(size_t) q * n + k] = (std::abs(q - k) <= r) ? 
0.0f : -INFINITY; + ggml_tensor * sc = ggml_mul_mat(ctx, K, Q); // [n_kv, n_q] + sc = ggml_soft_max_ext(ctx, sc, Fmask, scale, 0.0f); + ggml_tensor * Vt = ggml_cont(ctx, ggml_transpose(ctx, V)); // [n, d] + ggml_tensor * full = ggml_mul_mat(ctx, Vt, sc); // [d, n_q] + run(ctx, full); + + // ---- banded ---- + ggml_tensor * qb = ggml_reshape_3d(ctx, Q, d, B, nb); // [d, B, nb] + ggml_tensor * kb = ggml_reshape_3d(ctx, K, d, B, nb); + ggml_tensor * vb = ggml_reshape_3d(ctx, V, d, B, nb); + // pad a zero block each side along the block axis, then 3 shifted views + auto ctx3 = [&](ggml_tensor * x) { + ggml_tensor * z = ggml_scale(ctx, ggml_view_3d(ctx, x, d, B, 1, x->nb[1], x->nb[2], 0), 0.0f); + ggml_tensor * pad = ggml_concat(ctx, ggml_concat(ctx, z, x, 2), z, 2); // [d, B, nb+2] + ggml_tensor * prev = ggml_view_3d(ctx, pad, d, B, nb, pad->nb[1], pad->nb[2], 0 * pad->nb[2]); + ggml_tensor * self = ggml_view_3d(ctx, pad, d, B, nb, pad->nb[1], pad->nb[2], 1 * pad->nb[2]); + ggml_tensor * next = ggml_view_3d(ctx, pad, d, B, nb, pad->nb[1], pad->nb[2], 2 * pad->nb[2]); + return ggml_cont(ctx, ggml_concat(ctx, ggml_concat(ctx, prev, self, 1), next, 1)); // [d, 3B, nb] + }; + ggml_tensor * kc = ctx3(kb); // [d, 3B, nb] + ggml_tensor * vc = ctx3(vb); + ggml_tensor * Bmask = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 3 * B, B, nb); // [3B, B, nb] + for (int bi = 0; bi < nb; bi++) + for (int j = 0; j < B; j++) + for (int pp = 0; pp < 3 * B; pp++) { + const int qpos = bi * B + j; + const int kpos = bi * B - B + pp; + const bool vis = std::abs(qpos - kpos) <= r && kpos >= 0 && kpos < n; + ((float *) Bmask->data)[((size_t) bi * B + j) * (3 * B) + pp] = vis ? 
0.0f : -INFINITY; + } + ggml_tensor * scb = ggml_mul_mat(ctx, kc, qb); // [3B, B, nb] + scb = ggml_soft_max_ext(ctx, scb, Bmask, scale, 0.0f); + ggml_tensor * vct = ggml_cont(ctx, ggml_transpose(ctx, vc)); // [3B, d, nb] + ggml_tensor * outb = ggml_mul_mat(ctx, vct, scb); // [d, B, nb] + outb = ggml_cont_2d(ctx, outb, d, n); // [d, n] + run(ctx, outb); + + // compare + double maxabs = 0, maxrel = 0; + for (int i = 0; i < d * n; i++) { + const double a = ((float *) full->data)[i], b = ((float *) outb->data)[i]; + maxabs = std::max(maxabs, std::fabs(a - b)); + maxrel = std::max(maxrel, std::fabs(a - b) / (std::fabs(a) + 1e-6)); + } + const double full_mask_mib = (double) n * n * 4 / 1048576.0; + const double band_mask_mib = (double) 3 * B * B * nb * 4 / 1048576.0; + std::printf("n=%d B=%d r=%d | max|d|=%.2e maxrel=%.2e | mask: full %.1f MiB, band %.1f MiB (%.1fx)\n", + n, B, r, maxabs, maxrel, full_mask_mib, band_mask_mib, full_mask_mib / band_mask_mib); + return 0; +} diff --git a/bench/gemm_microbench.cpp b/bench/gemm_microbench.cpp new file mode 100644 index 0000000..300cbfe --- /dev/null +++ b/bench/gemm_microbench.cpp @@ -0,0 +1,78 @@ +// pf-gemm-bench — isolates ggml's CPU matmul throughput (GFLOP/s) by dtype and +// shape, to explain where CPU time goes vs a BLAS-backed framework (numpy/torch +// = MKL/oneDNN). result = mul_mat(a[K,M], b[K,N]) -> [M,N]; FLOPs = 2*M*N*K. +// +// Originally written to show ggml's float matmul flat at ~40 GFLOP/s (q8_0 path +// 5-7x faster, MKL's blocked SGEMM 12-29x faster). That turned out to be an +// SSE-only build, not a missing cache-blocked SGEMM: with AVX-512 compiled in +// the f32 rate is ~430-530 GFLOP/s, within ~2x of MKL (run the MKL side +// separately, e.g. torch.matmul). See docs/cpu-perf.md.
+#include +#include +#include + +#include +#include +#include +#include + +static double gflops(ggml_backend_t be, ggml_type ta, int M, int K, int N, int iters) { + ggml_init_params p = { (size_t) 64 * 1024 * 1024, nullptr, true }; + ggml_context * ctx = ggml_init(p); + ggml_tensor * a = ggml_new_tensor_2d(ctx, ta, K, M); + ggml_tensor * b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, K, N); + ggml_tensor * c = ggml_mul_mat(ctx, a, b); + ggml_cgraph * gf = ggml_new_graph(ctx); + ggml_build_forward_expand(gf, c); + ggml_gallocr * ga = ggml_gallocr_new(ggml_backend_get_default_buffer_type(be)); + ggml_gallocr_alloc_graph(ga, gf); + + std::vector rb(K * N); + for (auto & x : rb) x = (float) (rand() % 1000) / 1000.0f - 0.5f; // values irrelevant for timing + ggml_backend_tensor_set(b, rb.data(), 0, rb.size() * sizeof(float)); + std::vector ra(K * M); + for (auto & x : ra) x = (float) (rand() % 1000) / 1000.0f - 0.5f; + if (ta == GGML_TYPE_F32) { + ggml_backend_tensor_set(a, ra.data(), 0, ra.size() * sizeof(float)); + } else { + std::vector buf(ggml_nbytes(a)); + ggml_quantize_chunk(ta, ra.data(), buf.data(), 0, M, K, nullptr); + ggml_backend_tensor_set(a, buf.data(), 0, buf.size()); + } + + ggml_backend_graph_compute(be, gf); // warm + auto t0 = std::chrono::high_resolution_clock::now(); + for (int i = 0; i < iters; i++) ggml_backend_graph_compute(be, gf); + auto t1 = std::chrono::high_resolution_clock::now(); + double s = std::chrono::duration(t1 - t0).count() / iters; + ggml_gallocr_free(ga); + ggml_free(ctx); + return 2.0 * M * N * K / s / 1e9; +} + +int main(int argc, char ** argv) { + const int nth = argc > 1 ? 
std::atoi(argv[1]) : 12; + ggml_backend_load_all(); // pick the best CPU variant in a GGML_BACKEND_DL build + ggml_backend_t be = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr); + auto set_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address( + ggml_backend_dev_backend_reg(ggml_backend_get_device(be)), "ggml_backend_set_n_threads"); + if (set_fn) set_fn(be, nth); + struct { const char * name; int M, K, N; } shp[] = { + { "expert gate_up N=16 ", 1280, 640, 16 }, + { "expert gate_up N=64 ", 1280, 640, 64 }, + { "expert gate_up N=512", 1280, 640, 512 }, + { "expert down N=16 ", 640, 640, 16 }, + { "large 4096^2 N=512", 4096, 4096, 512 }, + }; + std::printf("ggml CPU mul_mat GFLOP/s, %d threads\n", nth); + std::printf("%-22s %10s %10s %10s\n", "shape (M,K,N)", "f32", "f16", "q8_0"); + for (auto & s : shp) { + const int it = (double) s.M * s.N > 1e6 ? 20 : 200; + std::printf("%-22s %10.1f %10.1f %10.1f\n", s.name, + gflops(be, GGML_TYPE_F32, s.M, s.K, s.N, it), + gflops(be, GGML_TYPE_F16, s.M, s.K, s.N, it), + gflops(be, GGML_TYPE_Q8_0, s.M, s.K, s.N, it)); + } + ggml_backend_free(be); + return 0; +} diff --git a/bench/pf-bench.cpp b/bench/pf-bench.cpp index 6118f11..c3dfea2 100644 --- a/bench/pf-bench.cpp +++ b/bench/pf-bench.cpp @@ -1,7 +1,11 @@ // pf-bench — tokens/s and per-stage latency at several document lengths. -// pf-bench [device] [iters] +// pf-bench [device] [iters] [lengths] // Synthesizes PII-shaped text, then per length: tokenize / forward (windowed) // / decode timings, plus RSS and cold-start (load -> first entity). +// [lengths] is an optional comma-separated list of EXACT token counts (the +// synthesized text is tokenized then truncated to each count) — use it to match +// scripts/bench_torch.py for an apples-to-apples PyTorch comparison. Omitted, it +// defaults to ~{128,512,2048,8192,32768}-token documents. 
#include "model.h" #include "ner.h" #include "tokenizer.h" @@ -43,12 +47,29 @@ static std::string make_text(int approx_tokens) { int main(int argc, char ** argv) { if (argc < 2) { - std::fprintf(stderr, "usage: pf-bench [cpu|vulkan] [iters]\n"); + std::fprintf(stderr, "usage: pf-bench [cpu|vulkan] [iters] [len1,len2,...]\n"); return 2; } const char * device = argc > 2 ? argv[2] : "cpu"; const int iters = argc > 3 ? std::atoi(argv[3]) : 3; + // Optional exact token-count list (4th arg): the synthesized text per length + // is truncated to exactly this many tokens, so the lengths match whatever is + // passed to scripts/bench_torch.py --lengths. Empty -> the approximate + // defaults below. + std::vector lengths; + if (argc > 4) { + for (char * s = std::strtok(argv[4], ","); s; s = std::strtok(nullptr, ",")) + if (int v = std::atoi(s)) lengths.push_back(v); + } + const bool exact = !lengths.empty(); + if (!exact) lengths = { 128, 512, 2048, 8192, 32768 }; + + // PF_WINDOW: tokens per forward pass (the pf_set_window knob). Longer inputs + // run as overlapping halo windows; larger W means fewer windows (less halo + // recompute, faster) but a bigger compute buffer (more RAM/VRAM). Default 4096. + const int W = std::getenv("PF_WINDOW") ? std::atoi(std::getenv("PF_WINDOW")) : 4096; + const size_t rss0 = rss_kb("VmRSS:"); const int64_t t_load0 = ggml_time_us(); pf::model m; @@ -72,18 +93,20 @@ int main(int argc, char ** argv) { for (size_t i = 0; i < toks.size(); i++) ids[i] = toks[i].id; std::vector spans; std::string err; - pf::ner::classify_tokens(m, ids.data(), (int) ids.size(), 4096, 0.5f, spans, err); + pf::ner::classify_tokens(m, ids.data(), (int) ids.size(), W, 0.5f, spans, err); } const int64_t t_first = ggml_time_us(); - std::printf("device %s | load %.2fs (+%.0f MiB) | cold start %.2fs | %d iters\n\n", + // weights buffer: device memory (Vulkan VRAM) or the zero-copy CPU wrap. + const double wbuf_mib = m.weights_buf ? 
ggml_backend_buffer_get_size(m.weights_buf) / 1048576.0 : 0; + std::printf("device %s | load %.2fs (+%.0f MiB) | weights %.0f MiB | window %d | %d iters\n\n", m.be.device.c_str(), (t_load1 - t_load0) / 1e6, - (rss1 - rss0) / 1024.0, (t_first - t_load0) / 1e6, iters); - std::printf("| %8s | %9s | %11s | %9s | %8s |\n", - "tokens", "tok ms", "forward ms", "decode ms", "tok/s"); - std::printf("|---------:|----------:|------------:|----------:|---------:|\n"); + (rss1 - rss0) / 1024.0, wbuf_mib, W, iters); + std::printf("| %8s | %11s | %8s | %9s | %9s |\n", + "tokens", "forward ms", "tok/s", "cmp MiB", "RSS MiB"); + std::printf("|---------:|------------:|--------:|---------:|--------:|\n"); - for (const int target : { 128, 512, 2048, 8192, 32768 }) { + for (const int target : lengths) { const std::string text = make_text(target); int64_t tok_us = 0, fwd_us = 0, dec_us = 0; size_t n_tok = 0; @@ -95,11 +118,13 @@ int main(int argc, char ** argv) { int64_t t1 = ggml_time_us(); std::vector ids(toks.size()); for (size_t i = 0; i < toks.size(); i++) ids[i] = toks[i].id; + // exact mode: truncate to the requested count (make_text overshoots) + if (exact && (int) ids.size() > target) ids.resize(target); n_tok = ids.size(); std::vector emit; std::string err; - if (!pf::ner::emit_logprobs(m, ids.data(), (int) ids.size(), 4096, emit, err)) { + if (!pf::ner::emit_logprobs(m, ids.data(), (int) ids.size(), W, emit, err)) { std::fprintf(stderr, "forward: %s\n", err.c_str()); return 1; } @@ -116,10 +141,14 @@ int main(int argc, char ** argv) { (void) spans; } const double fwd_ms = fwd_us / 1e3 / iters; - std::printf("| %8zu | %9.1f | %11.1f | %9.1f | %8.0f |\n", - n_tok, tok_us / 1e3 / iters, fwd_ms, dec_us / 1e3 / iters, - n_tok / (fwd_ms / 1e3)); + // compute buffer: per-forward activation memory (Vulkan VRAM / CPU RAM), + // sized to one window -> grows with min(n_tok, W). RSS is host resident. 
+ const double cmp_mib = ggml_gallocr_get_buffer_size(m.be.galloc, 0) / 1048576.0; + std::printf("| %8zu | %11.1f | %8.0f | %9.0f | %8.0f |\n", + n_tok, fwd_ms, n_tok / (fwd_ms / 1e3), cmp_mib, rss_kb("VmRSS:") / 1024.0); + (void) tok_us; (void) dec_us; } - std::printf("\npeak RSS %.0f MiB\n", rss_kb("VmHWM:") / 1024.0); + std::printf("\npeak RSS %.0f MiB | weights %.0f MiB\n", + rss_kb("VmHWM:") / 1024.0, wbuf_mib); return 0; } diff --git a/docs/cpu-perf.md b/docs/cpu-perf.md new file mode 100644 index 0000000..a4d3e4d --- /dev/null +++ b/docs/cpu-perf.md @@ -0,0 +1,173 @@ +# CPU performance + +## TL;DR — the build was SSE-only + +The CPU slowness traced to a build trap, not the engine. Under Nix the gcc/clang +wrapper strips `-march=native` (`NIX_ENFORCE_NO_NATIVE`), so a `GGML_NATIVE=ON` +build silently compiles ggml-cpu with **no AVX2/AVX-512/FMA** — and the CI build +(`-DGGML_NATIVE=OFF`) has no SIMD either. Confirmed by disassembly: + +``` +$ objdump -d libggml-cpu.so | grep -c zmm # AVX-512 -> 0 +$ objdump -d libggml-cpu.so | grep -c ymm # AVX2 -> 0 +$ objdump -d libggml-cpu.so | grep -c vfmadd # FMA -> 0 (37k xmm/SSE only) +``` + +With SIMD actually enabled, ggml-f16 on CPU is **~10× faster and beats the +PyTorch/transformers reference** — no quantization needed. The fix is to build the +CPU backend for all ISAs and pick at runtime (`GGML_CPU_ALL_VARIANTS`), which also +sidesteps the Nix `-march=native` stripping. + +| 512 tok, f16 | tok/s | +|---|---:| +| SSE-only (the trap: `GGML_NATIVE=OFF`, or `=ON` under Nix) | 280 | +| AVX-512 (explicit `-mavx512*`, or the zen4 runtime variant) | **~3000** | +| PyTorch CPU (fp32, MKL) | 1935 | + +## The fix: GGML_CPU_ALL_VARIANTS (runtime ISA dispatch) + +`-march=native` is fragile (stripped by Nix; wrong if you build on a different +host than you run on).
ggml's portable answer is to compile the CPU backend once +per ISA level and score+load the best at run time: + +```sh +cmake -B build -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON ... +``` + +This produces `libggml-cpu-{sse42,haswell,skylakex,icelake,zen4,…}.so`; on this +Ryzen 9 7900 it loads `libggml-cpu-zen4.so` (AVX-512 + VNNI + BF16): + +``` +load_backend: loaded CPU backend from libggml-cpu-zen4.so +``` + +Engine support (`src/backend.cpp`): call `ggml_backend_load_all()` before +`ggml_backend_init_by_type`, and set threads through the registry +(`ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads")`) since the +CPU-specific symbol now lives in the variant `.so`, not in linked base. Both calls +are no-ops for a static build, so one code path serves both. Use the +`release-portable` preset. + +## ggml vs MKL once SIMD is real + +`pf-gemm-bench` (ggml CPU `mul_mat` GFLOP/s) vs `torch.matmul` (MKL), 12 threads: + +| shape (M,K,N) | ggml SSE | ggml AVX-512 | MKL f32 | +|---|---:|---:|---:| +| expert gate_up N=16 (f32) | 39 | 426 | 463 | +| expert gate_up N=512 (f32) | 41 | 528 | 1098 | +| large 4096² N=512 (f32) | 39 | 442 | 1137 | + +ggml's f32 GEMM goes 40 → ~500 GFLOP/s — within ~2× of MKL, and at the actual +model level (lower per-op overhead than HF's Python expert loop) ggml-f16 **wins**: +512 tok f16 3006 vs PyTorch 1935. So there was never a missing blocked SGEMM — it +just wasn't compiled with SIMD. + +## Profile (AVX-512) and the minor Q8 option + +`PF_PROF=noattn|nomoe` ablation (512 tok, AVX-512): **MoE 64%, attention 34%**, +rest <1%. `PF_NTHREADS` sweep: near-linear to 12 physical cores, SMT regresses — +the default is optimal. + +Quantization is now a *minor* lever, not a necessity: `scripts/requant_q8.py` +(Q8_0 experts) adds ~15% over f16+AVX-512 (512 tok: 3006 → 3567) but is a strict +precision drop — on the 3k-token case it falls below the f16 parity gate (cos +0.9972, 1 argmax flip in 3053), so it would need its own tier. 
Given f16+AVX-512 +already beats the reference, Q8 is optional (e.g. for memory: 1.6 vs 2.8 GiB). + +## Flash attention (both backends) + +With SIMD fixed, attention became the dominant cost at length on *both* backends +(`PF_PROF` ablation — CPU 8192 tok: attention 72%; Vulkan 2k–32k: ~69%), because +the engine built the full `[n,n]` score matrix and masked it to the sliding +window — O(n²) work for an O(n·256) receptive field. + +`ggml_flash_attn_ext` (default; `PF_NOFLASH` selects the explicit path) fuses +QK·softmax·V with no materialized scores, carries the attention sinks +(`ggml_flash_attn_ext_add_sinks`) and the sliding-window mask, and accumulates in +F32. It is numerically exact here — passes the f32 `cos>=0.99999` gate and +window-stitch — and faster where attention dominates: + +| forward tok/s | CPU 2048 | CPU 8192 | Vulkan 8192 | Vulkan 131072 | +|---|---:|---:|---:|---:| +| explicit (`PF_NOFLASH`) | 1881 | 798 | 11845 | 8992 | +| flash (default) | 3319 | 1928 | 26918 | 20631 | +| speedup | 1.8× | 2.4× | 2.3× | 2.3× | + +## Memory and the processing window (W) + +`PF_WINDOW` (the `pf_set_window` knob, default 4096) sets tokens per forward; +longer inputs run as overlapping halo windows. At the default, GGML's footprint +is **flat across document length** — the compute buffer is bounded by the window, +not the input: + +| length | PyTorch VRAM (eager) | GGML Vulkan VRAM (flash, W=4096) | +|---:|---:|---:| +| 4 096 | 5 439 | 2 883 | +| 8 192 | 13 637 | 2 883 | +| 32 768 | OOM | 2 883 | +| 131 072 | OOM | **2 883** | + +PyTorch (single-pass) grows O(n²) and OOMs by ~16k tokens; GGML holds ~2.9 GiB at +131k. So the default W=4096 is a good fit for VRAM-constrained deployments. + +Raising W to cut the halo recompute is tempting but currently a **bad trade**: it +OOMs by W=16384. Flash removed the O(n²) *scores*, but the sliding-window **mask +is still a materialized `[n,n]` tensor** — the last O(n²) term. 
+ +### Banded mask (prototype, `pf-banded-proto`) + +Grouping tokens into blocks of `B ≥ radius` and having each query block attend +only to blocks `{i-1, i, i+1}` makes the mask **O(n·B)** (a `[3B, B, n_blocks]` +band, constant per block) and the attention compute **O(n·band)** — while being +**bit-identical** to full masked attention (same dot products, computed locally): + +``` +$ pf-banded-proto 256 8192 +n=8192 B=256 r=128 | max|d|=0.00e+00 maxrel=0.00e+00 | mask: full 256.0 MiB, band 24.0 MiB (10.7x) +``` + +Mask scaling (B=256): 21× smaller at 16k, 85× at 64k. + +**On by default for sequences >= 2048 tokens** (`src/model.cpp`; `PF_BANDED` +forces it on/off): blocks of B=256, each query block flash-attends to blocks +`{i-1,i,i+1}` with the F16 band mask + sinks; GQA broadcasts over heads; +out-of-range tokens are padded and masked. Parity-exact — passes the f32 +`cos>=0.99999` gate and window-stitch on CPU and Vulkan. Speedups +(flash → banded, default W): + +| tok/s | CPU 8192 | Vulkan 8192 | Vulkan 32768 | +|---|---:|---:|---:| +| flash | 2068 | 42407 | 33893 | +| banded | 2325 | **105058** | **83664** | +| | 1.1× | **2.5×** | **2.5×** | + +Big on Vulkan (the flash kernel computes the full window; banded only the band), +modest on CPU. The measured crossover (banded/flash): 0.9× at 256–512 tok, 1.0× +at 2048, then 1.1× (CPU) / 2.5× (Vulkan) at 4096+. Hence the 2048 default cutoff. + +### Dropping the window (`PF_MOE_CHUNK`) + +With banded attention the only remaining O(n) cap on a large single window was the +MoE expert matmul's activation scratch (`mul_mat_id y_sz > maxStorageBufferRange` +on Vulkan). The MoE is per-token, so `PF_MOE_CHUNK=C` runs it in C-token chunks +(exact, no halo). It defaults to the forward window (4096), so it's inert at the +default window (n <= W) but keeps a *larger* window from OOMing.
Banded + chunking +lets a **131072-token document run in one window** instead of windowing at W=4096: + +| 131072 tok, Vulkan | tok/s | compute buffer | +|---|---:|---:| +| banded, windowed W=4096 | 80 897 | 166 MiB | +| banded + chunk, single window | **103 539** | 2 389 MiB | + +~1.28× faster (no halo recompute) for more memory -- the throughput/VRAM tradeoff +the window now exposes, capped only by total VRAM. Passes the f32 parity gate. + +## Reproduce + +```sh +cmake --preset release-portable && cmake --build --preset release-portable -j +build/release-portable/bin/pf-gemm-bench 12 # GFLOP/s by dtype/shape +build/release-portable/bin/pf-bench model.gguf cpu 5 512 +objdump -d build/release-portable/bin/libggml-cpu-zen4.so | grep -c zmm # > 0 +``` diff --git a/scripts/bench_torch.py b/scripts/bench_torch.py new file mode 100644 index 0000000..41192c4 --- /dev/null +++ b/scripts/bench_torch.py @@ -0,0 +1,137 @@ +#!/usr/bin/env python3 +"""PyTorch reference throughput, comparable to tools/bench (pf-bench). + +Times the HF `openai_privacy_filter` model's forward pass at several document +lengths, one untimed warm-up + N timed iters per length, and prints a markdown +table of forward ms / tok/s -- the same shape pf-bench emits for the ggml +engine, so the two tables line up column-for-column. + + python scripts/bench_torch.py --model --device cpu + python scripts/bench_torch.py --model --device cuda --dtype fp16 + +Only the model forward is timed (the comparable quantity): tokenization and +BIOES decode are excluded on both sides. Inputs are real token ids -- a PII +paragraph tokenized once and tiled/truncated to the exact target length -- so +both engines see identical sequence lengths. Lengths that OOM or error are +reported as such and skipped rather than aborting the run.
+""" +from __future__ import annotations + +import argparse +import time + +# A PII-shaped paragraph, mirroring bench/pf-bench.cpp make_text(), so the +# token stream is representative rather than degenerate (repeated single token). +SEED_TEXT = ( + "Case 0: Anna Kowalski reported an issue. Contact at anna.kowalski0@mail.example.com " + "or +48 123 456 789. Ships to 12 Elm Street, Lyon. " + "Refund to IBAN DE89 3704 0044 0532 0130 00.\n\n" +) + +DTYPES = {"fp32": "float32", "fp16": "float16", "bf16": "bfloat16"} + + +def build_ids(tok, n: int): + import torch + + base = tok(SEED_TEXT, add_special_tokens=False)["input_ids"] + reps = (n + len(base) - 1) // len(base) + ids = (base * reps)[:n] + return torch.tensor([ids], dtype=torch.long) + + +def main() -> int: + ap = argparse.ArgumentParser(description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter) + ap.add_argument("--model", required=True, help="HF checkpoint dir") + ap.add_argument("--device", default="cpu", help="cpu | cuda | cuda:N") + ap.add_argument("--dtype", default="auto", choices=["auto", *DTYPES], + help="auto: fp32 on cpu, fp16 on cuda") + ap.add_argument("--attn", default="sdpa", choices=["sdpa", "eager"]) + ap.add_argument("--lengths", default="189,756,2898,11403,45234", + help="comma-separated token counts (match pf-bench output)") + ap.add_argument("--iters", type=int, default=3) + ap.add_argument("--threads", type=int, default=0, help="CPU threads (0 = torch default)") + args = ap.parse_args() + + import torch + import transformers + from transformers import AutoModelForTokenClassification, AutoTokenizer + + if args.threads > 0: + torch.set_num_threads(args.threads) + dtype_name = args.dtype + if dtype_name == "auto": + dtype_name = "fp16" if args.device.startswith("cuda") else "fp32" + dtype = getattr(torch, DTYPES[dtype_name]) + dev = torch.device(args.device) + cuda = dev.type == "cuda" + lengths = [int(x) for x in args.lengths.split(",") if x] + + tok = 
AutoTokenizer.from_pretrained(args.model) + t_load0 = time.perf_counter() + model = AutoModelForTokenClassification.from_pretrained( + args.model, dtype=dtype, attn_implementation=args.attn).eval().to(dev) + if cuda: + torch.cuda.synchronize() + t_load1 = time.perf_counter() + + def fwd(ids): + with torch.inference_mode(): + model(input_ids=ids) + + name = torch.cuda.get_device_name(dev) if cuda else f"cpu x{torch.get_num_threads()}" + print(f"torch {torch.__version__} | tf {transformers.__version__} | {name} | " + f"{dtype_name} | {args.attn} | load {t_load1 - t_load0:.2f}s | {args.iters} iters\n") + print(f"| {'tokens':>8} | {'forward ms':>11} | {'tok/s':>8} | {'peak MiB':>8} |") + print("|---------:|------------:|---------:|---------:|") + + for n in lengths: + ids = build_ids(tok, n).to(dev) + try: + if cuda: + torch.cuda.reset_peak_memory_stats(dev) + torch.cuda.synchronize() + fwd(ids) # warm-up (lazy kernel/autotune, allocator growth) + if cuda: + torch.cuda.synchronize() + ev0, ev1 = torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True) + ev0.record() + for _ in range(args.iters): + fwd(ids) + ev1.record() + torch.cuda.synchronize() + fwd_ms = ev0.elapsed_time(ev1) / args.iters + peak = torch.cuda.max_memory_allocated(dev) / 1024 / 1024 + else: + t0 = time.perf_counter() + for _ in range(args.iters): + fwd(ids) + fwd_ms = (time.perf_counter() - t0) * 1e3 / args.iters + peak = _rss_mib() + except (torch.cuda.OutOfMemoryError, RuntimeError) as e: + if cuda: + torch.cuda.empty_cache() + msg = "OOM" if "out of memory" in str(e).lower() else f"err: {str(e)[:40]}" + print(f"| {n:>8} | {msg:>11} | {'-':>8} | {'-':>8} |") + continue + print(f"| {n:>8} | {fwd_ms:>11.1f} | {n / (fwd_ms / 1e3):>8.0f} | {peak:>8.0f} |") + del ids + if cuda: + torch.cuda.empty_cache() + return 0 + + +def _rss_mib() -> float: + try: + with open("/proc/self/status") as f: + for line in f: + if line.startswith("VmHWM:"): + return int(line.split()[1]) / 1024 + except 
OSError: + pass + return 0.0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/scripts/requant_q8.py b/scripts/requant_q8.py new file mode 100644 index 0000000..0a89c11 --- /dev/null +++ b/scripts/requant_q8.py @@ -0,0 +1,67 @@ +#!/usr/bin/env python3 +"""Experiment tool: requantize selected weights of an existing GGUF to Q8_0. + +Copies every KV field and tensor verbatim, except tensors whose name matches +--match (default: the MoE expert weights), which are quantized to Q8_0. Used to +test the hypothesis that ggml's int8 mul_mat_id kernel beats the f16 path on CPU. + + python scripts/requant_q8.py --in f16.gguf --out q8.gguf [--match SUBSTR ...] +""" +from __future__ import annotations + +import argparse + +import numpy as np +import gguf +from gguf import GGMLQuantizationType as QT, GGUFValueType as VT + + +def main() -> int: + ap = argparse.ArgumentParser() + ap.add_argument("--in", dest="inp", required=True) + ap.add_argument("--out", required=True) + ap.add_argument("--match", nargs="*", + default=["ffn_gate_exps.weight", "ffn_up_exps.weight", "ffn_down_exps.weight"]) + args = ap.parse_args() + + r = gguf.GGUFReader(args.inp) + arch = r.fields["general.architecture"].contents() + w = gguf.GGUFWriter(args.out, arch) + + # copy metadata (the writer already set general.architecture itself) + scalar_add = { + VT.UINT8: w.add_uint8, VT.INT8: w.add_int8, VT.UINT16: w.add_uint16, + VT.INT16: w.add_int16, VT.UINT32: w.add_uint32, VT.INT32: w.add_int32, + VT.FLOAT32: w.add_float32, VT.UINT64: w.add_uint64, VT.INT64: w.add_int64, + VT.FLOAT64: w.add_float64, VT.BOOL: w.add_bool, VT.STRING: w.add_string, + } + for key, field in r.fields.items(): + if key == "general.architecture": + continue + val = field.contents() + if field.types and field.types[0] == VT.ARRAY: + w.add_array(key, val) + else: + scalar_add[field.types[0]](key, val) + + # copy / quantize tensors + n_q = 0 + for t in r.tensors: + if any(m in t.name for m in args.match): + x = 
t.data.astype(np.float32) # numpy order [.., ne0] + q = gguf.quants.quantize(x, QT.Q8_0) # uint8, last dim -> bytes/row + w.add_tensor(t.name, q, raw_dtype=QT.Q8_0) # writer derives logical shape + n_q += 1 + else: + w.add_tensor(t.name, t.data) # verbatim (preserves F16/F32) + + w.write_header_to_file() + w.write_kv_data_to_file() + w.write_tensors_to_file() + w.close() + print(f"wrote {args.out}: {len(r.tensors)} tensors ({n_q} quantized to Q8_0), {len(r.fields)} fields") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/src/backend.cpp b/src/backend.cpp index eb3f60b..b757b84 100644 --- a/src/backend.cpp +++ b/src/backend.cpp @@ -1,5 +1,6 @@ #include "backend.h" +#include #include #include @@ -26,8 +27,29 @@ void parse_device(const std::string & req, std::string & name, int & index) { } // namespace +// Discover dynamically-loadable backends once. For a GGML_BACKEND_DL + +// GGML_CPU_ALL_VARIANTS build this loads every libggml-cpu-.so and ggml +// scores them, so the host's best ISA (e.g. zen4/AVX-512) is selected at run +// time -- the portable way to ship SIMD without baking -march into one binary +// (and without Nix's wrapper silently dropping -march=native). No-op / harmless +// for a statically-linked build, where backends register at static-init. +static void load_backends_once() { + static const bool done = [] { ggml_backend_load_all(); return true; }(); + (void) done; +} + +// Set threads through the backend registry rather than ggml_backend_cpu_set_n_threads: +// in a DL build that symbol lives in the variant .so, not in the linked base. 
+static void set_cpu_threads(ggml_backend_t be, int n_threads) { + ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(be)); + auto set_fn = (ggml_backend_set_n_threads_t) + ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads"); + if (set_fn) set_fn(be, n_threads); +} + bool engine_backend::init(const std::string & device_req, int n_threads) { release(); + load_backends_once(); std::string name; int want_idx = 0; @@ -40,12 +62,16 @@ bool engine_backend::init(const std::string & device_req, int n_threads) { return false; } device = "cpu"; + if (const char * env = std::getenv("PF_NTHREADS")) { + // explicit override (tuning / benchmarking); 0 falls through to auto + if (int v = std::atoi(env)) n_threads = v; + } if (n_threads <= 0) { // ggml's default is 4 threads; matmul-heavy work wants the // physical cores (SMT siblings only add contention here) n_threads = std::max(1u, std::thread::hardware_concurrency() / 2); } - ggml_backend_cpu_set_n_threads(be, n_threads); + set_cpu_threads(be, n_threads); } else if (name == "gpu" || name == "cuda" || name == "vulkan") { // "gpu" picks the first GPU of whichever backend was compiled in; // "cuda"/"vulkan" pin a specific backend when more than one is built. diff --git a/src/model.cpp b/src/model.cpp index b31db40..80afe56 100644 --- a/src/model.cpp +++ b/src/model.cpp @@ -178,8 +178,18 @@ bool model::forward(const int32_t * ids, int64_t n, std::vector & logits, const hparams & h = file.hp; const int64_t n_embd = h.n_embd, n_head = h.n_head, n_head_kv = h.n_head_kv, n_rot = h.n_rot; - // ~45 nodes/layer * 8 layers + inputs/head; generous fixed bound. - const size_t graph_nodes = 1024; + // PF_MOE_CHUNK: MoE FFN token-chunk size. The MoE is per-token, so chunking is + // exact (no halo) and bounds the mul_mat_id activation scratch (a Vulkan + // single-buffer limit). 
Default = the forward window, so it is inert at the + // default window (n <= W) yet keeps a larger window (single-pass long docs) + // from OOMing. 0 disables. + const int moe_chunk = std::getenv("PF_MOE_CHUNK") ? std::atoi(std::getenv("PF_MOE_CHUNK")) : 4096; + + // ~45 nodes/layer * 8 layers + inputs/head; generous fixed bound. MoE chunking + // multiplies the FFN node count by the number of chunks. + size_t graph_nodes = 1024; + if (moe_chunk > 0 && n > moe_chunk) + graph_nodes += (size_t) ((n + moe_chunk - 1) / moe_chunk) * h.n_layer * 40; ggml_init_params gp = { ggml_tensor_overhead() * graph_nodes + ggml_graph_overhead_custom(graph_nodes, false), nullptr, @@ -205,14 +215,36 @@ bool model::forward(const int32_t * ids, int64_t n, std::vector & logits, return t; }; + // PF_BANDED (experimental): block-local sliding-window attention. The mask is + // an O(n*B) per-block band [3B,B,1,nb] instead of the O(n^2) [n,n], and + // attention compute drops to O(n*band). Tokens group into blocks of B>=radius; + // each query block attends only to blocks {i-1,i,i+1}. Bit-identical to the + // full masked attention (see bench/banded_attn_proto.cpp). + // Default on once the sequence is long enough to win. The B=256 block padding + // makes it a slight loss on short inputs; measured crossover ~2048 tok (CPU + // neutral-to-faster, Vulkan ~1.1x rising to ~2.5x at length). PF_BANDED forces + // it on (non-zero) or off (0). n here is one window's worth (<= the window). + const char * banded_env = std::getenv("PF_BANDED"); + const bool use_banded = banded_env ? (std::atoi(banded_env) != 0) : (n >= 2048); + const int Bsz = 256; // block size (>= swa_radius 128) + const int nbk = use_banded ? 
(int) ((n + Bsz - 1) / Bsz) : 0; + const int n_pad = nbk * Bsz; + // inputs (data written after alloc) ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n); ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n); - ggml_tensor * kq_mask = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n, n); ggml_tensor * ff = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_rot / 2); + ggml_tensor * kq_mask = nullptr; // [n,n] -- full / flash paths + ggml_tensor * band_mask = nullptr; // [3B,B,1,nb] -- banded path + if (use_banded) { + band_mask = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 3 * Bsz, Bsz, 1, nbk); + ggml_set_input(band_mask); + } else { + kq_mask = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n, n); + ggml_set_input(kq_mask); + } ggml_set_input(inp_tokens); ggml_set_input(inp_pos); - ggml_set_input(kq_mask); ggml_set_input(ff); ggml_tensor * cur = ggml_get_rows(ctx, tok_embd, inp_tokens); // [640, n] @@ -231,6 +263,62 @@ bool model::forward(const int32_t * ids, int64_t n, std::vector & logits, 0.0f, attn_factor, h.yarn_beta_fast, h.yarn_beta_slow); }; + // Ablation profiling hooks (PF_PROF): skip a block to attribute wall-time. + // noattn -> skip self-attention; nomoe -> skip the MoE FFN. + // The residual still runs on the (cheap rms) input, so the delta vs the full + // forward is that block's cost. Build-time only; no effect unset. + const char * prof = std::getenv("PF_PROF"); + const bool prof_noattn = prof && std::strstr(prof, "noattn"); + const bool prof_nomoe = prof && std::strstr(prof, "nomoe"); + + // Fused flash attention (default) instead of the explicit [n,n] score matrix: + // no materialized scores, the backend skips out-of-band KV under the + // sliding-window mask, sinks carried via add_sinks, F32 accumulate. Validated + // exact (passes the f32 cos>=0.99999 gate) and ~2-2.4x faster on CPU and + // Vulkan at length. PF_NOFLASH selects the explicit path (reference / debug). 
+ const bool use_flash = !std::getenv("PF_NOFLASH"); + + // Banded block-local attention (PF_BANDED): q [d,n_head,n], k/v [d,n_head_kv,n] + // post-rope -> [n_head*d, n]. Each query block attends to blocks {i-1,i,i+1} + // (3B keys) via a constant-shape per-block band mask; same dot products as the + // full path, computed locally. GQA broadcasts over the head dim; sinks added + // per block; pad tokens are masked (and trimmed) -- the sink keeps their + // softmax finite. + auto banded_attn = [&](ggml_tensor * q, ggml_tensor * k, ggml_tensor * v, ggml_tensor * sinks) { + const float scale = 1.0f / std::sqrt((float) n_rot); + if (n_pad != n) { + q = ggml_pad(ctx, q, 0, 0, n_pad - n, 0); + k = ggml_pad(ctx, k, 0, 0, n_pad - n, 0); + v = ggml_pad(ctx, v, 0, 0, n_pad - n, 0); + } + auto to_blocks = [&](ggml_tensor * x, int64_t hh) { // [d,hh,n_pad]->[d,B,hh,nb] + x = ggml_reshape_4d(ctx, x, n_rot, hh, Bsz, nbk); // [d,hh,B,nb] + return ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3)); // [d,B,hh,nb] + }; + auto neigh = [&](ggml_tensor * xb, int64_t hh) { // [d,B,hh,nb]->[d,3B,hh,nb] + ggml_tensor * z = ggml_scale(ctx, + ggml_view_4d(ctx, xb, n_rot, Bsz, hh, 1, xb->nb[1], xb->nb[2], xb->nb[3], 0), 0.0f); + ggml_tensor * pad = ggml_concat(ctx, ggml_concat(ctx, z, xb, 3), z, 3); // [d,B,hh,nb+2] + ggml_tensor * pr = ggml_view_4d(ctx, pad, n_rot, Bsz, hh, nbk, pad->nb[1], pad->nb[2], pad->nb[3], 0); + ggml_tensor * se = ggml_view_4d(ctx, pad, n_rot, Bsz, hh, nbk, pad->nb[1], pad->nb[2], pad->nb[3], 1 * pad->nb[3]); + ggml_tensor * nx = ggml_view_4d(ctx, pad, n_rot, Bsz, hh, nbk, pad->nb[1], pad->nb[2], pad->nb[3], 2 * pad->nb[3]); + return ggml_cont(ctx, ggml_concat(ctx, ggml_concat(ctx, pr, se, 1), nx, 1)); // [d,3B,hh,nb] + }; + ggml_tensor * qb = to_blocks(q, n_head); // [d,B,14,nb] + ggml_tensor * kc = neigh(to_blocks(k, n_head_kv), n_head_kv); // [d,3B,2,nb] + ggml_tensor * vc = neigh(to_blocks(v, n_head_kv), n_head_kv); // [d,3B,2,nb] + // flash over the 3-block 
neighborhoods: no materialized band scores, so + // memory is O(n*band). mask is the F16 per-block band; sinks per block. + ggml_tensor * m16 = ggml_cast(ctx, band_mask, GGML_TYPE_F16); // [3B,B,1,nb] + ggml_tensor * o = ggml_flash_attn_ext(ctx, qb, kc, vc, m16, scale, 0.0f, 0.0f); // [d,14,B,nb] + ggml_flash_attn_ext_set_prec(o, GGML_PREC_F32); + ggml_flash_attn_ext_add_sinks(o, sinks); + o = ggml_reshape_3d(ctx, o, n_rot, n_head, n_pad); // [d,14,n_pad] + if (n_pad != n) + o = ggml_view_3d(ctx, o, n_rot, n_head, n, o->nb[1], o->nb[2], 0); + return ggml_cont_2d(ctx, o, n_head * n_rot, n); // [896, n] + }; + for (int il = 0; il < h.n_layer; il++) { const layer_weights & l = layers[il]; const std::string L = "l" + std::to_string(il); @@ -239,7 +327,7 @@ bool model::forward(const int32_t * ids, int64_t n, std::vector & logits, cur = tap(rms(cur, l.attn_norm), L + ".attn_norm"); // self-attention - { + if (!prof_noattn) { ggml_tensor * q = ggml_add(ctx, ggml_mul_mat(ctx, l.wq, cur), l.bq); // [896, n] ggml_tensor * k = ggml_add(ctx, ggml_mul_mat(ctx, l.wk, cur), l.bk); // [128, n] ggml_tensor * v = ggml_add(ctx, ggml_mul_mat(ctx, l.wv, cur), l.bv); // [128, n] @@ -251,20 +339,33 @@ bool model::forward(const int32_t * ids, int64_t n, std::vector & logits, q = tap(rope(q), L + ".q_rope"); // [64, 14, n] k = tap(rope(k), L + ".k_rope"); // [64, 2, n] + const float kq_scale = 1.0f / std::sqrt((float) n_rot); + ggml_tensor * attn; // [896, n] + if (use_banded) { + attn = banded_attn(q, k, v, l.sinks); + } else { ggml_tensor * qp = ggml_permute(ctx, q, 0, 2, 1, 3); // [64, n, 14] ggml_tensor * kp = ggml_permute(ctx, k, 0, 2, 1, 3); // [64, n, 2] - ggml_tensor * vp = ggml_cont(ctx, ggml_permute(ctx, v, 1, 2, 0, 3)); // [n, 64, 2] - - ggml_tensor * kq = ggml_mul_mat(ctx, kp, qp); // [n, n, 14] (GQA broadcast) - ggml_mul_mat_set_prec(kq, GGML_PREC_F32); - kq = ggml_soft_max_ext(ctx, kq, kq_mask, 1.0f / std::sqrt((float) n_rot), 0.0f); - ggml_soft_max_add_sinks(kq, l.sinks); - 
- ggml_tensor * kqv = ggml_mul_mat(ctx, vp, kq); // [64, n, 14] - kqv = ggml_permute(ctx, kqv, 0, 2, 1, 3); // [64, 14, n] - kqv = ggml_cont_2d(ctx, kqv, n_head * n_rot, n); // [896, n] + if (use_flash) { + ggml_tensor * vp = ggml_cont(ctx, ggml_permute(ctx, v, 0, 2, 1, 3)); // [64, n, 2] + ggml_tensor * m16 = ggml_cast(ctx, kq_mask, GGML_TYPE_F16); + ggml_tensor * fa = ggml_flash_attn_ext(ctx, qp, kp, vp, m16, kq_scale, 0.0f, 0.0f); + ggml_flash_attn_ext_set_prec(fa, GGML_PREC_F32); + ggml_flash_attn_ext_add_sinks(fa, l.sinks); + attn = ggml_reshape_2d(ctx, fa, n_head * n_rot, n); // [896, n] + } else { + ggml_tensor * vp = ggml_cont(ctx, ggml_permute(ctx, v, 1, 2, 0, 3)); // [n, 64, 2] + ggml_tensor * kq = ggml_mul_mat(ctx, kp, qp); // [n, n, 14] (GQA) + ggml_mul_mat_set_prec(kq, GGML_PREC_F32); + kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_scale, 0.0f); + ggml_soft_max_add_sinks(kq, l.sinks); + ggml_tensor * kqv = ggml_mul_mat(ctx, vp, kq); // [64, n, 14] + kqv = ggml_permute(ctx, kqv, 0, 2, 1, 3); // [64, 14, n] + attn = ggml_cont_2d(ctx, kqv, n_head * n_rot, n); // [896, n] + } + } - cur = ggml_add(ctx, ggml_mul_mat(ctx, l.wo, kqv), l.bo); // [640, n] + cur = ggml_add(ctx, ggml_mul_mat(ctx, l.wo, attn), l.bo); // [640, n] tap(cur, L + ".attn_out"); } @@ -276,37 +377,45 @@ bool model::forward(const int32_t * ids, int64_t n, std::vector & logits, // MoE FFN (softmax-after-top-k gating; the HF reference's /top_k and // *top_k cancel, so plain softmax weights are the trained semantics) - { + if (!prof_nomoe) { const int64_t n_exp = h.n_expert, n_used = h.n_expert_used; - ggml_tensor * rl = ggml_add(ctx, ggml_mul_mat(ctx, l.router_w, cur), l.router_b); // [128, n] - tap(rl, L + ".moe_logits"); - - ggml_tensor * sel = tap(ggml_argsort_top_k(ctx, rl, (int) n_used), L + ".moe_topk"); // i32 [4, n] - - ggml_tensor * w = ggml_get_rows(ctx, ggml_reshape_3d(ctx, rl, 1, n_exp, n), sel); // [1, 4, n] - w = ggml_soft_max(ctx, ggml_reshape_2d(ctx, w, n_used, n)); - tap(w, L + 
".moe_weights"); - w = ggml_reshape_3d(ctx, w, 1, n_used, n); - - ggml_tensor * x3 = ggml_reshape_3d(ctx, cur, n_embd, 1, n); - ggml_tensor * up = ggml_mul_mat_id(ctx, l.up_exps, x3, sel); // [640, 4, n] - up = ggml_add_id(ctx, up, l.up_exps_b, sel); - ggml_tensor * gate = ggml_mul_mat_id(ctx, l.gate_exps, x3, sel); - gate = ggml_add_id(ctx, gate, l.gate_exps_b, sel); - - ggml_tensor * hms = ggml_swiglu_oai(ctx, gate, up, 1.702f, 7.0f); // [640, 4, n] - - ggml_tensor * out = ggml_mul_mat_id(ctx, l.down_exps, hms, sel); // [640, 4, n] - out = ggml_add_id(ctx, out, l.down_exps_b, sel); - out = ggml_mul(ctx, out, w); - - ggml_tensor * moe = nullptr; - for (int64_t e = 0; e < n_used; e++) { - ggml_tensor * slice = ggml_view_2d(ctx, out, n_embd, n, out->nb[2], e * out->nb[1]); - moe = moe ? ggml_add(ctx, moe, slice) : slice; + // one expert FFN over m tokens (x: [n_embd, m]) -> [n_embd, m] + auto moe_ffn = [&](ggml_tensor * x, int64_t m, bool do_taps) { + ggml_tensor * rl = ggml_add(ctx, ggml_mul_mat(ctx, l.router_w, x), l.router_b); // [128, m] + if (do_taps) tap(rl, L + ".moe_logits"); + ggml_tensor * sel = ggml_argsort_top_k(ctx, rl, (int) n_used); // i32 [4, m] + if (do_taps) tap(sel, L + ".moe_topk"); + ggml_tensor * w = ggml_get_rows(ctx, ggml_reshape_3d(ctx, rl, 1, n_exp, m), sel); // [1, 4, m] + w = ggml_soft_max(ctx, ggml_reshape_2d(ctx, w, n_used, m)); + if (do_taps) tap(w, L + ".moe_weights"); + w = ggml_reshape_3d(ctx, w, 1, n_used, m); + ggml_tensor * x3 = ggml_reshape_3d(ctx, x, n_embd, 1, m); + ggml_tensor * up = ggml_add_id(ctx, ggml_mul_mat_id(ctx, l.up_exps, x3, sel), l.up_exps_b, sel); + ggml_tensor * gate = ggml_add_id(ctx, ggml_mul_mat_id(ctx, l.gate_exps, x3, sel), l.gate_exps_b, sel); + ggml_tensor * hms = ggml_swiglu_oai(ctx, gate, up, 1.702f, 7.0f); // [640, 4, m] + ggml_tensor * out = ggml_add_id(ctx, ggml_mul_mat_id(ctx, l.down_exps, hms, sel), l.down_exps_b, sel); + out = ggml_mul(ctx, out, w); + ggml_tensor * moe = nullptr; + for (int64_t e 
= 0; e < n_used; e++) { + ggml_tensor * sl = ggml_view_2d(ctx, out, n_embd, m, out->nb[2], e * out->nb[1]); + moe = moe ? ggml_add(ctx, moe, sl) : sl; + } + return ggml_cont(ctx, moe); // [n_embd, m] + }; + + if (moe_chunk > 0 && n > moe_chunk) { + ggml_tensor * acc = nullptr; + for (int64_t c = 0; c < n; c += moe_chunk) { + const int64_t m = std::min(moe_chunk, n - c); + ggml_tensor * xs = ggml_cont(ctx, ggml_view_2d(ctx, cur, n_embd, m, cur->nb[1], c * cur->nb[1])); + ggml_tensor * ys = moe_ffn(xs, m, false); + acc = acc ? ggml_concat(ctx, acc, ys, 1) : ys; + } + cur = tap(acc, L + ".moe_out"); + } else { + cur = tap(moe_ffn(cur, n, taps != nullptr), L + ".moe_out"); } - cur = tap(ggml_cont(ctx, moe), L + ".moe_out"); } cur = ggml_add(ctx, cur, resid); @@ -327,17 +436,35 @@ bool model::forward(const int32_t * ids, int64_t n, std::vector & logits, return false; } - // inputs AFTER alloc + // inputs AFTER alloc. Guard each set on ->buffer: a PF_PROF ablation can + // prune a block, orphaning its inputs (gallocr leaves them unallocated); + // unset, every input is live so this is a no-op. ggml_backend_tensor_set(inp_tokens, ids, 0, n * sizeof(int32_t)); { std::vector pos(n); for (int64_t i = 0; i < n; i++) pos[i] = (int32_t) i; - ggml_backend_tensor_set(inp_pos, pos.data(), 0, n * sizeof(int32_t)); - - std::vector mask(n * n); - fill_swa_mask(mask.data(), n, h.swa_radius); - ggml_backend_tensor_set(kq_mask, mask.data(), 0, mask.size() * sizeof(float)); - ggml_backend_tensor_set(ff, freq_factors.data(), 0, freq_factors.size() * sizeof(float)); + if (inp_pos->buffer) ggml_backend_tensor_set(inp_pos, pos.data(), 0, n * sizeof(int32_t)); + + if (use_banded) { + // per-block band mask [3B,B,1,nb]: query (bi*B+j) sees context key + // (bi*B-B+p) iff within radius and a real (unpadded) token. 
+ const int64_t r = h.swa_radius; + std::vector bm((size_t) 3 * Bsz * Bsz * nbk); + for (int bi = 0; bi < nbk; bi++) + for (int j = 0; j < Bsz; j++) + for (int p = 0; p < 3 * Bsz; p++) { + const int64_t qpos = (int64_t) bi * Bsz + j; + const int64_t kpos = (int64_t) bi * Bsz - Bsz + p; + const bool vis = std::llabs(qpos - kpos) <= r && kpos >= 0 && kpos < n; + bm[((size_t) bi * Bsz + j) * (3 * Bsz) + p] = vis ? 0.0f : -INFINITY; + } + if (band_mask->buffer) ggml_backend_tensor_set(band_mask, bm.data(), 0, bm.size() * sizeof(float)); + } else { + std::vector mask((size_t) n * n); + fill_swa_mask(mask.data(), n, h.swa_radius); + if (kq_mask->buffer) ggml_backend_tensor_set(kq_mask, mask.data(), 0, mask.size() * sizeof(float)); + } + if (ff->buffer) ggml_backend_tensor_set(ff, freq_factors.data(), 0, freq_factors.size() * sizeof(float)); } if (ggml_backend_graph_compute(be.be, gf) != GGML_STATUS_SUCCESS) { diff --git a/tests/test_graph_blocks.cpp b/tests/test_graph_blocks.cpp index 90a72a2..f2636ad 100644 --- a/tests/test_graph_blocks.cpp +++ b/tests/test_graph_blocks.cpp @@ -4,7 +4,7 @@ // (interleaved pairing, freq_factors division, attn_factor application). #include "model.h" -#include +#include #include #include @@ -80,13 +80,24 @@ static void test_swa_mask() { } } -// tiny single-op CPU eval helper +// tiny single-op CPU eval helper. Uses the backend API rather than ggml-cpu's +// ggml_graph_compute_with_ctx so it links in a GGML_BACKEND_DL build (where the +// CPU compute symbols live in the variant .so, loaded at runtime). Tensors are +// in a no_alloc=false ctx, so their ->data is CPU-resident and computed in place. 
+static ggml_backend_t cpu_backend() { + static ggml_backend_t be = [] { + ggml_backend_load_all(); + return ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr); + }(); + return be; +} + template static ggml_tensor * eval(ggml_context * ctx, BUILD build) { ggml_tensor * out = build(); ggml_cgraph * gf = ggml_new_graph(ctx); ggml_build_forward_expand(gf, out); - ggml_graph_compute_with_ctx(ctx, gf, 2); + ggml_backend_graph_compute(cpu_backend(), gf); return out; }