Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 19 additions & 4 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,11 @@ jobs:
strategy:
matrix:
include:
- { name: gcc-release, preset: release, cc: gcc, cxx: g++ }
# gcc-release builds the shipped CPU config: every ggml-cpu ISA variant
# (GGML_CPU_ALL_VARIANTS), runtime-dispatched -- so released binaries get
# AVX-512 without -march=native (which Nix strips and which mis-targets
# cross-host builds).
- { name: gcc-release, preset: release-portable, cc: gcc, cxx: g++ }
- { name: clang-debug-san, preset: debug, cc: clang, cxx: clang++ }
name: ${{ matrix.name }}
runs-on: ubuntu-latest
Expand All @@ -32,13 +36,24 @@ jobs:
run: cmake --preset ${{ matrix.preset }} -DGGML_NATIVE=OFF
env: { CC: '${{ matrix.cc }}', CXX: '${{ matrix.cxx }}' }
- name: build
run: cmake --build --preset ${{ matrix.preset }} -j
# -j4 (not unbounded -j): release-portable compiles 14 ggml-cpu ISA
# variants; uncapped parallelism OOMs the 16 GB runner.
run: cmake --build --preset ${{ matrix.preset }} -j4
- name: test (model-independent)
run: ctest --preset ${{ matrix.preset }} -LE model
- name: unicode table regen check
run: |
python3 scripts/gen_unicode.py > /tmp/unicode_data.inc
diff -u src/unicode_data.inc /tmp/unicode_data.inc
- name: assert SIMD compiled in (guard the SSE-only trap)
if: matrix.name == 'gcc-release'
# The AVX-512 ISA variant must actually contain AVX-512 -- catches a
# silent regression to a SIMD-less build (e.g. a stripped -march).
run: |
so=build/release-portable/bin/libggml-cpu-skylakex.so
n=$(objdump -d "$so" | grep -c '%zmm')
echo "skylakex %zmm instructions: $n"
test "$n" -gt 0

# Tier 2 (nightly / dispatch): HF reference fixtures + parity + fuzz smoke.
# Needs the model checkpoint; cached between runs.
Expand Down Expand Up @@ -92,9 +107,9 @@ jobs:
--model ~/models/privacy-filter-multilingual \
--outfile ~/ggufs/pf-f32.gguf --outtype f32
- name: build
run: cmake --preset release -DGGML_NATIVE=OFF && cmake --build --preset release -j
run: cmake --preset release-portable && cmake --build --preset release-portable -j4
- name: parity suite
run: PF_GGUF_DIR=~/ggufs ctest --preset release -L model
run: PF_GGUF_DIR=~/ggufs ctest --preset release-portable -L model
- name: fuzz smoke (5 min/target)
run: |
cmake --preset fuzz && cmake --build --preset fuzz -j --target fuzz_tokenizer fuzz_gguf
Expand Down
11 changes: 11 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,17 @@ if (PF_BUILD_TOOLS)
add_executable(pf-bench bench/pf-bench.cpp)
target_link_libraries(pf-bench PRIVATE pf)
target_include_directories(pf-bench PRIVATE src)

# GFLOP/s by dtype/shape -- diagnostic for the CPU matmul analysis
# (docs/cpu-perf.md). Links ggml directly; no pf/model deps.
add_executable(pf-gemm-bench bench/gemm_microbench.cpp)
target_link_libraries(pf-gemm-bench PRIVATE ggml)

# Prototype: O(n*band) block-local attention == full masked attention
# (bit-identical), with an O(n*B) mask instead of O(n^2). Proof-of-concept
# for de-windowing; see docs/cpu-perf.md. Args: [block] [tokens].
add_executable(pf-banded-proto bench/banded_attn_proto.cpp)
target_link_libraries(pf-banded-proto PRIVATE ggml)
endif()

if (PF_FUZZ)
Expand Down
16 changes: 15 additions & 1 deletion CMakePresets.json
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,18 @@
"binaryDir": "${sourceDir}/build/release",
"cacheVariables": { "CMAKE_BUILD_TYPE": "Release" }
},
{
"name": "release-portable",
"binaryDir": "${sourceDir}/build/release-portable",
"description": "Portable + fast CPU: build every ggml-cpu ISA variant and pick the best at runtime. Avoids -march=native (fragile, and stripped by Nix's NIX_ENFORCE_NO_NATIVE), so binaries run anywhere yet still use AVX-512/VNNI where present.",
"cacheVariables": {
"CMAKE_BUILD_TYPE": "Release",
"GGML_NATIVE": "OFF",
"GGML_BACKEND_DL": "ON",
"GGML_CPU_ALL_VARIANTS": "ON",
"CMAKE_RUNTIME_OUTPUT_DIRECTORY": "${sourceDir}/build/release-portable/bin"
}
},
{
"name": "profile",
"binaryDir": "${sourceDir}/build/profile",
Expand All @@ -35,11 +47,13 @@
"buildPresets": [
{ "name": "debug", "configurePreset": "debug" },
{ "name": "release", "configurePreset": "release" },
{ "name": "release-portable", "configurePreset": "release-portable" },
{ "name": "profile", "configurePreset": "profile" },
{ "name": "fuzz", "configurePreset": "fuzz" }
],
"testPresets": [
{ "name": "debug", "configurePreset": "debug", "output": { "outputOnFailure": true } },
{ "name": "release", "configurePreset": "release", "output": { "outputOnFailure": true } }
{ "name": "release", "configurePreset": "release", "output": { "outputOnFailure": true } },
{ "name": "release-portable", "configurePreset": "release-portable", "output": { "outputOnFailure": true } }
]
}
37 changes: 26 additions & 11 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -130,18 +130,33 @@ PF_GGUF=model.gguf ./build/fuzz/fuzz_tokenizer corpus_tok/
## Bench

```sh
build/release/pf-bench model.gguf [cpu|vulkan] [iters]
cmake --preset release-portable && cmake --build --preset release-portable -j
build/release-portable/bin/pf-bench model.gguf [cpu|vulkan] [iters] [lengths]
```

Ryzen 9 7900 (12 threads) / RTX 5070 Ti, f16 GGUF, forward tok/s by length
(one untimed warm-up per length; GPU pipelines compile lazily):
Forward tok/s vs stock HF Transformers (transformers 5.9, eager), Ryzen 9 7900 (12
threads) + RTX 5070 Ti, f16/fp16, matched token counts
([scripts/bench_torch.py](scripts/bench_torch.py)):

| tokens | cpu | vulkan |
|-------:|----:|-------:|
| 189 | 161 | 51 583 |
| 756 | 178 | 99 756 |
| 2 898 | 129 | 45 416 |
| 11 403 | 68 | 20 085 |
| 45 234 | 60 | 17 390 |
GPU — ours (Vulkan) vs HF (CUDA):

Weights stay in one zero-copy buffer: ~2.8 GiB RSS over baseline (f16).
| tokens | HF | ours | × |
|-------:|--------:|--------:|-----:|
| 512 | 5 526 | 100 503 | 18× |
| 2 048 | 16 427 | 145 481 | 8.9× |
| 8 192 | 14 154 | 105 034 | 7.4× |
| 32 768 | OOM | 83 519 | — |
| 131 072 | OOM | 81 105 | — |

CPU — ours vs HF (fp32):

| tokens | HF | ours | × |
|-------:|------:|------:|-----:|
| 512 | 2 171 | 3 564 | 1.6× |
| 2 048 | 978 | 3 490 | 3.6× |
| 8 192 | 304 | 2 332 | 7.7× |

Memory is flat ~2.8 GiB VRAM / ~3 GiB RAM to 131k tokens; HF OOMs past ~16k on a 16
GiB GPU. `release-portable` runtime-dispatches the best ggml-cpu ISA (AVX-512
without `-march=native`); flash + banded attention default on. See
[docs/cpu-perf.md](docs/cpu-perf.md).
101 changes: 101 additions & 0 deletions bench/banded_attn_proto.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
// Prototype: O(n*band) block-local sliding-window attention vs the full O(n^2)
// masked attention, on random data, to validate correctness before wiring it
// into the model. Single head for clarity.
//
// Full: scores[n_kv,n_q] = mul_mat(K,Q); mask |q-k|<=r; softmax; out=V^T@scores
// Banded: tokens grouped into blocks of B (>= r). Each query block attends only
// to blocks {i-1,i,i+1} (3B keys, since r<=B). A per-block mask
// [3B,B,n_blocks] (O(n*B) memory) carries the band + edge validity.
#include <ggml.h>
#include <ggml-cpu.h>
#include <ggml-backend.h>

#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <initializer_list>

// Returns the process-wide CPU backend, creating it on the first call.
//
// ggml_backend_load_all() runs first so that dynamically loaded backend
// modules are registered before the CPU device is requested. If no CPU backend
// can be created, the tool exits with an error instead of handing a null
// backend to graph compute.
static ggml_backend_t cpu() {
    static ggml_backend_t be = [] {
        ggml_backend_load_all();
        ggml_backend_t created = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
        if (created == nullptr) {
            std::fprintf(stderr, "pf-banded-proto: no ggml CPU backend available\n");
            std::exit(EXIT_FAILURE);
        }
        return created;
    }();
    return be;
}
// Builds a forward graph ending at `out` inside `ctx` and computes it on the
// shared CPU backend.
//
// A failed compute terminates the tool: the caller compares the tensor data
// afterwards, and comparing the output of a failed graph would be meaningless.
static void run(ggml_context * ctx, ggml_tensor * out) {
    ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, out);
    const ggml_status status = ggml_backend_graph_compute(cpu(), gf);
    if (status != GGML_STATUS_SUCCESS) {
        std::fprintf(stderr, "pf-banded-proto: graph compute failed (status %d)\n", (int) status);
        std::exit(EXIT_FAILURE);
    }
}

int main(int argc, char ** argv) {
const int d = 64;
const int r = 128; // sliding-window radius
const int B = argc > 1 ? std::atoi(argv[1]) : 256; // block size (>= r)
const int n = argc > 2 ? std::atoi(argv[2]) : 4096; // tokens (multiple of B)
const int nb = n / B;
const float scale = 1.0f / std::sqrt((float) d);

ggml_init_params p = { (size_t) 2048 * 1024 * 1024, nullptr, false };
ggml_context * ctx = ggml_init(p);

// shared random q,k,v [d, n]
ggml_tensor * Q = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, d, n);
ggml_tensor * K = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, d, n);
ggml_tensor * V = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, d, n);
for (ggml_tensor * t : { Q, K, V })
for (int i = 0; i < d * n; i++) ((float *) t->data)[i] = (float) (rand() % 2000) / 1000.0f - 1.0f;

// ---- full reference ----
ggml_tensor * Fmask = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n, n); // [n_kv, n_q]
for (int q = 0; q < n; q++)
for (int k = 0; k < n; k++)
((float *) Fmask->data)[(size_t) q * n + k] = (std::abs(q - k) <= r) ? 0.0f : -INFINITY;
ggml_tensor * sc = ggml_mul_mat(ctx, K, Q); // [n_kv, n_q]
sc = ggml_soft_max_ext(ctx, sc, Fmask, scale, 0.0f);
ggml_tensor * Vt = ggml_cont(ctx, ggml_transpose(ctx, V)); // [n, d]
ggml_tensor * full = ggml_mul_mat(ctx, Vt, sc); // [d, n_q]
run(ctx, full);

// ---- banded ----
ggml_tensor * qb = ggml_reshape_3d(ctx, Q, d, B, nb); // [d, B, nb]
ggml_tensor * kb = ggml_reshape_3d(ctx, K, d, B, nb);
ggml_tensor * vb = ggml_reshape_3d(ctx, V, d, B, nb);
// pad a zero block each side along the block axis, then 3 shifted views
auto ctx3 = [&](ggml_tensor * x) {
ggml_tensor * z = ggml_scale(ctx, ggml_view_3d(ctx, x, d, B, 1, x->nb[1], x->nb[2], 0), 0.0f);
ggml_tensor * pad = ggml_concat(ctx, ggml_concat(ctx, z, x, 2), z, 2); // [d, B, nb+2]
ggml_tensor * prev = ggml_view_3d(ctx, pad, d, B, nb, pad->nb[1], pad->nb[2], 0 * pad->nb[2]);
ggml_tensor * self = ggml_view_3d(ctx, pad, d, B, nb, pad->nb[1], pad->nb[2], 1 * pad->nb[2]);
ggml_tensor * next = ggml_view_3d(ctx, pad, d, B, nb, pad->nb[1], pad->nb[2], 2 * pad->nb[2]);
return ggml_cont(ctx, ggml_concat(ctx, ggml_concat(ctx, prev, self, 1), next, 1)); // [d, 3B, nb]
};
ggml_tensor * kc = ctx3(kb); // [d, 3B, nb]
ggml_tensor * vc = ctx3(vb);
ggml_tensor * Bmask = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 3 * B, B, nb); // [3B, B, nb]
for (int bi = 0; bi < nb; bi++)
for (int j = 0; j < B; j++)
for (int pp = 0; pp < 3 * B; pp++) {
const int qpos = bi * B + j;
const int kpos = bi * B - B + pp;
const bool vis = std::abs(qpos - kpos) <= r && kpos >= 0 && kpos < n;
((float *) Bmask->data)[((size_t) bi * B + j) * (3 * B) + pp] = vis ? 0.0f : -INFINITY;
}
ggml_tensor * scb = ggml_mul_mat(ctx, kc, qb); // [3B, B, nb]
scb = ggml_soft_max_ext(ctx, scb, Bmask, scale, 0.0f);
ggml_tensor * vct = ggml_cont(ctx, ggml_transpose(ctx, vc)); // [3B, d, nb]
ggml_tensor * outb = ggml_mul_mat(ctx, vct, scb); // [d, B, nb]
outb = ggml_cont_2d(ctx, outb, d, n); // [d, n]
run(ctx, outb);

// compare
double maxabs = 0, maxrel = 0;
for (int i = 0; i < d * n; i++) {
const double a = ((float *) full->data)[i], b = ((float *) outb->data)[i];
maxabs = std::max(maxabs, std::fabs(a - b));
maxrel = std::max(maxrel, std::fabs(a - b) / (std::fabs(a) + 1e-6));
}
const double full_mask_mib = (double) n * n * 4 / 1048576.0;
const double band_mask_mib = (double) 3 * B * B * nb * 4 / 1048576.0;
std::printf("n=%d B=%d r=%d | max|d|=%.2e maxrel=%.2e | mask: full %.1f MiB, band %.1f MiB (%.1fx)\n",
n, B, r, maxabs, maxrel, full_mask_mib, band_mask_mib, full_mask_mib / band_mask_mib);
return 0;
}
78 changes: 78 additions & 0 deletions bench/gemm_microbench.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
// pf-gemm-bench — isolates ggml's CPU matmul throughput (GFLOP/s) by dtype and
// shape, to explain where CPU time goes vs a BLAS-backed framework (numpy/torch
// = MKL/oneDNN). result = mul_mat(a[K,M], b[K,N]) -> [M,N]; FLOPs = 2*M*N*K.
//
// The finding this benchmark was written to demonstrate: ggml's float matmul is ~40 GFLOP/s flat
// at every size (no cache-blocked SGEMM -- the kernels are per-row vec_dot, tuned
// for quantized weights where bandwidth dominates), while its q8_0 path is
// 5-7x faster. MKL's blocked SGEMM is 12-29x the ggml-f32 rate (run mkl side
// separately, e.g. torch.matmul). See docs/cpu-perf.md.
#include <ggml.h>
#include <ggml-cpu.h>
#include <ggml-backend.h>

#include <chrono>
#include <cstdio>
#include <cstdlib>
#include <vector>

// Measures the throughput of one ggml mul_mat on backend `be`.
//
// The graph is c = mul_mat(a, b), with `a` a 2-D tensor of type `ta` and
// ne = (K, M), and `b` an F32 tensor with ne = (K, N). Both inputs are filled
// with pseudo-random values; `a` is converted with ggml_quantize_chunk when
// `ta` is not F32. One untimed warm-up run precedes `iters` timed runs.
//
// Returns 2*M*N*K / (mean seconds per run) / 1e9, i.e. GFLOP/s. Returns 0.0
// (after printing a message to stderr) when `iters` is not positive or when
// the ggml context / graph allocation fails.
static double gflops(ggml_backend_t be, ggml_type ta, int M, int K, int N, int iters) {
    if (iters <= 0) {
        std::fprintf(stderr, "gflops: iters must be positive (got %d)\n", iters);
        return 0.0;
    }

    // no_alloc = true: the context only holds tensor metadata; the data is
    // placed in a backend buffer by the graph allocator below.
    ggml_init_params p = { (size_t) 64 * 1024 * 1024, nullptr, true };
    ggml_context * ctx = ggml_init(p);
    if (ctx == nullptr) {
        std::fprintf(stderr, "gflops: ggml_init failed\n");
        return 0.0;
    }

    ggml_tensor * a = ggml_new_tensor_2d(ctx, ta, K, M);
    ggml_tensor * b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, K, N);
    ggml_tensor * c = ggml_mul_mat(ctx, a, b);
    ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, c);

    ggml_gallocr * ga = ggml_gallocr_new(ggml_backend_get_default_buffer_type(be));
    if (!ggml_gallocr_alloc_graph(ga, gf)) {
        std::fprintf(stderr, "gflops: graph allocation failed for M=%d K=%d N=%d\n", M, K, N);
        ggml_gallocr_free(ga);
        ggml_free(ctx);
        return 0.0;
    }

    // Element counts are computed in size_t so large shapes cannot overflow int.
    std::vector<float> rb((size_t) K * N);
    for (auto & x : rb) x = (float) (rand() % 1000) / 1000.0f - 0.5f;  // values irrelevant for timing
    ggml_backend_tensor_set(b, rb.data(), 0, rb.size() * sizeof(float));

    std::vector<float> ra((size_t) K * M);
    for (auto & x : ra) x = (float) (rand() % 1000) / 1000.0f - 0.5f;
    if (ta == GGML_TYPE_F32) {
        ggml_backend_tensor_set(a, ra.data(), 0, ra.size() * sizeof(float));
    } else {
        // Convert the M rows of K floats into the target type before upload.
        std::vector<char> buf(ggml_nbytes(a));
        ggml_quantize_chunk(ta, ra.data(), buf.data(), 0, M, K, nullptr);
        ggml_backend_tensor_set(a, buf.data(), 0, buf.size());
    }

    ggml_backend_graph_compute(be, gf);  // warm

    // steady_clock is monotonic, so the interval cannot be skewed by
    // wall-clock adjustments during the run.
    const auto t0 = std::chrono::steady_clock::now();
    for (int i = 0; i < iters; i++) ggml_backend_graph_compute(be, gf);
    const auto t1 = std::chrono::steady_clock::now();
    const double s = std::chrono::duration<double>(t1 - t0).count() / iters;

    ggml_gallocr_free(ga);
    ggml_free(ctx);
    return 2.0 * M * N * K / s / 1e9;
}

// Prints a table of ggml CPU mul_mat throughput (GFLOP/s) for f32, f16 and
// q8_0 weights over a fixed list of shapes.
//
// Args: [threads] - positive thread count, default 12.
// Exit status: 0 on success, 1 if no CPU backend is available, 2 on an
// invalid thread count.
int main(int argc, char ** argv) {
    const int nth = argc > 1 ? std::atoi(argv[1]) : 12;
    if (nth <= 0) {
        std::fprintf(stderr, "usage: pf-gemm-bench [threads > 0]\n");
        return 2;
    }

    ggml_backend_load_all();  // pick the best CPU variant in a GGML_BACKEND_DL build
    ggml_backend_t be = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
    if (be == nullptr) {
        std::fprintf(stderr, "pf-gemm-bench: no ggml CPU backend available\n");
        return 1;
    }

    // The thread-count setter is looked up by name through the backend
    // registry rather than called directly.
    auto set_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(
        ggml_backend_dev_backend_reg(ggml_backend_get_device(be)), "ggml_backend_set_n_threads");
    if (set_fn) {
        set_fn(be, nth);
    } else {
        std::fprintf(stderr, "pf-gemm-bench: backend exposes no thread-count setter; "
                             "the requested %d threads may not be in effect\n", nth);
    }

    struct { const char * name; int M, K, N; } shp[] = {
        { "expert gate_up N=16 ", 1280,  640,  16 },
        { "expert gate_up N=64 ", 1280,  640,  64 },
        { "expert gate_up N=512", 1280,  640, 512 },
        { "expert down    N=16 ",  640,  640,  16 },
        { "large 4096^2   N=512", 4096, 4096, 512 },
    };
    std::printf("ggml CPU mul_mat GFLOP/s, %d threads\n", nth);
    std::printf("%-22s %10s %10s %10s\n", "shape (M,K,N)", "f32", "f16", "q8_0");
    for (auto & s : shp) {
        // Fewer timed iterations for the large outputs to keep the run short.
        const int it = (double) s.M * s.N > 1e6 ? 20 : 200;
        // Run the three measurements as separate statements so their order is
        // fixed, rather than left to unspecified argument-evaluation order.
        const double g_f32  = gflops(be, GGML_TYPE_F32,  s.M, s.K, s.N, it);
        const double g_f16  = gflops(be, GGML_TYPE_F16,  s.M, s.K, s.N, it);
        const double g_q8_0 = gflops(be, GGML_TYPE_Q8_0, s.M, s.K, s.N, it);
        std::printf("%-22s %10.1f %10.1f %10.1f\n", s.name, g_f32, g_f16, g_q8_0);
    }
    ggml_backend_free(be);
    return 0;
}
Loading
Loading