diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 0000000..37d5871
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,14 @@
+# Keep the docker build context small: ship source + the vendored ggml submodule, not host
+# build trees, model blobs, or demo media. Patterns are root-anchored, hence the **/ prefixes.
+build/
+**/build/
+.git/
+.cache/
+demo/out/
+demo/traces/
+**/*.gguf
+**/*.mp4
+**/*.gif
+**/*.webm
+.venv/
+**/__pycache__/
diff --git a/README.md b/README.md
index 85ab558..73fbb8c 100644
--- a/README.md
+++ b/README.md
@@ -30,6 +30,16 @@ hits the 16 GiB memory wall and OOMs at ~16k.
 
 Full-quality MP4s: [CPU](demo/out/pii_duel_cpu_final.mp4) · [GPU](demo/out/pii_duel_gpu_final.mp4).
 
+**Raspberry Pi 5 — on-device, real time.** The same engine, no GPU: 1,360 tokens
+of mixed PII classified in 3.8 s (360 tok/s) on a Cortex-A76 @ 1.5 GHz with q8
+weights. The right pane is the live NER feed — 107 spans across 22 categories,
+each with its category and byte range (q8 output is span-for-span identical to
+f16 here).
+
+![Raspberry Pi 5 on-device PII scan: 1,360 tokens, 107 PII spans across 22 categories in 3.8 s](demo/out/pii_scan.gif)
+
+Full-quality MP4: [Pi 5 scan](demo/out/pii_scan_final.mp4).
+
 Single forward-pass latency and throughput vs stock HF Transformers (transformers
 5.9, eager), Ryzen 9 7900 (12 threads) + RTX 5070 Ti, f16/fp16, matched token
 counts ([scripts/bench_torch.py](scripts/bench_torch.py)). `tokens` is the input
diff --git a/demo/gen_scan.py b/demo/gen_scan.py
new file mode 100644
index 0000000..aa1440c
--- /dev/null
+++ b/demo/gen_scan.py
@@ -0,0 +1,75 @@
+#!/usr/bin/env python3
+"""Build the single-document NER-scan trace for pii_scan.py from REAL pf-cli output.
+
+Unlike gen_corpus.py (which tiles a paragraph and replicates hard-coded spans),
+this runs the actual engine on demo/scan_doc.txt:
+  * pf-cli --tok-batch  -> exact token count (vocab only)
+  * pf-cli --classify   -> real entity spans (entity_group/start/end/score/text)
+
+so every span and category on screen is exactly what the model emits. Writes
+demo/traces/scan/{content.json,engines.json}.
+
+  python3 gen_scan.py --cli build/release/pf-cli \
+      --model ~/ggufs_perf/pf-q8experts.gguf --ld build/release/ggml/src \
+      --tps 366 --device "Raspberry Pi 5 · CPU · q8 @ 1.5 GHz"
+"""
+import argparse, json, os, struct, subprocess, tempfile
+from pathlib import Path
+
+HERE = Path(__file__).resolve().parent
+
+
+def run_cli(cli, ld, args, stdin=None):  # -> CompletedProcess with captured stdout/stderr
+    env = dict(os.environ, LD_LIBRARY_PATH=ld) if ld else None  # empty --ld: inherit the environment untouched
+    return subprocess.run([cli, *args], input=stdin, capture_output=True, env=env)
+
+
+def token_count(cli, ld, model, doc_bytes):  # -> int: first u32 of pf-cli's --tok-batch output file
+    with tempfile.TemporaryDirectory() as tmp:  # private scratch dir instead of fixed /tmp names; removed on exit
+        inp, outp = os.path.join(tmp, "in.bin"), os.path.join(tmp, "out.bin")
+        Path(inp).write_bytes(struct.pack("<II", 1, len(doc_bytes)) + doc_bytes)  # u32 1, u32 length, payload
+        run_cli(cli, ld, ["--tok-batch", model, inp, outp]).check_returncode()
+        return struct.unpack_from("<I", Path(outp).read_bytes(), 0)[0]  # read_bytes() closes the file
+
+
+def main():  # CLI entry point: run pf-cli on --doc, write <scene>/content.json + engines.json, print a summary
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--cli", required=True)
+    ap.add_argument("--model", required=True)
+    ap.add_argument("--ld", default="", help="LD_LIBRARY_PATH for the cli (shared ggml build)")
+    ap.add_argument("--doc", default=str(HERE / "scan_doc.txt"))
+    ap.add_argument("--scene", default=str(HERE / "traces/scan"))
+    ap.add_argument("--threshold", default="0.5")
+    ap.add_argument("--tps", type=float, default=0.0, help="measured engine throughput (tok/s)")
+    ap.add_argument("--label", default="privacy-filter.cpp")
+    ap.add_argument("--device", default="Raspberry Pi 5 · CPU")
+    ap.add_argument("--note", default="")
+    a = ap.parse_args()
+
+    doc_bytes = Path(a.doc).read_bytes()
+    n_tok = token_count(a.cli, a.ld, a.model, doc_bytes)
+    r = run_cli(a.cli, a.ld, ["--classify", a.model, a.threshold, "cpu"], stdin=doc_bytes)
+    r.check_returncode(); ents_raw = json.loads(r.stdout)  # surface a cli failure instead of a JSON decode error
+    ents = [{"type": e["entity_group"], "start": e["start"], "end": e["end"],  # NOTE(review): pii_scan.py slices the decoded str with these; confirm pf-cli emits char, not byte, offsets
+             "text": e.get("text", ""), "score": e.get("score", 0.0)} for e in ents_raw]
+
+    d = Path(a.scene); d.mkdir(parents=True, exist_ok=True)
+    content = {"document": doc_bytes.decode("utf-8"), "n_tokens": n_tok,
+               "note": a.note, "entities": ents}
+    (d / "content.json").write_text(json.dumps(content, ensure_ascii=False), encoding="utf-8")
+    (d / "engines.json").write_text(json.dumps(  # explicit UTF-8: ensure_ascii=False emits raw non-ASCII
+        [{"label": a.label, "device": a.device, "tps": a.tps}], indent=2, ensure_ascii=False), encoding="utf-8")
+
+    cats = {}
+    for e in ents:
+        cats[e["type"]] = cats.get(e["type"], 0) + 1
+    print(f"{len(content['document']):,} chars  {n_tok:,} tokens  {len(ents)} entities  "
+          f"{len(cats)} categories")
+    if a.tps:
+        print(f"engine: {a.tps:g} tok/s  ->  {n_tok / a.tps:.2f}s run")
+    print("categories:", ", ".join(f"{k}:{v}" for k, v in
+                                    sorted(cats.items(), key=lambda x: -x[1])))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/demo/make_scan.sh b/demo/make_scan.sh
new file mode 100755
index 0000000..2569cfd
--- /dev/null
+++ b/demo/make_scan.sh
@@ -0,0 +1,55 @@
+#!/usr/bin/env bash
+# Render the single-document PII-scan TUI (pii_scan.py) for the "scan" scene,
+# record it with recorder-for-agents, trim the lead-in, append a branding outro.
+# The scene + real spans + real tok/s come from gen_scan.py (pf-cli output).
+#
+#   ./make_scan.sh [out.mp4]       # real time (default output: pii_scan.mp4)
+#   DILATE=2 ./make_scan.sh        # slowed 2x
+#
+# env: RECORDER, DILATE, WIDTH/HEIGHT/FPS/FONTSIZE, HOLD, CARD, LINK
+set -euo pipefail
+HERE=$(cd "$(dirname "$0")" && pwd)
+RECORDER=${RECORDER:-/home/rich/python/recorder-for-agents}
+SCENE=scan
+OUT=${1:-pii_scan.mp4}
+DILATE=${DILATE:-1}
+W=${WIDTH:-1280}; H=${HEIGHT:-720}; FS=${FONTSIZE:-16}; FPS=${FPS:-30}
+HOLD=${HOLD:-1.4}; CARD=${CARD:-3.5}
+LINK=${LINK:-github.com/richiejp/privacy-filter.cpp}
+SDIR="$HERE/traces/$SCENE"
+
+[ -d "$SDIR" ] || { echo "no scene at $SDIR (run gen_scan.py)" >&2; exit 1; }
+[ -x "$RECORDER/record.sh" ] || { echo "recorder not found at $RECORDER" >&2; exit 1; }
+
+# capture length: start delay + scan (scaled) + hold + settle + card + buffer
+DUR=$(python3 - "$SDIR" "$DILATE" "$HOLD" "$CARD" <<'PY'
+import json, sys, math
+d = sys.argv[1]; dil, hold, card = map(float, sys.argv[2:5])
+c = json.load(open(d + "/content.json")); e = json.load(open(d + "/engines.json"))[0]
+if e["tps"] <= 0: sys.exit("engines.json: tps must be > 0 (re-run gen_scan.py with --tps)")
+proc = c["n_tokens"] / e["tps"]
+print(int(math.ceil(1.0 + proc * dil + hold + 0.7 + card + 1.0)))
+PY
+)
+echo "[make-scan] ${W}x${H}@${FPS} fs=${FS} dilate=${DILATE} duration=${DUR}s -> out/$OUT"
+
+WORK="$HERE" BG="#0d1117" FG="#d7dde5" FONTSIZE="$FS" DURATION="$DUR" \
+  WIDTH="$W" HEIGHT="$H" FPS="$FPS" START_DELAY=1.0 END_HOLD=0.2 \
+  "$RECORDER/record.sh" \
+  "python3 pii_scan.py --scene traces/$SCENE --fps $FPS --dilate $DILATE --hold $HOLD --card $CARD --link '$LINK'" \
+  "$OUT"
+
+RAW="$HERE/out/$OUT"; NOEXT="${OUT%.mp4}"
+if [ -f "$RECORDER/examples/duel/trim_lead.sh" ]; then
+  bash "$RECORDER/examples/duel/trim_lead.sh" "$RAW" "$HERE/out/.trim_$SCENE.mp4" \
+    && mv "$HERE/out/.trim_$SCENE.mp4" "$RAW"
+fi
+if [ -f "$RECORDER/examples/duel/outro.sh" ]; then
+  OW="$W" OH="$H" TITLE="privacy-filter.cpp" \
+  LINK1="$LINK" \
+  LINK2="on-device NER · Raspberry Pi 5 · real PII spans" \
+    bash "$RECORDER/examples/duel/outro.sh" "$RAW" "$HERE/out/${NOEXT}_final.mp4"
+  echo "-> $HERE/out/${NOEXT}_final.mp4"
+else
+  echo "-> $RAW"
+fi
diff --git a/demo/out/pii_scan.gif b/demo/out/pii_scan.gif
new file mode 100644
index 0000000..5bd7bf8
Binary files /dev/null and b/demo/out/pii_scan.gif differ
diff --git a/demo/out/pii_scan.mp4 b/demo/out/pii_scan.mp4
new file mode 100644
index 0000000..8bbdb5f
Binary files /dev/null and b/demo/out/pii_scan.mp4 differ
diff --git a/demo/out/pii_scan_final.mp4 b/demo/out/pii_scan_final.mp4
new file mode 100644
index 0000000..869445a
Binary files /dev/null and b/demo/out/pii_scan_final.mp4 differ
diff --git a/demo/pii_scan.py b/demo/pii_scan.py
new file mode 100644
index 0000000..9866f08
--- /dev/null
+++ b/demo/pii_scan.py
@@ -0,0 +1,292 @@
+#!/usr/bin/env python3
+"""
+pii_scan.py — a single-document PII scan, rendered with rich and captured by
+recorder-for-agents into an MP4.
+
+The left pane streams one document at the engine's REAL measured throughput
+(tok/s): the view scrolls a page at a time as tokens are processed and each PII
+span snaps to ████ as the frontier passes it. The right pane is the live NER
+feed — every entity the model finds, in document order, with its category and
+byte range. A bottom strip tracks tokens processed, elapsed time and tok/s.
+
+Data (built by gen_scan.py from real pf-cli output):
+  <scene>/content.json -> {"document","n_tokens","note","entities":[{type,start,end,text,score}]}
+  <scene>/engines.json -> [{"label","device","tps"}]
+
+  python3 pii_scan.py --scene traces/scan --dilate 1
+"""
+import argparse, bisect, json, time
+from pathlib import Path
+
+from rich import box
+from rich.console import Group, Console
+from rich.live import Live
+from rich.panel import Panel
+from rich.table import Table
+from rich.text import Text
+from rich.align import Align
+
+# ---- palette ---------------------------------------------------------------
+INK   = "#d7dde5"                          # primary text: scanned document, values
+DIM   = "#6e7681"                          # secondary labels and units
+FAINT = "#3b424c"                          # not-yet-scanned text, byte ranges
+RULE  = "#222b34"                          # rule under the header
+ACC, ACC_EMPTY = "#3ec8e0", "#1d6c7b"     # teal frontier / bar
+GREEN = "#46c266"                          # "done" state
+GOLD  = "#e3b341"                          # findings title / span count
+
+# entity_group -> redaction/label colour, grouped by kind (names, contact,
+# location, financial/id, secrets/misc). Unknown categories fall back to slate.
+CAT = {
+    "FIRSTNAME": "#e0709a", "LASTNAME": "#e0709a", "MIDDLENAME": "#e0709a",
+    "ACCOUNTNAME": "#e0709a", "USERNAME": "#d98ca0", "JOBTITLE": "#d98ca0",
+    "EMAIL": "#7aa2f7", "URL": "#7aa2f7", "IPADDRESS": "#6ab0f3", "MAC": "#6ab0f3",
+    "PHONE": "#e0a458", "PHONENUMBER": "#e0a458",
+    "DATE": "#9d7cd8", "DATEOFBIRTH": "#b48ee8", "AGE": "#b48ee8", "TIME": "#9d7cd8",
+    "STREET": "#98c379", "BUILDINGNUMBER": "#98c379", "CITY": "#8ac06f",
+    "ZIPCODE": "#7fb86a", "STATE": "#a6cc8a", "COUNTY": "#a6cc8a", "SECONDARYADDRESS": "#98c379",
+    "IBAN": "#56b6c2", "AMOUNT": "#56b6c2", "PIN": "#5fc9d6", "SSN": "#4fb0bd",
+    "CVV": "#4fb0bd", "CREDITCARDNUMBER": "#56b6c2", "ACCOUNTNUMBER": "#56b6c2",
+    "PASSWORD": "#e3b341", "VRM": "#e3b341", "HEIGHT": "#cbb05a", "BIC": "#56b6c2",
+}
+DEFAULT_CAT = "#9aa4af"
+
+
+def cat_colour(t):
+    return CAT[t] if t in CAT else DEFAULT_CAT  # categories missing from CAT get the fallback colour
+
+
+# ---- layout helpers (shared shape with pii_duel.py) ------------------------
+def wrap_ranges(s, width):
+    """Word-wrap `s` greedily; returns [(start, end)] half-open index pairs, separators dropped."""
+    total, limit = len(s), max(8, width)
+    spans, pos = [], 0
+    while pos < total:
+        stop = pos + limit
+        cut = s.find("\n", pos, stop)            # a newline inside the window always ends the line
+        if cut == -1:
+            if stop >= total:
+                spans.append((pos, total)); break   # the tail fits on one line
+            cut = s.rfind(" ", pos, stop)        # otherwise break at the last space ...
+            if cut <= pos:
+                spans.append((pos, stop)); pos = stop; continue   # ... or hard-split a long word
+        spans.append((pos, cut))
+        pos = cut + 1
+    return spans or [(0, 0)]
+
+
+def prep(content, inner_w):
+    """Bundle what the panes need: the text, start-sorted entities and wrapped line ranges."""
+    text = content["document"]
+    found = sorted(content["entities"], key=lambda ent: ent["start"])
+    lines = wrap_ranges(text, inner_w)
+    starts, line_starts = [ent["start"] for ent in found], [lo for lo, _ in lines]
+    return {"doc": text, "ents": found, "starts": starts, "ranges": lines,
+            "line_starts": line_starts, "n_tokens": content["n_tokens"], "inner_w": inner_w}
+
+
+def _emit_norm(t, doc, a, b, playhead, cursor):
+    """Append non-entity text doc[a:b] to `t`: INK before the playhead, FAINT after it."""
+    if b <= a:
+        return
+    if a < playhead < b:
+        # the frontier falls inside this run: split it around the cursor character
+        for chunk, look in ((doc[a:playhead], INK), (doc[playhead], cursor),
+                            (doc[playhead + 1:b], FAINT)):
+            t.append(chunk, style=look)
+        return
+    t.append(doc[a:b], style=FAINT if playhead <= a else INK)
+
+
+def render_page(P, page_lines, playhead):  # -> list of Text, one per (start, end) pair in page_lines
+    doc, ents, starts, n = P["doc"], P["ents"], P["starts"], len(P["ents"])
+    cursor = f"{ACC} reverse"
+    lines = []
+    for s, e in page_lines:
+        t = Text(no_wrap=True, overflow="crop")
+        k = bisect.bisect_left(starts, s)  # first entity whose start is >= this line's start
+        if k > 0 and ents[k - 1]["end"] > s:
+            k -= 1  # the previous entity runs on into this line
+        cur = s  # next doc index not yet emitted for this line
+        while k < n and ents[k]["start"] < e:
+            en = ents[k]; k += 1
+            a, b = max(s, en["start"]), min(e, en["end"])  # clip the span to this line
+            if b <= cur:
+                continue  # nothing of this span is left to draw
+            _emit_norm(t, doc, cur, a, playhead, cursor)
+            if playhead >= en["start"]:  # the whole span is masked once the playhead reaches its start
+                t.append("█" * (b - a), style=cat_colour(en["type"]))
+            else:
+                t.append(doc[a:b], style=FAINT)
+            cur = b
+        _emit_norm(t, doc, cur, e, playhead, cursor)  # plain text after the last span on the line
+        lines.append(t)
+    return lines
+
+
+def bar(frac, width, fill, empty, label):
+    """One-line progress bar: `width` cells split fill/empty at `frac`, followed by `label`."""
+    lit = max(0, min(width, round(width * frac)))
+    out = Text()
+    for piece, look in (("━" * lit, fill), ("━" * (width - lit), empty), (f"  {label}", fill)):
+        out.append(piece, style=look)
+    return out
+
+
+def doc_pane(P, playhead, height):
+    """Left pane: the page of wrapped lines that contains the playhead, padded to full height."""
+    per_page = max(4, height - 4)
+    row = max(0, bisect.bisect_right(P["line_starts"], playhead) - 1)
+    page_idx = row // per_page
+    total_pages = -(-len(P["ranges"]) // per_page)   # ceiling division
+    first = page_idx * per_page
+    rendered = render_page(P, P["ranges"][first:first + per_page], playhead)
+    body = Text("\n").join(rendered)
+    if len(rendered) < per_page:
+        body.append("\n" * (per_page - len(rendered)))
+    title = Text(" document ", style=f"bold {ACC}")
+    title.append(f" scanning · page {page_idx + 1}/{total_pages} ", style=DIM)
+    return Panel(body, title=title, title_align="left", border_style=ACC_EMPTY,
+                 box=box.ROUNDED, padding=(1, 2), height=height)
+
+
+def findings_pane(P, playhead, height, done):
+    """Right column: the live NER feed (newest at the bottom), tail-limited."""
+    found = bisect.bisect_right(P["starts"], playhead)   # entities discovered so far
+    rows = max(3, height - 4)
+    keep = rows - 1 if found > rows else rows   # the "earlier" marker takes a row; else the newest entry is cropped
+    shown = P["ents"][:found][-keep:]
+    cat_w = 14
+    avail = max(6, P["find_w"] - cat_w - 1 - 11)   # text column width, same for every row
+    t = Text()
+    if found > rows:
+        t.append(f"  ⋮ +{found - keep} earlier\n", style=FAINT)
+    for e in shown:
+        col = cat_colour(e["type"])
+        lab = (e["type"][:cat_w]).ljust(cat_w)
+        t.append(lab + " ", style=f"bold {col}")
+        txt = e.get("text", "")
+        if len(txt) > avail:
+            txt = txt[:avail - 1] + "…"
+        t.append(txt.ljust(avail), style=INK)
+        t.append(f" {e['start']:>4}-{e['end']:<4}\n", style=FAINT)
+    pad = rows - len(shown) - (1 if found > rows else 0)
+    if pad > 0:
+        t.append("\n" * pad)
+    title = Text(" PII found ", style=f"bold {GOLD}")
+    title.append(f" {found} spans ", style=DIM)
+    border = GREEN if done else (GOLD if found else FAINT)
+    return Panel(t, title=title, title_align="left", border_style=border,
+                 box=box.ROUNDED, padding=(1, 1), height=height)
+
+
+def status_strip(P, eng, playhead, tokens_done, elapsed_ms, frac, done, cols):
+    """Bottom strip: progress bar above a tokens / time / tok/s / entity-count line."""
+    n_found = bisect.bisect_right(P["starts"], playhead)
+    mark, mark_style = ("✓ ", f"bold {GREEN}") if done else ("▸ ", ACC)
+    shown_ms = eng["proc_s"] * 1000 if done else elapsed_ms   # show the computed total once done
+    pct = "100%" if done else f"{int(round(frac * 100)):>3d}%"
+    info = Text()
+    for piece, look in (
+        (f"{tokens_done:,}", INK),
+        (f" / {P['n_tokens']:,} tok", DIM), ("    ", None),
+        (mark, mark_style),
+        (f"{shown_ms:,.0f} ms", f"bold {INK}"),
+        (f"   {eng['tps']:,.0f} tok/s", DIM), (f"   ·   {n_found} entities", DIM),
+    ):
+        info.append(piece, style=look)
+    return Group(bar(frac, max(10, cols - 10), ACC, ACC_EMPTY, pct), info)
+
+
+def header(eng, note, dilate, cols):
+    """Top line (label + device left, note / slow-down right) over a `cols`-wide rule."""
+    lhs = Text().append(eng["label"], style=f"bold {ACC}").append("   " + eng["device"], style=DIM)
+    rhs = Text(justify="right")
+    if note:
+        rhs.append(note, style=INK)
+    if dilate > 1.01:
+        rhs.append(f"   ·   slowed {dilate:g}×", style=FAINT)
+    row = Table.grid(expand=True)
+    for side in ("left", "right"):
+        row.add_column(justify=side)
+    row.add_row(lhs, rhs)
+    return Group(row, Text("─" * cols, style=RULE))
+
+
+def view(P, eng, note, elapsed, dilate, cols, rows):
+    """Compose one frame (header, both panes, status strip) for `elapsed` seconds into the scan."""
+    total = eng["proc_s"]
+    finished = elapsed >= total
+    frac = max(0.0, min(1.0, elapsed / total)) if total > 0 else 0.0
+    playhead, tokens_done = round(frac * len(P["doc"])), round(frac * P["n_tokens"])
+    pane_h = max(10, rows - 5)
+    panes = Table.grid(expand=True, padding=(0, 1))
+    panes.add_column(ratio=63); panes.add_column(ratio=37)
+    panes.add_row(doc_pane(P, playhead, pane_h), findings_pane(P, playhead, pane_h, finished))
+    return Group(header(eng, note, dilate, cols), panes,
+                 status_strip(P, eng, playhead, tokens_done, elapsed * 1000, frac, finished, cols))
+
+
+def end_card(P, eng, note, link):
+    """Closing card: totals, up to eight most frequent categories (four per row), device, link."""
+    from collections import Counter
+    tally = Counter(ent["type"] for ent in P["ents"])
+    card = Text()
+    if note:
+        card.append(note.upper() + "\n\n", style=f"bold {DIM}")
+    for piece, look in ((f"{P['n_tokens']:,} tokens", f"bold {ACC}"),
+                        (f"  scanned in {eng['proc_s'] * 1000:,.0f} ms", f"bold {INK}"),
+                        (f"   ·   {eng['tps']:,.0f} tok/s\n", DIM),
+                        (f"{len(P['ents'])} PII spans", f"bold {GOLD}"),
+                        (f"  across {len(tally)} categories\n\n", INK)):
+        card.append(piece, style=look)
+    for idx, (name, count) in enumerate(tally.most_common(8)):
+        if idx and idx % 4 == 0:
+            card.append("\n")
+        card.append(f"{name} ", style=f"bold {cat_colour(name)}").append(f"{count}   ", style=DIM)
+    card.append(f"\n\n{eng['device']}", style=DIM)
+    if link:
+        card.append(f"\n{link}", style=DIM)
+    return Panel(Align.center(card, vertical="middle"), border_style=ACC_EMPTY,
+                 box=box.ROUNDED, padding=(2, 6))
+
+
+def main():
+    """Load the scene, then play scan -> settled final frame -> end card in a full-screen Live."""
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--scene", default=str(Path(__file__).resolve().parent / "traces/scan"))
+    ap.add_argument("--link", default="github.com/richiejp/privacy-filter.cpp")
+    ap.add_argument("--fps", type=int, default=30)
+    ap.add_argument("--dilate", type=float, default=1.0, help="time scale (1 = real time)")
+    ap.add_argument("--hold", type=float, default=1.4)
+    ap.add_argument("--card", type=float, default=3.5)
+    a = ap.parse_args()
+
+    scene = Path(a.scene)
+    content = json.loads((scene / "content.json").read_text(encoding="utf-8"))
+    eng = json.loads((scene / "engines.json").read_text(encoding="utf-8"))[0]
+    if eng["tps"] <= 0 or a.dilate <= 0 or a.fps <= 0:   # each one is a divisor below
+        ap.error("engines.json tps, --dilate and --fps must all be > 0")
+    eng["proc_s"] = content["n_tokens"] / eng["tps"]
+
+    console = Console(); cols, rows = console.size
+    doc_w = max(24, int((cols - 4) * 0.63) - 4)
+    P = prep(content, doc_w)
+    P["find_w"] = max(20, int((cols - 4) * 0.37) - 4)
+
+    wall_end = eng["proc_s"] * a.dilate + a.hold
+    dt, note = 1.0 / a.fps, content.get("note", "")
+    with Live(console=console, refresh_per_second=a.fps, screen=True) as live:
+        t0 = time.perf_counter()
+        while (w := time.perf_counter() - t0) < wall_end:
+            live.update(view(P, eng, note, w / a.dilate, a.dilate, cols, rows))
+            time.sleep(dt)
+        live.update(view(P, eng, note, eng["proc_s"], a.dilate, cols, rows))
+        time.sleep(0.7)
+        live.update(Panel(Align.center(end_card(P, eng, note, a.link), vertical="middle"),
+                          border_style="black", box=box.SIMPLE, height=rows))
+        time.sleep(a.card)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/demo/scan_doc.txt b/demo/scan_doc.txt
new file mode 100644
index 0000000..5e89ea3
--- /dev/null
+++ b/demo/scan_doc.txt
@@ -0,0 +1,25 @@
+Thanks for getting back to me so quickly about the prior-authorization request. I want to make sure everything is in order before the review board meets next week, since the last batch got held up over a missing signature. The patient, John Doe, has been with our practice for years and is in good health, so we expect this one to be routine. If anything looks incomplete on your end, please call our billing office at +1 555-0112 rather than replying here. I have attached the updated history, and the relevant appointment was on 2026-05-12. If you still need the original signed consent form, you can email me directly at jane.roe@northside-clinic.org. Appreciate you helping get claim 4471 across the line before the deadline.
+
+Separately, the replacement shipment for order #A-90817 finally went out this morning. It is headed to Maria Gonzalez at 488 Birchwood Avenue, Portland, OR 97214, and the courier promised delivery by Friday. She had asked us to text updates to 503-555-0148 instead of emailing, because her inbox at maria.gonzalez88@fastmail.com has been bouncing all week. Could you confirm the tracking number once it scans at the depot?
+
+On the finance side, the refund of 1,250.00 was wired back to the customer's account on 2026-04-30. The receiving IBAN is GB29 NWBK 6016 1331 9268 19, held under the name Thomas Reilly, and the reference we used was INV-55831. If the bank flags it, our contact there is Priya Nair, reachable on +44 20 7946 0991 during London hours.
+
+For the new hire paperwork: please set up Daniel Okafor, starting 2026-06-01 as a logistics coordinator. His personal email is d.okafor.work@gmail.com and his mobile is (212) 555-0173. Payroll will need his SSN, 432-19-8765, and his date of birth, 1991-08-23, before the first cycle runs. He is 34 and will report to the Atlanta hub.
+
+While you have the chart open, the follow-up for patient Aisha Rahman, medical record number MRN-88241, is overdue. Her last visit was 2026-03-09 with Dr. Steven Cole, and the lab flagged a result that needs a callback to 617-555-0136. She prefers contact in the evening and her secondary email is aisha.rahman@protonmail.com. Please do not leave details on voicemail per the note on her file.
+
+Travel desk, two bookings to confirm. Robert Klein departs 2026-07-14 on confirmation code QX7T2P, returning the 21st; his passport number is C03X45678 and it expires 2029-02-28. He will stay at the Harbor Court Hotel, 220 West Quay Street, Seattle, WA 98119. Bill the corporate card under cost center CC-4490 and copy his assistant at l.tran@meridian-travel.com on the itinerary.
+
+Ops flagged the failing node again. The crash report came from host 10.42.7.18, hardware serial SN-DG44120, registered to the admin account r.santos@datacore.io. The on-call engineer, Olivia Brooks, can be paged at 408-555-0190. Until the patch lands, route tickets to queue OPS-DELTA and reference incident INC-203394 so the timeline stays linked.
+
+Finally, accounts receivable for the Hendricks matter: invoice 77120 for 8,400.00 is still unpaid, now 45 days past due. The client signatory is Gregory Hale at Hale & Forsythe LLP, 9 Chancery Lane, London EC4A 1DR. He asked that statements go to g.hale@haleforsythe.co.uk and that we stop calling 020 7946 0830 after hours. If we escalate, the file reference on our side is MTR-2026-0188 and the responsible partner is Nadia Costa.
+
+The auto claim is moving as well. Policyholder Sandra Whitfield, policy number POL-3392184, reported the collision on 2026-02-17 involving the vehicle with plate 8XQJ492. The adjuster, Marcus Bell, left a note that the garage estimate came to 4,975.50 and that the rental authorization expires 2026-03-15. Send the settlement letter to 14 Maplewood Crescent, Leeds LS6 2BX, and copy s.whitfield@ymail.com so she has a record.
+
+Admissions also needs a hand. The transcript request for student Kevin Park, student ID STU-771204, has been sitting since 2026-01-22. He graduated from Westbrook University with a recorded GPA of 3.82 and now lists his address as 305 College Drive, Austin, TX 78705. His contact number changed to 737-555-0164 last term, and the registrar should email the verification to kevin.park@westbrook.edu before releasing anything.
+
+On the property file, the closing for 62 Lantern Hill Road, Asheville, NC 28801 is scheduled for 2026-08-05 at a sale price of 415,000.00. The buyers, Henry and Claire Donovan, are financing through Summit Mutual, loan number LN-66120. Their agent, Rosa Méndez, can be reached at 828-555-0107, and the title company wants wiring instructions confirmed in writing first.
+
+Billing churn for the quarter: the subscription on account 5500-2241 renews 2026-09-01 for 199.00, charged to a Mastercard ending 8893. The owner, Felix Otero, switched the receipt email to felix.otero@hey.com and disputed the previous charge under reference CB-40217. If the retry fails twice, downgrade the plan rather than cancelling outright.
+
+For the docket, hearing in case 24-CV-10882 is set for 2026-10-19 before Judge Eleanor Voss. The plaintiff, represented by counsel Aaron Feld, is seeking disclosure of records held by the respondent, Brightline Logistics. File the motion through the clerk and serve notice to a.feld@feldlaw.com no later than ten days prior, or we forfeit the slot.
diff --git a/demo/traces/scan/content.json b/demo/traces/scan/content.json
new file mode 100644
index 0000000..118afa2
--- /dev/null
+++ b/demo/traces/scan/content.json
@@ -0,0 +1 @@
+{"document": "Thanks for getting back to me so quickly about the prior-authorization request. I want to make sure everything is in order before the review board meets next week, since the last batch got held up over a missing signature. The patient, John Doe, has been with our practice for years and is in good health, so we expect this one to be routine. If anything looks incomplete on your end, please call our billing office at +1 555-0112 rather than replying here. I have attached the updated history, and the relevant appointment was on 2026-05-12. If you still need the original signed consent form, you can email me directly at jane.roe@northside-clinic.org. Appreciate you helping get claim 4471 across the line before the deadline.\n\nSeparately, the replacement shipment for order #A-90817 finally went out this morning. It is headed to Maria Gonzalez at 488 Birchwood Avenue, Portland, OR 97214, and the courier promised delivery by Friday. She had asked us to text updates to 503-555-0148 instead of emailing, because her inbox at maria.gonzalez88@fastmail.com has been bouncing all week. Could you confirm the tracking number once it scans at the depot?\n\nOn the finance side, the refund of 1,250.00 was wired back to the customer's account on 2026-04-30. The receiving IBAN is GB29 NWBK 6016 1331 9268 19, held under the name Thomas Reilly, and the reference we used was INV-55831. If the bank flags it, our contact there is Priya Nair, reachable on +44 20 7946 0991 during London hours.\n\nFor the new hire paperwork: please set up Daniel Okafor, starting 2026-06-01 as a logistics coordinator. His personal email is d.okafor.work@gmail.com and his mobile is (212) 555-0173. Payroll will need his SSN, 432-19-8765, and his date of birth, 1991-08-23, before the first cycle runs. He is 34 and will report to the Atlanta hub.\n\nWhile you have the chart open, the follow-up for patient Aisha Rahman, medical record number MRN-88241, is overdue. 
Her last visit was 2026-03-09 with Dr. Steven Cole, and the lab flagged a result that needs a callback to 617-555-0136. She prefers contact in the evening and her secondary email is aisha.rahman@protonmail.com. Please do not leave details on voicemail per the note on her file.\n\nTravel desk, two bookings to confirm. Robert Klein departs 2026-07-14 on confirmation code QX7T2P, returning the 21st; his passport number is C03X45678 and it expires 2029-02-28. He will stay at the Harbor Court Hotel, 220 West Quay Street, Seattle, WA 98119. Bill the corporate card under cost center CC-4490 and copy his assistant at l.tran@meridian-travel.com on the itinerary.\n\nOps flagged the failing node again. The crash report came from host 10.42.7.18, hardware serial SN-DG44120, registered to the admin account r.santos@datacore.io. The on-call engineer, Olivia Brooks, can be paged at 408-555-0190. Until the patch lands, route tickets to queue OPS-DELTA and reference incident INC-203394 so the timeline stays linked.\n\nFinally, accounts receivable for the Hendricks matter: invoice 77120 for 8,400.00 is still unpaid, now 45 days past due. The client signatory is Gregory Hale at Hale & Forsythe LLP, 9 Chancery Lane, London EC4A 1DR. He asked that statements go to g.hale@haleforsythe.co.uk and that we stop calling 020 7946 0830 after hours. If we escalate, the file reference on our side is MTR-2026-0188 and the responsible partner is Nadia Costa.\n\nThe auto claim is moving as well. Policyholder Sandra Whitfield, policy number POL-3392184, reported the collision on 2026-02-17 involving the vehicle with plate 8XQJ492. The adjuster, Marcus Bell, left a note that the garage estimate came to 4,975.50 and that the rental authorization expires 2026-03-15. Send the settlement letter to 14 Maplewood Crescent, Leeds LS6 2BX, and copy s.whitfield@ymail.com so she has a record.\n\nAdmissions also needs a hand. 
The transcript request for student Kevin Park, student ID STU-771204, has been sitting since 2026-01-22. He graduated from Westbrook University with a recorded GPA of 3.82 and now lists his address as 305 College Drive, Austin, TX 78705. His contact number changed to 737-555-0164 last term, and the registrar should email the verification to kevin.park@westbrook.edu before releasing anything.\n\nOn the property file, the closing for 62 Lantern Hill Road, Asheville, NC 28801 is scheduled for 2026-08-05 at a sale price of 415,000.00. The buyers, Henry and Claire Donovan, are financing through Summit Mutual, loan number LN-66120. Their agent, Rosa Méndez, can be reached at 828-555-0107, and the title company wants wiring instructions confirmed in writing first.\n\nBilling churn for the quarter: the subscription on account 5500-2241 renews 2026-09-01 for 199.00, charged to a Mastercard ending 8893. The owner, Felix Otero, switched the receipt email to felix.otero@hey.com and disputed the previous charge under reference CB-40217. If the retry fails twice, downgrade the plan rather than cancelling outright.\n\nFor the docket, hearing in case 24-CV-10882 is set for 2026-10-19 before Judge Eleanor Voss. The plaintiff, represented by counsel Aaron Feld, is seeking disclosure of records held by the respondent, Brightline Logistics. 
File the motion through the clerk and serve notice to a.feld@feldlaw.com no later than ten days prior, or we forfeit the slot.\n", "n_tokens": 1360, "note": "on-device PII scan · real spans", "entities": [{"type": "FIRSTNAME", "start": 236, "end": 240, "text": "John", "score": 0.9859}, {"type": "LASTNAME", "start": 241, "end": 244, "text": "Doe", "score": 0.9756}, {"type": "PHONE", "start": 419, "end": 430, "text": "+1 555-0112", "score": 0.9999}, {"type": "DATE", "start": 531, "end": 541, "text": "2026-05-12", "score": 0.7356}, {"type": "EMAIL", "start": 624, "end": 642, "text": "jane.roe@northside", "score": 0.8905}, {"type": "PIN", "start": 688, "end": 692, "text": "4471", "score": 0.945}, {"type": "AMOUNT", "start": 781, "end": 786, "text": "90817", "score": 0.7762}, {"type": "FIRSTNAME", "start": 834, "end": 839, "text": "Maria", "score": 0.9995}, {"type": "LASTNAME", "start": 840, "end": 848, "text": "Gonzalez", "score": 0.99}, {"type": "BUILDINGNUMBER", "start": 852, "end": 855, "text": "488", "score": 0.9982}, {"type": "STREET", "start": 856, "end": 872, "text": "Birchwood Avenue", "score": 1.0}, {"type": "CITY", "start": 874, "end": 882, "text": "Portland", "score": 0.9807}, {"type": "ZIPCODE", "start": 887, "end": 892, "text": "97214", "score": 0.9899}, {"type": "PHONE", "start": 975, "end": 987, "text": "503-555-0148", "score": 0.9307}, {"type": "EMAIL", "start": 1030, "end": 1059, "text": "maria.gonzalez88@fastmail.com", "score": 0.9959}, {"type": "AMOUNT", "start": 1190, "end": 1198, "text": "1,250.00", "score": 0.7332}, {"type": "DATE", "start": 1243, "end": 1253, "text": "2026-04-30", "score": 0.9176}, {"type": "IBAN", "start": 1277, "end": 1301, "text": "GB29 NWBK 6016 1331 9268", "score": 0.8934}, {"type": "FIRSTNAME", "start": 1326, "end": 1332, "text": "Thomas", "score": 0.9861}, {"type": "LASTNAME", "start": 1333, "end": 1339, "text": "Reilly", "score": 0.9642}, {"type": "AMOUNT", "start": 1375, "end": 1380, "text": "55831", "score": 0.5124}, 
{"type": "FIRSTNAME", "start": 1425, "end": 1430, "text": "Priya", "score": 0.9875}, {"type": "LASTNAME", "start": 1431, "end": 1435, "text": "Nair", "score": 0.9605}, {"type": "PHONE", "start": 1450, "end": 1466, "text": "+44 20 7946 0991", "score": 0.9997}, {"type": "FIRSTNAME", "start": 1531, "end": 1537, "text": "Daniel", "score": 0.9504}, {"type": "LASTNAME", "start": 1538, "end": 1544, "text": "Okafor", "score": 0.9876}, {"type": "DATE", "start": 1555, "end": 1565, "text": "2026-06-01", "score": 0.9835}, {"type": "EMAIL", "start": 1616, "end": 1639, "text": "d.okafor.work@gmail.com", "score": 0.9995}, {"type": "PHONE", "start": 1658, "end": 1672, "text": "(212) 555-0173", "score": 0.9351}, {"type": "SSN", "start": 1701, "end": 1712, "text": "432-19-8765", "score": 0.9993}, {"type": "DATEOFBIRTH", "start": 1737, "end": 1747, "text": "1991-08-23", "score": 0.8961}, {"type": "AGE", "start": 1784, "end": 1786, "text": "34", "score": 0.8913}, {"type": "FIRSTNAME", "start": 1881, "end": 1886, "text": "Aisha", "score": 0.9981}, {"type": "LASTNAME", "start": 1887, "end": 1893, "text": "Rahman", "score": 0.9909}, {"type": "ACCOUNTNAME", "start": 1917, "end": 1926, "text": "MRN-88241", "score": 0.6954}, {"type": "DATE", "start": 1959, "end": 1969, "text": "2026-03-09", "score": 0.9863}, {"type": "FIRSTNAME", "start": 1979, "end": 1985, "text": "Steven", "score": 0.8845}, {"type": "LASTNAME", "start": 1986, "end": 1990, "text": "Cole", "score": 0.9792}, {"type": "PHONE", "start": 2046, "end": 2058, "text": "617-555-0136", "score": 0.9964}, {"type": "EMAIL", "start": 2122, "end": 2149, "text": "aisha.rahman@protonmail.com", "score": 0.9985}, {"type": "FIRSTNAME", "start": 2257, "end": 2263, "text": "Robert", "score": 0.6067}, {"type": "DATE", "start": 2278, "end": 2288, "text": "2026-07-14", "score": 0.9341}, {"type": "PASSWORD", "start": 2310, "end": 2316, "text": "QX7T2P", "score": 0.846}, {"type": "VRM", "start": 2361, "end": 2370, "text": "C03X45678", "score": 
0.5681}, {"type": "DATE", "start": 2386, "end": 2396, "text": "2029-02-28", "score": 0.8575}, {"type": "BUILDINGNUMBER", "start": 2438, "end": 2441, "text": "220", "score": 0.9903}, {"type": "STREET", "start": 2442, "end": 2458, "text": "West Quay Street", "score": 0.9987}, {"type": "CITY", "start": 2460, "end": 2467, "text": "Seattle", "score": 0.9819}, {"type": "STATE", "start": 2469, "end": 2471, "text": "WA", "score": 0.5203}, {"type": "ZIPCODE", "start": 2472, "end": 2477, "text": "98119", "score": 0.9986}, {"type": "EMAIL", "start": 2555, "end": 2581, "text": "l.tran@meridian-travel.com", "score": 0.9598}, {"type": "IPADDRESS", "start": 2669, "end": 2679, "text": "10.42.7.18", "score": 0.9941}, {"type": "USERNAME", "start": 2741, "end": 2761, "text": "r.santos@datacore.io", "score": 0.7571}, {"type": "FIRSTNAME", "start": 2785, "end": 2791, "text": "Olivia", "score": 0.9991}, {"type": "LASTNAME", "start": 2792, "end": 2798, "text": "Brooks", "score": 0.7488}, {"type": "PHONE", "start": 2816, "end": 2828, "text": "408-555-0190", "score": 0.9994}, {"type": "AMOUNT", "start": 3014, "end": 3019, "text": "77120", "score": 0.7878}, {"type": "AMOUNT", "start": 3024, "end": 3032, "text": "8,400.00", "score": 0.8164}, {"type": "FIRSTNAME", "start": 3096, "end": 3103, "text": "Gregory", "score": 0.9939}, {"type": "LASTNAME", "start": 3104, "end": 3108, "text": "Hale", "score": 0.9873}, {"type": "BUILDINGNUMBER", "start": 3133, "end": 3134, "text": "9", "score": 0.9141}, {"type": "STREET", "start": 3135, "end": 3148, "text": "Chancery Lane", "score": 0.99}, {"type": "STATE", "start": 3150, "end": 3156, "text": "London", "score": 0.5794}, {"type": "EMAIL", "start": 3198, "end": 3223, "text": "g.hale@haleforsythe.co.uk", "score": 0.9578}, {"type": "PHONE", "start": 3249, "end": 3262, "text": "020 7946 0830", "score": 0.9996}, {"type": "FIRSTNAME", "start": 3371, "end": 3376, "text": "Nadia", "score": 0.9993}, {"type": "LASTNAME", "start": 3377, "end": 3382, "text": 
"Costa", "score": 0.9959}, {"type": "FIRSTNAME", "start": 3432, "end": 3438, "text": "Sandra", "score": 0.9703}, {"type": "LASTNAME", "start": 3439, "end": 3448, "text": "Whitfield", "score": 0.8962}, {"type": "DATE", "start": 3503, "end": 3513, "text": "2026-02-17", "score": 0.7866}, {"type": "FIRSTNAME", "start": 3570, "end": 3576, "text": "Marcus", "score": 0.9965}, {"type": "LASTNAME", "start": 3577, "end": 3581, "text": "Bell", "score": 0.9603}, {"type": "AMOUNT", "start": 3628, "end": 3636, "text": "4,975.50", "score": 0.9163}, {"type": "DATE", "start": 3679, "end": 3689, "text": "2026-03-15", "score": 0.5489}, {"type": "BUILDINGNUMBER", "start": 3721, "end": 3723, "text": "14", "score": 0.9561}, {"type": "STREET", "start": 3724, "end": 3742, "text": "Maplewood Crescent", "score": 0.9979}, {"type": "EMAIL", "start": 3768, "end": 3789, "text": "s.whitfield@ymail.com", "score": 0.9064}, {"type": "FIRSTNAME", "start": 3877, "end": 3882, "text": "Kevin", "score": 0.934}, {"type": "LASTNAME", "start": 3883, "end": 3887, "text": "Park", "score": 0.6699}, {"type": "DATE", "start": 3935, "end": 3945, "text": "2026-01-22", "score": 0.9796}, {"type": "HEIGHT", "start": 4009, "end": 4013, "text": "3.82", "score": 0.8439}, {"type": "BUILDINGNUMBER", "start": 4043, "end": 4046, "text": "305", "score": 0.9999}, {"type": "STREET", "start": 4047, "end": 4060, "text": "College Drive", "score": 0.9743}, {"type": "CITY", "start": 4062, "end": 4068, "text": "Austin", "score": 0.9752}, {"type": "ZIPCODE", "start": 4073, "end": 4078, "text": "78705", "score": 0.9757}, {"type": "PHONE", "start": 4110, "end": 4122, "text": "737-555-0164", "score": 0.756}, {"type": "EMAIL", "start": 4185, "end": 4209, "text": "kevin.park@westbrook.edu", "score": 0.825}, {"type": "BUILDINGNUMBER", "start": 4276, "end": 4278, "text": "62", "score": 0.9962}, {"type": "STREET", "start": 4279, "end": 4296, "text": "Lantern Hill Road", "score": 0.8804}, {"type": "CITY", "start": 4298, "end": 4307, "text": 
"Asheville", "score": 0.8121}, {"type": "ZIPCODE", "start": 4312, "end": 4317, "text": "28801", "score": 0.9987}, {"type": "DATE", "start": 4335, "end": 4345, "text": "2026-08-05", "score": 0.9904}, {"type": "AMOUNT", "start": 4365, "end": 4375, "text": "415,000.00", "score": 0.5984}, {"type": "FIRSTNAME", "start": 4389, "end": 4394, "text": "Henry", "score": 0.9869}, {"type": "FIRSTNAME", "start": 4399, "end": 4405, "text": "Claire", "score": 0.9668}, {"type": "LASTNAME", "start": 4406, "end": 4413, "text": "Donovan", "score": 0.9812}, {"type": "PHONE", "start": 4519, "end": 4531, "text": "828-555-0107", "score": 0.9969}, {"type": "DATE", "start": 4686, "end": 4696, "text": "2026-09-01", "score": 0.8857}, {"type": "FIRSTNAME", "start": 4757, "end": 4762, "text": "Felix", "score": 0.9951}, {"type": "LASTNAME", "start": 4763, "end": 4768, "text": "Otero", "score": 0.984}, {"type": "EMAIL", "start": 4800, "end": 4819, "text": "felix.otero@hey.com", "score": 0.9833}, {"type": "DATE", "start": 5013, "end": 5023, "text": "2026-10-19", "score": 0.9709}, {"type": "FIRSTNAME", "start": 5037, "end": 5044, "text": "Eleanor", "score": 0.9274}, {"type": "LASTNAME", "start": 5045, "end": 5049, "text": "Voss", "score": 0.9932}, {"type": "FIRSTNAME", "start": 5089, "end": 5094, "text": "Aaron", "score": 0.9891}, {"type": "LASTNAME", "start": 5095, "end": 5099, "text": "Feld", "score": 0.9957}, {"type": "EMAIL", "start": 5234, "end": 5252, "text": "a.feld@feldlaw.com", "score": 0.872}]}
\ No newline at end of file
diff --git a/demo/traces/scan/engines.json b/demo/traces/scan/engines.json
new file mode 100644
index 0000000..dfac2fe
--- /dev/null
+++ b/demo/traces/scan/engines.json
@@ -0,0 +1,7 @@
+[
+  {
+    "label": "privacy-filter.cpp",
+    "device": "Raspberry Pi 5 · CPU · q8 @ 1.5 GHz",
+    "tps": 360.0
+  }
+]
\ No newline at end of file
diff --git a/docker/Dockerfile.arm64 b/docker/Dockerfile.arm64
new file mode 100644
index 0000000..4b777e5
--- /dev/null
+++ b/docker/Dockerfile.arm64
@@ -0,0 +1,36 @@
+# Cross-build privacy-filter.cpp for arm64 (aarch64) via docker buildx + qemu.
+#
+#   docker buildx build --platform linux/arm64 -f docker/Dockerfile.arm64 \
+#       --target export --output type=local,dest=build/arm64 .
+#
+# The build runs under qemu user emulation (slow but reproducible). The base is
+# ubuntu:24.04 so the toolchain (gcc 13.3, glibc 2.39) and resulting ABI match a
+# Raspberry Pi 5 running Ubuntu 24.04 -- the binaries drop straight onto the Pi.
+#
+# By default it targets the Pi 5's Cortex-A76 (armv8.2-a + dotprod + fp16), which
+# is what ggml's quantized kernels exploit. Override --build-arg GGML_CPU_ARM_ARCH
+# for a different chip, or set it to a baseline like "armv8-a" for a portable
+# (slower) binary. --build-arg JOBS=N sets the build parallelism (default 4).
+ARG UBUNTU=24.04
+
+FROM ubuntu:${UBUNTU} AS build
+ARG GGML_CPU_ARM_ARCH=armv8.2-a+dotprod+fp16
+ARG JOBS=4
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        build-essential cmake git ca-certificates \
+    && rm -rf /var/lib/apt/lists/*
+WORKDIR /src
+COPY . .
+RUN cmake -S . -B build/arm64 \
+        -DCMAKE_BUILD_TYPE=Release \
+        -DGGML_NATIVE=OFF \
+        -DGGML_CPU_ARM_ARCH=${GGML_CPU_ARM_ARCH} \
+        -DBUILD_SHARED_LIBS=OFF \
+        -DPF_BUILD_TESTS=OFF \
+    && cmake --build build/arm64 -j${JOBS} --target pf-cli pf-bench
+
+# Thin export stage: `--output type=local` against this target writes just the
+# two binaries to the host (no image layers, no apt cruft).
+FROM scratch AS export
+COPY --from=build /src/build/arm64/pf-cli  /
+COPY --from=build /src/build/arm64/pf-bench /
diff --git a/model-cards/privacy-filter-multilingual.md b/model-cards/privacy-filter-multilingual.md
index 610d51f..56ba3c3 100644
--- a/model-cards/privacy-filter-multilingual.md
+++ b/model-cards/privacy-filter-multilingual.md
@@ -35,7 +35,7 @@ language:
   - zh
 ---
 
-# privacy-filter-multilingual — GGUF (F16)
+# privacy-filter-multilingual — GGUF (F16 + Q8_0)
 
 GGUF conversion of [`OpenMed/privacy-filter-multilingual`](https://huggingface.co/OpenMed/privacy-filter-multilingual),
 a multilingual PII **token-classification** model (a fine-tune of
@@ -109,11 +109,32 @@ upstream llama.cpp. It runs on:
 
 | File | Precision | Size | Notes |
 |---|---|---|---|
-| `privacy-filter-multilingual-f16.gguf` | F16 | ~2.7 GB | 217 `classifier.output_labels`; `pooling_type = TOKEN_CLS`. Validated artifact. |
-
-F16 is the validated, shipped precision. Quantized variants are deferred until they can be
-evaluated with a **task metric (span-F1 per language) + KL-vs-F16** — perplexity is meaningless
-for a classifier, so a naively-quantized GGUF is not published here yet.
+| `privacy-filter-multilingual-f16.gguf` | F16 | ~2.7 GB | Reference artifact. 217 `classifier.output_labels`; `pooling_type = TOKEN_CLS`. |
+| `privacy-filter-multilingual-q8.gguf` | Q8_0 (experts) | ~1.6 GB | MoE expert weights → Q8_0, the rest F16. For RAM-constrained / edge use. |
+
+`sha256 (q8): 968135172ba8202374b4c3bd7d353e100c8fc574035da793fa4d13ca441319b7`
+
+**Q8_0 quantization — and why it isn't free.** `q8` stores the bulk of the weights (the MoE
+expert matrices) as 8-bit integers instead of 16-bit floats — via
+[`scripts/requant_q8.py`](https://github.com/localai-org/privacy-filter.cpp/blob/master/scripts/requant_q8.py),
+with attention, embeddings and the classifier head left at F16. That cuts the download by about
+40% (≈2.7 GB → ≈1.6 GB) and is usually a bit faster on CPU.
+
+The catch: **reducing precision throws information away, and it is almost never a free lunch.**
+Our checks didn't find a regression — on a mixed-PII document (1,360 tokens) q8 matched f16 on
+**100%** of token labels and produced identical spans, with an average prediction shift
+(KL divergence) of just 6.9e-5. But "we didn't find a difference" is not the same as "there is
+none." Those numbers come from a single English document, and a tiny *average* shift can still
+hide a flip on the one input that matters to you — a rare name, an unusual phone or ID format, or
+a language we never tested. **Accuracy benchmarks and divergence metrics routinely look
+reassuring right up until the case that bites.** For PII detection a single missed span is a
+leak, so:
+
+- **Prefer F16** if you can afford the ~2.7 GB — it is the reference these numbers are measured
+  against, and what we trust by default.
+- **Use Q8_0** when memory or speed forces it (e.g. a 4 GB Raspberry Pi 5), treat it as a
+  deliberate tradeoff, and **validate it on your own data** first. A full span-F1-per-language
+  sweep across the 16 languages is the bar we'd want before calling q8 a true drop-in.
 
 ## Architecture & conversion
 
diff --git a/model-cards/privacy-filter.md b/model-cards/privacy-filter.md
index 44e5066..1187805 100644
--- a/model-cards/privacy-filter.md
+++ b/model-cards/privacy-filter.md
@@ -17,7 +17,7 @@ tags:
   - openai-privacy-filter
 ---
 
-# privacy-filter — GGUF (F16)
+# privacy-filter — GGUF (F16 + Q8_0)
 
 GGUF conversion of [`openai/privacy-filter`](https://huggingface.co/openai/privacy-filter),
 OpenAI's bidirectional PII **token-classification** model. It labels every token with a BIOES
@@ -74,13 +74,26 @@ upstream llama.cpp. It runs on:
 
 | File | Precision | Size | Notes |
 |---|---|---|---|
-| `privacy-filter-f16.gguf` | F16 | 2.82 GB | 156 tensors; 33 `classifier.output_labels`; `pooling_type = TOKEN_CLS`. |
-
-`sha256: eb71312b6b9370d0fe582e576b840567bb06603c4de241c6d899205d1b04dc81`
-
-F16 is the validated, shipped precision. Quantized variants are deferred until they can be
-evaluated with a **task metric (span-F1) + KL-vs-F16** — perplexity is meaningless for a
-classifier, so a naively-quantized GGUF is not published here yet.
+| `privacy-filter-f16.gguf` | F16 | 2.82 GB | Reference artifact. 156 tensors; 33 `classifier.output_labels`; `pooling_type = TOKEN_CLS`. |
+| `privacy-filter-q8.gguf` | Q8_0 (experts) | ~1.6 GB | MoE expert weights → Q8_0, the rest F16. For RAM-constrained / edge use. |
+
+`sha256 (f16): eb71312b6b9370d0fe582e576b840567bb06603c4de241c6d899205d1b04dc81`
+`sha256 (q8):  80efc1803eda7c095a79741d2008c07e2e0a57b01bac8825fbeb448fd097998c`
+
+**Q8_0 quantization — and why it isn't free.** `q8` stores the bulk of the weights (the MoE
+expert matrices) as 8-bit integers instead of 16-bit floats — via
+[`scripts/requant_q8.py`](https://github.com/localai-org/privacy-filter.cpp/blob/master/scripts/requant_q8.py),
+with attention, embeddings and the classifier head left at F16. That cuts the download by just
+over 40% (2.82 GB → ≈1.6 GB) and is usually a bit faster on CPU.
+
+The catch: **reducing precision throws information away, and it is almost never a free lunch.**
+On a mixed-PII document (1,360 tokens) q8 matched f16 on **99.7%** of token labels, with an
+average prediction shift (KL divergence) of 1.1e-3 — close, but note it did **not** match on all
+of them; a few tokens flipped. That is the point in miniature: a reassuring average still hides the
+specific cases that change, and **accuracy benchmarks tend to look fine until the one that
+bites.** For PII detection a missed span is a leak, so **prefer F16 when you can afford it** (it
+is the reference precision) and treat **Q8_0 as a deliberate size/speed tradeoff** for
+constrained hardware — ideally re-checked on your own data.
 
 ## Architecture & conversion
 
diff --git a/scripts/bench_torch.py b/scripts/bench_torch.py
index 41192c4..76818a6 100644
--- a/scripts/bench_torch.py
+++ b/scripts/bench_torch.py
@@ -50,8 +50,13 @@ def main() -> int:
     ap.add_argument("--attn", default="sdpa", choices=["sdpa", "eager"])
     ap.add_argument("--lengths", default="189,756,2898,11403,45234",
                     help="comma-separated token counts (match pf-bench output)")
-    ap.add_argument("--iters", type=int, default=3)
+    ap.add_argument("--iters", type=int, default=10)
+    ap.add_argument("--warmup", type=int, default=5,
+                    help="untimed warm-up iters (small/fast lengths need several)")
     ap.add_argument("--threads", type=int, default=0, help="CPU threads (0 = torch default)")
+    ap.add_argument("--compile", action="store_true", help="wrap the model in torch.compile")
+    ap.add_argument("--compile-mode", default="default",
+                    choices=["default", "reduce-overhead", "max-autotune"])
     args = ap.parse_args()
 
     import torch
@@ -72,6 +77,8 @@ def main() -> int:
     t_load0 = time.perf_counter()
     model = AutoModelForTokenClassification.from_pretrained(
         args.model, dtype=dtype, attn_implementation=args.attn).eval().to(dev)
+    if args.compile:
+        model = torch.compile(model, mode=args.compile_mode)
     if cuda:
         torch.cuda.synchronize()
     t_load1 = time.perf_counter()
@@ -81,8 +88,9 @@ def fwd(ids):
             model(input_ids=ids)
 
     name = torch.cuda.get_device_name(dev) if cuda else f"cpu x{torch.get_num_threads()}"
+    comp = f"compile:{args.compile_mode}" if args.compile else "eager-run"
     print(f"torch {torch.__version__} | tf {transformers.__version__} | {name} | "
-          f"{dtype_name} | {args.attn} | load {t_load1 - t_load0:.2f}s | {args.iters} iters\n")
+          f"{dtype_name} | {args.attn} | {comp} | load {t_load1 - t_load0:.2f}s | {args.iters} iters\n")
     print(f"| {'tokens':>8} | {'forward ms':>11} | {'tok/s':>8} | {'peak MiB':>8} |")
     print("|---------:|------------:|---------:|---------:|")
 
@@ -92,7 +100,8 @@ def fwd(ids):
             if cuda:
                 torch.cuda.reset_peak_memory_stats(dev)
                 torch.cuda.synchronize()
-            fwd(ids)  # warm-up (lazy kernel/autotune, allocator growth)
+            for _ in range(args.warmup):    # warm-up: compile, cuBLAS algo pick,
+                fwd(ids)                    # workspace alloc, allocator growth
             if cuda:
                 torch.cuda.synchronize()
                 ev0, ev1 = torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True)
diff --git a/scripts/publish_hf.py b/scripts/publish_hf.py
index 910ca53..be778af 100755
--- a/scripts/publish_hf.py
+++ b/scripts/publish_hf.py
@@ -20,10 +20,16 @@
         --gguf ~/models/privacy-filter/privacy-filter-f16.gguf          # dry-run
 
     python scripts/publish_hf.py --model privacy-filter-multilingual \\
-        --gguf .../privacy-filter-multilingual-f16.gguf --upload         # push
+        --gguf .../privacy-filter-multilingual-f16.gguf --upload         # push f16
+
+    python scripts/publish_hf.py --model privacy-filter-multilingual --quant q8 \\
+        --gguf .../privacy-filter-multilingual-q8.gguf --upload          # push q8
 
     python scripts/publish_hf.py --model privacy-filter-multilingual \\
         --card-only --upload                  # sync just the card (README.md)
+
+The q8 GGUF (experts-only Q8_0, ~1.6 GB) is produced by scripts/requant_q8.py
+from the f16 and lands alongside it in the same repo; the card lists both.
 """
 from __future__ import annotations
 
@@ -37,16 +43,19 @@
 
 HF_ORG = "LocalAI-io"
 
-# key -> (HF repo id, published GGUF filename, model-card filename under model-cards/)
-MODELS: dict[str, tuple[str, str, str]] = {
+# key -> (HF repo id, {quant: published GGUF filename}, model-card filename).
+# f16 and q8 live side by side in the same repo (the card lists both).
+MODELS: dict[str, tuple[str, dict[str, str], str]] = {
     "privacy-filter-multilingual": (
         f"{HF_ORG}/privacy-filter-multilingual-GGUF",
-        "privacy-filter-multilingual-f16.gguf",
+        {"f16": "privacy-filter-multilingual-f16.gguf",
+         "q8":  "privacy-filter-multilingual-q8.gguf"},
         "privacy-filter-multilingual.md",
     ),
     "privacy-filter": (
         f"{HF_ORG}/privacy-filter-GGUF",
-        "privacy-filter-f16.gguf",
+        {"f16": "privacy-filter-f16.gguf",
+         "q8":  "privacy-filter-q8.gguf"},
         "privacy-filter.md",
     ),
 }
@@ -67,7 +76,9 @@ def main() -> int:
     ap.add_argument("--model", required=True, choices=sorted(MODELS),
                     help="which model to publish")
     ap.add_argument("--gguf", type=Path, default=None,
-                    help="path to the converted <name>-f16.gguf (omit with --card-only)")
+                    help="path to the converted <name>-<quant>.gguf (omit with --card-only)")
+    ap.add_argument("--quant", default="f16", choices=["f16", "q8"],
+                    help="which precision this --gguf is, picks the published filename (default f16)")
     ap.add_argument("--card-only", action="store_true",
                     help="sync only the model card (README.md); leave the published GGUF untouched")
     ap.add_argument("--repo", default=None, help="override the target HF repo id")
@@ -75,7 +86,8 @@ def main() -> int:
                     help="actually push (default: dry-run, contacts nothing)")
     args = ap.parse_args()
 
-    repo, gguf_name, card_name = MODELS[args.model]
+    repo, gguf_names, card_name = MODELS[args.model]
+    gguf_name = gguf_names[args.quant]
     repo = args.repo or repo
     card = CARDS_DIR / card_name
 
@@ -94,7 +106,7 @@ def main() -> int:
         print("gguf:    (card-only — published GGUF left untouched)")
     else:
         size = args.gguf.stat().st_size
-        print(f"gguf:    {args.gguf}  ({size / 1e9:.2f} GB)  uploaded as {gguf_name}")
+        print(f"gguf:    {args.gguf}  ({size / 1e9:.2f} GB, {args.quant})  uploaded as {gguf_name}")
         print(f"sha256:  {sha256(args.gguf)}   <- pin this in the LocalAI gallery entry")
 
     if not args.upload:
@@ -116,7 +128,7 @@ def main() -> int:
         api.upload_file(
             path_or_fileobj=str(args.gguf), path_in_repo=gguf_name,
             repo_id=repo, repo_type="model",
-            commit_message=f"gguf: {gguf_name} (f16)",
+            commit_message=f"gguf: {gguf_name} ({args.quant})",
         )
     print(f"done -> https://huggingface.co/{repo}")
     return 0
diff --git a/src/backend.cpp b/src/backend.cpp
index b757b84..b713dc2 100644
--- a/src/backend.cpp
+++ b/src/backend.cpp
@@ -7,6 +7,7 @@
 #include <cctype>
 #include <cstdlib>
 #include <cstring>
+#include <fstream>
 #include <thread>
 
 namespace pf {
@@ -18,6 +19,20 @@ std::string lower(std::string s) {
     return s;
 }
 
+// Default CPU threads = physical cores. SMT siblings only add contention for
+// matmul-heavy work, so the logical count is halved (x86: logical == 2x physical)
+// -- but ONLY when sysfs reports SMT active. ARM / Apple silicon have no SMT, and
+// a blanket /2 there silently throws away half the cores.
+unsigned default_cpu_threads() {
+    unsigned logical = std::max(1u, std::thread::hardware_concurrency());  // clamp a 0 result to 1
+    std::ifstream smt("/sys/devices/system/cpu/smt/active");  // Linux sysfs SMT status
+    int on = 0;
+    if (smt >> on && on == 1) {  // a missing/unreadable file fails the read -> no halving
+        return std::max(1u, logical / 2);  // assumes 2 hardware threads per core; never below 1
+    }
+    return logical;  // SMT off or not detectable: use every logical CPU
+}
+
 // "cuda:1" -> ("cuda", 1); "vulkan" -> ("vulkan", 0); "" -> ("", 0).
 void parse_device(const std::string & req, std::string & name, int & index) {
     const size_t colon = req.find(':');
@@ -67,9 +82,7 @@ bool engine_backend::init(const std::string & device_req, int n_threads) {
             if (int v = std::atoi(env)) n_threads = v;
         }
         if (n_threads <= 0) {
-            // ggml's default is 4 threads; matmul-heavy work wants the
-            // physical cores (SMT siblings only add contention here)
-            n_threads = std::max(1u, std::thread::hardware_concurrency() / 2);
+            n_threads = (int) default_cpu_threads();
         }
         set_cpu_threads(be, n_threads);
     } else if (name == "gpu" || name == "cuda" || name == "vulkan") {