diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 9db7094..4484741 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -54,6 +54,9 @@ jobs:
         id: cache
         uses: actions/cache@v4
         with:
+          # HF safetensors download + reference fixtures only. The GGUF is
+          # (re)converted from these on every run, so it always reflects the
+          # current scripts/convert.py rather than a stale cached artifact.
           path: |
             ~/models/privacy-filter-multilingual
             tests/fixtures/hf
@@ -63,24 +66,38 @@ jobs:
           python3 -m venv .venv
           .venv/bin/pip install -q torch --index-url https://download.pytorch.org/whl/cpu
           .venv/bin/pip install -q -r scripts/requirements.txt
-      - name: fetch model + convert + dump fixtures
+      - name: fetch model + dump fixtures
         if: steps.cache.outputs.cache-hit != 'true'
         run: |
           .venv/bin/pip install -q "huggingface_hub[cli]"
           .venv/bin/hf download OpenMed/privacy-filter-multilingual \
             --local-dir ~/models/privacy-filter-multilingual
-          # GGUF conversion lives in the llama.cpp fork (same files serve both
-          # engines); the cache is seeded with pf-rope2-f16.gguf + pf-f32.gguf
-          # once, manually. Without them the model-label tests skip (exit 77).
           .venv/bin/python scripts/hf_dump.py \
             --model ~/models/privacy-filter-multilingual --out tests/fixtures/hf
+      - name: convert HF -> GGUF
+        run: |
+          # Conversion is part of the tested path: the parity suite below gates
+          # these freshly converted GGUFs against the HF reference fixtures, so a
+          # scripts/convert.py regression fails CI. ~/ggufs is deliberately
+          # outside the cached paths so every run reconverts with the current
+          # script. (The f16 is the shipped artifact, published at
+          # huggingface.co/LocalAI-io; the f32 adds the tight exact-rotation
+          # parity gate, cos >= 0.99999, that isolates conversion errors from
+          # f16 rounding.)
+          mkdir -p ~/ggufs
+          .venv/bin/python scripts/convert.py \
+            --model ~/models/privacy-filter-multilingual \
+            --outfile ~/ggufs/pf-rope2-f16.gguf --outtype f16
+          .venv/bin/python scripts/convert.py \
+            --model ~/models/privacy-filter-multilingual \
+            --outfile ~/ggufs/pf-f32.gguf --outtype f32
       - name: build
         run: cmake --preset release -DGGML_NATIVE=OFF && cmake --build --preset release -j
       - name: parity suite
-        run: PF_GGUF_DIR=~/models/privacy-filter-multilingual ctest --preset release -L model
+        run: PF_GGUF_DIR=~/ggufs ctest --preset release -L model
       - name: fuzz smoke (5 min/target)
         run: |
           cmake --preset fuzz && cmake --build --preset fuzz -j --target fuzz_tokenizer fuzz_gguf
-          PF_GGUF=~/models/privacy-filter-multilingual/pf-rope2-f16.gguf \
+          PF_GGUF=~/ggufs/pf-rope2-f16.gguf \
             ./build/fuzz/fuzz_tokenizer -max_total_time=300 -max_len=4096
           ./build/fuzz/fuzz_gguf -max_total_time=300 -max_len=8192
diff --git a/README.md b/README.md
index 857ec5b..093a920 100644
--- a/README.md
+++ b/README.md
@@ -8,8 +8,12 @@ PII/NER entity spans with exact UTF-8 byte offsets. Stock upstream ggml — no
 patches; the model's YaRN `truncate=false` frequencies are computed at load
 time and fed to `ggml_rope_ext` as `freq_factors`.
 
-Uses the same GGUF files as the llama.cpp-based path (arch
-`openai-privacy-filter`, converted by the llama.cpp-fork converter).
+Pre-converted GGUFs (arch `openai-privacy-filter`):
+[`LocalAI-io/privacy-filter-multilingual-GGUF`](https://huggingface.co/LocalAI-io/privacy-filter-multilingual-GGUF)
+and [`LocalAI-io/privacy-filter-GGUF`](https://huggingface.co/LocalAI-io/privacy-filter-GGUF).
+Convert your own from a HF checkpoint with
+[`scripts/convert.py`](scripts/convert.py) — self-contained, no llama.cpp
+dependency (see [Convert](#convert)).
 
 ## Build
 
@@ -34,6 +38,23 @@ echo "Contact John Doe at jdoe@example.com" | \
   build/release/pf-cli --classify model.gguf 0.5       # [cpu|cuda|vulkan]
 ```
 
+## Convert
+
+Pre-converted GGUFs are linked above. To convert an `OpenAIPrivacyFilter` HF
+checkpoint yourself:
+
+```sh
+pip install -r scripts/requirements.txt   # torch + safetensors + gguf
+python scripts/convert.py --model <hf-model-dir> --outfile model-f16.gguf
+python scripts/convert.py --model <hf-model-dir> --outfile model-f32.gguf --outtype f32
+```
+
+[`scripts/convert.py`](scripts/convert.py) reads `config.json` +
+`model.safetensors` + `tokenizer.json` and emits the GGUF directly — it does
+**not** depend on llama.cpp or its converter. The nightly CI converts the model
+this way and gates the result against the HF reference logits, so the converter
+stays in parity (`.github/workflows/ci.yml`).
+
 ## C API
 
 Flat C API in [`include/pf.h`](include/pf.h): an opaque `pf_ctx` handle and
@@ -80,9 +101,11 @@ pf_free(ctx);
 
 ```sh
 ctest --preset debug -LE model            # fast suite, sanitizers, no assets
-# reference fixtures (one-time, pinned env: scripts/requirements.txt):
+# reference fixtures + GGUF (one-time, pinned env: scripts/requirements.txt):
 python scripts/hf_dump.py --model <hf-model-dir> --out tests/fixtures/hf
-PF_GGUF_DIR=<dir-with-ggufs> ctest --preset release          # full parity
+python scripts/convert.py --model <hf-model-dir> --outfile ggufs/pf-rope2-f16.gguf
+python scripts/convert.py --model <hf-model-dir> --outfile ggufs/pf-f32.gguf --outtype f32
+PF_GGUF_DIR=ggufs ctest --preset release                     # full parity (f16 + tight f32)
 PF_DEVICE=vulkan PF_GGUF_DIR=... ctest --preset release -L model   # on GPU
 ```
 
diff --git a/fuzz/fuzz_tokenizer.cpp b/fuzz/fuzz_tokenizer.cpp
index 52819e6..1d70920 100644
--- a/fuzz/fuzz_tokenizer.cpp
+++ b/fuzz/fuzz_tokenizer.cpp
@@ -4,7 +4,9 @@
 //   - encode: valid ids, start < end, non-decreasing starts, every byte
 //     covered (offsets are widened to UTF-8 char boundaries, so tokens may
 //     overlap on a multibyte char but never leave gaps)
-// With PF_GGUF set the full encode path runs; otherwise pretokenize only.
+// With PF_GGUF set to a loadable GGUF the full encode path runs; unset runs
+// pretokenize-only. PF_GGUF set but missing is a hard error (exit 1) — setting
+// it requests full-encode fuzzing, so the file has to be there.
 #include "tokenizer.h"
 
 #include <cstdio>
@@ -19,6 +21,17 @@ extern "C" int LLVMFuzzerInitialize(int *, char ***) {
         std::fprintf(stderr, "fuzz_tokenizer: PF_GGUF unset, pretokenize-only mode\n");
         return 0;
     }
+    // PF_GGUF was set, so full-encode fuzzing was requested: the GGUF is a hard
+    // requirement. Exit cleanly (exit 1, not abort -> no core dump) when it's
+    // missing, so CI fails loudly instead of silently fuzzing pretokenize-only.
+    // CI generates it with scripts/convert.py; a missing file means misconfig.
+    // A file that exists but won't load is a real bug, so that path aborts below.
+    if (FILE * f = std::fopen(gguf, "rb")) {
+        std::fclose(f);
+    } else {
+        std::fprintf(stderr, "fuzz_tokenizer: PF_GGUF set but missing: %s\n", gguf);
+        std::exit(1);
+    }
     ggml_context * gctx = nullptr;
     gguf_init_params params = { /*no_alloc =*/ true, &gctx };
     gguf_context * g = gguf_init_from_file(gguf, params);
diff --git a/model-cards/privacy-filter-multilingual.md b/model-cards/privacy-filter-multilingual.md
new file mode 100644
index 0000000..610d51f
--- /dev/null
+++ b/model-cards/privacy-filter-multilingual.md
@@ -0,0 +1,161 @@
+---
+license: apache-2.0
+base_model: OpenMed/privacy-filter-multilingual
+base_model_relation: quantized
+pipeline_tag: token-classification
+library_name: gguf
+tags:
+  - gguf
+  - privacy-filter.cpp
+  - llama-cpp
+  - localai
+  - token-classification
+  - pii
+  - ner
+  - privacy
+  - redaction
+  - multilingual
+  - openai-privacy-filter
+language:
+  - ar
+  - bn
+  - de
+  - en
+  - es
+  - fr
+  - hi
+  - it
+  - ja
+  - ko
+  - nl
+  - pt
+  - te
+  - tr
+  - vi
+  - zh
+---
+
+# privacy-filter-multilingual — GGUF (F16)
+
+GGUF conversion of [`OpenMed/privacy-filter-multilingual`](https://huggingface.co/OpenMed/privacy-filter-multilingual),
+a multilingual PII **token-classification** model (a fine-tune of
+[`openai/privacy-filter`](https://huggingface.co/openai/privacy-filter)). It labels every
+token with a BIOES tag over **54 PII categories (217 classes)** across **16 languages**, so
+it can be served locally with **no Python** as the encoder/NER tier of a PII redactor.
+
+For the full model description, label space, evaluation, limitations, and citations, see the
+**[source model card](https://huggingface.co/OpenMed/privacy-filter-multilingual)** — this
+card only covers the GGUF packaging and how to run it.
+
+## Runtimes
+
+This GGUF uses a **custom architecture, `openai-privacy-filter`**, that is not (yet) part of
+upstream llama.cpp. It runs on:
+
+1. **[privacy-filter.cpp](https://github.com/localai-org/privacy-filter.cpp)** *(recommended)* —
+   a small standalone GGML engine for exactly this model family, on **stock upstream ggml with
+   no patches** (CPU / CUDA / Vulkan). This is the reference runtime and what the parity numbers
+   below are measured against.
+
+   ```sh
+   # build (see the repo README for CUDA/Vulkan)
+   cmake --preset release && cmake --build --preset release -j
+   # run
+   echo "Contact John Doe at jdoe@example.com" | \
+     build/release/pf-cli --classify privacy-filter-multilingual-f16.gguf 0.5
+   ```
+
+   It exposes a flat C API (`pf_load` / `pf_classify` → entity spans with UTF-8 byte offsets;
+   `pf_tokenize` / `pf_logits`) shaped for FFI — see the repo README.
+
+2. **[LocalAI](https://github.com/mudler/LocalAI)** — install from the model gallery; LocalAI
+   serves it behind the gRPC `TokenClassify` RPC and runs the constrained BIOES Viterbi decode,
+   returning entity spans. LocalAI drives it through the **`privacy-filter` backend** (which
+   wraps privacy-filter.cpp); older builds used a llama.cpp-patched path. The model is **not** a
+   chat/completion model — it is a PII detector that other models opt into.
+
+   ```bash
+   local-ai models install privacy-filter-multilingual
+   ```
+
+   The gallery entry carries the detection policy in a `pii_detection:` block (default: mask
+   everything detected; block credentials / financial-secrets / crypto). Other models opt in by
+   listing it under `pii.detectors`:
+
+   ```yaml
+   # any chat or cloud-proxy model — opt in and reference the detector(s)
+   name: my-assistant
+   pii:
+     enabled: true
+     detectors:
+       - privacy-filter-multilingual
+   ```
+
+3. **llama.cpp — only with a patch.** Stock `llama.cpp`, `llama-cpp-python`, Ollama, and
+   LM Studio will **fail to load** this file (`unknown model architecture:
+   'openai-privacy-filter'`). The arch can be added with carry-patches (TOKEN_CLS pooling, the
+   architecture + HF→GGUF converter, the bidirectional banded-attention graph, and an all-SWA
+   no-cache mask fix; TOKEN_CLS pooling tracks the still-open
+   [PR #19725](https://github.com/ggml-org/llama.cpp/pull/19725)). Until that support lands
+   upstream, the patched path is carried by LocalAI; `privacy-filter.cpp` above is the
+   patch-free alternative.
+
+> **Pooling note (llama.cpp path only):** the model must be loaded with **TOKEN_CLS pooling**
+> (the GGUF's default). If you drive `llama-embedding` directly for testing, do **not** pass
+> `--pooling none` — that overrides the default and yields raw hidden states instead of label
+> logits. privacy-filter.cpp handles this automatically.
+
+## Files
+
+| File | Precision | Size | Notes |
+|---|---|---|---|
+| `privacy-filter-multilingual-f16.gguf` | F16 | ~2.7 GB | 217 `classifier.output_labels`; `pooling_type = TOKEN_CLS`. Validated artifact. |
+
+F16 is the validated, shipped precision. Quantized variants are deferred until they can be
+evaluated with a **task metric (span-F1 per language) + KL-vs-F16** — perplexity is meaningless
+for a classifier, so a naively-quantized GGUF is not published here yet.
+
+## Architecture & conversion
+
+gpt-oss-style sparse **MoE** (8 layers, `d_model=640`, 128 experts, top-4 routing, ~50M active
+per token), **bidirectional banded attention** (symmetric sliding window 128, attention sinks
+retained), **interleaved (GPT-J) RoPE** with YaRN (θ=150000, factor 32), o200k (`o200k_base`)
+tokenizer, and a 217-way token-classification head (`score` → `cls.output`).
+
+The conversion reproduces the HF reference **exactly at F16**: token-for-token argmax match on
+the parity prompt set, **full-logit cosine = 1.0**, every layer's residual-stream cosine = 1.0
+(relerr ≈ 2e-4, i.e. F16 rounding). The two load-bearing conversion choices — the expert
+`gate_up` `chunk(2)` split and the `n_swa = 2·sliding_window` window mapping — are both
+confirmed by that parity. privacy-filter.cpp re-derives the YaRN `truncate=false` frequencies at
+load time (fed to `ggml_rope_ext` as `freq_factors`) so the same GGUF is interchangeable across
+runtimes.
+
+This GGUF was produced by [`scripts/convert.py`](https://github.com/localai-org/privacy-filter.cpp/blob/master/scripts/convert.py)
+— a self-contained HF→GGUF converter (no llama.cpp dependency). Nightly CI re-runs it and gates
+the output against the HF reference logits, so the published artifact stays in parity.
+
+## Label space
+
+`O` plus `B-`/`I-`/`E-`/`S-` for each of 54 categories (1 + 54×4 = 217), spanning identity,
+contact, address, dates/time, government IDs, financial, crypto, vehicle, digital, and auth
+entities. The ordered `id2label` table is embedded in the GGUF (`classifier.output_labels`).
+See the [source card](https://huggingface.co/OpenMed/privacy-filter-multilingual#label-space-54-categories)
+for the full list.
+
+## Limitations & intended use
+
+Identical to the [source model](https://huggingface.co/OpenMed/privacy-filter-multilingual#limitations--intended-use):
+multilingual but uneven (strongest on de/es/fr/it/hi/te/en; weaker on CJK), trained on
+synthetic AI4Privacy data, **not** a substitute for legal/compliance review, and **not** a
+clinical PHI model. Use it as one tier behind deterministic regex pre-filters and human review.
+
+## License
+
+**Apache-2.0**, inherited from `openai/privacy-filter` and `OpenMed/privacy-filter-multilingual`.
+
+## Credits & citation
+
+Conversion and runtime support by the **LocalAI** project (`privacy-filter.cpp`). The model
+itself is by **OpenMed**, fine-tuned from **OpenAI**'s `privacy-filter`, on **AI4Privacy**
+datasets — please cite all of them (BibTeX in the
+[source card](https://huggingface.co/OpenMed/privacy-filter-multilingual#citation)).
diff --git a/model-cards/privacy-filter.md b/model-cards/privacy-filter.md
new file mode 100644
index 0000000..44e5066
--- /dev/null
+++ b/model-cards/privacy-filter.md
@@ -0,0 +1,117 @@
+---
+license: apache-2.0
+base_model: openai/privacy-filter
+base_model_relation: quantized
+pipeline_tag: token-classification
+library_name: gguf
+tags:
+  - gguf
+  - privacy-filter.cpp
+  - llama-cpp
+  - localai
+  - token-classification
+  - pii
+  - ner
+  - privacy
+  - redaction
+  - openai-privacy-filter
+---
+
+# privacy-filter — GGUF (F16)
+
+GGUF conversion of [`openai/privacy-filter`](https://huggingface.co/openai/privacy-filter),
+OpenAI's bidirectional PII **token-classification** model. It labels every token with a BIOES
+tag over **8 PII categories (33 classes)** in a single forward pass, then decodes coherent
+spans with a constrained Viterbi procedure — so it can be served locally with **no Python** as
+the encoder/NER tier of a PII redactor.
+
+For the full model description, training, evaluation, operating points, limitations, and
+citations, see the **[source model card](https://huggingface.co/openai/privacy-filter)** — this
+card only covers the GGUF packaging and how to run it.
+
+> For broader language coverage (54 categories across 16 languages), see the multilingual
+> fine-tune [`privacy-filter-multilingual` GGUF](https://huggingface.co/LocalAI-io/privacy-filter-multilingual-GGUF).
+
+## Runtimes
+
+This GGUF uses a **custom architecture, `openai-privacy-filter`**, that is not (yet) part of
+upstream llama.cpp. It runs on:
+
+1. **[privacy-filter.cpp](https://github.com/localai-org/privacy-filter.cpp)** *(recommended)* —
+   a small standalone GGML engine for exactly this model family, on **stock upstream ggml with
+   no patches** (CPU / CUDA / Vulkan). This is the reference runtime.
+
+   ```sh
+   # build (see the repo README for CUDA/Vulkan)
+   cmake --preset release && cmake --build --preset release -j
+   # run
+   echo "My name is Alice Smith" | \
+     build/release/pf-cli --classify privacy-filter-f16.gguf 0.5
+   ```
+
+   It exposes a flat C API (`pf_load` / `pf_classify` → entity spans with UTF-8 byte offsets;
+   `pf_tokenize` / `pf_logits`) shaped for FFI — see the repo README.
+
+2. **[LocalAI](https://github.com/mudler/LocalAI)** — install from the model gallery; LocalAI
+   serves it behind the gRPC `TokenClassify` RPC and runs the constrained BIOES Viterbi decode,
+   returning entity spans. LocalAI drives it through the **`privacy-filter` backend** (which
+   wraps privacy-filter.cpp). The model is **not** a chat/completion model — it is a PII
+   detector that other models opt into via a `pii.detectors` list.
+
+3. **llama.cpp — only with a patch.** Stock `llama.cpp`, `llama-cpp-python`, Ollama, and
+   LM Studio will **fail to load** this file (`unknown model architecture:
+   'openai-privacy-filter'`). The arch can be added with carry-patches (TOKEN_CLS pooling, the
+   architecture + HF→GGUF converter, the bidirectional banded-attention graph, and an all-SWA
+   no-cache mask fix; TOKEN_CLS pooling tracks the still-open
+   [PR #19725](https://github.com/ggml-org/llama.cpp/pull/19725)). Until that support lands
+   upstream, `privacy-filter.cpp` above is the patch-free alternative.
+
+> **Pooling note (llama.cpp path only):** the model must be loaded with **TOKEN_CLS pooling**
+> (the GGUF's default). If you drive `llama-embedding` directly for testing, do **not** pass
+> `--pooling none`. privacy-filter.cpp handles this automatically.
+
+## Files
+
+| File | Precision | Size | Notes |
+|---|---|---|---|
+| `privacy-filter-f16.gguf` | F16 | 2.82 GB | 156 tensors; 33 `classifier.output_labels`; `pooling_type = TOKEN_CLS`. |
+
+`sha256: eb71312b6b9370d0fe582e576b840567bb06603c4de241c6d899205d1b04dc81`
+
+F16 is the validated, shipped precision. Quantized variants are deferred until they can be
+evaluated with a **task metric (span-F1) + KL-vs-F16** — perplexity is meaningless for a
+classifier, so a naively-quantized GGUF is not published here yet.
+
+## Architecture & conversion
+
+gpt-oss-style sparse **MoE** (8 layers, `d_model=640`, 128 experts, top-4 routing; ~1.5B total /
+~50M active per token), **bidirectional banded attention** (symmetric sliding window, attention
+sinks retained), **interleaved (GPT-J) RoPE** with YaRN (θ=150000, factor 32), o200k
+(`o200k_base`) tokenizer, and a 33-way token-classification head (`score` → `cls.output`).
+privacy-filter.cpp re-derives the YaRN `truncate=false` frequencies at load time (fed to
+`ggml_rope_ext` as `freq_factors`) so the GGUF is interchangeable across runtimes.
+
+## Label space
+
+`O` plus `B-`/`I-`/`E-`/`S-` for each of 8 categories (1 + 8×4 = 33):
+`account_number`, `private_address`, `private_date`, `private_email`, `private_person`,
+`private_phone`, `private_url`, `secret`. The ordered `id2label` table is embedded in the GGUF
+(`classifier.output_labels`).
+
+## Limitations & intended use
+
+Identical to the [source model](https://huggingface.co/openai/privacy-filter): trained for
+high-throughput data sanitization, **not** a substitute for legal/compliance review, and **not**
+a clinical PHI model. Use it as one tier behind deterministic regex pre-filters and human
+review. For multilingual text, prefer the
+[multilingual fine-tune](https://huggingface.co/LocalAI-io/privacy-filter-multilingual-GGUF).
+
+## License
+
+**Apache-2.0**, inherited from `openai/privacy-filter`.
+
+## Credits & citation
+
+Model by **OpenAI** (`openai/privacy-filter`). GGUF conversion and runtime support
+(`privacy-filter.cpp`) by the **LocalAI** project. Please cite OpenAI per the
+[source card](https://huggingface.co/openai/privacy-filter).
diff --git a/scripts/convert.py b/scripts/convert.py
new file mode 100644
index 0000000..5b239cb
--- /dev/null
+++ b/scripts/convert.py
@@ -0,0 +1,259 @@
+#!/usr/bin/env python3
+"""Convert an OpenAIPrivacyFilter HF checkpoint to the privacy-filter.cpp GGUF.
+
+Self-contained: reads ``config.json`` + ``model.safetensors`` (single or
+sharded) + ``tokenizer.json`` straight from the HF model dir and emits the
+``openai-privacy-filter`` GGUF that this repo's loader (src/gguf_loader.cpp,
+src/model.cpp) expects. It does NOT depend on llama.cpp or its convert script —
+the architecture is small and fully specified by the loader, so the whole
+mapping lives here.
+
+Models already converted with this script are published at:
+  - https://huggingface.co/LocalAI-io/privacy-filter-multilingual-GGUF
+  - https://huggingface.co/LocalAI-io/privacy-filter-GGUF
+
+Usage:
+  python scripts/convert.py --model ~/models/privacy-filter-multilingual \\
+      --outfile pf-rope2-f16.gguf                # f16 (default), the shipped precision
+  python scripts/convert.py --model <dir> --outtype f32 --outfile pf-f32.gguf
+
+The architecture (gpt-oss MoE body re-purposed as a bidirectional token
+classifier): 8 layers, 14/2 heads, head_dim 64, d_model 640, 128 experts top-4,
+o200k vocab, attention sinks, interleaved RoPE + YaRN, a TOKEN_CLS score head.
+The two load-bearing tensor transforms (verified by the parity test):
+  - experts.gate_up_proj is packed as CONCATENATED halves (gate = first
+    intermediate_size output columns, up = the rest), transposed to
+    [E, 2*inter, in] then split — NOT gpt-oss's interleaved even/odd;
+  - experts.down_proj is transposed (-1,-2) like the gpt-oss dense path.
+The expert matrices are square (inter == d_model == 640), so a wrong transpose
+is silently shape-valid but numerically wrong — parity is the guard.
+RoPE: the loader recomputes the YaRN freq factors from the rope KVs at load
+time (rope.scaling.yarn_truncate=false), so no per-dim rope_freqs are baked.
+"""
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+from pathlib import Path
+
+ARCH = "openai-privacy-filter"
+
+# Token types in tokenizer.ggml.token_type (gguf.TokenType), mirrored here so we
+# don't depend on the enum's import path: NORMAL vocab, CONTROL specials, UNUSED
+# placeholders for the reserved gap ids between the specials and vocab_size.
+TT_NORMAL, TT_CONTROL, TT_UNUSED = 1, 3, 5
+
+# GGUF tensor names whose weight matrices are stored as f16 in an f16 build;
+# everything else (norms, biases, attn sinks, the router weight) stays f32.
+# Matches the published reference file's per-tensor precision exactly.
+F16_SUFFIXES = (
+    "attn_q.weight", "attn_k.weight", "attn_v.weight", "attn_output.weight",
+    "ffn_gate_exps.weight", "ffn_up_exps.weight", "ffn_down_exps.weight",
+)
+F16_GLOBALS = ("token_embd.weight", "cls.output.weight")
+
+
+def gpt2_byte_encoder() -> dict[int, str]:
+    # GPT-2/o200k byte<->unicode: printable bytes map to themselves, the rest to
+    # 0x100+n in order. tokenizer.json vocab keys are already in this encoding,
+    # so we only need it to byte-encode any added-token content we synthesize.
+    bs = list(range(ord("!"), ord("~") + 1)) + list(range(0xA1, 0xAD)) + list(range(0xAE, 0x100))
+    cs = bs[:]
+    n = 0
+    for b in range(256):
+        if b not in bs:
+            bs.append(b)
+            cs.append(0x100 + n)
+            n += 1
+    return {b: chr(c) for b, c in zip(bs, cs)}
+
+
+def load_tokenizer(model_dir: Path):
+    """Extract (tokens, token_types, merges, eos, pad) from tokenizer.json.
+
+    Mirrors llama.cpp's gpt2 vocab handling: base BPE vocab as NORMAL, special
+    added tokens as CONTROL, and reserved gap ids up to vocab_size filled with
+    ``[PAD{i}]`` placeholders typed UNUSED.
+    """
+    tj = json.loads((model_dir / "tokenizer.json").read_text())
+    cfg = json.loads((model_dir / "config.json").read_text())
+    vocab_size = cfg["vocab_size"]
+
+    enc = gpt2_byte_encoder()
+    byte_encode = lambda s: "".join(enc[b] for b in s.encode("utf-8"))
+
+    tokens: list[str | None] = [None] * vocab_size
+    types = [TT_UNUSED] * vocab_size
+    for tok, tid in tj["model"]["vocab"].items():
+        tokens[tid] = tok
+        types[tid] = TT_NORMAL
+    for at in tj.get("added_tokens", []):
+        tid = at["id"]
+        if 0 <= tid < vocab_size:
+            tokens[tid] = byte_encode(at["content"])
+            types[tid] = TT_CONTROL if at.get("special") else TT_NORMAL
+    for i in range(vocab_size):
+        if tokens[i] is None:
+            tokens[i] = f"[PAD{i}]"  # reserved gap id; never matches real text
+
+    merges_raw = tj["model"]["merges"]
+    merges = [m if isinstance(m, str) else f"{m[0]} {m[1]}" for m in merges_raw]
+
+    eos = cfg.get("eos_token_id")
+    pad = cfg.get("pad_token_id", eos)
+    return tokens, types, merges, eos, pad
+
+
+def load_state_dict(model_dir: Path):
+    """Yield (name, torch.Tensor) for every weight, from single or sharded
+    safetensors, one tensor resident at a time."""
+    from safetensors import safe_open
+
+    index = model_dir / "model.safetensors.index.json"
+    if index.is_file():
+        weight_map = json.loads(index.read_text())["weight_map"]
+        shard_of = {name: model_dir / shard for name, shard in weight_map.items()}
+    else:
+        single = model_dir / "model.safetensors"
+        with safe_open(single, framework="pt") as f:
+            shard_of = {name: single for name in f.keys()}
+
+    handles: dict[Path, object] = {}
+    for name, path in shard_of.items():
+        if path not in handles:
+            handles[path] = safe_open(path, framework="pt")
+        yield name, handles[path].get_tensor(name)
+
+
+def main() -> int:
+    ap = argparse.ArgumentParser(description=__doc__,
+                                 formatter_class=argparse.RawDescriptionHelpFormatter)
+    ap.add_argument("--model", required=True, type=Path, help="HF model directory")
+    ap.add_argument("--outfile", required=True, type=Path, help="output .gguf path")
+    ap.add_argument("--outtype", choices=("f16", "f32"), default="f16")
+    ap.add_argument("--name", default=None, help="general.name (default: model dir name)")
+    args = ap.parse_args()
+
+    import torch
+    import gguf
+
+    cfg = json.loads((args.model / "config.json").read_text())
+    n_layer = cfg["num_hidden_layers"]
+    inter = cfg["intermediate_size"]
+    rope = cfg.get("rope_parameters") or cfg.get("rope_scaling") or {}
+    id2label = {int(k): v for k, v in cfg["id2label"].items()}
+    labels = [id2label[i] for i in range(len(id2label))]
+
+    writer = gguf.GGUFWriter(str(args.outfile), ARCH)  # writes general.architecture
+
+    # --- metadata -----------------------------------------------------------
+    writer.add_string("general.type", "model")
+    writer.add_string("general.name", args.name or args.model.resolve().name)
+    writer.add_uint32("general.file_type", 1 if args.outtype == "f16" else 0)
+    writer.add_uint32("general.quantization_version", 2)
+
+    writer.add_uint32(f"{ARCH}.block_count", n_layer)
+    writer.add_uint32(f"{ARCH}.context_length", cfg["max_position_embeddings"])
+    writer.add_uint32(f"{ARCH}.embedding_length", cfg["hidden_size"])
+    writer.add_uint32(f"{ARCH}.feed_forward_length", inter)
+    writer.add_uint32(f"{ARCH}.expert_feed_forward_length", inter)
+    writer.add_uint32(f"{ARCH}.attention.head_count", cfg["num_attention_heads"])
+    writer.add_uint32(f"{ARCH}.attention.head_count_kv", cfg["num_key_value_heads"])
+    writer.add_uint32(f"{ARCH}.attention.key_length", cfg["head_dim"])
+    writer.add_uint32(f"{ARCH}.attention.value_length", cfg["head_dim"])
+    writer.add_float32(f"{ARCH}.attention.layer_norm_rms_epsilon", cfg["rms_norm_eps"])
+    writer.add_uint32(f"{ARCH}.attention.sliding_window", cfg["sliding_window"])
+    writer.add_uint32(f"{ARCH}.expert_count", cfg["num_local_experts"])
+    writer.add_uint32(f"{ARCH}.expert_used_count", cfg["num_experts_per_tok"])
+
+    writer.add_float32(f"{ARCH}.rope.freq_base", rope["rope_theta"])
+    writer.add_string(f"{ARCH}.rope.scaling.type", "yarn")
+    writer.add_float32(f"{ARCH}.rope.scaling.factor", rope["factor"])
+    writer.add_uint32(f"{ARCH}.rope.scaling.original_context_length",
+                      rope["original_max_position_embeddings"])
+    writer.add_float32(f"{ARCH}.rope.scaling.yarn_beta_fast", rope["beta_fast"])
+    writer.add_float32(f"{ARCH}.rope.scaling.yarn_beta_slow", rope["beta_slow"])
+    writer.add_bool(f"{ARCH}.rope.scaling.yarn_truncate", bool(rope.get("truncate", False)))
+
+    writer.add_uint32(f"{ARCH}.pooling_type", 5)  # TOKEN_CLS
+    writer.add_uint32(f"{ARCH}.embedding_length_out", len(labels))
+    writer.add_array(f"{ARCH}.classifier.output_labels", labels)
+
+    # --- tokenizer ----------------------------------------------------------
+    tokens, types, merges, eos, pad = load_tokenizer(args.model)
+    writer.add_string("tokenizer.ggml.model", "gpt2")
+    writer.add_string("tokenizer.ggml.pre", "gpt-4o")
+    writer.add_array("tokenizer.ggml.tokens", tokens)
+    writer.add_array("tokenizer.ggml.token_type", types)
+    writer.add_array("tokenizer.ggml.merges", merges)
+    if eos is not None:
+        writer.add_uint32("tokenizer.ggml.eos_token_id", eos)
+    if pad is not None:
+        writer.add_uint32("tokenizer.ggml.padding_token_id", pad)
+
+    # --- tensors ------------------------------------------------------------
+    def emit(name: str, t: "torch.Tensor"):
+        is_f16 = args.outtype == "f16" and (name in F16_GLOBALS or name.endswith(F16_SUFFIXES))
+        t = t.to(torch.float16 if is_f16 else torch.float32).contiguous()
+        writer.add_tensor(name, t.numpy())
+
+    n_emitted = 0
+    for name, t in load_state_dict(args.model):
+        if name == "model.embed_tokens.weight":
+            emit("token_embd.weight", t)
+        elif name == "model.norm.weight":
+            emit("output_norm.weight", t)
+        elif name == "score.weight":
+            emit("cls.output.weight", t)
+        elif name == "score.bias":
+            emit("cls.output.bias", t)
+        elif name.startswith("model.layers."):
+            bid = int(name.split(".")[2])
+            sub = name.split(".", 3)[3]
+            p = f"blk.{bid}."
+            simple = {
+                "input_layernorm.weight": "attn_norm.weight",
+                "self_attn.q_proj.weight": "attn_q.weight",
+                "self_attn.q_proj.bias": "attn_q.bias",
+                "self_attn.k_proj.weight": "attn_k.weight",
+                "self_attn.k_proj.bias": "attn_k.bias",
+                "self_attn.v_proj.weight": "attn_v.weight",
+                "self_attn.v_proj.bias": "attn_v.bias",
+                "self_attn.o_proj.weight": "attn_output.weight",
+                "self_attn.o_proj.bias": "attn_output.bias",
+                "self_attn.sinks": "attn_sinks.weight",
+                "post_attention_layernorm.weight": "post_attention_norm.weight",
+                "mlp.router.weight": "ffn_gate_inp.weight",
+                "mlp.router.bias": "ffn_gate_inp.bias",
+                "mlp.experts.down_proj_bias": "ffn_down_exps.bias",
+            }
+            if sub in simple:
+                emit(p + simple[sub], t)
+            elif sub == "mlp.experts.down_proj":
+                emit(p + "ffn_down_exps.weight", t.transpose(-1, -2))
+            elif sub == "mlp.experts.gate_up_proj":
+                w = t.transpose(-1, -2)              # [E, 2*inter, in]
+                emit(p + "ffn_gate_exps.weight", w[:, :inter, :])
+                emit(p + "ffn_up_exps.weight",   w[:, inter:, :])
+                n_emitted += 1                       # two tensors from one source
+            elif sub == "mlp.experts.gate_up_proj_bias":
+                emit(p + "ffn_gate_exps.bias", t[..., :inter])
+                emit(p + "ffn_up_exps.bias",   t[..., inter:])
+                n_emitted += 1
+            else:
+                sys.exit(f"unmapped tensor: {name}")
+        else:
+            sys.exit(f"unmapped tensor: {name}")
+        n_emitted += 1
+
+    writer.write_header_to_file()
+    writer.write_kv_data_to_file()
+    writer.write_tensors_to_file(progress=True)
+    writer.close()
+    print(f"\nwrote {args.outfile}  ({n_emitted} tensors, {args.outtype}, {len(labels)} labels)")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/scripts/publish_hf.py b/scripts/publish_hf.py
new file mode 100755
index 0000000..910ca53
--- /dev/null
+++ b/scripts/publish_hf.py
@@ -0,0 +1,153 @@
+#!/usr/bin/env python3
+"""Publish privacy-filter.cpp GGUF models + model cards to the HuggingFace Hub.
+
+The GGUFs are produced by ``scripts/convert.py`` (self-contained — reads
+config.json + model.safetensors + tokenizer.json, no llama.cpp dependency):
+
+    python scripts/convert.py --model <hf-model-dir> --outfile <name>-f16.gguf
+
+This script uploads a converted GGUF plus its **version-controlled** model card
+(``model-cards/<key>.md`` -> the repo's ``README.md``) to the matching HF repo,
+so the published card never drifts from the one in this repo.
+
+**DRY-RUN BY DEFAULT** — without ``--upload`` it prints what it *would* push
+(repo, files, size, sha256) and never contacts HuggingFace. Pass ``--upload``
+to perform the real push. The sha256 it prints is what the LocalAI gallery
+entry should pin.
+
+Usage:
+    python scripts/publish_hf.py --model privacy-filter \\
+        --gguf ~/models/privacy-filter/privacy-filter-f16.gguf          # dry-run
+
+    python scripts/publish_hf.py --model privacy-filter-multilingual \\
+        --gguf .../privacy-filter-multilingual-f16.gguf --upload         # push
+
+    python scripts/publish_hf.py --model privacy-filter-multilingual \\
+        --card-only --upload                  # sync just the card (README.md)
+"""
+from __future__ import annotations
+
+import argparse
+import hashlib
+import sys
+from pathlib import Path
+
+REPO_ROOT = Path(__file__).resolve().parent.parent
+CARDS_DIR = REPO_ROOT / "model-cards"
+
+HF_ORG = "LocalAI-io"
+
+# key -> (HF repo id, published GGUF filename, model-card filename under model-cards/)
+MODELS: dict[str, tuple[str, str, str]] = {
+    "privacy-filter-multilingual": (
+        f"{HF_ORG}/privacy-filter-multilingual-GGUF",
+        "privacy-filter-multilingual-f16.gguf",
+        "privacy-filter-multilingual.md",
+    ),
+    "privacy-filter": (
+        f"{HF_ORG}/privacy-filter-GGUF",
+        "privacy-filter-f16.gguf",
+        "privacy-filter.md",
+    ),
+}
+
+
+def _expanded_path(value: str) -> Path:
+    """argparse ``type=`` helper: build a ``Path`` with a leading ``~`` expanded.
+
+    The shell leaves ``~`` alone in forms such as ``--gguf=~/x.gguf`` or a quoted
+    path, so expand it here rather than report a confusing "GGUF not found".
+    """
+    return Path(value).expanduser()
+
+
+def sha256(path: Path) -> str:
+    """Return the hex SHA-256 digest of the file at ``path``.
+
+    The file is read in 1 MiB chunks, so it is never loaded into memory whole.
+    """
+    h = hashlib.sha256()
+    with path.open("rb") as f:
+        # iter(callable, sentinel): call f.read() until it returns b"" (EOF).
+        for chunk in iter(lambda: f.read(1 << 20), b""):
+            h.update(chunk)
+    return h.hexdigest()
+
+
+def main() -> int:
+    """Validate inputs, print the publish plan, and push it when ``--upload`` is set.
+
+    Returns 0 on success (a dry-run counts as success). Invalid input, or a
+    missing ``huggingface_hub`` when uploading, exits through ``sys.exit`` with
+    a message.
+    """
+    ap = argparse.ArgumentParser(
+        description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter
+    )
+    ap.add_argument("--model", required=True, choices=sorted(MODELS),
+                    help="which model to publish")
+    ap.add_argument("--gguf", type=_expanded_path, default=None,
+                    help="path to the converted <name>-f16.gguf (omit with --card-only)")
+    ap.add_argument("--card-only", action="store_true",
+                    help="sync only the model card (README.md); leave the published GGUF untouched")
+    ap.add_argument("--repo", default=None, help="override the target HF repo id")
+    ap.add_argument("--upload", action="store_true",
+                    help="actually push (default: dry-run, contacts nothing)")
+    args = ap.parse_args()
+
+    repo, gguf_name, card_name = MODELS[args.model]
+    repo = args.repo or repo
+    card = CARDS_DIR / card_name
+
+    # Validate before printing or uploading, so a dry-run rejects the same bad
+    # input a real push would.
+    if not card.is_file():
+        sys.exit(f"model card not found: {card}")
+    if not args.card_only:
+        if args.gguf is None:
+            sys.exit("--gguf is required unless --card-only is given")
+        if not args.gguf.is_file():
+            sys.exit(f"GGUF not found: {args.gguf}")
+
+    print(f"model:   {args.model}")
+    print(f"repo:    https://huggingface.co/{repo}")
+    print(f"card:    {card.relative_to(REPO_ROOT)} -> README.md")
+    if args.card_only:
+        print("gguf:    (card-only — published GGUF left untouched)")
+    else:
+        size = args.gguf.stat().st_size
+        print(f"gguf:    {args.gguf}  ({size / 1e9:.2f} GB)  uploaded as {gguf_name}")
+        print(f"sha256:  {sha256(args.gguf)}   <- pin this in the LocalAI gallery entry")
+
+    if not args.upload:
+        print("\n[dry-run] nothing uploaded. Re-run with --upload to push.")
+        return 0
+
+    # Imported only on the upload path: a dry-run needs neither huggingface_hub
+    # nor network access.
+    try:
+        from huggingface_hub import HfApi
+    except ImportError:
+        sys.exit('--upload needs huggingface_hub: pip install "huggingface_hub"')
+
+    api = HfApi()
+    api.create_repo(repo, repo_type="model", exist_ok=True)
+    print("\nuploading README.md ...")
+    api.upload_file(
+        path_or_fileobj=str(card), path_in_repo="README.md",
+        repo_id=repo, repo_type="model",
+        commit_message=f"card: sync from privacy-filter.cpp ({card_name})",
+    )
+    if not args.card_only:
+        print(f"uploading {gguf_name} ...")
+        api.upload_file(
+            path_or_fileobj=str(args.gguf), path_in_repo=gguf_name,
+            repo_id=repo, repo_type="model",
+            commit_message=f"gguf: {gguf_name} (f16)",
+        )
+    print(f"done -> https://huggingface.co/{repo}")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/scripts/requirements.txt b/scripts/requirements.txt
index 40c26a7..34c44d4 100644
--- a/scripts/requirements.txt
+++ b/scripts/requirements.txt
@@ -1,8 +1,11 @@
-# Reference environment for fixture generation (hf_dump.py) and tokenizer
-# differential tests (hf_tok_diff.py). The openai_privacy_filter model type
-# needs transformers >= 5.6.
+# Reference environment for fixture generation (hf_dump.py), tokenizer
+# differential tests (hf_tok_diff.py), and HF->GGUF conversion (convert.py).
+# The openai_privacy_filter model type needs transformers >= 5.6.
 transformers==5.9.0
 tokenizers==0.22.2
 torch>=2.11
 numpy
 safetensors
+# convert.py reads safetensors/config/tokenizer and writes GGUF; it needs only
+# torch + safetensors + gguf (no transformers).
+gguf>=0.10
diff --git a/tests/test_parity.cpp b/tests/test_parity.cpp
index 53b6fcc..e18d511 100644
--- a/tests/test_parity.cpp
+++ b/tests/test_parity.cpp
@@ -137,36 +137,29 @@ int main() {
         std::fprintf(stderr, "PF_GGUF_DIR or PF_FIXTURES not set, skipping\n");
         return 77;
     }
-    {
-        std::vector<char> probe;
-        if (!read_file(std::string(fixtures) + "/short-en/tokens.i32", probe)) {
-            std::fprintf(stderr, "fixtures not generated (scripts/hf_dump.py), skipping\n");
-            return 77;
-        }
-    }
-
-    // The f16 GGUF is the primary gate. PF_GGUF_DIR may be set (the CI cache
-    // restores the directory) while the GGUF itself is absent — conversion
-    // lives in the llama.cpp fork and the cache is seeded separately (see
-    // .github/workflows/ci.yml). Skip cleanly rather than fail in that case.
+    // The skip above is the only legitimate one: model testing wasn't requested
+    // (the fast tier runs -LE model; this is the local "full suite, no assets"
+    // path). Once PF_GGUF_DIR/PF_FIXTURES are set the operator IS asking for
+    // parity, so every asset is a hard requirement -- a missing one fails loudly
+    // rather than silently skipping the gate. CI regenerates them all on every
+    // run (scripts/hf_dump.py + scripts/convert.py), so a missing file is a real
+    // error, not a reason to skip.
+    auto require_asset = [](const std::string & path, const char * what) {
+        if (FILE * f = std::fopen(path.c_str(), "rb")) { std::fclose(f); return; }
+        failures++;
+        std::fprintf(stderr, "FAIL: required %s missing: %s\n", what, path.c_str());
+    };
     const char * f16_name = std::getenv("PF_GGUF_NAME");
     const std::string f16 = std::string(gguf_dir) + "/" + (f16_name ? f16_name : "pf-rope2-f16.gguf");
-    if (FILE * f = std::fopen(f16.c_str(), "rb")) {
-        std::fclose(f);
-    } else {
-        std::fprintf(stderr, "%s absent, skipping\n", f16.c_str());
-        return 77;
-    }
+    const std::string f32 = std::string(gguf_dir) + "/pf-f32.gguf";
+    require_asset(std::string(fixtures) + "/short-en/tokens.i32", "fixtures (scripts/hf_dump.py)");
+    require_asset(f16, "f16 GGUF (scripts/convert.py)");
+    require_asset(f32, "f32 GGUF (scripts/convert.py --outtype f32)");
+    if (failures) return 1;
 
     // f32 GGUF: tight per-row gates vs the exact-rotation reference
-    const std::string f32 = std::string(gguf_dir) + "/pf-f32.gguf";
-    if (FILE * f = std::fopen(f32.c_str(), "rb")) {
-        std::fclose(f);
-        if (is_gpu()) run_model(f32, fixtures, "logits.f32", 0.998, 0.15);
-        else          run_model(f32, fixtures, "logits.f32", 0.99999, 1e-2);
-    } else {
-        std::fprintf(stderr, "note: %s absent, skipping f32 gates\n", f32.c_str());
-    }
+    if (is_gpu()) run_model(f32, fixtures, "logits.f32", 0.998, 0.15);
+    else          run_model(f32, fixtures, "logits.f32", 0.99999, 1e-2);
 
     // f16 GGUF: production-file gate vs the stock reference
     run_model(f16, fixtures, "logits_stock.f32", is_gpu() ? 0.998 : 0.999, is_gpu() ? 0.15 : 5e-2);
diff --git a/tests/test_window_stitch.cpp b/tests/test_window_stitch.cpp
index bcc28ff..fcea3ba 100644
--- a/tests/test_window_stitch.cpp
+++ b/tests/test_window_stitch.cpp
@@ -44,23 +44,25 @@ int main() {
         std::fprintf(stderr, "PF_GGUF_DIR or PF_FIXTURES not set, skipping\n");
         return 77;
     }
+    // The skip above is the only legitimate one (model testing not requested).
+    // Once PF_GGUF_DIR/PF_FIXTURES are set, the GGUF and fixture are hard
+    // requirements -- CI regenerates both every run (scripts/convert.py +
+    // scripts/hf_dump.py), so a missing asset fails loudly rather than silently
+    // skipping the gate.
     std::vector<int32_t> ids;
     if (!read_i32(std::string(fixtures) + "/long-3k/tokens.i32", ids)) {
-        std::fprintf(stderr, "long-3k fixture missing, skipping\n");
-        return 77;
+        std::fprintf(stderr, "FAIL: required long-3k fixture missing (scripts/hf_dump.py)\n");
+        return 1;
     }
     const int n = (int) ids.size();
 
-    // PF_GGUF_DIR may be set (the CI cache restores the directory) while the
-    // GGUF itself is absent — the cache is seeded separately. Skip cleanly
-    // rather than fail when the file is missing (see .github/workflows/ci.yml).
     const char * f16_name = std::getenv("PF_GGUF_NAME");
     const std::string f16 = std::string(gguf_dir) + "/" + (f16_name ? f16_name : "pf-rope2-f16.gguf");
     if (FILE * f = std::fopen(f16.c_str(), "rb")) {
         std::fclose(f);
     } else {
-        std::fprintf(stderr, "%s absent, skipping\n", f16.c_str());
-        return 77;
+        std::fprintf(stderr, "FAIL: required f16 GGUF missing (scripts/convert.py): %s\n", f16.c_str());
+        return 1;
     }
     pf::model m;
     if (!m.load(f16, std::getenv("PF_DEVICE") ? std::getenv("PF_DEVICE") : "cpu", 0)) {