diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 9db7094..4484741 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -54,6 +54,9 @@ jobs: id: cache uses: actions/cache@v4 with: + # HF safetensors download + reference fixtures only. The GGUF is + # (re)converted from these on every run, so it always reflects the + # current scripts/convert.py rather than a stale cached artifact. path: | ~/models/privacy-filter-multilingual tests/fixtures/hf @@ -63,24 +66,38 @@ jobs: python3 -m venv .venv .venv/bin/pip install -q torch --index-url https://download.pytorch.org/whl/cpu .venv/bin/pip install -q -r scripts/requirements.txt - - name: fetch model + convert + dump fixtures + - name: fetch model + dump fixtures if: steps.cache.outputs.cache-hit != 'true' run: | .venv/bin/pip install -q "huggingface_hub[cli]" .venv/bin/hf download OpenMed/privacy-filter-multilingual \ --local-dir ~/models/privacy-filter-multilingual - # GGUF conversion lives in the llama.cpp fork (same files serve both - # engines); the cache is seeded with pf-rope2-f16.gguf + pf-f32.gguf - # once, manually. Without them the model-label tests skip (exit 77). .venv/bin/python scripts/hf_dump.py \ --model ~/models/privacy-filter-multilingual --out tests/fixtures/hf + - name: convert HF -> GGUF + run: | + # Conversion is part of the tested path: the parity suite below gates + # these freshly converted GGUFs against the HF reference fixtures, so a + # scripts/convert.py regression fails CI. ~/ggufs is deliberately + # outside the cached paths so every run reconverts with the current + # script. (The f16 is the shipped artifact, published at + # huggingface.co/LocalAI-io; the f32 adds the tight exact-rotation + # parity gate, cos >= 0.99999, that isolates conversion errors from + # f16 rounding.) 
+ mkdir -p ~/ggufs + .venv/bin/python scripts/convert.py \ + --model ~/models/privacy-filter-multilingual \ + --outfile ~/ggufs/pf-rope2-f16.gguf --outtype f16 + .venv/bin/python scripts/convert.py \ + --model ~/models/privacy-filter-multilingual \ + --outfile ~/ggufs/pf-f32.gguf --outtype f32 - name: build run: cmake --preset release -DGGML_NATIVE=OFF && cmake --build --preset release -j - name: parity suite - run: PF_GGUF_DIR=~/models/privacy-filter-multilingual ctest --preset release -L model + run: PF_GGUF_DIR=~/ggufs ctest --preset release -L model - name: fuzz smoke (5 min/target) run: | cmake --preset fuzz && cmake --build --preset fuzz -j --target fuzz_tokenizer fuzz_gguf - PF_GGUF=~/models/privacy-filter-multilingual/pf-rope2-f16.gguf \ + PF_GGUF=~/ggufs/pf-rope2-f16.gguf \ ./build/fuzz/fuzz_tokenizer -max_total_time=300 -max_len=4096 ./build/fuzz/fuzz_gguf -max_total_time=300 -max_len=8192 diff --git a/README.md b/README.md index 857ec5b..093a920 100644 --- a/README.md +++ b/README.md @@ -8,8 +8,12 @@ PII/NER entity spans with exact UTF-8 byte offsets. Stock upstream ggml — no patches; the model's YaRN `truncate=false` frequencies are computed at load time and fed to `ggml_rope_ext` as `freq_factors`. -Uses the same GGUF files as the llama.cpp-based path (arch -`openai-privacy-filter`, converted by the llama.cpp-fork converter). +Pre-converted GGUFs (arch `openai-privacy-filter`): +[`LocalAI-io/privacy-filter-multilingual-GGUF`](https://huggingface.co/LocalAI-io/privacy-filter-multilingual-GGUF) +and [`LocalAI-io/privacy-filter-GGUF`](https://huggingface.co/LocalAI-io/privacy-filter-GGUF). +Convert your own from a HF checkpoint with +[`scripts/convert.py`](scripts/convert.py) — self-contained, no llama.cpp +dependency (see [Convert](#convert)). ## Build @@ -34,6 +38,23 @@ echo "Contact John Doe at jdoe@example.com" | \ build/release/pf-cli --classify model.gguf 0.5 # [cpu|cuda|vulkan] ``` +## Convert + +Pre-converted GGUFs are linked above. 
To convert an `OpenAIPrivacyFilter` HF +checkpoint yourself: + +```sh +pip install -r scripts/requirements.txt # torch + safetensors + gguf +python scripts/convert.py --model /path/to/hf-model --outfile model-f16.gguf +python scripts/convert.py --model /path/to/hf-model --outfile model-f32.gguf --outtype f32 +``` + +[`scripts/convert.py`](scripts/convert.py) reads `config.json` + +`model.safetensors` + `tokenizer.json` and emits the GGUF directly — it does +**not** depend on llama.cpp or its converter. The nightly CI converts the model +this way and gates the result against the HF reference logits, so the converter +stays in parity (`.github/workflows/ci.yml`). + ## C API Flat C API in [`include/pf.h`](include/pf.h): an opaque `pf_ctx` handle and @@ -80,9 +101,11 @@ pf_free(ctx); ```sh ctest --preset debug -LE model # fast suite, sanitizers, no assets -# reference fixtures (one-time, pinned env: scripts/requirements.txt): +# reference fixtures + GGUF (one-time, pinned env: scripts/requirements.txt): python scripts/hf_dump.py --model /path/to/hf-model --out tests/fixtures/hf -PF_GGUF_DIR=/path/to/ggufs ctest --preset release # full parity +python scripts/convert.py --model /path/to/hf-model --outfile ggufs/pf-rope2-f16.gguf +python scripts/convert.py --model /path/to/hf-model --outfile ggufs/pf-f32.gguf --outtype f32 +PF_GGUF_DIR=ggufs ctest --preset release # full parity (f16 + tight f32) PF_DEVICE=vulkan PF_GGUF_DIR=... ctest --preset release -L model # on GPU ``` diff --git a/fuzz/fuzz_tokenizer.cpp b/fuzz/fuzz_tokenizer.cpp index 52819e6..1d70920 100644 --- a/fuzz/fuzz_tokenizer.cpp +++ b/fuzz/fuzz_tokenizer.cpp @@ -4,7 +4,9 @@ // - encode: valid ids, start < end, non-decreasing starts, every byte // covered (offsets are widened to UTF-8 char boundaries, so tokens may // overlap on a multibyte char but never leave gaps) -// With PF_GGUF set the full encode path runs; otherwise pretokenize only. +// With PF_GGUF set to a loadable GGUF the full encode path runs; unset runs +// pretokenize-only. 
PF_GGUF set but missing is a hard error (exit 1) — setting +// it requests full-encode fuzzing, so the file has to be there. #include "tokenizer.h" #include @@ -19,6 +21,17 @@ extern "C" int LLVMFuzzerInitialize(int *, char ***) { std::fprintf(stderr, "fuzz_tokenizer: PF_GGUF unset, pretokenize-only mode\n"); return 0; } + // PF_GGUF was set, so full-encode fuzzing was requested: the GGUF is a hard + // requirement. Exit cleanly (exit 1, not abort -> no core dump) when it's + // missing, so CI fails loudly instead of silently fuzzing pretokenize-only. + // CI generates it with scripts/convert.py; a missing file means misconfig. + // A file that exists but won't load is a real bug, so that path aborts below. + if (FILE * f = std::fopen(gguf, "rb")) { + std::fclose(f); + } else { + std::fprintf(stderr, "fuzz_tokenizer: PF_GGUF set but missing: %s\n", gguf); + std::exit(1); + } ggml_context * gctx = nullptr; gguf_init_params params = { /*no_alloc =*/ true, &gctx }; gguf_context * g = gguf_init_from_file(gguf, params); diff --git a/model-cards/privacy-filter-multilingual.md b/model-cards/privacy-filter-multilingual.md new file mode 100644 index 0000000..610d51f --- /dev/null +++ b/model-cards/privacy-filter-multilingual.md @@ -0,0 +1,161 @@ +--- +license: apache-2.0 +base_model: OpenMed/privacy-filter-multilingual +base_model_relation: quantized +pipeline_tag: token-classification +library_name: gguf +tags: + - gguf + - privacy-filter.cpp + - llama-cpp + - localai + - token-classification + - pii + - ner + - privacy + - redaction + - multilingual + - openai-privacy-filter +language: + - ar + - bn + - de + - en + - es + - fr + - hi + - it + - ja + - ko + - nl + - pt + - te + - tr + - vi + - zh +--- + +# privacy-filter-multilingual — GGUF (F16) + +GGUF conversion of [`OpenMed/privacy-filter-multilingual`](https://huggingface.co/OpenMed/privacy-filter-multilingual), +a multilingual PII **token-classification** model (a fine-tune of 
+[`openai/privacy-filter`](https://huggingface.co/openai/privacy-filter)). It labels every +token with a BIOES tag over **54 PII categories (217 classes)** across **16 languages**, so +it can be served locally with **no Python** as the encoder/NER tier of a PII redactor. + +For the full model description, label space, evaluation, limitations, and citations, see the +**[source model card](https://huggingface.co/OpenMed/privacy-filter-multilingual)** — this +card only covers the GGUF packaging and how to run it. + +## Runtimes + +This GGUF uses a **custom architecture, `openai-privacy-filter`**, that is not (yet) part of +upstream llama.cpp. It runs on: + +1. **[privacy-filter.cpp](https://github.com/localai-org/privacy-filter.cpp)** *(recommended)* — + a small standalone GGML engine for exactly this model family, on **stock upstream ggml with + no patches** (CPU / CUDA / Vulkan). This is the reference runtime and what the parity numbers + below are measured against. + + ```sh + # build (see the repo README for CUDA/Vulkan) + cmake --preset release && cmake --build --preset release -j + # run + echo "Contact John Doe at jdoe@example.com" | \ + build/release/pf-cli --classify privacy-filter-multilingual-f16.gguf 0.5 + ``` + + It exposes a flat C API (`pf_load` / `pf_classify` → entity spans with UTF-8 byte offsets; + `pf_tokenize` / `pf_logits`) shaped for FFI — see the repo README. + +2. **[LocalAI](https://github.com/mudler/LocalAI)** — install from the model gallery; LocalAI + serves it behind the gRPC `TokenClassify` RPC and runs the constrained BIOES Viterbi decode, + returning entity spans. LocalAI drives it through the **`privacy-filter` backend** (which + wraps privacy-filter.cpp); older builds used a llama.cpp-patched path. The model is **not** a + chat/completion model — it is a PII detector that other models opt into. 
+ + ```bash + local-ai models install privacy-filter-multilingual + ``` + + The gallery entry carries the detection policy in a `pii_detection:` block (default: mask + everything detected; block credentials / financial-secrets / crypto). Other models opt in by + listing it under `pii.detectors`: + + ```yaml + # any chat or cloud-proxy model — opt in and reference the detector(s) + name: my-assistant + pii: + enabled: true + detectors: + - privacy-filter-multilingual + ``` + +3. **llama.cpp — only with a patch.** Stock `llama.cpp`, `llama-cpp-python`, Ollama, and + LM Studio will **fail to load** this file (`unknown model architecture: + 'openai-privacy-filter'`). The arch can be added with carry-patches (TOKEN_CLS pooling, the + architecture + HF→GGUF converter, the bidirectional banded-attention graph, and an all-SWA + no-cache mask fix; TOKEN_CLS pooling tracks the still-open + [PR #19725](https://github.com/ggml-org/llama.cpp/pull/19725)). Until that support lands + upstream, the patched path is carried by LocalAI; `privacy-filter.cpp` above is the + patch-free alternative. + +> **Pooling note (llama.cpp path only):** the model must be loaded with **TOKEN_CLS pooling** +> (the GGUF's default). If you drive `llama-embedding` directly for testing, do **not** pass +> `--pooling none` — that overrides the default and yields raw hidden states instead of label +> logits. privacy-filter.cpp handles this automatically. + +## Files + +| File | Precision | Size | Notes | +|---|---|---|---| +| `privacy-filter-multilingual-f16.gguf` | F16 | ~2.7 GB | 217 `classifier.output_labels`; `pooling_type = TOKEN_CLS`. Validated artifact. | + +F16 is the validated, shipped precision. Quantized variants are deferred until they can be +evaluated with a **task metric (span-F1 per language) + KL-vs-F16** — perplexity is meaningless +for a classifier, so a naively-quantized GGUF is not published here yet. 
+ +## Architecture & conversion + +gpt-oss-style sparse **MoE** (8 layers, `d_model=640`, 128 experts, top-4 routing, ~50M active +per token), **bidirectional banded attention** (symmetric sliding window 128, attention sinks +retained), **interleaved (GPT-J) RoPE** with YaRN (θ=150000, factor 32), o200k (`o200k_base`) +tokenizer, and a 217-way token-classification head (`score` → `cls.output`). + +The conversion reproduces the HF reference **exactly at F16**: token-for-token argmax match on +the parity prompt set, **full-logit cosine = 1.0**, every layer's residual-stream cosine = 1.0 +(relerr ≈ 2e-4, i.e. F16 rounding). The two load-bearing conversion choices — the expert +`gate_up` `chunk(2)` split and the `n_swa = 2·sliding_window` window mapping — are both +confirmed by that parity. privacy-filter.cpp re-derives the YaRN `truncate=false` frequencies at +load time (fed to `ggml_rope_ext` as `freq_factors`) so the same GGUF is interchangeable across +runtimes. + +This GGUF was produced by [`scripts/convert.py`](https://github.com/localai-org/privacy-filter.cpp/blob/master/scripts/convert.py) +— a self-contained HF→GGUF converter (no llama.cpp dependency). Nightly CI re-runs it and gates +the output against the HF reference logits, so the published artifact stays in parity. + +## Label space + +`O` plus `B-`/`I-`/`E-`/`S-` for each of 54 categories (1 + 54×4 = 217), spanning identity, +contact, address, dates/time, government IDs, financial, crypto, vehicle, digital, and auth +entities. The ordered `id2label` table is embedded in the GGUF (`classifier.output_labels`). +See the [source card](https://huggingface.co/OpenMed/privacy-filter-multilingual#label-space-54-categories) +for the full list. 
+ +## Limitations & intended use + +Identical to the [source model](https://huggingface.co/OpenMed/privacy-filter-multilingual#limitations--intended-use): +multilingual but uneven (strongest on de/es/fr/it/hi/te/en; weaker on CJK), trained on +synthetic AI4Privacy data, **not** a substitute for legal/compliance review, and **not** a +clinical PHI model. Use it as one tier behind deterministic regex pre-filters and human review. + +## License + +**Apache-2.0**, inherited from `openai/privacy-filter` and `OpenMed/privacy-filter-multilingual`. + +## Credits & citation + +Conversion and runtime support by the **LocalAI** project (`privacy-filter.cpp`). The model +itself is by **OpenMed**, fine-tuned from **OpenAI**'s `privacy-filter`, on **AI4Privacy** +datasets — please cite all of them (BibTeX in the +[source card](https://huggingface.co/OpenMed/privacy-filter-multilingual#citation)). diff --git a/model-cards/privacy-filter.md b/model-cards/privacy-filter.md new file mode 100644 index 0000000..44e5066 --- /dev/null +++ b/model-cards/privacy-filter.md @@ -0,0 +1,117 @@ +--- +license: apache-2.0 +base_model: openai/privacy-filter +base_model_relation: quantized +pipeline_tag: token-classification +library_name: gguf +tags: + - gguf + - privacy-filter.cpp + - llama-cpp + - localai + - token-classification + - pii + - ner + - privacy + - redaction + - openai-privacy-filter +--- + +# privacy-filter — GGUF (F16) + +GGUF conversion of [`openai/privacy-filter`](https://huggingface.co/openai/privacy-filter), +OpenAI's bidirectional PII **token-classification** model. It labels every token with a BIOES +tag over **8 PII categories (33 classes)** in a single forward pass, then decodes coherent +spans with a constrained Viterbi procedure — so it can be served locally with **no Python** as +the encoder/NER tier of a PII redactor. 
+ +For the full model description, training, evaluation, operating points, limitations, and +citations, see the **[source model card](https://huggingface.co/openai/privacy-filter)** — this +card only covers the GGUF packaging and how to run it. + +> For broader language coverage (54 categories across 16 languages), see the multilingual +> fine-tune [`privacy-filter-multilingual` GGUF](https://huggingface.co/LocalAI-io/privacy-filter-multilingual-GGUF). + +## Runtimes + +This GGUF uses a **custom architecture, `openai-privacy-filter`**, that is not (yet) part of +upstream llama.cpp. It runs on: + +1. **[privacy-filter.cpp](https://github.com/localai-org/privacy-filter.cpp)** *(recommended)* — + a small standalone GGML engine for exactly this model family, on **stock upstream ggml with + no patches** (CPU / CUDA / Vulkan). This is the reference runtime. + + ```sh + # build (see the repo README for CUDA/Vulkan) + cmake --preset release && cmake --build --preset release -j + # run + echo "My name is Alice Smith" | \ + build/release/pf-cli --classify privacy-filter-f16.gguf 0.5 + ``` + + It exposes a flat C API (`pf_load` / `pf_classify` → entity spans with UTF-8 byte offsets; + `pf_tokenize` / `pf_logits`) shaped for FFI — see the repo README. + +2. **[LocalAI](https://github.com/mudler/LocalAI)** — install from the model gallery; LocalAI + serves it behind the gRPC `TokenClassify` RPC and runs the constrained BIOES Viterbi decode, + returning entity spans. LocalAI drives it through the **`privacy-filter` backend** (which + wraps privacy-filter.cpp). The model is **not** a chat/completion model — it is a PII + detector that other models opt into via a `pii.detectors` list. + +3. **llama.cpp — only with a patch.** Stock `llama.cpp`, `llama-cpp-python`, Ollama, and + LM Studio will **fail to load** this file (`unknown model architecture: + 'openai-privacy-filter'`). 
The arch can be added with carry-patches (TOKEN_CLS pooling, the + architecture + HF→GGUF converter, the bidirectional banded-attention graph, and an all-SWA + no-cache mask fix; TOKEN_CLS pooling tracks the still-open + [PR #19725](https://github.com/ggml-org/llama.cpp/pull/19725)). Until that support lands + upstream, `privacy-filter.cpp` above is the patch-free alternative. + +> **Pooling note (llama.cpp path only):** the model must be loaded with **TOKEN_CLS pooling** +> (the GGUF's default). If you drive `llama-embedding` directly for testing, do **not** pass +> `--pooling none`. privacy-filter.cpp handles this automatically. + +## Files + +| File | Precision | Size | Notes | +|---|---|---|---| +| `privacy-filter-f16.gguf` | F16 | 2.82 GB | 156 tensors; 33 `classifier.output_labels`; `pooling_type = TOKEN_CLS`. | + +`sha256: eb71312b6b9370d0fe582e576b840567bb06603c4de241c6d899205d1b04dc81` + +F16 is the validated, shipped precision. Quantized variants are deferred until they can be +evaluated with a **task metric (span-F1) + KL-vs-F16** — perplexity is meaningless for a +classifier, so a naively-quantized GGUF is not published here yet. + +## Architecture & conversion + +gpt-oss-style sparse **MoE** (8 layers, `d_model=640`, 128 experts, top-4 routing; ~1.5B total / +~50M active per token), **bidirectional banded attention** (symmetric sliding window, attention +sinks retained), **interleaved (GPT-J) RoPE** with YaRN (θ=150000, factor 32), o200k +(`o200k_base`) tokenizer, and a 33-way token-classification head (`score` → `cls.output`). +privacy-filter.cpp re-derives the YaRN `truncate=false` frequencies at load time (fed to +`ggml_rope_ext` as `freq_factors`) so the GGUF is interchangeable across runtimes. + +## Label space + +`O` plus `B-`/`I-`/`E-`/`S-` for each of 8 categories (1 + 8×4 = 33): +`account_number`, `private_address`, `private_date`, `private_email`, `private_person`, +`private_phone`, `private_url`, `secret`. 
The ordered `id2label` table is embedded in the GGUF +(`classifier.output_labels`). + +## Limitations & intended use + +Identical to the [source model](https://huggingface.co/openai/privacy-filter): trained for +high-throughput data sanitization, **not** a substitute for legal/compliance review, and **not** +a clinical PHI model. Use it as one tier behind deterministic regex pre-filters and human +review. For multilingual text, prefer the +[multilingual fine-tune](https://huggingface.co/LocalAI-io/privacy-filter-multilingual-GGUF). + +## License + +**Apache-2.0**, inherited from `openai/privacy-filter`. + +## Credits & citation + +Model by **OpenAI** (`openai/privacy-filter`). GGUF conversion and runtime support +(`privacy-filter.cpp`) by the **LocalAI** project. Please cite OpenAI per the +[source card](https://huggingface.co/openai/privacy-filter). diff --git a/scripts/convert.py b/scripts/convert.py new file mode 100644 index 0000000..5b239cb --- /dev/null +++ b/scripts/convert.py @@ -0,0 +1,259 @@ +#!/usr/bin/env python3 +"""Convert an OpenAIPrivacyFilter HF checkpoint to the privacy-filter.cpp GGUF. + +Self-contained: reads ``config.json`` + ``model.safetensors`` (single or +sharded) + ``tokenizer.json`` straight from the HF model dir and emits the +``openai-privacy-filter`` GGUF that this repo's loader (src/gguf_loader.cpp, +src/model.cpp) expects. It does NOT depend on llama.cpp or its convert script — +the architecture is small and fully specified by the loader, so the whole +mapping lives here. 
+ +Models already converted with this script are published at: + - https://huggingface.co/LocalAI-io/privacy-filter-multilingual-GGUF + - https://huggingface.co/LocalAI-io/privacy-filter-GGUF + +Usage: + python scripts/convert.py --model ~/models/privacy-filter-multilingual \\ + --outfile pf-rope2-f16.gguf # f16 (default), the shipped precision + python scripts/convert.py --model ~/models/privacy-filter-multilingual --outtype f32 --outfile pf-f32.gguf + +The architecture (gpt-oss MoE body re-purposed as a bidirectional token +classifier): 8 layers, 14/2 heads, head_dim 64, d_model 640, 128 experts top-4, +o200k vocab, attention sinks, interleaved RoPE + YaRN, a TOKEN_CLS score head. +The two load-bearing tensor transforms (verified by the parity test): + - experts.gate_up_proj is packed as CONCATENATED halves (gate = first + intermediate_size output columns, up = the rest), transposed to + [E, 2*inter, in] then split — NOT gpt-oss's interleaved even/odd; + - experts.down_proj is transposed (-1,-2) like the gpt-oss dense path. +The expert matrices are square (inter == d_model == 640), so a wrong transpose +is silently shape-valid but numerically wrong — parity is the guard. +RoPE: the loader recomputes the YaRN freq factors from the rope KVs at load +time (rope.scaling.yarn_truncate=false), so no per-dim rope_freqs are baked. +""" +from __future__ import annotations + +import argparse +import json +import sys +from pathlib import Path + +ARCH = "openai-privacy-filter" + +# Token types in tokenizer.ggml.token_type (gguf.TokenType), mirrored here so we +# don't depend on the enum's import path: NORMAL vocab, CONTROL specials, UNUSED +# placeholders for the reserved gap ids between the specials and vocab_size. +TT_NORMAL, TT_CONTROL, TT_UNUSED = 1, 3, 5 + +# GGUF tensor names that carry the weight matrices quantized to f16 in an f16 +# build; everything else (norms, biases, attn sinks, the router weight) stays +# f32. Matches the published reference file's per-tensor precision exactly. 
+F16_SUFFIXES = ( + "attn_q.weight", "attn_k.weight", "attn_v.weight", "attn_output.weight", + "ffn_gate_exps.weight", "ffn_up_exps.weight", "ffn_down_exps.weight", +) +F16_GLOBALS = ("token_embd.weight", "cls.output.weight") + + +def gpt2_byte_encoder() -> dict[int, str]: + # GPT-2/o200k byte<->unicode: printable bytes map to themselves, the rest to + # 0x100+n in order. tokenizer.json vocab keys are already in this encoding, + # so we only need it to byte-encode any added-token content we synthesize. + bs = list(range(ord("!"), ord("~") + 1)) + list(range(0xA1, 0xAD)) + list(range(0xAE, 0x100)) + cs = bs[:] + n = 0 + for b in range(256): + if b not in bs: + bs.append(b) + cs.append(0x100 + n) + n += 1 + return {b: chr(c) for b, c in zip(bs, cs)} + + +def load_tokenizer(model_dir: Path): + """Extract (tokens, token_types, merges, eos, pad) from tokenizer.json. + + Mirrors llama.cpp's gpt2 vocab handling: base BPE vocab as NORMAL, special + added tokens as CONTROL, and reserved gap ids up to vocab_size filled with + ``[PAD{i}]`` placeholders typed UNUSED. 
+ """ + tj = json.loads((model_dir / "tokenizer.json").read_text()) + cfg = json.loads((model_dir / "config.json").read_text()) + vocab_size = cfg["vocab_size"] + + enc = gpt2_byte_encoder() + byte_encode = lambda s: "".join(enc[b] for b in s.encode("utf-8")) + + tokens: list[str | None] = [None] * vocab_size + types = [TT_UNUSED] * vocab_size + for tok, tid in tj["model"]["vocab"].items(): + tokens[tid] = tok + types[tid] = TT_NORMAL + for at in tj.get("added_tokens", []): + tid = at["id"] + if 0 <= tid < vocab_size: + tokens[tid] = byte_encode(at["content"]) + types[tid] = TT_CONTROL if at.get("special") else TT_NORMAL + for i in range(vocab_size): + if tokens[i] is None: + tokens[i] = f"[PAD{i}]" # reserved gap id; never matches real text + + merges_raw = tj["model"]["merges"] + merges = [m if isinstance(m, str) else f"{m[0]} {m[1]}" for m in merges_raw] + + eos = cfg.get("eos_token_id") + pad = cfg.get("pad_token_id", eos) + return tokens, types, merges, eos, pad + + +def load_state_dict(model_dir: Path): + """Yield (name, torch.Tensor) for every weight, from single or sharded + safetensors, one tensor resident at a time.""" + from safetensors import safe_open + + index = model_dir / "model.safetensors.index.json" + if index.is_file(): + weight_map = json.loads(index.read_text())["weight_map"] + shard_of = {name: model_dir / shard for name, shard in weight_map.items()} + else: + single = model_dir / "model.safetensors" + with safe_open(single, framework="pt") as f: + shard_of = {name: single for name in f.keys()} + + handles: dict[Path, object] = {} + for name, path in shard_of.items(): + if path not in handles: + handles[path] = safe_open(path, framework="pt") + yield name, handles[path].get_tensor(name) + + +def main() -> int: + ap = argparse.ArgumentParser(description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter) + ap.add_argument("--model", required=True, type=Path, help="HF model directory") + ap.add_argument("--outfile", required=True, 
type=Path, help="output .gguf path") + ap.add_argument("--outtype", choices=("f16", "f32"), default="f16") + ap.add_argument("--name", default=None, help="general.name (default: model dir name)") + args = ap.parse_args() + + import torch + import gguf + + cfg = json.loads((args.model / "config.json").read_text()) + n_layer = cfg["num_hidden_layers"] + inter = cfg["intermediate_size"] + rope = cfg.get("rope_parameters") or cfg.get("rope_scaling") or {} + id2label = {int(k): v for k, v in cfg["id2label"].items()} + labels = [id2label[i] for i in range(len(id2label))] + + writer = gguf.GGUFWriter(str(args.outfile), ARCH) # writes general.architecture + + # --- metadata ----------------------------------------------------------- + writer.add_string("general.type", "model") + writer.add_string("general.name", args.name or args.model.resolve().name) + writer.add_uint32("general.file_type", 1 if args.outtype == "f16" else 0) + writer.add_uint32("general.quantization_version", 2) + + writer.add_uint32(f"{ARCH}.block_count", n_layer) + writer.add_uint32(f"{ARCH}.context_length", cfg["max_position_embeddings"]) + writer.add_uint32(f"{ARCH}.embedding_length", cfg["hidden_size"]) + writer.add_uint32(f"{ARCH}.feed_forward_length", inter) + writer.add_uint32(f"{ARCH}.expert_feed_forward_length", inter) + writer.add_uint32(f"{ARCH}.attention.head_count", cfg["num_attention_heads"]) + writer.add_uint32(f"{ARCH}.attention.head_count_kv", cfg["num_key_value_heads"]) + writer.add_uint32(f"{ARCH}.attention.key_length", cfg["head_dim"]) + writer.add_uint32(f"{ARCH}.attention.value_length", cfg["head_dim"]) + writer.add_float32(f"{ARCH}.attention.layer_norm_rms_epsilon", cfg["rms_norm_eps"]) + writer.add_uint32(f"{ARCH}.attention.sliding_window", cfg["sliding_window"]) + writer.add_uint32(f"{ARCH}.expert_count", cfg["num_local_experts"]) + writer.add_uint32(f"{ARCH}.expert_used_count", cfg["num_experts_per_tok"]) + + writer.add_float32(f"{ARCH}.rope.freq_base", rope["rope_theta"]) + 
writer.add_string(f"{ARCH}.rope.scaling.type", "yarn") + writer.add_float32(f"{ARCH}.rope.scaling.factor", rope["factor"]) + writer.add_uint32(f"{ARCH}.rope.scaling.original_context_length", + rope["original_max_position_embeddings"]) + writer.add_float32(f"{ARCH}.rope.scaling.yarn_beta_fast", rope["beta_fast"]) + writer.add_float32(f"{ARCH}.rope.scaling.yarn_beta_slow", rope["beta_slow"]) + writer.add_bool(f"{ARCH}.rope.scaling.yarn_truncate", bool(rope.get("truncate", False))) + + writer.add_uint32(f"{ARCH}.pooling_type", 5) # TOKEN_CLS + writer.add_uint32(f"{ARCH}.embedding_length_out", len(labels)) + writer.add_array(f"{ARCH}.classifier.output_labels", labels) + + # --- tokenizer ---------------------------------------------------------- + tokens, types, merges, eos, pad = load_tokenizer(args.model) + writer.add_string("tokenizer.ggml.model", "gpt2") + writer.add_string("tokenizer.ggml.pre", "gpt-4o") + writer.add_array("tokenizer.ggml.tokens", tokens) + writer.add_array("tokenizer.ggml.token_type", types) + writer.add_array("tokenizer.ggml.merges", merges) + if eos is not None: + writer.add_uint32("tokenizer.ggml.eos_token_id", eos) + if pad is not None: + writer.add_uint32("tokenizer.ggml.padding_token_id", pad) + + # --- tensors ------------------------------------------------------------ + def emit(name: str, t: "torch.Tensor"): + is_f16 = args.outtype == "f16" and (name in F16_GLOBALS or name.endswith(F16_SUFFIXES)) + t = t.to(torch.float16 if is_f16 else torch.float32).contiguous() + writer.add_tensor(name, t.numpy()) + + n_emitted = 0 + for name, t in load_state_dict(args.model): + if name == "model.embed_tokens.weight": + emit("token_embd.weight", t) + elif name == "model.norm.weight": + emit("output_norm.weight", t) + elif name == "score.weight": + emit("cls.output.weight", t) + elif name == "score.bias": + emit("cls.output.bias", t) + elif name.startswith("model.layers."): + bid = int(name.split(".")[2]) + sub = name.split(".", 3)[3] + p = 
f"blk.{bid}." + simple = { + "input_layernorm.weight": "attn_norm.weight", + "self_attn.q_proj.weight": "attn_q.weight", + "self_attn.q_proj.bias": "attn_q.bias", + "self_attn.k_proj.weight": "attn_k.weight", + "self_attn.k_proj.bias": "attn_k.bias", + "self_attn.v_proj.weight": "attn_v.weight", + "self_attn.v_proj.bias": "attn_v.bias", + "self_attn.o_proj.weight": "attn_output.weight", + "self_attn.o_proj.bias": "attn_output.bias", + "self_attn.sinks": "attn_sinks.weight", + "post_attention_layernorm.weight": "post_attention_norm.weight", + "mlp.router.weight": "ffn_gate_inp.weight", + "mlp.router.bias": "ffn_gate_inp.bias", + "mlp.experts.down_proj_bias": "ffn_down_exps.bias", + } + if sub in simple: + emit(p + simple[sub], t) + elif sub == "mlp.experts.down_proj": + emit(p + "ffn_down_exps.weight", t.transpose(-1, -2)) + elif sub == "mlp.experts.gate_up_proj": + w = t.transpose(-1, -2) # [E, 2*inter, in] + emit(p + "ffn_gate_exps.weight", w[:, :inter, :]) + emit(p + "ffn_up_exps.weight", w[:, inter:, :]) + n_emitted += 1 # two tensors from one source + elif sub == "mlp.experts.gate_up_proj_bias": + emit(p + "ffn_gate_exps.bias", t[..., :inter]) + emit(p + "ffn_up_exps.bias", t[..., inter:]) + n_emitted += 1 + else: + sys.exit(f"unmapped tensor: {name}") + else: + sys.exit(f"unmapped tensor: {name}") + n_emitted += 1 + + writer.write_header_to_file() + writer.write_kv_data_to_file() + writer.write_tensors_to_file(progress=True) + writer.close() + print(f"\nwrote {args.outfile} ({n_emitted} tensors, {args.outtype}, {len(labels)} labels)") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/scripts/publish_hf.py b/scripts/publish_hf.py new file mode 100755 index 0000000..910ca53 --- /dev/null +++ b/scripts/publish_hf.py @@ -0,0 +1,126 @@ +#!/usr/bin/env python3 +"""Publish privacy-filter.cpp GGUF models + model cards to the HuggingFace Hub. 
+ +The GGUFs are produced by ``scripts/convert.py`` (self-contained — reads +config.json + model.safetensors + tokenizer.json, no llama.cpp dependency): + + python scripts/convert.py --model /path/to/hf-model --outfile MODEL-f16.gguf + +This script uploads a converted GGUF plus its **version-controlled** model card +(``model-cards/MODEL.md`` -> the repo's ``README.md``) to the matching HF repo, +so the published card never drifts from the one in this repo. + +**DRY-RUN BY DEFAULT** — without ``--upload`` it prints what it *would* push +(repo, files, size, sha256) and never contacts HuggingFace. Pass ``--upload`` +to perform the real push. The sha256 it prints is what the LocalAI gallery +entry should pin. + +Usage: + python scripts/publish_hf.py --model privacy-filter \\ + --gguf ~/models/privacy-filter/privacy-filter-f16.gguf # dry-run + + python scripts/publish_hf.py --model privacy-filter-multilingual \\ + --gguf .../privacy-filter-multilingual-f16.gguf --upload # push + + python scripts/publish_hf.py --model privacy-filter-multilingual \\ + --card-only --upload # sync just the card (README.md) +""" +from __future__ import annotations + +import argparse +import hashlib +import sys +from pathlib import Path + +REPO_ROOT = Path(__file__).resolve().parent.parent +CARDS_DIR = REPO_ROOT / "model-cards" + +HF_ORG = "LocalAI-io" + +# key -> (HF repo id, published GGUF filename, model-card filename under model-cards/) +MODELS: dict[str, tuple[str, str, str]] = { + "privacy-filter-multilingual": ( + f"{HF_ORG}/privacy-filter-multilingual-GGUF", + "privacy-filter-multilingual-f16.gguf", + "privacy-filter-multilingual.md", + ), + "privacy-filter": ( + f"{HF_ORG}/privacy-filter-GGUF", + "privacy-filter-f16.gguf", + "privacy-filter.md", + ), +} + + +def sha256(path: Path) -> str: + h = hashlib.sha256() + with path.open("rb") as f: + for chunk in iter(lambda: f.read(1 << 20), b""): + h.update(chunk) + return h.hexdigest() + + +def main() -> int: + ap = argparse.ArgumentParser( + description=__doc__, 
formatter_class=argparse.RawDescriptionHelpFormatter + ) + ap.add_argument("--model", required=True, choices=sorted(MODELS), + help="which model to publish") + ap.add_argument("--gguf", type=Path, default=None, + help="path to the converted -f16.gguf (omit with --card-only)") + ap.add_argument("--card-only", action="store_true", + help="sync only the model card (README.md); leave the published GGUF untouched") + ap.add_argument("--repo", default=None, help="override the target HF repo id") + ap.add_argument("--upload", action="store_true", + help="actually push (default: dry-run, contacts nothing)") + args = ap.parse_args() + + repo, gguf_name, card_name = MODELS[args.model] + repo = args.repo or repo + card = CARDS_DIR / card_name + + if not card.is_file(): + sys.exit(f"model card not found: {card}") + if not args.card_only: + if args.gguf is None: + sys.exit("--gguf is required unless --card-only is given") + if not args.gguf.is_file(): + sys.exit(f"GGUF not found: {args.gguf}") + + print(f"model: {args.model}") + print(f"repo: https://huggingface.co/{repo}") + print(f"card: {card.relative_to(REPO_ROOT)} -> README.md") + if args.card_only: + print("gguf: (card-only — published GGUF left untouched)") + else: + size = args.gguf.stat().st_size + print(f"gguf: {args.gguf} ({size / 1e9:.2f} GB) uploaded as {gguf_name}") + print(f"sha256: {sha256(args.gguf)} <- pin this in the LocalAI gallery entry") + + if not args.upload: + print("\n[dry-run] nothing uploaded. 
Re-run with --upload to push.") + return 0 + + from huggingface_hub import HfApi + + api = HfApi() + api.create_repo(repo, repo_type="model", exist_ok=True) + print("\nuploading README.md ...") + api.upload_file( + path_or_fileobj=str(card), path_in_repo="README.md", + repo_id=repo, repo_type="model", + commit_message=f"card: sync from privacy-filter.cpp ({card_name})", + ) + if not args.card_only: + print(f"uploading {gguf_name} ...") + api.upload_file( + path_or_fileobj=str(args.gguf), path_in_repo=gguf_name, + repo_id=repo, repo_type="model", + commit_message=f"gguf: {gguf_name} (f16)", + ) + print(f"done -> https://huggingface.co/{repo}") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/scripts/requirements.txt b/scripts/requirements.txt index 40c26a7..34c44d4 100644 --- a/scripts/requirements.txt +++ b/scripts/requirements.txt @@ -1,8 +1,11 @@ -# Reference environment for fixture generation (hf_dump.py) and tokenizer -# differential tests (hf_tok_diff.py). The openai_privacy_filter model type -# needs transformers >= 5.6. +# Reference environment for fixture generation (hf_dump.py), tokenizer +# differential tests (hf_tok_diff.py), and HF->GGUF conversion (convert.py). +# The openai_privacy_filter model type needs transformers >= 5.6. transformers==5.9.0 tokenizers==0.22.2 torch>=2.11 numpy safetensors +# convert.py reads safetensors/config/tokenizer and writes GGUF; it needs only +# torch + safetensors + gguf (no transformers). 
+gguf>=0.10 diff --git a/tests/test_parity.cpp b/tests/test_parity.cpp index 53b6fcc..e18d511 100644 --- a/tests/test_parity.cpp +++ b/tests/test_parity.cpp @@ -137,36 +137,29 @@ int main() { std::fprintf(stderr, "PF_GGUF_DIR or PF_FIXTURES not set, skipping\n"); return 77; } - { - std::vector probe; - if (!read_file(std::string(fixtures) + "/short-en/tokens.i32", probe)) { - std::fprintf(stderr, "fixtures not generated (scripts/hf_dump.py), skipping\n"); - return 77; - } - } - - // The f16 GGUF is the primary gate. PF_GGUF_DIR may be set (the CI cache - // restores the directory) while the GGUF itself is absent — conversion - // lives in the llama.cpp fork and the cache is seeded separately (see - // .github/workflows/ci.yml). Skip cleanly rather than fail in that case. + // The skip above is the only legitimate one: model testing wasn't requested + // (the fast tier runs -LE model; this is the local "full suite, no assets" + // path). Once PF_GGUF_DIR/PF_FIXTURES are set the operator IS asking for + // parity, so every asset is a hard requirement -- a missing one fails loudly + // rather than silently skipping the gate. CI regenerates them all on every + // run (scripts/hf_dump.py + scripts/convert.py), so a missing file is a real + // error, not a reason to skip. + auto require_asset = [](const std::string & path, const char * what) { + if (FILE * f = std::fopen(path.c_str(), "rb")) { std::fclose(f); return; } + failures++; + std::fprintf(stderr, "FAIL: required %s missing: %s\n", what, path.c_str()); + }; const char * f16_name = std::getenv("PF_GGUF_NAME"); const std::string f16 = std::string(gguf_dir) + "/" + (f16_name ? 
f16_name : "pf-rope2-f16.gguf"); - if (FILE * f = std::fopen(f16.c_str(), "rb")) { - std::fclose(f); - } else { - std::fprintf(stderr, "%s absent, skipping\n", f16.c_str()); - return 77; - } + const std::string f32 = std::string(gguf_dir) + "/pf-f32.gguf"; + require_asset(std::string(fixtures) + "/short-en/tokens.i32", "fixtures (scripts/hf_dump.py)"); + require_asset(f16, "f16 GGUF (scripts/convert.py)"); + require_asset(f32, "f32 GGUF (scripts/convert.py --outtype f32)"); + if (failures) return 1; // f32 GGUF: tight per-row gates vs the exact-rotation reference - const std::string f32 = std::string(gguf_dir) + "/pf-f32.gguf"; - if (FILE * f = std::fopen(f32.c_str(), "rb")) { - std::fclose(f); - if (is_gpu()) run_model(f32, fixtures, "logits.f32", 0.998, 0.15); - else run_model(f32, fixtures, "logits.f32", 0.99999, 1e-2); - } else { - std::fprintf(stderr, "note: %s absent, skipping f32 gates\n", f32.c_str()); - } + if (is_gpu()) run_model(f32, fixtures, "logits.f32", 0.998, 0.15); + else run_model(f32, fixtures, "logits.f32", 0.99999, 1e-2); // f16 GGUF: production-file gate vs the stock reference run_model(f16, fixtures, "logits_stock.f32", is_gpu() ? 0.998 : 0.999, is_gpu() ? 0.15 : 5e-2); diff --git a/tests/test_window_stitch.cpp b/tests/test_window_stitch.cpp index bcc28ff..fcea3ba 100644 --- a/tests/test_window_stitch.cpp +++ b/tests/test_window_stitch.cpp @@ -44,23 +44,25 @@ int main() { std::fprintf(stderr, "PF_GGUF_DIR or PF_FIXTURES not set, skipping\n"); return 77; } + // The skip above is the only legitimate one (model testing not requested). + // Once PF_GGUF_DIR/PF_FIXTURES are set, the GGUF and fixture are hard + // requirements -- CI regenerates both every run (scripts/convert.py + + // scripts/hf_dump.py), so a missing asset fails loudly rather than silently + // skipping the gate. 
std::vector ids; if (!read_i32(std::string(fixtures) + "/long-3k/tokens.i32", ids)) { - std::fprintf(stderr, "long-3k fixture missing, skipping\n"); - return 77; + std::fprintf(stderr, "FAIL: required long-3k fixture missing (scripts/hf_dump.py)\n"); + return 1; } const int n = (int) ids.size(); - // PF_GGUF_DIR may be set (the CI cache restores the directory) while the - // GGUF itself is absent — the cache is seeded separately. Skip cleanly - // rather than fail when the file is missing (see .github/workflows/ci.yml). const char * f16_name = std::getenv("PF_GGUF_NAME"); const std::string f16 = std::string(gguf_dir) + "/" + (f16_name ? f16_name : "pf-rope2-f16.gguf"); if (FILE * f = std::fopen(f16.c_str(), "rb")) { std::fclose(f); } else { - std::fprintf(stderr, "%s absent, skipping\n", f16.c_str()); - return 77; + std::fprintf(stderr, "FAIL: required f16 GGUF missing (scripts/convert.py): %s\n", f16.c_str()); + return 1; } pf::model m; if (!m.load(f16, std::getenv("PF_DEVICE") ? std::getenv("PF_DEVICE") : "cpu", 0)) {