From fe2aa12a3d02bf765a00ff95bc2f3fb1ea3ca4ed Mon Sep 17 00:00:00 2001 From: Nelson Spence Date: Sun, 14 Jun 2026 21:31:54 -0500 Subject: [PATCH 1/4] feat: rename on-disk magics TV* -> OV* (ordvec format) with full back-compat Files written by the crate now use the ordvec magics OVR1/OVRQ/OVBM/OVSB (extensions .ovr/.ovrq/.ovbm/.ovsb), replacing the turbovec-era TV* magics. The read contract is unchanged: all loaders (rank_io.rs) AND the C ABI accept BOTH the current OV* and the legacy TV* magics, so every file the crate (or turbovec) ever wrote still loads. Only the write path changed. - src/rank_io.rs: OV* magic constants (written) + TV* retained read-only for back-compat; writers emit OV*; loaders + probe_index_metadata accept both. - ordvec-ffi: the C ABI sniff-magic dispatch accepts both OV* and TV*; probe path was already format-agnostic (uses IndexKind). - tests/persistence_compat.rs: forward fixtures now pin OV*; added tests proving legacy TV* files still load for all four index types. - Parity sweep (docs / extensions only, no logic): ordvec-manifest (+ python bindings), ordvec-python docstrings + tests, ordvec-go test, C header, fuzz targets, docs/*, README format line, SECURITY/THREAT_MODEL, CONTRIBUTING stable-surface statement (the read contract is never broken), .gitignore. Gate: fmt + clippy -D warnings (core/ffi/manifest) + full test suites (core exp+default, manifest, ffi) + ordvec-python check + rustdoc -D warnings. 
Signed-off-by: Nelson Spence --- .gitignore | 5 + CONTRIBUTING.md | 12 +- README.md | 3 +- SECURITY.md | 5 +- THREAT_MODEL.md | 6 +- docs/INDEX_PROVENANCE.md | 2 +- docs/PERSISTED_FORMAT.md | 34 ++++-- docs/c-api.md | 6 +- docs/compatibility-policy.md | 12 +- fuzz/fuzz_targets/load_bitmap.rs | 5 +- fuzz/fuzz_targets/load_rank.rs | 5 +- fuzz/fuzz_targets/load_rankquant.rs | 5 +- fuzz/fuzz_targets/load_sign_bitmap.rs | 7 +- fuzz/fuzz_targets/roundtrip_rankquant.rs | 2 +- fuzz/fuzz_targets/scratch.rs | 5 +- ordvec-ffi/include/ordvec.h | 8 +- ordvec-ffi/src/lib.rs | 36 +++--- ordvec-ffi/tests/c_link_smoke.rs | 4 +- ordvec-go/ordvec_test.go | 43 ++++++-- ordvec-manifest-python/README.md | 2 +- .../tests/test_manifest_bindings.py | 8 +- ordvec-manifest/README.md | 14 +-- ordvec-manifest/tests/manifest.rs | 32 +++--- ordvec-python/src/lib.rs | 28 +++-- ordvec-python/tests/test_bitmap.py | 6 +- ordvec-python/tests/test_rank.py | 4 +- ordvec-python/tests/test_rank_quant.py | 4 +- ordvec-python/tests/test_redteam_fuzz.py | 70 ++++++------ ordvec-python/tests/test_sign_bitmap.py | 6 +- src/bitmap.rs | 11 +- src/quant.rs | 23 ++-- src/rank.rs | 11 +- src/rank_io.rs | 53 +++++---- src/sign_bitmap.rs | 13 ++- tests/persistence_compat.rs | 103 +++++++++++++++++- 35 files changed, 391 insertions(+), 202 deletions(-) diff --git a/.gitignore b/.gitignore index d342254b..de829925 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,11 @@ *.swp # Local index/serialisation artifacts produced by tests/benches. +# Current `.ov*` magics plus the legacy `.tv*` ones (still loadable, files persist). +*.ovr +*.ovrq +*.ovbm +*.ovsb *.tvr *.tvrq *.tvbm diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index a22929f5..329eb170 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -19,10 +19,14 @@ Contributions to the code, the docs, and the paper are all welcome. caveat. 
The Lean bitmap theorem proves a constant-weight overlap admission model under explicit assumptions; it is not a blanket retrieval guarantee. - **MSRV is Rust 1.89.** Don't use newer standard-library or language APIs. -- **Stable surface.** The persistence file magics (`.tvr` / `.tvrq` / - `.tvbm` / `.tvsb`) and the public method names - (`new` / `add` / `search` / `search_asymmetric*` / `top_m_candidates*` / - `write` / `load`) are stable — please don't rename them. +- **Stable surface.** The on-disk formats remain loadable forever: writers emit + the current `.ov*` magics (`.ovr` / `.ovrq` / `.ovbm` / `.ovsb`, renamed from + the turbovec-era `.tv*`), and the loaders accept **both** the current `.ov*` + and the legacy `.tv*` magics — so every file the crate has ever written still + loads. Only the write path changed; the read contract is never broken. The + public method names (`new` / `add` / `search` / `search_asymmetric*` / + `top_m_candidates*` / `write` / `load`) are likewise stable — please don't + rename them. - **Tests are required for new functionality.** As major new functionality is added, tests covering it MUST be added to the automated test suite (`cargo test`, plus `pytest` for the Python bindings). Changes that add diff --git a/README.md b/README.md index f6d77306..c2712f16 100644 --- a/README.md +++ b/README.md @@ -329,7 +329,8 @@ clean-checkout kernel sanity check. ## Security: index-file trust -The on-disk formats (`.tvr` / `.tvrq` / `.tvbm` / `.tvsb`) carry **no built-in +The on-disk formats (`.ovr` / `.ovrq` / `.ovbm` / `.ovsb`; legacy `.tvr` / +`.tvrq` / `.tvbm` / `.tvsb` files still load) carry **no built-in checksum, MAC, or signature — by design.** The loaders validate *structure* (magic, version, bounds, exact-length payload) but not *origin*: a structurally valid file can still be untrusted. 
If an index file crosses a diff --git a/SECURITY.md b/SECURITY.md index c2cba27b..166b0a36 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -15,8 +15,9 @@ Use GitHub's private vulnerability reporting: We aim to acknowledge reports within a few business days. -`ordvec` parses serialized index files (`.tvr` / `.tvrq` / `.tvbm` / -`.tvsb`); the loaders are fuzzed (`cargo +nightly fuzz`), so +`ordvec` parses serialized index files (`.ovr` / `.ovrq` / `.ovbm` / +`.ovsb`; the loaders also accept the legacy `.tvr` / `.tvrq` / `.tvbm` / +`.tvsb` magics); the loaders are fuzzed (`cargo +nightly fuzz`), so parsing-robustness reports against the deserialization paths are especially welcome. Reports are also welcome against the `unsafe` SIMD kernels (shape / bounds invariants), the Python FFI contract (buffer handling, GIL discipline), diff --git a/THREAT_MODEL.md b/THREAT_MODEL.md index aa8c0867..3ca834d5 100644 --- a/THREAT_MODEL.md +++ b/THREAT_MODEL.md @@ -66,7 +66,7 @@ absence of a second maintainer is itself a tracked supply-chain residual | Layer | Components | Trust boundary | |---|---|---| -| **Deserialization** | `rank_io.rs` — `.tvr` / `.tvrq` / `.tvbm` / `.tvsb` loaders | Untrusted filesystem / network byte stream | +| **Deserialization** | `rank_io.rs` — `.ovr` / `.ovrq` / `.ovbm` / `.ovsb` loaders (also accept the legacy `.tvr` / `.tvrq` / `.tvbm` / `.tvsb` magics) | Untrusted filesystem / network byte stream | | **Manifest verification** | `ordvec-manifest` — JSON sidecar verifier | Manifest + index + optional row-map files before load | | **Compute kernels** | `fastscan.rs`, `quant_kernels.rs`, `bitmap.rs`, `sign_bitmap.rs` | Trust established after format validation | | **Index API** | `rank.rs`, `quant.rs`, `bitmap.rs`, `sign_bitmap.rs` | Caller-controlled query embeddings | @@ -221,8 +221,8 @@ those kernels, and layering ASAN onto the existing SDE leg remains a follow-up. 
### 4.1 C ABI defenses (code-verified) -`ordvec-ffi` exposes only loaded `.tvrq` `RankQuant` and `.tvbm` `Bitmap` -indexes through one opaque handle. The ABI checks raw pointer nullness and +`ordvec-ffi` exposes only loaded `.ovrq` `RankQuant` and `.ovbm` `Bitmap` +indexes (legacy `.tvrq` / `.tvbm` files also load) through one opaque handle. The ABI checks raw pointer nullness and caller-supplied lengths before use, requires exact v1 `struct_size` values for input structs, rejects unknown flags and nonzero reserved input fields, validates query dimension and finiteness before entering core search, diff --git a/docs/INDEX_PROVENANCE.md b/docs/INDEX_PROVENANCE.md index d83b6b42..6f1b98ea 100644 --- a/docs/INDEX_PROVENANCE.md +++ b/docs/INDEX_PROVENANCE.md @@ -1,6 +1,6 @@ # Index file provenance -`ordvec` persists indexes as `.tvr` / `.tvrq` / `.tvbm` / `.tvsb` files and +`ordvec` persists indexes as `.ovr` / `.ovrq` / `.ovbm` / `.ovsb` files and reloads them through `Rank::load`, `RankQuant::load`, `Bitmap::load`, and `SignBitmap::load`. This note states exactly **what the loaders guarantee and what they do not**, so you can decide whether an index file needs out-of-band diff --git a/docs/PERSISTED_FORMAT.md b/docs/PERSISTED_FORMAT.md index 4c3da8c2..1d53282c 100644 --- a/docs/PERSISTED_FORMAT.md +++ b/docs/PERSISTED_FORMAT.md @@ -1,8 +1,8 @@ # Persisted Index Format This document is the compatibility contract for ordvec persisted index files. -It covers the primitive index artifacts only: `.tvr`, `.tvrq`, `.tvbm`, and -`.tvsb`. It does not define a database, transaction log, replication protocol, +It covers the primitive index artifacts only: `.ovr`, `.ovrq`, `.ovbm`, and +`.ovsb`. It does not define a database, transaction log, replication protocol, provenance system, checksum manifest, signature, or trust policy. All integer fields are little-endian. 
Each format has one fixed header followed @@ -58,7 +58,7 @@ Example external segment entry: ```json { - "path": "segments/shard-0007/index.tvrq", + "path": "segments/shard-0007/index.ovrq", "sha256": "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef", "metadata": { "kind": "RankQuant", @@ -92,13 +92,16 @@ persisted row. ## Format Layouts -### Rank (`.tvr`, magic `TVR1`) +### Rank (`.ovr`, magic `OVR1`) + +Current writers emit magic `OVR1`. Loaders also accept the legacy magic `TVR1` +(written by versions before the format rename). Header: | Offset | Bytes | Field | | ---: | ---: | --- | -| 0 | 4 | magic `TVR1` | +| 0 | 4 | magic `OVR1` (or legacy `TVR1`) | | 4 | 1 | format version `1` | | 5 | 4 | `dim` as `u32` little-endian | | 9 | 4 | `n_vectors` as `u32` little-endian | @@ -112,13 +115,16 @@ Probe metadata: - `params = Rank` - `bytes_per_vec = dim * 2` -### RankQuant (`.tvrq`, magic `TVRQ`) +### RankQuant (`.ovrq`, magic `OVRQ`) + +Current writers emit magic `OVRQ`. Loaders also accept the legacy magic `TVRQ` +(written by versions before the format rename). Header: | Offset | Bytes | Field | | ---: | ---: | --- | -| 0 | 4 | magic `TVRQ` | +| 0 | 4 | magic `OVRQ` (or legacy `TVRQ`) | | 4 | 1 | format version `1` | | 5 | 1 | `bits` as `u8`, one of `1`, `2`, or `4` | | 6 | 4 | `dim` as `u32` little-endian | @@ -139,13 +145,16 @@ Probe metadata: - `params = RankQuant { bits }` - `bytes_per_vec = dim * bits / 8` -### Bitmap (`.tvbm`, magic `TVBM`) +### Bitmap (`.ovbm`, magic `OVBM`) + +Current writers emit magic `OVBM`. Loaders also accept the legacy magic `TVBM` +(written by versions before the format rename). 
Header: | Offset | Bytes | Field | | ---: | ---: | --- | -| 0 | 4 | magic `TVBM` | +| 0 | 4 | magic `OVBM` (or legacy `TVBM`) | | 4 | 1 | format version `1` | | 5 | 4 | `dim` as `u32` little-endian | | 9 | 4 | `n_top` as `u32` little-endian | @@ -161,13 +170,16 @@ Probe metadata: - `params = Bitmap { n_top }` - `bytes_per_vec = dim / 8` -### SignBitmap (`.tvsb`, magic `TVSB`) +### SignBitmap (`.ovsb`, magic `OVSB`) + +Current writers emit magic `OVSB`. Loaders also accept the legacy magic `TVSB` +(written by versions before the format rename). Header: | Offset | Bytes | Field | | ---: | ---: | --- | -| 0 | 4 | magic `TVSB` | +| 0 | 4 | magic `OVSB` (or legacy `TVSB`) | | 4 | 1 | format version `1` | | 5 | 4 | `dim` as `u32` little-endian | | 9 | 4 | `n_vectors` as `u32` little-endian | diff --git a/docs/c-api.md b/docs/c-api.md index a2f3fd79..d936ae3b 100644 --- a/docs/c-api.md +++ b/docs/c-api.md @@ -1,7 +1,7 @@ # C API -`ordvec-ffi` exposes a small ABI v1 for loading persisted `.tvrq` -`RankQuant` and `.tvbm` `Bitmap` indexes and running synchronous single-query +`ordvec-ffi` exposes a small ABI v1 for loading persisted `.ovrq` +`RankQuant` and `.ovbm` `Bitmap` indexes and running synchronous single-query searches. The public header is [`../ordvec-ffi/include/ordvec.h`](../ordvec-ffi/include/ordvec.h). ## Build and Link @@ -33,7 +33,7 @@ When linking dynamically, make sure your platform's loader can find int main(void) { ordvec_index_t *index = NULL; - ordvec_status_t st = ordvec_index_load("index.tvrq", 0, &index); + ordvec_status_t st = ordvec_index_load("index.ovrq", 0, &index); if (st != ORDVEC_STATUS_OK) { fprintf(stderr, "load failed: %s\n", ordvec_last_error()); return 1; diff --git a/docs/compatibility-policy.md b/docs/compatibility-policy.md index a515cce4..d96bfd65 100644 --- a/docs/compatibility-policy.md +++ b/docs/compatibility-policy.md @@ -121,10 +121,14 @@ with documented migration steps. 
The primitive index formats are the files written and loaded by the core index types: -- `.tvr` / `TVR1` for `Rank`; -- `.tvrq` / `TVRQ` for `RankQuant`; -- `.tvbm` / `TVBM` for `Bitmap`; -- `.tvsb` / `TVSB` for `SignBitmap`. +- `.ovr` / `OVR1` for `Rank`; +- `.ovrq` / `OVRQ` for `RankQuant`; +- `.ovbm` / `OVBM` for `Bitmap`; +- `.ovsb` / `OVSB` for `SignBitmap`. + +Legacy files using the old turbovec-era magics (`TVR1`, `TVRQ`, `TVBM`, `TVSB` +and extensions `.tvr`, `.tvrq`, `.tvbm`, `.tvsb`) are still accepted by current +loaders. Writers no longer emit those magics. Patch releases should keep valid files from the same minor series loadable. Loader hardening may reject malformed files, forged sizes, trailing bytes, bad diff --git a/fuzz/fuzz_targets/load_bitmap.rs b/fuzz/fuzz_targets/load_bitmap.rs index 7727788e..985aa8de 100644 --- a/fuzz/fuzz_targets/load_bitmap.rs +++ b/fuzz/fuzz_targets/load_bitmap.rs @@ -1,5 +1,6 @@ -//! libFuzzer target for the `.tvbm` / `TVBM` loader, driven through the -//! public `ordvec::Bitmap::load` entry point. +//! libFuzzer target for the `.ovbm` / `OVBM` loader (which also accepts the +//! legacy `.tvbm` / `TVBM` magic), driven through the public +//! `ordvec::Bitmap::load` entry point. //! //! The low-level `rank_io::load_bitmap` parser is crate-internal //! (`pub(crate)`), so the fuzzer exercises it through `Bitmap::load` — which diff --git a/fuzz/fuzz_targets/load_rank.rs b/fuzz/fuzz_targets/load_rank.rs index 1a0dee76..62488b7f 100644 --- a/fuzz/fuzz_targets/load_rank.rs +++ b/fuzz/fuzz_targets/load_rank.rs @@ -1,5 +1,6 @@ -//! libFuzzer target for the `.tvr` / `TVR1` loader, driven through the -//! public `ordvec::Rank::load` entry point. +//! libFuzzer target for the `.ovr` / `OVR1` loader (which also accepts the +//! legacy `.tvr` / `TVR1` magic), driven through the public `ordvec::Rank::load` +//! entry point. //! //! The low-level `rank_io::load_rank` parser is crate-internal (`pub(crate)`), //! 
so the fuzzer exercises it through `Rank::load` — which runs that exact diff --git a/fuzz/fuzz_targets/load_rankquant.rs b/fuzz/fuzz_targets/load_rankquant.rs index 1dc8a41c..95b329bd 100644 --- a/fuzz/fuzz_targets/load_rankquant.rs +++ b/fuzz/fuzz_targets/load_rankquant.rs @@ -1,5 +1,6 @@ -//! libFuzzer target for the `.tvrq` / `TVRQ` loader, driven through the -//! public `ordvec::RankQuant::load` entry point. +//! libFuzzer target for the `.ovrq` / `OVRQ` loader (which also accepts the +//! legacy `.tvrq` / `TVRQ` magic), driven through the public +//! `ordvec::RankQuant::load` entry point. //! //! The low-level `rank_io::load_rankquant` parser is crate-internal //! (`pub(crate)`), so the fuzzer exercises it through `RankQuant::load` — diff --git a/fuzz/fuzz_targets/load_sign_bitmap.rs b/fuzz/fuzz_targets/load_sign_bitmap.rs index 083c2dc3..061f9869 100644 --- a/fuzz/fuzz_targets/load_sign_bitmap.rs +++ b/fuzz/fuzz_targets/load_sign_bitmap.rs @@ -1,5 +1,6 @@ -//! libFuzzer target for the `.tvsb` / `TVSB` loader, driven through the -//! public `ordvec::SignBitmap::load` entry point. +//! libFuzzer target for the `.ovsb` / `OVSB` loader (which also accepts the +//! legacy `.tvsb` / `TVSB` magic), driven through the public +//! `ordvec::SignBitmap::load` entry point. //! //! The low-level `rank_io::load_sign_bitmap` parser is crate-internal //! (`pub(crate)`), so the fuzzer exercises it through `SignBitmap::load` — @@ -13,7 +14,7 @@ //! Contract: on arbitrary bytes the loader must return `Ok(..)` or //! `Err(..)` — never panic, abort, or read out of bounds. libFuzzer //! treats any panic/abort as a crash, so simply letting the result drop -//! is the assertion. The `.tvsb` dim validation path differs from the +//! is the assertion. The `.ovsb` dim validation path differs from the //! other three (`MAX_SIGN_BITMAP_DIM`, multiple-of-64), so it gets its //! own target rather than riding on `load_bitmap`. 
diff --git a/fuzz/fuzz_targets/roundtrip_rankquant.rs b/fuzz/fuzz_targets/roundtrip_rankquant.rs index 04814b7e..d9d64f5e 100644 --- a/fuzz/fuzz_targets/roundtrip_rankquant.rs +++ b/fuzz/fuzz_targets/roundtrip_rankquant.rs @@ -46,7 +46,7 @@ fuzz_target!(|data: &[u8]| { Ok(d) => d, Err(_) => return, }; - let path = dir.path().join("roundtrip.tvrq"); + let path = dir.path().join("roundtrip.ovrq"); idx.write(&path).expect("write of a validly-built index must succeed"); let reloaded = RankQuant::load(&path).expect("write output must reload (round-trip)"); assert_eq!(reloaded.dim(), idx.dim()); diff --git a/fuzz/fuzz_targets/scratch.rs b/fuzz/fuzz_targets/scratch.rs index 634c8368..24053b4b 100644 --- a/fuzz/fuzz_targets/scratch.rs +++ b/fuzz/fuzz_targets/scratch.rs @@ -1,5 +1,6 @@ -//! Shared per-worker scratch temp file for the `.tvr` / `.tvrq` / `.tvbm` / -//! `.tvsb` loader fuzz targets. +//! Shared per-worker scratch temp file for the `.ovr` / `.ovrq` / `.ovbm` / +//! `.ovsb` loader fuzz targets (the loaders also accept the legacy `.tv*` +//! magics). //! //! # Why this exists (issue #6) //! diff --git a/ordvec-ffi/include/ordvec.h b/ordvec-ffi/include/ordvec.h index 6907655b..68a0ac51 100644 --- a/ordvec-ffi/include/ordvec.h +++ b/ordvec-ffi/include/ordvec.h @@ -180,7 +180,8 @@ void ordvec_search_params_init(ordvec_search_params_t *params); void ordvec_search_stats_init(ordvec_search_stats_t *stats); /** - * Load a `.tvrq` RankQuant or `.tvbm` Bitmap index. + * Load a `.ovrq` RankQuant or `.ovbm` Bitmap index (legacy `.tvrq` / `.tvbm` + * files are also accepted). * * # Safety * @@ -190,8 +191,9 @@ void ordvec_search_stats_init(ordvec_search_stats_t *stats); ordvec_status_t ordvec_index_load(const char *path, uint64_t flags, ordvec_index_t **out); /** - * Probe on-disk metadata for a `.tvrq` RankQuant or `.tvbm` Bitmap index - * without loading payload rows into an index handle. 
+ * Probe on-disk metadata for a `.ovrq` RankQuant or `.ovbm` Bitmap index + * (legacy `.tvrq` / `.tvbm` also accepted) without loading payload rows into an + * index handle. * * This validates the fixed header, declared dimensions, payload byte count, * and exact file length. Full row-invariant validation remains the job of diff --git a/ordvec-ffi/src/lib.rs b/ordvec-ffi/src/lib.rs index 773e871e..8a385808 100644 --- a/ordvec-ffi/src/lib.rs +++ b/ordvec-ffi/src/lib.rs @@ -370,7 +370,7 @@ fn info_for_metadata(meta: &IndexMetadata) -> Result ORDVEC_INDEX_KIND_BITMAP, IndexKind::Rank | IndexKind::SignBitmap => return Err(FfiError::new( ORDVEC_STATUS_UNSUPPORTED_FORMAT, - "ABI v1 supports metadata probes only for TVRQ RankQuant and TVBM Bitmap indexes", + "ABI v1 supports metadata probes only for RankQuant and Bitmap indexes", )), }; info.format_version = u32::from(meta.format_version); @@ -671,7 +671,8 @@ pub unsafe extern "C" fn ordvec_search_stats_init(stats: *mut ordvec_search_stat } #[no_mangle] -/// Load a `.tvrq` RankQuant or `.tvbm` Bitmap index. +/// Load a `.ovrq` RankQuant or `.ovbm` Bitmap index (legacy `.tvrq` / `.tvbm` +/// files are also accepted). /// /// # Safety /// @@ -719,17 +720,19 @@ pub unsafe extern "C" fn ordvec_index_load( .map_err(|err| io_to_ffi(err, "stat index"))? .len(); + // Accept both the current `OV*` magics and the legacy turbovec-era + // `TV*` magics (back-compat) — mirrors the loaders in `rank_io.rs`. 
let index = match &magic { - b"TVRQ" => LoadedIndex::RankQuant( - RankQuant::load(path).map_err(|err| io_to_ffi(err, "load TVRQ index"))?, + b"OVRQ" | b"TVRQ" => LoadedIndex::RankQuant( + RankQuant::load(path).map_err(|err| io_to_ffi(err, "load RankQuant index"))?, ), - b"TVBM" => LoadedIndex::Bitmap( - Bitmap::load(path).map_err(|err| io_to_ffi(err, "load TVBM index"))?, + b"OVBM" | b"TVBM" => LoadedIndex::Bitmap( + Bitmap::load(path).map_err(|err| io_to_ffi(err, "load Bitmap index"))?, ), - b"TVR1" | b"TVSB" => { + b"OVR1" | b"OVSB" | b"TVR1" | b"TVSB" => { return Err(FfiError::new( ORDVEC_STATUS_UNSUPPORTED_FORMAT, - "ABI v1 supports only TVRQ RankQuant and TVBM Bitmap indexes", + "ABI v1 supports only RankQuant and Bitmap indexes", )) } _ => { @@ -753,8 +756,9 @@ pub unsafe extern "C" fn ordvec_index_load( } #[no_mangle] -/// Probe on-disk metadata for a `.tvrq` RankQuant or `.tvbm` Bitmap index -/// without loading payload rows into an index handle. +/// Probe on-disk metadata for a `.ovrq` RankQuant or `.ovbm` Bitmap index +/// (legacy `.tvrq` / `.tvbm` also accepted) without loading payload rows into an +/// index handle. /// /// This validates the fixed header, declared dimensions, payload byte count, /// and exact file length. 
Full row-invariant validation remains the job of @@ -992,7 +996,7 @@ mod tests { } fn make_rankquant_fixture() -> std::path::PathBuf { - let path = temp_path("rankquant", "tvrq"); + let path = temp_path("rankquant", "ovrq"); let mut index = RankQuant::new(16, 2); let doc: Vec<f32> = (0..16).map(|x| x as f32).collect(); let mut corpus = Vec::new(); @@ -1005,7 +1009,7 @@ mod tests { } fn make_bitmap_fixture() -> std::path::PathBuf { - let path = temp_path("bitmap", "tvbm"); + let path = temp_path("bitmap", "ovbm"); let mut index = Bitmap::new(64, 4); let mut doc = vec![0.0f32; 64]; for (j, value) in doc.iter_mut().take(4).enumerate() { @@ -1454,20 +1458,20 @@ mod tests { #[test] fn load_maps_unsupported_and_corrupt_formats() { - let rank_path = temp_path("rank", "tvr"); + let rank_path = temp_path("rank", "ovr"); let mut rank = Rank::new(16); rank.add(&[0.0f32; 16]); rank.write(&rank_path).unwrap(); - let sign_path = temp_path("sign", "tvsb"); + let sign_path = temp_path("sign", "ovsb"); let mut sign = SignBitmap::new(64); sign.add(&[0.0f32; 64]); sign.write(&sign_path).unwrap(); - let corrupt_path = temp_path("corrupt", "tvrq"); + let corrupt_path = temp_path("corrupt", "ovrq"); std::fs::File::create(&corrupt_path) .unwrap() - .write_all(b"TVRQ\x01") + .write_all(b"OVRQ\x01") .unwrap(); unsafe { diff --git a/ordvec-ffi/tests/c_link_smoke.rs b/ordvec-ffi/tests/c_link_smoke.rs index 8e932241..408ca98b 100644 --- a/ordvec-ffi/tests/c_link_smoke.rs +++ b/ordvec-ffi/tests/c_link_smoke.rs @@ -25,7 +25,7 @@ fn write_file(path: &Path, body: &[u8]) { fn write_rankquant_fixture(path: &Path) { let mut bytes = Vec::new(); - bytes.extend_from_slice(b"TVRQ"); + bytes.extend_from_slice(b"OVRQ"); bytes.push(1); bytes.push(2); bytes.extend_from_slice(&16u32.to_le_bytes()); @@ -98,7 +98,7 @@ fn c_program_links_and_runs_against_static_library() { lib.display() ); - let fixture = temp_path("linked_fixture", "tvrq"); 
write_rankquant_fixture(&fixture); let src = temp_path("linked_smoke", "c"); diff --git a/ordvec-go/ordvec_test.go b/ordvec-go/ordvec_test.go index c775b065..e18e8d26 100644 --- a/ordvec-go/ordvec_test.go +++ b/ordvec-go/ordvec_test.go @@ -13,11 +13,14 @@ import ( "testing" ) -func writeRankQuantFixture(t *testing.T) string { +// writeRankQuantFixtureMagic builds a RankQuant fixture with the given 4-byte +// magic and file extension. The loader accepts both the current "OVRQ" magic and +// the legacy "TVRQ" magic, so this is parameterised to exercise both. +func writeRankQuantFixtureMagic(t *testing.T, magic, ext string) string { t.Helper() - path := filepath.Join(t.TempDir(), "fixture.tvrq") + path := filepath.Join(t.TempDir(), "fixture."+ext) var b []byte - b = append(b, []byte("TVRQ")...) + b = append(b, []byte(magic)...) b = append(b, 1) // version b = append(b, 2) // bits b = binary.LittleEndian.AppendUint32(b, 16) @@ -32,11 +35,18 @@ func writeRankQuantFixture(t *testing.T) string { return path } +// writeRankQuantFixture builds a RankQuant fixture in the current on-disk format +// ("OVRQ" magic, ".ovrq" extension). +func writeRankQuantFixture(t *testing.T) string { + t.Helper() + return writeRankQuantFixtureMagic(t, "OVRQ", "ovrq") +} + func writeBitmapFixture(t *testing.T) string { t.Helper() - path := filepath.Join(t.TempDir(), "fixture.tvbm") + path := filepath.Join(t.TempDir(), "fixture.ovbm") var b []byte - b = append(b, []byte("TVBM")...) + b = append(b, []byte("OVBM")...) b = append(b, 1) // version b = binary.LittleEndian.AppendUint32(b, 64) b = binary.LittleEndian.AppendUint32(b, 4) @@ -121,6 +131,25 @@ func TestLoadInfoSearchRankQuant(t *testing.T) { } } +// TestLoadsLegacyTVMagic confirms the C ABI still loads files written with the +// pre-rename "TVRQ" magic (legacy turbovec-era on-disk format). New files are +// written with "OVRQ"; the loader accepts both, so old indexes never break. 
+func TestLoadsLegacyTVMagic(t *testing.T) { + idx, err := Load(writeRankQuantFixtureMagic(t, "TVRQ", "tvrq")) + if err != nil { + t.Fatal(err) + } + defer idx.Close() + + info, err := idx.Info() + if err != nil { + t.Fatal(err) + } + if info.Kind != KindRankQuant || info.Dim != 16 || info.BitWidth != 2 || info.VectorCount != 4 { + t.Fatalf("unexpected info from legacy TVRQ fixture: %+v", info) + } +} + func TestProbeRankQuantInfo(t *testing.T) { path := writeRankQuantFixture(t) @@ -253,14 +282,14 @@ func TestTypedStatusErrors(t *testing.T) { t.Fatalf("unexpected status: %v", statusErr.Status) } - _, err = Load(filepath.Join(t.TempDir(), "missing.tvrq")) + _, err = Load(filepath.Join(t.TempDir(), "missing.ovrq")) if !errors.As(err, &statusErr) || statusErr.Status != StatusIO { t.Fatalf("missing file should be IO status, got %T %[1]v", err) } } func TestLoadRejectsNullBytePath(t *testing.T) { - _, err := Load("bad\x00path.tvrq") + _, err := Load("bad\x00path.ovrq") if err == nil || !strings.Contains(err.Error(), "null byte") { t.Fatalf("Load should reject null byte paths, got %v", err) } diff --git a/ordvec-manifest-python/README.md b/ordvec-manifest-python/README.md index 6ca81b02..9fa3b11c 100644 --- a/ordvec-manifest-python/README.md +++ b/ordvec-manifest-python/README.md @@ -24,7 +24,7 @@ Create manifests with caller-owned sidecars by passing dictionaries with ```python manifest = ordvec_manifest.create_manifest( - "index.tvrq", + "index.ovrq", "index.manifest.json", "bge-small-en-v1.5", row_id_is_identity=True, diff --git a/ordvec-manifest-python/tests/test_manifest_bindings.py b/ordvec-manifest-python/tests/test_manifest_bindings.py index 0bea7e99..320262da 100644 --- a/ordvec-manifest-python/tests/test_manifest_bindings.py +++ b/ordvec-manifest-python/tests/test_manifest_bindings.py @@ -12,7 +12,7 @@ def write_rankquant_index(path: Path, *, dim: int = 16, rows: int = 2, bits: int = 2): bytes_per_vec = dim * bits // 8 path.write_bytes( - b"TVRQ" + b"OVRQ" + 
bytes([1, bits]) + dim.to_bytes(4, "little") + rows.to_bytes(4, "little") @@ -21,7 +21,7 @@ def write_rankquant_index(path: Path, *, dim: int = 16, rows: int = 2, bits: int def write_unloadable_manifest(tmp_path): - artifact = tmp_path / "index.tvrq" + artifact = tmp_path / "index.ovrq" artifact.write_bytes(b"not an ordvec index") digest = hashlib.sha256(artifact.read_bytes()).hexdigest() manifest = { @@ -92,7 +92,7 @@ def test_verify_for_load_preserves_manifest_io_errors(tmp_path): def test_create_manifest_requires_explicit_row_identity(tmp_path): - index = tmp_path / "index.tvrq" + index = tmp_path / "index.ovrq" index.write_bytes(b"not an ordvec index") with pytest.raises(ValueError, match="row_map or row_id_is_identity"): @@ -100,7 +100,7 @@ def test_create_manifest_requires_explicit_row_identity(tmp_path): def test_create_manifest_accepts_auxiliary_artifacts(tmp_path): - index = tmp_path / "index.tvrq" + index = tmp_path / "index.ovrq" ids = tmp_path / "ids.bin" optional = tmp_path / "optional.json" manifest_path = tmp_path / "manifest.json" diff --git a/ordvec-manifest/README.md b/ordvec-manifest/README.md index 58ddc416..2b64a2cf 100644 --- a/ordvec-manifest/README.md +++ b/ordvec-manifest/README.md @@ -18,7 +18,7 @@ library default feature set is empty and does not depend on `clap`. 
```sh ordvec-manifest create \ - --index path/to/index.tvrq \ + --index path/to/index.ovrq \ --row-id-is-identity \ --aux app.ids=path/to/ids.bin \ --embedding-model bge-small-en-v1.5 \ @@ -163,9 +163,9 @@ paths, declared digest/length, and observed digest/length: "checked_at": "2026-06-03T17:20:00Z", "manifest_id": "urn:uuid:11111111-1111-4111-8111-111111111111", "artifact": { - "manifest_path": "index.tvrq", - "observed_path": "index.tvrq", - "canonical_path": "/srv/index/index.tvrq", + "manifest_path": "index.ovrq", + "observed_path": "index.ovrq", + "canonical_path": "/srv/index/index.ovrq", "sha256": "1111111111111111111111111111111111111111111111111111111111111111", "size_bytes": 4096, "metadata": null @@ -222,9 +222,9 @@ read and absent when the file is missing: "checked_at": "2026-06-03T17:21:00Z", "manifest_id": "urn:uuid:11111111-1111-4111-8111-111111111111", "artifact": { - "manifest_path": "index.tvrq", - "observed_path": "index.tvrq", - "canonical_path": "/srv/index/index.tvrq", + "manifest_path": "index.ovrq", + "observed_path": "index.ovrq", + "canonical_path": "/srv/index/index.ovrq", "sha256": "1111111111111111111111111111111111111111111111111111111111111111", "size_bytes": 4096, "metadata": null diff --git a/ordvec-manifest/tests/manifest.rs b/ordvec-manifest/tests/manifest.rs index a555f977..dab4dbe1 100644 --- a/ordvec-manifest/tests/manifest.rs +++ b/ordvec-manifest/tests/manifest.rs @@ -19,7 +19,7 @@ use std::path::{Path, PathBuf}; use std::process::Command; fn write_index(dir: &Path) -> PathBuf { - let path = dir.join("index.tvrq"); + let path = dir.join("index.ovrq"); let mut index = RankQuant::new(16, 2); let docs: Vec = (0..32).map(|i| i as f32 - 12.0).collect(); index.add(&docs); @@ -28,7 +28,7 @@ fn write_index(dir: &Path) -> PathBuf { } fn write_rankquant_index(dir: &Path, rows: usize) -> PathBuf { - let path = dir.join("index.tvrq"); + let path = dir.join("index.ovrq"); let mut index = RankQuant::new(16, 2); let docs: Vec = (0..16 * 
rows).map(|i| i as f32 - 12.0).collect(); index.add(&docs); @@ -47,7 +47,7 @@ enum FixtureKind { fn write_index_kind(dir: &Path, kind: FixtureKind) -> PathBuf { match kind { FixtureKind::Rank => { - let path = dir.join("index.tvr"); + let path = dir.join("index.ovr"); let mut index = Rank::new(8); index.add(&[ 1.0, 3.0, 2.0, 4.0, 8.0, 7.0, 6.0, 5.0, 8.0, 6.0, 7.0, 5.0, 1.0, 2.0, 3.0, 4.0, @@ -57,7 +57,7 @@ fn write_index_kind(dir: &Path, kind: FixtureKind) -> PathBuf { } FixtureKind::RankQuant => write_index(dir), FixtureKind::Bitmap => { - let path = dir.join("index.tvbm"); + let path = dir.join("index.ovbm"); let mut index = Bitmap::new(64, 16); let docs: Vec = (0..128).map(|i| ((i * 17) % 31) as f32).collect(); index.add(&docs); @@ -65,7 +65,7 @@ fn write_index_kind(dir: &Path, kind: FixtureKind) -> PathBuf { path } FixtureKind::SignBitmap => { - let path = dir.join("index.tvsb"); + let path = dir.join("index.ovsb"); let mut index = SignBitmap::new(64); let docs: Vec = (0usize..128) .map(|i| if i.is_multiple_of(3) { 1.0 } else { -1.0 }) @@ -1939,7 +1939,7 @@ fn path_policy_rejects_escapes_and_absolute_paths_by_default() { ) .unwrap(); - manifest.artifact.path = "../index.tvrq".to_string(); + manifest.artifact.path = "../index.ovrq".to_string(); let report = verify_manifest_with_base(manifest.clone(), &base, VerifyOptions::default()); assert!(report .errors @@ -1986,7 +1986,7 @@ fn symlink_escape_reports_observed_canonical_path() { fs::create_dir(&base).unwrap(); fs::create_dir(&outside).unwrap(); let index = write_index(&outside); - symlink(&index, base.join("link.tvrq")).unwrap(); + symlink(&index, base.join("link.ovrq")).unwrap(); let manifest_path = base.join("manifest.json"); let mut manifest = create_manifest_for_index_with_options( &index, @@ -1999,7 +1999,7 @@ fn symlink_escape_reports_observed_canonical_path() { }, ) .unwrap(); - manifest.artifact.path = "link.tvrq".to_string(); + manifest.artifact.path = "link.ovrq".to_string(); let report = 
verify_manifest_with_base(manifest.clone(), &base, VerifyOptions::default()); assert!(report @@ -2086,7 +2086,7 @@ fn verify_for_load_uses_explicit_index_override() { &manifest_path, ) .unwrap(); - manifest.artifact.path = "missing.tvrq".to_string(); + manifest.artifact.path = "missing.ovrq".to_string(); fs::write( &manifest_path, serde_json::to_string_pretty(&manifest).unwrap(), @@ -2096,7 +2096,7 @@ fn verify_for_load_uses_explicit_index_override() { let plan = verify_for_load( &manifest_path, VerifyOptions { - index_override: Some(PathBuf::from("index.tvrq")), + index_override: Some(PathBuf::from("index.ovrq")), ..VerifyOptions::default() }, ) @@ -2108,7 +2108,7 @@ fn verify_for_load_uses_explicit_index_override() { ); assert_eq!( plan.report().artifact.observed_path.as_deref(), - Some("index.tvrq") + Some("index.ovrq") ); } @@ -2215,7 +2215,7 @@ fn verify_for_load_fails_closed_with_report_for_default_path_policy() { }, ) .unwrap(); - manifest.artifact.path = "../index.tvrq".to_string(); + manifest.artifact.path = "../index.ovrq".to_string(); fs::write( &manifest_path, serde_json::to_string_pretty(&manifest).unwrap(), @@ -2766,7 +2766,7 @@ fn attestation_shape_requires_matching_subject_sha256() { manifest.attestations.push(json!({ "predicateType": "https://slsa.dev/provenance/v1", "predicate": {"builder": {"id": "builder"}}, - "subject": [{"name": "index.tvrq", "digest": {"sha256": "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"}}] + "subject": [{"name": "index.ovrq", "digest": {"sha256": "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"}}] })); let report = verify_manifest_with_base(manifest.clone(), temp.path(), VerifyOptions::default()); @@ -3044,7 +3044,7 @@ fn verify_index_manifest_uses_explicit_index_override() { &manifest_path, ) .unwrap(); - manifest.artifact.path = "missing.tvrq".to_string(); + manifest.artifact.path = "missing.ovrq".to_string(); fs::write( &manifest_path, 
serde_json::to_string_pretty(&manifest).unwrap(), @@ -3052,7 +3052,7 @@ fn verify_index_manifest_uses_explicit_index_override() { .unwrap(); let report = verify_index_manifest( - PathBuf::from("index.tvrq"), + PathBuf::from("index.ovrq"), &manifest_path, VerifyOptions::default(), ) @@ -3704,7 +3704,7 @@ fn sqlite_cache_key_is_scoped_to_manifest_location() { ) .unwrap(); - let index_b = case_b.join("index.tvrq"); + let index_b = case_b.join("index.ovrq"); let manifest_b = case_b.join("manifest.json"); fs::copy(&index_a, &index_b).unwrap(); fs::copy(&manifest_a, &manifest_b).unwrap(); diff --git a/ordvec-python/src/lib.rs b/ordvec-python/src/lib.rs index 6e75661d..72825fa1 100644 --- a/ordvec-python/src/lib.rs +++ b/ordvec-python/src/lib.rs @@ -512,7 +512,7 @@ impl Rank { Ok((scores, indices)) } - /// Serialise the rank index to a `.tvr` file. + /// Serialise the rank index to a `.ovr` file. /// /// `path` is forwarded to the filesystem unmodified — no `..` / traversal /// sanitisation — so treat it as trusted input (see the module docstring). @@ -522,7 +522,8 @@ impl Rank { .map_err(|e| pyo3::exceptions::PyIOError::new_err(e.to_string())) } - /// Load a `Rank` index from a `.tvr` file previously written by [`Rank::write`]. + /// Load a `Rank` index from a `.ovr` file previously written by [`Rank::write`] + /// (legacy `.tvr` files are also accepted). /// /// `path` is forwarded to the filesystem unmodified — no `..` / traversal /// sanitisation — so treat it as trusted input (see the module docstring). @@ -695,7 +696,7 @@ impl RankQuant { Ok((scores, indices)) } - /// Serialise the quantised index to a `.tvrq` file. + /// Serialise the quantised index to a `.ovrq` file. /// /// `path` is forwarded to the filesystem unmodified — no `..` / traversal /// sanitisation — so treat it as trusted input (see the module docstring). 
@@ -705,7 +706,8 @@ impl RankQuant { .map_err(|e| pyo3::exceptions::PyIOError::new_err(e.to_string())) } - /// Load a `RankQuant` index from a `.tvrq` file written by [`RankQuant::write`]. + /// Load a `RankQuant` index from a `.ovrq` file written by [`RankQuant::write`] + /// (legacy `.tvrq` files are also accepted). /// /// `path` is forwarded to the filesystem unmodified — no `..` / traversal /// sanitisation — so treat it as trusted input (see the module docstring). @@ -1154,7 +1156,7 @@ impl Bitmap { self.inner.is_empty() } - /// Serialise the bitmap index to a `.tvbm` file. + /// Serialise the bitmap index to a `.ovbm` file. /// /// `path` is forwarded to the filesystem unmodified — no `..` / traversal /// sanitisation — so treat it as trusted input (see the module docstring). @@ -1164,7 +1166,8 @@ impl Bitmap { .map_err(|e| pyo3::exceptions::PyIOError::new_err(e.to_string())) } - /// Load a `Bitmap` index from a `.tvbm` file written by [`Bitmap::write`]. + /// Load a `Bitmap` index from a `.ovbm` file written by [`Bitmap::write`] + /// (legacy `.tvbm` files are also accepted). /// /// `path` is forwarded to the filesystem unmodified — no `..` / traversal /// sanitisation — so treat it as trusted input (see the module docstring). @@ -1398,8 +1401,8 @@ impl SignBitmap { self.inner.is_empty() } - /// Persist the sign-bitmap payload to a `.tvsb` file. Format: 13-byte header - /// (`TVSB` magic + version + dim + n_vectors) + LE u64 bitmaps. + /// Persist the sign-bitmap payload to a `.ovsb` file. Format: 13-byte header + /// (`OVSB` magic + version + dim + n_vectors) + LE u64 bitmaps. /// /// `path` is forwarded to the filesystem unmodified — no `..` / traversal /// sanitisation — so treat it as trusted input (see the module docstring). @@ -1409,9 +1412,10 @@ impl SignBitmap { .map_err(|e| pyo3::exceptions::PyIOError::new_err(e.to_string())) } - /// Load a `SignBitmap` from a `.tvsb` file previously written by - /// [`SignBitmap::write`]. 
Raises `IOError` if the file is missing, malformed, - /// or its payload length disagrees with the header-declared shape. + /// Load a `SignBitmap` from a `.ovsb` file previously written by + /// [`SignBitmap::write`] (legacy `.tvsb` files are also accepted). Raises + /// `IOError` if the file is missing, malformed, or its payload length + /// disagrees with the header-declared shape. /// /// `path` is forwarded to the filesystem unmodified — no `..` / traversal /// sanitisation — so treat it as trusted input (see the module docstring). @@ -1683,7 +1687,7 @@ fn search_asymmetric_byte_lut<'py>( /// /// This rank-transforms and buckets the raw `corpus`/`queries` matrices on the /// fly, so it supports non-byte-aligned widths such as `bits=3` without changing -/// `RankQuant` storage or `.tvrq` persistence. Returns `(scores, indices)` with +/// `RankQuant` storage or `.ovrq` persistence. Returns `(scores, indices)` with /// the same shape contract as `RankQuant.search`. #[pyfunction] fn rankquant_eval_search<'py>( diff --git a/ordvec-python/tests/test_bitmap.py b/ordvec-python/tests/test_bitmap.py index f632b7fb..e0be7808 100644 --- a/ordvec-python/tests/test_bitmap.py +++ b/ordvec-python/tests/test_bitmap.py @@ -139,7 +139,7 @@ def test_save_load_roundtrip(tmp_path): idx = Bitmap(dim=128, n_top=32) idx.add(vectors) - path = str(tmp_path / "idx.tvbm") + path = str(tmp_path / "idx.ovbm") idx.write(path) loaded = Bitmap.load(path) @@ -157,7 +157,7 @@ def test_save_load_roundtrip(tmp_path): def test_load_rejects_nonexistent_file(): with pytest.raises(IOError): - Bitmap.load("/nonexistent/path/does-not-exist.tvbm") + Bitmap.load("/nonexistent/path/does-not-exist.ovbm") def test_invalid_n_top_rejected(): @@ -236,7 +236,7 @@ def test_add_float64_is_coerced(): def test_dim_above_u16_max_rejected(): # dim = 65536 is a multiple of 64 but exceeds u16::MAX; the binding must # reject it with a clean ValueError (mirrors the core Bitmap::new guard and - # the .tvbm loader cap) 
rather than defer to a Rust panic on add/search. + # the .ovbm loader cap) rather than defer to a Rust panic on add/search. with pytest.raises(ValueError, match="u16 rank invariant"): Bitmap(dim=65_536, n_top=256) diff --git a/ordvec-python/tests/test_rank.py b/ordvec-python/tests/test_rank.py index 0be221e3..5531a529 100644 --- a/ordvec-python/tests/test_rank.py +++ b/ordvec-python/tests/test_rank.py @@ -117,7 +117,7 @@ def test_save_load_roundtrip(tmp_path): idx = Rank(dim=128) idx.add(vectors) - path = str(tmp_path / "idx.tvr") + path = str(tmp_path / "idx.ovr") idx.write(path) loaded = Rank.load(path) @@ -133,7 +133,7 @@ def test_save_load_roundtrip(tmp_path): def test_load_rejects_nonexistent_file(): with pytest.raises(IOError): - Rank.load("/nonexistent/path/does-not-exist.tvr") + Rank.load("/nonexistent/path/does-not-exist.ovr") def test_empty_index_search_does_not_panic(): diff --git a/ordvec-python/tests/test_rank_quant.py b/ordvec-python/tests/test_rank_quant.py index 79ef6764..cdd0c5fd 100644 --- a/ordvec-python/tests/test_rank_quant.py +++ b/ordvec-python/tests/test_rank_quant.py @@ -224,7 +224,7 @@ def test_save_load_roundtrip(tmp_path, bits): idx = RankQuant(dim=128, bits=bits) idx.add(vectors) - path = str(tmp_path / f"idx_b{bits}.tvrq") + path = str(tmp_path / f"idx_b{bits}.ovrq") idx.write(path) loaded = RankQuant.load(path) @@ -242,7 +242,7 @@ def test_save_load_roundtrip(tmp_path, bits): def test_load_rejects_nonexistent_file(): with pytest.raises(IOError): - RankQuant.load("/nonexistent/path/does-not-exist.tvrq") + RankQuant.load("/nonexistent/path/does-not-exist.ovrq") @pytest.mark.parametrize("bits", [1, 2, 4]) diff --git a/ordvec-python/tests/test_redteam_fuzz.py b/ordvec-python/tests/test_redteam_fuzz.py index 21fba014..0d2b168c 100644 --- a/ordvec-python/tests/test_redteam_fuzz.py +++ b/ordvec-python/tests/test_redteam_fuzz.py @@ -867,8 +867,8 @@ def _write_real_rank(path: str) -> bytes: def 
test_rank_load_header_only_truncated_io_error(tmp_path): - data = _write_real_rank(str(tmp_path / "real.tvr")) - p = str(tmp_path / "trunc.tvr") + data = _write_real_rank(str(tmp_path / "real.ovr")) + p = str(tmp_path / "trunc.ovr") with open(p, "wb") as f: f.write(data[:13]) # header, zero payload with pytest.raises(IOError): @@ -876,8 +876,8 @@ def test_rank_load_header_only_truncated_io_error(tmp_path): def test_rank_load_mid_payload_truncated_io_error(tmp_path): - data = _write_real_rank(str(tmp_path / "real.tvr")) - p = str(tmp_path / "half.tvr") + data = _write_real_rank(str(tmp_path / "real.ovr")) + p = str(tmp_path / "half.ovr") with open(p, "wb") as f: f.write(data[: len(data) // 2]) with pytest.raises(IOError): @@ -887,8 +887,8 @@ def test_rank_load_mid_payload_truncated_io_error(tmp_path): def test_rank_load_trailing_bytes_io_error(tmp_path): # A structurally-valid file with extra trailing bytes is rejected (v1 has no # footer) — guards against record-smuggling past a smaller declared payload. - data = _write_real_rank(str(tmp_path / "real.tvr")) - p = str(tmp_path / "ext.tvr") + data = _write_real_rank(str(tmp_path / "real.ovr")) + p = str(tmp_path / "ext.ovr") with open(p, "wb") as f: f.write(data + b"\x00" * 64) with pytest.raises(IOError): @@ -899,9 +899,9 @@ def test_rank_load_forged_huge_n_vectors_io_error_no_oom(tmp_path): # Forge n_vectors (bytes 9..13) to ~268M into a tiny file. The DoS-alloc # hypothesis: a naive loader allocates n_vectors*dim*2 up front. The loader # must reject (MAX_VECTORS / payload-mismatch) BEFORE allocating. - data = bytearray(_write_real_rank(str(tmp_path / "real.tvr"))) + data = bytearray(_write_real_rank(str(tmp_path / "real.ovr"))) data[9:13] = struct.pack(") -> std::io::Result<()> { crate::rank_io::write_bitmap(path, self.dim, self.n_top, self.n_vectors, &self.bitmaps) } - /// Load from a `.tvbm` file produced by [`Self::write`]. + /// Load from a `.ovbm` file produced by [`Self::write`]. 
+ /// + /// Legacy `.tvbm` files (magic `TVBM`) written by older versions of this + /// crate are also accepted; newly written files use the `OVBM` magic. /// /// Returns `io::Error::InvalidData` on any constructor-invariant /// violation (`load_bitmap` already validates dim/n_top/n_vectors; @@ -535,14 +538,14 @@ impl Bitmap { let expected = n_vectors.checked_mul(qpv).ok_or_else(|| { std::io::Error::new( std::io::ErrorKind::InvalidData, - "TVBM n_vectors * dim/64 overflows usize", + "OVBM n_vectors * dim/64 overflows usize", ) })?; if bitmaps.len() != expected { return Err(std::io::Error::new( std::io::ErrorKind::InvalidData, format!( - "TVBM payload length {} does not match expected {expected} u64 lanes", + "OVBM payload length {} does not match expected {expected} u64 lanes", bitmaps.len(), ), )); diff --git a/src/quant.rs b/src/quant.rs index 1d022aea..ee9f0dbd 100644 --- a/src/quant.rs +++ b/src/quant.rs @@ -861,10 +861,10 @@ impl RankQuant { last } - /// Persist to a `.tvrq` file. Format: 14-byte header + packed bytes. + /// Persist to a `.ovrq` file. Format: 14-byte header + packed bytes. /// /// # `b=8` - /// The `.tvrq` on-disk format and its loader currently support only + /// The `.ovrq` on-disk format and its loader currently support only /// `bits ∈ {1, 2, 4}`. `b=8` is an in-memory evidence/refinement surface /// in this phase; persisting it is a follow-up. 
To avoid writing a file /// that [`Self::load`] would then reject (a silent broken round-trip), @@ -874,7 +874,7 @@ impl RankQuant { if self.bits == 8 { return Err(std::io::Error::new( std::io::ErrorKind::Unsupported, - "RankQuant b=8 persistence is not supported yet (the .tvrq loader \ + "RankQuant b=8 persistence is not supported yet (the .ovrq loader \ accepts bits ∈ {1, 2, 4}); b=8 is an in-memory evidence surface \ in this phase", )); @@ -882,7 +882,10 @@ impl RankQuant { crate::rank_io::write_rankquant(path, self.bits, self.dim, self.n_vectors, &self.packed) } - /// Load from a `.tvrq` file produced by [`Self::write`]. + /// Load from a `.ovrq` file produced by [`Self::write`]. + /// + /// Legacy `.tvrq` files (magic `TVRQ`) written by older versions of this + /// crate are also accepted; newly written files use the `OVRQ` magic. /// /// Re-runs the same constructor invariants `RankQuant::new` /// enforces (`bits ∈ {1, 2, 4}`, `dim % (1 << bits) == 0`, @@ -897,7 +900,7 @@ impl RankQuant { return Err(std::io::Error::new( std::io::ErrorKind::InvalidData, format!( - "TVRQ dim {dim} is not a multiple of 2^bits = {n_buckets}; \ + "OVRQ dim {dim} is not a multiple of 2^bits = {n_buckets}; \ constant-composition invariant violated" ), )); @@ -906,7 +909,7 @@ impl RankQuant { if dim % codes_per_byte != 0 { return Err(std::io::Error::new( std::io::ErrorKind::InvalidData, - format!("TVRQ dim {dim} is not a multiple of codes_per_byte = {codes_per_byte}",), + format!("OVRQ dim {dim} is not a multiple of codes_per_byte = {codes_per_byte}",), )); } // `checked_mul` (not `saturating`): on a 32-bit target the byte count @@ -917,7 +920,7 @@ impl RankQuant { let nv_dim = n_vectors.checked_mul(dim).ok_or_else(|| { std::io::Error::new( std::io::ErrorKind::InvalidData, - "TVRQ n_vectors * dim overflows usize", + "OVRQ n_vectors * dim overflows usize", ) })?; let expected_bytes = nv_dim @@ -926,14 +929,14 @@ impl RankQuant { .ok_or_else(|| { std::io::Error::new( 
std::io::ErrorKind::InvalidData, - "TVRQ (n_vectors * dim) * bits overflows usize", + "OVRQ (n_vectors * dim) * bits overflows usize", ) })?; if packed.len() != expected_bytes { return Err(std::io::Error::new( std::io::ErrorKind::InvalidData, format!( - "TVRQ payload length {} does not match expected {expected_bytes}", + "OVRQ payload length {} does not match expected {expected_bytes}", packed.len(), ), )); @@ -1453,7 +1456,7 @@ fn validate_finite(values: &[f32], name: &'static str) -> Result<(), OrdvecError /// Standalone symmetric RankQuant-style eval search for arbitrary bit widths. /// -/// This does **not** use [`RankQuant`] storage and does not change the `.tvrq` +/// This does **not** use [`RankQuant`] storage and does not change the `.ovrq` /// packing contract. It rank-transforms `corpus` and `queries`, buckets each /// rank into `1 << bits` equal-width bins, mean-centres bucket ids, normalises /// by the **empirical** norm for that `(dim, bits)` (the exact L2 norm of the diff --git a/src/rank.rs b/src/rank.rs index c74bba8f..10cd1e2b 100644 --- a/src/rank.rs +++ b/src/rank.rs @@ -540,12 +540,15 @@ impl Rank { last } - /// Persist to a `.tvr` file. Format: 13-byte header + u16 ranks LE. + /// Persist to a `.ovr` file. Format: 13-byte header + u16 ranks LE. pub fn write(&self, path: impl AsRef) -> std::io::Result<()> { crate::rank_io::write_rank(path, self.dim, self.n_vectors, &self.ranks) } - /// Load from a `.tvr` file produced by [`Self::write`]. + /// Load from a `.ovr` file produced by [`Self::write`]. + /// + /// Legacy `.tvr` files (magic `TVR1`) written by older versions of this + /// crate are also accepted; newly written files use the `OVR1` magic. 
/// /// Returns `io::Error` (kind `InvalidData`) on any structural /// inconsistency between the header and the payload (`load_rank` @@ -560,13 +563,13 @@ impl Rank { let expected = n_vectors.checked_mul(dim).ok_or_else(|| { std::io::Error::new( std::io::ErrorKind::InvalidData, - "TVR1 n_vectors * dim overflows usize", + "OVR1 n_vectors * dim overflows usize", ) })?; if ranks.len() != expected { return Err(std::io::Error::new( std::io::ErrorKind::InvalidData, - "TVR1 payload length does not match dim * n_vectors", + "OVR1 payload length does not match dim * n_vectors", )); } Ok(Self { diff --git a/src/rank_io.rs b/src/rank_io.rs index 8e3be947..3a1e3418 100644 --- a/src/rank_io.rs +++ b/src/rank_io.rs @@ -1,10 +1,13 @@ //! Read/write ordinal/sign index files. //! -//! Four formats live here, each self-describing via a 4-byte magic: -//! * `.tvr` — [`Rank`](crate::Rank) — magic `TVR1` -//! * `.tvrq` — [`RankQuant`](crate::RankQuant) — magic `TVRQ` -//! * `.tvbm` — [`Bitmap`](crate::Bitmap) — magic `TVBM` -//! * `.tvsb` — [`SignBitmap`](crate::SignBitmap) — magic `TVSB` +//! Four formats live here, each self-describing via a 4-byte magic. Files +//! written by this crate use the **`.ov*` / `OV*`** magics (the ordvec format); +//! the legacy turbovec-era **`.tv*` / `TV*`** magics are still accepted on load +//! for backward compatibility, but are never written: +//! * `.ovr` (legacy `.tvr`) — [`Rank`](crate::Rank) — magic `OVR1` (also reads `TVR1`) +//! * `.ovrq` (legacy `.tvrq`) — [`RankQuant`](crate::RankQuant) — magic `OVRQ` (also reads `TVRQ`) +//! * `.ovbm` (legacy `.tvbm`) — [`Bitmap`](crate::Bitmap) — magic `OVBM` (also reads `TVBM`) +//! * `.ovsb` (legacy `.tvsb`) — [`SignBitmap`](crate::SignBitmap) — magic `OVSB` (also reads `TVSB`) //! //! All formats are little-endian. Headers are small fixed-size structs //! 
followed by a single contiguous payload (the rank / packed / bitmap @@ -59,6 +62,14 @@ use std::fs::File; use std::io::{self, BufReader, BufWriter, Read, Seek, Write}; use std::path::Path; +// Current ordvec magics — written by this crate going forward. +const OVR_MAGIC: &[u8; 4] = b"OVR1"; +const OVRQ_MAGIC: &[u8; 4] = b"OVRQ"; +const OVBM_MAGIC: &[u8; 4] = b"OVBM"; +const OVSB_MAGIC: &[u8; 4] = b"OVSB"; +// Legacy turbovec-era magics — still accepted on load for backward +// compatibility, never written. Files produced before the ordvec rebrand carry +// these; loaders accept either the `OV*` or the matching `TV*` magic. const TVR_MAGIC: &[u8; 4] = b"TVR1"; const TVRQ_MAGIC: &[u8; 4] = b"TVRQ"; const TVBM_MAGIC: &[u8; 4] = b"TVBM"; @@ -345,10 +356,10 @@ pub fn probe_index_metadata(path: impl AsRef) -> io::Result let mut f = BufReader::new(file); let magic = read_magic(&mut f, "ordvec index")?; match &magic { - TVR_MAGIC => probe_rank_metadata(&mut f, file_size_bytes), - TVRQ_MAGIC => probe_rankquant_metadata(&mut f, file_size_bytes), - TVBM_MAGIC => probe_bitmap_metadata(&mut f, file_size_bytes), - TVSB_MAGIC => probe_sign_bitmap_metadata(&mut f, file_size_bytes), + OVR_MAGIC | TVR_MAGIC => probe_rank_metadata(&mut f, file_size_bytes), + OVRQ_MAGIC | TVRQ_MAGIC => probe_rankquant_metadata(&mut f, file_size_bytes), + OVBM_MAGIC | TVBM_MAGIC => probe_bitmap_metadata(&mut f, file_size_bytes), + OVSB_MAGIC | TVSB_MAGIC => probe_sign_bitmap_metadata(&mut f, file_size_bytes), _ => Err(invalid("unknown ordvec index magic")), } } @@ -494,7 +505,7 @@ pub(crate) fn write_rank( check_payload_bytes(payload_bytes)?; assert_eq!(ranks.len(), payload_bytes / 2); let mut f = BufWriter::new(File::create(path)?); - f.write_all(TVR_MAGIC)?; + f.write_all(OVR_MAGIC)?; f.write_all(&[VERSION])?; f.write_all(&(dim as u32).to_le_bytes())?; f.write_all(&(n_vectors as u32).to_le_bytes())?; @@ -515,8 +526,8 @@ pub(crate) fn load_rank(path: impl AsRef) -> io::Result<(usize, usize, Vec let 
file_len = file.metadata()?.len(); let mut f = BufReader::new(file); let magic = read_magic(&mut f, "TVR1")?; - if &magic != TVR_MAGIC { - return Err(invalid("not a TVR1 file: wrong magic")); + if &magic != OVR_MAGIC && &magic != TVR_MAGIC { + return Err(invalid("not an OVR1/TVR1 (Rank) file: wrong magic")); } read_version(&mut f, "TVR1")?; let dim = read_u32_le(&mut f, "TVR1", "dim")? as usize; @@ -587,7 +598,7 @@ pub(crate) fn write_rankquant( check_payload_bytes(payload_bytes)?; assert_eq!(packed.len(), payload_bytes); let mut f = BufWriter::new(File::create(path)?); - f.write_all(TVRQ_MAGIC)?; + f.write_all(OVRQ_MAGIC)?; f.write_all(&[VERSION])?; f.write_all(&[bits])?; f.write_all(&(dim as u32).to_le_bytes())?; @@ -607,8 +618,8 @@ pub(crate) fn load_rankquant(path: impl AsRef) -> io::Result<(u8, usize, u let file_len = file.metadata()?.len(); let mut f = BufReader::new(file); let magic = read_magic(&mut f, "TVRQ")?; - if &magic != TVRQ_MAGIC { - return Err(invalid("not a TVRQ file: wrong magic")); + if &magic != OVRQ_MAGIC && &magic != TVRQ_MAGIC { + return Err(invalid("not an OVRQ/TVRQ (RankQuant) file: wrong magic")); } read_version(&mut f, "TVRQ")?; let bits = read_u8_field(&mut f, "TVRQ", "bits")?; @@ -697,7 +708,7 @@ pub(crate) fn write_bitmap( check_payload_bytes(payload_bytes)?; assert_eq!(bitmaps.len(), payload_bytes / 8); let mut f = BufWriter::new(File::create(path)?); - f.write_all(TVBM_MAGIC)?; + f.write_all(OVBM_MAGIC)?; f.write_all(&[VERSION])?; f.write_all(&(dim as u32).to_le_bytes())?; f.write_all(&(n_top as u32).to_le_bytes())?; @@ -719,8 +730,8 @@ pub(crate) fn load_bitmap(path: impl AsRef) -> io::Result<(usize, usize, u let file_len = file.metadata()?.len(); let mut f = BufReader::new(file); let magic = read_magic(&mut f, "TVBM")?; - if &magic != TVBM_MAGIC { - return Err(invalid("not a TVBM file: wrong magic")); + if &magic != OVBM_MAGIC && &magic != TVBM_MAGIC { + return Err(invalid("not an OVBM/TVBM (Bitmap) file: wrong magic")); } 
read_version(&mut f, "TVBM")?; let dim = read_u32_le(&mut f, "TVBM", "dim")? as usize; @@ -788,7 +799,7 @@ pub(crate) fn write_sign_bitmap( check_payload_bytes(payload_bytes)?; assert_eq!(bitmaps.len(), payload_bytes / 8); let mut f = BufWriter::new(File::create(path)?); - f.write_all(TVSB_MAGIC)?; + f.write_all(OVSB_MAGIC)?; f.write_all(&[VERSION])?; f.write_all(&(dim as u32).to_le_bytes())?; f.write_all(&(n_vectors as u32).to_le_bytes())?; @@ -824,8 +835,8 @@ pub(crate) fn load_sign_bitmap(path: impl AsRef) -> io::Result<(usize, usi let file_len = file.metadata()?.len(); let mut f = BufReader::new(file); let magic = read_magic(&mut f, "TVSB")?; - if &magic != TVSB_MAGIC { - return Err(invalid("not a TVSB file: wrong magic")); + if &magic != OVSB_MAGIC && &magic != TVSB_MAGIC { + return Err(invalid("not an OVSB/TVSB (SignBitmap) file: wrong magic")); } read_version(&mut f, "TVSB")?; let dim = read_u32_le(&mut f, "TVSB", "dim")? as usize; diff --git a/src/sign_bitmap.rs b/src/sign_bitmap.rs index 4f1ce09e..e27d4a36 100644 --- a/src/sign_bitmap.rs +++ b/src/sign_bitmap.rs @@ -134,7 +134,7 @@ impl SignBitmap { /// can be persisted via [`Self::write`] and reloaded via /// [`Self::load`] — without it, `new` could produce indices the /// loader refuses to round-trip (the issue Codex caught after the - /// first `.tvsb` revision used [`crate::rank_io::MAX_DIM`]'s + /// first `.ovsb` revision used [`crate::rank_io::MAX_DIM`]'s /// rank-storage `u16::MAX` cap, which doesn't apply to sign /// bitmaps). pub fn new(dim: usize) -> Self { @@ -454,12 +454,15 @@ impl SignBitmap { last } - /// Persist to a `.tvsb` file. Format: 13-byte header + LE u64 bitmaps. + /// Persist to a `.ovsb` file. Format: 13-byte header + LE u64 bitmaps. pub fn write(&self, path: impl AsRef) -> std::io::Result<()> { crate::rank_io::write_sign_bitmap(path, self.dim, self.n_vectors, &self.bitmaps) } - /// Load from a `.tvsb` file produced by [`Self::write`]. 
+ /// Load from a `.ovsb` file produced by [`Self::write`]. + /// + /// Legacy `.tvsb` files (magic `TVSB`) written by older versions of this + /// crate are also accepted; newly written files use the `OVSB` magic. /// /// Returns `io::Error::InvalidData` on any constructor-invariant /// violation. `load_sign_bitmap` already validates dim and n_vectors; @@ -474,14 +477,14 @@ impl SignBitmap { let expected = n_vectors.checked_mul(qpv).ok_or_else(|| { std::io::Error::new( std::io::ErrorKind::InvalidData, - "TVSB n_vectors * dim/64 overflows usize", + "OVSB n_vectors * dim/64 overflows usize", ) })?; if bitmaps.len() != expected { return Err(std::io::Error::new( std::io::ErrorKind::InvalidData, format!( - "TVSB payload length {} does not match expected {expected} u64 lanes", + "OVSB payload length {} does not match expected {expected} u64 lanes", bitmaps.len(), ), )); diff --git a/tests/persistence_compat.rs b/tests/persistence_compat.rs index 40684e33..dee669dd 100644 --- a/tests/persistence_compat.rs +++ b/tests/persistence_compat.rs @@ -100,7 +100,7 @@ fn assert_rejects_version_and_trailing_bytes( #[test] fn rank_v1_fixture_bytes_are_stable() { let expected = [ - b'T', b'V', b'R', b'1', 1, 4, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 2, 0, 3, 0, + b'O', b'V', b'R', b'1', 1, 4, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 2, 0, 3, 0, ]; let path = tmp("rank"); @@ -129,7 +129,7 @@ fn rank_v1_fixture_bytes_are_stable() { #[test] fn rankquant_v1_fixture_bytes_are_stable() { let expected = [ - b'T', b'V', b'R', b'Q', 1, 2, 8, 0, 0, 0, 1, 0, 0, 0, 0x05, 0xaf, + b'O', b'V', b'R', b'Q', 1, 2, 8, 0, 0, 0, 1, 0, 0, 0, 0x05, 0xaf, ]; let path = tmp("rankquant"); @@ -159,7 +159,7 @@ fn rankquant_v1_fixture_bytes_are_stable() { #[test] fn bitmap_v1_fixture_bytes_are_stable() { let expected = [ - b'T', b'V', b'B', b'M', 1, 64, 0, 0, 0, 2, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xc0, + b'O', b'V', b'B', b'M', 1, 64, 0, 0, 0, 2, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xc0, ]; let path = 
tmp("bitmap"); @@ -190,7 +190,7 @@ fn bitmap_v1_fixture_bytes_are_stable() { #[test] fn sign_bitmap_v1_fixture_bytes_are_stable() { let expected = [ - b'T', b'V', b'S', b'B', 1, 64, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0x80, + b'O', b'V', b'S', b'B', 1, 64, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0x80, ]; let path = tmp("sign_bitmap"); @@ -220,3 +220,98 @@ fn sign_bitmap_v1_fixture_bytes_are_stable() { SignBitmap::load(path) }); } + +// Back-compat tests: files carrying the legacy TV* magic still load correctly. +// The fixture body (everything after the 4-byte magic) is identical; only the +// magic prefix differs. These prove the "accept both OV* and TV* on load, +// never write TV*" contract is upheld at the public index-type level. + +#[test] +fn rank_v1_legacy_tv_magic_still_loads() { + // Fixture body from `rank_v1_fixture_bytes_are_stable`, magic swapped TV*. + let ov_fixture: &[u8] = &[ + b'O', b'V', b'R', b'1', 1, 4, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 2, 0, 3, 0, + ]; + let mut legacy = vec![b'T', b'V', b'R', b'1']; + legacy.extend_from_slice(&ov_fixture[4..]); + + let path = tmp("rank_legacy_tv"); + write_bytes(&path, &legacy); + let loaded = Rank::load(&path).unwrap(); + assert_eq!(loaded.dim(), 4); + assert_eq!(loaded.len(), 1); + + // Also confirm the OV* fixture loads to the same shape (round-trip parity). 
+ let ov_path = tmp("rank_ov"); + write_bytes(&ov_path, ov_fixture); + let ov_loaded = Rank::load(&ov_path).unwrap(); + assert_eq!(loaded.dim(), ov_loaded.dim()); + assert_eq!(loaded.len(), ov_loaded.len()); +} + +#[test] +fn rankquant_v1_legacy_tv_magic_still_loads() { + let ov_fixture: &[u8] = &[ + b'O', b'V', b'R', b'Q', 1, 2, 8, 0, 0, 0, 1, 0, 0, 0, 0x05, 0xaf, + ]; + let mut legacy = vec![b'T', b'V', b'R', b'Q']; + legacy.extend_from_slice(&ov_fixture[4..]); + + let path = tmp("rankquant_legacy_tv"); + write_bytes(&path, &legacy); + let loaded = RankQuant::load(&path).unwrap(); + assert_eq!(loaded.dim(), 8); + assert_eq!(loaded.len(), 1); + assert_eq!(loaded.bits(), 2); + + let ov_path = tmp("rankquant_ov"); + write_bytes(&ov_path, ov_fixture); + let ov_loaded = RankQuant::load(&ov_path).unwrap(); + assert_eq!(loaded.dim(), ov_loaded.dim()); + assert_eq!(loaded.len(), ov_loaded.len()); + assert_eq!(loaded.bits(), ov_loaded.bits()); +} + +#[test] +fn bitmap_v1_legacy_tv_magic_still_loads() { + let ov_fixture: &[u8] = &[ + b'O', b'V', b'B', b'M', 1, 64, 0, 0, 0, 2, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xc0, + ]; + let mut legacy = vec![b'T', b'V', b'B', b'M']; + legacy.extend_from_slice(&ov_fixture[4..]); + + let path = tmp("bitmap_legacy_tv"); + write_bytes(&path, &legacy); + let loaded = Bitmap::load(&path).unwrap(); + assert_eq!(loaded.dim(), 64); + assert_eq!(loaded.len(), 1); + assert_eq!(loaded.n_top(), 2); + + let ov_path = tmp("bitmap_ov"); + write_bytes(&ov_path, ov_fixture); + let ov_loaded = Bitmap::load(&ov_path).unwrap(); + assert_eq!(loaded.dim(), ov_loaded.dim()); + assert_eq!(loaded.len(), ov_loaded.len()); + assert_eq!(loaded.n_top(), ov_loaded.n_top()); +} + +#[test] +fn sign_bitmap_v1_legacy_tv_magic_still_loads() { + let ov_fixture: &[u8] = &[ + b'O', b'V', b'S', b'B', 1, 64, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0x80, + ]; + let mut legacy = vec![b'T', b'V', b'S', b'B']; + legacy.extend_from_slice(&ov_fixture[4..]); + + let path = 
tmp("sign_bitmap_legacy_tv"); + write_bytes(&path, &legacy); + let loaded = SignBitmap::load(&path).unwrap(); + assert_eq!(loaded.dim(), 64); + assert_eq!(loaded.len(), 1); + + let ov_path = tmp("sign_bitmap_ov"); + write_bytes(&ov_path, ov_fixture); + let ov_loaded = SignBitmap::load(&ov_path).unwrap(); + assert_eq!(loaded.dim(), ov_loaded.dim()); + assert_eq!(loaded.len(), ov_loaded.len()); +} From ccf963d5d1840bfbfad1355b4e87e5dacd86922d Mon Sep 17 00:00:00 2001 From: Nelson Spence Date: Sun, 14 Jun 2026 21:36:28 -0500 Subject: [PATCH 2/4] style: rustfmt ordvec-ffi (match layout after error-string shortening) The shorter UNSUPPORTED_FORMAT message in info_for_metadata let rustfmt collapse `info.kind = match {...}` onto one line. Formatting only, no logic change. Signed-off-by: Nelson Spence --- ordvec-ffi/src/lib.rs | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/ordvec-ffi/src/lib.rs b/ordvec-ffi/src/lib.rs index 8a385808..f4bf4639 100644 --- a/ordvec-ffi/src/lib.rs +++ b/ordvec-ffi/src/lib.rs @@ -364,15 +364,16 @@ fn info_for_handle(handle: &IndexHandle) -> ordvec_index_info_t { fn info_for_metadata(meta: &IndexMetadata) -> Result { let mut info = default_info(); - info.kind = - match meta.kind { - IndexKind::RankQuant => ORDVEC_INDEX_KIND_RANK_QUANT, - IndexKind::Bitmap => ORDVEC_INDEX_KIND_BITMAP, - IndexKind::Rank | IndexKind::SignBitmap => return Err(FfiError::new( + info.kind = match meta.kind { + IndexKind::RankQuant => ORDVEC_INDEX_KIND_RANK_QUANT, + IndexKind::Bitmap => ORDVEC_INDEX_KIND_BITMAP, + IndexKind::Rank | IndexKind::SignBitmap => { + return Err(FfiError::new( ORDVEC_STATUS_UNSUPPORTED_FORMAT, "ABI v1 supports metadata probes only for RankQuant and Bitmap indexes", - )), - }; + )) + } + }; info.format_version = u32::from(meta.format_version); info.dim = meta.dim as u64; info.vector_count = meta.vector_count as u64; From c4b26f247c75f76bf02dc67142944264d26fd44c Mon Sep 17 00:00:00 2001 From: Nelson Spence Date: 
Sun, 14 Jun 2026 21:53:00 -0500 Subject: [PATCH 3/4] feat: promote RankQuantFastscan to public + add .ovfs persistence MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Un-hides RankQuantFastscan (it was `#[doc(hidden)]` on both the re-export and the struct) and gives it persistence, building on the OV* format convention. - Un-hide: remove `#[doc(hidden)]` from the crate-root re-export and the struct; reword its docs as a stable-but-specialized b=2 minimum-latency scan path (not the headline retrieval surface). Add a discoverable crate-root mention. - Persistence (net-new): `RankQuantFastscan::write`/`load` via a new `.ovfs` format (magic `OVFS`; 13-byte header + the opaque block-32 packed payload). rank_io gains `write_fastscan`/`load_fastscan`/`fastscan_payload_bytes` (pub(crate)); the loader validates magic, `dim % 4 == 0`, n_vectors, exact payload length (no trailing bytes), with the MAX_PAYLOAD pre-File::create guard. OVFS is new in the ordvec format — no legacy TV* counterpart. - Tests: write/load round-trip scans byte-identically; empty-index round-trip; OVFS-magic-on-write; rejects wrong magic / trailing bytes / dim%4!=0. - Fuzz: new `load_fastscan` libFuzzer target (parity with the other 4 loaders). Kernels were already AVX-512/AVX2/scalar-optimized; this PR is promotion + persistence only, no kernel changes. Gate: fmt + clippy -D warnings + rustdoc -D warnings + full test suites + `cargo +nightly fuzz build load_fastscan`. 
Signed-off-by: Nelson Spence --- fuzz/Cargo.toml | 7 ++ fuzz/fuzz_targets/load_fastscan.rs | 27 +++++++ src/fastscan.rs | 42 ++++++++-- src/lib.rs | 17 +++-- src/rank_io.rs | 82 +++++++++++++++++++- tests/index/fastscan.rs | 118 +++++++++++++++++++++++++++++ 6 files changed, 279 insertions(+), 14 deletions(-) create mode 100644 fuzz/fuzz_targets/load_fastscan.rs diff --git a/fuzz/Cargo.toml b/fuzz/Cargo.toml index 116cfc9e..ae82c790 100644 --- a/fuzz/Cargo.toml +++ b/fuzz/Cargo.toml @@ -49,6 +49,13 @@ test = false doc = false bench = false +[[bin]] +name = "load_fastscan" +path = "fuzz_targets/load_fastscan.rs" +test = false +doc = false +bench = false + # Hot-path targets (beyond the loaders): the ingest + asymmetric-search compute # path, and the write -> load round-trip the loader targets never reach. [[bin]] diff --git a/fuzz/fuzz_targets/load_fastscan.rs b/fuzz/fuzz_targets/load_fastscan.rs new file mode 100644 index 00000000..85c6cbeb --- /dev/null +++ b/fuzz/fuzz_targets/load_fastscan.rs @@ -0,0 +1,27 @@ +//! libFuzzer target for the `.ovfs` / `OVFS` loader (the FastScan b=2 +//! persistence format — new in the ordvec format, no legacy `TV*` magic), +//! driven through the public `ordvec::RankQuantFastscan::load` entry point. +//! +//! The low-level `rank_io::load_fastscan` parser is crate-internal +//! (`pub(crate)`), so the fuzzer exercises it through `RankQuantFastscan::load` +//! — which runs that exact loader (the full public load path). `load` takes a +//! `&Path` and the only public load entry points are path-based (issue #6), so +//! a shared process-local scratch file (see [`scratch`]) feeds the loader the +//! fuzz bytes without per-iteration `mkstemp`/`unlink` churn. +//! +//! Contract: on arbitrary bytes the loader must return `Ok(..)` or `Err(..)` — +//! never panic, abort, or read out of bounds. libFuzzer treats any panic/abort +//! as a crash, so simply letting the result drop is the assertion. 
+ +#![no_main] + +use libfuzzer_sys::fuzz_target; + +mod scratch; + +fuzz_target!(|data: &[u8]| { + scratch::with_scratch_file(data, |path| { + // The only thing under test: arbitrary bytes -> Ok | Err, no panic. + let _ = ordvec::RankQuantFastscan::load(path); + }); +}); diff --git a/src/fastscan.rs b/src/fastscan.rs index 5149bd5b..dd01b956 100644 --- a/src/fastscan.rs +++ b/src/fastscan.rs @@ -511,21 +511,22 @@ pub(crate) fn search_asymmetric_fastscan_b2( /// cleanly with incremental extend (tail padding within blocks /// would interleave with new docs). Subsequent `add()` calls panic; /// construct a new index for incremental scenarios. -/// - **No `swap_remove`, `write`, `load`** — the block-32 layout -/// makes byte-exact updates non-trivial. v2 follow-up. +/// - **No `swap_remove`** — the block-32 layout makes byte-exact in-place +/// updates non-trivial (a v2 follow-up). Persistence *is* supported: +/// [`write`](Self::write) / [`load`](Self::load) round-trip via the +/// `.ovfs` format. /// /// # Concurrency /// /// `search` takes `&self`; safe to call from multiple threads /// concurrently. /// -/// # Visibility +/// # Positioning /// -/// This type is re-exported `#[doc(hidden)]`: it is an optional scan -/// path, not part of the headline API. Prefer -/// [`RankQuant`](crate::RankQuant) unless you have -/// measured FastScan to win on your workload. -#[doc(hidden)] +/// A stable, documented public type, but a **specialized** one: it is the +/// minimum-latency b=2 scan path, not the headline retrieval API. Prefer +/// [`RankQuant`](crate::RankQuant) / [`Bitmap`](crate::Bitmap) / the two-stage +/// flow unless you have measured FastScan to win on your workload. pub struct RankQuantFastscan { dim: usize, n_vectors: usize, @@ -640,4 +641,29 @@ impl RankQuantFastscan { pub fn byte_size(&self) -> usize { self.packed_fs.len() } + + /// Persist this index to a `.ovfs` file (magic `OVFS`). 
+ /// + /// The on-disk form is a 13-byte header (`OVFS` magic, version, `dim`, + /// `n_vectors`) followed by the opaque block-32 packed FastScan payload. + /// This is a new ordvec format with no turbovec-era counterpart. Round-trip + /// is a type-level guarantee: [`Self::load`] reconstructs the same + /// `(dim, n_vectors)` and packed buffer this writes. + pub fn write(&self, path: impl AsRef) -> std::io::Result<()> { + crate::rank_io::write_fastscan(path, self.dim, self.n_vectors, &self.packed_fs) + } + + /// Load a `.ovfs` FastScan index previously written by [`Self::write`]. + /// + /// The loader validates the header and that the payload length is exactly + /// the block-32 size implied by `(dim, n_vectors)` (`dim % 4 == 0`, no + /// trailing bytes), so the returned index is consistent by construction. + pub fn load(path: impl AsRef) -> std::io::Result { + let (dim, n_vectors, packed_fs) = crate::rank_io::load_fastscan(path)?; + Ok(Self { + dim, + n_vectors, + packed_fs, + }) + } } diff --git a/src/lib.rs b/src/lib.rs index 4b90c3cb..400ff02c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -27,6 +27,12 @@ //! coordinate, set when the coordinate is positive) for sign-cosine //! candidate generation. //! +//! For b=2 specifically, [`RankQuantFastscan`] is a specialized companion to +//! [`RankQuant`] — a block-32 FastScan kernel (nibble LUT; AVX-512 → AVX2 → +//! scalar dispatch) for absolute-minimum stage-1 scan latency, trading 2× the +//! b=2 storage and 8-bit LUT scoring noise. Reach for it only when scan latency +//! is the binding constraint. +//! //! These four families are the retrieval surface. The `experimental` //! `MultiBucketBitmap` indexed contingency / projection API is a niche //! 
research/analysis substrate for the bilinear bucket-overlap decomposition — @@ -162,11 +168,12 @@ pub use const_weight_bitmap::{ choose, top_group_overlap_vector, BitmapNull, ConstantWeightBitmap, PackedConstantWeightBitmap, }; -// `RankQuantFastscan` is an optional FastScan b=2 scan path. It is -// re-exported `#[doc(hidden)]` at the crate root — reachable as -// `ordvec::RankQuantFastscan` for callers who opt in, but not -// advertised alongside the headline index types above. -#[doc(hidden)] +// `RankQuantFastscan` is a specialized b=2 FastScan scan path (block-32 nibble +// LUT, AVX-512 → AVX2 → scalar dispatch) for absolute-minimum stage-1 scan +// latency, at the cost of 2× the `RankQuant` b=2 storage and 8-bit LUT scoring +// noise. It is a stable, documented public type, but a *specialized* one — the +// headline retrieval surface is still `RankQuant` / `Bitmap` / two-stage; reach +// for FastScan only when scan latency at b=2 is the binding constraint. pub use fastscan::RankQuantFastscan; /// Whether the AVX-512 VPOPCNTDQ bitmap/sign scan kernels are active on this diff --git a/src/rank_io.rs b/src/rank_io.rs index 3a1e3418..8862dc6a 100644 --- a/src/rank_io.rs +++ b/src/rank_io.rs @@ -1,6 +1,6 @@ //! Read/write ordinal/sign index files. //! -//! Four formats live here, each self-describing via a 4-byte magic. Files +//! Five formats live here, each self-describing via a 4-byte magic. Files //! written by this crate use the **`.ov*` / `OV*`** magics (the ordvec format); //! the legacy turbovec-era **`.tv*` / `TV*`** magics are still accepted on load //! for backward compatibility, but are never written: @@ -8,6 +8,8 @@ //! * `.ovrq` (legacy `.tvrq`) — [`RankQuant`](crate::RankQuant) — magic `OVRQ` (also reads `TVRQ`) //! * `.ovbm` (legacy `.tvbm`) — [`Bitmap`](crate::Bitmap) — magic `OVBM` (also reads `TVBM`) //! * `.ovsb` (legacy `.tvsb`) — [`SignBitmap`](crate::SignBitmap) — magic `OVSB` (also reads `TVSB`) +//! 
* `.ovfs` — [`RankQuantFastscan`](crate::RankQuantFastscan) — magic `OVFS` +//! (new in the ordvec format; no legacy counterpart) //! //! All formats are little-endian. Headers are small fixed-size structs //! followed by a single contiguous payload (the rank / packed / bitmap @@ -67,6 +69,9 @@ const OVR_MAGIC: &[u8; 4] = b"OVR1"; const OVRQ_MAGIC: &[u8; 4] = b"OVRQ"; const OVBM_MAGIC: &[u8; 4] = b"OVBM"; const OVSB_MAGIC: &[u8; 4] = b"OVSB"; +// FastScan b=2 block-32 layout (`RankQuantFastscan`). New in the ordvec format — +// there is no turbovec-era counterpart, so it has no legacy magic. +const OVFS_MAGIC: &[u8; 4] = b"OVFS"; // Legacy turbovec-era magics — still accepted on load for backward // compatibility, never written. Files produced before the ordvec rebrand carry // these; loaders accept either the `OV*` or the matching `TV*` magic. @@ -858,6 +863,81 @@ pub(crate) fn load_sign_bitmap(path: impl AsRef) -> io::Result<(usize, usi Ok((dim, n_vectors, bitmaps)) } +// ------------------------------------------------------------------- +// RankQuantFastscan: b=2 block-32 FastScan layout. +// Header: magic(4) | version(1) | dim(u32 LE) | n_vectors(u32 LE) = 13 B +// Payload: n_blocks * (dim/2) * 32 bytes, n_blocks = ceil(n_vectors / 32). +// New ordvec format (no legacy TV* counterpart). +// ------------------------------------------------------------------- + +fn fastscan_payload_bytes(dim: usize, vector_count: usize) -> io::Result { + // FastScan b=2 packs 32 docs per block; each block holds `pairs * 32` bytes + // (`pairs = dim / 2`). `dim % 4 == 0` is enforced by the loader / constructor + // before this is called, so `dim / 2` is exact. An empty corpus has zero + // blocks and zero payload. 
+ let n_blocks = vector_count.div_ceil(32); + let pairs = dim / 2; + n_blocks + .checked_mul(pairs) + .and_then(|x| x.checked_mul(32)) + .ok_or_else(|| invalid("OVFS payload size overflows usize")) +} + +pub(crate) fn write_fastscan( + path: impl AsRef, + dim: usize, + n_vectors: usize, + packed_fs: &[u8], +) -> io::Result<()> { + // Enforce the loaders' MAX_PAYLOAD cap *before* File::create so a rejected + // oversized write never truncates an existing file. Defense-in-depth; the + // round-trip guarantee is type-level (see module docs). Mirrors load_fastscan. + let payload_bytes = fastscan_payload_bytes(dim, n_vectors)?; + check_payload_bytes(payload_bytes)?; + assert_eq!(packed_fs.len(), payload_bytes); + let mut f = BufWriter::new(File::create(path)?); + f.write_all(OVFS_MAGIC)?; + f.write_all(&[VERSION])?; + f.write_all(&(dim as u32).to_le_bytes())?; + f.write_all(&(n_vectors as u32).to_le_bytes())?; + f.write_all(packed_fs)?; + f.flush()?; + Ok(()) +} + +pub(crate) fn load_fastscan(path: impl AsRef) -> io::Result<(usize, usize, Vec)> { + let file = File::open(path)?; + let file_len = file.metadata()?.len(); + let mut f = BufReader::new(file); + let magic = read_magic(&mut f, "OVFS")?; + // OVFS is new in the ordvec format: there is no legacy TV* fastscan magic. + if &magic != OVFS_MAGIC { + return Err(invalid("not an OVFS (RankQuantFastscan) file: wrong magic")); + } + read_version(&mut f, "OVFS")?; + let dim = read_u32_le(&mut f, "OVFS", "dim")? as usize; + check_dim(dim)?; + // FastScan b=2 requires `dim % 4 == 0` (mirrors `RankQuantFastscan::new` / + // `RankQuant::new(dim, 2)`: constant composition, exact analytical norm). + // `dim % 4 == 0` subsumes the pair-encoding's `dim % 2 == 0`. + if !dim.is_multiple_of(4) { + return Err(invalid(format!( + "OVFS dim {dim} is not a multiple of 4 (b=2 constant composition)" + ))); + } + let n_vectors = read_u32_le(&mut f, "OVFS", "n_vectors")? 
as usize; + check_n_vectors(n_vectors)?; + let payload_bytes = fastscan_payload_bytes(dim, n_vectors)?; + check_payload_bytes(payload_bytes)?; + check_payload_matches_file(&mut f, "OVFS", file_len, payload_bytes)?; + // The packed FastScan payload is opaque pre-encoded nibbles in the block-32 + // transpose: any byte value is valid, so there is no per-row invariant to + // check beyond the exact payload length validated above. + let mut packed_fs = try_alloc_zeroed(payload_bytes)?; + f.read_exact(&mut packed_fs)?; + Ok((dim, n_vectors, packed_fs)) +} + #[cfg(test)] mod tests { use super::{ diff --git a/tests/index/fastscan.rs b/tests/index/fastscan.rs index c8ed4736..f30cd6df 100644 --- a/tests/index/fastscan.rs +++ b/tests/index/fastscan.rs @@ -262,3 +262,121 @@ fn fastscan_new_rejects_dim_above_u16_max() { // by the u16 bound — not deferred to a panic on the first add(). let _ = RankQuantFastscan::new(65_536); } + +// --------------------------------------------------------------------- +// Persistence: `.ovfs` (magic `OVFS`) write/load round-trip + validation. 
+// --------------------------------------------------------------------- + +fn fs_tmp(name: &str) -> std::path::PathBuf { + std::env::temp_dir().join(format!( + "ordvec_fastscan_{}_{}.ovfs", + name, + std::process::id() + )) +} + +#[test] +fn fastscan_write_load_roundtrip_searches_identically() { + const FD: usize = 128; + const FN: usize = 200; + let mut rng = ChaCha8Rng::seed_from_u64(909090); + let docs: Vec = (0..FN * FD).map(|_| rng.random_range(-1.0..1.0)).collect(); + let queries: Vec = (0..4 * FD).map(|_| rng.random_range(-1.0..1.0)).collect(); + + let mut idx = RankQuantFastscan::new(FD); + idx.add(&docs); + let before = idx.search(&queries, 10); + + let path = fs_tmp("roundtrip"); + idx.write(&path).unwrap(); + let loaded = RankQuantFastscan::load(&path).unwrap(); + std::fs::remove_file(&path).ok(); + + // Reloaded index reports the same shape and scans byte-identically: the + // packed buffer is the same, so scores/indices match exactly (no recompute). + assert_eq!(loaded.dim(), FD); + assert_eq!(loaded.len(), FN); + assert_eq!(loaded.byte_size(), idx.byte_size()); + let after = loaded.search(&queries, 10); + assert_eq!(after.indices, before.indices, "reloaded indices must match"); + assert_eq!(after.scores, before.scores, "reloaded scores must match"); +} + +#[test] +fn fastscan_empty_index_roundtrips() { + let idx = RankQuantFastscan::new(64); // never add()-ed → 0 vectors, empty payload + let path = fs_tmp("empty"); + idx.write(&path).unwrap(); + let bytes = std::fs::read(&path).unwrap(); + let loaded = RankQuantFastscan::load(&path).unwrap(); + std::fs::remove_file(&path).ok(); + assert_eq!(bytes.len(), 13, "empty .ovfs is header-only (no payload)"); + assert_eq!(&bytes[0..4], b"OVFS", "magic is OVFS"); + assert_eq!(loaded.dim(), 64); + assert_eq!(loaded.len(), 0); + assert!(loaded.is_empty()); +} + +#[test] +fn fastscan_written_file_starts_with_ovfs_magic() { + let mut idx = RankQuantFastscan::new(64); + idx.add(&vec![0.5f32; 64 * 40]); + let path 
= fs_tmp("magic"); + idx.write(&path).unwrap(); + let bytes = std::fs::read(&path).unwrap(); + std::fs::remove_file(&path).ok(); + assert_eq!(&bytes[0..4], b"OVFS"); +} + +#[test] +fn fastscan_load_rejects_wrong_magic() { + let mut idx = RankQuantFastscan::new(64); + idx.add(&vec![0.25f32; 64 * 40]); + let path = fs_tmp("badmagic"); + idx.write(&path).unwrap(); + let mut bytes = std::fs::read(&path).unwrap(); + bytes[0..4].copy_from_slice(b"OVRQ"); // a different (valid) ordvec magic + std::fs::write(&path, &bytes).unwrap(); + let err = match RankQuantFastscan::load(&path) { + Ok(_) => panic!("expected load error, got Ok"), + Err(e) => e, + }; + std::fs::remove_file(&path).ok(); + assert!(err.to_string().contains("OVFS"), "got: {err}"); +} + +#[test] +fn fastscan_load_rejects_trailing_bytes() { + let mut idx = RankQuantFastscan::new(64); + idx.add(&vec![-0.3f32; 64 * 40]); + let path = fs_tmp("trailing"); + idx.write(&path).unwrap(); + let mut bytes = std::fs::read(&path).unwrap(); + bytes.push(0xAB); // one trailing byte past the declared payload + std::fs::write(&path, &bytes).unwrap(); + let err = match RankQuantFastscan::load(&path) { + Ok(_) => panic!("expected load error, got Ok"), + Err(e) => e, + }; + std::fs::remove_file(&path).ok(); + // A structurally-valid file with trailing bytes is rejected. + assert!(!err.to_string().is_empty()); +} + +#[test] +fn fastscan_load_rejects_dim_not_multiple_of_4() { + // Forge a header with dim = 66 (even but % 4 == 2) and zero payload. 
+ let path = fs_tmp("baddim"); + let mut bytes = Vec::new(); + bytes.extend_from_slice(b"OVFS"); + bytes.push(1); // version + bytes.extend_from_slice(&66u32.to_le_bytes()); // dim = 66 + bytes.extend_from_slice(&0u32.to_le_bytes()); // n_vectors = 0 + std::fs::write(&path, &bytes).unwrap(); + let err = match RankQuantFastscan::load(&path) { + Ok(_) => panic!("expected load error, got Ok"), + Err(e) => e, + }; + std::fs::remove_file(&path).ok(); + assert!(err.to_string().contains("multiple of 4"), "got: {err}"); +} From 1bbdb8f617581e8af483773104c5b3dfb5badf33 Mon Sep 17 00:00:00 2001 From: Nelson Spence Date: Mon, 15 Jun 2026 11:05:32 -0500 Subject: [PATCH 4/4] fix(rank_io): fail-loud OVFS write + regen ffi header (gemini/qodo) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `write_fastscan` backs the now-public `RankQuantFastscan` persistence API but could (a) silently truncate `dim`/`n_vectors` through the `as u32` casts, and (b) panic via `assert_eq!(packed_fs.len(), payload_bytes)` from a `Result`-returning fn (flagged by gemini + qodo). Validate `dim` (range + multiple-of-4 for FastScan b=2), `n_vectors`, the payload size, and the packed buffer length BEFORE `File::create`, returning a clean `io::Error` instead of panicking or truncating — and a rejected write never creates/truncates a file. Add a regression test: valid round-trip + Err-not-panic on bad dim / length. Regenerate `ordvec-ffi/include/ordvec.h` with cbindgen 0.29.3 (the CI version); the committed header had drifted from the loader doc-comment wording. 
Signed-off-by: Nelson Spence --- ordvec-ffi/include/ordvec.h | 4 +-- src/rank_io.rs | 51 ++++++++++++++++++++++++++++++++++--- 2 files changed, 49 insertions(+), 6 deletions(-) diff --git a/ordvec-ffi/include/ordvec.h b/ordvec-ffi/include/ordvec.h index 68a0ac51..493567fe 100644 --- a/ordvec-ffi/include/ordvec.h +++ b/ordvec-ffi/include/ordvec.h @@ -192,8 +192,8 @@ ordvec_status_t ordvec_index_load(const char *path, uint64_t flags, ordvec_index /** * Probe on-disk metadata for a `.ovrq` RankQuant or `.ovbm` Bitmap index - * (legacy `.tvrq` / `.tvbm` also accepted) without loading payload rows into an - * index handle. + * (legacy `.tv*` also accepted) without loading payload rows into an index + * handle. * * This validates the fixed header, declared dimensions, payload byte count, * and exact file length. Full row-invariant validation remains the job of diff --git a/src/rank_io.rs b/src/rank_io.rs index 8862dc6a..50570557 100644 --- a/src/rank_io.rs +++ b/src/rank_io.rs @@ -889,12 +889,26 @@ pub(crate) fn write_fastscan( n_vectors: usize, packed_fs: &[u8], ) -> io::Result<()> { - // Enforce the loaders' MAX_PAYLOAD cap *before* File::create so a rejected - // oversized write never truncates an existing file. Defense-in-depth; the - // round-trip guarantee is type-level (see module docs). Mirrors load_fastscan. + // Validate every header parameter *before* File::create, so a now-public + // persistence API never (a) silently truncates `dim`/`n_vectors` through the + // `as u32` casts below, (b) writes a corrupt/oversized file (the loaders' + // MAX_PAYLOAD cap; a rejected write never truncates an existing file), or + // (c) panics from a `Result`-returning fn. Mirrors load_fastscan's contract. 
+ check_dim(dim)?; + if !dim.is_multiple_of(4) { + return Err(invalid(format!( + "OVFS dim {dim} is not a multiple of 4 (FastScan b=2 constant composition)" + ))); + } + check_n_vectors(n_vectors)?; let payload_bytes = fastscan_payload_bytes(dim, n_vectors)?; check_payload_bytes(payload_bytes)?; - assert_eq!(packed_fs.len(), payload_bytes); + if packed_fs.len() != payload_bytes { + return Err(invalid(format!( + "OVFS packed buffer is {} bytes but dim={dim}/n_vectors={n_vectors} implies {payload_bytes}", + packed_fs.len() + ))); + } let mut f = BufWriter::new(File::create(path)?); f.write_all(OVFS_MAGIC)?; f.write_all(&[VERSION])?; @@ -1557,4 +1571,33 @@ mod tests { let _ = std::fs::remove_file(p); } } + + // OVFS (FastScan) write path: valid round-trip, and fail-loud (io::Error, not + // a panic) on invalid `dim`/`n_vectors`/payload — the now-public persistence + // API must never abort the caller or silently truncate the header. + #[test] + fn write_fastscan_validates_and_never_panics() { + use super::{load_fastscan, write_fastscan}; + // dim=8 (multiple of 4), 4 vectors -> ceil(4/32)*(8/2)*32 = 128-byte payload. + let (dim, n) = (8usize, 4usize); + let payload = vec![0u8; 128]; + let p = temp_index_path("ovfs_ok"); + write_fastscan(&p, dim, n, &payload).unwrap(); + let (ld, ln, lbytes) = load_fastscan(&p).unwrap(); + assert_eq!((ld, ln), (dim, n)); + assert_eq!(lbytes, payload, "OVFS round-trip altered the payload"); + let _ = std::fs::remove_file(&p); + + // dim not a multiple of 4 -> rejected before File::create (no panic, no file). + let p2 = temp_index_path("ovfs_baddim"); + let e = write_fastscan(&p2, 6, n, &payload).unwrap_err(); + assert_eq!(e.kind(), std::io::ErrorKind::InvalidData); + assert!(!p2.exists(), "rejected write must not create a file"); + + // packed buffer inconsistent with dim/n_vectors -> rejected, not panic. 
+ let p3 = temp_index_path("ovfs_badlen"); + let e = write_fastscan(&p3, dim, n, &payload[..100]).unwrap_err(); + assert_eq!(e.kind(), std::io::ErrorKind::InvalidData); + assert!(!p3.exists(), "rejected write must not create a file"); + } }