From c4600ceca40e0836639b37d294b355c7b09b9971 Mon Sep 17 00:00:00 2001 From: Nelson Spence Date: Fri, 19 Jun 2026 17:48:22 -0500 Subject: [PATCH 1/7] address release audit findings --- CHANGELOG.md | 13 +++++ Cargo.toml | 6 +-- README.md | 10 ++-- SECURITY.md | 12 ++--- THREAT_MODEL.md | 18 ++++--- docs/INDEX_PROVENANCE.md | 9 ++-- docs/RANK_MODES.md | 2 +- .../ordinal-routing-research/README.md | 2 +- .../crt_seam_oracle_results.md | 2 +- ordvec-ffi/include/ordvec.h | 4 ++ .../python/ordvec_manifest/__init__.py | 2 + ordvec-manifest-python/src/lib.rs | 35 +++++++++++-- .../tests/test_manifest_bindings.py | 50 +++++++++++++++++++ ordvec-python/README.md | 2 +- ordvec-python/pyproject.toml | 2 +- ordvec-python/src/lib.rs | 11 ++-- src/quant.rs | 6 +-- src/rank.rs | 2 +- tests/release_publish_invariants.py | 2 +- 19 files changed, 143 insertions(+), 47 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1d4e2ef9..079098c3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,19 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 `crates-io` / `pypi` GitHub Environment gates: `Fieldnote-Echo` and `toadkicker` are listed as required reviewers, self-review is blocked, and a 30-minute wait timer applies before registry publish jobs can proceed. +- Exposed the calibration-profile byte limit through the `ordvec-manifest` + Python bindings, including the default constant, `default_resource_limits()`, + and verifier/create keyword arguments. +- Aligned `.ovfs` / `OVFS` security and provenance docs with the now-public + `RankQuantFastscan` persistence loader and fuzz target. +- Updated formalization links and release invariants after the companion + `ordvec-formalization` repository moved under `Project-Navi`. + +### Fixed + +- Hid the `SubsetScratch::capacities_for_test` helper behind the non-default + `test-utils` feature and cleaned stale release-doc comments around FastScan + and b=8 bucket rustdoc. 
## 0.5.0 - 2026-06-19 diff --git a/Cargo.toml b/Cargo.toml index 21b46252..5cb07d96 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -58,9 +58,9 @@ exclude = [ "tests/release_signed_release_invariants.sh", ] -# docs.rs build configuration: build with default features only, so the -# experimental MultiBucketBitmap scaffold stays off the published docs. -# (The `#[doc(hidden)]` FastScan path is hidden by its attribute either way.) +# docs.rs build configuration: build with default features only. Stable default +# APIs, including `RankQuantFastscan`, are documented; the experimental +# MultiBucketBitmap scaffold stays off the published docs. [package.metadata.docs.rs] all-features = false diff --git a/README.md b/README.md index 30574995..c945b7e1 100644 --- a/README.md +++ b/README.md @@ -170,7 +170,7 @@ are machine-checked in Lean 4, both `sorry`-free on Lean's standard axiom base signal model makes an overlap-count threshold Bayes-optimal among deterministic admission rules, and the uniform constant-weight bitmap null assigns that same threshold event exactly the hypergeometric upper tail — in - [`ordvec-formalization`](https://github.com/Fieldnote-Echo/ordvec-formalization) + [`ordvec-formalization`](https://github.com/Project-Navi/ordvec-formalization) (theorem `exists_uniformBitmapOverlapTail_finiteBayesRisk_le_and_hypergeomTail`). This is an *in-model* result. It proves the rule shape and the idealized finite @@ -310,10 +310,10 @@ candidate slices passed to `Search` until the call returns. [`docs/compatibility-policy.md`](docs/compatibility-policy.md) defines the stable, experimental, repo-local sidecar, persisted-format, examples/docs, MSRV, and release-note review surfaces. 
-- **Formal proof spine:** [`ordvec-formalization`](https://github.com/Fieldnote-Echo/ordvec-formalization), - including its [`proof-spine`](https://github.com/Fieldnote-Echo/ordvec-formalization/blob/main/docs/proof-spine.md), - [`theorem-map`](https://github.com/Fieldnote-Echo/ordvec-formalization/blob/main/docs/theorem-map.md), - and [`reviewer brief`](https://github.com/Fieldnote-Echo/ordvec-formalization/blob/main/docs/reviewer-brief.md). +- **Formal proof spine:** [`ordvec-formalization`](https://github.com/Project-Navi/ordvec-formalization), + including its [`proof-spine`](https://github.com/Project-Navi/ordvec-formalization/blob/main/docs/proof-spine.md), + [`theorem-map`](https://github.com/Project-Navi/ordvec-formalization/blob/main/docs/theorem-map.md), + and [`reviewer brief`](https://github.com/Project-Navi/ordvec-formalization/blob/main/docs/reviewer-brief.md). - **API docs:** , - **Paper (OrdVec / RankQuant):** _link TBD — see [Research collaboration](#research-collaboration)._ diff --git a/SECURITY.md b/SECURITY.md index 07a86144..c6bcd383 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -16,12 +16,12 @@ Use GitHub's private vulnerability reporting: We aim to acknowledge reports within a few business days. `ordvec` parses serialized index files (`.ovr` / `.ovrq` / `.ovbm` / -`.ovsb`; the loaders also accept the legacy `.tvr` / `.tvrq` / `.tvbm` / -`.tvsb` magics); the loaders are fuzzed (`cargo +nightly fuzz`), so -parsing-robustness reports against the deserialization paths are especially -welcome. Reports are also welcome against the `unsafe` SIMD kernels (shape / -bounds invariants), the Python FFI contract (buffer handling, GIL discipline), -and the release pipeline. +`.ovsb` / `.ovfs`; `.ovfs` uses `OVFS` FastScan magic, and the other loaders +also accept the legacy `.tvr` / `.tvrq` / `.tvbm` / `.tvsb` magics); the loaders +are fuzzed (`cargo +nightly fuzz`), so parsing-robustness reports against the +deserialization paths are especially welcome. 
Reports are also welcome against +the `unsafe` SIMD kernels (shape / bounds invariants), the Python FFI contract +(buffer handling, GIL discipline), and the release pipeline. ## Threat model diff --git a/THREAT_MODEL.md b/THREAT_MODEL.md index e35b80f8..9fa2344b 100644 --- a/THREAT_MODEL.md +++ b/THREAT_MODEL.md @@ -49,14 +49,16 @@ See also: [`SECURITY.md`](SECURITY.md) (reporting), [`RELEASING.md`](RELEASING.m ## Maintenance budget -`ordvec` is maintained by a single primary contributor. Mitigations are -prioritized when they are (1) low-maintenance once merged, (2) enforceable by -tests or CI, (3) local to the library boundary, and (4) unlikely to add -operational burden downstream. Heavyweight controls (mandatory index signing, -long-running fuzz farms, service-level admission control) are documented as -**deployment guidance** until there is maintainer capacity to own them. The -absence of a second maintainer is itself a tracked supply-chain residual -(see THREAT-SUPPLY-001). +`ordvec` has one project lead plus an additional maintainer / release +approver. Mitigations are prioritized when they are (1) low-maintenance once +merged, (2) enforceable by tests or CI, (3) local to the library boundary, and +(4) unlikely to add operational burden downstream. Heavyweight controls +(mandatory index signing, long-running fuzz farms, service-level admission +control) are documented as **deployment guidance** unless the project has +maintainer capacity to own them. Release publication requires a non-triggering +approver through protected GitHub Environments; the residual release +supply-chain risk is approver account compromise / collusion, not a +single-owner project structure (see THREAT-SUPPLY-001). 
--- diff --git a/docs/INDEX_PROVENANCE.md b/docs/INDEX_PROVENANCE.md index 808f2815..03fa45ec 100644 --- a/docs/INDEX_PROVENANCE.md +++ b/docs/INDEX_PROVENANCE.md @@ -25,10 +25,13 @@ files without panicking, aborting, or silently accepting garbage: - an exact file-length match (trailing bytes or short files are rejected); - per-row **structural** invariants: `Rank` rows must be a true permutation of `[0, dim)`, `RankQuant` rows must satisfy constant composition, `Bitmap` rows - must have exactly `n_top` bits set. + must have exactly `n_top` bits set, and direct `RankQuantFastscan` `.ovfs` + rows must use valid FastScan nibbles, satisfy b=2 constant composition, and + have zero block-tail padding. -A file that survives all of this is **structurally well-formed**. The four -loaders are exercised by `cargo fuzz` (the `load_*` targets). +A file that survives all of this is **structurally well-formed**. The five +loaders are exercised by `cargo fuzz` (the `load_*` targets, including +`load_fastscan` for `.ovfs`). ## What the loaders do NOT validate diff --git a/docs/RANK_MODES.md b/docs/RANK_MODES.md index 01b54091..8409bd1e 100644 --- a/docs/RANK_MODES.md +++ b/docs/RANK_MODES.md @@ -130,7 +130,7 @@ unknown embedding distribution. **Checked finite model: symmetry, quotient sufficiency, threshold, calibration.** The proof chain now has a larger machine-checked middle than the implementation docs used to claim. 
In -[`ordvec-formalization`](https://github.com/Fieldnote-Echo/ordvec-formalization), +[`ordvec-formalization`](https://github.com/Project-Navi/ordvec-formalization), Lean proves that literal bitmap overlap is the canonical invariant under query-preserving coordinate relabelings; finite quotient sufficiency reduces the admission decision to ordered overlap diff --git a/experiments/ordinal-routing-research/README.md b/experiments/ordinal-routing-research/README.md index 0b5305f1..85ff4987 100644 --- a/experiments/ordinal-routing-research/README.md +++ b/experiments/ordinal-routing-research/README.md @@ -34,7 +34,7 @@ tiered below by **what survived scrutiny**. Read the tiers, not every doc. |-----|-------| | [density_collapse_results.md](density_collapse_results.md) | **Mechanism.** RankQuant b=2 density collapse = Hamming-near codes the scorer can't separate. Among those lookalikes, true neighbours have lower intra-code Kendall-tau (gap ≈ 0.04, CI > 0). Real but small. | | [tau_rerank_bakeoff_results.md](tau_rerank_bakeoff_results.md) | **The verdict.** Does that tau signal beat b=4? NO — b=4 wins even at the tau ceiling; tau scores below b=2's own ordering. Signal is real-but-inert; just use b=4. Closes the line: research, not a feature. | -| [crt_seam_oracle_results.md](crt_seam_oracle_results.md) | CRT vernier seam theorem — exhaustive finite proof: lcm spacing, one coincidence/period, capped density `∏min(2t+1,m_i)/m_i`. Lean 4 formalization lives in the companion repo: [ordvec-formalization#17](https://github.com/Fieldnote-Echo/ordvec-formalization/pull/17) (open PR, `sorry`-free). | +| [crt_seam_oracle_results.md](crt_seam_oracle_results.md) | CRT vernier seam theorem — exhaustive finite proof: lcm spacing, one coincidence/period, capped density `∏min(2t+1,m_i)/m_i`. Lean 4 formalization lives in the companion repo: [ordvec-formalization#17](https://github.com/Project-Navi/ordvec-formalization/pull/17) (open PR, `sorry`-free). 
| | [shard_recall_results.md](shard_recall_results.md) | Controlled ablation (post RNG-desync fix): random phase offsets add nothing vs aligned grids across R random directions. | | [oblivious_directions_results.md](oblivious_directions_results.md) | **The directions arc (round 2).** Data-oblivious low-discrepancy directions (golden-angle / Sobol / Kronecker) do NOT beat iid-random for training-free routing — across 5 encoders (nomic, bge-m3, bge-large, snowflake-arctic-v2, harrier-oss) at real intrinsic dim 18–24. CLASS-DEAD, pre-registered, replicated (the one mid-ladder flicker failed to replicate). Centering removes the cone but fails at b=4 (penalty grows with capacity). One robust positive: data-aligned (PCA) directions lead at higher ID — the lever is data-alignment, which training-free forbids. Also **resolves the twonn_id PARTIAL**: real-corpus ID measured at ~18–24 across 5 encoders, and ID is a **corpus** property (repo≈13 vs fiqa≈24, same encoder), not an encoder constant. Probes: `uniformity_lemma.rs`, `overlap_decomp.rs`, `centering_recall.rs`, `subspace_directions.rs`, `partition_balance.rs`, `fib_*.rs`. | | [length_mixture_lake_results.md](length_mixture_lake_results.md) | **Path B — chunk-length-mixture lake (closes the synthetic-lake arc).** Same fiqa docs embedded at 4 chunk lengths {128,256,512,1100} unioned into a 230k-doc lake; b=4 raw R@10 vs FP32 cosine is **immune** (+0.002, CR@100=1.0). Bonus measurement of the "chunk length is a third geometry axis" claim: real but **small and co-axial** — R̄ spreads only 0.705→0.723 over an 8.6× length range, cone axes ≥0.986 aligned (not the distinct geometries the mixture framing imagined). With Phase B (multi-domain) this leaves every synthetic lake pathology — multi-cone, hub, multi-length — benign for "spend the bits, b=4." Probe: `make_length_lake.py` + `centering_recall.rs`. 
| diff --git a/experiments/ordinal-routing-research/crt_seam_oracle_results.md b/experiments/ordinal-routing-research/crt_seam_oracle_results.md index b6293dbd..eebd239a 100644 --- a/experiments/ordinal-routing-research/crt_seam_oracle_results.md +++ b/experiments/ordinal-routing-research/crt_seam_oracle_results.md @@ -1,7 +1,7 @@ # CRT seam oracle — corrected vernier theorem (exhaustive finite proof) > Lean 4 formalization of this theorem lives in the companion repo: -> [ordvec-formalization#17](https://github.com/Fieldnote-Echo/ordvec-formalization/pull/17) +> [ordvec-formalization#17](https://github.com/Project-Navi/ordvec-formalization/pull/17) > (open PR, `sorry`-free). `examples/crt_seam_oracle.rs` enumerates the full ring Z/M to verify the diff --git a/ordvec-ffi/include/ordvec.h b/ordvec-ffi/include/ordvec.h index 493567fe..53c5e816 100644 --- a/ordvec-ffi/include/ordvec.h +++ b/ordvec-ffi/include/ordvec.h @@ -234,6 +234,10 @@ void ordvec_index_free(ordvec_index_t *index); * and may be unsorted or duplicated. Duplicate candidates are scored as * separate entries and can produce duplicate hits; callers that need unique * output rows must deduplicate before calling. + * Full search is represented by `candidate_count == 0 && candidate_rows == NULL`. + * ABI v1 treats `candidate_count == 0 && candidate_rows != NULL` as + * `ORDVEC_STATUS_BAD_ARGUMENT`; callers should short-circuit explicit empty + * survivor sets before crossing the ABI. 
* * # Safety * diff --git a/ordvec-manifest-python/python/ordvec_manifest/__init__.py b/ordvec-manifest-python/python/ordvec_manifest/__init__.py index 6730089f..20e77dd9 100644 --- a/ordvec-manifest-python/python/ordvec_manifest/__init__.py +++ b/ordvec-manifest-python/python/ordvec_manifest/__init__.py @@ -11,6 +11,7 @@ DEFAULT_MAX_AUXILIARY_ARTIFACT_BYTES, DEFAULT_MAX_AUXILIARY_ARTIFACTS, DEFAULT_MAX_CACHED_REPORT_BYTES, + DEFAULT_MAX_CALIBRATION_PROFILE_BYTES, DEFAULT_MAX_ENCODER_DISTORTION_PROFILE_BYTES, DEFAULT_MAX_MANIFEST_BYTES, DEFAULT_MAX_REPORT_ISSUES, @@ -37,6 +38,7 @@ "DEFAULT_MAX_ROW_IDENTITY_TRACKED_DB_ID_BYTES", "DEFAULT_MAX_AUXILIARY_ARTIFACTS", "DEFAULT_MAX_AUXILIARY_ARTIFACT_BYTES", + "DEFAULT_MAX_CALIBRATION_PROFILE_BYTES", "DEFAULT_MAX_ENCODER_DISTORTION_PROFILE_BYTES", "DEFAULT_MAX_REPORT_ISSUES", "DEFAULT_MAX_CACHED_REPORT_BYTES", diff --git a/ordvec-manifest-python/src/lib.rs b/ordvec-manifest-python/src/lib.rs index 2b573992..0f3860cb 100644 --- a/ordvec-manifest-python/src/lib.rs +++ b/ordvec-manifest-python/src/lib.rs @@ -6,10 +6,11 @@ use ordvec_manifest_core::{ CreateRowIdentity, ManifestError, ResourceLimits, VerifiedLoadPlanError, VerifyOptions, CALIBRATION_SCHEMA_VERSION, DEFAULT_MAX_AUXILIARY_ARTIFACTS, DEFAULT_MAX_AUXILIARY_ARTIFACT_BYTES, DEFAULT_MAX_CACHED_REPORT_BYTES, - DEFAULT_MAX_ENCODER_DISTORTION_PROFILE_BYTES, DEFAULT_MAX_MANIFEST_BYTES, - DEFAULT_MAX_REPORT_ISSUES, DEFAULT_MAX_ROW_IDENTITY_JSONL_LINE_BYTES, - DEFAULT_MAX_ROW_IDENTITY_ROWS, DEFAULT_MAX_ROW_IDENTITY_TRACKED_DB_ID_BYTES, - ENCODER_DISTORTION_SCHEMA_VERSION, SCHEMA_VERSION, + DEFAULT_MAX_CALIBRATION_PROFILE_BYTES, DEFAULT_MAX_ENCODER_DISTORTION_PROFILE_BYTES, + DEFAULT_MAX_MANIFEST_BYTES, DEFAULT_MAX_REPORT_ISSUES, + DEFAULT_MAX_ROW_IDENTITY_JSONL_LINE_BYTES, DEFAULT_MAX_ROW_IDENTITY_ROWS, + DEFAULT_MAX_ROW_IDENTITY_TRACKED_DB_ID_BYTES, ENCODER_DISTORTION_SCHEMA_VERSION, + SCHEMA_VERSION, }; use pyo3::exceptions::PyKeyError; use pyo3::prelude::*; @@ -91,6 
+92,7 @@ fn resource_limits( max_row_map_tracked_id_bytes: Option, max_auxiliary_artifacts: Option, max_auxiliary_artifact_bytes: Option, + max_calibration_profile_bytes: Option, max_encoder_distortion_profile_bytes: Option, max_report_issues: Option, max_cached_report_bytes: Option, @@ -114,6 +116,9 @@ fn resource_limits( if let Some(value) = max_auxiliary_artifact_bytes { limits.max_auxiliary_artifact_bytes = value; } + if let Some(value) = max_calibration_profile_bytes { + limits.max_calibration_profile_bytes = value; + } if let Some(value) = max_encoder_distortion_profile_bytes { limits.max_encoder_distortion_profile_bytes = value; } @@ -134,6 +139,7 @@ struct PythonResourceLimits { max_row_map_tracked_id_bytes: usize, max_auxiliary_artifacts: usize, max_auxiliary_artifact_bytes: u64, + max_calibration_profile_bytes: u64, max_encoder_distortion_profile_bytes: u64, max_report_issues: usize, max_cached_report_bytes: u64, @@ -148,6 +154,7 @@ impl From for PythonResourceLimits { max_row_map_tracked_id_bytes: limits.max_row_identity_tracked_db_id_bytes, max_auxiliary_artifacts: limits.max_auxiliary_artifacts, max_auxiliary_artifact_bytes: limits.max_auxiliary_artifact_bytes, + max_calibration_profile_bytes: limits.max_calibration_profile_bytes, max_encoder_distortion_profile_bytes: limits.max_encoder_distortion_profile_bytes, max_report_issues: limits.max_report_issues, max_cached_report_bytes: limits.max_cached_report_bytes, @@ -167,6 +174,7 @@ fn verify_options( max_row_map_tracked_id_bytes: Option, max_auxiliary_artifacts: Option, max_auxiliary_artifact_bytes: Option, + max_calibration_profile_bytes: Option, max_encoder_distortion_profile_bytes: Option, max_report_issues: Option, max_cached_report_bytes: Option, @@ -183,6 +191,7 @@ fn verify_options( max_row_map_tracked_id_bytes, max_auxiliary_artifacts, max_auxiliary_artifact_bytes, + max_calibration_profile_bytes, max_encoder_distortion_profile_bytes, max_report_issues, max_cached_report_bytes, @@ -200,6 +209,7 
@@ fn create_options( max_row_map_tracked_id_bytes: Option, max_auxiliary_artifacts: Option, max_auxiliary_artifact_bytes: Option, + max_calibration_profile_bytes: Option, max_encoder_distortion_profile_bytes: Option, max_report_issues: Option, max_cached_report_bytes: Option, @@ -215,6 +225,7 @@ fn create_options( max_row_map_tracked_id_bytes, max_auxiliary_artifacts, max_auxiliary_artifact_bytes, + max_calibration_profile_bytes, max_encoder_distortion_profile_bytes, max_report_issues, max_cached_report_bytes, @@ -273,6 +284,7 @@ fn sha256_file(py: Python<'_>, path: PathBuf) -> PyResult> { max_row_map_tracked_id_bytes = None, max_auxiliary_artifacts = None, max_auxiliary_artifact_bytes = None, + max_calibration_profile_bytes = None, max_encoder_distortion_profile_bytes = None, max_report_issues = None, max_cached_report_bytes = None @@ -287,6 +299,7 @@ fn inspect_manifest( max_row_map_tracked_id_bytes: Option, max_auxiliary_artifacts: Option, max_auxiliary_artifact_bytes: Option, + max_calibration_profile_bytes: Option, max_encoder_distortion_profile_bytes: Option, max_report_issues: Option, max_cached_report_bytes: Option, @@ -302,6 +315,7 @@ fn inspect_manifest( max_row_map_tracked_id_bytes, max_auxiliary_artifacts, max_auxiliary_artifact_bytes, + max_calibration_profile_bytes, max_encoder_distortion_profile_bytes, max_report_issues, max_cached_report_bytes, @@ -326,6 +340,7 @@ fn inspect_manifest( max_row_map_tracked_id_bytes = None, max_auxiliary_artifacts = None, max_auxiliary_artifact_bytes = None, + max_calibration_profile_bytes = None, max_encoder_distortion_profile_bytes = None, max_report_issues = None, max_cached_report_bytes = None @@ -344,6 +359,7 @@ fn verify_manifest( max_row_map_tracked_id_bytes: Option, max_auxiliary_artifacts: Option, max_auxiliary_artifact_bytes: Option, + max_calibration_profile_bytes: Option, max_encoder_distortion_profile_bytes: Option, max_report_issues: Option, max_cached_report_bytes: Option, @@ -359,6 +375,7 @@ fn 
verify_manifest( max_row_map_tracked_id_bytes, max_auxiliary_artifacts, max_auxiliary_artifact_bytes, + max_calibration_profile_bytes, max_encoder_distortion_profile_bytes, max_report_issues, max_cached_report_bytes, @@ -416,6 +433,7 @@ struct PythonVerifiedAuxiliaryArtifactPlan { max_row_map_tracked_id_bytes = None, max_auxiliary_artifacts = None, max_auxiliary_artifact_bytes = None, + max_calibration_profile_bytes = None, max_encoder_distortion_profile_bytes = None, max_report_issues = None, max_cached_report_bytes = None @@ -434,6 +452,7 @@ fn verify_for_load( max_row_map_tracked_id_bytes: Option, max_auxiliary_artifacts: Option, max_auxiliary_artifact_bytes: Option, + max_calibration_profile_bytes: Option, max_encoder_distortion_profile_bytes: Option, max_report_issues: Option, max_cached_report_bytes: Option, @@ -449,6 +468,7 @@ fn verify_for_load( max_row_map_tracked_id_bytes, max_auxiliary_artifacts, max_auxiliary_artifact_bytes, + max_calibration_profile_bytes, max_encoder_distortion_profile_bytes, max_report_issues, max_cached_report_bytes, @@ -503,6 +523,7 @@ fn verify_for_load( max_row_map_tracked_id_bytes = None, max_auxiliary_artifacts = None, max_auxiliary_artifact_bytes = None, + max_calibration_profile_bytes = None, max_encoder_distortion_profile_bytes = None, max_report_issues = None, max_cached_report_bytes = None @@ -524,6 +545,7 @@ fn create_manifest( max_row_map_tracked_id_bytes: Option, max_auxiliary_artifacts: Option, max_auxiliary_artifact_bytes: Option, + max_calibration_profile_bytes: Option, max_encoder_distortion_profile_bytes: Option, max_report_issues: Option, max_cached_report_bytes: Option, @@ -552,6 +574,7 @@ fn create_manifest( max_row_map_tracked_id_bytes, max_auxiliary_artifacts, max_auxiliary_artifact_bytes, + max_calibration_profile_bytes, max_encoder_distortion_profile_bytes, max_report_issues, max_cached_report_bytes, @@ -602,6 +625,10 @@ fn _ordvec_manifest(m: &Bound<'_, PyModule>) -> PyResult<()> { 
"DEFAULT_MAX_AUXILIARY_ARTIFACT_BYTES", DEFAULT_MAX_AUXILIARY_ARTIFACT_BYTES, )?; + m.add( + "DEFAULT_MAX_CALIBRATION_PROFILE_BYTES", + DEFAULT_MAX_CALIBRATION_PROFILE_BYTES, + )?; m.add( "DEFAULT_MAX_ENCODER_DISTORTION_PROFILE_BYTES", DEFAULT_MAX_ENCODER_DISTORTION_PROFILE_BYTES, diff --git a/ordvec-manifest-python/tests/test_manifest_bindings.py b/ordvec-manifest-python/tests/test_manifest_bindings.py index 320262da..0ec0ce78 100644 --- a/ordvec-manifest-python/tests/test_manifest_bindings.py +++ b/ordvec-manifest-python/tests/test_manifest_bindings.py @@ -59,6 +59,10 @@ def test_hash_and_limits(tmp_path): } limits = ordvec_manifest.default_resource_limits() assert limits["max_manifest_bytes"] == ordvec_manifest.DEFAULT_MAX_MANIFEST_BYTES + assert ( + limits["max_calibration_profile_bytes"] + == ordvec_manifest.DEFAULT_MAX_CALIBRATION_PROFILE_BYTES + ) assert "max_row_map_line_bytes" in limits assert "max_row_identity_jsonl_line_bytes" not in limits @@ -67,6 +71,52 @@ def test_hash_and_limits(tmp_path): assert report["ok"] is False +def test_verify_manifest_enforces_calibration_profile_limit(tmp_path): + index = tmp_path / "index.ovrq" + manifest_path = tmp_path / "manifest.json" + profile_path = tmp_path / "calibration.f64" + write_rankquant_index(index) + profile_path.write_bytes(b"\0" * (16 * 8)) + profile_hash = ordvec_manifest.sha256_file(profile_path) + + manifest = ordvec_manifest.create_manifest( + index, + manifest_path, + "model", + row_id_is_identity=True, + ) + manifest["calibration"] = { + "schema_version": ordvec_manifest.CALIBRATION_SCHEMA_VERSION, + "profile_id": "urn:uuid:7c66ad6e-bdde-49a8-b420-f1136d04f5bd", + "calibrated_for": {"model": "model", "dim": 16}, + "ordinalization": {"kind": "top_k", "dim": 16, "k": 4}, + "profile": { + "path": profile_path.name, + "sha256": profile_hash["sha256"], + "file_size_bytes": profile_hash["size_bytes"], + "dim": 16, + "sample_count": 16, + "parameterization": "marginal_topk_frequency", + "format": 
"raw_f64_le", + "shape": [16], + }, + "null_model": { + "kind": "weighted_marginal_profile", + "parameterization": "marginal_topk_frequency", + }, + } + manifest_path.write_text(json.dumps(manifest), encoding="utf-8") + + report = ordvec_manifest.verify_manifest( + manifest_path, + max_calibration_profile_bytes=16, + ) + + assert "calibration_profile_too_large" in { + error["code"] for error in report["errors"] + } + + def test_inspect_and_verify_return_dicts(tmp_path): _, manifest_path = write_unloadable_manifest(tmp_path) diff --git a/ordvec-python/README.md b/ordvec-python/README.md index 90b80b9a..7f58d75a 100644 --- a/ordvec-python/README.md +++ b/ordvec-python/README.md @@ -47,7 +47,7 @@ assuming `k` rows back. ## Theory and calibration `Bitmap` exposes the constant-weight top-bucket overlap statistic formalized in -[`ordvec-formalization`](https://github.com/Fieldnote-Echo/ordvec-formalization). +[`ordvec-formalization`](https://github.com/Project-Navi/ordvec-formalization). In that finite Lean model, literal bitmap overlap is the query-preserving quotient statistic, an overlap threshold is Bayes-optimal under explicit monotone-overlap assumptions, and the idealized uniform constant-weight null diff --git a/ordvec-python/pyproject.toml b/ordvec-python/pyproject.toml index 91d42e0d..d627aa12 100644 --- a/ordvec-python/pyproject.toml +++ b/ordvec-python/pyproject.toml @@ -49,7 +49,7 @@ dependencies = [ Homepage = "https://github.com/Project-Navi/ordvec" Repository = "https://github.com/Project-Navi/ordvec" Issues = "https://github.com/Project-Navi/ordvec/issues" -Formalization = "https://github.com/Fieldnote-Echo/ordvec-formalization" +Formalization = "https://github.com/Project-Navi/ordvec-formalization" [tool.maturin] manifest-path = "Cargo.toml" diff --git a/ordvec-python/src/lib.rs b/ordvec-python/src/lib.rs index d910f0c1..77f31778 100644 --- a/ordvec-python/src/lib.rs +++ b/ordvec-python/src/lib.rs @@ -117,9 +117,9 @@ fn check_bits_124(bits: u8) -> 
PyResult<()> { Ok(()) } -/// Reject a `bits` value the bucket primitives can't represent: `rank_to_bucket` -/// / `bucket_centre` cap at 7 so `1 << bits` fits the result and never overflows -/// the shift. Mirrors the core asserts as a typed `ValueError`. +/// Reject a `bits` value outside the Python constructor surface. The Rust core +/// has b=8 evidence/refinement helpers, but Python `RankQuant` exposes only the +/// byte-aligned persisted widths through this constructor path. fn check_bits_max7(bits: u8) -> PyResult<()> { if bits > 7 { return Err(pyo3::exceptions::PyValueError::new_err("bits must be <= 7")); @@ -748,11 +748,6 @@ impl RankQuant { /// hits; callers that require unique row IDs should deduplicate before /// calling. /// - /// ``candidates`` may be unsorted and may contain duplicate global doc IDs. - /// Each candidate entry is scored independently, so duplicate IDs may - /// produce duplicate returned global IDs. Deduplicate the array before - /// calling this method when unique hits are required. - /// /// If the shortlist came from [`Bitmap`], this is the exact RankQuant /// rerank stage over that survivor set; it does not itself apply or /// calibrate a bitmap overlap threshold. diff --git a/src/quant.rs b/src/quant.rs index a06a07f0..831f54bc 100644 --- a/src/quant.rs +++ b/src/quant.rs @@ -80,9 +80,9 @@ impl SubsetScratch { } /// Test-only capacity probe (scratch reuse / allocation-free assertions). - /// `#[doc(hidden)]` rather than `#[cfg(test)]` so the integration tests in - /// `tests/` (which compile the crate without `cfg(test)`) can reach it; it - /// is hidden from the public docs surface. + /// Gated behind `test-utils` so integration tests can opt in without + /// exposing this helper on the default public API surface. 
+ #[cfg(feature = "test-utils")] #[doc(hidden)] pub fn capacities_for_test(&self) -> (usize, usize, usize, usize) { ( diff --git a/src/rank.rs b/src/rank.rs index 94dbe400..5c511edf 100644 --- a/src/rank.rs +++ b/src/rank.rs @@ -118,7 +118,7 @@ pub fn rank_to_bucket(rank: u16, d: usize, bits: u8) -> u8 { /// Bucket every entry of a full rank vector. /// /// # Panics -/// Panics if `bits > 7` (validated up front, so this holds for empty input +/// Panics if `bits > 8` (validated up front, so this holds for empty input /// too), or — via [`rank_to_bucket`] — if any entry is `>= ranks.len()`. A /// valid rank vector is a permutation of `[0, ranks.len())`, so well-formed /// input never trips the per-entry guard. diff --git a/tests/release_publish_invariants.py b/tests/release_publish_invariants.py index bed96696..022d6ea4 100644 --- a/tests/release_publish_invariants.py +++ b/tests/release_publish_invariants.py @@ -631,7 +631,7 @@ def check_python_package_metadata() -> None: "Homepage": "https://github.com/Project-Navi/ordvec", "Repository": "https://github.com/Project-Navi/ordvec", "Issues": "https://github.com/Project-Navi/ordvec/issues", - "Formalization": "https://github.com/Fieldnote-Echo/ordvec-formalization", + "Formalization": "https://github.com/Project-Navi/ordvec-formalization", }.items(): if urls.get(key) != expected: fail(f"ordvec-python/pyproject.toml: project.urls.{key} must be {expected!r}") From e23149068c79bb12fc2fafa5f72cafe079112dc1 Mon Sep 17 00:00:00 2001 From: Nelson Spence Date: Fri, 19 Jun 2026 18:10:55 -0500 Subject: [PATCH 2/7] Fix FFI header drift --- ordvec-ffi/src/lib.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ordvec-ffi/src/lib.rs b/ordvec-ffi/src/lib.rs index fbb9fa70..399f164a 100644 --- a/ordvec-ffi/src/lib.rs +++ b/ordvec-ffi/src/lib.rs @@ -894,6 +894,10 @@ pub unsafe extern "C" fn ordvec_index_free(index: *mut ordvec_index_t) { /// and may be unsorted or duplicated. 
Duplicate candidates are scored as /// separate entries and can produce duplicate hits; callers that need unique /// output rows must deduplicate before calling. +/// Full search is represented by `candidate_count == 0 && candidate_rows == NULL`. +/// ABI v1 treats `candidate_count == 0 && candidate_rows != NULL` as +/// `ORDVEC_STATUS_BAD_ARGUMENT`; callers should short-circuit explicit empty +/// survivor sets before crossing the ABI. /// /// # Safety /// From 90163b30a95975d54a9425dfe57967582ea4d716 Mon Sep 17 00:00:00 2001 From: Nelson Spence Date: Fri, 19 Jun 2026 18:33:34 -0500 Subject: [PATCH 3/7] Address audit format registry and Python GIL safety --- CHANGELOG.md | 8 ++ README.md | 6 +- docs/bindings-safety.md | 14 ++- ordvec-ffi/src/lib.rs | 55 ++++++--- ordvec-manifest/src/lib.rs | 73 ++++++++++- ordvec-python/README.md | 5 +- ordvec-python/src/lib.rs | 153 ++++++++++++++--------- src/format.rs | 244 +++++++++++++++++++++++++++++++++++++ src/lib.rs | 4 + src/rank_io.rs | 52 +++----- 10 files changed, 492 insertions(+), 122 deletions(-) create mode 100644 src/format.rs diff --git a/CHANGELOG.md b/CHANGELOG.md index 079098c3..7160aa25 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Security +- Hardened the Python binding's GIL-released search, candidate, scoring, and + `add` paths: NumPy inputs are now copied into Rust-owned buffers before + `py.detach`, so safe Python code cannot race a detached Rust read by mutating + the same array from another thread. 
- Updated release governance to document and audit the two-approver `crates-io` / `pypi` GitHub Environment gates: `Fieldnote-Echo` and `toadkicker` are listed as required reviewers, self-review is blocked, and a @@ -23,6 +27,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Fixed +- Added a persisted-format registry that drives probe, manifest-coverage, and + C-ABI load decisions from one table; `.ovfs` now remains explicitly + known-but-not-probeable/not-manifest-covered, and the C ABI reports it as an + unsupported format rather than a corrupt index. - Hid the `SubsetScratch::capacities_for_test` helper behind the non-default `test-utils` feature and cleaned stale release-doc comments around FastScan and b=8 bucket rustdoc. diff --git a/README.md b/README.md index c945b7e1..11d8eba7 100644 --- a/README.md +++ b/README.md @@ -276,9 +276,9 @@ The runtime dependency floor is `numpy>=2.2`. The consolidated cross-language ownership and lifetime contract is in [`docs/bindings-safety.md`](docs/bindings-safety.md). -Python search, candidate-generation, and scoring methods release the GIL and -read NumPy inputs in place. Callers must not mutate query, corpus, candidate, -or scoring input arrays passed to those methods until the call returns. +Python search, candidate-generation, scoring, and `add` methods release the GIL +after copying NumPy inputs into Rust-owned buffers, so ordinary Python in-place +array mutation in another thread cannot race the detached Rust scan. The C ABI allows concurrent search and info calls on one loaded handle. `ordvec_index_free` must not race with any other call on the same handle. diff --git a/docs/bindings-safety.md b/docs/bindings-safety.md index b0d400ee..f7444a53 100644 --- a/docs/bindings-safety.md +++ b/docs/bindings-safety.md @@ -13,8 +13,8 @@ still own scheduling, path trust, input mutability, and deployment provenance. Mutation methods such as `add` require exclusive access. 
- Python search, candidate-generation, scoring, and `add` methods release the GIL while Rust performs the heavy work. PyO3 still enforces object borrow - rules, but caller-owned NumPy arrays are read in place while the GIL is - released. + rules, and the binding copies NumPy input arrays into Rust-owned buffers + before releasing the GIL. - The C ABI permits concurrent `ordvec_index_search`, `ordvec_index_probe`, and `ordvec_index_info` calls on one loaded handle. `ordvec_index_free` must not race with any other call on that handle. @@ -23,11 +23,13 @@ still own scheduling, path trust, input mutability, and deployment provenance. ## Borrowed Inputs -Caller-provided buffers are borrowed for the duration of the call and are not -retained after the function returns. +Caller-provided Rust slices, C buffers, and Go slices are borrowed for the +duration of the call and are not retained after the function returns. Python +NumPy inputs that cross a GIL-released call are copied before the GIL is +released. -- Do not mutate Rust slices, NumPy arrays, C buffers, or Go slices while a call - that received them is in progress. +- Do not mutate Rust slices, C buffers, or Go slices while a call that received + them is in progress. - Query, corpus, candidate, output, hit, and stats buffers remain caller-owned unless a specific API says otherwise. - Candidate lists are entry lists, not sets. 
Duplicate candidate IDs are scored diff --git a/ordvec-ffi/src/lib.rs b/ordvec-ffi/src/lib.rs index 399f164a..a85a376b 100644 --- a/ordvec-ffi/src/lib.rs +++ b/ordvec-ffi/src/lib.rs @@ -10,7 +10,10 @@ use std::path::Path; use std::ptr; use std::time::Instant; -use ordvec::{probe_index_metadata, Bitmap, IndexKind, IndexMetadata, IndexParams, RankQuant}; +use ordvec::{ + probe_index_metadata, Bitmap, FfiLoadSupport, IndexKind, IndexMetadata, IndexParams, + PersistedFormat, RankQuant, +}; pub type ordvec_status_t = u32; pub type ordvec_index_kind_t = u32; @@ -733,25 +736,29 @@ pub unsafe extern "C" fn ordvec_index_load( .map_err(|err| io_to_ffi(err, "stat index"))? .len(); - // Accept both the current `OV*` magics and the legacy turbovec-era - // `TV*` magics (back-compat) — mirrors the loaders in `rank_io.rs`. - let index = match &magic { - b"OVRQ" | b"TVRQ" => LoadedIndex::RankQuant( - RankQuant::load(path).map_err(|err| io_to_ffi(err, "load RankQuant index"))?, - ), - b"OVBM" | b"TVBM" => LoadedIndex::Bitmap( - Bitmap::load(path).map_err(|err| io_to_ffi(err, "load Bitmap index"))?, - ), - b"OVR1" | b"OVSB" | b"TVR1" | b"TVSB" => { - return Err(FfiError::new( - ORDVEC_STATUS_UNSUPPORTED_FORMAT, - "ABI v1 supports only RankQuant and Bitmap indexes", - )) + let spec = ordvec::format::lookup_magic(&magic).ok_or_else(|| { + FfiError::new( + ORDVEC_STATUS_CORRUPT_INDEX, + "unrecognized ordvec index magic", + ) + })?; + let index = match spec.ffi_load { + FfiLoadSupport::Supported => match spec.format { + PersistedFormat::RankQuant => LoadedIndex::RankQuant( + RankQuant::load(path).map_err(|err| io_to_ffi(err, "load RankQuant index"))?, + ), + PersistedFormat::Bitmap => LoadedIndex::Bitmap( + Bitmap::load(path).map_err(|err| io_to_ffi(err, "load Bitmap index"))?, + ), + _ => unreachable!("only RankQuant and Bitmap are FFI-loadable in ABI v1"), + }, + FfiLoadSupport::Unsupported { reason } => { + return Err(FfiError::new(ORDVEC_STATUS_UNSUPPORTED_FORMAT, reason)) } _ => { 
return Err(FfiError::new( - ORDVEC_STATUS_CORRUPT_INDEX, - "unrecognized ordvec index magic", + ORDVEC_STATUS_UNSUPPORTED_FORMAT, + "ABI v1 does not support this persisted index format", )) } }; @@ -1504,6 +1511,17 @@ mod tests { sign.add(&[0.0f32; 64]); sign.write(&sign_path).unwrap(); + let fastscan_path = temp_path("fastscan", "ovfs"); + let mut fastscan = Vec::new(); + fastscan.extend_from_slice(b"OVFS"); + fastscan.push(1); + fastscan.extend_from_slice(&8u32.to_le_bytes()); + fastscan.extend_from_slice(&0u32.to_le_bytes()); + std::fs::File::create(&fastscan_path) + .unwrap() + .write_all(&fastscan) + .unwrap(); + let corrupt_path = temp_path("corrupt", "ovrq"); std::fs::File::create(&corrupt_path) .unwrap() @@ -1511,7 +1529,7 @@ mod tests { .unwrap(); unsafe { - for path in [&rank_path, &sign_path] { + for path in [&rank_path, &sign_path, &fastscan_path] { let cpath = CString::new(path.to_str().unwrap()).unwrap(); let mut out = ptr::null_mut(); assert_eq!( @@ -1530,6 +1548,7 @@ mod tests { } std::fs::remove_file(rank_path).ok(); std::fs::remove_file(sign_path).ok(); + std::fs::remove_file(fastscan_path).ok(); std::fs::remove_file(corrupt_path).ok(); } } diff --git a/ordvec-manifest/src/lib.rs b/ordvec-manifest/src/lib.rs index bee68321..be25a5f1 100644 --- a/ordvec-manifest/src/lib.rs +++ b/ordvec-manifest/src/lib.rs @@ -16,8 +16,8 @@ use chrono::{DateTime, SecondsFormat, Utc}; use ordvec::{ - probe_index_metadata, IndexKind as CoreIndexKind, IndexMetadata as CoreIndexMetadata, - IndexParams as CoreIndexParams, + probe_index_metadata, FormatSpec, IndexKind as CoreIndexKind, + IndexMetadata as CoreIndexMetadata, IndexParams as CoreIndexParams, ManifestCoverage, FORMATS, }; use serde::{Deserialize, Serialize}; use sha2::{Digest, Sha256}; @@ -2746,6 +2746,7 @@ pub enum ManifestIndexKind { impl ManifestIndexKind { fn try_from_core(kind: CoreIndexKind) -> Result { + require_manifest_coverage(kind)?; match kind { CoreIndexKind::Rank => Ok(Self::Rank), 
CoreIndexKind::RankQuant => Ok(Self::RankQuant), @@ -2782,6 +2783,11 @@ impl ManifestIndexParams { enum UnsupportedCoreMetadata { Kind(CoreIndexKind), Params(CoreIndexParams), + RegistryMissing(CoreIndexKind), + ManifestNotCovered { + kind: CoreIndexKind, + reason: &'static str, + }, } impl UnsupportedCoreMetadata { @@ -2789,6 +2795,8 @@ impl UnsupportedCoreMetadata { match self { Self::Kind(_) => "artifact_kind_unsupported", Self::Params(_) => "artifact_params_unsupported", + Self::RegistryMissing(_) => "artifact_format_registry_missing", + Self::ManifestNotCovered { .. } => "artifact_manifest_coverage_unsupported", } } @@ -2800,10 +2808,38 @@ impl UnsupportedCoreMetadata { Self::Params(params) => format!( "artifact metadata params {params:?} are not supported by ordvec-manifest v1" ), + Self::RegistryMissing(kind) => format!( + "artifact metadata kind {kind:?} has no ordvec persisted-format registry entry" + ), + Self::ManifestNotCovered { kind, reason } => format!( + "artifact metadata kind {kind:?} is not covered by ordvec-manifest v1: {reason}" + ), } } } +fn format_spec_for_kind( + kind: CoreIndexKind, +) -> Result<&'static FormatSpec, UnsupportedCoreMetadata> { + FORMATS + .iter() + .find(|spec| spec.kind == kind) + .ok_or(UnsupportedCoreMetadata::RegistryMissing(kind)) +} + +fn require_manifest_coverage(kind: CoreIndexKind) -> Result<(), UnsupportedCoreMetadata> { + match format_spec_for_kind(kind)?.manifest { + ManifestCoverage::Covered => Ok(()), + ManifestCoverage::NotCovered { reason, .. } => { + Err(UnsupportedCoreMetadata::ManifestNotCovered { kind, reason }) + } + _ => Err(UnsupportedCoreMetadata::ManifestNotCovered { + kind, + reason: "unsupported manifest coverage stance in ordvec persisted-format registry", + }), + } +} + /// Verified paths and metadata for a caller-managed load. 
/// /// A `VerifiedLoadPlan` means the manifest, primary artifact, row-identity @@ -4019,5 +4055,38 @@ fn hex_digest_eq(a: &str, b: &str) -> bool { a == b } +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn manifest_kind_conversion_uses_format_registry_coverage() { + for spec in FORMATS { + match spec.manifest { + ManifestCoverage::Covered => { + ManifestIndexKind::try_from_core(spec.kind) + .expect("manifest-covered registry rows must map to manifest kinds"); + } + ManifestCoverage::NotCovered { + tracking_issue, + reason, + } => { + let err = ManifestIndexKind::try_from_core(spec.kind).unwrap_err(); + assert_eq!(err.code(), "artifact_manifest_coverage_unsupported"); + assert!(err.message().contains(reason)); + assert!(tracking_issue > 0); + } + _ => panic!("unsupported manifest coverage stance in registry test"), + } + } + assert!(matches!( + ManifestIndexKind::try_from_core(CoreIndexKind::RankQuantFastscan) + .unwrap_err() + .code(), + "artifact_manifest_coverage_unsupported" + )); + } +} + #[cfg(feature = "sqlite")] pub mod sqlite; diff --git a/ordvec-python/README.md b/ordvec-python/README.md index 7f58d75a..6a6aa48c 100644 --- a/ordvec-python/README.md +++ b/ordvec-python/README.md @@ -69,8 +69,9 @@ source needs a Rust toolchain (MSRV 1.89) and ## Safety contract The Python binding releases the GIL while Rust searches, scores, and mutates -indexes. NumPy arrays passed to those methods are read in place while the call -is active; do not mutate them from another thread until the method returns. +indexes. Inputs that cross a GIL-released call are copied into Rust-owned +buffers first, so ordinary Python in-place NumPy mutation from another thread +cannot race the detached Rust scan. The cross-language ownership and lifetime contract is maintained in [`docs/bindings-safety.md`](https://github.com/Project-Navi/ordvec/blob/v0.5.0/docs/bindings-safety.md) for this release line. 
diff --git a/ordvec-python/src/lib.rs b/ordvec-python/src/lib.rs index 77f31778..8be8f9b9 100644 --- a/ordvec-python/src/lib.rs +++ b/ordvec-python/src/lib.rs @@ -24,15 +24,11 @@ //! unmodified — there is no `..` / traversal sanitisation — so callers must //! treat the path as trusted input (see the `ordvec` package docstring). //! -//! Threading: the search / candidate / `add` methods release the GIL -//! (`py.detach`) around the Rust scan and read the input arrays *in place* -//! (the `PyReadonlyArray` keeps the buffer alive and blocks rust-numpy-mediated -//! writes for the call's duration, but a raw Python in-place mutation from -//! another thread is not tracked). So a caller must not mutate an input array -//! from another thread while such a call is in progress — the released GIL lets -//! the write race the read and may yield inconsistent results. This is the -//! usual contract for GIL-releasing numeric extensions (NumPy behaves the same -//! way). +//! Threading: methods that release the GIL (`py.detach`) copy NumPy input +//! buffers into Rust-owned memory before the detached scan. This keeps ordinary +//! Python in-place mutations in other threads from racing Rust reads of borrowed +//! NumPy storage. The copy covers query/vector inputs and candidate/doc-id +//! arrays that are consumed inside the detached closure. use numpy::{IntoPyArray, PyArray1, PyArray2, PyArrayMethods, PyReadonlyArray1, PyReadonlyArray2}; use pyo3::prelude::*; @@ -144,10 +140,9 @@ fn not_contiguous_err() -> PyErr { ) } -/// Candidate / doc-id slice obtained from a NumPy array, either borrowed -/// zero-copy (already `uint32` and contiguous) or owned (converted from another -/// integer dtype). The `Borrowed` variant keeps the `PyReadonlyArray` guard -/// alive so its slice stays valid across a GIL-released `py.detach` call. 
+/// Candidate / doc-id slice obtained from a NumPy array, either borrowed for +/// validation (already `uint32` and contiguous) or owned (converted from another +/// integer dtype). Callers must use `into_owned` before crossing `py.detach`. enum CandidateIds<'py> { Borrowed(PyReadonlyArray1<'py, u32>), Owned(Vec), @@ -160,6 +155,16 @@ impl CandidateIds<'_> { CandidateIds::Owned(v) => Ok(v), } } + + fn into_owned(self) -> PyResult> { + match self { + CandidateIds::Borrowed(ro) => ro + .as_slice() + .map(|slice| slice.to_vec()) + .map_err(|_| not_contiguous_err()), + CandidateIds::Owned(v) => Ok(v), + } + } } /// Coerce a NumPy candidate/doc-id array of *any* integer dtype to `u32`. @@ -446,9 +451,10 @@ impl Rank { self.inner.dim(), std::mem::size_of::(), )?; + let owned = slice.to_vec(); // Release the GIL around the parallel rank-transform / pack so other - // Python threads run during a bulk add. `slice` (`&[f32]`) and - // `&mut self.inner` are both `Ungil`, so no pointer juggling is needed. + // Python threads run during a bulk add. The input buffer is Rust-owned + // before detach, so Python-side mutation cannot race this read. // // SAFETY (detaching on a `&mut self` method): `detach` drops the GIL // but NOT the `&mut self` exclusive borrow — PyO3 holds this object's @@ -456,7 +462,7 @@ impl Rank { // re-acquires the GIL and tries to touch the SAME object gets a clean // `Already borrowed` RuntimeError, never concurrent mutation. Distinct // objects run freely, which is the point of releasing the GIL. 
- py.detach(|| self.inner.add(slice)); + py.detach(|| self.inner.add(&owned)); Ok(()) } @@ -476,7 +482,8 @@ impl Rank { "array must be C-contiguous; call np.ascontiguousarray() first", ) })?; - let results = py.detach(|| self.inner.search(slice, k)); + let owned = slice.to_vec(); + let results = py.detach(|| self.inner.search(&owned, k)); let scores = numpy::ndarray::Array2::from_shape_vec((nq, results.k), results.scores) .map_err(|e| pyo3::exceptions::PyValueError::new_err(e.to_string()))? .into_pyarray(py); @@ -502,7 +509,8 @@ impl Rank { "array must be C-contiguous; call np.ascontiguousarray() first", ) })?; - let results = py.detach(|| self.inner.search_asymmetric(slice, k)); + let owned = slice.to_vec(); + let results = py.detach(|| self.inner.search_asymmetric(&owned, k)); let scores = numpy::ndarray::Array2::from_shape_vec((nq, results.k), results.scores) .map_err(|e| pyo3::exceptions::PyValueError::new_err(e.to_string()))? .into_pyarray(py); @@ -633,9 +641,10 @@ impl RankQuant { self.inner.bytes_per_vec(), std::mem::size_of::(), )?; + let owned = slice.to_vec(); // Release the GIL around the parallel rank-transform / pack so other - // Python threads run during a bulk add. `slice` (`&[f32]`) and - // `&mut self.inner` are both `Ungil`, so no pointer juggling is needed. + // Python threads run during a bulk add. The input buffer is Rust-owned + // before detach, so Python-side mutation cannot race this read. // // SAFETY (detaching on a `&mut self` method): `detach` drops the GIL // but NOT the `&mut self` exclusive borrow — PyO3 holds this object's @@ -643,7 +652,7 @@ impl RankQuant { // re-acquires the GIL and tries to touch the SAME object gets a clean // `Already borrowed` RuntimeError, never concurrent mutation. Distinct // objects run freely, which is the point of releasing the GIL. 
- py.detach(|| self.inner.add(slice)); + py.detach(|| self.inner.add(&owned)); Ok(()) } @@ -661,7 +670,8 @@ impl RankQuant { "array must be C-contiguous; call np.ascontiguousarray() first", ) })?; - let results = py.detach(|| self.inner.search(slice, k)); + let owned = slice.to_vec(); + let results = py.detach(|| self.inner.search(&owned, k)); let scores = numpy::ndarray::Array2::from_shape_vec((nq, results.k), results.scores) .map_err(|e| pyo3::exceptions::PyValueError::new_err(e.to_string()))? .into_pyarray(py); @@ -686,7 +696,8 @@ impl RankQuant { "array must be C-contiguous; call np.ascontiguousarray() first", ) })?; - let results = py.detach(|| self.inner.search_asymmetric(slice, k)); + let owned = slice.to_vec(); + let results = py.detach(|| self.inner.search_asymmetric(&owned, k)); let scores = numpy::ndarray::Array2::from_shape_vec((nq, results.k), results.scores) .map_err(|e| pyo3::exceptions::PyValueError::new_err(e.to_string()))? .into_pyarray(py); @@ -775,9 +786,14 @@ impl RankQuant { // by default) and convert to the core's u32 with checked bounds, then // reject any id outside the corpus before dispatch. let cands = as_u32_ids_1d(candidates, "candidate id")?; - let c_slice = cands.as_slice()?; - check_ids_in_range(c_slice, self.inner.len(), "candidate id")?; - let (scores, ids) = py.detach(|| self.inner.search_asymmetric_subset(q_slice, c_slice, k)); + { + let c_slice = cands.as_slice()?; + check_ids_in_range(c_slice, self.inner.len(), "candidate id")?; + } + let q_owned = q_slice.to_vec(); + let c_owned = cands.into_owned()?; + let (scores, ids) = + py.detach(|| self.inner.search_asymmetric_subset(&q_owned, &c_owned, k)); Ok((scores.into_pyarray(py), ids.into_pyarray(py))) } @@ -872,9 +888,10 @@ impl Bitmap { self.inner.dim() / 64, std::mem::size_of::(), )?; + let owned = slice.to_vec(); // Release the GIL around the parallel rank-transform / pack so other - // Python threads run during a bulk add. 
`slice` (`&[f32]`) and - // `&mut self.inner` are both `Ungil`, so no pointer juggling is needed. + // Python threads run during a bulk add. The input buffer is Rust-owned + // before detach, so Python-side mutation cannot race this read. // // SAFETY (detaching on a `&mut self` method): `detach` drops the GIL // but NOT the `&mut self` exclusive borrow — PyO3 holds this object's @@ -882,7 +899,7 @@ impl Bitmap { // re-acquires the GIL and tries to touch the SAME object gets a clean // `Already borrowed` RuntimeError, never concurrent mutation. Distinct // objects run freely, which is the point of releasing the GIL. - py.detach(|| self.inner.add(slice)); + py.detach(|| self.inner.add(&owned)); Ok(()) } @@ -904,7 +921,8 @@ impl Bitmap { "array must be C-contiguous; call np.ascontiguousarray() first", ) })?; - let results = py.detach(|| self.inner.search(slice, k)); + let owned = slice.to_vec(); + let results = py.detach(|| self.inner.search(&owned, k)); let scores = numpy::ndarray::Array2::from_shape_vec((nq, results.k), results.scores) .map_err(|e| pyo3::exceptions::PyValueError::new_err(e.to_string()))? 
.into_pyarray(py); @@ -935,9 +953,13 @@ impl Bitmap { ) })?; let ids = as_u32_ids_1d(doc_ids, "doc id")?; - let ids_slice = ids.as_slice()?; - check_ids_in_range(ids_slice, self.inner.len(), "doc id")?; - let (scores, out_ids) = py.detach(|| self.inner.search_subset(q_slice, ids_slice, k)); + { + let ids_slice = ids.as_slice()?; + check_ids_in_range(ids_slice, self.inner.len(), "doc id")?; + } + let q_owned = q_slice.to_vec(); + let ids_owned = ids.into_owned()?; + let (scores, out_ids) = py.detach(|| self.inner.search_subset(&q_owned, &ids_owned, k)); Ok((scores.into_pyarray(py), out_ids.into_pyarray(py))) } @@ -959,7 +981,8 @@ impl Bitmap { "array must be C-contiguous; call np.ascontiguousarray() first", ) })?; - let cands = py.detach(|| self.inner.top_m_candidates(slice, m)); + let owned = slice.to_vec(); + let cands = py.detach(|| self.inner.top_m_candidates(&owned, m)); Ok(cands.into_pyarray(py)) } @@ -1012,7 +1035,8 @@ impl Bitmap { batch.checked_mul(n.max(qpv)).ok_or_else(|| { pyo3::exceptions::PyValueError::new_err("batch * index size overflows usize") })?; - let result = py.detach(|| self.inner.top_m_candidates_batched(slice, m)); + let owned = slice.to_vec(); + let result = py.detach(|| self.inner.top_m_candidates_batched(&owned, m)); let m_eff = m.min(n); let total = batch.checked_mul(m_eff).ok_or_else(|| { pyo3::exceptions::PyValueError::new_err("result size (batch * m) overflows usize") @@ -1070,9 +1094,10 @@ impl Bitmap { effective_batch.checked_mul(n.max(qpv)).ok_or_else(|| { pyo3::exceptions::PyValueError::new_err("batch_size * index size overflows usize") })?; + let owned = slice.to_vec(); let result = py.detach(|| { self.inner - .top_m_candidates_batched_chunked(slice, m, effective_batch) + .top_m_candidates_batched_chunked(&owned, m, effective_batch) }); let m_eff = m.min(n); let total = n_queries.checked_mul(m_eff).ok_or_else(|| { @@ -1127,22 +1152,26 @@ impl Bitmap { // default) and convert to u32 with checked bounds, then reject any id // 
outside the corpus before dispatch. let doc_ids = as_u32_ids_1d(doc_ids, "doc id")?; - let ids_slice = doc_ids.as_slice()?; - check_ids_in_range(ids_slice, self.inner.len(), "doc id")?; - // Python-side ergonomic policy (NOT a core correctness requirement): - // the Rust core scores unsorted ids correctly in input order, just with - // worse cache locality. The binding requires the sorted, cache-friendly - // form and returns a clean ValueError rather than silently running the - // slow path. - if ids_slice.windows(2).any(|w| w[0] > w[1]) { - return Err(pyo3::exceptions::PyValueError::new_err( - "doc_ids must be sorted in ascending order", - )); + { + let ids_slice = doc_ids.as_slice()?; + check_ids_in_range(ids_slice, self.inner.len(), "doc id")?; + // Python-side ergonomic policy (NOT a core correctness requirement): + // the Rust core scores unsorted ids correctly in input order, just with + // worse cache locality. The binding requires the sorted, cache-friendly + // form and returns a clean ValueError rather than silently running the + // slow path. + if ids_slice.windows(2).any(|w| w[0] > w[1]) { + return Err(pyo3::exceptions::PyValueError::new_err( + "doc_ids must be sorted in ascending order", + )); + } } - let mut out = vec![0u32; ids_slice.len()]; + let qb_owned = qb_slice.to_vec(); + let ids_owned = doc_ids.into_owned()?; + let mut out = vec![0u32; ids_owned.len()]; py.detach(|| { self.inner - .body_overlap_scores_subset(qb_slice, ids_slice, &mut out) + .body_overlap_scores_subset(&qb_owned, &ids_owned, &mut out) }); Ok(out.into_pyarray(py)) } @@ -1250,9 +1279,10 @@ impl SignBitmap { self.inner.dim() / 64, std::mem::size_of::(), )?; + let owned = slice.to_vec(); // Release the GIL around the parallel rank-transform / pack so other - // Python threads run during a bulk add. `slice` (`&[f32]`) and - // `&mut self.inner` are both `Ungil`, so no pointer juggling is needed. + // Python threads run during a bulk add. 
The input buffer is Rust-owned + // before detach, so Python-side mutation cannot race this read. // // SAFETY (detaching on a `&mut self` method): `detach` drops the GIL // but NOT the `&mut self` exclusive borrow — PyO3 holds this object's @@ -1260,7 +1290,7 @@ impl SignBitmap { // re-acquires the GIL and tries to touch the SAME object gets a clean // `Already borrowed` RuntimeError, never concurrent mutation. Distinct // objects run freely, which is the point of releasing the GIL. - py.detach(|| self.inner.add(slice)); + py.detach(|| self.inner.add(&owned)); Ok(()) } @@ -1280,7 +1310,8 @@ impl SignBitmap { "array must be C-contiguous; call np.ascontiguousarray() first", ) })?; - let cands = py.detach(|| self.inner.top_m_candidates(slice, m)); + let owned = slice.to_vec(); + let cands = py.detach(|| self.inner.top_m_candidates(&owned, m)); Ok(cands.into_pyarray(py)) } @@ -1313,7 +1344,8 @@ impl SignBitmap { batch.checked_mul(n.max(qpv)).ok_or_else(|| { pyo3::exceptions::PyValueError::new_err("batch * index size overflows usize") })?; - let result = py.detach(|| self.inner.top_m_candidates_batched(slice, m)); + let owned = slice.to_vec(); + let result = py.detach(|| self.inner.top_m_candidates_batched(&owned, m)); // m_eff is the per-row width the Rust impl guarantees for every non-empty // row; deriving it from `m` and the index size keeps the shape consistent // at `batch=0`. 
@@ -1345,7 +1377,8 @@ impl SignBitmap { "array must be C-contiguous; call np.ascontiguousarray() first", ) })?; - let scores = py.detach(|| self.inner.score_all(slice)); + let owned = slice.to_vec(); + let scores = py.detach(|| self.inner.score_all(&owned)); Ok(scores.into_pyarray(py)) } @@ -1369,7 +1402,8 @@ impl SignBitmap { batch.checked_mul(n.max(qpv)).ok_or_else(|| { pyo3::exceptions::PyValueError::new_err("batch * index size overflows usize") })?; - let scores = py.detach(|| self.inner.score_all_batched_flat(slice)); + let owned = slice.to_vec(); + let scores = py.detach(|| self.inner.score_all_batched_flat(&owned)); Ok(numpy::ndarray::Array2::from_shape_vec((batch, n), scores) .expect("internal: batched dense score flatten shape invariant") .into_pyarray(py)) @@ -1668,7 +1702,8 @@ fn search_asymmetric_byte_lut<'py>( // capturing `index` (a `PyRef`) directly would make the closure non-`Ungil`, // but a bare `&ordvec_core::RankQuant` is fine to carry across `detach`. let inner = &index.inner; - let results = py.detach(|| ordvec_core::search_asymmetric_byte_lut(inner, slice, k)); + let owned = slice.to_vec(); + let results = py.detach(|| ordvec_core::search_asymmetric_byte_lut(inner, &owned, k)); let scores = numpy::ndarray::Array2::from_shape_vec((nq, results.k), results.scores) .map_err(|e| pyo3::exceptions::PyValueError::new_err(e.to_string()))? .into_pyarray(py); @@ -1715,8 +1750,10 @@ fn rankquant_eval_search<'py>( "array must be C-contiguous; call np.ascontiguousarray() first", ) })?; + let corpus_owned = corpus_slice.to_vec(); + let query_owned = query_slice.to_vec(); let results = - py.detach(|| ordvec_core::rankquant_eval_search(corpus_slice, query_slice, dim, bits, k)); + py.detach(|| ordvec_core::rankquant_eval_search(&corpus_owned, &query_owned, dim, bits, k)); let scores = numpy::ndarray::Array2::from_shape_vec((nq, results.k), results.scores) .map_err(|e| pyo3::exceptions::PyValueError::new_err(e.to_string()))? 
.into_pyarray(py); diff --git a/src/format.rs b/src/format.rs new file mode 100644 index 00000000..2b67eab8 --- /dev/null +++ b/src/format.rs @@ -0,0 +1,244 @@ +//! Persisted-format registry shared by probes, bindings, and manifest tooling. +//! +//! This module is the single source of truth for on-disk ordvec magics and the +//! support stance of each persisted format. Loader and probe implementations +//! still live with their owning index code, but dispatchers should identify +//! formats through this registry so new magics cannot silently drift across +//! trust-boundary surfaces. + +use crate::rank_io::IndexKind; + +// Current ordvec magics — written by this crate going forward. +pub(crate) const OVR_MAGIC: &[u8; 4] = b"OVR1"; +pub(crate) const OVRQ_MAGIC: &[u8; 4] = b"OVRQ"; +pub(crate) const OVBM_MAGIC: &[u8; 4] = b"OVBM"; +pub(crate) const OVSB_MAGIC: &[u8; 4] = b"OVSB"; +// FastScan b=2 block-32 layout (`RankQuantFastscan`). New in the ordvec format: +// there is no turbovec-era counterpart, so it has no legacy magic. +pub(crate) const OVFS_MAGIC: &[u8; 4] = b"OVFS"; + +// Legacy turbovec-era magics — still accepted on load for backward +// compatibility, never written. 
+pub(crate) const TVR_MAGIC: &[u8; 4] = b"TVR1"; +pub(crate) const TVRQ_MAGIC: &[u8; 4] = b"TVRQ"; +pub(crate) const TVBM_MAGIC: &[u8; 4] = b"TVBM"; +pub(crate) const TVSB_MAGIC: &[u8; 4] = b"TVSB"; + +#[derive(Copy, Clone, Debug, PartialEq, Eq)] +#[non_exhaustive] +pub enum PersistedFormat { + Rank, + RankQuant, + Bitmap, + SignBitmap, + RankQuantFastscan, +} + +#[derive(Copy, Clone, Debug, PartialEq, Eq)] +#[non_exhaustive] +pub enum ProbeCoverage { + Covered, + NotCovered { + tracking_issue: u32, + reason: &'static str, + }, +} + +#[derive(Copy, Clone, Debug, PartialEq, Eq)] +#[non_exhaustive] +pub enum ManifestCoverage { + Covered, + NotCovered { + tracking_issue: u32, + reason: &'static str, + }, +} + +#[derive(Copy, Clone, Debug, PartialEq, Eq)] +#[non_exhaustive] +pub enum FfiLoadSupport { + Supported, + Unsupported { reason: &'static str }, +} + +#[derive(Copy, Clone, Debug, PartialEq, Eq)] +pub struct FormatSpec { + pub format: PersistedFormat, + pub extension: &'static str, + pub magic: &'static [u8; 4], + pub legacy_magic: Option<&'static [u8; 4]>, + pub kind: IndexKind, + pub probe: ProbeCoverage, + pub manifest: ManifestCoverage, + pub ffi_load: FfiLoadSupport, +} + +const FASTSCAN_PROBE_UNSUPPORTED: &str = "OVFS (RankQuantFastscan) metadata probing is not \ +supported in this version; load the index with RankQuantFastscan::load (tracked in #232)"; +const FASTSCAN_MANIFEST_UNSUPPORTED: &str = + "RankQuantFastscan (.ovfs) is not covered by ordvec-manifest v1 (tracked in #232)"; +const FFI_CORE_ONLY: &str = "ABI v1 supports only RankQuant and Bitmap indexes"; +const FFI_FASTSCAN_UNSUPPORTED: &str = + "ABI v1 does not support RankQuantFastscan indexes (tracked in #232)"; + +pub const FORMATS: &[FormatSpec] = &[ + FormatSpec { + format: PersistedFormat::Rank, + extension: "ovr", + magic: OVR_MAGIC, + legacy_magic: Some(TVR_MAGIC), + kind: IndexKind::Rank, + probe: ProbeCoverage::Covered, + manifest: ManifestCoverage::Covered, + ffi_load: 
FfiLoadSupport::Unsupported { + reason: FFI_CORE_ONLY, + }, + }, + FormatSpec { + format: PersistedFormat::RankQuant, + extension: "ovrq", + magic: OVRQ_MAGIC, + legacy_magic: Some(TVRQ_MAGIC), + kind: IndexKind::RankQuant, + probe: ProbeCoverage::Covered, + manifest: ManifestCoverage::Covered, + ffi_load: FfiLoadSupport::Supported, + }, + FormatSpec { + format: PersistedFormat::Bitmap, + extension: "ovbm", + magic: OVBM_MAGIC, + legacy_magic: Some(TVBM_MAGIC), + kind: IndexKind::Bitmap, + probe: ProbeCoverage::Covered, + manifest: ManifestCoverage::Covered, + ffi_load: FfiLoadSupport::Supported, + }, + FormatSpec { + format: PersistedFormat::SignBitmap, + extension: "ovsb", + magic: OVSB_MAGIC, + legacy_magic: Some(TVSB_MAGIC), + kind: IndexKind::SignBitmap, + probe: ProbeCoverage::Covered, + manifest: ManifestCoverage::Covered, + ffi_load: FfiLoadSupport::Unsupported { + reason: FFI_CORE_ONLY, + }, + }, + FormatSpec { + format: PersistedFormat::RankQuantFastscan, + extension: "ovfs", + magic: OVFS_MAGIC, + legacy_magic: None, + kind: IndexKind::RankQuantFastscan, + probe: ProbeCoverage::NotCovered { + tracking_issue: 232, + reason: FASTSCAN_PROBE_UNSUPPORTED, + }, + manifest: ManifestCoverage::NotCovered { + tracking_issue: 232, + reason: FASTSCAN_MANIFEST_UNSUPPORTED, + }, + ffi_load: FfiLoadSupport::Unsupported { + reason: FFI_FASTSCAN_UNSUPPORTED, + }, + }, +]; + +pub fn lookup_magic(magic: &[u8; 4]) -> Option<&'static FormatSpec> { + FORMATS + .iter() + .find(|spec| spec.magic == magic || spec.legacy_magic == Some(magic)) +} + +pub fn spec(format: PersistedFormat) -> &'static FormatSpec { + FORMATS + .iter() + .find(|spec| spec.format == format) + .expect("every PersistedFormat must have a FormatSpec") +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn registry_has_one_row_per_current_format() { + assert_eq!( + FORMATS.len(), + 5, + "new formats must update this registry test" + ); + assert_eq!(spec(PersistedFormat::Rank).kind, IndexKind::Rank); 
+ assert_eq!(spec(PersistedFormat::RankQuant).kind, IndexKind::RankQuant); + assert_eq!(spec(PersistedFormat::Bitmap).kind, IndexKind::Bitmap); + assert_eq!( + spec(PersistedFormat::SignBitmap).kind, + IndexKind::SignBitmap + ); + assert_eq!( + spec(PersistedFormat::RankQuantFastscan).kind, + IndexKind::RankQuantFastscan + ); + } + + #[test] + fn every_magic_and_legacy_magic_resolves_to_its_row() { + for spec in FORMATS { + assert_eq!( + lookup_magic(spec.magic).map(|found| found.format), + Some(spec.format) + ); + if let Some(legacy_magic) = spec.legacy_magic { + assert_eq!( + lookup_magic(legacy_magic).map(|found| found.format), + Some(spec.format) + ); + } + } + assert!(lookup_magic(b"NOPE").is_none()); + } + + #[test] + fn manifest_coverage_is_explicit_for_every_format() { + for spec in FORMATS { + match spec.manifest { + ManifestCoverage::Covered => {} + ManifestCoverage::NotCovered { + tracking_issue, + reason, + } => { + assert!(tracking_issue > 0); + assert!(!reason.trim().is_empty()); + } + } + } + assert!(matches!( + spec(PersistedFormat::RankQuantFastscan).manifest, + ManifestCoverage::NotCovered { + tracking_issue: 232, + .. 
+ } + )); + } + + #[test] + fn ffi_load_support_is_limited_to_abi_v1_formats() { + for spec in FORMATS { + match (spec.format, spec.ffi_load) { + ( + PersistedFormat::RankQuant | PersistedFormat::Bitmap, + FfiLoadSupport::Supported, + ) => {} + ( + PersistedFormat::Rank + | PersistedFormat::SignBitmap + | PersistedFormat::RankQuantFastscan, + FfiLoadSupport::Unsupported { reason }, + ) => assert!(!reason.trim().is_empty()), + other => panic!("unexpected FFI load support matrix entry: {other:?}"), + } + } + } +} diff --git a/src/lib.rs b/src/lib.rs index ea854f95..aebf4ee3 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -83,6 +83,7 @@ pub mod const_weight_bitmap; #[cfg(feature = "experimental")] mod contingency; mod fastscan; +pub mod format; #[cfg(feature = "experimental")] mod multi_bucket; mod quant; @@ -94,6 +95,9 @@ pub mod sign_bitmap; mod util; pub use bitmap::Bitmap; +pub use format::{ + FfiLoadSupport, FormatSpec, ManifestCoverage, PersistedFormat, ProbeCoverage, FORMATS, +}; pub use quant::SubsetScratch; pub use quant::{rankquant_eval_search, RankQuant, RankQuantCapability, TwoStageCandidatePolicy}; pub use rank::Rank; diff --git a/src/rank_io.rs b/src/rank_io.rs index 67924556..e05505c8 100644 --- a/src/rank_io.rs +++ b/src/rank_io.rs @@ -73,21 +73,11 @@ use std::fs::File; use std::io::{self, BufReader, BufWriter, Read, Seek, SeekFrom, Write}; use std::path::Path; -// Current ordvec magics — written by this crate going forward. -const OVR_MAGIC: &[u8; 4] = b"OVR1"; -const OVRQ_MAGIC: &[u8; 4] = b"OVRQ"; -const OVBM_MAGIC: &[u8; 4] = b"OVBM"; -const OVSB_MAGIC: &[u8; 4] = b"OVSB"; -// FastScan b=2 block-32 layout (`RankQuantFastscan`). New in the ordvec format — -// there is no turbovec-era counterpart, so it has no legacy magic. -const OVFS_MAGIC: &[u8; 4] = b"OVFS"; -// Legacy turbovec-era magics — still accepted on load for backward -// compatibility, never written. 
Files produced before the ordvec rebrand carry -// these; loaders accept either the `OV*` or the matching `TV*` magic. -const TVR_MAGIC: &[u8; 4] = b"TVR1"; -const TVRQ_MAGIC: &[u8; 4] = b"TVRQ"; -const TVBM_MAGIC: &[u8; 4] = b"TVBM"; -const TVSB_MAGIC: &[u8; 4] = b"TVSB"; +use crate::format::{ + self, PersistedFormat, ProbeCoverage, OVBM_MAGIC, OVFS_MAGIC, OVRQ_MAGIC, OVR_MAGIC, + OVSB_MAGIC, TVBM_MAGIC, TVRQ_MAGIC, TVR_MAGIC, TVSB_MAGIC, +}; + const VERSION: u8 = 1; /// Persisted index family identified from an on-disk ordvec index header. @@ -98,6 +88,7 @@ pub enum IndexKind { RankQuant, Bitmap, SignBitmap, + RankQuantFastscan, } /// Format-specific parameters declared by an on-disk ordvec index header. @@ -108,6 +99,7 @@ pub enum IndexParams { RankQuant { bits: u8 }, Bitmap { n_top: usize }, SignBitmap, + RankQuantFastscan { bits: u8 }, } /// Header-derived metadata for a persisted ordvec index. @@ -378,22 +370,16 @@ pub fn probe_index_metadata(path: impl AsRef<Path>) -> io::Result<IndexMetadata> let file_size_bytes = file.metadata()?.len(); let mut f = BufReader::new(file); let magic = read_magic(&mut f, "ordvec index")?; - match &magic { - OVR_MAGIC | TVR_MAGIC => probe_rank_metadata(&mut f, file_size_bytes), - OVRQ_MAGIC | TVRQ_MAGIC => probe_rankquant_metadata(&mut f, file_size_bytes), - OVBM_MAGIC | TVBM_MAGIC => probe_bitmap_metadata(&mut f, file_size_bytes), - OVSB_MAGIC | TVSB_MAGIC => probe_sign_bitmap_metadata(&mut f, file_size_bytes), - // `OVFS` (RankQuantFastscan) is a recognized magic, but metadata probing - // is intentionally NOT wired up here yet. `.ovfs` files still - // round-trip via `RankQuantFastscan::{write,load}`; only this - // metadata-probe path is pending. Return a specific, actionable error - // rather than letting it fall through to the generic unknown-magic case - // (which would be misleading, since the magic *is* known).
- OVFS_MAGIC => Err(invalid( - "OVFS (RankQuantFastscan) metadata probing is not supported in this \ - version; load the index with RankQuantFastscan::load (tracked in #232)", - )), - _ => Err(invalid("unknown ordvec index magic")), + let spec = format::lookup_magic(&magic).ok_or_else(|| invalid("unknown ordvec index magic"))?; + match spec.format { + PersistedFormat::Rank => probe_rank_metadata(&mut f, file_size_bytes), + PersistedFormat::RankQuant => probe_rankquant_metadata(&mut f, file_size_bytes), + PersistedFormat::Bitmap => probe_bitmap_metadata(&mut f, file_size_bytes), + PersistedFormat::SignBitmap => probe_sign_bitmap_metadata(&mut f, file_size_bytes), + PersistedFormat::RankQuantFastscan => match spec.probe { + ProbeCoverage::Covered => unreachable!("FastScan probe is not wired yet"), + ProbeCoverage::NotCovered { reason, .. } => Err(invalid(reason)), + }, } } @@ -1906,8 +1892,8 @@ mod tests { // Probing a valid `.ovfs` file returns a specific, actionable error — NOT the // generic "unknown ordvec index magic" (which would be misleading, since the // magic *is* recognized). Metadata-probe support for OVFS is deferred to #232; - // this pins the deferral contract so it can't silently regress to the generic - // case. See `probe_index_metadata`'s `OVFS_MAGIC` arm. + // this pins the registry-backed deferral contract so it can't silently + // regress to the generic case. 
#[test] fn probe_rejects_ovfs_with_specific_unsupported_error() { use super::{probe_index_metadata, write_fastscan}; From f849f2b0bf5352597c316258cae53f859c2b0290 Mon Sep 17 00:00:00 2001 From: Nelson Spence Date: Fri, 19 Jun 2026 18:46:06 -0500 Subject: [PATCH 4/7] Tighten audit PR review items --- CHANGELOG.md | 4 +++- README.md | 3 ++- docs/bindings-safety.md | 3 ++- ordvec-python/README.md | 3 ++- ordvec-python/src/lib.rs | 6 +++--- src/format.rs | 4 ++++ src/lib.rs | 2 ++ 7 files changed, 18 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7160aa25..5ba7e1bf 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,7 +12,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Hardened the Python binding's GIL-released search, candidate, scoring, and `add` paths: NumPy inputs are now copied into Rust-owned buffers before `py.detach`, so safe Python code cannot race a detached Rust read by mutating - the same array from another thread. + the same array from another thread. This intentionally trades zero-copy + detached reads for race-free copied inputs; large calls may temporarily require + an additional input-sized buffer. - Updated release governance to document and audit the two-approver `crates-io` / `pypi` GitHub Environment gates: `Fieldnote-Echo` and `toadkicker` are listed as required reviewers, self-review is blocked, and a diff --git a/README.md b/README.md index 11d8eba7..85b10c33 100644 --- a/README.md +++ b/README.md @@ -278,7 +278,8 @@ The consolidated cross-language ownership and lifetime contract is in Python search, candidate-generation, scoring, and `add` methods release the GIL after copying NumPy inputs into Rust-owned buffers, so ordinary Python in-place -array mutation in another thread cannot race the detached Rust scan. +array mutation in another thread cannot race the detached Rust scan. Large calls +may temporarily require an additional input-sized buffer. 
The C ABI allows concurrent search and info calls on one loaded handle. `ordvec_index_free` must not race with any other call on the same handle. diff --git a/docs/bindings-safety.md b/docs/bindings-safety.md index f7444a53..31707ccf 100644 --- a/docs/bindings-safety.md +++ b/docs/bindings-safety.md @@ -14,7 +14,8 @@ still own scheduling, path trust, input mutability, and deployment provenance. - Python search, candidate-generation, scoring, and `add` methods release the GIL while Rust performs the heavy work. PyO3 still enforces object borrow rules, and the binding copies NumPy input arrays into Rust-owned buffers - before releasing the GIL. + before releasing the GIL. Large calls may temporarily require an additional + input-sized buffer. - The C ABI permits concurrent `ordvec_index_search`, `ordvec_index_probe`, and `ordvec_index_info` calls on one loaded handle. `ordvec_index_free` must not race with any other call on that handle. diff --git a/ordvec-python/README.md b/ordvec-python/README.md index 6a6aa48c..d6ee5c98 100644 --- a/ordvec-python/README.md +++ b/ordvec-python/README.md @@ -71,7 +71,8 @@ source needs a Rust toolchain (MSRV 1.89) and The Python binding releases the GIL while Rust searches, scores, and mutates indexes. Inputs that cross a GIL-released call are copied into Rust-owned buffers first, so ordinary Python in-place NumPy mutation from another thread -cannot race the detached Rust scan. +cannot race the detached Rust scan. Large calls may temporarily require an +additional input-sized buffer. The cross-language ownership and lifetime contract is maintained in [`docs/bindings-safety.md`](https://github.com/Project-Navi/ordvec/blob/v0.5.0/docs/bindings-safety.md) for this release line. 
diff --git a/ordvec-python/src/lib.rs b/ordvec-python/src/lib.rs index 8be8f9b9..c7ba3b8c 100644 --- a/ordvec-python/src/lib.rs +++ b/ordvec-python/src/lib.rs @@ -113,9 +113,9 @@ fn check_bits_124(bits: u8) -> PyResult<()> { Ok(()) } -/// Reject a `bits` value outside the Python constructor surface. The Rust core -/// has b=8 evidence/refinement helpers, but Python `RankQuant` exposes only the -/// byte-aligned persisted widths through this constructor path. +/// Reject a `bits` value outside the Python primitive-helper surface. The Rust +/// core has b=8 evidence/refinement helpers, but these Python helpers still +/// cap single-byte bucket alphabets at b<=7 in v0.5.0. fn check_bits_max7(bits: u8) -> PyResult<()> { if bits > 7 { return Err(pyo3::exceptions::PyValueError::new_err("bits must be <= 7")); diff --git a/src/format.rs b/src/format.rs index 2b67eab8..21145aae 100644 --- a/src/format.rs +++ b/src/format.rs @@ -5,6 +5,10 @@ //! still live with their owning index code, but dispatchers should identify //! formats through this registry so new magics cannot silently drift across //! trust-boundary surfaces. +//! +//! This module is `#[doc(hidden)]` in v0.5.0. It is public only as lockstep +//! support for workspace crates such as `ordvec-manifest`, not as a documented +//! downstream extension surface. 
use crate::rank_io::IndexKind; diff --git a/src/lib.rs b/src/lib.rs index aebf4ee3..1220dadf 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -83,6 +83,7 @@ pub mod const_weight_bitmap; #[cfg(feature = "experimental")] mod contingency; mod fastscan; +#[doc(hidden)] pub mod format; #[cfg(feature = "experimental")] mod multi_bucket; @@ -95,6 +96,7 @@ pub mod sign_bitmap; mod util; pub use bitmap::Bitmap; +#[doc(hidden)] pub use format::{ FfiLoadSupport, FormatSpec, ManifestCoverage, PersistedFormat, ProbeCoverage, FORMATS, }; From a3ba31c63b3520ea256949e0fa4915194b370baa Mon Sep 17 00:00:00 2001 From: Nelson Spence Date: Fri, 19 Jun 2026 18:56:54 -0500 Subject: [PATCH 5/7] Add Python GIL snapshot regression test --- ordvec-python/tests/test_rank_quant.py | 46 ++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/ordvec-python/tests/test_rank_quant.py b/ordvec-python/tests/test_rank_quant.py index cdd0c5fd..12406fd1 100644 --- a/ordvec-python/tests/test_rank_quant.py +++ b/ordvec-python/tests/test_rank_quant.py @@ -10,6 +10,8 @@ from __future__ import annotations from concurrent.futures import ThreadPoolExecutor +import threading +import time import numpy as np import pytest @@ -134,6 +136,50 @@ def run_search(): np.testing.assert_allclose(scores, baseline_scores, rtol=0, atol=0) +def test_search_asymmetric_snapshots_query_before_detach(): + corpus = np.ascontiguousarray(unit_vectors(25_000, 128, seed=303)) + queries = np.ascontiguousarray(unit_vectors(256, 128, seed=404)) + idx = RankQuant(dim=128, bits=2) + idx.add(corpus) + + baseline_scores, baseline_indices = idx.search_asymmetric(queries.copy(), k=8) + + for _ in range(3): + mutable_queries = queries.copy() + result: dict[str, tuple[np.ndarray, np.ndarray]] = {} + errors: list[BaseException] = [] + + def run_search() -> None: + try: + result["out"] = idx.search_asymmetric(mutable_queries, k=8) + except BaseException as exc: # pragma: no cover - re-raised below + errors.append(exc) + + worker = 
threading.Thread(target=run_search) + worker.start() + # Give the worker a chance to enter the extension and release the GIL. + # With copy-before-detach, this thread cannot run until the Rust-owned + # snapshot exists; with a borrowed detached slice, the mutations below + # can change query rows that Rust has not consumed yet. + time.sleep(0.001) + + mutations = 0 + while worker.is_alive() and mutations < 1_000: + mutable_queries[:, :] = -queries + mutable_queries[:, ::2] = 0.0 + mutations += 1 + + worker.join(timeout=5.0) + assert not worker.is_alive() + if errors: + raise errors[0] + assert mutations > 0, "search finished before mutation tripwire ran" + + scores, indices = result["out"] + np.testing.assert_array_equal(indices, baseline_indices) + np.testing.assert_allclose(scores, baseline_scores, rtol=0, atol=0) + + @pytest.mark.parametrize("bits", [1, 2, 4]) def test_rankquant_eval_search_matches_rankquant_search(bits): vectors = unit_vectors(45, 128, seed=31 + bits) From eeb7be218d5e39c97251e7889cfed06e01fb67cd Mon Sep 17 00:00:00 2001 From: Nelson Spence Date: Fri, 19 Jun 2026 19:01:48 -0500 Subject: [PATCH 6/7] Address Python snapshot test review nit --- ordvec-python/tests/test_rank_quant.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ordvec-python/tests/test_rank_quant.py b/ordvec-python/tests/test_rank_quant.py index 12406fd1..f821ce55 100644 --- a/ordvec-python/tests/test_rank_quant.py +++ b/ordvec-python/tests/test_rank_quant.py @@ -147,12 +147,12 @@ def test_search_asymmetric_snapshots_query_before_detach(): for _ in range(3): mutable_queries = queries.copy() result: dict[str, tuple[np.ndarray, np.ndarray]] = {} - errors: list[BaseException] = [] + errors: list[Exception] = [] def run_search() -> None: try: result["out"] = idx.search_asymmetric(mutable_queries, k=8) - except BaseException as exc: # pragma: no cover - re-raised below + except Exception as exc: # pragma: no cover - re-raised below errors.append(exc) worker = 
threading.Thread(target=run_search) From 3fed7e9e3737b77fb90be402e5ee5f427d28e8b1 Mon Sep 17 00:00:00 2001 From: Nelson Spence Date: Fri, 19 Jun 2026 19:11:43 -0500 Subject: [PATCH 7/7] Relax Python snapshot test coverage timeout --- ordvec-python/tests/test_rank_quant.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/ordvec-python/tests/test_rank_quant.py b/ordvec-python/tests/test_rank_quant.py index f821ce55..a6e1e2f4 100644 --- a/ordvec-python/tests/test_rank_quant.py +++ b/ordvec-python/tests/test_rank_quant.py @@ -169,8 +169,10 @@ def run_search() -> None: mutable_queries[:, ::2] = 0.0 mutations += 1 - worker.join(timeout=5.0) - assert not worker.is_alive() + worker.join(timeout=60.0) + assert not worker.is_alive(), ( + "search did not finish within the coverage-safe timeout" + ) if errors: raise errors[0] assert mutations > 0, "search finished before mutation tripwire ran"