diff --git a/.github/pull_request_bodies/caif-ifc-index-authority.md b/.github/pull_request_bodies/caif-ifc-index-authority.md new file mode 100644 index 0000000..ac2b595 --- /dev/null +++ b/.github/pull_request_bodies/caif-ifc-index-authority.md @@ -0,0 +1,49 @@ +## Summary + +Adds an optional CAIF-style Index Authority Receipt for ordvec benchmark evidence. + +The goal is to make ordvec's index-first retrieval evidence machine-readable: quality delta, bytes/vector, latency regime, benchmark scope, limitations, fallback conditions, and a deterministic receipt hash. + +## Why + +ordvec already has a strong index-first compute story: compressed ordinal/sign retrieval can preserve retrieval quality under stated benchmark scopes while reducing storage and latency. + +This PR adds a small evidence packet and verifier so downstream systems can answer: + +> Is this compressed/index-first retrieval path evidence-supported before dense compute for this stated workload scope? + +## What this includes + +- `docs/INDEX_AUTHORITY_RECEIPTS.md` +- `examples/caif/trec-covid-sign-rq2.index-authority.json` +- `tools/verify_index_authority.py` + +## What this does not do + +- Does not change Rust code +- Does not change `Cargo.toml` +- Does not add runtime dependencies +- Does not add CI requirements +- Does not claim new benchmark results +- Does not add signing, key management, or deployment trust policy + +## Verification + + python3 tools/verify_index_authority.py examples/caif/trec-covid-sign-rq2.index-authority.json + +Expected output includes: + + decision: ALLOW_INDEX_FIRST + quality_within_bootstrap_noise: true + storage_reduction: 10.6667x + single_query_speedup: 105.6604x + +## Scope + +The example uses existing public README benchmark values and preserves the stated limitations around dataset, encoder, corpus size, batch/threading regime, HNSW comparison, and larger-corpus claims. + +## Framing + +Benchmarks should not only report performance. 
+ +They should authorize compute paths within a defined evidence envelope. diff --git a/docs/INDEX_AUTHORITY_RECEIPTS.md b/docs/INDEX_AUTHORITY_RECEIPTS.md new file mode 100644 index 0000000..9b745e8 --- /dev/null +++ b/docs/INDEX_AUTHORITY_RECEIPTS.md @@ -0,0 +1,48 @@ +# Index Authority Receipts for ordvec + +Index Authority Receipts are CAIF-style evidence packets for ordvec benchmark results. + +They make index-first retrieval evidence machine-readable. + +Instead of only asking whether a retrieval mode is faster, a receipt asks whether the benchmark evidence supports using a compressed/index-first retrieval path within a stated workload scope. + +## IFC + +Index-First Compute means a cheaper index representation is evaluated before more expensive dense compute. + +For ordvec, IFC can include RankQuant compressed scan, Bitmap candidate generation, SignBitmap candidate generation, or SignBitmap to RankQuant rerank. + +## CAIF + +Compute Authority Index Format describes whether a compute path is justified under a stated evidence envelope. + +A receipt records baseline mode, candidate mode, quality delta, storage reduction, latency profile, scope, limitations, and fallback conditions. The receipt file does not store a hash; the verifier derives a deterministic receipt hash from the receipt's contents and prints it. + +## Verify + +Run: + + python3 tools/verify_index_authority.py examples/caif/trec-covid-sign-rq2.index-authority.json + +Expected output (a final `receipt_hash: sha256:<digest>` line follows): + + decision: ALLOW_INDEX_FIRST + mode: sign_to_rq2 + baseline: flat_exact + quality_within_bootstrap_noise: true + storage_reduction: 10.6667x + single_query_speedup: 105.6604x + +## Non-goals + +This does not change Rust code, Cargo.toml, CI, runtime behavior, signing, key management, or deployment trust policy. + +It does not create new benchmark claims. + +It preserves the stated benchmark scope and limitations. + +## Principle + +Benchmarks should not only report performance. + +They should authorize compute paths within a defined evidence envelope. 
diff --git a/examples/caif/trec-covid-sign-rq2.index-authority.json b/examples/caif/trec-covid-sign-rq2.index-authority.json new file mode 100644 index 0000000..a4ff5e1 --- /dev/null +++ b/examples/caif/trec-covid-sign-rq2.index-authority.json @@ -0,0 +1,82 @@ +{ + "schema": "ordvec.index_authority.v0.1", + "subject": { + "project": "ordvec", + "mode": "sign_to_rq2", + "version": "0.5.0" + }, + "baseline": { + "mode": "flat_exact", + "bytes_per_vector": 4096 + }, + "ifc": { + "enabled": true, + "compute_path": [ + "sign_bitmap_candidate_generation", + "rankquant_b2_rerank" + ], + "training_required": false, + "fit_required": false, + "graph_required": false, + "float_corpus_required_for_reported_path": false + }, + "evidence": { + "dataset": "trec-covid", + "dataset_family": "BEIR", + "encoder": "Harrier-Q8 1024-d", + "corpus_size": 171332, + "metric": "nDCG@10", + "baseline_score": 0.7574, + "candidate_score": 0.7638, + "delta_vs_baseline": 0.0064, + "within_bootstrap_noise": true, + "evidence_source": "repository README benchmark table" + }, + "economics": { + "candidate_bytes_per_vector": 384, + "storage_reduction_x": 10.6667, + "single_query_latency_ms": { + "baseline": 56.0, + "candidate": 0.53 + }, + "single_query_speedup_x": 105.6604 + }, + "decision": { + "recommended": "ALLOW_INDEX_FIRST", + "policy": { + "min_storage_reduction_x": 8.0, + "min_single_query_speedup_x": 10.0, + "require_quality_within_bootstrap_noise": true, + "require_scope": true, + "require_limitations": true + }, + "fallback": [ + "Use dense flat or ANN comparison when dataset, encoder, scale, or serving regime falls outside the stated evidence scope.", + "Require HNSW comparison for highly parallel threaded serving claims.", + "Require checked-in artifacts before extending the claim to larger corpora or alternate encoders." 
+    ]
+  },
+  "scope": {
+    "claim_status": "public_repository_evidence",
+    "applies_to": [
+      "BEIR trec-covid",
+      "Harrier-Q8 1024-d embeddings",
+      "171332 document public benchmark run",
+      "single-query latency comparison against exact flat"
+    ],
+    "does_not_claim": [
+      "million-scale HNSW crossover",
+      "GPU bandwidth claims",
+      "alternate-encoder generalization",
+      "all serving regimes",
+      "dominance over HNSW in highly parallel threaded throughput"
+    ]
+  },
+  "limitations": [
+    "The compressed scan remains O(n), with a lower constant than dense flat.",
+    "HNSW wins the committed highly parallel threaded view.",
+    "The claim is scoped to the stated dataset, encoder, corpus size, and benchmark artifact.",
+    "Larger-corpus and alternate-encoder claims require checked-in run artifacts.",
+    "This receipt does not sign artifacts or manage deployment trust policy."
+  ]
+}
diff --git a/tools/verify_index_authority.py b/tools/verify_index_authority.py
new file mode 100755
index 0000000..bc5dc3c
--- /dev/null
+++ b/tools/verify_index_authority.py
@@ -0,0 +1,144 @@
+#!/usr/bin/env python3
+"""Verify an ordvec Index Authority Receipt.
+
+Recomputes the derived figures recorded in a receipt, re-evaluates the receipt's
+decision policy, prints a summary with a deterministic receipt hash, and fails
+when the declared decision differs from the computed one.
+
+Exit codes: 0 verified, 2 unreadable or inconsistent receipt, 3 decision mismatch.
+"""
+import argparse
+import hashlib
+import json
+import math
+import sys
+from pathlib import Path
+
+SCHEMA = "ordvec.index_authority.v0.1"
+
+# Keys the verifier reads, grouped by the dotted path of the object holding them.
+# Parents are listed before their children so each parent is validated first.
+REQUIRED_FIELDS = {
+    "": ["schema", "subject", "baseline", "ifc", "evidence", "economics", "decision", "scope", "limitations"],
+    "subject": ["mode"],
+    "baseline": ["mode", "bytes_per_vector"],
+    "evidence": ["baseline_score", "candidate_score", "delta_vs_baseline", "within_bootstrap_noise"],
+    "economics": ["candidate_bytes_per_vector", "storage_reduction_x", "single_query_latency_ms", "single_query_speedup_x"],
+    "economics.single_query_latency_ms": ["baseline", "candidate"],
+    "decision": ["recommended", "policy"],
+    "decision.policy": [
+        "min_storage_reduction_x",
+        "min_single_query_speedup_x",
+        "require_quality_within_bootstrap_noise",
+        "require_scope",
+        "require_limitations",
+    ],
+    "scope": ["applies_to", "does_not_claim"],
+}
+
+def die(msg, code=2):
+    """Print ``msg`` to stderr and exit with status ``code``."""
+    print("ERROR:", msg, file=sys.stderr)
+    raise SystemExit(code)
+
+def sha(obj):
+    """Return ``sha256:<hex>`` of ``obj`` serialized as compact, key-sorted JSON."""
+    b = json.dumps(obj, sort_keys=True, separators=(",", ":")).encode()
+    return "sha256:" + hashlib.sha256(b).hexdigest()
+
+def check_structure(r):
+    """Exit with a clear error unless every field the verifier reads is present."""
+    if not isinstance(r, dict):
+        die("receipt must be a JSON object")
+    for path, keys in REQUIRED_FIELDS.items():
+        node = r
+        for part in filter(None, path.split(".")):
+            node = node[part]
+        if not isinstance(node, dict):
+            die(f"field {path} must be an object")
+        prefix = f"{path}." if path else ""
+        for k in keys:
+            if k not in node:
+                die(f"missing field {prefix}{k}")
+
+def number(value, name, positive=False):
+    """Return ``value`` if it is a finite number (greater than zero when ``positive``), else exit."""
+    if isinstance(value, bool) or not isinstance(value, (int, float)):
+        die(f"{name} must be a number")
+    if isinstance(value, float) and not math.isfinite(value):
+        die(f"{name} must be finite")
+    if positive and value <= 0:
+        die(f"{name} must be greater than zero")
+    return value
+
+def flag(value, name):
+    """Return ``value`` if it is a JSON boolean, else exit."""
+    if not isinstance(value, bool):
+        die(f"{name} must be true or false")
+    return value
+
+def main():
+    """Verify the receipt named on the command line and print its summary."""
+    ap = argparse.ArgumentParser(description="Verify an ordvec Index Authority Receipt.")
+    ap.add_argument("receipt", type=Path)
+    args = ap.parse_args()
+
+    try:
+        r = json.loads(args.receipt.read_text(encoding="utf-8"))
+    except (OSError, ValueError) as e:
+        # ValueError covers both malformed JSON and undecodable bytes.
+        die(f"cannot read receipt: {e}")
+
+    check_structure(r)
+    if r["schema"] != SCHEMA:
+        die("bad schema")
+
+    e = r["evidence"]
+    econ = r["economics"]
+    base = r["baseline"]
+    policy = r["decision"]["policy"]
+    latency = econ["single_query_latency_ms"]
+
+    expected_delta = number(e["candidate_score"], "evidence.candidate_score") - number(e["baseline_score"], "evidence.baseline_score")
+    if abs(number(e["delta_vs_baseline"], "evidence.delta_vs_baseline") - expected_delta) > 0.0001:
+        die("delta_vs_baseline mismatch")
+
+    # Divisors must be positive so a zero yields a clean error, not ZeroDivisionError.
+    storage_x = number(econ["storage_reduction_x"], "economics.storage_reduction_x")
+    baseline_bytes = number(base["bytes_per_vector"], "baseline.bytes_per_vector")
+    candidate_bytes = number(econ["candidate_bytes_per_vector"], "economics.candidate_bytes_per_vector", positive=True)
+    if abs(storage_x - baseline_bytes / candidate_bytes) > 0.02:
+        die("storage_reduction_x mismatch")
+
+    speedup_x = number(econ["single_query_speedup_x"], "economics.single_query_speedup_x")
+    baseline_ms = number(latency["baseline"], "economics.single_query_latency_ms.baseline")
+    candidate_ms = number(latency["candidate"], "economics.single_query_latency_ms.candidate", positive=True)
+    if abs(speedup_x - baseline_ms / candidate_ms) > 0.02:
+        die("single_query_speedup_x mismatch")
+
+    within_noise = flag(e["within_bootstrap_noise"], "evidence.within_bootstrap_noise")
+    decision = "ALLOW_INDEX_FIRST"
+    if flag(policy["require_quality_within_bootstrap_noise"], "decision.policy.require_quality_within_bootstrap_noise") and not within_noise:
+        decision = "REQUIRE_DENSE_FALLBACK"
+    if storage_x < number(policy["min_storage_reduction_x"], "decision.policy.min_storage_reduction_x"):
+        decision = "REQUIRE_DENSE_FALLBACK"
+    if speedup_x < number(policy["min_single_query_speedup_x"], "decision.policy.min_single_query_speedup_x"):
+        decision = "REQUIRE_DENSE_FALLBACK"
+    if flag(policy["require_scope"], "decision.policy.require_scope") and (not r["scope"]["applies_to"] or not r["scope"]["does_not_claim"]):
+        decision = "DENY_UNSCOPED_CLAIM"
+    if flag(policy["require_limitations"], "decision.policy.require_limitations") and not r["limitations"]:
+        decision = "DENY_UNSCOPED_CLAIM"
+
+    print(f"decision: {decision}")
+    print(f"mode: {r['subject']['mode']}")
+    print(f"baseline: {base['mode']}")
+    print(f"quality_within_bootstrap_noise: {str(within_noise).lower()}")
+    print(f"storage_reduction: {storage_x}x")
+    print(f"single_query_speedup: {speedup_x}x")
+    print(f"receipt_hash: {sha(r)}")
+
+    if decision != r["decision"]["recommended"]:
+        die(f"declared decision {r['decision']['recommended']} does not match computed decision {decision}", 3)
+
+if __name__ == "__main__":
+    main()