diff --git a/.gitignore b/.gitignore index d342254b..6d7da4c6 100644 --- a/.gitignore +++ b/.gitignore @@ -39,3 +39,8 @@ venv/ .DS_Store .idea/ .vscode/ + +# BEIR benchmark harness — embedding cache and result files. +/.cache/ordvec-beir/ +/results/beir/* +!/results/beir/.gitkeep diff --git a/CHANGELOG.md b/CHANGELOG.md index bd5c1cc8..683b5b54 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,21 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## Unreleased +### Added + +- **Reproducible BEIR benchmark harness** (`make benchmark-beir`; dev-only, + excluded from the published crate). All latency is measured in a single Rust + process (`benchmarks/beir-bench`) — ordvec's rank/sign methods against an exact + inner-product baseline (`flat`, identical retrieval to FAISS `IndexFlatIP`, via + a pure-Rust SIMD GEMM) and a pure-Rust HNSW (`hnsw_rs`, M=32) — so the + comparison is apples-to-apples (same machine, batch, thread count, no + Python/FFI in the hot path). Covers single-query / batched / 32-thread regimes + and a corpus-size scaling sweep on public BEIR datasets, with the corpus + embedded by Harrier-Q8 (GGUF `Q8_0` via `llama-cpp-python`, CUDA). The README + now leads with the resulting scaling curve, latency bars, and nDCG@10 table; + every figure is regenerated by the harness (nothing hand-entered). Replaces the + previous private-arXiv real-embedding numbers in the README. + ### Performance - **AVX-512 VPOPCNTDQ scan kernels now cover every `dim` (a multiple of 64), not diff --git a/Cargo.lock b/Cargo.lock index 235bcd8a..19acee93 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,6 +2,21 @@ # It is not intended for manual editing. 
version = 4 +[[package]] +name = "aho-corasick" +version = "1.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301" +dependencies = [ + "memchr", +] + +[[package]] +name = "allocator-api2" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923" + [[package]] name = "android_system_properties" version = "0.1.5" @@ -11,6 +26,23 @@ dependencies = [ "libc", ] +[[package]] +name = "anndists" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a8396b473aa0bceed68fb32462505387ea39fa47c7029417e0a49f10592b036" +dependencies = [ + "anyhow", + "cfg-if", + "cpu-time", + "env_logger", + "lazy_static", + "log", + "num-traits", + "num_cpus", + "rayon", +] + [[package]] name = "anstream" version = "1.0.0" @@ -73,6 +105,27 @@ version = "1.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f2032f911046de80f0a198e0901378627c33f59ea0ac00e363d481118bd70a53" +[[package]] +name = "beir-bench" +version = "0.0.0" +dependencies = [ + "hnsw_rs", + "matrixmultiply", + "ordvec", + "rayon", + "serde_json", + "sha2", +] + +[[package]] +name = "bincode" +version = "1.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1f45e9417d87227c7a56d22e471c6206462cba514c7590c09aff4cf6d1ddcad" +dependencies = [ + "serde", +] + [[package]] name = "bitflags" version = "2.11.1" @@ -94,6 +147,18 @@ version = "3.20.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "72f5acc6cb2ba439de613abc23857ec3d78374d8ed5ac84e9d11336e87da8649" +[[package]] +name = "byteorder" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" + +[[package]] +name = "bytes" +version = "1.11.1" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e748733b7cbc798e1434b6ac524f0c1ff2ab456fe201501e6497c8417a4fc33" + [[package]] name = "cc" version = "1.2.62" @@ -110,6 +175,12 @@ version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" +[[package]] +name = "cfg_aliases" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724" + [[package]] name = "chacha20" version = "0.10.0" @@ -118,7 +189,7 @@ checksum = "6f8d983286843e49675a4b7a2d174efe136dc93a18d69130dd18198a6c167601" dependencies = [ "cfg-if", "cpufeatures", - "rand_core", + "rand_core 0.10.1", ] [[package]] @@ -178,6 +249,16 @@ version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1d07550c9036bf2ae0c684c4297d503f838287c83c53686d05370d0e139ae570" +[[package]] +name = "combine" +version = "4.6.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba5a308b75df32fe02788e748662718f03fde005016435c444eea572398219fd" +dependencies = [ + "bytes", + "memchr", +] + [[package]] name = "const-oid" version = "0.10.2" @@ -190,6 +271,16 @@ version = "0.8.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" +[[package]] +name = "cpu-time" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e9e393a7668fe1fad3075085b86c781883000b4ede868f43627b34a87c8b7ded" +dependencies = [ + "libc", + "winapi", +] + [[package]] name = "cpufeatures" version = "0.3.0" @@ -250,6 +341,41 @@ version = "1.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "91622ff5e7162018101f2fea40d6ebf4a78bbe5a49736a2020649edf9693679e" +[[package]] +name = "enum-as-inner" +version = "0.6.1" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1e6a265c649f3f5979b601d26f1d05ada116434c87741c9493cb56218f76cbc" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "env_filter" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32e90c2accc4b07a8456ea0debdc2e7587bdd890680d71173a15d4ae604f6eef" +dependencies = [ + "log", + "regex", +] + +[[package]] +name = "env_logger" +version = "0.11.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0621c04f2196ac3f488dd583365b9c09be011a4ab8b9f37248ffcc8f6198b56a" +dependencies = [ + "anstream", + "anstyle", + "env_filter", + "jiff", + "log", +] + [[package]] name = "equivalent" version = "1.0.2" @@ -326,6 +452,18 @@ dependencies = [ "slab", ] +[[package]] +name = "getrandom" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd" +dependencies = [ + "cfg-if", + "libc", + "r-efi 5.3.0", + "wasip2", +] + [[package]] name = "getrandom" version = "0.4.2" @@ -334,8 +472,8 @@ checksum = "0de51e6874e94e7bf76d726fc5d13ba782deca734ff60d5bb2fb2607c7406555" dependencies = [ "cfg-if", "libc", - "r-efi", - "rand_core", + "r-efi 6.0.0", + "rand_core 0.10.1", "wasip2", "wasip3", ] @@ -346,6 +484,8 @@ version = "0.15.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1" dependencies = [ + "allocator-api2", + "equivalent", "foldhash 0.1.5", ] @@ -379,12 +519,43 @@ version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" +[[package]] +name = "hermit-abi" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"fc0fef456e4baa96da950455cd02c081ca953b141298e41db3fc7e36b1da849c" + [[package]] name = "hex" version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" +[[package]] +name = "hnsw_rs" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43a5258f079b97bf2e8311ff9579e903c899dcbac0d9a138d62e9a066778bd07" +dependencies = [ + "anndists", + "anyhow", + "bincode", + "cfg-if", + "cpu-time", + "env_logger", + "hashbrown 0.15.5", + "indexmap", + "lazy_static", + "log", + "mmap-rs", + "num-traits", + "num_cpus", + "parking_lot", + "rand 0.9.4", + "rayon", + "serde", +] + [[package]] name = "hybrid-array" version = "0.4.12" @@ -448,6 +619,30 @@ version = "1.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682" +[[package]] +name = "jiff" +version = "0.2.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4603d3033e49e2b0e31229fcab20a5d40089c607d975cd9c80551dc69eed9102" +dependencies = [ + "jiff-static", + "log", + "portable-atomic", + "portable-atomic-util", + "serde_core", +] + +[[package]] +name = "jiff-static" +version = "0.2.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "782d32378dddf207193ac91cefb848ad41abb58195c95168e1291227a0832b47" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "js-sys" version = "0.3.99" @@ -460,6 +655,12 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "lazy_static" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" + [[package]] name = "leb128fmt" version = "0.1.0" @@ -489,12 +690,30 @@ version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"32a66949e030da00e8c7d4434b251670a91556f4144941d37452769c25d58a53" +[[package]] +name = "lock_api" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "224399e74b87b5f3557511d98dff8b14089b3dadafcab6bb93eab67d3aace965" +dependencies = [ + "scopeguard", +] + [[package]] name = "log" version = "0.4.30" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "616ec5685824bcc94416c6d4a7a446eea774a31efd7062c8480ba6fd06d7a6e5" +[[package]] +name = "mach2" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d640282b302c0bb0a2a8e0233ead9035e3bed871f0b7e81fe4a1ec829765db44" +dependencies = [ + "libc", +] + [[package]] name = "matrixmultiply" version = "0.3.10" @@ -511,6 +730,23 @@ version = "2.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79" +[[package]] +name = "mmap-rs" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ecce9d566cb9234ae3db9e249c8b55665feaaf32b0859ff1e27e310d2beb3d8" +dependencies = [ + "bitflags", + "combine", + "libc", + "mach2", + "nix", + "sysctl", + "thiserror 2.0.18", + "widestring", + "windows", +] + [[package]] name = "ndarray" version = "0.17.2" @@ -526,6 +762,18 @@ dependencies = [ "rawpointer", ] +[[package]] +name = "nix" +version = "0.30.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "74523f3a35e05aba87a1d978330aef40f67b0304ac79c1c00b294c9830543db6" +dependencies = [ + "bitflags", + "cfg-if", + "cfg_aliases", + "libc", +] + [[package]] name = "num-complex" version = "0.4.6" @@ -553,6 +801,16 @@ dependencies = [ "autocfg", ] +[[package]] +name = "num_cpus" +version = "1.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91df4bbde75afed763b708b7eee1e8e7651e02d97f6d5dd763e89367e957b23b" +dependencies = [ + "hermit-abi", + 
"libc", +] + [[package]] name = "numpy" version = "0.29.0" @@ -585,8 +843,8 @@ checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe" name = "ordvec" version = "0.5.0" dependencies = [ - "rand", - "rand_chacha", + "rand 0.10.1", + "rand_chacha 0.10.0", "rayon", ] @@ -632,6 +890,29 @@ dependencies = [ "pyo3", ] +[[package]] +name = "parking_lot" +version = "0.12.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93857453250e3077bd71ff98b6a65ea6621a19bb0f559a85248955ac12c45a1a" +dependencies = [ + "lock_api", + "parking_lot_core", +] + +[[package]] +name = "parking_lot_core" +version = "0.9.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2621685985a2ebf1c516881c026032ac7deafcda1a2c9b7850dc81e3dfcb64c1" +dependencies = [ + "cfg-if", + "libc", + "redox_syscall", + "smallvec", + "windows-link", +] + [[package]] name = "pin-project-lite" version = "0.2.17" @@ -753,12 +1034,28 @@ dependencies = [ "proc-macro2", ] +[[package]] +name = "r-efi" +version = "5.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" + [[package]] name = "r-efi" version = "6.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f8dcc9c7d52a811697d2151c701e0d08956f92b0e24136cf4cf27b57a6a0d9bf" +[[package]] +name = "rand" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "44c5af06bb1b7d3216d91932aed5265164bf384dc89cd6ba05cf59a35f5f76ea" +dependencies = [ + "rand_chacha 0.9.0", + "rand_core 0.9.5", +] + [[package]] name = "rand" version = "0.10.1" @@ -766,8 +1063,18 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d2e8e8bcc7961af1fdac401278c6a831614941f6164ee3bf4ce61b7edb162207" dependencies = [ "chacha20", - "getrandom", - "rand_core", + "getrandom 0.4.2", + "rand_core 0.10.1", +] + +[[package]] +name = 
"rand_chacha" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb" +dependencies = [ + "ppv-lite86", + "rand_core 0.9.5", ] [[package]] @@ -777,7 +1084,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3e6af7f3e25ded52c41df4e0b1af2d047e45896c2f3281792ed68a1c243daedb" dependencies = [ "ppv-lite86", - "rand_core", + "rand_core 0.10.1", +] + +[[package]] +name = "rand_core" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76afc826de14238e6e8c374ddcc1fa19e374fd8dd986b0d2af0d02377261d83c" +dependencies = [ + "getrandom 0.3.4", ] [[package]] @@ -812,6 +1128,44 @@ dependencies = [ "crossbeam-utils", ] +[[package]] +name = "redox_syscall" +version = "0.5.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed2bf2547551a7053d6fdfafda3f938979645c44812fbfcda098faae3f1a362d" +dependencies = [ + "bitflags", +] + +[[package]] +name = "regex" +version = "1.12.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1292b7759ae1cb9ec195452d1390a074f0cd8541ab7a5a8c31cd6db45d4a6ba" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e1dd4122fc1595e8162618945476892eefca7b88c52820e74af6262213cae8f" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.8.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d6f6ff9a378485b298a5286656da665ba74413d36db0979633275d2e708145d4" + [[package]] name = "rsqlite-vfs" version = "0.1.1" @@ -819,7 +1173,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"c51c9ae4df8a7fba42103df5c621fa3c37eccf3a3c650879e90fc48b11cc192c" dependencies = [ "hashbrown 0.16.1", - "thiserror", + "thiserror 2.0.18", ] [[package]] @@ -862,6 +1216,21 @@ version = "1.0.22" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" +[[package]] +name = "same-file" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +dependencies = [ + "winapi-util", +] + +[[package]] +name = "scopeguard" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" + [[package]] name = "semver" version = "1.0.28" @@ -969,6 +1338,20 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "sysctl" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "01198a2debb237c62b6826ec7081082d951f46dbb64b0e8c7649a452230d1dfc" +dependencies = [ + "bitflags", + "byteorder", + "enum-as-inner", + "libc", + "thiserror 1.0.69", + "walkdir", +] + [[package]] name = "target-lexicon" version = "0.13.5" @@ -982,19 +1365,39 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32497e9a4c7b38532efcdebeef879707aa9f794296a4f0244f6f69e9bc8574bd" dependencies = [ "fastrand", - "getrandom", + "getrandom 0.4.2", "once_cell", "rustix", "windows-sys", ] +[[package]] +name = "thiserror" +version = "1.0.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52" +dependencies = [ + "thiserror-impl 1.0.69", +] + [[package]] name = "thiserror" version = "2.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4288b5bcbc7920c07a1149a35cf9590a2aa808e0bc1eafaade0b80947865fbc4" dependencies = [ - "thiserror-impl", + 
"thiserror-impl 2.0.18", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" +dependencies = [ + "proc-macro2", + "quote", + "syn", ] [[package]] @@ -1038,7 +1441,7 @@ version = "1.23.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d258b83ceec21034727ecee8c382cfa6c3e133699b0742c64571814fb420c9f7" dependencies = [ - "getrandom", + "getrandom 0.4.2", "js-sys", "wasm-bindgen", ] @@ -1049,6 +1452,16 @@ version = "0.2.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" +[[package]] +name = "walkdir" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b" +dependencies = [ + "same-file", + "winapi-util", +] + [[package]] name = "wasip2" version = "1.0.3+wasi-0.2.9" @@ -1146,6 +1559,52 @@ dependencies = [ "semver", ] +[[package]] +name = "widestring" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72069c3113ab32ab29e5584db3c6ec55d416895e60715417b5b883a357c3e471" + +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-util" +version = "0.1.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22" +dependencies = [ + "windows-sys", +] + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + +[[package]] +name = "windows" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e686886bc078bc1b0b600cac0147aadb815089b6e4da64016cbd754b6342700f" +dependencies = [ + "windows-targets", +] + [[package]] name = "windows-core" version = "0.62.2" @@ -1214,6 +1673,63 @@ dependencies = [ "windows-link", ] +[[package]] +name = "windows-targets" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c" +dependencies = [ + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" + +[[package]] +name = "windows_i686_gnu" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" + +[[package]] +name = "windows_i686_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.48.5" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" + [[package]] name = "wit-bindgen" version = "0.51.0" diff --git a/Cargo.toml b/Cargo.toml index 3a63ad89..43111e50 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,6 +21,11 @@ exclude = [ ".github/", ".gitignore", ".playwright-mcp/", + # The BEIR harness + figures + its bench crate are dev tooling — not shipped + # in the published crate. (benchmarks/rank_modes_results.txt stays IN the + # package: the README links it and the release-publish invariant requires it.) + "benchmarks/beir/", + "benchmarks/beir-bench/", "CLAUDE.md", "CODE_OF_CONDUCT.md", "CONTRIBUTING.md", @@ -92,7 +97,7 @@ opt-level = 3 # `Cargo.lock` carries their transitive dependencies. [workspace] resolver = "2" -members = ["ordvec-python", "ordvec-ffi", "ordvec-manifest", "ordvec-manifest-python"] +members = ["ordvec-python", "ordvec-ffi", "ordvec-manifest", "ordvec-manifest-python", "benchmarks/beir-bench"] default-members = ["."] # fuzz/ is a cargo-fuzz crate built only via `cargo +nightly fuzz`. 
Keep it out of # the workspace so it stays a standalone crate (its own Cargo.lock) and `cargo fuzz` diff --git a/Makefile b/Makefile new file mode 100644 index 00000000..4b2e1f22 --- /dev/null +++ b/Makefile @@ -0,0 +1,194 @@ +# ordvec-beir benchmark harness +# +# Reproduces, on a fresh CUDA machine, the ordvec retrieval story on standard +# BEIR datasets: +# * quality — nDCG@10 vs the official BEIR qrels, ordvec vs an exact +# full-float baseline (`flat`, == FAISS IndexFlatIP math). +# * scaling — speedup-vs-corpus-size: brute force is O(n), ordvec sign/rank +# candidate-gen is near-flat in n, so the gap widens with scale. +# * graphics — three README figures (scaling curve + single-thread & threaded +# latency bars). +# +# ALL latency is measured in ONE Rust process (`beir-bench`): ordvec vs an exact +# inner-product baseline vs a pure-Rust HNSW — same machine, batch, and thread +# count, no Python/FFI boundary. Python only embeds (GGUF Q8 via llama-cpp-python), +# scores nDCG, and renders the figures. +# +# Usage: +# make bench-beir-setup # install Python deps + CUDA llama-cpp-python +# make benchmark-beir-smoke # quick end-to-end sanity (scifact only) +# make benchmark-beir # full: quality + scaling + graphics + +# ── interpreter ────────────────────────────────────────────────────────────── +PY ?= python3 + +# ── paths ───────────────────────────────────────────────────────────────────── +CACHE_DIR := .cache/ordvec-beir +RESULTS_DIR := results/beir +FIG_DIR := $(RESULTS_DIR)/figures + +# ── datasets ────────────────────────────────────────────────────────────────── +# Quality (nDCG) datasets. PERF_DATASET drives the scaling curve + latency bars +# and must be large enough for the curve to bend (trec-covid ≈ 171K docs). +QUALITY_DATASETS := scifact +PERF_DATASET := trec-covid +SPLIT := test + +# Smoke overrides (scifact is small + already cheap to embed). 
+SMOKE_QUALITY := scifact +SMOKE_PERF_DATASET := scifact +SMOKE_SCALE_SIZES := 500 1000 2000 + +# ── retrieval parameters ───────────────────────────────────────────────────── +TOPK := 100 +K_VALUES := 10 100 +BATCH := 32 +CANDIDATES := 500 +SEED := 1 +NPROC := $(shell nproc 2>/dev/null || echo 8) +# Batch regimes for the graphics: the scaling curve + single-thread bar use +# single-query (batch=1) — the latency-sensitive deployment where flat is +# memory-bound and ordvec wins ~100×; the threaded bar uses a batched throughput +# regime where flat amortizes its corpus stream across the batch. +SCALE_BATCH := 1 +MULTI_BATCH := 32 + +# Corpus-size ladder for the scaling sweep (clamped to the real corpus size by +# the bench). Full-corpus points are added by the dedicated full runs. +SCALE_SIZES := 1000 3000 10000 30000 100000 170000 + +# ── methods (all measured in the single Rust process) ───────────────────────── +# flat exact inner product (== FAISS IndexFlatIP math), 4096 B/vec +# hnsw pure-Rust HNSW M=32 (Malkov–Yashunin), 4096 B/vec +# rq2/rq4 ordvec RankQuant b=2 / b=4 (256 / 512 B/vec) +# bitmap-rq2 ordvec Bitmap → RankQuant b=2 (two-stage) +# sign-rq2 ordvec SignBitmap → RankQuant b=2 (two-stage) +BENCH_METHODS := flat,hnsw,rq2,rq4,bitmap-rq2,sign-rq2 + +# ── encoder (canonical: GGUF Q8_0 via llama-cpp-python / CUDA) ──────────────── +HARRIER_GGUF_REPO := mradermacher/harrier-oss-v1-0.6b-GGUF +GGUF_FILE := *Q8_0.gguf +N_GPU_LAYERS := -1 +N_CTX := 2048 +ENCODE_BATCH := 16 +# CUDA build flags for llama-cpp-python (override LLAMA_CMAKE_ARGS= for CPU). 
+LLAMA_CMAKE_ARGS := -DGGML_CUDA=on + +# ── phony ───────────────────────────────────────────────────────────────────── +.PHONY: benchmark-beir benchmark-beir-smoke bench-beir-setup bench-beir-build \ + bench-beir-guardrail bench-beir-quality bench-beir-scaling \ + bench-beir-plot bench-beir-clean bench-beir-clean-cache + +# The pipeline is strictly sequential (prepare writes the cache the bench reads; +# eval/plot read run files). Steps are unordered prerequisites, so under a +# parallel make (-j, or an inherited MAKEFLAGS=-jN) they would race on a +# half-written cache. Force serial execution regardless. +.NOTPARALLEL: + +# ── top-level targets ───────────────────────────────────────────────────────── + +## Full run: quality (nDCG) + scaling sweep + three README graphics. +benchmark-beir: bench-beir-guardrail bench-beir-quality bench-beir-scaling bench-beir-plot + +## Quick end-to-end sanity: everything on scifact, tiny scaling ladder. +benchmark-beir-smoke: + $(MAKE) bench-beir-guardrail + $(MAKE) bench-beir-quality QUALITY_DATASETS="$(SMOKE_QUALITY)" + $(MAKE) bench-beir-scaling PERF_DATASET=$(SMOKE_PERF_DATASET) SCALE_SIZES="$(SMOKE_SCALE_SIZES)" + $(MAKE) bench-beir-plot PERF_DATASET=$(SMOKE_PERF_DATASET) + +# ── setup ───────────────────────────────────────────────────────────────────── + +## Install Python deps (core wheels) + CUDA llama-cpp-python. The latter is built +## against the host CUDA toolkit; --no-cache-dir + --force-reinstall defeat pip's +## wheel cache (it ignores CMAKE_ARGS and would hand back a stale CPU build). +## CPU-only box: make bench-beir-setup LLAMA_CMAKE_ARGS= +bench-beir-setup: + $(PY) -m pip install -r benchmarks/beir/requirements.txt + CMAKE_ARGS="$(LLAMA_CMAKE_ARGS)" $(PY) -m pip install \ + --upgrade --force-reinstall --no-cache-dir llama-cpp-python + +## Build the all-Rust comparison harness (release). 
+bench-beir-build: + cargo build --release -p beir-bench + +# ── guardrail ───────────────────────────────────────────────────────────────── + +## Fail loudly if any harness *.py imports the ordvec Python package directly — +## the benchmark hot path is the Rust crate, not the Python bindings. +bench-beir-guardrail: + @if grep -rnE "^[[:space:]]*(import ordvec|from ordvec)\b" benchmarks/beir --include='*.py' 2>/dev/null; then \ + echo "ERROR: a benchmarks/beir/*.py file imports the ordvec Python package."; \ + exit 1; \ + fi + @echo "guardrail OK: no 'import ordvec' in benchmarks/beir/*.py" + +# ── quality: nDCG@10 vs qrels (ordvec vs exact flat) ────────────────────────── + +## Embed → run all methods (single-thread, full corpus) → score nDCG, per dataset. +bench-beir-quality: bench-beir-build + @for d in $(QUALITY_DATASETS); do \ + echo "=== quality: $$d ==="; \ + $(PY) benchmarks/beir/beir_prepare.py --datasets $$d --split $(SPLIT) \ + --provider llamacpp --model "$(HARRIER_GGUF_REPO)" --gguf-file "$(GGUF_FILE)" \ + --n-gpu-layers $(N_GPU_LAYERS) --n-ctx $(N_CTX) --batch-size $(ENCODE_BATCH) \ + --cache-dir "$(CACHE_DIR)" --seed $(SEED) || exit 1; \ + $(CURDIR)/target/release/beir-bench --cache-dir "$(CACHE_DIR)" --dataset $$d \ + --split $(SPLIT) --top-k $(TOPK) --batch $(BATCH) --candidates $(CANDIDATES) \ + --threads 1 --methods $(BENCH_METHODS) --out-dir "$(RESULTS_DIR)" || exit 1; \ + $(PY) benchmarks/beir/beir_eval.py --datasets $$d --split $(SPLIT) \ + --cache-dir "$(CACHE_DIR)" --runs-dir "$(RESULTS_DIR)" --k-values $(K_VALUES) \ + --baseline flat --bootstrap-iters 1000 --seed $(SEED) --out-dir "$(RESULTS_DIR)" || exit 1; \ + done + +# ── scaling: speedup-vs-corpus-size + single/threaded full-corpus points ─────── + +## Sweep the perf dataset over a corpus-size ladder (single-thread), then full +## corpus at 1 thread and at $(NPROC) threads. All append to timing.jsonl. 
+bench-beir-scaling: bench-beir-build + @echo "=== scaling: $(PERF_DATASET) (sizes: $(SCALE_SIZES); threaded full = $(NPROC)t) ===" + $(PY) benchmarks/beir/beir_prepare.py --datasets $(PERF_DATASET) --split $(SPLIT) \ + --provider llamacpp --model "$(HARRIER_GGUF_REPO)" --gguf-file "$(GGUF_FILE)" \ + --n-gpu-layers $(N_GPU_LAYERS) --n-ctx $(N_CTX) --batch-size $(ENCODE_BATCH) \ + --cache-dir "$(CACHE_DIR)" --seed $(SEED) + rm -f "$(RESULTS_DIR)/$(PERF_DATASET)/timing.jsonl" + @for n in $(SCALE_SIZES); do \ + echo " -- n=$$n (1 thread, single-query batch=$(SCALE_BATCH)) --"; \ + $(CURDIR)/target/release/beir-bench --cache-dir "$(CACHE_DIR)" --dataset $(PERF_DATASET) \ + --split $(SPLIT) --top-k $(TOPK) --batch $(SCALE_BATCH) --candidates $(CANDIDATES) \ + --threads 1 --max-docs $$n --methods $(BENCH_METHODS) --out-dir "$(RESULTS_DIR)" || exit 1; \ + done + @echo " -- full corpus (1 thread, single-query batch=$(SCALE_BATCH); writes topk + nDCG inputs) --" + $(CURDIR)/target/release/beir-bench --cache-dir "$(CACHE_DIR)" --dataset $(PERF_DATASET) \ + --split $(SPLIT) --top-k $(TOPK) --batch $(SCALE_BATCH) --candidates $(CANDIDATES) \ + --threads 1 --methods $(BENCH_METHODS) --out-dir "$(RESULTS_DIR)" + @echo " -- full corpus ($(NPROC) threads, batched batch=$(MULTI_BATCH)) --" + $(CURDIR)/target/release/beir-bench --cache-dir "$(CACHE_DIR)" --dataset $(PERF_DATASET) \ + --split $(SPLIT) --top-k $(TOPK) --batch $(MULTI_BATCH) --candidates $(CANDIDATES) \ + --threads $(NPROC) --methods $(BENCH_METHODS) --out-dir "$(RESULTS_DIR)" + $(PY) benchmarks/beir/beir_eval.py --datasets $(PERF_DATASET) --split $(SPLIT) \ + --cache-dir "$(CACHE_DIR)" --runs-dir "$(RESULTS_DIR)" --k-values $(K_VALUES) \ + --baseline flat --bootstrap-iters 1000 --seed $(SEED) --out-dir "$(RESULTS_DIR)" + +# ── graphics ────────────────────────────────────────────────────────────────── + +## Render the three README figures from the timing records. 
+bench-beir-plot: + $(PY) benchmarks/beir/beir_plot.py --runs-dir "$(RESULTS_DIR)" \ + --scaling-dataset $(PERF_DATASET) --bar-dataset $(PERF_DATASET) \ + --scaling-threads 1 --scaling-batch $(SCALE_BATCH) \ + --bar-single-threads 1 --bar-single-batch $(SCALE_BATCH) \ + --bar-multi-threads $(NPROC) --bar-multi-batch $(MULTI_BATCH) \ + --out-dir "$(FIG_DIR)" + +# ── cleanup ─────────────────────────────────────────────────────────────────── + +## Remove generated result files (keeps the embedding cache). Paths are quoted so a RESULTS_DIR containing spaces is handled. +bench-beir-clean: + find "$(RESULTS_DIR)" -name "*.topk.jsonl" -delete + find "$(RESULTS_DIR)" -name "*.summary.json" -delete + find "$(RESULTS_DIR)" -name "timing.jsonl" -delete + +## Remove the embedding cache (re-encoding will be required). Quoted so a CACHE_DIR with spaces cannot word-split into unrelated rm -rf operands. +bench-beir-clean-cache: + rm -rf "$(CACHE_DIR)" diff --git a/README.md b/README.md index f6d77306..e293e34e 100644 --- a/README.md +++ b/README.md @@ -15,6 +15,45 @@ Training-free ordinal & sign quantization for vector retrieval. `ordvec` is a small, dependency-light Rust crate for compressed nearest-neighbour search over high-dimensional embeddings. +## Benchmark at a glance + +> **ordvec matches dense retrieval quality within BEIR qrel noise at 8–16× smaller +> vector storage — with no training and no graph build — and sub-millisecond +> single-query retrieval on 171K Harrier embeddings.
A threaded HNSW graph still +> wins highly-parallel batched serving; ordvec wins the lightweight +> compressed-substrate lane.** + +On **trec-covid** (171,332 documents, the public [BEIR](https://github.com/beir-cellar/beir) +benchmark) with **Harrier-Q8** 1024-d embeddings, ordvec's two-stage retrieval +keeps a near-flat per-query cost as the corpus grows, while exact brute-force +(`flat`, identical math to FAISS `IndexFlatIP`) is O(n) — so the speedup +*widens* with scale: + +![ordvec speedup over exact search grows with corpus size](https://raw.githubusercontent.com/Fieldnote-Echo/ordvec/main/benchmarks/beir/figures/scaling_curve.png) + +- **~100× faster, single query.** At 171K docs, single-query latency: exact + `flat` 56 ms vs ordvec `sign→rq2` **0.53 ms** — and the gap grows with the + corpus (it is ~5× at 1K docs). +- **8–16× smaller.** 256–512 bytes/vector vs 4096 for full float, at + **nDCG@10 within bootstrap noise of exact** (on trec-covid the ordinal rows + even edge ahead; see [Benchmarks](#benchmarks)). +- **Reproducible on your machine, one command:** + + ```sh + make bench-beir-setup # Python deps + CUDA llama-cpp-python (GGUF Q8 encoder) + make benchmark-beir # download BEIR, embed, run all methods, render graphics + ``` + + The figures and result tables in this README were produced by that command on + public BEIR data: the harness writes the figures and the nDCG/timing summaries, + the README tables transcribe those outputs, and you can regenerate or verify + every number yourself (exact latencies vary with hardware and batch size). The + default run reproduces **scifact + trec-covid**; the harness also supports + `nfcorpus` and `fiqa`. Latency for every method is measured in **one Rust + process** (no Python/FFI in the hot path); see the [Benchmarks](#benchmarks) + section for the single-query, batched-throughput, and threaded views and their + caveats.
+ ## What's different Compressed-retrieval libraries usually either **fit a codebook to your @@ -241,47 +280,105 @@ candidate slices passed to `Search` until the call returns. ## Benchmarks -### Real-embedding retrieval - -The current paper-harness run is a real-embedding source-recovery task, not the -in-repo synthetic stress test: 207,695 arXiv paper embeddings, 7,200 queries -across title / first-sentence / middle-sentence / paraphrase query sets, 1024-D -sentence-transformer embeddings, and `nDCG@10` / `hit@10` / `MRR@10` against the -source paper id. - -The baseline rows use FAISS over L2-normalized FP32 embeddings: -`IndexFlatIP` for dense exact search and `IndexHNSWFlat(M=32, efSearch=128)` for -the tested HNSW configuration. The ordinal rows remove stored dense coordinate -magnitudes: - -- **ordinal rank-cosine** stores mean-centered, L2-normalized - `argsort(argsort(.))` rank vectors and queries with the same rank-cosine - representation; and -- **RankQuant b=2 asym** stores 2-bit ordinal document codes - (`256 bytes/vector` at dim=1024) and scores FP32 queries with - `RankQuant::search_asymmetric`. 
- -| Mode | bytes/vec | nDCG@10 | hit@10 | MRR@10 | -|------|----------:|--------:|-------:|-------:| -| FAISS dense exact | 4096 | 0.7817 | 0.8604 | 0.7566 | -| ordinal rank-cosine | 4096 | 0.7796 | 0.8596 | 0.7542 | -| FAISS HNSW | ~4352 | 0.7756 | 0.8528 | 0.7509 | -| RankQuant b=2 asym | 256 | 0.7754 | 0.8536 | 0.7506 | - -Paired bootstrap over all 7,200 queries: - -- ordinal rank-cosine minus FAISS HNSW: `+0.00406 nDCG@10`, 95% CI - `[+0.00133, +0.00687]` -- ordinal rank-cosine minus FAISS dense exact: `-0.00205 nDCG@10`, 95% CI - `[-0.00429, +0.00019]` -- RankQuant b=2 asym minus FAISS HNSW: `-0.00014 nDCG@10`, 95% CI - `[-0.00318, +0.00292]` - -Read narrowly: on this real retrieval task, ordinal structure retains nearly all -of the dense retrieval signal, and the 2-bit deployed path matches the tested -FAISS HNSW configuration within bootstrap noise at 1/16 the FP32 vector payload. -The arXiv artifact set is not shipped in this crate; the self-contained -clean-checkout benchmark below is the reproducible stress test. +### BEIR retrieval (public datasets, reproducible) + +A fully reproducible harness over standard [BEIR](https://github.com/beir-cellar/beir) +datasets lives in [`benchmarks/beir/`](https://github.com/Fieldnote-Echo/ordvec/tree/main/benchmarks/beir). It embeds the corpus +with **Harrier-Q8** (GGUF `Q8_0` via `llama-cpp-python`, CUDA), then measures +ordvec's methods against two references **in a single Rust process** so the +latency comparison is genuinely apples-to-apples — same machine, batch, and +thread count, no Python/FFI in the hot path: + +- **`flat`** — exact inner-product brute force (identical retrieval to FAISS + `IndexFlatIP`), a pure-Rust SIMD GEMM. *Baseline, not ground truth.* +- **`hnsw`** — pure-Rust HNSW (`hnsw_rs`, M=32, ef=128) — the portable + stand-in for the C++ hnswlib. 
+ +Reproduce end-to-end (downloads the data, embeds, runs every method, renders the +figures) — nothing below is hand-entered: + +```sh +make bench-beir-setup # Python deps + CUDA llama-cpp-python +make benchmark-beir # quality (nDCG) + scaling sweep + graphics +``` + +#### Quality — nDCG@10 vs the official BEIR qrels + +nDCG@10 is computed against the human-annotated qrels (not against `flat`). +`Δ vs flat` is the paired-bootstrap mean delta; `*` marks a 95% CI that straddles +0 (i.e. within noise of exact). `flat` and the ordvec rows are **deterministic** +(byte-identical run to run); the `hnsw` row is **approximate** — its graph is +built in parallel, so its nDCG and latency vary slightly between runs (≈±0.003 +nDCG here, within the same noise band). The numbers below are one representative +run; regenerate your own with `make benchmark-beir`. + +| Dataset | Method | Bytes/vec | nDCG@10 | Δ vs flat (95% CI) | +|---|---|--:|--:|---| +| scifact (5,183) | `flat` (exact) | 4096 | 0.7551 | (baseline) | +| | `hnsw` M=32 | 4096 | 0.7554 | +0.0003 * | +| | **ordvec rq4** | **512** | **0.7549** | −0.0003 * | +| | ordvec rq2 | 256 | 0.7471 | −0.0080 * | +| | ordvec sign→rq2 | 384 | 0.7471 | −0.0080 * | +| trec-covid (171,332) | `flat` (exact) | 4096 | 0.7574 | (baseline) | +| | `hnsw` M=32 | 4096 | 0.7555 | −0.0019 * | +| | ordvec rq2 | 256 | 0.7632 | +0.0057 * | +| | **ordvec rq4** | **512** | **0.7636** | +0.0062 * | +| | ordvec sign→rq2 | 384 | 0.7638 | +0.0064 * | + +Every ordvec row is within bootstrap noise of exact dense at **8–16× smaller** +storage; on trec-covid the ordinal codes even edge slightly ahead. + +#### Latency — three honest views + +ordvec never touches the float corpus, so its per-query cost is tiny and grows +slowly with `n`; `flat`'s cost is dominated by streaming the 4096-byte vectors, +which is O(n) and **memory-bandwidth-bound**. That single fact explains all three +views (trec-covid, 171,332 docs, 1024-d): + +**1. 
Single query (batch = 1, 1 thread)** — latency-sensitive serving, where +`flat` cannot amortize its memory traffic: + +![single-query latency bars](https://raw.githubusercontent.com/Fieldnote-Echo/ordvec/main/benchmarks/beir/figures/bars_single_thread.png) + +`flat` 56 ms → ordvec `sign→rq2` **0.53 ms (≈106×)**, `bitmap→rq2` 0.62 ms (≈91×), +`hnsw` 1.5 ms (≈37×). The scaling curve [above](#benchmark-at-a-glance) is this +view swept over corpus size — the speedup *grows* with `n`. + +**2. Batched throughput (batch = 32, 1 thread)** — when many queries arrive at +once, `flat`'s GEMM amortizes the corpus stream across the batch (56→4 ms), +narrowing the gap: ordvec `sign→rq2`/`bitmap→rq2` stay ≈8–9.5× ahead. + +**3. Many cores (batch = 32, 32 threads)** — everything parallelizes and the +field compresses; `hnsw` threads best: + +![threaded throughput bars](https://raw.githubusercontent.com/Fieldnote-Echo/ordvec/main/benchmarks/beir/figures/bars_threaded.png) + +`hnsw` 4.8× vs `flat`, ordvec `bitmap→rq2` 3.7×, `rq2` 2.5×, `sign→rq2` 2.1×. +**HNSW wins this regime** on threaded throughput (≈1.3× over `bitmap→rq2`, ≈2.3× over `sign→rq2`). The honest +ordvec-vs-HNSW tradeoff, all from this same run (trec-covid, 171,332 docs): + +| | HNSW M=32 | ordvec `sign→rq2` | +|---|---|---| +| threaded latency (32 threads, batch 32) | **0.23 ms** ✅ | 0.52 ms | +| single-query latency (batch 1) | 1.52 ms | **0.53 ms** ✅ (~3×) | +| index size / vector | 4096 B + graph | **256–384 B** ✅ (8–16× less) | +| build time, 171K docs | 51.4 s | **0.26 s** ✅ (training-free) | +| nDCG@10 (trec-covid) | 0.7555 | **0.7638** ✅ | + +So even where HNSW edges ahead on threaded latency, ordvec gets there with **no +graph to build** (instant, training-free, and rebuilt for free when the corpus +drifts) and **8–16× less memory** — and it still wins single-query latency and +ties or edges quality.
And the two aren't mutually exclusive: ordvec's codes are +index-agnostic, so they compose *under* an HNSW/sharding layer (see +[Scope](#scope)) rather than replacing it. + +**Read it honestly:** ordvec's huge latency win is a single-query / low-batch +phenomenon (and grows with corpus size); under large-batch throughput a batched +exact GEMM is a strong baseline and HNSW threads very well. The durable wins are +**compression at iso-quality** and **single-query latency that stays flat as the +corpus grows**. `flat` is a comparison reference, not ground truth; nDCG@10 is +the qrel-based metric. Numbers vary with encoder, dataset, hardware, and batch — +the point is that you can regenerate all of them with `make benchmark-beir`. ### Synthetic stress test diff --git a/benchmarks/beir-bench/Cargo.toml b/benchmarks/beir-bench/Cargo.toml new file mode 100644 index 00000000..5122bf5e --- /dev/null +++ b/benchmarks/beir-bench/Cargo.toml @@ -0,0 +1,33 @@ +# All-Rust BEIR comparison harness. Lives as a workspace member (NOT in the core +# `ordvec` crate's dependencies) so that pulling `hnsw_rs` here never touches the +# `-p ordvec`-scoped deps gate or the published crate. `publish = false`. +[package] +name = "beir-bench" +version = "0.0.0" +edition = "2021" +publish = false +license = "MIT OR Apache-2.0" + +[[bin]] +name = "beir-bench" +path = "src/main.rs" + +[dependencies] +ordvec = { path = "../.." } +# Pure-Rust HNSW (Malkov–Yashunin); no system/C++ deps. The faithful portable +# stand-in for the C++ hnswlib (no maintained Rust binding to that exists). +hnsw_rs = "0.3" +rayon = "1.10" +# Pure-Rust SIMD GEMM (runtime AVX/NEON dispatch, no BLAS/system deps) — gives +# the exact-inner-product `flat` baseline a competitive kernel so it isn't +# unfairly slow vs ordvec's SIMD paths. Default single-threaded (our rayon pool +# owns parallelism); we never enable its `threading` feature. 
+matrixmultiply = "0.3" +# Pure-Rust SHA-256 (manifest provenance digest) + robust JSON read/write, so the +# harness never shells out for hashing and never emits invalid JSON on IDs that +# contain quotes/backslashes/unicode escapes. Both are already workspace deps. +sha2 = "0.11" +serde_json = "1" + +# Release profile is inherited from the workspace root (lto, codegen-units=1, +# opt-level=3); a member-level [profile] would be ignored with a warning. diff --git a/benchmarks/beir-bench/src/main.rs b/benchmarks/beir-bench/src/main.rs new file mode 100644 index 00000000..a6d63e45 --- /dev/null +++ b/benchmarks/beir-bench/src/main.rs @@ -0,0 +1,1259 @@ +//! All-Rust BEIR comparison harness. +//! +//! Measures ordvec's rank/sign methods against an exact inner-product baseline +//! (`flat`, identical math to FAISS `IndexFlatIP`) and a pure-Rust HNSW +//! (`hnsw_rs`, Malkov–Yashunin — the faithful portable stand-in for C++ hnswlib), +//! ALL in one process so the latency comparison is genuinely apples-to-apples: +//! same machine, same batch, same thread count, no Python/FFI boundary. +//! +//! Two knobs make the comparison fair and reveal the scaling story: +//! +//! `--threads N`: query latency is measured inside a rayon pool of exactly N +//! threads (index *build* still uses all cores). N=1 gives the single-thread +//! story; N>1 the throughput story. Batch is matched across every method. +//! +//! `--max-docs M`: truncate the corpus to its first M vectors. Sweeping M +//! produces the speedup-vs-corpus-size curve (brute force is O(n); ordvec +//! sign/rank candidate-gen is near-flat in n). +//! +//! Output: `<out-dir>/<dataset>/timing.jsonl` gets one record per +//! (method, n_docs, threads) run, appended every invocation — the plotter +//! consumes this. A FULL-corpus run (`--max-docs` absent, or >= the corpus size) additionally writes +//! `<method>.topk.jsonl` + `<method>.summary.json` for offline nDCG eval; +//! sub-sampled runs skip those (qrels-based nDCG is only valid on the full +//! corpus). +//! +//!
Cache layout (one encoder per prepare run): +//! <cache-dir>/<dataset>/<split>/encoder=<name>/ +//! corpus.f32.npy queries.f32.npy corpus_ids.json query_ids.json +//! qrels.json embeddings.manifest.json ... + +use ordvec::{Bitmap, CandidateBatch, RankQuant, SignBitmap, SubsetScratch}; +use rayon::prelude::*; +use std::io::{BufWriter, Write}; +use std::time::Instant; + +use hnsw_rs::prelude::*; + +// HNSW hyper-parameters (faithful to the prior "hnswlib M=32" comparison). +const HNSW_M: usize = 32; +const HNSW_EF_CONSTRUCTION: usize = 200; +const HNSW_EF_SEARCH: usize = 128; +const HNSW_MAX_LAYER: usize = 16; + +// --------------------------------------------------------------------------- +// Config +// --------------------------------------------------------------------------- + +struct Config { + cache_dir: String, + dataset: String, + split: String, + top_k: usize, + batch: usize, // queries per search call (must be >= 1) + candidates: usize, + methods: Vec<String>, // method slugs, parsed from the comma-separated --methods + out_dir: String, + threads: usize, // 0 = all cores + max_docs: Option<usize>, // None = full corpus +} + +fn parse_args() -> Config { + let mut cache_dir = String::from(".cache/ordvec-beir"); + let mut dataset = String::new(); + let mut split = String::from("test"); + let mut top_k = 100usize; + let mut batch = 8usize; + let mut candidates = 500usize; + let mut methods = vec![ + "flat".to_string(), + "hnsw".to_string(), + "rq2".to_string(), + "rq4".to_string(), + "bitmap-rq2".to_string(), + "sign-rq2".to_string(), + ]; + let mut out_dir = String::from("results/beir"); + let mut threads = 0usize; + let mut max_docs: Option<usize> = None; + + let mut args = std::env::args().skip(1); + while let Some(a) = args.next() { + match a.as_str() { + "--cache-dir" => cache_dir = args.next().expect("--cache-dir requires a value"), + "--dataset" => dataset = args.next().expect("--dataset requires a value"), + "--split" => split = args.next().expect("--split requires a value"), + "--top-k" => { + top_k = args + .next() + .expect("--top-k requires a value") + .parse() + .expect("--top-k must be an integer")
+ } + "--batch" => { + batch = args + .next() + .expect("--batch requires a value") + .parse() + .expect("--batch must be an integer") + } + "--candidates" => { + candidates = args + .next() + .expect("--candidates requires a value") + .parse() + .expect("--candidates must be an integer") + } + "--methods" => { + methods = args + .next() + .expect("--methods requires a value") + .split(',') + .map(|s| s.trim().to_string()) + .filter(|s| !s.is_empty()) + .collect() + } + "--out-dir" => out_dir = args.next().expect("--out-dir requires a value"), + "--threads" => { + threads = args + .next() + .expect("--threads requires a value") + .parse() + .expect("--threads must be an integer") + } + "--max-docs" => { + max_docs = Some( + args.next() + .expect("--max-docs requires a value") + .parse() + .expect("--max-docs must be an integer"), + ) + } + other => panic!("unknown argument: {other}"), + } + } + assert!(!dataset.is_empty(), "--dataset is required"); + assert!(batch >= 1, "--batch must be >= 1"); + assert!(top_k >= 1, "--top-k must be >= 1"); + assert!(candidates >= 1, "--candidates must be >= 1"); + + Config { + cache_dir, + dataset, + split, + top_k, + batch, + candidates, + methods, + out_dir, + threads, + max_docs, + } +} + +// --------------------------------------------------------------------------- +// NumPy v1/v2 reader (2-D LE f32, C-order) +// --------------------------------------------------------------------------- + +fn load_npy_f32(path: &str) -> (Vec, usize, usize) { + let bytes = std::fs::read(path).unwrap_or_else(|e| panic!("read npy {path}: {e}")); + assert!(bytes.len() >= 10, "npy file too short: {path}"); + assert_eq!(&bytes[..6], b"\x93NUMPY", "not a numpy file: {path}"); + let major = bytes[6]; + let minor = bytes[7]; + assert!( + major == 1 || major == 2, + "unsupported npy version {major}.{minor}: {path}", + ); + let (header_len, header_start) = if major == 1 { + let hl = u16::from_le_bytes([bytes[8], bytes[9]]) as usize; + (hl, 10) + } else 
{ + let hl = u32::from_le_bytes([bytes[8], bytes[9], bytes[10], bytes[11]]) as usize; + (hl, 12) + }; + let header = std::str::from_utf8(&bytes[header_start..header_start + header_len]) + .expect("npy header not utf-8"); + assert!( + header.contains("'descr': ' = after[open + 1..close] + .split(',') + .filter_map(|s| s.trim().parse::().ok()) + .collect(); + assert_eq!(dims.len(), 2, "expected 2-D array in {path}"); + let n = dims[0]; + let dim = dims[1]; + let data_start = header_start + header_len; + let n_floats = n * dim; + assert_eq!( + bytes.len() - data_start, + n_floats * 4, + "data length mismatch in {path}", + ); + let mut out = vec![0.0f32; n_floats]; + for (i, chunk) in bytes[data_start..].chunks_exact(4).enumerate() { + out[i] = f32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]); + } + (out, n, dim) +} + +// --------------------------------------------------------------------------- +// JSON helpers +// --------------------------------------------------------------------------- + +fn load_json_string_array(path: &str) -> Vec { + let text = std::fs::read_to_string(path).unwrap_or_else(|e| panic!("read {path}: {e}")); + serde_json::from_str(&text).unwrap_or_else(|e| panic!("parse json string array {path}: {e}")) +} + +/// SHA-256 of a file, pure Rust (no shelling out — portable, incl. Windows / +/// minimal containers). Hex-encoded; matches the Python `hashlib` digest. 
+fn sha256_file(path: &str) -> String { + use sha2::{Digest, Sha256}; + let bytes = std::fs::read(path).unwrap_or_else(|e| panic!("read {path} for sha256: {e}")); + Sha256::digest(&bytes) + .iter() + .map(|b| format!("{b:02x}")) + .collect() +} + +fn rustc_version() -> String { + if let Ok(out) = std::process::Command::new("rustc") + .arg("--version") + .output() + { + if out.status.success() { + return String::from_utf8_lossy(&out.stdout).trim().to_string(); + } + } + "unknown".to_string() +} + +fn detected_simd() -> Vec { + #[cfg(target_arch = "x86_64")] + { + let mut v = Vec::new(); + if is_x86_feature_detected!("avx2") { + v.push("avx2".to_string()); + } + if is_x86_feature_detected!("fma") { + v.push("fma".to_string()); + } + if is_x86_feature_detected!("avx512f") { + v.push("avx512f".to_string()); + } + v + } + #[cfg(not(target_arch = "x86_64"))] + { + Vec::new() + } +} + +fn percentile_ms(samples: &[u128], p: f32) -> f64 { + let mut s = samples.to_vec(); + s.sort_unstable(); + if s.is_empty() { + return 0.0; + } + let i = ((s.len() as f32 - 1.0) * p).round() as usize; + s[i] as f64 / 1_000_000.0 +} + +// --------------------------------------------------------------------------- +// Validate embeddings (dim == 1024, unit-norm rows) +// --------------------------------------------------------------------------- + +fn validate_embeddings(data: &[f32], n: usize, dim: usize, label: &str) { + assert_eq!(dim, 1024, "{label}: embedding_dim must be 1024, got {dim}"); + assert_eq!(dim % 16, 0, "{label}: dim must be divisible by 16"); + for (i, row) in data.chunks_exact(dim).enumerate() { + let norm: f32 = row.iter().map(|x| x * x).sum::().sqrt(); + assert!( + (norm - 1.0).abs() < 1e-3, + "{label} row {i}: L2 norm {norm:.6} not ~1.0", + ); + } + eprintln!(" {label}: validated {n} rows (dim={dim}, L2-normalised)"); +} + +// --------------------------------------------------------------------------- +// Output helpers +// 
--------------------------------------------------------------------------- + +fn open_output(out_dir: &str, dataset: &str, slug: &str, ext: &str) -> BufWriter { + let dir = format!("{out_dir}/{dataset}"); + std::fs::create_dir_all(&dir).unwrap_or_else(|e| panic!("create_dir_all {dir}: {e}")); + let path = format!("{dir}/{slug}.{ext}"); + let f = std::fs::File::create(&path).unwrap_or_else(|e| panic!("create {path}: {e}")); + BufWriter::new(f) +} + +/// Append-only writer for the per-config timing record stream. +fn open_timing_appender(out_dir: &str, dataset: &str) -> BufWriter { + let dir = format!("{out_dir}/{dataset}"); + std::fs::create_dir_all(&dir).unwrap_or_else(|e| panic!("create_dir_all {dir}: {e}")); + let path = format!("{dir}/timing.jsonl"); + let f = std::fs::OpenOptions::new() + .create(true) + .append(true) + .open(&path) + .unwrap_or_else(|e| panic!("open {path}: {e}")); + BufWriter::new(f) +} + +/// Write one JSONL row per query (global doc indices; -1 = padding). +#[allow(clippy::too_many_arguments)] +fn write_topk_jsonl( + writer: &mut W, + dataset: &str, + split: &str, + method: &str, + k: usize, + query_ids: &[String], + corpus_ids: &[String], + indices: &[i64], + scores: &[f32], +) { + let nq = query_ids.len(); + let n_corpus = corpus_ids.len(); + for qi in 0..nq { + let row_indices = &indices[qi * k..(qi + 1) * k]; + let mut doc_idxs: Vec = Vec::new(); + let mut doc_ids: Vec<&str> = Vec::new(); + let mut row_scores: Vec = Vec::new(); + for (j, &di) in row_indices.iter().enumerate() { + if di < 0 { + break; // sentinel marks the end of this query's results + } + let di_usize = di as usize; + doc_idxs.push(di_usize as u64); + doc_ids.push(if di_usize < n_corpus { + corpus_ids[di_usize].as_str() + } else { + "" + }); + let sc = scores.get(qi * k + j).copied().unwrap_or(0.0); + row_scores.push(if sc.is_finite() { sc as f64 } else { 0.0 }); + } + // serde_json guarantees valid JSON (escapes quotes/backslashes/unicode in + // doc/query IDs), so 
downstream `json.loads` never trips. + let row = serde_json::json!({ + "dataset": dataset, + "split": split, + "method": method, + "qid_idx": qi, + "qid": query_ids[qi], + "k": k, + "doc_idxs": doc_idxs, + "doc_ids": doc_ids, + "scores": row_scores, + }); + writeln!(writer, "{row}").expect("write topk jsonl"); + } +} + +/// A single benchmarked configuration's record — written both to the per-method +/// summary.json (full-corpus runs) and appended to timing.jsonl (every run). +struct Record<'a> { + dataset: &'a str, + split: &'a str, + method: &'a str, + dim: usize, + n_docs: usize, + n_queries: usize, + top_k: usize, + threads: usize, + batch: usize, + candidates: usize, + bytes_per_vector: usize, + index_total_mib: f64, + build_seconds: f64, + p50_ms: f64, + p95_ms: f64, + p99_ms: f64, + qps: f64, + simd: &'a [String], + encoder_sha: &'a str, +} + +fn write_record_json(w: &mut W, r: &Record) { + let rec = serde_json::json!({ + "dataset": r.dataset, + "split": r.split, + "method": r.method, + "dim": r.dim, + "n_docs": r.n_docs, + "n_queries": r.n_queries, + "top_k": r.top_k, + "threads": r.threads, + "batch": r.batch, + "candidates": r.candidates, + "bytes_per_vector": r.bytes_per_vector, + "index_total_mib": r.index_total_mib, + "build_seconds": r.build_seconds, + "query_latency_ms_p50": r.p50_ms, + "query_latency_ms_p95": r.p95_ms, + "query_latency_ms_p99": r.p99_ms, + "queries_per_second": r.qps, + "cpu_arch": std::env::consts::ARCH, + "simd_detected": r.simd, + "rustc": rustc_version(), + "crate_version": env!("CARGO_PKG_VERSION"), + "encoder_manifest_sha256": r.encoder_sha, + }); + writeln!(w, "{rec}").expect("write record json"); +} + +// --------------------------------------------------------------------------- +// Timing driver +// --------------------------------------------------------------------------- + +/// Flat `nq*top_k` (indices, scores) for one query batch (sentinel -1 padded). 
+type Preds = (Vec, Vec); +/// Per-query latency samples (ns) + optionally the collected predictions. +type TimedRun = (Vec, Option); + +/// Warm up, time (amortized per-query over each batch), and optionally collect +/// predictions. `search_batch(b_start, b_end)` returns flat `(b_end-b_start)*top_k` +/// indices (sentinel -1 padded) and matching scores. Runs inside the caller's +/// rayon pool so query parallelism is pinned to the configured thread count. +fn time_and_collect( + n_queries: usize, + batch: usize, + warmup: usize, + collect: bool, + mut search_batch: F, +) -> TimedRun +where + F: FnMut(usize, usize) -> Preds, +{ + // Warmup. + let w_end = (warmup.div_ceil(batch) * batch).min(n_queries); + let mut b_start = 0usize; + while b_start < w_end { + let b_end = (b_start + batch).min(n_queries); + let _ = search_batch(b_start, b_end); + b_start = b_end; + } + + // Timing. + let mut samples = Vec::with_capacity(n_queries); + let mut preds_i: Vec = Vec::new(); + let mut preds_s: Vec = Vec::new(); + b_start = 0; + while b_start < n_queries { + let b_end = (b_start + batch).min(n_queries); + let b = b_end - b_start; + let t0 = Instant::now(); + let (idx, sc) = search_batch(b_start, b_end); + let per_query_ns = t0.elapsed().as_nanos() / b as u128; + for _ in 0..b { + samples.push(per_query_ns); + } + if collect { + preds_i.extend_from_slice(&idx); + preds_s.extend_from_slice(&sc); + } + b_start = b_end; + } + + let preds = if collect { + Some((preds_i, preds_s)) + } else { + None + }; + (samples, preds) +} + +/// Finalize one method run: percentiles, optional topk/summary write, timing record. 
+#[allow(clippy::too_many_arguments)] +fn finalize( + slug: &str, + samples: &[u128], + preds: Option<(Vec, Vec)>, + dim: usize, + n_docs: usize, + n_queries: usize, + top_k: usize, + threads: usize, + batch: usize, + candidates: usize, + bytes_per_vector: usize, + index_total_mib: f64, + build_seconds: f64, + dataset: &str, + split: &str, + query_ids: &[String], + corpus_ids: &[String], + out_dir: &str, + simd: &[String], + encoder_sha: &str, + timing_writer: &mut dyn Write, +) { + let p50 = percentile_ms(samples, 0.50); + let p95 = percentile_ms(samples, 0.95); + let p99 = percentile_ms(samples, 0.99); + let qps = 1_000.0 / p50.max(f64::EPSILON); + + let rec = Record { + dataset, + split, + method: slug, + dim, + n_docs, + n_queries, + top_k, + threads, + batch, + candidates, + bytes_per_vector, + index_total_mib, + build_seconds, + p50_ms: p50, + p95_ms: p95, + p99_ms: p99, + qps, + simd, + encoder_sha, + }; + // Always append to the timing stream. + write_record_json(timing_writer, &rec); + + // Full-corpus runs (preds collected) also write topk + per-method summary. + if let Some((pred_i, pred_s)) = preds { + let mut jw = open_output(out_dir, dataset, slug, "topk.jsonl"); + write_topk_jsonl( + &mut jw, dataset, split, slug, top_k, query_ids, corpus_ids, &pred_i, &pred_s, + ); + jw.flush().expect("flush topk"); + let mut sw = open_output(out_dir, dataset, slug, "summary.json"); + write_record_json(&mut sw, &rec); + sw.flush().expect("flush summary"); + } + + eprintln!( + " {slug} [n={n_docs} t={threads}]: p50={p50:.4}ms p95={p95:.4}ms p99={p99:.4}ms qps={qps:.1}" + ); +} + +// --------------------------------------------------------------------------- +// Per-query top-k from raw scores (used by the flat baseline) +// --------------------------------------------------------------------------- + +/// One chunk's contribution: `nq` rows, each a local top-k of (score, global_id). 
+type ChunkTopK = Vec<Vec<(f32, i64)>>; + +/// Local top-k of a score row, returned as (score, global_id) pairs in no +/// particular order (the caller sorts after merging), with `id_offset` added to the local column index. +fn local_topk(row: &[f32], id_offset: usize, top_k: usize) -> Vec<(f32, i64)> { + let mut scored: Vec<(f32, i64)> = row + .iter() + .enumerate() + .map(|(j, &s)| (s, (id_offset + j) as i64)) + .collect(); + let k = top_k.min(scored.len()); + // `k > 0` guards the `k - 1` index (top_k is asserted >= 1 at the CLI, but + // keep this defensive so a zero can never underflow to usize::MAX here). + if k > 0 && k < scored.len() { + scored.select_nth_unstable_by(k - 1, |a, b| b.0.total_cmp(&a.0)); + scored.truncate(k); + } + scored +} + +/// Exact inner-product top-k for a whole query batch against `corpus[..n_docs]`. +/// Same math as FAISS `IndexFlatIP`: scores = Q · Dᵀ via a blocked SIMD GEMM +/// (matrixmultiply), parallelized over doc-chunks on the current rayon pool so +/// the baseline both vectorizes and scales with the configured thread count. +fn flat_batch_topk( + qbatch: &[f32], + nq: usize, + corpus: &[f32], + n_docs: usize, + dim: usize, + top_k: usize, +) -> (Vec<i64>, Vec<f32>) { + // ~2 chunks per thread (≥1024 docs each) for balance without tiny GEMMs. + let nthreads = rayon::current_num_threads().max(1); + let target_chunks = (nthreads * 2).max(1); + let chunk_size = n_docs.div_ceil(target_chunks).max(1024); + let n_chunks = n_docs.div_ceil(chunk_size).max(1); + + // Per chunk → nq rows of local top-k (global ids). + let per_chunk: Vec<ChunkTopK> = (0..n_chunks) + .into_par_iter() + .map(|c| { + let start = c * chunk_size; + let end = (start + chunk_size).min(n_docs); + let cn = end - start; + if cn == 0 { + return vec![Vec::new(); nq]; + } + // C(nq × cn) = Q(nq × dim) · Dᵀ_chunk : B element (k, j) is at + // corpus[(start+j)*dim + k] → row-stride 1, col-stride dim.
+ let mut cmat = vec![0.0f32; nq * cn]; + unsafe { + matrixmultiply::sgemm( + nq, + dim, + cn, + 1.0, + qbatch.as_ptr(), + dim as isize, + 1, + corpus[start * dim..end * dim].as_ptr(), + 1, + dim as isize, + 0.0, + cmat.as_mut_ptr(), + cn as isize, + 1, + ); + } + (0..nq) + .map(|qi| local_topk(&cmat[qi * cn..(qi + 1) * cn], start, top_k)) + .collect() + }) + .collect(); + + // Merge chunk-local top-k into the global top-k per query (`k > 0` guards the `k - 1` index, as in `local_topk`). + let mut idx = vec![-1i64; nq * top_k]; + let mut sc = vec![0.0f32; nq * top_k]; + for qi in 0..nq { + let mut merged: Vec<(f32, i64)> = Vec::new(); + for chunk in &per_chunk { + merged.extend_from_slice(&chunk[qi]); + } + let k = top_k.min(merged.len()); + if k > 0 && k < merged.len() { + merged.select_nth_unstable_by(k - 1, |a, b| b.0.total_cmp(&a.0)); + merged.truncate(k); + } + merged.sort_unstable_by(|a, b| b.0.total_cmp(&a.0)); + for (j, &(s, i)) in merged.iter().take(top_k).enumerate() { + idx[qi * top_k + j] = i; + sc[qi * top_k + j] = s; + } + } + (idx, sc) +} + +/// Pad a per-query Vec<(idx, score)> ordering into flat `top_k` rows (-1 / 0.0).
+fn pad_rows(rows: Vec>, top_k: usize) -> (Vec, Vec) { + let mut idx = vec![-1i64; rows.len() * top_k]; + let mut sc = vec![0.0f32; rows.len() * top_k]; + for (qi, row) in rows.iter().enumerate() { + for (j, &(i, s)) in row.iter().take(top_k).enumerate() { + idx[qi * top_k + j] = i; + sc[qi * top_k + j] = s; + } + } + (idx, sc) +} + +// --------------------------------------------------------------------------- +// Cache resolution +// --------------------------------------------------------------------------- + +fn resolve_encoder_dir(cache_dir: &str, dataset: &str, split: &str) -> String { + let parent = format!("{cache_dir}/{dataset}/{split}"); + let entries = std::fs::read_dir(&parent).unwrap_or_else(|e| panic!("read_dir {parent}: {e}")); + let mut matches: Vec = entries + .filter_map(|e| e.ok()) + .filter(|e| e.file_name().to_string_lossy().starts_with("encoder=") && e.path().is_dir()) + .map(|e| e.path().to_string_lossy().to_string()) + .collect(); + assert!(!matches.is_empty(), "no encoder=* subdir under {parent}"); + assert!( + matches.len() == 1, + "multiple encoder=* dirs under {parent}: {matches:?} — one encoder per dataset/split", + ); + matches.remove(0) +} + +// --------------------------------------------------------------------------- +// main +// --------------------------------------------------------------------------- + +fn main() { + let cfg = parse_args(); + + let threads_resolved = if cfg.threads == 0 { + std::thread::available_parallelism() + .map(|n| n.get()) + .unwrap_or(1) + } else { + cfg.threads + }; + // Per-config query pool: build still uses all cores (default global pool); + // query latency is pinned to `threads_resolved` via pool.install(...). 
+ let query_pool = rayon::ThreadPoolBuilder::new() + .num_threads(threads_resolved) + .build() + .expect("build query thread pool"); + + eprintln!( + "beir-bench: dataset={} split={} top_k={} batch={} candidates={} threads={} (resolved {}) max_docs={:?} methods={:?}", + cfg.dataset, cfg.split, cfg.top_k, cfg.batch, cfg.candidates, cfg.threads, threads_resolved, cfg.max_docs, cfg.methods, + ); + + let enc_dir = resolve_encoder_dir(&cfg.cache_dir, &cfg.dataset, &cfg.split); + let manifest_path = format!("{enc_dir}/embeddings.manifest.json"); + let encoder_sha = sha256_file(&manifest_path); + + let (corpus_full, n_corpus_full, dim) = load_npy_f32(&format!("{enc_dir}/corpus.f32.npy")); + let (queries, n_queries, q_dim) = load_npy_f32(&format!("{enc_dir}/queries.f32.npy")); + assert_eq!(q_dim, dim, "query dim {q_dim} != corpus dim {dim}"); + validate_embeddings(&corpus_full, n_corpus_full, dim, "corpus"); + validate_embeddings(&queries, n_queries, q_dim, "queries"); + + let corpus_ids_full = load_json_string_array(&format!("{enc_dir}/corpus_ids.json")); + let query_ids = load_json_string_array(&format!("{enc_dir}/query_ids.json")); + assert_eq!( + corpus_ids_full.len(), + n_corpus_full, + "corpus_ids/embeddings mismatch" + ); + assert_eq!(query_ids.len(), n_queries, "query_ids/embeddings mismatch"); + + // Sub-sample the corpus for the scaling sweep (latency-only; no nDCG). 
+ let n_docs = cfg.max_docs.unwrap_or(n_corpus_full).min(n_corpus_full); + let full_corpus = cfg.max_docs.is_none() || n_docs == n_corpus_full; + let corpus = &corpus_full[..n_docs * dim]; + let corpus_ids = &corpus_ids_full[..n_docs]; + let write_topk = full_corpus; // qrels-based nDCG only valid on the full corpus + + let simd = detected_simd(); + eprintln!( + "dim={dim} n_docs={n_docs}{} n_queries={n_queries} simd={simd:?}", + if full_corpus { + " (full)" + } else { + " (sub-sampled)" + } + ); + + let mut timing_writer = open_timing_appender(&cfg.out_dir, &cfg.dataset); + + for method in &cfg.methods { + eprintln!("\n--- {method} ---"); + match method.as_str() { + "flat" => run_flat( + corpus, + &queries, + dim, + n_docs, + n_queries, + cfg.top_k, + cfg.batch, + threads_resolved, + &query_pool, + &cfg, + corpus_ids, + &query_ids, + &simd, + &encoder_sha, + write_topk, + &mut timing_writer, + ), + "hnsw" => run_hnsw( + corpus, + &queries, + dim, + n_docs, + n_queries, + cfg.top_k, + cfg.batch, + threads_resolved, + &query_pool, + &cfg, + corpus_ids, + &query_ids, + &simd, + &encoder_sha, + write_topk, + &mut timing_writer, + ), + "rq2" => run_rq( + corpus, + &queries, + dim, + n_docs, + n_queries, + cfg.top_k, + cfg.batch, + 2, + threads_resolved, + &query_pool, + &cfg, + corpus_ids, + &query_ids, + &simd, + &encoder_sha, + write_topk, + &mut timing_writer, + ), + "rq4" => run_rq( + corpus, + &queries, + dim, + n_docs, + n_queries, + cfg.top_k, + cfg.batch, + 4, + threads_resolved, + &query_pool, + &cfg, + corpus_ids, + &query_ids, + &simd, + &encoder_sha, + write_topk, + &mut timing_writer, + ), + "bitmap-rq2" => run_two_stage( + TwoStage::Bitmap, + corpus, + &queries, + dim, + n_docs, + n_queries, + cfg.top_k, + cfg.batch, + cfg.candidates, + threads_resolved, + &query_pool, + &cfg, + corpus_ids, + &query_ids, + &simd, + &encoder_sha, + write_topk, + &mut timing_writer, + ), + "sign-rq2" => run_two_stage( + TwoStage::Sign, + corpus, + &queries, + dim, + n_docs, 
+ n_queries, + cfg.top_k, + cfg.batch, + cfg.candidates, + threads_resolved, + &query_pool, + &cfg, + corpus_ids, + &query_ids, + &simd, + &encoder_sha, + write_topk, + &mut timing_writer, + ), + other => panic!( + "unknown method '{other}'. Supported: flat, hnsw, rq2, rq4, bitmap-rq2, sign-rq2" + ), + } + } + timing_writer.flush().expect("flush timing.jsonl"); + eprintln!( + "\ndone. timing -> {}/{}/timing.jsonl", + cfg.out_dir, cfg.dataset + ); +} + +// --------------------------------------------------------------------------- +// Method: flat (exact inner product == FAISS IndexFlatIP math) +// --------------------------------------------------------------------------- + +#[allow(clippy::too_many_arguments)] +fn run_flat( + corpus: &[f32], + queries: &[f32], + dim: usize, + n_docs: usize, + n_queries: usize, + top_k: usize, + batch: usize, + threads: usize, + pool: &rayon::ThreadPool, + cfg: &Config, + corpus_ids: &[String], + query_ids: &[String], + simd: &[String], + encoder_sha: &str, + write_topk: bool, + timing_writer: &mut dyn Write, +) { + let bytes_per_vector = dim * 4; + let index_total_mib = (n_docs * bytes_per_vector) as f64 / 1024.0 / 1024.0; + let warmup = 5.min(n_queries); + + let (samples, preds) = pool.install(|| { + time_and_collect(n_queries, batch, warmup, write_topk, |bs, be| { + let qbatch = &queries[bs * dim..be * dim]; + flat_batch_topk(qbatch, be - bs, corpus, n_docs, dim, top_k) + }) + }); + + finalize( + "flat", + &samples, + preds, + dim, + n_docs, + n_queries, + top_k, + threads, + batch, + 0, + bytes_per_vector, + index_total_mib, + 0.0, + &cfg.dataset, + &cfg.split, + query_ids, + corpus_ids, + &cfg.out_dir, + simd, + encoder_sha, + timing_writer, + ); +} + +// --------------------------------------------------------------------------- +// Method: hnsw (pure-Rust HNSW, hnsw_rs; DistDot on unit-norm vectors) +// --------------------------------------------------------------------------- + +#[allow(clippy::too_many_arguments)] +fn 
run_hnsw( + corpus: &[f32], + queries: &[f32], + dim: usize, + n_docs: usize, + n_queries: usize, + top_k: usize, + batch: usize, + threads: usize, + pool: &rayon::ThreadPool, + cfg: &Config, + corpus_ids: &[String], + query_ids: &[String], + simd: &[String], + encoder_sha: &str, + write_topk: bool, + timing_writer: &mut dyn Write, +) { + let slug = "hnsw"; + eprintln!(" building HNSW M={HNSW_M} ef_c={HNSW_EF_CONSTRUCTION} ({n_docs} docs) ..."); + let hnsw: Hnsw = Hnsw::new( + HNSW_M, + n_docs, + HNSW_MAX_LAYER, + HNSW_EF_CONSTRUCTION, + DistDot {}, + ); + // Insert (build uses all cores via the global pool). + let doc_refs: Vec<(&[f32], usize)> = (0..n_docs) + .map(|di| (&corpus[di * dim..(di + 1) * dim], di)) + .collect(); + let t0 = Instant::now(); + hnsw.parallel_insert_slice(&doc_refs); + let build_seconds = t0.elapsed().as_secs_f64(); + eprintln!(" build done in {build_seconds:.2}s"); + + // HNSW graph size is implementation-internal; report the stored-vector bytes + // (full float) as the index footprint, matching the dense baseline accounting. + let bytes_per_vector = dim * 4; + let index_total_mib = (n_docs * bytes_per_vector) as f64 / 1024.0 / 1024.0; + let warmup = 5.min(n_queries); + + // Pre-slice query rows so neither timing mode pays per-batch allocation. + let query_rows: Vec<&[f32]> = (0..n_queries) + .map(|qi| &queries[qi * dim..(qi + 1) * dim]) + .collect(); + + let (samples, preds) = pool.install(|| { + time_and_collect(n_queries, batch, warmup, write_topk, |bs, be| { + let rows: Vec> = if threads == 1 { + // Single-thread: serial search per query. + (bs..be) + .map(|qi| { + hnsw.search(query_rows[qi], top_k, HNSW_EF_SEARCH) + .into_iter() + .map(|nb| (nb.d_id as i64, 1.0 - nb.distance)) + .collect() + }) + .collect() + } else { + // Threaded: batched parallel search (rayon, this pool). 
+ let batch_slice: Vec> = + (bs..be).map(|qi| query_rows[qi].to_vec()).collect(); + hnsw.parallel_search(&batch_slice, top_k, HNSW_EF_SEARCH) + .into_iter() + .map(|nbs| { + nbs.into_iter() + .map(|nb| (nb.d_id as i64, 1.0 - nb.distance)) + .collect() + }) + .collect() + }; + pad_rows(rows, top_k) + }) + }); + + finalize( + slug, + &samples, + preds, + dim, + n_docs, + n_queries, + top_k, + threads, + batch, + 0, + bytes_per_vector, + index_total_mib, + build_seconds, + &cfg.dataset, + &cfg.split, + query_ids, + corpus_ids, + &cfg.out_dir, + simd, + encoder_sha, + timing_writer, + ); +} + +// --------------------------------------------------------------------------- +// Method: rq2 / rq4 (RankQuant full-scan asymmetric LUT) +// --------------------------------------------------------------------------- + +#[allow(clippy::too_many_arguments)] +fn run_rq( + corpus: &[f32], + queries: &[f32], + dim: usize, + n_docs: usize, + n_queries: usize, + top_k: usize, + batch: usize, + bits: u8, + threads: usize, + pool: &rayon::ThreadPool, + cfg: &Config, + corpus_ids: &[String], + query_ids: &[String], + simd: &[String], + encoder_sha: &str, + write_topk: bool, + timing_writer: &mut dyn Write, +) { + let slug = format!("ordvec-rq{bits}"); + eprintln!(" building RankQuant b={bits} ({n_docs} docs) ..."); + let mut idx = RankQuant::new(dim, bits); + let t0 = Instant::now(); + idx.add(corpus); + let build_seconds = t0.elapsed().as_secs_f64(); + let bytes_per_vector = idx.bytes_per_vec(); + let index_total_mib = idx.byte_size() as f64 / 1024.0 / 1024.0; + let warmup = 5.min(n_queries); + + let (samples, preds) = pool.install(|| { + time_and_collect(n_queries, batch, warmup, write_topk, |bs, be| { + let batch_q = &queries[bs * dim..be * dim]; + let res = idx.search_asymmetric(batch_q, top_k); + (res.indices, res.scores) + }) + }); + + finalize( + &slug, + &samples, + preds, + dim, + n_docs, + n_queries, + top_k, + threads, + batch, + 0, + bytes_per_vector, + index_total_mib, + 
// ---------------------------------------------------------------------------
// Method: bitmap-rq2 / sign-rq2 (two-stage candidate-gen → rerank)
// ---------------------------------------------------------------------------

/// Selects which first-stage candidate generator feeds the RankQuant-2 rerank.
#[derive(Clone, Copy)]
enum TwoStage {
    Bitmap,
    Sign,
}

/// Flatten per-query candidate lists into CSR form.
///
/// Returns `(offsets, candidates)`. `candidates` is every row concatenated in
/// order, and `offsets` has `vecs.len() + 1` entries, so row `i` occupies
/// `candidates[offsets[i]..offsets[i + 1]]`. An empty input yields
/// `(vec![0], vec![])`.
///
/// Generic over the candidate id type so it accepts whatever element type the
/// candidate generator produces.
fn bitmap_vecs_to_csr<T: Clone>(vecs: Vec<Vec<T>>) -> (Vec<usize>, Vec<T>) {
    // Size the flat buffer once instead of growing it row by row.
    let total: usize = vecs.iter().map(Vec::len).sum();
    let mut offsets = Vec::with_capacity(vecs.len() + 1);
    let mut candidates = Vec::with_capacity(total);
    offsets.push(0usize);
    for row in &vecs {
        candidates.extend_from_slice(row);
        offsets.push(candidates.len());
    }
    (offsets, candidates)
}
bytes_per_vector = stage1_bytes + rq.bytes_per_vec(); + let index_total_mib = (stage1_size + rq.byte_size()) as f64 / 1024.0 / 1024.0; + + let out_k = top_k.min(candidates).min(n_docs); + let warmup = 5.min(n_queries); + + let mut scratch = SubsetScratch::new(); + let mut out_scores_buf = vec![f32::NEG_INFINITY; batch * out_k]; + let mut out_indices_buf = vec![-1i64; batch * out_k]; + + let (samples, preds) = pool.install(|| { + time_and_collect(n_queries, batch, warmup, write_topk, |bs, be| { + let batch_q = &queries[bs * dim..be * dim]; + let nq_batch = be - bs; + let needed = nq_batch * out_k; + if out_scores_buf.len() != needed { + out_scores_buf.resize(needed, f32::NEG_INFINITY); + out_indices_buf.resize(needed, -1); + } + + // Stage 1: candidate generation → CSR (offsets, candidates). + let (offsets, cand_flat) = match stage { + TwoStage::Bitmap => { + let cand_vecs = bitmap.top_m_candidates_batched(batch_q, candidates); + bitmap_vecs_to_csr(cand_vecs) + } + TwoStage::Sign => { + let cb: CandidateBatch = + sign.top_m_candidates_batched_serial_csr(batch_q, candidates); + (cb.offsets, cb.candidates) + } + }; + + // Stage 2: pooled subset rerank (allocation-free). + rq.search_asymmetric_subset_batched_serial_into( + batch_q, + &offsets, + &cand_flat, + top_k, + &mut scratch, + &mut out_scores_buf, + &mut out_indices_buf, + ); + + // Pad per-query results to `top_k`. 
+ let mut idx = vec![-1i64; nq_batch * top_k]; + let mut sc = vec![0.0f32; nq_batch * top_k]; + for qi in 0..nq_batch { + let src_i = &out_indices_buf[qi * out_k..(qi + 1) * out_k]; + let src_s = &out_scores_buf[qi * out_k..(qi + 1) * out_k]; + let copy = src_i.len().min(top_k); + idx[qi * top_k..qi * top_k + copy].copy_from_slice(&src_i[..copy]); + sc[qi * top_k..qi * top_k + copy].copy_from_slice(&src_s[..copy]); + } + (idx, sc) + }) + }); + + finalize( + slug, + &samples, + preds, + dim, + n_docs, + n_queries, + top_k, + threads, + batch, + candidates, + bytes_per_vector, + index_total_mib, + build_seconds, + &cfg.dataset, + &cfg.split, + query_ids, + corpus_ids, + &cfg.out_dir, + simd, + encoder_sha, + timing_writer, + ); +} diff --git a/benchmarks/beir/README.md b/benchmarks/beir/README.md new file mode 100644 index 00000000..3f9887e4 --- /dev/null +++ b/benchmarks/beir/README.md @@ -0,0 +1,135 @@ +# ordvec BEIR benchmark harness + +Reproducible evaluation of ordvec's rank/sign retrieval on standard +[BEIR](https://github.com/beir-cellar/beir) datasets — quality (nDCG@10 vs +qrels) and latency (single-query / batched / threaded) — against an exact +inner-product baseline and a pure-Rust HNSW. The shared encoder is Microsoft +**Harrier** (`harrier-oss-v1-0.6b`, 1024-dim), run as GGUF `Q8_0`. + +All latency is measured in **one Rust process** (`benchmarks/beir-bench`); Python +only embeds the corpus, scores nDCG against qrels, and renders the figures. + +## Claims discipline + +> **Benchmark numbers in this repository reflect synthetic or user-runnable +> real-corpus experiments only. No numbers are fabricated or cherry-picked. 
+> Every result file produced by `make benchmark-beir` is fully reproducible +> from the commands documented here, using publicly available BEIR datasets and +> the pinned encoder revision recorded in `embeddings.manifest.json`.** + +> **The `flat` baseline is an exact full-float inner-product search (identical +> retrieval to FAISS `IndexFlatIP`) used for comparison purposes — it is NOT +> ground truth. nDCG@10 is computed against the official BEIR qrels +> (human-annotated relevance judgements), not against the `flat` results. +> Recall-vs-`flat` is an optional diagnostic only; it does not substitute for +> qrel-based evaluation.** + +## Dataset suite + +| Dataset | Domain | #Queries | #Corpus | +|------------|-------------------------------|---------:|--------:| +| scifact | Scientific claim verification | 300 | 5,183 | +| nfcorpus | Biomedical IR | 323 | 3,633 | +| fiqa | Financial QA | 648 | 57,638 | +| trec-covid | COVID-19 literature | 50 | 171,332 | + +Datasets are downloaded automatically on first run by a small vendored BEIR +reader (no `beir` PyPI package — it pulls an unbuildable `pytrec_eval`). The +default `make benchmark-beir` reproduces **scifact** (quality) + **trec-covid** +(scaling + latency); `nfcorpus`/`fiqa` are supported via `QUALITY_DATASETS=...`. + +## Encoder + +**Harrier (`harrier-oss-v1-0.6b`)** — a 600M-parameter bi-encoder producing +1024-dimensional L2-normalised float32 embeddings. The canonical lane runs the +**GGUF `Q8_0`** weights via `llama-cpp-python` (CUDA), last-token pooled. + +- Documents receive no instruction prefix. +- Queries are prefixed with + `"Instruct: Given a web search query, retrieve relevant passages that answer the query\nQuery: "`. +- The exact repo/file/quant + library versions are recorded in + `embeddings.manifest.json` per cache directory. + +Optional alternate encoder lanes (heavier; off by default): sentence-transformers +(`make bench-beir-prepare-st`) and Ollama (`make bench-beir-prepare-ollama`). 
+ +## Quick start + +```bash +make bench-beir-setup # Python deps + CUDA llama-cpp-python (built from source) +make benchmark-beir-smoke # quick end-to-end sanity (scifact only) +make benchmark-beir # quality (nDCG) + scaling sweep + three figures +``` + +`bench-beir-setup` installs `requirements.txt` and then builds `llama-cpp-python` +against the host CUDA toolkit (`CMAKE_ARGS="-DGGML_CUDA=on"`; override +`LLAMA_CMAKE_ARGS=` for a CPU-only build). + +## Methods (all measured in the Rust harness) + +| Method | Bytes/vec | Description | +|-------------------|----------:|-------------| +| `flat` | 4096 | Exact inner product (== FAISS `IndexFlatIP` math), pure-Rust SIMD GEMM. **Baseline, not ground truth.** | +| `hnsw` | 4096 | Pure-Rust HNSW (`hnsw_rs`, M=32, ef=128) — portable stand-in for C++ hnswlib. | +| `rq2` | 256 | RankQuant 2 bits/dim, asymmetric float-query LUT scan. | +| `rq4` | 512 | RankQuant 4 bits/dim, asymmetric float-query LUT scan. | +| `bitmap-rq2` | 384 | Two-stage: Bitmap candidate-gen → RankQuant-2 rerank. | +| `sign-rq2` | 384 | Two-stage: SignBitmap candidate-gen → RankQuant-2 rerank. | + +Thread/batch knobs (per `beir-bench`): `--threads N` pins query latency to a +rayon pool of N threads (index build still uses all cores); `--max-docs M` +sub-samples the corpus for the scaling sweep; `--batch` sets the matched batch. 
+ +## Cache layout + +One encoder run produces a directory per dataset/split: + +``` +.cache/ordvec-beir/<dataset>/<split>/encoder=<slug>/ + corpus.f32.npy # float32 (n_docs, 1024), L2-normalised, C-order + queries.f32.npy # float32 (n_queries, 1024), L2-normalised, C-order + corpus_ids.json # list[str], sorted(corpus.keys()) + query_ids.json # list[str], sorted(qrels.keys()) + qrels.json # {qid: {doc_id: int_relevance}} + texts.manifest.json # raw-text provenance + embeddings.manifest.json # encoder provider/model/quant/revision/dim/versions + sha256s.json # sha256 of each npy file +``` + +`prepare` skips re-embedding if these artefacts already exist (use `--force` to +re-embed). + +## Results layout + +``` +results/beir/<dataset>/ + <method>.topk.jsonl # one JSON line per query (full-corpus runs) + <method>.summary.json # aggregate latency + provenance (full-corpus runs) + timing.jsonl # one record per (method, n_docs, threads) — drives the plots +results/beir/figures/ # scaling_curve / bars_single_thread / bars_threaded (.png/.svg) +``` + +Top-k JSONL row schema (emitted with `serde_json`, so IDs are always valid JSON): + +```json +{"dataset":"scifact","split":"test","method":"ordvec-rq2", + "qid_idx":0,"qid":"1","k":100, + "doc_idxs":[42,7],"doc_ids":["abc","def"],"scores":[0.91,0.88]} +``` + +## `import ordvec` rule + +This harness is an **external benchmark driver**. Python prepares embeddings, +evaluates qrels, and renders plots; the ordvec hot path is the Rust `beir-bench` +binary. The Python `ordvec` package is intentionally **not** imported — so the +latency numbers reflect the crate, not the bindings, and the harness does not +even require the wheel to be installed. The `bench-beir-guardrail` Make target +(run automatically by `benchmark-beir`) fails with a clear error if any +`benchmarks/beir/*.py` file contains `import ordvec` / `from ordvec`.
+ +## Clean up + +```bash +make bench-beir-clean # remove result files + timing.jsonl, keep embedding cache +make bench-beir-clean-cache # remove embedding cache (re-encoding required) +``` diff --git a/benchmarks/beir/beir_eval.py b/benchmarks/beir/beir_eval.py new file mode 100644 index 00000000..13d0c429 --- /dev/null +++ b/benchmarks/beir/beir_eval.py @@ -0,0 +1,790 @@ +""" +beir_eval.py — Evaluate ordvec-beir top-k runs against BEIR qrels. + +Responsibilities (spec §9) +-------------------------- +1. Discover every ``<results-dir>/<dataset>/*.topk.jsonl`` file. +2. Build the run dict ``{qid: {doc_id: score}}`` for each method. +3. Evaluate against the cached BEIR qrels using ``pytrec_eval`` (the same + engine BEIR's ``EvaluateRetrieval`` wraps). Headline metric is nDCG@10; + secondary metrics are MAP@10, Recall@100, MRR@10, Precision@10. +4. Pull systems columns (bytes/vector, total MiB, build seconds, + p50/p95/p99 latency, queries/second) from each method's ``.summary.json``. +5. Run a *paired* bootstrap of every method vs the ``--baseline`` (faiss-flat): + resample queries with replacement ``--bootstrap-iters`` times (seeded), + compute the per-query metric delta (method - baseline), and report the + mean delta + 95% CI + ``within_noise``. +6. (Diagnostic, behind ``--include-ann-diagnostics``) ANN recall@100 vs the + baseline (overlap of top-100 doc sets). Kept OUT of the headline summary. +7. Emit ``summary.csv``, ``summary.json``, ``comparison-matrix.md``, + ``bootstrap.json`` and (via :mod:`beir_report`) ``summary.md``. + +This harness is an *external consumer* of ordvec — it MUST NOT ``import +ordvec``. It only reads cached artefacts and result files. + +CLI +--- +Run ``python beir_eval.py --help`` for full usage.
+""" + +from __future__ import annotations + +import argparse +import json +import pathlib +import sys +from typing import Any + +import numpy as np + +# Allow `from common import ...` when run as a script from the repo root +# (the Makefile invokes `python3 benchmarks/beir/