From fc4c26f539dcd60738a13a8c0ee6615a1151c0dd Mon Sep 17 00:00:00 2001 From: Todd Baur Date: Fri, 19 Jun 2026 16:00:19 -0700 Subject: [PATCH 1/3] feat(beir-bench): --ef-search flag for HNSW (was hardcoded const) HNSW ef_search was a compile-time const (128), so the harness ran HNSW at one fixed recall/latency operating point. Expose it as --ef-search so HNSW's own recall-vs-latency Pareto can be swept and compared fairly against ordvec's candidate-count sweep. Default unchanged (128). No other behavior change. Verified: ef=16 -> p50 0.32ms / nDCG 0.456; ef=256 -> 2.43ms / 0.470 on fiqa (the knob traces HNSW's frontier as intended). Co-Authored-By: Claude Opus 4.8 Signed-off-by: Todd Baur --- benchmarks/beir-bench/src/main.rs | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/benchmarks/beir-bench/src/main.rs b/benchmarks/beir-bench/src/main.rs index b931c6a..cba21ca 100644 --- a/benchmarks/beir-bench/src/main.rs +++ b/benchmarks/beir-bench/src/main.rs @@ -38,7 +38,6 @@ use hnsw_rs::prelude::*; // HNSW hyper-parameters (faithful to the prior "hnswlib M=32" comparison). 
const HNSW_M: usize = 32; const HNSW_EF_CONSTRUCTION: usize = 200; -const HNSW_EF_SEARCH: usize = 128; const HNSW_MAX_LAYER: usize = 16; // --------------------------------------------------------------------------- @@ -56,6 +55,7 @@ struct Config { out_dir: String, threads: usize, // 0 = all cores max_docs: Option<usize>, // None = full corpus + ef_search: usize, // HNSW query-time recall/latency knob (default 128) } fn parse_args() -> Config { @@ -76,10 +76,18 @@ fn parse_args() -> Config { let mut out_dir = String::from("results/beir"); let mut threads = 0usize; let mut max_docs: Option<usize> = None; + let mut ef_search = 128usize; let mut args = std::env::args().skip(1); while let Some(a) = args.next() { match a.as_str() { + "--ef-search" => { + ef_search = args + .next() + .expect("--ef-search requires a value") + .parse() + .expect("--ef-search must be an integer") + } "--cache-dir" => cache_dir = args.next().expect("--cache-dir requires a value"), "--dataset" => dataset = args.next().expect("--dataset requires a value"), "--split" => split = args.next().expect("--split requires a value"), @@ -148,6 +156,7 @@ fn parse_args() -> Config { out_dir, threads, max_docs, + ef_search, } } @@ -1032,7 +1041,7 @@ fn run_hnsw( timing_writer: &mut dyn Write, ) { let slug = "hnsw"; - eprintln!(" building HNSW M={HNSW_M} ef_c={HNSW_EF_CONSTRUCTION} ({n_docs} docs) ..."); + eprintln!(" building HNSW M={HNSW_M} ef_c={HNSW_EF_CONSTRUCTION} ef_s={} ({n_docs} docs) ...", cfg.ef_search); // DistL2 (not DistDot): embeddings are unit-normalized, so min-L2 ≡ max-dot ≡ // max-cosine — identical neighbors — but DistL2 avoids anndists' DistDot // `1-dot` distance assert, which panics on near-duplicate pairs whose float @@ -1070,7 +1079,7 @@ fn run_hnsw( // Single-thread: serial search per query. 
(bs..be) .map(|qi| { - hnsw.search(query_rows[qi], top_k, HNSW_EF_SEARCH) + hnsw.search(query_rows[qi], top_k, cfg.ef_search) .into_iter() .map(|nb| (nb.d_id as i64, -nb.distance)) .collect() @@ -1080,7 +1089,7 @@ fn run_hnsw( // Threaded: batched parallel search (rayon, this pool). let batch_slice: Vec<Vec<f32>> = (bs..be).map(|qi| query_rows[qi].to_vec()).collect(); - hnsw.parallel_search(&batch_slice, top_k, HNSW_EF_SEARCH) + hnsw.parallel_search(&batch_slice, top_k, cfg.ef_search) .into_iter() .map(|nbs| { nbs.into_iter() From cc7753a757f6d72aa7c8ee4de24375635376bab0 Mon Sep 17 00:00:00 2001 From: Todd Baur Date: Fri, 19 Jun 2026 16:20:30 -0700 Subject: [PATCH 2/3] fix(beir-bench): ef in HNSW slug + validate ef>=top_k (review feedback) Two issues raised on the --ef-search flag: 1. ef-sweep emitted all points under the fixed 'hnsw' slug with no ef field, so topk/summary/timing rows collided (full-corpus overwrite; timing dedup by method/n_docs) and the recall/latency frontier couldn't be reconstructed. Fix: slug is now 'hnsw_ef{N}' so every operating point is recorded distinctly. 2. hnsw_rs clamps ef=max(ef,knbn) internally, so --ef-search below --top-k would be silently bumped, flattening the low end of a sweep. Fix: clamp ef_search up to top_k explicitly with a warning, so the recorded ef matches what actually ran. Co-Authored-By: Claude Opus 4.8 Signed-off-by: Todd Baur --- benchmarks/beir-bench/src/main.rs | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/benchmarks/beir-bench/src/main.rs b/benchmarks/beir-bench/src/main.rs index cba21ca..36a5dae 100644 --- a/benchmarks/beir-bench/src/main.rs +++ b/benchmarks/beir-bench/src/main.rs @@ -144,6 +144,19 @@ fn parse_args() -> Config { assert!(batch >= 1, "--batch must be >= 1"); assert!(top_k >= 1, "--top-k must be >= 1"); assert!(candidates >= 1, "--candidates must be >= 1"); + // hnsw_rs requires ef_search >= the requested neighbour count (it internally + // clamps ef = max(ef, knbn)). 
An --ef-search below --top-k would otherwise be + // silently bumped, flattening an ef sweep at the low end. Clamp explicitly + + // warn so the sweep stays meaningful and the recorded ef matches what ran. + let ef_search = if ef_search < top_k { + eprintln!( + "warning: --ef-search {ef_search} < --top-k {top_k}; clamping ef_search to {top_k} \ + (hnsw_rs requires ef >= k)" + ); + top_k + } else { + ef_search + }; Config { cache_dir, @@ -1040,7 +1053,10 @@ fn run_hnsw( write_topk: bool, timing_writer: &mut dyn Write, ) { - let slug = "hnsw"; + // ef in the slug so an ef-sweep does not overwrite topk/summary/timing rows + // (each operating point on the recall/latency frontier is recorded distinctly). + let slug = format!("hnsw_ef{}", cfg.ef_search); + let slug = slug.as_str(); eprintln!(" building HNSW M={HNSW_M} ef_c={HNSW_EF_CONSTRUCTION} ef_s={} ({n_docs} docs) ...", cfg.ef_search); // DistL2 (not DistDot): embeddings are unit-normalized, so min-L2 ≡ max-dot ≡ // max-cosine — identical neighbors — but DistL2 avoids anndists' DistDot From c06270ea42a459f146c9a0afeaa1a400a2bf7144 Mon Sep 17 00:00:00 2001 From: Nelson Spence Date: Fri, 19 Jun 2026 19:22:42 -0500 Subject: [PATCH 3/3] Fix BEIR HNSW ef sweep consumers --- benchmarks/beir-bench/src/main.rs | 5 ++- benchmarks/beir/beir_eval.py | 5 +++ benchmarks/beir/beir_plot.py | 72 ++++++++++++++++++++++++++++--- 3 files changed, 75 insertions(+), 7 deletions(-) diff --git a/benchmarks/beir-bench/src/main.rs b/benchmarks/beir-bench/src/main.rs index 36a5dae..a12acc7 100644 --- a/benchmarks/beir-bench/src/main.rs +++ b/benchmarks/beir-bench/src/main.rs @@ -1057,7 +1057,10 @@ fn run_hnsw( // (each operating point on the recall/latency frontier is recorded distinctly). 
let slug = format!("hnsw_ef{}", cfg.ef_search); let slug = slug.as_str(); - eprintln!(" building HNSW M={HNSW_M} ef_c={HNSW_EF_CONSTRUCTION} ef_s={} ({n_docs} docs) ...", cfg.ef_search); + eprintln!( + " building HNSW M={HNSW_M} ef_c={HNSW_EF_CONSTRUCTION} ef_s={} ({n_docs} docs) ...", + cfg.ef_search + ); // DistL2 (not DistDot): embeddings are unit-normalized, so min-L2 ≡ max-dot ≡ // max-cosine — identical neighbors — but DistL2 avoids anndists' DistDot // `1-dot` distance assert, which panics on near-duplicate pairs whose float diff --git a/benchmarks/beir/beir_eval.py b/benchmarks/beir/beir_eval.py index f8ef8b5..22b3580 100644 --- a/benchmarks/beir/beir_eval.py +++ b/benchmarks/beir/beir_eval.py @@ -577,6 +577,11 @@ def write_csv( def method_stem(method_slug: str) -> str: """Strip ``-m`` / ``-b`` parameter suffixes from a method slug.""" + hnsw_ef_prefix = "hnsw_ef" + if method_slug.startswith(hnsw_ef_prefix): + suffix = method_slug[len(hnsw_ef_prefix):] + if suffix.isdigit(): + return "hnsw" parts = method_slug.split("-") kept = [ p diff --git a/benchmarks/beir/beir_plot.py b/benchmarks/beir/beir_plot.py index 74ddcf1..f83f801 100644 --- a/benchmarks/beir/beir_plot.py +++ b/benchmarks/beir/beir_plot.py @@ -47,6 +47,53 @@ ORDER = [s for s, _, _ in METHOD_STYLE] +def _hnsw_ef(slug: str) -> int | None: + prefix = "hnsw_ef" + if not slug.startswith(prefix): + return None + suffix = slug[len(prefix):] + return int(suffix) if suffix.isdigit() else None + + +def _method_family(slug: str) -> str: + if _hnsw_ef(slug) is not None: + return "hnsw" + return slug + + +def _method_label(slug: str) -> str: + ef = _hnsw_ef(slug) + if ef is not None: + return f"HNSW M=32 ef={ef} (4096 B + graph)" + return LABEL.get(slug, slug) + + +def _method_short_label(slug: str) -> str: + ef = _hnsw_ef(slug) + if ef is not None: + return f"HNSW ef={ef}" + return _method_label(slug).split(" (")[0] + + +def _method_color(slug: str) -> str: + return COLOR.get(_method_family(slug), 
"#777777") + + +def _method_order_key(slug: str) -> tuple[int, int, str]: + family = _method_family(slug) + try: + base_order = ORDER.index(family) + except ValueError: + base_order = len(ORDER) + ef = _hnsw_ef(slug) + return (base_order, ef if ef is not None else -1, slug) + + +def _ordered_methods(records: list[dict]) -> list[str]: + slugs = {r["method"] for r in records if "method" in r} + return sorted(slugs, key=_method_order_key) + + def _read_timing(path: pathlib.Path) -> list[dict]: records: list[dict] = [] with path.open("r", encoding="utf-8") as fh: @@ -82,7 +129,7 @@ def plot_scaling(records: list[dict], dataset: str, threads: int, batch: int, mode = "single-query (batch=1)" if batch == 1 else f"batched (batch={batch})" fig, ax = plt.subplots(figsize=(8.2, 5.0)) - for slug in ORDER: + for slug in _ordered_methods(recs): pts = sorted( ((r["n_docs"], r["query_latency_ms_p50"]) for r in recs if r["method"] == slug), key=lambda t: t[0], @@ -92,9 +139,22 @@ def plot_scaling(records: list[dict], dataset: str, threads: int, batch: int, if len(xs) < 2: continue if slug == "flat": - ax.axhline(1.0, color=COLOR[slug], ls="--", lw=1.2, label=LABEL[slug]) + ax.axhline( + 1.0, + color=_method_color(slug), + ls="--", + lw=1.2, + label=_method_label(slug), + ) else: - ax.plot(xs, ys, marker="o", lw=2.0, color=COLOR[slug], label=LABEL[slug]) + ax.plot( + xs, + ys, + marker="o", + lw=2.0, + color=_method_color(slug), + label=_method_label(slug), + ) ax.set_xscale("log") ax.set_yscale("log") @@ -124,15 +184,15 @@ def plot_bars(records: list[dict], dataset: str, threads: int, batch: int, n_doc lambda r: r["method"], ) by_method = {r["method"]: r for r in recs} - slugs = [s for s in ORDER if s in by_method] + slugs = [s for s in _ordered_methods(recs) if s in by_method] if not slugs: print(f"[plot] no records for {fname} (threads={threads}, n={n_docs})", file=sys.stderr) return p50 = [by_method[s]["query_latency_ms_p50"] for s in slugs] qps = 
[by_method[s]["queries_per_second"] for s in slugs] - colors = [COLOR[s] for s in slugs] - labels = [LABEL[s].split(" (")[0] for s in slugs] + colors = [_method_color(s) for s in slugs] + labels = [_method_short_label(s) for s in slugs] flat_p50 = by_method.get("flat", {}).get("query_latency_ms_p50")