Centralize shared utilities across benchmark backends #2040
The PR adds one new file (+192 lines), a shared utilities module:

```python
#
# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION.
# SPDX-License-Identifier: Apache-2.0
#

"""
Shared utilities for cuvs-bench backends.

Provides common functions for Python-native backends (e.g., OpenSearch,
Elasticsearch):

- Vector file I/O: used internally by the Dataset class for transparent
  vector loading. The C++ backend does not use these since it passes file
  paths directly to the subprocess.
- Parameter expansion: converts YAML param specs into lists of param dicts.
- Recall computation: computes recall@k from neighbors and ground truth.

The dtype_from_filename function originates from generate_groundtruth/utils.py.
Note: generate_groundtruth/utils.py uses .hbin for float16 while the rest of
the codebase (get_dataset/fbin_to_f16bin.py, OpenSearch backend) uses .f16bin.
We standardize on .f16bin here to match the naming convention of the other
formats (.fbin, .i8bin, .u8bin).
"""

import itertools
import os
from typing import Any, Dict, List, Optional

import numpy as np


def dtype_from_filename(filename):
    """Map file extension to numpy dtype.

    Parameters
    ----------
    filename : str
        Path or filename with a supported extension.

    Returns
    -------
    numpy.dtype
        The corresponding numpy dtype.

    Raises
    ------
    RuntimeError
        If the file extension is not supported.
    """
    ext = os.path.splitext(filename)[1]
    if ext == ".fbin":
        return np.float32
    elif ext == ".f16bin":
        return np.float16
    elif ext == ".ibin":
        return np.int32
    elif ext == ".u8bin":
        return np.ubyte
    elif ext == ".i8bin":
        return np.byte
    else:
        raise RuntimeError(f"Unsupported file extension: {ext}")


def load_vectors(path: str, subset_size: Optional[int] = None) -> np.ndarray:
    """
    Read a binary vector file into a numpy array.

    Supports the standard big-ann-bench binary format used by cuvs-bench
    datasets: a 4-byte uint32 ``n_rows``, a 4-byte uint32 ``n_cols``,
    followed by ``n_rows * n_cols`` elements of the dtype inferred from
    the file extension via ``dtype_from_filename``.

    Parameters
    ----------
    path : str
        Path to the binary file. The dtype is inferred from the extension:
        ``.fbin`` (float32), ``.f16bin`` (float16), ``.u8bin`` (uint8),
        ``.i8bin`` (int8), ``.ibin`` (int32).
    subset_size : Optional[int]
        If provided, only the first ``subset_size`` rows are loaded.

    Returns
    -------
    np.ndarray
        Array of shape ``(n_rows, n_cols)`` with the inferred dtype.

    Raises
    ------
    FileNotFoundError
        If the file does not exist.
    RuntimeError
        If the file extension is not supported (raised by
        ``dtype_from_filename``).
    ValueError
        If ``subset_size`` is not positive or the file is truncated.
    """
    dtype = dtype_from_filename(path)
    if subset_size is not None and subset_size < 1:
        raise ValueError(
            f"subset_size must be a positive integer, got {subset_size}"
        )
    with open(path, "rb") as f:
        header = f.read(8)
        if len(header) < 8:
            raise ValueError(
                f"File too small to contain a valid header (expected 8 bytes, "
                f"got {len(header)}): {path}"
            )
        n_rows = int(np.frombuffer(header[:4], dtype=np.uint32)[0])
        n_cols = int(np.frombuffer(header[4:], dtype=np.uint32)[0])
        if subset_size is not None:
            n_rows = min(n_rows, subset_size)
```
**coderabbitai[bot]** commented on lines +96 to +110:

Validate `subset_size` type. Lines 96-110 only enforce a lower bound. Non-integer values (e.g., a float) pass the `< 1` check and only fail later, with a confusing error, once the row count enters the size arithmetic. Proposed fix:

```diff
 def load_vectors(path: str, subset_size: Optional[int] = None) -> np.ndarray:
@@
-    if subset_size is not None and subset_size < 1:
-        raise ValueError(
-            f"subset_size must be a positive integer, got {subset_size}"
-        )
+    if subset_size is not None:
+        if not isinstance(subset_size, (int, np.integer)):
+            raise ValueError(
+                f"subset_size must be an integer, got {type(subset_size).__name__}"
+            )
+        if subset_size < 1:
+            raise ValueError(
+                f"subset_size must be a positive integer, got {subset_size}"
+            )
```

As per coding guidelines, "Ensure missing validation does not cause crashes on invalid input through proper size/type checks."
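To make the failure mode concrete (hypothetical call; any valid vector file would do), a float clears the lower-bound check and only fails deep inside the read path:

```python
# 2.5 is not < 1, so validation passes; min(n_rows, 2.5) then makes the row
# count a float, expected_bytes becomes a float, and f.read() raises TypeError.
load_vectors("base.fbin", subset_size=2.5)
```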
The hunk continues:

```python
        # (load_vectors, continued)
        expected_bytes = n_rows * n_cols * np.dtype(dtype).itemsize
        raw = f.read(expected_bytes)
        if len(raw) < expected_bytes:
            raise ValueError(
                f"File is truncated: expected {expected_bytes} bytes of data "
                f"({n_rows} rows x {n_cols} cols x {np.dtype(dtype).itemsize} bytes), "
                f"got {len(raw)}: {path}"
            )
        data = np.frombuffer(raw, dtype=dtype)
        return data.reshape(n_rows, n_cols)
```
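For reference, a round-trip sketch of the on-disk layout (the temp path is illustrative): two uint32 header words for `n_rows` and `n_cols`, then the row-major payload whose dtype is implied by the extension:

```python
import numpy as np

# Write a 4x2 float32 file in the layout load_vectors expects (.fbin -> float32).
data = np.arange(8, dtype=np.float32).reshape(4, 2)
with open("/tmp/example.fbin", "wb") as f:
    f.write(np.array(data.shape, dtype=np.uint32).tobytes())  # n_rows, n_cols
    f.write(data.tobytes())                                   # row-major payload

loaded = load_vectors("/tmp/example.fbin")
assert loaded.shape == (4, 2) and loaded.dtype == np.float32

# subset_size trims to the leading rows without reading the rest of the file.
head = load_vectors("/tmp/example.fbin", subset_size=2)
assert head.shape == (2, 2)
```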
```python
def expand_param_grid(param_spec: Dict[str, List[Any]]) -> List[Dict[str, Any]]:
    """
    Expand a parameter specification into all combinations via Cartesian product.

    Takes a dict where each key maps to a list of values (as defined in
    algorithm YAML configs) and produces a list of dicts, one per combination.

    Parameters
    ----------
    param_spec : Dict[str, List[Any]]
        Parameter specification, e.g.,
        {"m": [16, 32], "ef_construction": [100, 200]}

    Returns
    -------
    List[Dict[str, Any]]
        List of parameter dicts, e.g.,
        [{"m": 16, "ef_construction": 100}, {"m": 16, "ef_construction": 200},
         {"m": 32, "ef_construction": 100}, {"m": 32, "ef_construction": 200}].
        Returns [{}] if param_spec is empty.
    """
    if not param_spec:
        return [{}]
    keys = list(param_spec.keys())
    return [dict(zip(keys, vals)) for vals in itertools.product(*param_spec.values())]
```
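As a usage sketch (the YAML fragment is illustrative, not taken from the repo's algorithm configs), a parsed param spec expands into one dict per combination:

```python
import yaml  # PyYAML

spec = yaml.safe_load("""
m: [16, 32]
ef_construction: [100, 200]
""")
grid = expand_param_grid(spec)
assert len(grid) == 4
assert {"m": 16, "ef_construction": 100} in grid
assert expand_param_grid({}) == [{}]  # empty spec -> single empty combination
```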
```python
def compute_recall(
    neighbors: np.ndarray, groundtruth: np.ndarray, k: int
) -> float:
    """
    Compute recall@k by comparing returned neighbors against ground truth.

    For each query, counts how many of the k returned neighbors appear
    in the ground truth set, then averages across all queries.

    Parameters
    ----------
    neighbors : np.ndarray
        Returned neighbor IDs, shape (n_queries, k)
    groundtruth : np.ndarray
        Ground truth neighbor IDs, shape (n_queries, gt_k)
    k : int
        Number of neighbors to evaluate

    Returns
    -------
    float
        Recall@k in range [0.0, 1.0]
    """
    if not isinstance(neighbors, np.ndarray) or not isinstance(groundtruth, np.ndarray):
        raise ValueError("neighbors and groundtruth must be numpy ndarrays")
    if neighbors.ndim != 2 or groundtruth.ndim != 2:
        raise ValueError(
            f"neighbors and groundtruth must be 2-D arrays, got "
            f"neighbors.shape={neighbors.shape}, groundtruth.shape={groundtruth.shape}"
        )
    n_queries = neighbors.shape[0]
    if groundtruth.shape[0] != n_queries:
        raise ValueError(
            f"Row count mismatch: neighbors has {n_queries} rows, "
            f"groundtruth has {groundtruth.shape[0]} rows"
        )
    gt_k = min(k, groundtruth.shape[1])
    if gt_k == 0 or n_queries == 0:
        return 0.0
    n_correct = sum(
        len(set(neighbors[i, :k].tolist()) & set(groundtruth[i, :gt_k].tolist()))
        for i in range(n_queries)
    )
    return n_correct / (n_queries * gt_k)
```
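A hand-checkable sketch: two queries at k=3, with two hits in the first row and one in the second, so recall is 3 / (2 * 3):

```python
import numpy as np

neighbors = np.array([[1, 2, 3], [4, 5, 6]])    # returned IDs, shape (2, 3)
groundtruth = np.array([[1, 2, 9], [4, 9, 8]])  # true IDs, shape (2, 3)

# Query 0 matches {1, 2}; query 1 matches {4}: 3 hits over 6 slots.
assert compute_recall(neighbors, groundtruth, k=3) == 0.5
```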
**coderabbitai[bot]** commented on lines +185 to +192 (conversation marked as resolved):

Reject negative `k`. Lines 185-192 compute `gt_k = min(k, groundtruth.shape[1])` without validating `k`, so a negative `k` slips past the `gt_k == 0` guard and silently produces a meaningless (negative) recall. Proposed fix:

```diff
 def compute_recall(
     neighbors: np.ndarray, groundtruth: np.ndarray, k: int
 ) -> float:
@@
+    if not isinstance(k, (int, np.integer)):
+        raise ValueError(f"k must be an integer, got {type(k).__name__}")
+    if k < 0:
+        raise ValueError(f"k must be >= 0, got {k}")
+
     n_queries = neighbors.shape[0]
@@
     gt_k = min(k, groundtruth.shape[1])
```

As per coding guidelines, "Provide clear and actionable error messages that include expected vs actual values where helpful."
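A quick sketch of the behavior this guards against: a negative `k` makes `gt_k` negative, the slices silently shrink, and the denominator flips sign:

```python
import numpy as np

neighbors = np.array([[1, 2, 3], [4, 5, 6]])
groundtruth = np.array([[1, 2, 3], [4, 5, 6]])

# Without the fix: gt_k = min(-1, 3) = -1, each slice drops its last column
# (2 matches per query), and the result is 4 / (2 * -1) = -2.0.
print(compute_recall(neighbors, groundtruth, k=-1))  # -2.0, not a valid recall
```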
**Reviewer:**

Why is `_groundtruth_neighbors` treated differently here than `_base_vectors` and `_query_vectors`?

**Author:**

`base_vectors` and `query_vectors` default to None in the constructor but are converted to `np.empty((0, 0))` internally so that the `dims`, `n_base`, and `n_queries` properties can safely call `.shape` and `.size` without None guards. `groundtruth_neighbors` stays as None because it's optional (not every dataset has ground truth), and those properties don't depend on it.

However, I can refactor both to match `groundtruth_neighbors` by defaulting to None instead and introducing a None guard in the `dims`, `n_base`, and `n_queries` properties. This would make all three fields consistent.
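For illustration, a minimal sketch of the two conventions being weighed; the class and property names here are hypothetical, since the Dataset implementation is not part of this hunk:

```python
from typing import Optional

import numpy as np


class Dataset:  # hypothetical sketch, not the PR's actual class
    def __init__(self, base_vectors: Optional[np.ndarray] = None):
        # Convention used for base/query vectors: replace None with an empty
        # 2-D sentinel so shape-based properties need no None guard.
        self._base_vectors = (
            base_vectors if base_vectors is not None else np.empty((0, 0))
        )
        # Convention used for ground truth: keep None, since it is optional
        # and no shape-based property depends on it.
        self._groundtruth_neighbors: Optional[np.ndarray] = None

    @property
    def n_base(self) -> int:
        return self._base_vectors.shape[0]  # sentinel makes this safe

    @property
    def has_groundtruth(self) -> bool:
        return self._groundtruth_neighbors is not None
```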