diff --git a/docs/source/cuvs_bench/index.rst b/docs/source/cuvs_bench/index.rst index 2efa9ff86b..158f7a9d7a 100644 --- a/docs/source/cuvs_bench/index.rst +++ b/docs/source/cuvs_bench/index.rst @@ -113,22 +113,8 @@ The steps below demonstrate how to download, install, and run benchmarks on a su # (1) Prepare dataset. python -m cuvs_bench.get_dataset --dataset deep-image-96-angular --normalize -.. code-block:: python - # (2) Build and search index. - from cuvs_bench.orchestrator import BenchmarkOrchestrator - - orchestrator = BenchmarkOrchestrator(backend_type="cpp_gbench") - results = orchestrator.run_benchmark( - dataset="deep-image-96-inner", - algorithms="cuvs_cagra", - count=10, - batch_size=10, - build=True, - search=True, - ) - -.. code-block:: bash + python -m cuvs_bench.run --dataset deep-image-96-inner --algorithms cuvs_cagra --batch-size 10 -k 10 --build --search # (3) Export data. python -m cuvs_bench.run --data-export --dataset deep-image-96-inner @@ -210,22 +196,10 @@ The steps below demonstrate how to download, install, and run benchmarks on a su python -m cuvs_bench.split_groundtruth --groundtruth datasets/deep-1B/deep_new_groundtruth.public.10K.bin # two files 'groundtruth.neighbors.ibin' and 'groundtruth.distances.fbin' should be produced -.. code-block:: python +.. code-block:: bash # (2) Build and search index. - from cuvs_bench.orchestrator import BenchmarkOrchestrator - - orchestrator = BenchmarkOrchestrator(backend_type="cpp_gbench") - results = orchestrator.run_benchmark( - dataset="deep-1B", - algorithms="cuvs_cagra", - count=10, - batch_size=10, - build=True, - search=True, - ) - -.. code-block:: bash + python -m cuvs_bench.run --dataset deep-1B --algorithms cuvs_cagra --batch-size 10 -k 10 --build --search # (3) Export data. python -m cuvs_bench.run --data-export --dataset deep-1B diff --git a/python/cuvs_bench/cuvs_bench/backends/base.py b/python/cuvs_bench/cuvs_bench/backends/base.py index b208d6dfbb..f1d8dbcb9d 100644 --- a/python/cuvs_bench/cuvs_bench/backends/base.py +++ b/python/cuvs_bench/cuvs_bench/backends/base.py @@ -18,18 +18,28 @@ from ..orchestrator.config_loaders import IndexConfig -@dataclass class Dataset: """ Dataset representation for benchmarking. - Attributes + Supports two usage patterns: + + 1. File-path based (C++ backend): The orchestrator passes file paths + and the backend reads them directly via ``base_file``, ``query_file``, + etc. Vectors are never loaded into Python. + + 2. Array based (Python-native backends like OpenSearch, Elasticsearch): + Backends access ``base_vectors``, ``query_vectors``, etc. If vectors + were not provided directly but a file path exists, they are loaded + lazily on first access. This keeps file I/O invisible to backends. 
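+
+    For example (file names here are hypothetical), a file-backed dataset
+    performs no I/O until a backend first touches the vectors::
+
+        ds = Dataset(name="glove-100-inner", base_file="base.fbin")
+        ds.base_file       # just the path, no file access
+        ds.base_vectors    # loads base.fbin on first access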
+ + Parameters ---------- name : str Dataset name (e.g., "glove-100-inner") - base_vectors : np.ndarray + base_vectors : Optional[np.ndarray] Base vectors for index building, shape (n_vectors, dims) - query_vectors : np.ndarray + query_vectors : Optional[np.ndarray] Query vectors for search, shape (n_queries, dims) groundtruth_neighbors : Optional[np.ndarray] Ground truth neighbor IDs, shape (n_queries, k_gt) @@ -38,25 +48,108 @@ class Dataset: distance_metric : str Distance metric ("euclidean", "inner_product", "cosine") base_file : Optional[str] - Path to base vectors file (for C++ backend compatibility) + Path to base vectors file query_file : Optional[str] - Path to query vectors file (for C++ backend compatibility) + Path to query vectors file groundtruth_neighbors_file : Optional[str] - Path to ground truth neighbors file (for C++ backend compatibility) - metadata : Dict[str, Any] - Additional dataset metadata like {"source": "ann-benchmarks"} + Path to ground truth neighbors file + metadata : Optional[Dict[str, Any]] + Additional dataset metadata like {"subset_size": 10000} """ - name: str - base_vectors: np.ndarray - query_vectors: np.ndarray - groundtruth_neighbors: Optional[np.ndarray] = None - groundtruth_distances: Optional[np.ndarray] = None - distance_metric: str = "euclidean" - base_file: Optional[str] = None - query_file: Optional[str] = None - groundtruth_neighbors_file: Optional[str] = None - metadata: Dict[str, Any] = field(default_factory=dict) + def __init__( + self, + name: str, + base_vectors: Optional[np.ndarray] = None, + query_vectors: Optional[np.ndarray] = None, + groundtruth_neighbors: Optional[np.ndarray] = None, + groundtruth_distances: Optional[np.ndarray] = None, + distance_metric: str = "euclidean", + base_file: Optional[str] = None, + query_file: Optional[str] = None, + groundtruth_neighbors_file: Optional[str] = None, + metadata: Optional[Dict[str, Any]] = None, + ): + self.name = name + # Vectors are stored privately to support lazy loading. + # If the caller provides vectors directly, they are used as-is. + # If empty and a file path is provided, the corresponding property + # loads vectors from the file on first access. This keeps the + # loading logic invisible to backends: they just access + # dataset.base_vectors and get a numpy array regardless of whether + # it was passed in or loaded from disk. + self._base_vectors = ( + base_vectors if base_vectors is not None else np.empty((0, 0)) + ) + self._query_vectors = ( + query_vectors if query_vectors is not None else np.empty((0, 0)) + ) + self._groundtruth_neighbors = groundtruth_neighbors + self.groundtruth_distances = groundtruth_distances + self.distance_metric = distance_metric + self.base_file = base_file + self.query_file = query_file + self.groundtruth_neighbors_file = groundtruth_neighbors_file + self.metadata = metadata or {} + + @property + def base_vectors(self) -> np.ndarray: + """Base vectors for index building. + + Loaded from base_file on first access if not provided directly. + """ + if self._base_vectors.size == 0 and self.base_file: + from .utils import load_vectors + + self._base_vectors = load_vectors( + self.base_file, self.metadata.get("subset_size") + ) + return self._base_vectors + + @base_vectors.setter + def base_vectors(self, value: Optional[np.ndarray]) -> None: + """Set base vectors directly.""" + self._base_vectors = value if value is not None else np.empty((0, 0)) + + @property + def query_vectors(self) -> np.ndarray: + """Query vectors for search. 
+ + Loaded from query_file on first access if not provided directly. + """ + if self._query_vectors.size == 0 and self.query_file: + from .utils import load_vectors + + self._query_vectors = load_vectors(self.query_file) + return self._query_vectors + + @query_vectors.setter + def query_vectors(self, value: Optional[np.ndarray]) -> None: + """Set query vectors directly.""" + self._query_vectors = value if value is not None else np.empty((0, 0)) + + @property + def groundtruth_neighbors(self) -> Optional[np.ndarray]: + """Ground truth neighbor IDs. + + Loaded from groundtruth_neighbors_file on first access if not + provided directly. + """ + if ( + self._groundtruth_neighbors is None + and self.groundtruth_neighbors_file + ): + from .utils import load_vectors + + self._groundtruth_neighbors = load_vectors( + self.groundtruth_neighbors_file + ) + return self._groundtruth_neighbors + + @groundtruth_neighbors.setter + def groundtruth_neighbors(self, value: Optional[np.ndarray]) -> None: + """Set ground truth neighbors directly.""" + self._groundtruth_neighbors = value @property def dims(self) -> int: diff --git a/python/cuvs_bench/cuvs_bench/backends/utils.py b/python/cuvs_bench/cuvs_bench/backends/utils.py new file mode 100644 index 0000000000..b931f3128b --- /dev/null +++ b/python/cuvs_bench/cuvs_bench/backends/utils.py @@ -0,0 +1,202 @@ +# +# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. +# SPDX-License-Identifier: Apache-2.0 +# + +""" +Shared utilities for cuvs-bench backends. + +Provides common functions for Python-native backends (e.g., OpenSearch, +Elasticsearch): +- Vector file I/O: used internally by the Dataset class for transparent + vector loading. The C++ backend does not use these since it passes file + paths directly to the subprocess. +- Parameter expansion: converts YAML param specs into lists of param dicts. +- Recall computation: computes recall@k from neighbors and ground truth. + +The dtype_from_filename function originates from generate_groundtruth/utils.py. +Note: generate_groundtruth/utils.py uses .hbin for float16 while the rest of +the codebase (get_dataset/fbin_to_f16bin.py, OpenSearch backend) uses .f16bin. +We standardize on .f16bin here to match the naming convention of the other +formats (.fbin, .i8bin, .u8bin). +""" + +import itertools +import os +from typing import Any, Dict, List, Optional + +import numpy as np + + +def dtype_from_filename(filename): + """Map file extension to numpy dtype. + + Parameters + ---------- + filename : str + Path or filename with a supported extension. + + Returns + ------- + numpy.dtype + The corresponding numpy dtype. + + Raises + ------ + RuntimeError + If the file extension is not supported. + """ + ext = os.path.splitext(filename)[1] + if ext == ".fbin": + return np.float32 + if ext == ".f16bin": + return np.float16 + elif ext == ".ibin": + return np.int32 + elif ext == ".u8bin": + return np.ubyte + elif ext == ".i8bin": + return np.byte + else: + raise RuntimeError(f"Unsupported file extension: {ext}") + + +def load_vectors(path: str, subset_size: Optional[int] = None) -> np.ndarray: + """ + Read a binary vector file into a numpy array. + + Supports the standard big-ann-bench binary format used by cuvs-bench + datasets: a 4-byte uint32 ``n_rows``, a 4-byte uint32 ``n_cols``, + followed by ``n_rows * n_cols`` elements of the dtype inferred from + the file extension via ``dtype_from_filename``. + + Parameters + ---------- + path : str + Path to the binary file. 
The dtype is inferred from the extension:
+        ``.fbin`` (float32), ``.f16bin`` (float16), ``.u8bin`` (uint8),
+        ``.i8bin`` (int8), ``.ibin`` (int32).
+    subset_size : Optional[int]
+        If provided, only the first ``subset_size`` rows are loaded.
+
+    Returns
+    -------
+    np.ndarray
+        Array of shape ``(n_rows, n_cols)`` with the inferred dtype.
+
+    Raises
+    ------
+    FileNotFoundError
+        If the file does not exist.
+    RuntimeError
+        If the file extension is unsupported (raised by
+        ``dtype_from_filename``).
+    ValueError
+        If ``subset_size`` is not positive or the file is truncated.
+    """
+    dtype = dtype_from_filename(path)
+    if subset_size is not None and subset_size < 1:
+        raise ValueError(
+            f"subset_size must be a positive integer, got {subset_size}"
+        )
+    with open(path, "rb") as f:
+        header = f.read(8)
+        if len(header) < 8:
+            raise ValueError(
+                f"File too small to contain a valid header (expected 8 bytes, "
+                f"got {len(header)}): {path}"
+            )
+        n_rows = int(np.frombuffer(header[:4], dtype=np.uint32)[0])
+        n_cols = int(np.frombuffer(header[4:], dtype=np.uint32)[0])
+        if subset_size is not None:
+            n_rows = min(n_rows, subset_size)
+        expected_bytes = n_rows * n_cols * np.dtype(dtype).itemsize
+        raw = f.read(expected_bytes)
+        if len(raw) < expected_bytes:
+            raise ValueError(
+                f"File is truncated: expected {expected_bytes} bytes of data "
+                f"({n_rows} rows x {n_cols} cols x {np.dtype(dtype).itemsize} bytes), "
+                f"got {len(raw)}: {path}"
+            )
+        data = np.frombuffer(raw, dtype=dtype)
+    return data.reshape(n_rows, n_cols)
+
+
+def expand_param_grid(
+    param_spec: Dict[str, List[Any]],
+) -> List[Dict[str, Any]]:
+    """
+    Expand a parameter specification into all combinations via Cartesian product.
+
+    Takes a dict where each key maps to a list of values (as defined in
+    algorithm YAML configs) and produces a list of dicts, one per combination.
+
+    Parameters
+    ----------
+    param_spec : Dict[str, List[Any]]
+        Parameter specification, e.g., {"m": [16, 32], "ef_construction": [100, 200]}
+
+    Returns
+    -------
+    List[Dict[str, Any]]
+        List of parameter dicts, e.g.,
+        [{"m": 16, "ef_construction": 100}, {"m": 16, "ef_construction": 200},
+         {"m": 32, "ef_construction": 100}, {"m": 32, "ef_construction": 200}]
+        Returns [{}] if param_spec is empty.
+    """
+    if not param_spec:
+        return [{}]
+    keys = list(param_spec.keys())
+    return [
+        dict(zip(keys, vals))
+        for vals in itertools.product(*param_spec.values())
+    ]
+
+
+def compute_recall(
+    neighbors: np.ndarray, groundtruth: np.ndarray, k: int
+) -> float:
+    """
+    Compute recall@k by comparing returned neighbors against ground truth.
+
+    For each query, counts how many of the k returned neighbors appear
+    in the ground truth set, then averages across all queries.
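+
+    Concretely, with ``gt_k = min(k, groundtruth.shape[1])`` the value
+    computed below is::
+
+        recall@k = sum_i |set(neighbors[i, :k]) & set(groundtruth[i, :gt_k])|
+                   / (n_queries * gt_k)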
+ + Parameters + ---------- + neighbors : np.ndarray + Returned neighbor IDs, shape (n_queries, k) + groundtruth : np.ndarray + Ground truth neighbor IDs, shape (n_queries, gt_k) + k : int + Number of neighbors to evaluate + + Returns + ------- + float + Recall@k in range [0.0, 1.0] + """ + if not isinstance(neighbors, np.ndarray) or not isinstance( + groundtruth, np.ndarray + ): + raise ValueError("neighbors and groundtruth must be numpy ndarrays") + if neighbors.ndim != 2 or groundtruth.ndim != 2: + raise ValueError( + f"neighbors and groundtruth must be 2-D arrays, got " + f"neighbors.shape={neighbors.shape}, groundtruth.shape={groundtruth.shape}" + ) + n_queries = neighbors.shape[0] + if groundtruth.shape[0] != n_queries: + raise ValueError( + f"Row count mismatch: neighbors has {n_queries} rows, " + f"groundtruth has {groundtruth.shape[0]} rows" + ) + gt_k = min(k, groundtruth.shape[1]) + if gt_k == 0 or n_queries == 0: + return 0.0 + n_correct = sum( + len( + set(neighbors[i, :k].tolist()) + & set(groundtruth[i, :gt_k].tolist()) + ) + for i in range(n_queries) + ) + return n_correct / (n_queries * gt_k) diff --git a/python/cuvs_bench/cuvs_bench/orchestrator/__init__.py b/python/cuvs_bench/cuvs_bench/orchestrator/__init__.py index c6ad2db7f1..ecbcc94059 100644 --- a/python/cuvs_bench/cuvs_bench/orchestrator/__init__.py +++ b/python/cuvs_bench/cuvs_bench/orchestrator/__init__.py @@ -3,7 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 # -from .orchestrator import BenchmarkOrchestrator, run_benchmark +from .orchestrator import BenchmarkOrchestrator from .config_loaders import ConfigLoader, BenchmarkConfig, DatasetConfig, CppGBenchConfigLoader from ..backends.registry import ( get_backend_class, @@ -15,7 +15,6 @@ __all__ = [ # Main orchestrator "BenchmarkOrchestrator", - "run_benchmark", # Config loaders "ConfigLoader", "BenchmarkConfig", diff --git a/python/cuvs_bench/cuvs_bench/orchestrator/config_loaders.py b/python/cuvs_bench/cuvs_bench/orchestrator/config_loaders.py index 3d81d54f9a..6e5f0dd214 100644 --- a/python/cuvs_bench/cuvs_bench/orchestrator/config_loaders.py +++ b/python/cuvs_bench/cuvs_bench/orchestrator/config_loaders.py @@ -10,7 +10,6 @@ implementations that handle configuration loading and preprocessing. """ -import itertools import os import warnings from abc import ABC, abstractmethod @@ -21,6 +20,8 @@ import yaml +from ..backends.utils import expand_param_grid + @dataclass class IndexConfig: @@ -127,23 +128,99 @@ class ConfigLoader(ABC): """ Abstract base class for configuration loaders. - Each backend type has its own ConfigLoader that knows how to: - - Load configuration from files or other sources - - Expand parameter combinations - - Validate constraints - - Produce standardized BenchmarkConfig objects + Uses a template method pattern: load() handles the shared steps + (loading dataset YAML, looking up the dataset, constructing + DatasetConfig) then delegates to _build_benchmark_configs() for + backend-specific logic. Backend authors only implement + _build_benchmark_configs() and receive the DatasetConfig already built. + + Provides shared helper methods for common config-loading operations + (YAML loading, dataset lookup, algorithm config gathering) so that + individual loaders do not need to reimplement them. 
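+
+    A minimal subclass (names here are hypothetical) therefore only needs::
+
+        class MyLoader(ConfigLoader):
+            @property
+            def backend_type(self) -> str:
+                return "my_backend"
+
+            def _build_benchmark_configs(
+                self, dataset_config, dataset_conf,
+                dataset, dataset_path, **kwargs,
+            ):
+                return []  # backend-specific BenchmarkConfig list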
""" - @abstractmethod - def load(self, **kwargs) -> Tuple[DatasetConfig, List[BenchmarkConfig]]: + def load( + self, + dataset: str, + dataset_path: str, + **kwargs, + ) -> Tuple[DatasetConfig, List[BenchmarkConfig]]: """ Load and prepare benchmark configurations. + Handles shared config-loading steps (dataset YAML, dataset lookup, + DatasetConfig construction) then delegates to the backend-specific + _build_benchmark_configs() for producing BenchmarkConfig objects. + + Parameters + ---------- + dataset : str + Dataset name + dataset_path : str + Path to dataset directory + **kwargs + Backend-specific arguments passed through to + _build_benchmark_configs() + Returns ------- Tuple[DatasetConfig, List[BenchmarkConfig]] Dataset configuration and list of benchmark configurations to run """ + # Shared step 1: Load dataset YAML + ds_yaml_path = kwargs.get("dataset_configuration") or os.path.join( + self.config_path, "datasets", "datasets.yaml" + ) + ds_conf_all = self.load_yaml_file(ds_yaml_path) + + # Shared step 2: Find dataset by name + ds_conf = self.get_dataset_configuration(dataset, ds_conf_all) + + # Shared step 3: Construct DatasetConfig with resolved paths + dataset_config = self.build_dataset_config( + ds_conf, dataset_path, kwargs.get("subset_size") + ) + + # Backend-specific step: produce BenchmarkConfig list + benchmark_configs = self._build_benchmark_configs( + dataset_config, ds_conf, dataset, dataset_path, **kwargs + ) + return dataset_config, benchmark_configs + + @abstractmethod + def _build_benchmark_configs( + self, + dataset_config: DatasetConfig, + dataset_conf: dict, + dataset: str, + dataset_path: str, + **kwargs, + ) -> List[BenchmarkConfig]: + """ + Build backend-specific benchmark configurations. + + Called by load() after the shared steps are complete. Each backend + implements this with its own logic (executable discovery for C++, + connection params for OpenSearch, etc.). + + Parameters + ---------- + dataset_config : DatasetConfig + Already-constructed dataset configuration + dataset_conf : dict + Raw dataset dict from YAML (for backends that need extra fields) + dataset : str + Dataset name + dataset_path : str + Path to dataset directory + **kwargs + Backend-specific arguments + + Returns + ------- + List[BenchmarkConfig] + List of benchmark configurations to run + """ pass @property @@ -152,6 +229,136 @@ def backend_type(self) -> str: """Return the backend type this loader is for (e.g., 'cpp_gbench').""" pass + def build_dataset_config( + self, + dataset_conf: dict, + dataset_path: Optional[str] = None, + subset_size: Optional[int] = None, + ) -> DatasetConfig: + """ + Construct a DatasetConfig from a dataset YAML dict. + + Resolves relative file paths against dataset_path if provided. 
+ + Parameters + ---------- + dataset_conf : dict + Dataset configuration dict from datasets.yaml + dataset_path : Optional[str] + Base path for resolving relative file paths + subset_size : Optional[int] + Limit dataset to first N vectors + + Returns + ------- + DatasetConfig + """ + + def _resolve(rel): + if rel and dataset_path and not os.path.isabs(rel): + return os.path.join(dataset_path, rel) + return rel + + return DatasetConfig( + name=dataset_conf["name"], + base_file=_resolve(dataset_conf.get("base_file")), + query_file=_resolve(dataset_conf.get("query_file")), + groundtruth_neighbors_file=_resolve( + dataset_conf.get("groundtruth_neighbors_file") + ), + distance=dataset_conf.get("distance", "euclidean"), + dims=dataset_conf.get("dims"), + subset_size=subset_size, + ) + + def load_yaml_file(self, file_path: str) -> dict: + """ + Load a YAML file and return its contents as a dictionary. + + Parameters + ---------- + file_path : str + The path to the YAML file. + + Returns + ------- + dict + The contents of the YAML file. + """ + with open(file_path, "r") as f: + return yaml.safe_load(f) + + def get_dataset_configuration( + self, dataset: str, dataset_conf_all: list + ) -> dict: + """ + Retrieve the configuration for a specific dataset. + + Parameters + ---------- + dataset : str + The name of the dataset to retrieve the configuration for. + dataset_conf_all : list + A list of dataset configurations. + + Returns + ------- + dict + The configuration for the specified dataset. + + Raises + ------ + ValueError + If the dataset configuration is not found. + """ + for d in dataset_conf_all: + if dataset == d["name"]: + return d + raise ValueError( + f"Could not find a dataset configuration for '{dataset}'" + ) + + def gather_algorithm_configs( + self, config_path: str, algorithm_configuration: Optional[str] + ) -> list: + """ + Gather the list of algorithm configuration files. + + Parameters + ---------- + config_path : str + The path to the config directory. + algorithm_configuration : Optional[str] + The path to the algorithm configuration directory or file. + + Returns + ------- + list + A list of paths to the algorithm configuration files. 
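+
+        Only ``.yaml``/``.yml`` files under ``<config_path>/algos`` are
+        collected; ``algorithm_configuration`` may point at a single YAML
+        file or a directory of them, and a nonexistent path only triggers
+        a warning.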
+ """ + algos_conf_fs = os.listdir(os.path.join(config_path, "algos")) + algos_conf_fs = [ + os.path.join(config_path, "algos", f) + for f in algos_conf_fs + if f.endswith((".yaml", ".yml")) + ] + + if algorithm_configuration: + if os.path.isdir(algorithm_configuration): + algos_conf_fs += [ + os.path.join(algorithm_configuration, f) + for f in os.listdir(algorithm_configuration) + if f.endswith((".yaml", ".yml")) + ] + elif os.path.isfile(algorithm_configuration): + algos_conf_fs.append(algorithm_configuration) + else: + warnings.warn( + f"algorithm_configuration path does not exist: " + f"{algorithm_configuration}" + ) + return algos_conf_fs + class CppGBenchConfigLoader(ConfigLoader): """ @@ -196,94 +403,53 @@ def gpu_present(self) -> bool: self._gpu_present = False return self._gpu_present - def load( + def _build_benchmark_configs( self, + dataset_config: DatasetConfig, + dataset_conf: dict, dataset: str, dataset_path: str, - dataset_configuration: Optional[str] = None, - algorithm_configuration: Optional[str] = None, - algorithms: Optional[str] = None, - groups: Optional[str] = None, - algo_groups: Optional[str] = None, - count: int = 10, - batch_size: int = 10000, - subset_size: Optional[int] = None, **kwargs, - ) -> Tuple[DatasetConfig, List[BenchmarkConfig]]: + ) -> List[BenchmarkConfig]: """ - Load C++ benchmark configurations from YAML files. + Build C++ benchmark configurations. Parameters ---------- + dataset_config : DatasetConfig + Already-constructed dataset configuration + dataset_conf : dict + Raw dataset dict from YAML dataset : str Dataset name dataset_path : str Path to dataset directory - dataset_configuration : Optional[str] - Path to dataset configuration file - algorithm_configuration : Optional[str] - Path to algorithm configuration directory or file - algorithms : Optional[str] - Comma-separated list of algorithms to run - groups : Optional[str] - Comma-separated list of groups to run - algo_groups : Optional[str] - Comma-separated list of algorithm groups - count : int - Number of neighbors (k) - batch_size : int - Batch size for search - subset_size : Optional[int] - Dataset subset size **kwargs - Additional keyword arguments. In tune mode, the orchestrator - passes these internal kwargs: - - - _tune_mode : bool - If True, returns single config with exact params instead of - Cartesian product expansion from YAML. - - _tune_build_params : dict - Exact build parameters suggested by Optuna - (e.g., {"nlist": 5347}) - - _tune_search_params : dict - Exact search parameters suggested by Optuna - (e.g., {"nprobe": 73}) + C++ specific arguments: algorithm_configuration, algorithms, + groups, algo_groups, count, batch_size, subset_size, + and tune mode kwargs (_tune_mode, _tune_build_params, + _tune_search_params) Returns ------- - Tuple[DatasetConfig, List[BenchmarkConfig]] - Dataset config and list of benchmark configs. 
- - Sweep mode: Multiple configs (Cartesian product of YAML params) - - Tune mode: Single config (exact Optuna-suggested params) + List[BenchmarkConfig] + Benchmark configs grouped by executable """ # Extract tune mode kwargs (passed by orchestrator._run_trial) tune_mode = kwargs.pop("_tune_mode", False) tune_build_params = kwargs.pop("_tune_build_params", None) tune_search_params = kwargs.pop("_tune_search_params", None) - config_path = self.config_path + algorithm_configuration = kwargs.get("algorithm_configuration") + algorithms = kwargs.get("algorithms") + groups = kwargs.get("groups") + algo_groups = kwargs.get("algo_groups") + count = kwargs.get("count", 10) + batch_size = kwargs.get("batch_size", 10000) + subset_size = kwargs.get("subset_size") + executable_dir = kwargs.get("executable_dir") - # Load dataset configuration - dataset_conf_all = self.load_yaml_file( - dataset_configuration - or os.path.join(config_path, "datasets", "datasets.yaml") - ) - dataset_conf = self.get_dataset_configuration( - dataset, dataset_conf_all - ) - - # Create DatasetConfig - dataset_config = DatasetConfig( - name=dataset_conf["name"], - base_file=dataset_conf.get("base_file"), - query_file=dataset_conf.get("query_file"), - groundtruth_neighbors_file=dataset_conf.get( - "groundtruth_neighbors_file" - ), - distance=dataset_conf.get("distance", "euclidean"), - dims=dataset_conf.get("dims"), - subset_size=subset_size, - ) + config_path = self.config_path # Prepare conf_file for constraint validation conf_file = {"dataset": dataset_conf} @@ -332,9 +498,10 @@ def load( tune_mode=tune_mode, tune_build_params=tune_build_params, tune_search_params=tune_search_params, + executable_dir=executable_dir, ) - return dataset_config, benchmark_configs + return benchmark_configs def _prepare_benchmark_configs( self, @@ -349,6 +516,7 @@ def _prepare_benchmark_configs( tune_mode: bool = False, tune_build_params: dict = None, tune_search_params: dict = None, + executable_dir: Optional[str] = None, ) -> List[BenchmarkConfig]: """ Prepare list of BenchmarkConfig from algorithm configurations. @@ -373,7 +541,12 @@ def _prepare_benchmark_configs( try: executable, executable_path, file_name = ( self.find_executable( - algos_yaml, algo, group, count, batch_size + algos_yaml, + algo, + group, + count, + batch_size, + executable_dir, ) ) except FileNotFoundError: @@ -448,93 +621,9 @@ def _prepare_benchmark_configs( return benchmark_configs # ========================================================================= - # `Helper methods (copied from run.py as-is)` + # C++ specific helper methods # ========================================================================= - def load_yaml_file(self, file_path: str) -> dict: - """ - Load a YAML file and return its contents as a dictionary. - - Parameters - ---------- - file_path : str - The path to the YAML file. - - Returns - ------- - dict - The contents of the YAML file. - """ - with open(file_path, "r") as f: - return yaml.safe_load(f) - - def get_dataset_configuration( - self, dataset: str, dataset_conf_all: list - ) -> dict: - """ - Retrieve the configuration for a specific dataset. - - Parameters - ---------- - dataset : str - The name of the dataset to retrieve the configuration for. - dataset_conf_all : list - A list of dataset configurations. - - Returns - ------- - dict - The configuration for the specified dataset. - - Raises - ------ - ValueError - If the dataset configuration is not found. 
- """ - for d in dataset_conf_all: - if dataset == d["name"]: - return d - raise ValueError("Could not find a dataset configuration") - - def gather_algorithm_configs( - self, config_path: str, algorithm_configuration: Optional[str] - ) -> list: - """ - Gather the list of algorithm configuration files. - - Parameters - ---------- - config_path : str - The path to the config directory. - algorithm_configuration : Optional[str] - The path to the algorithm configuration directory or file. - - Returns - ------- - list - A list of paths to the algorithm configuration files. - """ - algos_conf_fs = os.listdir(os.path.join(config_path, "algos")) - algos_conf_fs = [ - os.path.join(config_path, "algos", f) - for f in algos_conf_fs - if ".json" not in f - and "constraint" not in f - and ".py" not in f - and "__pycache__" not in f - ] - - if algorithm_configuration: - if os.path.isdir(algorithm_configuration): - algos_conf_fs += [ - os.path.join(algorithm_configuration, f) - for f in os.listdir(algorithm_configuration) - if ".json" not in f - ] - elif os.path.isfile(algorithm_configuration): - algos_conf_fs.append(algorithm_configuration) - return algos_conf_fs - def load_algorithms_conf( self, algos_conf_fs: list, @@ -620,7 +709,13 @@ def validate_algorithm( ) def find_executable( - self, algos_conf: dict, algo: str, group: str, k: int, batch_size: int + self, + algos_conf: dict, + algo: str, + group: str, + k: int, + batch_size: int, + executable_dir: Optional[str] = None, ) -> Tuple[str, str, Tuple[str, str]]: """ Find the executable for the given algorithm and group. @@ -637,6 +732,8 @@ def find_executable( The number of nearest neighbors to search for. batch_size : int The size of each batch for processing. + executable_dir : Optional[str] + User-specified directory to search first. Returns ------- @@ -646,12 +743,14 @@ def find_executable( """ executable = algos_conf[algo]["executable"] file_name = (f"{algo},{group}", f"{algo},{group},k{k},bs{batch_size}") - build_path = self.get_build_path(executable) + build_path = self.get_build_path(executable, executable_dir) if build_path: return executable, build_path, file_name raise FileNotFoundError(executable) - def get_build_path(self, executable: str) -> Optional[str]: + def get_build_path( + self, executable: str, executable_dir: Optional[str] = None + ) -> Optional[str]: """ Get the build path for the given executable. @@ -659,12 +758,20 @@ def get_build_path(self, executable: str) -> Optional[str]: ---------- executable : str The name of the executable. + executable_dir : Optional[str] + User-specified directory to search first. If provided and the + executable exists there, it is used before auto-discovery. Returns ------- Optional[str] The build path for the executable, if found. 
""" + if executable_dir is not None: + build_path = os.path.join(executable_dir, executable) + if os.path.exists(build_path): + print(f"-- Using cuVS bench from {build_path}.") + return build_path devcontainer_path = "/home/coder/cuvs/cpp/build/latest/bench/ann" if os.path.exists(devcontainer_path): @@ -747,40 +854,19 @@ def prepare_indexes( search_params = group_conf.get("search", {}) if tune_mode and tune_build_params is not None: - # TUNE MODE: Use exact params from Optuna (single config) - param_names = list(tune_build_params.keys()) - all_build_params = [tuple(tune_build_params.values())] - # For search params, use tune_search_params keys/values - if tune_search_params: - search_param_names = tuple(tune_search_params.keys()) - search_param_lists = [[v] for v in tune_search_params.values()] - else: - search_param_names, search_param_lists = [], [] - else: - # SWEEP MODE: Cartesian product of all YAML params - all_build_params = itertools.product(*build_params.values()) - search_param_names, search_param_lists = ( - zip(*search_params.items()) if search_params else ([], []) + all_build_params = [tune_build_params.copy()] + all_search_params = ( + [tune_search_params.copy()] if tune_search_params else [{}] ) - param_names = list(build_params.keys()) - for params in all_build_params: - # Build the build_param dict using appropriate keys - if tune_mode and tune_build_params is not None: - # Tune mode: use exact params from Optuna - index = { - "algo": algo, - "build_param": tune_build_params.copy(), - } - else: - # Sweep mode: use YAML param names - index = { - "algo": algo, - "build_param": dict(zip(param_names, params)), - } + else: + all_build_params = expand_param_grid(build_params) + all_search_params = expand_param_grid(search_params) + + for build_param in all_build_params: + index = {"algo": algo, "build_param": build_param} - # Build index name from params index_name = f"{algo}_{group}" if group != "base" else f"{algo}" - for name, val in zip(param_names, params): + for name, val in build_param.items(): index_name += f".{name}{val}" # Skip constraint validation in tune mode (Optuna handles bounds) @@ -789,7 +875,7 @@ def prepare_indexes( algos_conf, algo, "build", - index["build_param"], + build_param, None, conf_file["dataset"].get("dims"), count, @@ -807,16 +893,12 @@ def prepare_indexes( # Handle search params if tune_mode and tune_search_params is not None: - # Tune mode: use exact search params from Optuna (as single-item list) index["search_params"] = [tune_search_params.copy()] else: - # Sweep mode: validate and expand search params index["search_params"] = self.validate_search_params( - itertools.product(*search_param_lists), - search_param_names, - index["build_param"], + all_search_params, + build_param, algo, - group_conf, algos_conf, conf_file, count, @@ -830,29 +912,26 @@ def prepare_indexes( def validate_search_params( self, all_search_params, - search_param_names, build_params, algo, - group_conf, algos_conf, conf_file, count, batch_size, ) -> list: """ - Validate and prepare the search parameters for the given algorithm - and group. + Validate and filter search parameter combinations. Parameters ---------- - all_search_params : itertools.product - The Cartesian product of search parameter values. - search_param_names : list - The names of the search parameters. + all_search_params : List[Dict[str, Any]] + List of search parameter dicts (from expand_param_grid). + build_params : dict + The build parameters for the current index. algo : str The name of the algorithm. 
- group_conf : dict - The configuration for the algorithm group. + algos_conf : dict + The loaded algorithm configurations. conf_file : dict The main configuration file. count : int @@ -863,11 +942,10 @@ def validate_search_params( Returns ------- list - A list of validated search parameters. + A list of validated search parameter dicts. """ search_params_list = [] - for search_params in all_search_params: - search_dict = dict(zip(search_param_names, search_params)) + for search_dict in all_search_params: if self.validate_constraints( algos_conf, algo, diff --git a/python/cuvs_bench/cuvs_bench/orchestrator/orchestrator.py b/python/cuvs_bench/cuvs_bench/orchestrator/orchestrator.py index 091290979c..d76127d3b2 100644 --- a/python/cuvs_bench/cuvs_bench/orchestrator/orchestrator.py +++ b/python/cuvs_bench/cuvs_bench/orchestrator/orchestrator.py @@ -20,6 +20,7 @@ get_config_loader, list_backends, ) +from ..backends.utils import compute_recall from .config_loaders import DatasetConfig @@ -253,6 +254,19 @@ def _run_sweep( search_threads=search_threads, dry_run=dry_run, ) + + # Compute recall for backends that return actual neighbors. + # The C++ backend computes recall in the subprocess and returns + # empty neighbors, so this is skipped for it. + # Note: The recall computation relies on empty neighbors to + # distinguish backends that already computed recall. + if search_result.success and search_result.neighbors.size > 0: + gt = bench_dataset.groundtruth_neighbors + if gt is not None: + search_result.recall = compute_recall( + search_result.neighbors, gt, count + ) + results.append(search_result) if not search_result.success: @@ -575,6 +589,14 @@ def _run_trial( dry_run=dry_run, ) + # Compute recall for backends that return actual neighbors. + # Note: The recall computation relies on empty neighbors to + # distinguish backends that already computed recall. + if result.success and result.neighbors.size > 0: + gt = bench_dataset.groundtruth_neighbors + if gt is not None: + result.recall = compute_recall(result.neighbors, gt, count) + return result def _create_dataset(self, dataset_config: DatasetConfig) -> Dataset: @@ -618,98 +640,3 @@ def available_backends() -> List[str]: List of registered backend names """ return list(list_backends().keys()) - - -# ============================================================================ -# Standalone function for backward compatibility -# ============================================================================ - - -def run_benchmark( - subset_size: int = None, - count: int = 10, - batch_size: int = 10000, - dataset_configuration: Optional[str] = None, - configuration: Optional[str] = None, - dataset: str = None, - dataset_path: str = None, - build: Optional[bool] = None, - search: Optional[bool] = None, - algorithms: Optional[str] = None, - groups: Optional[str] = None, - algo_groups: Optional[str] = None, - force: bool = False, - search_mode: str = "latency", - search_threads: Optional[int] = None, - dry_run: bool = False, - data_export: bool = False, - backend_type: str = "cpp_gbench", -) -> List[Union[BuildResult, SearchResult]]: - """ - Standalone function for backward compatibility with run.py interface. - - Parameters - ---------- - subset_size : int - The subset size of the dataset. - count : int - The number of nearest neighbors to search for. - batch_size : int - The size of each batch for processing. - dataset_configuration : Optional[str] - Path to the dataset configuration file. 
- configuration : Optional[str] - Path to the algorithm configuration directory or file. - dataset : str - The name of the dataset to use. - dataset_path : str - The path to the dataset directory. - build : Optional[bool] - Whether to build the indices. - search : Optional[bool] - Whether to perform the search. - algorithms : Optional[str] - Comma-separated list of algorithm names to use. - groups : str - Comma-separated list of groups to consider. - algo_groups : Optional[str] - Comma-separated list of algorithm groups to consider. - force : bool - Whether to force the execution regardless of warnings. - search_mode : str - The mode of search to perform. - search_threads : int - The number of threads to use for searching. - dry_run : bool - Whether to perform a dry run without actual execution. - data_export : bool - Whether to export data (unused, kept for compatibility). - backend_type : str - Type of backend to use (default: 'cpp_gbench') - - Returns - ------- - List[Union[BuildResult, SearchResult]] - List of all build and search results. - """ - orchestrator = BenchmarkOrchestrator(backend_type=backend_type) - - return orchestrator.run_benchmark( - build=build if build is not None else True, - search=search if search is not None else True, - force=force, - dry_run=dry_run, - count=count, - batch_size=batch_size, - search_mode=search_mode, - search_threads=search_threads, - # ConfigLoader-specific kwargs (for cpp_gbench) - dataset=dataset, - dataset_path=dataset_path, - dataset_configuration=dataset_configuration, - configuration=configuration, - algorithms=algorithms, - groups=groups, - algo_groups=algo_groups, - subset_size=subset_size, - ) diff --git a/python/cuvs_bench/cuvs_bench/run/__main__.py b/python/cuvs_bench/cuvs_bench/run/__main__.py index 47c41a903e..6950ff7202 100644 --- a/python/cuvs_bench/cuvs_bench/run/__main__.py +++ b/python/cuvs_bench/cuvs_bench/run/__main__.py @@ -3,15 +3,16 @@ # SPDX-License-Identifier: Apache-2.0 # +import json import os -import warnings from pathlib import Path from typing import Optional import click +import yaml from .data_export import convert_json_to_csv_build, convert_json_to_csv_search -from .run import run_benchmark +from ..orchestrator import BenchmarkOrchestrator @click.command() @@ -150,6 +151,36 @@ "were interrupted, use this option to convert those intermediate " "files manually.", ) +@click.option( + "--mode", + type=click.Choice(["sweep", "tune"], case_sensitive=False), + default="sweep", + show_default=True, + help="Benchmark mode: 'sweep' runs all parameter combinations from " + "YAML configs. 'tune' uses Optuna to intelligently search the " + "parameter space (requires --constraints).", +) +@click.option( + "--constraints", + default=None, + help="Tune mode constraints as JSON string. One metric should have " + "'maximize' or 'minimize', others have min/max bounds. " + 'Example: \'{"recall": "maximize", "latency": {"max": 10}}\'', +) +@click.option( + "--n-trials", + type=int, + default=None, + help="Number of Optuna trials for tune mode (default: 100).", +) +@click.option( + "--backend-config", + default=None, + help="Path to YAML configuration file for non-C++ backends. " + "The file must contain a 'backend' field specifying the backend " + "type (e.g., 'opensearch', 'elastic'). 
All other fields are " + "passed as backend-specific parameters.", +) def main( subset_size: Optional[int], count: int, @@ -169,6 +200,10 @@ def main( search_threads: Optional[str], dry_run: bool, data_export: bool, + mode: str, + constraints: Optional[str], + n_trials: Optional[int], + backend_config: Optional[str], ) -> None: """ Main function to run the benchmark with the provided options. @@ -207,17 +242,64 @@ def main( The number of threads to use for throughput benchmark. dry_run : bool Whether to perform a dry run without actual execution. + data_export : bool + Whether to export intermediate JSON results to CSV. + mode : str + Benchmark mode: 'sweep' (exhaustive) or 'tune' (Optuna-based). + constraints : Optional[str] + Tune mode constraints as JSON string. + n_trials : Optional[int] + Number of Optuna trials for tune mode. + backend_config : Optional[str] + Path to YAML config for non-C++ backends. If not provided, + defaults to the C++ Google Benchmark backend. The YAML file + must contain a 'backend' field (e.g., 'opensearch', 'elastic') + and any backend-specific connection parameters (host, port, etc.). """ - warnings.warn( - "The 'cuvs_bench.run' CLI is deprecated and will be removed in a future release. " - "Use BenchmarkOrchestrator from cuvs_bench.orchestrator instead.", - FutureWarning, - stacklevel=2, - ) - if not data_export: - run_benchmark(**locals()) + # Determine backend type and extra kwargs from --backend-config + backend_type = "cpp_gbench" + backend_kwargs = {} + if backend_config: + with open(backend_config, "r") as f: + cfg = yaml.safe_load(f) + if not isinstance(cfg, dict): + raise ValueError( + f"--backend-config must parse to a mapping, " + f"got {type(cfg).__name__}" + ) + if "backend" not in cfg: + raise ValueError( + "--backend-config must include a 'backend' field" + ) + backend_type = cfg.pop("backend") + backend_kwargs = cfg + + orchestrator = BenchmarkOrchestrator(backend_type=backend_type) + orchestrator.run_benchmark( + mode=mode, + constraints=json.loads(constraints) if constraints else None, + n_trials=n_trials, + dataset=dataset, + dataset_path=dataset_path, + build=build, + search=search, + force=force, + dry_run=dry_run, + count=count, + batch_size=batch_size, + search_mode=search_mode, + search_threads=search_threads, + dataset_configuration=dataset_configuration, + algorithm_configuration=configuration, + algorithms=algorithms, + groups=groups, + algo_groups=algo_groups, + subset_size=subset_size, + executable_dir=executable_dir, + **backend_kwargs, + ) convert_json_to_csv_build(dataset, dataset_path) convert_json_to_csv_search(dataset, dataset_path) diff --git a/python/cuvs_bench/cuvs_bench/tests/test_cli.py b/python/cuvs_bench/cuvs_bench/tests/test_cli.py index c3f8d43856..fa2a63d041 100644 --- a/python/cuvs_bench/cuvs_bench/tests/test_cli.py +++ b/python/cuvs_bench/cuvs_bench/tests/test_cli.py @@ -517,3 +517,169 @@ def test_plot_command_creates_png_files(temp_datasets_dir: Path): assert file_path.stat().st_size > 0, ( f"Expected file {filename} is empty." ) + + +# FIXME: Tests below use --dry-run to verify CLI flag parsing and orchestrator +# routing without requiring actual benchmark execution. Tune mode (--mode tune) +# requires Optuna and actual search results, so only flag acceptance is tested +# here. End-to-end tests for tune mode and non-C++ backends should be added +# when those features are exercised in integration testing. 
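+#
+# For reference, a minimal --backend-config YAML (the connection values
+# below are hypothetical) looks like:
+#
+#     backend: opensearch
+#     host: localhost
+#     port: 9200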
+ + +def test_run_with_mode_sweep(temp_datasets_dir): + """Verify --mode sweep is accepted (default behavior).""" + from cuvs_bench.run.__main__ import main as run_main + + runner = CliRunner() + result = runner.invoke( + run_main, + [ + "--dataset", + "test-data", + "--dataset-path", + str(temp_datasets_dir), + "--algorithms", + "cuvs_cagra", + "--batch-size", + "10", + "-k", + "10", + "--groups", + "test", + "-m", + "latency", + "--mode", + "sweep", + "--dry-run", + ], + ) + assert result.exit_code == 0, ( + f"--mode sweep failed with output:\n{result.output}" + ) + + +def test_run_with_backend_config(temp_datasets_dir, tmp_path): + """Verify --backend-config YAML is parsed correctly.""" + from cuvs_bench.run.__main__ import main as run_main + + config_file = tmp_path / "backend.yaml" + config_file.write_text("backend: cpp_gbench\n") + runner = CliRunner() + result = runner.invoke( + run_main, + [ + "--dataset", + "test-data", + "--dataset-path", + str(temp_datasets_dir), + "--algorithms", + "cuvs_cagra", + "--batch-size", + "10", + "-k", + "10", + "--groups", + "test", + "-m", + "latency", + "--backend-config", + str(config_file), + "--dry-run", + ], + ) + assert result.exit_code == 0, ( + f"--backend-config failed with output:\n{result.output}" + ) + + +def test_run_with_invalid_backend_config(tmp_path): + """Verify missing backend config file raises error.""" + from cuvs_bench.run.__main__ import main as run_main + + runner = CliRunner() + result = runner.invoke( + run_main, + [ + "--dataset", + "test-data", + "--dataset-path", + str(tmp_path), + "--algorithms", + "cuvs_cagra", + "--batch-size", + "10", + "-k", + "10", + "--groups", + "test", + "-m", + "latency", + "--backend-config", + "/nonexistent/config.yaml", + ], + ) + assert result.exit_code != 0 + + +def test_run_with_n_trials_flag(temp_datasets_dir): + """Verify --n-trials flag is accepted by the CLI parser.""" + from cuvs_bench.run.__main__ import main as run_main + + runner = CliRunner() + result = runner.invoke( + run_main, + [ + "--dataset", + "test-data", + "--dataset-path", + str(temp_datasets_dir), + "--algorithms", + "cuvs_cagra", + "--batch-size", + "10", + "-k", + "10", + "--groups", + "test", + "-m", + "latency", + "--n-trials", + "50", + "--dry-run", + ], + ) + assert result.exit_code == 0, ( + f"--n-trials failed with output:\n{result.output}" + ) + + +def test_run_with_constraints_flag(temp_datasets_dir): + """Verify --constraints flag accepts valid JSON.""" + from cuvs_bench.run.__main__ import main as run_main + + runner = CliRunner() + result = runner.invoke( + run_main, + [ + "--dataset", + "test-data", + "--dataset-path", + str(temp_datasets_dir), + "--algorithms", + "cuvs_cagra", + "--batch-size", + "10", + "-k", + "10", + "--groups", + "test", + "-m", + "latency", + "--constraints", + '{"recall": "maximize", "latency": {"max": 10}}', + "--dry-run", + ], + ) + assert result.exit_code == 0, ( + f"--constraints failed with output:\n{result.output}" + ) diff --git a/python/cuvs_bench/cuvs_bench/tests/test_registry.py b/python/cuvs_bench/cuvs_bench/tests/test_registry.py index 2bf7433f4f..def7cc478b 100644 --- a/python/cuvs_bench/cuvs_bench/tests/test_registry.py +++ b/python/cuvs_bench/cuvs_bench/tests/test_registry.py @@ -8,7 +8,6 @@ import pytest import numpy as np -from pathlib import Path from cuvs_bench.backends import ( Dataset, @@ -27,31 +26,38 @@ class DummyBackend(BenchmarkBackend): @property def algo(self) -> str: - """Return dummy algorithm name.""" return "dummy_algo" - def build(self, dataset, 
build_params, index_path, force=False): + def build(self, dataset, indexes, force=False, dry_run=False): + if not indexes: + raise ValueError("indexes must not be empty") + first = indexes[0] return BuildResult( - index_path=str(index_path), + index_path=first.file, build_time_seconds=1.0, index_size_bytes=1000, - algorithm=self.name, - build_params=build_params, + algorithm=self.algo, + build_params=first.build_param, success=True, ) def search( self, dataset, - search_params, - index_path, + indexes, k, batch_size=10000, - mode="throughput", + mode="latency", + force=False, + search_threads=None, + dry_run=False, ): + if not indexes: + raise ValueError("indexes must not be empty") n_queries = dataset.n_queries neighbors = np.random.randint(0, dataset.n_base, size=(n_queries, k)) distances = np.random.rand(n_queries, k) + first = indexes[0] return SearchResult( neighbors=neighbors, @@ -59,41 +65,49 @@ def search( search_time_ms=0.1, queries_per_second=n_queries / 0.1, recall=0.95, - algorithm=self.name, - search_params=search_params, + algorithm=self.algo, + search_params=first.search_params, success=True, ) - @property - def name(self) -> str: - return "dummy" - class AnotherDummyBackend(BenchmarkBackend): """Another dummy backend for testing.""" - def build(self, dataset, build_params, index_path, force=False): + @property + def algo(self) -> str: + return "another_dummy_algo" + + def build(self, dataset, indexes, force=False, dry_run=False): + if not indexes: + raise ValueError("indexes must not be empty") + first = indexes[0] return BuildResult( - index_path=str(index_path), + index_path=first.file, build_time_seconds=2.0, index_size_bytes=2000, - algorithm=self.name, - build_params=build_params, + algorithm=self.algo, + build_params=first.build_param, success=True, ) def search( self, dataset, - search_params, - index_path, + indexes, k, batch_size=10000, - mode="throughput", + mode="latency", + force=False, + search_threads=None, + dry_run=False, ): + if not indexes: + raise ValueError("indexes must not be empty") n_queries = dataset.n_queries neighbors = np.random.randint(0, dataset.n_base, size=(n_queries, k)) distances = np.random.rand(n_queries, k) + first = indexes[0] return SearchResult( neighbors=neighbors, @@ -101,15 +115,11 @@ def search( search_time_ms=0.2, queries_per_second=n_queries / 0.2, recall=0.90, - algorithm=self.name, - search_params=search_params, + algorithm=self.algo, + search_params=first.search_params if first else [], success=True, ) - @property - def name(self) -> str: - return "another_dummy" - class TestDataset: """Tests for Dataset dataclass.""" @@ -294,7 +304,7 @@ def test_get_backend(self): registry = BackendRegistry() registry.register("dummy", DummyBackend) - backend = registry.get_backend("dummy", config={}) + backend = registry.get_backend("dummy", config={"name": "dummy"}) assert isinstance(backend, DummyBackend) assert backend.name == "dummy" @@ -324,9 +334,11 @@ def test_list_backends(self): class TestBackendIntegration: """Integration tests for backends.""" - def test_dummy_backend_build(self): + def test_dummy_backend_build(self, tmp_path): """Test dummy backend build.""" - backend = DummyBackend(config={}) + from cuvs_bench.orchestrator.config_loaders import IndexConfig + + backend = DummyBackend(config={"name": "dummy"}) base = np.random.rand(1000, 128).astype(np.float32) queries = np.random.rand(100, 128).astype(np.float32) @@ -334,19 +346,27 @@ def test_dummy_backend_build(self): name="test", base_vectors=base, query_vectors=queries ) - 
result = backend.build( - dataset=dataset, - build_params={"nlist": 1024}, - index_path=Path("/tmp/test_index"), - ) + indexes = [ + IndexConfig( + name="test_index", + algo="dummy_algo", + build_param={"nlist": 1024}, + search_params=[{"nprobe": 10}], + file=str(tmp_path / "test_index"), + ) + ] + + result = backend.build(dataset=dataset, indexes=indexes) assert result.success - assert result.algorithm == "dummy" + assert result.algorithm == "dummy_algo" assert result.build_params["nlist"] == 1024 - def test_dummy_backend_search(self): + def test_dummy_backend_search(self, tmp_path): """Test dummy backend search.""" - backend = DummyBackend(config={}) + from cuvs_bench.orchestrator.config_loaders import IndexConfig + + backend = DummyBackend(config={"name": "dummy"}) base = np.random.rand(1000, 128).astype(np.float32) queries = np.random.rand(100, 128).astype(np.float32) @@ -354,12 +374,17 @@ def test_dummy_backend_search(self): name="test", base_vectors=base, query_vectors=queries ) - result = backend.search( - dataset=dataset, - search_params=[{"nprobe": 10}], - index_path=Path("/tmp/test_index"), - k=10, - ) + indexes = [ + IndexConfig( + name="test_index", + algo="dummy_algo", + build_param={"nlist": 1024}, + search_params=[{"nprobe": 10}], + file=str(tmp_path / "test_index"), + ) + ] + + result = backend.search(dataset=dataset, indexes=indexes, k=10) assert result.success assert result.recall == 0.95 @@ -389,7 +414,7 @@ def test_get_backend_global(self): """Test getting backend via global function.""" register_backend("dummy_global2", DummyBackend) - backend = get_backend("dummy_global2", config={}) + backend = get_backend("dummy_global2", config={"name": "dummy"}) assert isinstance(backend, DummyBackend) assert backend.name == "dummy" diff --git a/python/cuvs_bench/cuvs_bench/tests/test_utils.py b/python/cuvs_bench/cuvs_bench/tests/test_utils.py new file mode 100644 index 0000000000..21bfb5ce6a --- /dev/null +++ b/python/cuvs_bench/cuvs_bench/tests/test_utils.py @@ -0,0 +1,423 @@ +# +# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. +# SPDX-License-Identifier: Apache-2.0 + +""" +Unit tests for shared backend utilities and Dataset transparent loading. 
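+
+Binary test fixtures follow the big-ann-bench layout produced by
+``_write_test_bin`` below: two uint32 header words (n_rows, n_cols)
+followed by row-major vector data.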
+""" + +import numpy as np +import pytest +import yaml + +from cuvs_bench.backends import Dataset +from cuvs_bench.backends.utils import ( + compute_recall, + dtype_from_filename, + expand_param_grid, + load_vectors, +) +from cuvs_bench.orchestrator.config_loaders import CppGBenchConfigLoader + + +def _write_test_bin(path, data): + """Write a numpy array in big-ann-bench binary format.""" + with open(path, "wb") as f: + np.asarray(data.shape, dtype=np.uint32).tofile(f) + data.tofile(f) + + +class TestDtypeFromFilename: + """Tests for dtype_from_filename.""" + + def test_fbin(self): + """Test .fbin maps to float32.""" + assert dtype_from_filename("vectors.fbin") == np.float32 + + def test_f16bin(self): + """Test .f16bin maps to float16.""" + assert dtype_from_filename("vectors.f16bin") == np.float16 + + def test_ibin(self): + """Test .ibin maps to int32.""" + assert dtype_from_filename("groundtruth.ibin") == np.int32 + + def test_u8bin(self): + """Test .u8bin maps to uint8.""" + assert dtype_from_filename("vectors.u8bin") == np.ubyte + + def test_i8bin(self): + """Test .i8bin maps to int8.""" + assert dtype_from_filename("vectors.i8bin") == np.byte + + def test_unsupported_extension(self): + """Test that unsupported extensions raise RuntimeError.""" + with pytest.raises(RuntimeError, match="Unsupported file extension"): + dtype_from_filename("vectors.txt") + + def test_full_path(self): + """Test that full paths are handled correctly.""" + assert ( + dtype_from_filename("/data/datasets/sift/base.fbin") == np.float32 + ) + + +class TestLoadVectors: + """Tests for load_vectors.""" + + def test_load_float32(self, tmp_path): + """Test loading float32 vectors.""" + data = np.random.rand(100, 128).astype(np.float32) + path = str(tmp_path / "test.fbin") + _write_test_bin(path, data) + + loaded = load_vectors(path) + np.testing.assert_array_equal(loaded, data) + + def test_load_int32(self, tmp_path): + """Test loading int32 vectors.""" + data = np.random.randint(0, 1000, size=(50, 10)).astype(np.int32) + path = str(tmp_path / "test.ibin") + _write_test_bin(path, data) + + loaded = load_vectors(path) + np.testing.assert_array_equal(loaded, data) + + def test_load_uint8(self, tmp_path): + """Test loading uint8 vectors.""" + data = np.random.randint(0, 255, size=(30, 64)).astype(np.uint8) + path = str(tmp_path / "test.u8bin") + _write_test_bin(path, data) + + loaded = load_vectors(path) + np.testing.assert_array_equal(loaded, data) + + def test_load_int8(self, tmp_path): + """Test loading int8 vectors.""" + data = np.random.randint(-128, 127, size=(30, 64)).astype(np.int8) + path = str(tmp_path / "test.i8bin") + _write_test_bin(path, data) + + loaded = load_vectors(path) + np.testing.assert_array_equal(loaded, data) + + def test_subset_size(self, tmp_path): + """Test that subset_size limits the number of rows loaded.""" + data = np.random.rand(100, 128).astype(np.float32) + path = str(tmp_path / "test.fbin") + _write_test_bin(path, data) + + loaded = load_vectors(path, subset_size=10) + assert loaded.shape == (10, 128) + np.testing.assert_array_equal(loaded, data[:10]) + + def test_subset_size_larger_than_file(self, tmp_path): + """Test that subset_size larger than file returns all rows.""" + data = np.random.rand(5, 16).astype(np.float32) + path = str(tmp_path / "test.fbin") + _write_test_bin(path, data) + + loaded = load_vectors(path, subset_size=100) + assert loaded.shape == (5, 16) + np.testing.assert_array_equal(loaded, data) + + def test_negative_subset_size(self, tmp_path): + """Test that 
negative subset_size raises ValueError.""" + data = np.random.rand(10, 4).astype(np.float32) + path = str(tmp_path / "test.fbin") + _write_test_bin(path, data) + + with pytest.raises( + ValueError, match="subset_size must be a positive integer" + ): + load_vectors(path, subset_size=-1) + + def test_zero_subset_size(self, tmp_path): + """Test that zero subset_size raises ValueError.""" + data = np.random.rand(10, 4).astype(np.float32) + path = str(tmp_path / "test.fbin") + _write_test_bin(path, data) + + with pytest.raises( + ValueError, match="subset_size must be a positive integer" + ): + load_vectors(path, subset_size=0) + + def test_unsupported_extension(self, tmp_path): + """Test that unsupported file extensions raise RuntimeError.""" + path = str(tmp_path / "test.txt") + with open(path, "wb") as f: + f.write(b"dummy") + + with pytest.raises(RuntimeError, match="Unsupported file extension"): + load_vectors(path) + + def test_truncated_header(self, tmp_path): + """Test that truncated headers raise ValueError.""" + path = str(tmp_path / "test.fbin") + with open(path, "wb") as f: + f.write(b"\x00\x00") + + with pytest.raises( + ValueError, match="File too small to contain a valid header" + ): + load_vectors(path) + + def test_truncated_data(self, tmp_path): + """Test that truncated data raises ValueError.""" + path = str(tmp_path / "test.fbin") + with open(path, "wb") as f: + np.array([10, 4], dtype=np.uint32).tofile(f) + np.random.rand(5, 4).astype(np.float32).tofile(f) + + with pytest.raises(ValueError, match="File is truncated"): + load_vectors(path) + + def test_file_not_found(self): + """Test that missing files raise FileNotFoundError.""" + with pytest.raises(FileNotFoundError): + load_vectors("/nonexistent/path/vectors.fbin") + + +class TestDatasetLazyLoading: + """Tests for Dataset transparent vector loading.""" + + def test_lazy_load_base_vectors(self, tmp_path): + """Test that base vectors are loaded from file on first access.""" + data = np.random.rand(50, 32).astype(np.float32) + path = str(tmp_path / "base.fbin") + _write_test_bin(path, data) + + dataset = Dataset(name="test", base_file=path) + np.testing.assert_array_equal(dataset.base_vectors, data) + + def test_lazy_load_query_vectors(self, tmp_path): + """Test that query vectors are loaded from file on first access.""" + data = np.random.rand(10, 32).astype(np.float32) + path = str(tmp_path / "query.fbin") + _write_test_bin(path, data) + + dataset = Dataset(name="test", query_file=path) + np.testing.assert_array_equal(dataset.query_vectors, data) + + def test_lazy_load_groundtruth(self, tmp_path): + """Test that ground truth is loaded from file on first access.""" + data = np.random.randint(0, 100, size=(10, 5)).astype(np.int32) + path = str(tmp_path / "gt.ibin") + _write_test_bin(path, data) + + dataset = Dataset(name="test", groundtruth_neighbors_file=path) + np.testing.assert_array_equal(dataset.groundtruth_neighbors, data) + + def test_preloaded_vectors_returned_directly(self): + """Test that pre-loaded vectors are returned without file access.""" + base = np.random.rand(20, 16).astype(np.float32) + query = np.random.rand(5, 16).astype(np.float32) + + dataset = Dataset( + name="test", + base_vectors=base, + query_vectors=query, + ) + np.testing.assert_array_equal(dataset.base_vectors, base) + np.testing.assert_array_equal(dataset.query_vectors, query) + + def test_no_file_returns_empty(self): + """Test that Dataset without files returns empty arrays.""" + dataset = Dataset(name="test") + assert 
dataset.base_vectors.size == 0 + assert dataset.query_vectors.size == 0 + assert dataset.groundtruth_neighbors is None + + def test_lazy_load_with_subset_size(self, tmp_path): + """Test that subset_size is respected during lazy loading.""" + data = np.random.rand(100, 32).astype(np.float32) + path = str(tmp_path / "base.fbin") + _write_test_bin(path, data) + + dataset = Dataset( + name="test", + base_file=path, + metadata={"subset_size": 10}, + ) + assert dataset.base_vectors.shape == (10, 32) + np.testing.assert_array_equal(dataset.base_vectors, data[:10]) + + def test_file_path_still_accessible(self): + """Test that file paths are accessible without triggering loading.""" + dataset = Dataset( + name="test", + base_file="/path/to/base.fbin", + query_file="/path/to/query.fbin", + ) + assert dataset.base_file == "/path/to/base.fbin" + assert dataset.query_file == "/path/to/query.fbin" + assert dataset.name == "test" + + def test_file_path_access_does_not_trigger_loading(self, tmp_path): + """Test that accessing file paths does not load vectors into memory.""" + data = np.random.rand(50, 32).astype(np.float32) + path = str(tmp_path / "base.fbin") + _write_test_bin(path, data) + + dataset = Dataset(name="test", base_file=path) + + _ = dataset.base_file + _ = dataset.name + _ = dataset.distance_metric + + assert dataset._base_vectors.size == 0 + + def test_dims_and_counts(self, tmp_path): + """Test dims, n_base, and n_queries properties.""" + data = np.random.rand(50, 32).astype(np.float32) + path = str(tmp_path / "base.fbin") + _write_test_bin(path, data) + + queries = np.random.rand(10, 32).astype(np.float32) + qpath = str(tmp_path / "query.fbin") + _write_test_bin(qpath, queries) + + dataset = Dataset(name="test", base_file=path, query_file=qpath) + assert dataset.dims == 32 + assert dataset.n_base == 50 + assert dataset.n_queries == 10 + + def test_caching(self, tmp_path): + """Test that vectors are loaded once and cached.""" + data = np.random.rand(10, 4).astype(np.float32) + path = str(tmp_path / "base.fbin") + _write_test_bin(path, data) + + dataset = Dataset(name="test", base_file=path) + first_access = dataset.base_vectors + second_access = dataset.base_vectors + assert first_access is second_access + + +class TestConfigLoaderMethods: + """Tests for base ConfigLoader inherited methods.""" + + def test_load_yaml_file(self, tmp_path): + """Test loading a YAML file.""" + yaml_path = str(tmp_path / "test.yaml") + with open(yaml_path, "w") as f: + yaml.dump({"name": "test", "value": 42}, f) + + loader = CppGBenchConfigLoader(config_path=str(tmp_path)) + result = loader.load_yaml_file(yaml_path) + assert result["name"] == "test" + assert result["value"] == 42 + + def test_get_dataset_configuration(self): + """Test finding a dataset by name.""" + loader = CppGBenchConfigLoader() + datasets = [ + {"name": "sift-128", "dims": 128}, + {"name": "glove-100", "dims": 100}, + ] + + result = loader.get_dataset_configuration("glove-100", datasets) + assert result["name"] == "glove-100" + assert result["dims"] == 100 + + def test_get_dataset_configuration_not_found(self): + """Test that missing datasets raise ValueError.""" + loader = CppGBenchConfigLoader() + datasets = [{"name": "sift-128"}] + + with pytest.raises( + ValueError, match="Could not find a dataset configuration" + ): + loader.get_dataset_configuration("nonexistent", datasets) + + +class TestExpandParamGrid: + """Tests for expand_param_grid.""" + + def test_single_param(self): + """Test expansion of a single parameter.""" + result = 
expand_param_grid({"m": [16, 32]}) + assert result == [{"m": 16}, {"m": 32}] + + def test_two_params(self): + """Test Cartesian product of two parameters.""" + result = expand_param_grid({"m": [16, 32], "ef": [100, 200]}) + assert len(result) == 4 + assert {"m": 16, "ef": 100} in result + assert {"m": 16, "ef": 200} in result + assert {"m": 32, "ef": 100} in result + assert {"m": 32, "ef": 200} in result + + def test_empty_spec(self): + """Test that empty spec returns a single empty dict.""" + result = expand_param_grid({}) + assert result == [{}] + + def test_single_values(self): + """Test that single-element lists produce one combination.""" + result = expand_param_grid({"m": [16], "ef": [100]}) + assert result == [{"m": 16, "ef": 100}] + + def test_three_params(self): + """Test Cartesian product of three parameters.""" + result = expand_param_grid({"a": [1, 2], "b": [3], "c": [4, 5]}) + assert len(result) == 4 + assert {"a": 1, "b": 3, "c": 4} in result + assert {"a": 2, "b": 3, "c": 5} in result + + +class TestComputeRecall: + """Tests for compute_recall.""" + + def test_perfect_recall(self): + """Test recall is 1.0 when neighbors match ground truth exactly.""" + neighbors = np.array([[0, 1, 2], [3, 4, 5]]) + groundtruth = np.array([[0, 1, 2], [3, 4, 5]]) + assert compute_recall(neighbors, groundtruth, k=3) == 1.0 + + def test_zero_recall(self): + """Test recall is 0.0 when no neighbors match ground truth.""" + neighbors = np.array([[10, 11, 12], [13, 14, 15]]) + groundtruth = np.array([[0, 1, 2], [3, 4, 5]]) + assert compute_recall(neighbors, groundtruth, k=3) == 0.0 + + def test_partial_recall(self): + """Test recall with partial overlap.""" + neighbors = np.array([[0, 1, 99]]) + groundtruth = np.array([[0, 1, 2]]) + recall = compute_recall(neighbors, groundtruth, k=3) + assert abs(recall - 2.0 / 3.0) < 1e-9 + + def test_k_smaller_than_groundtruth(self): + """Test recall when k is smaller than ground truth columns.""" + neighbors = np.array([[0, 1]]) + groundtruth = np.array([[0, 1, 2, 3, 4]]) + recall = compute_recall(neighbors, groundtruth, k=2) + assert recall == 1.0 + + def test_k_larger_than_groundtruth(self): + """Test recall when k is larger than ground truth columns.""" + neighbors = np.array([[0, 1, 2, 3, 4]]) + groundtruth = np.array([[0, 1]]) + recall = compute_recall(neighbors, groundtruth, k=5) + assert recall == 1.0 + + def test_empty_groundtruth(self): + """Test recall is 0.0 when ground truth has zero columns.""" + neighbors = np.array([[0, 1, 2]]) + groundtruth = np.empty((1, 0), dtype=np.int32) + assert compute_recall(neighbors, groundtruth, k=3) == 0.0 + + def test_empty_queries(self): + """Test recall is 0.0 when there are no queries.""" + neighbors = np.empty((0, 3), dtype=np.int64) + groundtruth = np.empty((0, 3), dtype=np.int32) + assert compute_recall(neighbors, groundtruth, k=3) == 0.0 + + def test_multiple_queries(self): + """Test recall averaged across multiple queries.""" + neighbors = np.array([[0, 1, 2], [3, 4, 99]]) + groundtruth = np.array([[0, 1, 2], [3, 4, 5]]) + recall = compute_recall(neighbors, groundtruth, k=3) + assert abs(recall - 5.0 / 6.0) < 1e-9