diff --git a/docs/source/cuvs_bench/index.rst b/docs/source/cuvs_bench/index.rst index 2efa9ff86b..158f7a9d7a 100644 --- a/docs/source/cuvs_bench/index.rst +++ b/docs/source/cuvs_bench/index.rst @@ -113,22 +113,8 @@ The steps below demonstrate how to download, install, and run benchmarks on a su # (1) Prepare dataset. python -m cuvs_bench.get_dataset --dataset deep-image-96-angular --normalize -.. code-block:: python - # (2) Build and search index. - from cuvs_bench.orchestrator import BenchmarkOrchestrator - - orchestrator = BenchmarkOrchestrator(backend_type="cpp_gbench") - results = orchestrator.run_benchmark( - dataset="deep-image-96-inner", - algorithms="cuvs_cagra", - count=10, - batch_size=10, - build=True, - search=True, - ) - -.. code-block:: bash + python -m cuvs_bench.run --dataset deep-image-96-inner --algorithms cuvs_cagra --batch-size 10 -k 10 --build --search # (3) Export data. python -m cuvs_bench.run --data-export --dataset deep-image-96-inner @@ -210,22 +196,10 @@ The steps below demonstrate how to download, install, and run benchmarks on a su python -m cuvs_bench.split_groundtruth --groundtruth datasets/deep-1B/deep_new_groundtruth.public.10K.bin # two files 'groundtruth.neighbors.ibin' and 'groundtruth.distances.fbin' should be produced -.. code-block:: python +.. code-block:: bash # (2) Build and search index. - from cuvs_bench.orchestrator import BenchmarkOrchestrator - - orchestrator = BenchmarkOrchestrator(backend_type="cpp_gbench") - results = orchestrator.run_benchmark( - dataset="deep-1B", - algorithms="cuvs_cagra", - count=10, - batch_size=10, - build=True, - search=True, - ) - -.. code-block:: bash + python -m cuvs_bench.run --dataset deep-1B --algorithms cuvs_cagra --batch-size 10 -k 10 --build --search # (3) Export data. python -m cuvs_bench.run --data-export --dataset deep-1B diff --git a/python/cuvs_bench/cuvs_bench/backends/base.py b/python/cuvs_bench/cuvs_bench/backends/base.py index b208d6dfbb..f1d8dbcb9d 100644 --- a/python/cuvs_bench/cuvs_bench/backends/base.py +++ b/python/cuvs_bench/cuvs_bench/backends/base.py @@ -18,18 +18,28 @@ from ..orchestrator.config_loaders import IndexConfig -@dataclass class Dataset: """ Dataset representation for benchmarking. - Attributes + Supports two usage patterns: + + 1. File-path based (C++ backend): The orchestrator passes file paths + and the backend reads them directly via ``base_file``, ``query_file``, + etc. Vectors are never loaded into Python. + + 2. Array based (Python-native backends like OpenSearch, Elasticsearch): + Backends access ``base_vectors``, ``query_vectors``, etc. If vectors + were not provided directly but a file path exists, they are loaded + lazily on first access. This keeps file I/O invisible to backends. 
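+
+    For example (file names here are hypothetical), a file-backed dataset
+    performs no I/O until a backend first touches the vectors::
+
+        ds = Dataset(name="glove-100-inner", base_file="base.fbin")
+        ds.base_file       # just the path, no file access
+        ds.base_vectors    # loads base.fbin on first access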
+ + Parameters ---------- name : str Dataset name (e.g., "glove-100-inner") - base_vectors : np.ndarray + base_vectors : Optional[np.ndarray] Base vectors for index building, shape (n_vectors, dims) - query_vectors : np.ndarray + query_vectors : Optional[np.ndarray] Query vectors for search, shape (n_queries, dims) groundtruth_neighbors : Optional[np.ndarray] Ground truth neighbor IDs, shape (n_queries, k_gt) @@ -38,25 +48,108 @@ class Dataset: distance_metric : str Distance metric ("euclidean", "inner_product", "cosine") base_file : Optional[str] - Path to base vectors file (for C++ backend compatibility) + Path to base vectors file query_file : Optional[str] - Path to query vectors file (for C++ backend compatibility) + Path to query vectors file groundtruth_neighbors_file : Optional[str] - Path to ground truth neighbors file (for C++ backend compatibility) - metadata : Dict[str, Any] - Additional dataset metadata like {"source": "ann-benchmarks"} + Path to ground truth neighbors file + metadata : Optional[Dict[str, Any]] + Additional dataset metadata like {"subset_size": 10000} """ - name: str - base_vectors: np.ndarray - query_vectors: np.ndarray - groundtruth_neighbors: Optional[np.ndarray] = None - groundtruth_distances: Optional[np.ndarray] = None - distance_metric: str = "euclidean" - base_file: Optional[str] = None - query_file: Optional[str] = None - groundtruth_neighbors_file: Optional[str] = None - metadata: Dict[str, Any] = field(default_factory=dict) + def __init__( + self, + name: str, + base_vectors: Optional[np.ndarray] = None, + query_vectors: Optional[np.ndarray] = None, + groundtruth_neighbors: Optional[np.ndarray] = None, + groundtruth_distances: Optional[np.ndarray] = None, + distance_metric: str = "euclidean", + base_file: Optional[str] = None, + query_file: Optional[str] = None, + groundtruth_neighbors_file: Optional[str] = None, + metadata: Optional[Dict[str, Any]] = None, + ): + self.name = name + # Vectors are stored privately to support lazy loading. + # If the caller provides vectors directly, they are used as-is. + # If empty and a file path is provided, the corresponding property + # loads vectors from the file on first access. This keeps the + # loading logic invisible to backends: they just access + # dataset.base_vectors and get a numpy array regardless of whether + # it was passed in or loaded from disk. + self._base_vectors = ( + base_vectors if base_vectors is not None else np.empty((0, 0)) + ) + self._query_vectors = ( + query_vectors if query_vectors is not None else np.empty((0, 0)) + ) + self._groundtruth_neighbors = groundtruth_neighbors + self.groundtruth_distances = groundtruth_distances + self.distance_metric = distance_metric + self.base_file = base_file + self.query_file = query_file + self.groundtruth_neighbors_file = groundtruth_neighbors_file + self.metadata = metadata or {} + + @property + def base_vectors(self) -> np.ndarray: + """Base vectors for index building. + + Loaded from base_file on first access if not provided directly. + """ + if self._base_vectors.size == 0 and self.base_file: + from .utils import load_vectors + + self._base_vectors = load_vectors( + self.base_file, self.metadata.get("subset_size") + ) + return self._base_vectors + + @base_vectors.setter + def base_vectors(self, value: Optional[np.ndarray]) -> None: + """Set base vectors directly.""" + self._base_vectors = value if value is not None else np.empty((0, 0)) + + @property + def query_vectors(self) -> np.ndarray: + """Query vectors for search. 
+ + Loaded from query_file on first access if not provided directly. + """ + if self._query_vectors.size == 0 and self.query_file: + from .utils import load_vectors + + self._query_vectors = load_vectors(self.query_file) + return self._query_vectors + + @query_vectors.setter + def query_vectors(self, value: Optional[np.ndarray]) -> None: + """Set query vectors directly.""" + self._query_vectors = value if value is not None else np.empty((0, 0)) + + @property + def groundtruth_neighbors(self) -> Optional[np.ndarray]: + """Ground truth neighbor IDs. + + Loaded from groundtruth_neighbors_file on first access if not + provided directly. + """ + if ( + self._groundtruth_neighbors is None + and self.groundtruth_neighbors_file + ): + from .utils import load_vectors + + self._groundtruth_neighbors = load_vectors( + self.groundtruth_neighbors_file + ) + return self._groundtruth_neighbors + + @groundtruth_neighbors.setter + def groundtruth_neighbors(self, value: Optional[np.ndarray]) -> None: + """Set ground truth neighbors directly.""" + self._groundtruth_neighbors = value @property def dims(self) -> int: diff --git a/python/cuvs_bench/cuvs_bench/backends/utils.py b/python/cuvs_bench/cuvs_bench/backends/utils.py new file mode 100644 index 0000000000..b931f3128b --- /dev/null +++ b/python/cuvs_bench/cuvs_bench/backends/utils.py @@ -0,0 +1,202 @@ +# +# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. +# SPDX-License-Identifier: Apache-2.0 +# + +""" +Shared utilities for cuvs-bench backends. + +Provides common functions for Python-native backends (e.g., OpenSearch, +Elasticsearch): +- Vector file I/O: used internally by the Dataset class for transparent + vector loading. The C++ backend does not use these since it passes file + paths directly to the subprocess. +- Parameter expansion: converts YAML param specs into lists of param dicts. +- Recall computation: computes recall@k from neighbors and ground truth. + +The dtype_from_filename function originates from generate_groundtruth/utils.py. +Note: generate_groundtruth/utils.py uses .hbin for float16 while the rest of +the codebase (get_dataset/fbin_to_f16bin.py, OpenSearch backend) uses .f16bin. +We standardize on .f16bin here to match the naming convention of the other +formats (.fbin, .i8bin, .u8bin). +""" + +import itertools +import os +from typing import Any, Dict, List, Optional + +import numpy as np + + +def dtype_from_filename(filename): + """Map file extension to numpy dtype. + + Parameters + ---------- + filename : str + Path or filename with a supported extension. + + Returns + ------- + numpy.dtype + The corresponding numpy dtype. + + Raises + ------ + RuntimeError + If the file extension is not supported. + """ + ext = os.path.splitext(filename)[1] + if ext == ".fbin": + return np.float32 + if ext == ".f16bin": + return np.float16 + elif ext == ".ibin": + return np.int32 + elif ext == ".u8bin": + return np.ubyte + elif ext == ".i8bin": + return np.byte + else: + raise RuntimeError(f"Unsupported file extension: {ext}") + + +def load_vectors(path: str, subset_size: Optional[int] = None) -> np.ndarray: + """ + Read a binary vector file into a numpy array. + + Supports the standard big-ann-bench binary format used by cuvs-bench + datasets: a 4-byte uint32 ``n_rows``, a 4-byte uint32 ``n_cols``, + followed by ``n_rows * n_cols`` elements of the dtype inferred from + the file extension via ``dtype_from_filename``. + + Parameters + ---------- + path : str + Path to the binary file. 
The dtype is inferred from the extension:
+        ``.fbin`` (float32), ``.f16bin`` (float16), ``.u8bin`` (uint8),
+        ``.i8bin`` (int8), ``.ibin`` (int32).
+    subset_size : Optional[int]
+        If provided, only the first ``subset_size`` rows are loaded.
+
+    Returns
+    -------
+    np.ndarray
+        Array of shape ``(n_rows, n_cols)`` with the inferred dtype.
+
+    Raises
+    ------
+    FileNotFoundError
+        If the file does not exist.
+    RuntimeError
+        If the file extension is unsupported (raised by
+        ``dtype_from_filename``).
+    ValueError
+        If ``subset_size`` is not positive or the file is truncated.
+    """
+    dtype = dtype_from_filename(path)
+    if subset_size is not None and subset_size < 1:
+        raise ValueError(
+            f"subset_size must be a positive integer, got {subset_size}"
+        )
+    with open(path, "rb") as f:
+        header = f.read(8)
+        if len(header) < 8:
+            raise ValueError(
+                f"File too small to contain a valid header (expected 8 bytes, "
+                f"got {len(header)}): {path}"
+            )
+        n_rows = int(np.frombuffer(header[:4], dtype=np.uint32)[0])
+        n_cols = int(np.frombuffer(header[4:], dtype=np.uint32)[0])
+        if subset_size is not None:
+            n_rows = min(n_rows, subset_size)
+        expected_bytes = n_rows * n_cols * np.dtype(dtype).itemsize
+        raw = f.read(expected_bytes)
+        if len(raw) < expected_bytes:
+            raise ValueError(
+                f"File is truncated: expected {expected_bytes} bytes of data "
+                f"({n_rows} rows x {n_cols} cols x {np.dtype(dtype).itemsize} bytes), "
+                f"got {len(raw)}: {path}"
+            )
+        data = np.frombuffer(raw, dtype=dtype)
+    return data.reshape(n_rows, n_cols)
+
+
+def expand_param_grid(
+    param_spec: Dict[str, List[Any]],
+) -> List[Dict[str, Any]]:
+    """
+    Expand a parameter specification into all combinations via Cartesian product.
+
+    Takes a dict where each key maps to a list of values (as defined in
+    algorithm YAML configs) and produces a list of dicts, one per combination.
+
+    Parameters
+    ----------
+    param_spec : Dict[str, List[Any]]
+        Parameter specification, e.g., {"m": [16, 32], "ef_construction": [100, 200]}
+
+    Returns
+    -------
+    List[Dict[str, Any]]
+        List of parameter dicts, e.g.,
+        [{"m": 16, "ef_construction": 100}, {"m": 16, "ef_construction": 200},
+         {"m": 32, "ef_construction": 100}, {"m": 32, "ef_construction": 200}]
+        Returns [{}] if param_spec is empty.
+    """
+    if not param_spec:
+        return [{}]
+    keys = list(param_spec.keys())
+    return [
+        dict(zip(keys, vals))
+        for vals in itertools.product(*param_spec.values())
+    ]
+
+
+def compute_recall(
+    neighbors: np.ndarray, groundtruth: np.ndarray, k: int
+) -> float:
+    """
+    Compute recall@k by comparing returned neighbors against ground truth.
+
+    For each query, counts how many of the k returned neighbors appear
+    in the ground truth set, then averages across all queries.
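+
+    Concretely, with ``gt_k = min(k, groundtruth.shape[1])`` the value
+    computed below is::
+
+        recall@k = sum_i |set(neighbors[i, :k]) & set(groundtruth[i, :gt_k])|
+                   / (n_queries * gt_k)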
+ + Parameters + ---------- + neighbors : np.ndarray + Returned neighbor IDs, shape (n_queries, k) + groundtruth : np.ndarray + Ground truth neighbor IDs, shape (n_queries, gt_k) + k : int + Number of neighbors to evaluate + + Returns + ------- + float + Recall@k in range [0.0, 1.0] + """ + if not isinstance(neighbors, np.ndarray) or not isinstance( + groundtruth, np.ndarray + ): + raise ValueError("neighbors and groundtruth must be numpy ndarrays") + if neighbors.ndim != 2 or groundtruth.ndim != 2: + raise ValueError( + f"neighbors and groundtruth must be 2-D arrays, got " + f"neighbors.shape={neighbors.shape}, groundtruth.shape={groundtruth.shape}" + ) + n_queries = neighbors.shape[0] + if groundtruth.shape[0] != n_queries: + raise ValueError( + f"Row count mismatch: neighbors has {n_queries} rows, " + f"groundtruth has {groundtruth.shape[0]} rows" + ) + gt_k = min(k, groundtruth.shape[1]) + if gt_k == 0 or n_queries == 0: + return 0.0 + n_correct = sum( + len( + set(neighbors[i, :k].tolist()) + & set(groundtruth[i, :gt_k].tolist()) + ) + for i in range(n_queries) + ) + return n_correct / (n_queries * gt_k) diff --git a/python/cuvs_bench/cuvs_bench/orchestrator/__init__.py b/python/cuvs_bench/cuvs_bench/orchestrator/__init__.py index c6ad2db7f1..ecbcc94059 100644 --- a/python/cuvs_bench/cuvs_bench/orchestrator/__init__.py +++ b/python/cuvs_bench/cuvs_bench/orchestrator/__init__.py @@ -3,7 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 # -from .orchestrator import BenchmarkOrchestrator, run_benchmark +from .orchestrator import BenchmarkOrchestrator from .config_loaders import ConfigLoader, BenchmarkConfig, DatasetConfig, CppGBenchConfigLoader from ..backends.registry import ( get_backend_class, @@ -15,7 +15,6 @@ __all__ = [ # Main orchestrator "BenchmarkOrchestrator", - "run_benchmark", # Config loaders "ConfigLoader", "BenchmarkConfig", diff --git a/python/cuvs_bench/cuvs_bench/orchestrator/config_loaders.py b/python/cuvs_bench/cuvs_bench/orchestrator/config_loaders.py index 3d81d54f9a..6e5f0dd214 100644 --- a/python/cuvs_bench/cuvs_bench/orchestrator/config_loaders.py +++ b/python/cuvs_bench/cuvs_bench/orchestrator/config_loaders.py @@ -10,7 +10,6 @@ implementations that handle configuration loading and preprocessing. """ -import itertools import os import warnings from abc import ABC, abstractmethod @@ -21,6 +20,8 @@ import yaml +from ..backends.utils import expand_param_grid + @dataclass class IndexConfig: @@ -127,23 +128,99 @@ class ConfigLoader(ABC): """ Abstract base class for configuration loaders. - Each backend type has its own ConfigLoader that knows how to: - - Load configuration from files or other sources - - Expand parameter combinations - - Validate constraints - - Produce standardized BenchmarkConfig objects + Uses a template method pattern: load() handles the shared steps + (loading dataset YAML, looking up the dataset, constructing + DatasetConfig) then delegates to _build_benchmark_configs() for + backend-specific logic. Backend authors only implement + _build_benchmark_configs() and receive the DatasetConfig already built. + + Provides shared helper methods for common config-loading operations + (YAML loading, dataset lookup, algorithm config gathering) so that + individual loaders do not need to reimplement them. 
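+
+    A minimal subclass (names here are hypothetical) therefore only needs::
+
+        class MyLoader(ConfigLoader):
+            @property
+            def backend_type(self) -> str:
+                return "my_backend"
+
+            def _build_benchmark_configs(
+                self, dataset_config, dataset_conf,
+                dataset, dataset_path, **kwargs,
+            ):
+                return []  # backend-specific BenchmarkConfig list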
""" - @abstractmethod - def load(self, **kwargs) -> Tuple[DatasetConfig, List[BenchmarkConfig]]: + def load( + self, + dataset: str, + dataset_path: str, + **kwargs, + ) -> Tuple[DatasetConfig, List[BenchmarkConfig]]: """ Load and prepare benchmark configurations. + Handles shared config-loading steps (dataset YAML, dataset lookup, + DatasetConfig construction) then delegates to the backend-specific + _build_benchmark_configs() for producing BenchmarkConfig objects. + + Parameters + ---------- + dataset : str + Dataset name + dataset_path : str + Path to dataset directory + **kwargs + Backend-specific arguments passed through to + _build_benchmark_configs() + Returns ------- Tuple[DatasetConfig, List[BenchmarkConfig]] Dataset configuration and list of benchmark configurations to run """ + # Shared step 1: Load dataset YAML + ds_yaml_path = kwargs.get("dataset_configuration") or os.path.join( + self.config_path, "datasets", "datasets.yaml" + ) + ds_conf_all = self.load_yaml_file(ds_yaml_path) + + # Shared step 2: Find dataset by name + ds_conf = self.get_dataset_configuration(dataset, ds_conf_all) + + # Shared step 3: Construct DatasetConfig with resolved paths + dataset_config = self.build_dataset_config( + ds_conf, dataset_path, kwargs.get("subset_size") + ) + + # Backend-specific step: produce BenchmarkConfig list + benchmark_configs = self._build_benchmark_configs( + dataset_config, ds_conf, dataset, dataset_path, **kwargs + ) + return dataset_config, benchmark_configs + + @abstractmethod + def _build_benchmark_configs( + self, + dataset_config: DatasetConfig, + dataset_conf: dict, + dataset: str, + dataset_path: str, + **kwargs, + ) -> List[BenchmarkConfig]: + """ + Build backend-specific benchmark configurations. + + Called by load() after the shared steps are complete. Each backend + implements this with its own logic (executable discovery for C++, + connection params for OpenSearch, etc.). + + Parameters + ---------- + dataset_config : DatasetConfig + Already-constructed dataset configuration + dataset_conf : dict + Raw dataset dict from YAML (for backends that need extra fields) + dataset : str + Dataset name + dataset_path : str + Path to dataset directory + **kwargs + Backend-specific arguments + + Returns + ------- + List[BenchmarkConfig] + List of benchmark configurations to run + """ pass @property @@ -152,6 +229,136 @@ def backend_type(self) -> str: """Return the backend type this loader is for (e.g., 'cpp_gbench').""" pass + def build_dataset_config( + self, + dataset_conf: dict, + dataset_path: Optional[str] = None, + subset_size: Optional[int] = None, + ) -> DatasetConfig: + """ + Construct a DatasetConfig from a dataset YAML dict. + + Resolves relative file paths against dataset_path if provided. 
+ + Parameters + ---------- + dataset_conf : dict + Dataset configuration dict from datasets.yaml + dataset_path : Optional[str] + Base path for resolving relative file paths + subset_size : Optional[int] + Limit dataset to first N vectors + + Returns + ------- + DatasetConfig + """ + + def _resolve(rel): + if rel and dataset_path and not os.path.isabs(rel): + return os.path.join(dataset_path, rel) + return rel + + return DatasetConfig( + name=dataset_conf["name"], + base_file=_resolve(dataset_conf.get("base_file")), + query_file=_resolve(dataset_conf.get("query_file")), + groundtruth_neighbors_file=_resolve( + dataset_conf.get("groundtruth_neighbors_file") + ), + distance=dataset_conf.get("distance", "euclidean"), + dims=dataset_conf.get("dims"), + subset_size=subset_size, + ) + + def load_yaml_file(self, file_path: str) -> dict: + """ + Load a YAML file and return its contents as a dictionary. + + Parameters + ---------- + file_path : str + The path to the YAML file. + + Returns + ------- + dict + The contents of the YAML file. + """ + with open(file_path, "r") as f: + return yaml.safe_load(f) + + def get_dataset_configuration( + self, dataset: str, dataset_conf_all: list + ) -> dict: + """ + Retrieve the configuration for a specific dataset. + + Parameters + ---------- + dataset : str + The name of the dataset to retrieve the configuration for. + dataset_conf_all : list + A list of dataset configurations. + + Returns + ------- + dict + The configuration for the specified dataset. + + Raises + ------ + ValueError + If the dataset configuration is not found. + """ + for d in dataset_conf_all: + if dataset == d["name"]: + return d + raise ValueError( + f"Could not find a dataset configuration for '{dataset}'" + ) + + def gather_algorithm_configs( + self, config_path: str, algorithm_configuration: Optional[str] + ) -> list: + """ + Gather the list of algorithm configuration files. + + Parameters + ---------- + config_path : str + The path to the config directory. + algorithm_configuration : Optional[str] + The path to the algorithm configuration directory or file. + + Returns + ------- + list + A list of paths to the algorithm configuration files. 
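+
+        Only ``.yaml``/``.yml`` files under ``<config_path>/algos`` are
+        collected; ``algorithm_configuration`` may point at a single YAML
+        file or a directory of them, and a nonexistent path only triggers
+        a warning.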
+ """ + algos_conf_fs = os.listdir(os.path.join(config_path, "algos")) + algos_conf_fs = [ + os.path.join(config_path, "algos", f) + for f in algos_conf_fs + if f.endswith((".yaml", ".yml")) + ] + + if algorithm_configuration: + if os.path.isdir(algorithm_configuration): + algos_conf_fs += [ + os.path.join(algorithm_configuration, f) + for f in os.listdir(algorithm_configuration) + if f.endswith((".yaml", ".yml")) + ] + elif os.path.isfile(algorithm_configuration): + algos_conf_fs.append(algorithm_configuration) + else: + warnings.warn( + f"algorithm_configuration path does not exist: " + f"{algorithm_configuration}" + ) + return algos_conf_fs + class CppGBenchConfigLoader(ConfigLoader): """ @@ -196,94 +403,53 @@ def gpu_present(self) -> bool: self._gpu_present = False return self._gpu_present - def load( + def _build_benchmark_configs( self, + dataset_config: DatasetConfig, + dataset_conf: dict, dataset: str, dataset_path: str, - dataset_configuration: Optional[str] = None, - algorithm_configuration: Optional[str] = None, - algorithms: Optional[str] = None, - groups: Optional[str] = None, - algo_groups: Optional[str] = None, - count: int = 10, - batch_size: int = 10000, - subset_size: Optional[int] = None, **kwargs, - ) -> Tuple[DatasetConfig, List[BenchmarkConfig]]: + ) -> List[BenchmarkConfig]: """ - Load C++ benchmark configurations from YAML files. + Build C++ benchmark configurations. Parameters ---------- + dataset_config : DatasetConfig + Already-constructed dataset configuration + dataset_conf : dict + Raw dataset dict from YAML dataset : str Dataset name dataset_path : str Path to dataset directory - dataset_configuration : Optional[str] - Path to dataset configuration file - algorithm_configuration : Optional[str] - Path to algorithm configuration directory or file - algorithms : Optional[str] - Comma-separated list of algorithms to run - groups : Optional[str] - Comma-separated list of groups to run - algo_groups : Optional[str] - Comma-separated list of algorithm groups - count : int - Number of neighbors (k) - batch_size : int - Batch size for search - subset_size : Optional[int] - Dataset subset size **kwargs - Additional keyword arguments. In tune mode, the orchestrator - passes these internal kwargs: - - - _tune_mode : bool - If True, returns single config with exact params instead of - Cartesian product expansion from YAML. - - _tune_build_params : dict - Exact build parameters suggested by Optuna - (e.g., {"nlist": 5347}) - - _tune_search_params : dict - Exact search parameters suggested by Optuna - (e.g., {"nprobe": 73}) + C++ specific arguments: algorithm_configuration, algorithms, + groups, algo_groups, count, batch_size, subset_size, + and tune mode kwargs (_tune_mode, _tune_build_params, + _tune_search_params) Returns ------- - Tuple[DatasetConfig, List[BenchmarkConfig]] - Dataset config and list of benchmark configs. 
- - Sweep mode: Multiple configs (Cartesian product of YAML params) - - Tune mode: Single config (exact Optuna-suggested params) + List[BenchmarkConfig] + Benchmark configs grouped by executable """ # Extract tune mode kwargs (passed by orchestrator._run_trial) tune_mode = kwargs.pop("_tune_mode", False) tune_build_params = kwargs.pop("_tune_build_params", None) tune_search_params = kwargs.pop("_tune_search_params", None) - config_path = self.config_path + algorithm_configuration = kwargs.get("algorithm_configuration") + algorithms = kwargs.get("algorithms") + groups = kwargs.get("groups") + algo_groups = kwargs.get("algo_groups") + count = kwargs.get("count", 10) + batch_size = kwargs.get("batch_size", 10000) + subset_size = kwargs.get("subset_size") + executable_dir = kwargs.get("executable_dir") - # Load dataset configuration - dataset_conf_all = self.load_yaml_file( - dataset_configuration - or os.path.join(config_path, "datasets", "datasets.yaml") - ) - dataset_conf = self.get_dataset_configuration( - dataset, dataset_conf_all - ) - - # Create DatasetConfig - dataset_config = DatasetConfig( - name=dataset_conf["name"], - base_file=dataset_conf.get("base_file"), - query_file=dataset_conf.get("query_file"), - groundtruth_neighbors_file=dataset_conf.get( - "groundtruth_neighbors_file" - ), - distance=dataset_conf.get("distance", "euclidean"), - dims=dataset_conf.get("dims"), - subset_size=subset_size, - ) + config_path = self.config_path # Prepare conf_file for constraint validation conf_file = {"dataset": dataset_conf} @@ -332,9 +498,10 @@ def load( tune_mode=tune_mode, tune_build_params=tune_build_params, tune_search_params=tune_search_params, + executable_dir=executable_dir, ) - return dataset_config, benchmark_configs + return benchmark_configs def _prepare_benchmark_configs( self, @@ -349,6 +516,7 @@ def _prepare_benchmark_configs( tune_mode: bool = False, tune_build_params: dict = None, tune_search_params: dict = None, + executable_dir: Optional[str] = None, ) -> List[BenchmarkConfig]: """ Prepare list of BenchmarkConfig from algorithm configurations. @@ -373,7 +541,12 @@ def _prepare_benchmark_configs( try: executable, executable_path, file_name = ( self.find_executable( - algos_yaml, algo, group, count, batch_size + algos_yaml, + algo, + group, + count, + batch_size, + executable_dir, ) ) except FileNotFoundError: @@ -448,93 +621,9 @@ def _prepare_benchmark_configs( return benchmark_configs # ========================================================================= - # `Helper methods (copied from run.py as-is)` + # C++ specific helper methods # ========================================================================= - def load_yaml_file(self, file_path: str) -> dict: - """ - Load a YAML file and return its contents as a dictionary. - - Parameters - ---------- - file_path : str - The path to the YAML file. - - Returns - ------- - dict - The contents of the YAML file. - """ - with open(file_path, "r") as f: - return yaml.safe_load(f) - - def get_dataset_configuration( - self, dataset: str, dataset_conf_all: list - ) -> dict: - """ - Retrieve the configuration for a specific dataset. - - Parameters - ---------- - dataset : str - The name of the dataset to retrieve the configuration for. - dataset_conf_all : list - A list of dataset configurations. - - Returns - ------- - dict - The configuration for the specified dataset. - - Raises - ------ - ValueError - If the dataset configuration is not found. 
- """ - for d in dataset_conf_all: - if dataset == d["name"]: - return d - raise ValueError("Could not find a dataset configuration") - - def gather_algorithm_configs( - self, config_path: str, algorithm_configuration: Optional[str] - ) -> list: - """ - Gather the list of algorithm configuration files. - - Parameters - ---------- - config_path : str - The path to the config directory. - algorithm_configuration : Optional[str] - The path to the algorithm configuration directory or file. - - Returns - ------- - list - A list of paths to the algorithm configuration files. - """ - algos_conf_fs = os.listdir(os.path.join(config_path, "algos")) - algos_conf_fs = [ - os.path.join(config_path, "algos", f) - for f in algos_conf_fs - if ".json" not in f - and "constraint" not in f - and ".py" not in f - and "__pycache__" not in f - ] - - if algorithm_configuration: - if os.path.isdir(algorithm_configuration): - algos_conf_fs += [ - os.path.join(algorithm_configuration, f) - for f in os.listdir(algorithm_configuration) - if ".json" not in f - ] - elif os.path.isfile(algorithm_configuration): - algos_conf_fs.append(algorithm_configuration) - return algos_conf_fs - def load_algorithms_conf( self, algos_conf_fs: list, @@ -620,7 +709,13 @@ def validate_algorithm( ) def find_executable( - self, algos_conf: dict, algo: str, group: str, k: int, batch_size: int + self, + algos_conf: dict, + algo: str, + group: str, + k: int, + batch_size: int, + executable_dir: Optional[str] = None, ) -> Tuple[str, str, Tuple[str, str]]: """ Find the executable for the given algorithm and group. @@ -637,6 +732,8 @@ def find_executable( The number of nearest neighbors to search for. batch_size : int The size of each batch for processing. + executable_dir : Optional[str] + User-specified directory to search first. Returns ------- @@ -646,12 +743,14 @@ def find_executable( """ executable = algos_conf[algo]["executable"] file_name = (f"{algo},{group}", f"{algo},{group},k{k},bs{batch_size}") - build_path = self.get_build_path(executable) + build_path = self.get_build_path(executable, executable_dir) if build_path: return executable, build_path, file_name raise FileNotFoundError(executable) - def get_build_path(self, executable: str) -> Optional[str]: + def get_build_path( + self, executable: str, executable_dir: Optional[str] = None + ) -> Optional[str]: """ Get the build path for the given executable. @@ -659,12 +758,20 @@ def get_build_path(self, executable: str) -> Optional[str]: ---------- executable : str The name of the executable. + executable_dir : Optional[str] + User-specified directory to search first. If provided and the + executable exists there, it is used before auto-discovery. Returns ------- Optional[str] The build path for the executable, if found. 
""" + if executable_dir is not None: + build_path = os.path.join(executable_dir, executable) + if os.path.exists(build_path): + print(f"-- Using cuVS bench from {build_path}.") + return build_path devcontainer_path = "/home/coder/cuvs/cpp/build/latest/bench/ann" if os.path.exists(devcontainer_path): @@ -747,40 +854,19 @@ def prepare_indexes( search_params = group_conf.get("search", {}) if tune_mode and tune_build_params is not None: - # TUNE MODE: Use exact params from Optuna (single config) - param_names = list(tune_build_params.keys()) - all_build_params = [tuple(tune_build_params.values())] - # For search params, use tune_search_params keys/values - if tune_search_params: - search_param_names = tuple(tune_search_params.keys()) - search_param_lists = [[v] for v in tune_search_params.values()] - else: - search_param_names, search_param_lists = [], [] - else: - # SWEEP MODE: Cartesian product of all YAML params - all_build_params = itertools.product(*build_params.values()) - search_param_names, search_param_lists = ( - zip(*search_params.items()) if search_params else ([], []) + all_build_params = [tune_build_params.copy()] + all_search_params = ( + [tune_search_params.copy()] if tune_search_params else [{}] ) - param_names = list(build_params.keys()) - for params in all_build_params: - # Build the build_param dict using appropriate keys - if tune_mode and tune_build_params is not None: - # Tune mode: use exact params from Optuna - index = { - "algo": algo, - "build_param": tune_build_params.copy(), - } - else: - # Sweep mode: use YAML param names - index = { - "algo": algo, - "build_param": dict(zip(param_names, params)), - } + else: + all_build_params = expand_param_grid(build_params) + all_search_params = expand_param_grid(search_params) + + for build_param in all_build_params: + index = {"algo": algo, "build_param": build_param} - # Build index name from params index_name = f"{algo}_{group}" if group != "base" else f"{algo}" - for name, val in zip(param_names, params): + for name, val in build_param.items(): index_name += f".{name}{val}" # Skip constraint validation in tune mode (Optuna handles bounds) @@ -789,7 +875,7 @@ def prepare_indexes( algos_conf, algo, "build", - index["build_param"], + build_param, None, conf_file["dataset"].get("dims"), count, @@ -807,16 +893,12 @@ def prepare_indexes( # Handle search params if tune_mode and tune_search_params is not None: - # Tune mode: use exact search params from Optuna (as single-item list) index["search_params"] = [tune_search_params.copy()] else: - # Sweep mode: validate and expand search params index["search_params"] = self.validate_search_params( - itertools.product(*search_param_lists), - search_param_names, - index["build_param"], + all_search_params, + build_param, algo, - group_conf, algos_conf, conf_file, count, @@ -830,29 +912,26 @@ def prepare_indexes( def validate_search_params( self, all_search_params, - search_param_names, build_params, algo, - group_conf, algos_conf, conf_file, count, batch_size, ) -> list: """ - Validate and prepare the search parameters for the given algorithm - and group. + Validate and filter search parameter combinations. Parameters ---------- - all_search_params : itertools.product - The Cartesian product of search parameter values. - search_param_names : list - The names of the search parameters. + all_search_params : List[Dict[str, Any]] + List of search parameter dicts (from expand_param_grid). + build_params : dict + The build parameters for the current index. algo : str The name of the algorithm. 
- group_conf : dict - The configuration for the algorithm group. + algos_conf : dict + The loaded algorithm configurations. conf_file : dict The main configuration file. count : int @@ -863,11 +942,10 @@ def validate_search_params( Returns ------- list - A list of validated search parameters. + A list of validated search parameter dicts. """ search_params_list = [] - for search_params in all_search_params: - search_dict = dict(zip(search_param_names, search_params)) + for search_dict in all_search_params: if self.validate_constraints( algos_conf, algo, diff --git a/python/cuvs_bench/cuvs_bench/orchestrator/orchestrator.py b/python/cuvs_bench/cuvs_bench/orchestrator/orchestrator.py index 091290979c..d76127d3b2 100644 --- a/python/cuvs_bench/cuvs_bench/orchestrator/orchestrator.py +++ b/python/cuvs_bench/cuvs_bench/orchestrator/orchestrator.py @@ -20,6 +20,7 @@ get_config_loader, list_backends, ) +from ..backends.utils import compute_recall from .config_loaders import DatasetConfig @@ -253,6 +254,19 @@ def _run_sweep( search_threads=search_threads, dry_run=dry_run, ) + + # Compute recall for backends that return actual neighbors. + # The C++ backend computes recall in the subprocess and returns + # empty neighbors, so this is skipped for it. + # Note: The recall computation relies on empty neighbors to + # distinguish backends that already computed recall. + if search_result.success and search_result.neighbors.size > 0: + gt = bench_dataset.groundtruth_neighbors + if gt is not None: + search_result.recall = compute_recall( + search_result.neighbors, gt, count + ) + results.append(search_result) if not search_result.success: @@ -575,6 +589,14 @@ def _run_trial( dry_run=dry_run, ) + # Compute recall for backends that return actual neighbors. + # Note: The recall computation relies on empty neighbors to + # distinguish backends that already computed recall. + if result.success and result.neighbors.size > 0: + gt = bench_dataset.groundtruth_neighbors + if gt is not None: + result.recall = compute_recall(result.neighbors, gt, count) + return result def _create_dataset(self, dataset_config: DatasetConfig) -> Dataset: @@ -618,98 +640,3 @@ def available_backends() -> List[str]: List of registered backend names """ return list(list_backends().keys()) - - -# ============================================================================ -# Standalone function for backward compatibility -# ============================================================================ - - -def run_benchmark( - subset_size: int = None, - count: int = 10, - batch_size: int = 10000, - dataset_configuration: Optional[str] = None, - configuration: Optional[str] = None, - dataset: str = None, - dataset_path: str = None, - build: Optional[bool] = None, - search: Optional[bool] = None, - algorithms: Optional[str] = None, - groups: Optional[str] = None, - algo_groups: Optional[str] = None, - force: bool = False, - search_mode: str = "latency", - search_threads: Optional[int] = None, - dry_run: bool = False, - data_export: bool = False, - backend_type: str = "cpp_gbench", -) -> List[Union[BuildResult, SearchResult]]: - """ - Standalone function for backward compatibility with run.py interface. - - Parameters - ---------- - subset_size : int - The subset size of the dataset. - count : int - The number of nearest neighbors to search for. - batch_size : int - The size of each batch for processing. - dataset_configuration : Optional[str] - Path to the dataset configuration file. 
- configuration : Optional[str] - Path to the algorithm configuration directory or file. - dataset : str - The name of the dataset to use. - dataset_path : str - The path to the dataset directory. - build : Optional[bool] - Whether to build the indices. - search : Optional[bool] - Whether to perform the search. - algorithms : Optional[str] - Comma-separated list of algorithm names to use. - groups : str - Comma-separated list of groups to consider. - algo_groups : Optional[str] - Comma-separated list of algorithm groups to consider. - force : bool - Whether to force the execution regardless of warnings. - search_mode : str - The mode of search to perform. - search_threads : int - The number of threads to use for searching. - dry_run : bool - Whether to perform a dry run without actual execution. - data_export : bool - Whether to export data (unused, kept for compatibility). - backend_type : str - Type of backend to use (default: 'cpp_gbench') - - Returns - ------- - List[Union[BuildResult, SearchResult]] - List of all build and search results. - """ - orchestrator = BenchmarkOrchestrator(backend_type=backend_type) - - return orchestrator.run_benchmark( - build=build if build is not None else True, - search=search if search is not None else True, - force=force, - dry_run=dry_run, - count=count, - batch_size=batch_size, - search_mode=search_mode, - search_threads=search_threads, - # ConfigLoader-specific kwargs (for cpp_gbench) - dataset=dataset, - dataset_path=dataset_path, - dataset_configuration=dataset_configuration, - configuration=configuration, - algorithms=algorithms, - groups=groups, - algo_groups=algo_groups, - subset_size=subset_size, - ) diff --git a/python/cuvs_bench/cuvs_bench/run/__main__.py b/python/cuvs_bench/cuvs_bench/run/__main__.py index 47c41a903e..6950ff7202 100644 --- a/python/cuvs_bench/cuvs_bench/run/__main__.py +++ b/python/cuvs_bench/cuvs_bench/run/__main__.py @@ -3,15 +3,16 @@ # SPDX-License-Identifier: Apache-2.0 # +import json import os -import warnings from pathlib import Path from typing import Optional import click +import yaml from .data_export import convert_json_to_csv_build, convert_json_to_csv_search -from .run import run_benchmark +from ..orchestrator import BenchmarkOrchestrator @click.command() @@ -150,6 +151,36 @@ "were interrupted, use this option to convert those intermediate " "files manually.", ) +@click.option( + "--mode", + type=click.Choice(["sweep", "tune"], case_sensitive=False), + default="sweep", + show_default=True, + help="Benchmark mode: 'sweep' runs all parameter combinations from " + "YAML configs. 'tune' uses Optuna to intelligently search the " + "parameter space (requires --constraints).", +) +@click.option( + "--constraints", + default=None, + help="Tune mode constraints as JSON string. One metric should have " + "'maximize' or 'minimize', others have min/max bounds. " + 'Example: \'{"recall": "maximize", "latency": {"max": 10}}\'', +) +@click.option( + "--n-trials", + type=int, + default=None, + help="Number of Optuna trials for tune mode (default: 100).", +) +@click.option( + "--backend-config", + default=None, + help="Path to YAML configuration file for non-C++ backends. " + "The file must contain a 'backend' field specifying the backend " + "type (e.g., 'opensearch', 'elastic'). 
All other fields are " + "passed as backend-specific parameters.", +) def main( subset_size: Optional[int], count: int, @@ -169,6 +200,10 @@ def main( search_threads: Optional[str], dry_run: bool, data_export: bool, + mode: str, + constraints: Optional[str], + n_trials: Optional[int], + backend_config: Optional[str], ) -> None: """ Main function to run the benchmark with the provided options. @@ -207,17 +242,64 @@ def main( The number of threads to use for throughput benchmark. dry_run : bool Whether to perform a dry run without actual execution. + data_export : bool + Whether to export intermediate JSON results to CSV. + mode : str + Benchmark mode: 'sweep' (exhaustive) or 'tune' (Optuna-based). + constraints : Optional[str] + Tune mode constraints as JSON string. + n_trials : Optional[int] + Number of Optuna trials for tune mode. + backend_config : Optional[str] + Path to YAML config for non-C++ backends. If not provided, + defaults to the C++ Google Benchmark backend. The YAML file + must contain a 'backend' field (e.g., 'opensearch', 'elastic') + and any backend-specific connection parameters (host, port, etc.). """ - warnings.warn( - "The 'cuvs_bench.run' CLI is deprecated and will be removed in a future release. " - "Use BenchmarkOrchestrator from cuvs_bench.orchestrator instead.", - FutureWarning, - stacklevel=2, - ) - if not data_export: - run_benchmark(**locals()) + # Determine backend type and extra kwargs from --backend-config + backend_type = "cpp_gbench" + backend_kwargs = {} + if backend_config: + with open(backend_config, "r") as f: + cfg = yaml.safe_load(f) + if not isinstance(cfg, dict): + raise ValueError( + f"--backend-config must parse to a mapping, " + f"got {type(cfg).__name__}" + ) + if "backend" not in cfg: + raise ValueError( + "--backend-config must include a 'backend' field" + ) + backend_type = cfg.pop("backend") + backend_kwargs = cfg + + orchestrator = BenchmarkOrchestrator(backend_type=backend_type) + orchestrator.run_benchmark( + mode=mode, + constraints=json.loads(constraints) if constraints else None, + n_trials=n_trials, + dataset=dataset, + dataset_path=dataset_path, + build=build, + search=search, + force=force, + dry_run=dry_run, + count=count, + batch_size=batch_size, + search_mode=search_mode, + search_threads=search_threads, + dataset_configuration=dataset_configuration, + algorithm_configuration=configuration, + algorithms=algorithms, + groups=groups, + algo_groups=algo_groups, + subset_size=subset_size, + executable_dir=executable_dir, + **backend_kwargs, + ) convert_json_to_csv_build(dataset, dataset_path) convert_json_to_csv_search(dataset, dataset_path) diff --git a/python/cuvs_bench/cuvs_bench/tests/test_cli.py b/python/cuvs_bench/cuvs_bench/tests/test_cli.py index c3f8d43856..fa2a63d041 100644 --- a/python/cuvs_bench/cuvs_bench/tests/test_cli.py +++ b/python/cuvs_bench/cuvs_bench/tests/test_cli.py @@ -517,3 +517,169 @@ def test_plot_command_creates_png_files(temp_datasets_dir: Path): assert file_path.stat().st_size > 0, ( f"Expected file {filename} is empty." ) + + +# FIXME: Tests below use --dry-run to verify CLI flag parsing and orchestrator +# routing without requiring actual benchmark execution. Tune mode (--mode tune) +# requires Optuna and actual search results, so only flag acceptance is tested +# here. End-to-end tests for tune mode and non-C++ backends should be added +# when those features are exercised in integration testing. 
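+#
+# For reference, a minimal --backend-config YAML (the connection values
+# below are hypothetical) looks like:
+#
+#     backend: opensearch
+#     host: localhost
+#     port: 9200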
+ + +def test_run_with_mode_sweep(temp_datasets_dir): + """Verify --mode sweep is accepted (default behavior).""" + from cuvs_bench.run.__main__ import main as run_main + + runner = CliRunner() + result = runner.invoke( + run_main, + [ + "--dataset", + "test-data", + "--dataset-path", + str(temp_datasets_dir), + "--algorithms", + "cuvs_cagra", + "--batch-size", + "10", + "-k", + "10", + "--groups", + "test", + "-m", + "latency", + "--mode", + "sweep", + "--dry-run", + ], + ) + assert result.exit_code == 0, ( + f"--mode sweep failed with output:\n{result.output}" + ) + + +def test_run_with_backend_config(temp_datasets_dir, tmp_path): + """Verify --backend-config YAML is parsed correctly.""" + from cuvs_bench.run.__main__ import main as run_main + + config_file = tmp_path / "backend.yaml" + config_file.write_text("backend: cpp_gbench\n") + runner = CliRunner() + result = runner.invoke( + run_main, + [ + "--dataset", + "test-data", + "--dataset-path", + str(temp_datasets_dir), + "--algorithms", + "cuvs_cagra", + "--batch-size", + "10", + "-k", + "10", + "--groups", + "test", + "-m", + "latency", + "--backend-config", + str(config_file), + "--dry-run", + ], + ) + assert result.exit_code == 0, ( + f"--backend-config failed with output:\n{result.output}" + ) + + +def test_run_with_invalid_backend_config(tmp_path): + """Verify missing backend config file raises error.""" + from cuvs_bench.run.__main__ import main as run_main + + runner = CliRunner() + result = runner.invoke( + run_main, + [ + "--dataset", + "test-data", + "--dataset-path", + str(tmp_path), + "--algorithms", + "cuvs_cagra", + "--batch-size", + "10", + "-k", + "10", + "--groups", + "test", + "-m", + "latency", + "--backend-config", + "/nonexistent/config.yaml", + ], + ) + assert result.exit_code != 0 + + +def test_run_with_n_trials_flag(temp_datasets_dir): + """Verify --n-trials flag is accepted by the CLI parser.""" + from cuvs_bench.run.__main__ import main as run_main + + runner = CliRunner() + result = runner.invoke( + run_main, + [ + "--dataset", + "test-data", + "--dataset-path", + str(temp_datasets_dir), + "--algorithms", + "cuvs_cagra", + "--batch-size", + "10", + "-k", + "10", + "--groups", + "test", + "-m", + "latency", + "--n-trials", + "50", + "--dry-run", + ], + ) + assert result.exit_code == 0, ( + f"--n-trials failed with output:\n{result.output}" + ) + + +def test_run_with_constraints_flag(temp_datasets_dir): + """Verify --constraints flag accepts valid JSON.""" + from cuvs_bench.run.__main__ import main as run_main + + runner = CliRunner() + result = runner.invoke( + run_main, + [ + "--dataset", + "test-data", + "--dataset-path", + str(temp_datasets_dir), + "--algorithms", + "cuvs_cagra", + "--batch-size", + "10", + "-k", + "10", + "--groups", + "test", + "-m", + "latency", + "--constraints", + '{"recall": "maximize", "latency": {"max": 10}}', + "--dry-run", + ], + ) + assert result.exit_code == 0, ( + f"--constraints failed with output:\n{result.output}" + ) diff --git a/python/cuvs_bench/cuvs_bench/tests/test_registry.py b/python/cuvs_bench/cuvs_bench/tests/test_registry.py index 2bf7433f4f..def7cc478b 100644 --- a/python/cuvs_bench/cuvs_bench/tests/test_registry.py +++ b/python/cuvs_bench/cuvs_bench/tests/test_registry.py @@ -8,7 +8,6 @@ import pytest import numpy as np -from pathlib import Path from cuvs_bench.backends import ( Dataset, @@ -27,31 +26,38 @@ class DummyBackend(BenchmarkBackend): @property def algo(self) -> str: - """Return dummy algorithm name.""" return "dummy_algo" - def build(self, dataset, 
build_params, index_path, force=False): + def build(self, dataset, indexes, force=False, dry_run=False): + if not indexes: + raise ValueError("indexes must not be empty") + first = indexes[0] return BuildResult( - index_path=str(index_path), + index_path=first.file, build_time_seconds=1.0, index_size_bytes=1000, - algorithm=self.name, - build_params=build_params, + algorithm=self.algo, + build_params=first.build_param, success=True, ) def search( self, dataset, - search_params, - index_path, + indexes, k, batch_size=10000, - mode="throughput", + mode="latency", + force=False, + search_threads=None, + dry_run=False, ): + if not indexes: + raise ValueError("indexes must not be empty") n_queries = dataset.n_queries neighbors = np.random.randint(0, dataset.n_base, size=(n_queries, k)) distances = np.random.rand(n_queries, k) + first = indexes[0] return SearchResult( neighbors=neighbors, @@ -59,41 +65,49 @@ def search( search_time_ms=0.1, queries_per_second=n_queries / 0.1, recall=0.95, - algorithm=self.name, - search_params=search_params, + algorithm=self.algo, + search_params=first.search_params, success=True, ) - @property - def name(self) -> str: - return "dummy" - class AnotherDummyBackend(BenchmarkBackend): """Another dummy backend for testing.""" - def build(self, dataset, build_params, index_path, force=False): + @property + def algo(self) -> str: + return "another_dummy_algo" + + def build(self, dataset, indexes, force=False, dry_run=False): + if not indexes: + raise ValueError("indexes must not be empty") + first = indexes[0] return BuildResult( - index_path=str(index_path), + index_path=first.file, build_time_seconds=2.0, index_size_bytes=2000, - algorithm=self.name, - build_params=build_params, + algorithm=self.algo, + build_params=first.build_param, success=True, ) def search( self, dataset, - search_params, - index_path, + indexes, k, batch_size=10000, - mode="throughput", + mode="latency", + force=False, + search_threads=None, + dry_run=False, ): + if not indexes: + raise ValueError("indexes must not be empty") n_queries = dataset.n_queries neighbors = np.random.randint(0, dataset.n_base, size=(n_queries, k)) distances = np.random.rand(n_queries, k) + first = indexes[0] return SearchResult( neighbors=neighbors, @@ -101,15 +115,11 @@ def search( search_time_ms=0.2, queries_per_second=n_queries / 0.2, recall=0.90, - algorithm=self.name, - search_params=search_params, + algorithm=self.algo, + search_params=first.search_params if first else [], success=True, ) - @property - def name(self) -> str: - return "another_dummy" - class TestDataset: """Tests for Dataset dataclass.""" @@ -294,7 +304,7 @@ def test_get_backend(self): registry = BackendRegistry() registry.register("dummy", DummyBackend) - backend = registry.get_backend("dummy", config={}) + backend = registry.get_backend("dummy", config={"name": "dummy"}) assert isinstance(backend, DummyBackend) assert backend.name == "dummy" @@ -324,9 +334,11 @@ def test_list_backends(self): class TestBackendIntegration: """Integration tests for backends.""" - def test_dummy_backend_build(self): + def test_dummy_backend_build(self, tmp_path): """Test dummy backend build.""" - backend = DummyBackend(config={}) + from cuvs_bench.orchestrator.config_loaders import IndexConfig + + backend = DummyBackend(config={"name": "dummy"}) base = np.random.rand(1000, 128).astype(np.float32) queries = np.random.rand(100, 128).astype(np.float32) @@ -334,19 +346,27 @@ def test_dummy_backend_build(self): name="test", base_vectors=base, query_vectors=queries ) - 
result = backend.build( - dataset=dataset, - build_params={"nlist": 1024}, - index_path=Path("/tmp/test_index"), - ) + indexes = [ + IndexConfig( + name="test_index", + algo="dummy_algo", + build_param={"nlist": 1024}, + search_params=[{"nprobe": 10}], + file=str(tmp_path / "test_index"), + ) + ] + + result = backend.build(dataset=dataset, indexes=indexes) assert result.success - assert result.algorithm == "dummy" + assert result.algorithm == "dummy_algo" assert result.build_params["nlist"] == 1024 - def test_dummy_backend_search(self): + def test_dummy_backend_search(self, tmp_path): """Test dummy backend search.""" - backend = DummyBackend(config={}) + from cuvs_bench.orchestrator.config_loaders import IndexConfig + + backend = DummyBackend(config={"name": "dummy"}) base = np.random.rand(1000, 128).astype(np.float32) queries = np.random.rand(100, 128).astype(np.float32) @@ -354,12 +374,17 @@ def test_dummy_backend_search(self): name="test", base_vectors=base, query_vectors=queries ) - result = backend.search( - dataset=dataset, - search_params=[{"nprobe": 10}], - index_path=Path("/tmp/test_index"), - k=10, - ) + indexes = [ + IndexConfig( + name="test_index", + algo="dummy_algo", + build_param={"nlist": 1024}, + search_params=[{"nprobe": 10}], + file=str(tmp_path / "test_index"), + ) + ] + + result = backend.search(dataset=dataset, indexes=indexes, k=10) assert result.success assert result.recall == 0.95 @@ -389,7 +414,7 @@ def test_get_backend_global(self): """Test getting backend via global function.""" register_backend("dummy_global2", DummyBackend) - backend = get_backend("dummy_global2", config={}) + backend = get_backend("dummy_global2", config={"name": "dummy"}) assert isinstance(backend, DummyBackend) assert backend.name == "dummy" diff --git a/python/cuvs_bench/cuvs_bench/tests/test_utils.py b/python/cuvs_bench/cuvs_bench/tests/test_utils.py new file mode 100644 index 0000000000..21bfb5ce6a --- /dev/null +++ b/python/cuvs_bench/cuvs_bench/tests/test_utils.py @@ -0,0 +1,423 @@ +# +# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. +# SPDX-License-Identifier: Apache-2.0 + +""" +Unit tests for shared backend utilities and Dataset transparent loading. 
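+
+Binary test fixtures follow the big-ann-bench layout produced by
+``_write_test_bin`` below: two uint32 header words (n_rows, n_cols)
+followed by row-major vector data.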
+""" + +import numpy as np +import pytest +import yaml + +from cuvs_bench.backends import Dataset +from cuvs_bench.backends.utils import ( + compute_recall, + dtype_from_filename, + expand_param_grid, + load_vectors, +) +from cuvs_bench.orchestrator.config_loaders import CppGBenchConfigLoader + + +def _write_test_bin(path, data): + """Write a numpy array in big-ann-bench binary format.""" + with open(path, "wb") as f: + np.asarray(data.shape, dtype=np.uint32).tofile(f) + data.tofile(f) + + +class TestDtypeFromFilename: + """Tests for dtype_from_filename.""" + + def test_fbin(self): + """Test .fbin maps to float32.""" + assert dtype_from_filename("vectors.fbin") == np.float32 + + def test_f16bin(self): + """Test .f16bin maps to float16.""" + assert dtype_from_filename("vectors.f16bin") == np.float16 + + def test_ibin(self): + """Test .ibin maps to int32.""" + assert dtype_from_filename("groundtruth.ibin") == np.int32 + + def test_u8bin(self): + """Test .u8bin maps to uint8.""" + assert dtype_from_filename("vectors.u8bin") == np.ubyte + + def test_i8bin(self): + """Test .i8bin maps to int8.""" + assert dtype_from_filename("vectors.i8bin") == np.byte + + def test_unsupported_extension(self): + """Test that unsupported extensions raise RuntimeError.""" + with pytest.raises(RuntimeError, match="Unsupported file extension"): + dtype_from_filename("vectors.txt") + + def test_full_path(self): + """Test that full paths are handled correctly.""" + assert ( + dtype_from_filename("/data/datasets/sift/base.fbin") == np.float32 + ) + + +class TestLoadVectors: + """Tests for load_vectors.""" + + def test_load_float32(self, tmp_path): + """Test loading float32 vectors.""" + data = np.random.rand(100, 128).astype(np.float32) + path = str(tmp_path / "test.fbin") + _write_test_bin(path, data) + + loaded = load_vectors(path) + np.testing.assert_array_equal(loaded, data) + + def test_load_int32(self, tmp_path): + """Test loading int32 vectors.""" + data = np.random.randint(0, 1000, size=(50, 10)).astype(np.int32) + path = str(tmp_path / "test.ibin") + _write_test_bin(path, data) + + loaded = load_vectors(path) + np.testing.assert_array_equal(loaded, data) + + def test_load_uint8(self, tmp_path): + """Test loading uint8 vectors.""" + data = np.random.randint(0, 255, size=(30, 64)).astype(np.uint8) + path = str(tmp_path / "test.u8bin") + _write_test_bin(path, data) + + loaded = load_vectors(path) + np.testing.assert_array_equal(loaded, data) + + def test_load_int8(self, tmp_path): + """Test loading int8 vectors.""" + data = np.random.randint(-128, 127, size=(30, 64)).astype(np.int8) + path = str(tmp_path / "test.i8bin") + _write_test_bin(path, data) + + loaded = load_vectors(path) + np.testing.assert_array_equal(loaded, data) + + def test_subset_size(self, tmp_path): + """Test that subset_size limits the number of rows loaded.""" + data = np.random.rand(100, 128).astype(np.float32) + path = str(tmp_path / "test.fbin") + _write_test_bin(path, data) + + loaded = load_vectors(path, subset_size=10) + assert loaded.shape == (10, 128) + np.testing.assert_array_equal(loaded, data[:10]) + + def test_subset_size_larger_than_file(self, tmp_path): + """Test that subset_size larger than file returns all rows.""" + data = np.random.rand(5, 16).astype(np.float32) + path = str(tmp_path / "test.fbin") + _write_test_bin(path, data) + + loaded = load_vectors(path, subset_size=100) + assert loaded.shape == (5, 16) + np.testing.assert_array_equal(loaded, data) + + def test_negative_subset_size(self, tmp_path): + """Test that 
negative subset_size raises ValueError.""" + data = np.random.rand(10, 4).astype(np.float32) + path = str(tmp_path / "test.fbin") + _write_test_bin(path, data) + + with pytest.raises( + ValueError, match="subset_size must be a positive integer" + ): + load_vectors(path, subset_size=-1) + + def test_zero_subset_size(self, tmp_path): + """Test that zero subset_size raises ValueError.""" + data = np.random.rand(10, 4).astype(np.float32) + path = str(tmp_path / "test.fbin") + _write_test_bin(path, data) + + with pytest.raises( + ValueError, match="subset_size must be a positive integer" + ): + load_vectors(path, subset_size=0) + + def test_unsupported_extension(self, tmp_path): + """Test that unsupported file extensions raise RuntimeError.""" + path = str(tmp_path / "test.txt") + with open(path, "wb") as f: + f.write(b"dummy") + + with pytest.raises(RuntimeError, match="Unsupported file extension"): + load_vectors(path) + + def test_truncated_header(self, tmp_path): + """Test that truncated headers raise ValueError.""" + path = str(tmp_path / "test.fbin") + with open(path, "wb") as f: + f.write(b"\x00\x00") + + with pytest.raises( + ValueError, match="File too small to contain a valid header" + ): + load_vectors(path) + + def test_truncated_data(self, tmp_path): + """Test that truncated data raises ValueError.""" + path = str(tmp_path / "test.fbin") + with open(path, "wb") as f: + np.array([10, 4], dtype=np.uint32).tofile(f) + np.random.rand(5, 4).astype(np.float32).tofile(f) + + with pytest.raises(ValueError, match="File is truncated"): + load_vectors(path) + + def test_file_not_found(self): + """Test that missing files raise FileNotFoundError.""" + with pytest.raises(FileNotFoundError): + load_vectors("/nonexistent/path/vectors.fbin") + + +class TestDatasetLazyLoading: + """Tests for Dataset transparent vector loading.""" + + def test_lazy_load_base_vectors(self, tmp_path): + """Test that base vectors are loaded from file on first access.""" + data = np.random.rand(50, 32).astype(np.float32) + path = str(tmp_path / "base.fbin") + _write_test_bin(path, data) + + dataset = Dataset(name="test", base_file=path) + np.testing.assert_array_equal(dataset.base_vectors, data) + + def test_lazy_load_query_vectors(self, tmp_path): + """Test that query vectors are loaded from file on first access.""" + data = np.random.rand(10, 32).astype(np.float32) + path = str(tmp_path / "query.fbin") + _write_test_bin(path, data) + + dataset = Dataset(name="test", query_file=path) + np.testing.assert_array_equal(dataset.query_vectors, data) + + def test_lazy_load_groundtruth(self, tmp_path): + """Test that ground truth is loaded from file on first access.""" + data = np.random.randint(0, 100, size=(10, 5)).astype(np.int32) + path = str(tmp_path / "gt.ibin") + _write_test_bin(path, data) + + dataset = Dataset(name="test", groundtruth_neighbors_file=path) + np.testing.assert_array_equal(dataset.groundtruth_neighbors, data) + + def test_preloaded_vectors_returned_directly(self): + """Test that pre-loaded vectors are returned without file access.""" + base = np.random.rand(20, 16).astype(np.float32) + query = np.random.rand(5, 16).astype(np.float32) + + dataset = Dataset( + name="test", + base_vectors=base, + query_vectors=query, + ) + np.testing.assert_array_equal(dataset.base_vectors, base) + np.testing.assert_array_equal(dataset.query_vectors, query) + + def test_no_file_returns_empty(self): + """Test that Dataset without files returns empty arrays.""" + dataset = Dataset(name="test") + assert 
dataset.base_vectors.size == 0 + assert dataset.query_vectors.size == 0 + assert dataset.groundtruth_neighbors is None + + def test_lazy_load_with_subset_size(self, tmp_path): + """Test that subset_size is respected during lazy loading.""" + data = np.random.rand(100, 32).astype(np.float32) + path = str(tmp_path / "base.fbin") + _write_test_bin(path, data) + + dataset = Dataset( + name="test", + base_file=path, + metadata={"subset_size": 10}, + ) + assert dataset.base_vectors.shape == (10, 32) + np.testing.assert_array_equal(dataset.base_vectors, data[:10]) + + def test_file_path_still_accessible(self): + """Test that file paths are accessible without triggering loading.""" + dataset = Dataset( + name="test", + base_file="/path/to/base.fbin", + query_file="/path/to/query.fbin", + ) + assert dataset.base_file == "/path/to/base.fbin" + assert dataset.query_file == "/path/to/query.fbin" + assert dataset.name == "test" + + def test_file_path_access_does_not_trigger_loading(self, tmp_path): + """Test that accessing file paths does not load vectors into memory.""" + data = np.random.rand(50, 32).astype(np.float32) + path = str(tmp_path / "base.fbin") + _write_test_bin(path, data) + + dataset = Dataset(name="test", base_file=path) + + _ = dataset.base_file + _ = dataset.name + _ = dataset.distance_metric + + assert dataset._base_vectors.size == 0 + + def test_dims_and_counts(self, tmp_path): + """Test dims, n_base, and n_queries properties.""" + data = np.random.rand(50, 32).astype(np.float32) + path = str(tmp_path / "base.fbin") + _write_test_bin(path, data) + + queries = np.random.rand(10, 32).astype(np.float32) + qpath = str(tmp_path / "query.fbin") + _write_test_bin(qpath, queries) + + dataset = Dataset(name="test", base_file=path, query_file=qpath) + assert dataset.dims == 32 + assert dataset.n_base == 50 + assert dataset.n_queries == 10 + + def test_caching(self, tmp_path): + """Test that vectors are loaded once and cached.""" + data = np.random.rand(10, 4).astype(np.float32) + path = str(tmp_path / "base.fbin") + _write_test_bin(path, data) + + dataset = Dataset(name="test", base_file=path) + first_access = dataset.base_vectors + second_access = dataset.base_vectors + assert first_access is second_access + + +class TestConfigLoaderMethods: + """Tests for base ConfigLoader inherited methods.""" + + def test_load_yaml_file(self, tmp_path): + """Test loading a YAML file.""" + yaml_path = str(tmp_path / "test.yaml") + with open(yaml_path, "w") as f: + yaml.dump({"name": "test", "value": 42}, f) + + loader = CppGBenchConfigLoader(config_path=str(tmp_path)) + result = loader.load_yaml_file(yaml_path) + assert result["name"] == "test" + assert result["value"] == 42 + + def test_get_dataset_configuration(self): + """Test finding a dataset by name.""" + loader = CppGBenchConfigLoader() + datasets = [ + {"name": "sift-128", "dims": 128}, + {"name": "glove-100", "dims": 100}, + ] + + result = loader.get_dataset_configuration("glove-100", datasets) + assert result["name"] == "glove-100" + assert result["dims"] == 100 + + def test_get_dataset_configuration_not_found(self): + """Test that missing datasets raise ValueError.""" + loader = CppGBenchConfigLoader() + datasets = [{"name": "sift-128"}] + + with pytest.raises( + ValueError, match="Could not find a dataset configuration" + ): + loader.get_dataset_configuration("nonexistent", datasets) + + +class TestExpandParamGrid: + """Tests for expand_param_grid.""" + + def test_single_param(self): + """Test expansion of a single parameter.""" + result = 
expand_param_grid({"m": [16, 32]}) + assert result == [{"m": 16}, {"m": 32}] + + def test_two_params(self): + """Test Cartesian product of two parameters.""" + result = expand_param_grid({"m": [16, 32], "ef": [100, 200]}) + assert len(result) == 4 + assert {"m": 16, "ef": 100} in result + assert {"m": 16, "ef": 200} in result + assert {"m": 32, "ef": 100} in result + assert {"m": 32, "ef": 200} in result + + def test_empty_spec(self): + """Test that empty spec returns a single empty dict.""" + result = expand_param_grid({}) + assert result == [{}] + + def test_single_values(self): + """Test that single-element lists produce one combination.""" + result = expand_param_grid({"m": [16], "ef": [100]}) + assert result == [{"m": 16, "ef": 100}] + + def test_three_params(self): + """Test Cartesian product of three parameters.""" + result = expand_param_grid({"a": [1, 2], "b": [3], "c": [4, 5]}) + assert len(result) == 4 + assert {"a": 1, "b": 3, "c": 4} in result + assert {"a": 2, "b": 3, "c": 5} in result + + +class TestComputeRecall: + """Tests for compute_recall.""" + + def test_perfect_recall(self): + """Test recall is 1.0 when neighbors match ground truth exactly.""" + neighbors = np.array([[0, 1, 2], [3, 4, 5]]) + groundtruth = np.array([[0, 1, 2], [3, 4, 5]]) + assert compute_recall(neighbors, groundtruth, k=3) == 1.0 + + def test_zero_recall(self): + """Test recall is 0.0 when no neighbors match ground truth.""" + neighbors = np.array([[10, 11, 12], [13, 14, 15]]) + groundtruth = np.array([[0, 1, 2], [3, 4, 5]]) + assert compute_recall(neighbors, groundtruth, k=3) == 0.0 + + def test_partial_recall(self): + """Test recall with partial overlap.""" + neighbors = np.array([[0, 1, 99]]) + groundtruth = np.array([[0, 1, 2]]) + recall = compute_recall(neighbors, groundtruth, k=3) + assert abs(recall - 2.0 / 3.0) < 1e-9 + + def test_k_smaller_than_groundtruth(self): + """Test recall when k is smaller than ground truth columns.""" + neighbors = np.array([[0, 1]]) + groundtruth = np.array([[0, 1, 2, 3, 4]]) + recall = compute_recall(neighbors, groundtruth, k=2) + assert recall == 1.0 + + def test_k_larger_than_groundtruth(self): + """Test recall when k is larger than ground truth columns.""" + neighbors = np.array([[0, 1, 2, 3, 4]]) + groundtruth = np.array([[0, 1]]) + recall = compute_recall(neighbors, groundtruth, k=5) + assert recall == 1.0 + + def test_empty_groundtruth(self): + """Test recall is 0.0 when ground truth has zero columns.""" + neighbors = np.array([[0, 1, 2]]) + groundtruth = np.empty((1, 0), dtype=np.int32) + assert compute_recall(neighbors, groundtruth, k=3) == 0.0 + + def test_empty_queries(self): + """Test recall is 0.0 when there are no queries.""" + neighbors = np.empty((0, 3), dtype=np.int64) + groundtruth = np.empty((0, 3), dtype=np.int32) + assert compute_recall(neighbors, groundtruth, k=3) == 0.0 + + def test_multiple_queries(self): + """Test recall averaged across multiple queries.""" + neighbors = np.array([[0, 1, 2], [3, 4, 99]]) + groundtruth = np.array([[0, 1, 2], [3, 4, 5]]) + recall = compute_recall(neighbors, groundtruth, k=3) + assert abs(recall - 5.0 / 6.0) < 1e-9