diff --git a/benchmarks/bench_utils/loaders.py b/benchmarks/bench_utils/loaders.py index fc702db8..d68f92d4 100644 --- a/benchmarks/bench_utils/loaders.py +++ b/benchmarks/bench_utils/loaders.py @@ -44,12 +44,18 @@ def load_pickle(filepath: str, max_count: int = 0, seed: int | None = None) -> l seed: Optional seed for the sampling RNG. Returns: - List of parsed RDKit molecules. + List of parsed RDKit molecules. The list is always shuffled + (deterministic with ``seed``) so benches that consume a head slice + get a representative cross-section rather than file-order bias. """ with open(filepath, "rb") as fh: binary_mols = pickle.load(fh) + rng = random.Random(seed) if max_count > 0 and len(binary_mols) > max_count: - binary_mols = random.Random(seed).sample(binary_mols, max_count) + binary_mols = rng.sample(binary_mols, max_count) + else: + binary_mols = list(binary_mols) + rng.shuffle(binary_mols) mols = process_map( _mol_from_binary, binary_mols, @@ -95,6 +101,10 @@ def load_smiles( does not have to fit in memory and only the sampled SMILES are parsed. A 10% buffer is read past ``max_count`` to absorb parse failures, after which the result is trimmed back to ``max_count``. + + The returned list is always shuffled (deterministic with ``seed``) so + benches that consume a head slice get a representative cross-section + rather than file-order bias (some upstream files are sorted by size). """ read_limit = int(max_count * 1.1) if max_count > 0 else 0 rng = random.Random(seed) @@ -127,6 +137,8 @@ def load_smiles( if max_count > 0 and len(mols) > max_count: mols = rng.sample(mols, max_count) + else: + rng.shuffle(mols) print(f" Loaded {len(mols)} molecules from {filepath}") return mols @@ -168,7 +180,12 @@ def load_sdf( removeHs: bool = False, sanitize: bool = True, ) -> list[Chem.Mol]: - """Load molecules from an SDF file with optional reservoir sampling.""" + """Load molecules from an SDF file with optional reservoir sampling. + + The returned list is always shuffled (deterministic with ``seed``) so + benches that consume a head slice get a representative cross-section + rather than file-order bias (some upstream files are sorted by size). + """ supplier = Chem.SDMolSupplier(filepath, removeHs=removeHs, sanitize=sanitize) read_limit = int(max_count * 1.1) if max_count > 0 else 0 rng = random.Random(seed) @@ -199,6 +216,8 @@ def load_sdf( if max_count > 0 and len(mols) > max_count: mols = rng.sample(mols, max_count) + else: + rng.shuffle(mols) if parse_failures > 0: print(f" ({parse_failures} parse failures)") diff --git a/benchmarks/butina_clustering_bench.py b/benchmarks/butina_clustering_bench.py index 71495ba9..f1c5c4ed 100644 --- a/benchmarks/butina_clustering_bench.py +++ b/benchmarks/butina_clustering_bench.py @@ -20,9 +20,10 @@ import numpy as np import pandas as pd import torch +from bench_utils import load_smiles from benchmark_timing import time_it from rdkit import DataStructs -from rdkit.Chem import AllChem, MolFromSmiles +from rdkit.Chem import AllChem from rdkit.DataStructs import BulkTanimotoSimilarity from rdkit.ML.Cluster.Butina import ClusterData @@ -163,6 +164,7 @@ def bench_nvmol_with_tanimoto(fps, threshold, neighborlist_max_size): ) parser.add_argument("--cutoff", type=float, default=None, help="Run only this cutoff value") parser.add_argument("--runs", type=int, default=3, help="Number of timed repetitions (default: 3)") + parser.add_argument("--seed", type=int, default=42, help="Random seed for sampling SMILES (default: 42)") parser.add_argument( "-o", "--output", type=str, default="results.csv", help="Output CSV file path (default: results.csv)" ) @@ -207,10 +209,7 @@ def bench_nvmol_with_tanimoto(fps, threshold, neighborlist_max_size): max_size = max(e["size"] for e in run_plan) - with open(args.input_smiles_file, "r") as f: - smis = [line.strip() for line in f.readlines()] - mols = [MolFromSmiles(smi, sanitize=True) for smi in smis[: max_size + 100]] - mols = [mol for mol in mols if mol is not None] + mols = load_smiles(args.input_smiles_file, max_count=max_size + 100, sanitize=True, seed=args.seed) if include_tanimoto and len(mols) < max_size: print( diff --git a/benchmarks/cross_similarity_bench.py b/benchmarks/cross_similarity_bench.py index b26a3a7a..9a8564ea 100644 --- a/benchmarks/cross_similarity_bench.py +++ b/benchmarks/cross_similarity_bench.py @@ -13,14 +13,14 @@ # See the License for the specific language governing permissions and # limitations under the License. -import pandas as pd import pyperf import torch -from rdkit.Chem import MolFromSmiles, rdFingerprintGenerator +from bench_utils import load_smiles +from rdkit.Chem import rdFingerprintGenerator from rdkit.DataStructs import BulkCosineSimilarity, BulkTanimotoSimilarity -from nvmolkit.similarity import crossCosineSimilarity, crossTanimotoSimilarity from nvmolkit.fingerprints import MorganFingerprintGenerator +from nvmolkit.similarity import crossCosineSimilarity, crossTanimotoSimilarity SIZES = [2000, 4000, 6000, 8000, 10000, 12000, 14000, 16000, 20000, 24000, 28000, 32000] @@ -45,9 +45,10 @@ def nvmolkit_sim_gpu_only(fps, sim_type): runner = pyperf.Runner(min_time=0.01, values=3, processes=1, loops=3) runner.metadata["description"] = "Cross Similarity benchmark" runner.argparser.add_argument( - "--input", type=str, default="data/benchmark_smiles.csv", help="Path to input SMILES CSV file" + "--input", type=str, default="data/benchmark_smiles.csv", help="Path to input SMILES file (.smi/.csv/.cxsmiles)" ) runner.argparser.add_argument("--cosine", action="store_true", help="Include cosine similarity benchmarks") +runner.argparser.add_argument("--seed", type=int, default=42, help="Random seed for sampling SMILES (default: 42)") args = runner.parse_args() sim_types = ("tanimoto", "cosine") if args.cosine else ("tanimoto",) @@ -55,10 +56,7 @@ def nvmolkit_sim_gpu_only(fps, sim_type): max_size = max(SIZES) default_values = runner.args.values -df = pd.read_csv(args.input) -smis = df.iloc[:, 0].to_list() -mols = [MolFromSmiles(smi) for smi in smis] -mols = [mol for mol in mols if mol is not None] +mols = load_smiles(args.input, max_count=max_size, seed=args.seed) if not mols: raise ValueError(f"No molecules parsed from {args.input}") while len(mols) < max_size: diff --git a/benchmarks/substruct_bench.py b/benchmarks/substruct_bench.py index c5df1638..61e6915e 100644 --- a/benchmarks/substruct_bench.py +++ b/benchmarks/substruct_bench.py @@ -54,15 +54,14 @@ import argparse import gc -import pickle import random import sys -from functools import partial from multiprocessing import Pool -from typing import Callable, Iterator +from typing import Callable import nvtx import pandas as pd +from bench_utils import load_pickle, load_smarts, load_smiles from benchmark_timing import time_it as _time_it from nvmolkit import autotune as nv_autotune from nvmolkit.substructure import ( @@ -71,9 +70,8 @@ getSubstructMatches, hasSubstructMatch, ) -from rdkit import Chem, RDLogger +from rdkit import Chem from rdkit.Chem import rdSubstructLibrary -from tqdm.contrib.concurrent import process_map OPTUNA_AVAILABLE = nv_autotune.is_available() @@ -84,133 +82,6 @@ def time_it(func: Callable, runs: int = 1, gpu_sync: bool = False) -> tuple[floa return result.mean_ms, result.std_ms -def load_pickle(filepath: str, max_count: int = 0, seed: int | None = None) -> list[Chem.Mol]: - """Load molecules from a pickled file containing binary mol data. - - When ``max_count > 0``, a uniform random sample of binary mols is drawn. - """ - with open(filepath, "rb") as f: - binary_mols = pickle.load(f) - if max_count > 0 and len(binary_mols) > max_count: - binary_mols = random.Random(seed).sample(binary_mols, max_count) - mols = process_map( - _mol_from_binary, - binary_mols, - desc="Unpickling molecules", - chunksize=1000, - ) - print(f" Loaded {len(mols)} molecules from {filepath}") - return mols - - -def _mol_from_binary(binary_mol: bytes) -> Chem.Mol: - """Load a molecule from RDKit binary format.""" - return Chem.Mol(binary_mol) - - -def _parse_smiles(smi: str, sanitize: bool) -> Chem.Mol | None: - """Parse a single SMILES string.""" - return Chem.MolFromSmiles(smi, sanitize=sanitize) - - -def _iter_smiles_tokens(filepath: str, sanitize: bool) -> Iterator[str]: - """Yield SMILES tokens from a file, skipping blanks/comments and a parse-failing first line. - - The first non-comment line is parsed quietly; if it fails to parse it is treated as a header - and dropped, matching the original loader's behavior. - """ - with open(filepath, "r") as f: - first_data_seen = False - for line in f: - stripped = line.strip() - if not stripped or stripped.startswith("#"): - continue - smi = stripped.split()[0] - if not first_data_seen: - first_data_seen = True - RDLogger.DisableLog("rdApp.*") - mol = Chem.MolFromSmiles(smi, sanitize=sanitize) - RDLogger.EnableLog("rdApp.*") - if mol is None: - continue - yield smi - - -def load_smiles(filepath: str, max_count: int = 0, sanitize: bool = True, seed: int | None = None) -> list[Chem.Mol]: - """Load and parse molecules from a SMILES file. - - When ``max_count > 0``, reservoir sampling draws a uniform random sample of lines in a single - streaming pass (with a 10% buffer to absorb parse failures) so the file isn't fully loaded into - memory and only the sampled SMILES are parsed. - """ - # Use a 10% buffer to account for potential parse failures - # "On parse failures continue down the file. Load 10% more molecules than needed" - read_limit = int(max_count * 1.1) if max_count > 0 else 0 - - if read_limit > 0: - rng = random.Random(seed) - reservoir: list[str] = [] - for index, smi in enumerate(_iter_smiles_tokens(filepath, sanitize)): - if index < read_limit: - reservoir.append(smi) - else: - replace_index = rng.randint(0, index) - if replace_index < read_limit: - reservoir[replace_index] = smi - smiles_list = reservoir - else: - smiles_list = list(_iter_smiles_tokens(filepath, sanitize)) - - mols: list[Chem.Mol] = [] - if smiles_list: - parse_func = partial(_parse_smiles, sanitize=sanitize) - parsed = process_map(parse_func, smiles_list, desc="Parsing molecules", chunksize=1000) - - parse_failures = 0 - for mol in parsed: - if mol is None: - parse_failures += 1 - else: - mols.append(mol) - - if parse_failures > 0: - print(f" ({parse_failures} parse failures)") - - # Trim to exactly max_count if we have more than requested - if max_count > 0 and len(mols) > max_count: - mols = mols[:max_count] - - print(f" Loaded {len(mols)} molecules from {filepath}") - return mols - - -def load_smarts(filepath: str, max_count: int = 0) -> tuple[list[Chem.Mol], list[str]]: - """Load and parse query patterns from a SMARTS file.""" - queries = [] - smarts_list = [] - parse_failures = 0 - - with open(filepath, "r") as f: - for line in f: - if max_count > 0 and len(queries) >= max_count: - break - line = line.strip() - if not line or line.startswith("#"): - continue - smarts = line.split()[0] - query = Chem.MolFromSmarts(smarts) - if query is None: - parse_failures += 1 - continue - queries.append(query) - smarts_list.append(smarts) - - print(f" Loaded {len(queries)} SMARTS patterns from {filepath}") - if parse_failures > 0: - print(f" ({parse_failures} parse failures)") - return queries, smarts_list - - _worker_queries = None _worker_params = None