17 changes: 11 additions & 6 deletions convert_to_nvidia_format.py
@@ -7,10 +7,14 @@
 from typing import List, Dict, Optional, Tuple
 
 
-def create_index_name(config: Dict) -> str:
-    """Create index name from configuration parameters"""
+def create_index_name(config: Dict, metrics: Dict) -> str:
+    """Create index name from configuration and metrics.
+
+    efSearch is read from metrics (where it's a scalar int per search run)
+    rather than config (where it's now a list of values to sweep).
+    """
     algorithm = config.get('algoToRun', 'UNKNOWN')
-    ef_search = config.get('efSearch', 0)
+    ef_search = metrics.get('efSearch', 0)
 
     if algorithm in ['LUCENE_HNSW', 'hnsw']:
         beam_width = config.get('hnswBeamWidth', 0)
@@ -38,7 +42,7 @@ def convert_results_to_nvidia_format(results_json_path: str, output_dir: str, da
     elif algorithm in ['hnsw', 'LUCENE_HNSW']:
         algorithm = 'LUCENE_HNSW'
 
-    index_name = create_index_name(config)
+    index_name = create_index_name(config, metrics)
 
     recall_key = next((key for key in metrics.keys() if 'recall-accuracy' in key.lower()), None)
     if not recall_key:
@@ -97,12 +101,13 @@ def convert_results_to_nvidia_format(results_json_path: str, output_dir: str, da
 
     if build_time_key:
         build_time_ms = float(metrics[build_time_key])
+        build_time_s = build_time_ms / 1000.0
 
         build_benchmark = {
             "name": f"{algorithm}/{index_name}",
-            "real_time": build_time_ms,
+            "real_time": build_time_s,
             "iterations": 1,
-            "time_unit": "ms",
+            "time_unit": "s",
             "run_name": "run_1",
             "run_type": "iteration",
             "repetitions": 1,
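
A runnable sketch of the two changes above, assuming (as run_pareto_analysis.sh below reads it) that results.json holds separate 'configuration' and 'metrics' objects; all concrete values and the build-time key name are hypothetical:

```python
# Hedged sketch: the 'configuration'/'metrics' split matches what
# run_pareto_analysis.sh reads from results.json; every concrete value and
# the build-time key name here are hypothetical.
results_data = {
    "configuration": {"algoToRun": "LUCENE_HNSW", "hnswBeamWidth": 100,
                      "efSearch": [10, 50, 100]},  # list: the sweep definition
    "metrics": {"efSearch": 50,                    # scalar: this search run
                "index-build-time-ms": 1234.5},    # hypothetical key name
}

config = results_data["configuration"]
metrics = results_data["metrics"]

# The index name is keyed on the per-run scalar, so each efSearch sub-run
# gets a distinct name even though all sub-runs share one config.
ef_search = metrics.get("efSearch", 0)  # -> 50, not the list in config

# Build time is now reported in seconds, matching the emitted time_unit.
build_time_s = float(metrics["index-build-time-ms"]) / 1000.0
build_benchmark = {"real_time": build_time_s, "time_unit": "s"}
```
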
191 changes: 56 additions & 135 deletions generate-combinations.py
@@ -1,4 +1,3 @@
-
 import itertools
 import argparse
 import sys
@@ -42,7 +41,11 @@
     invariants["vectorDimension"] = dataset_info["vector_dimension"]
     print("sweep: " + sweep)
     for param, value in sweeps[sweep].get("common-params", {}).items():
-        if not isinstance(value, list):
+        # efSearch is always passed through as a list — Java handles the iteration
+        if param == 'efSearch':
+            # Ensure it's always a list
+            invariants[param] = value if isinstance(value, list) else [value]
+        elif not isinstance(value, list):
             invariants[param] = value
         else:
             variants[param] = value
@@ -54,146 +57,64 @@
 
         for param, value in algorithms[algo].items():
             if param not in ["params"]:
-                if not isinstance(value, list):
+                # efSearch is always passed through as a list — Java handles the iteration
+                if param == 'efSearch':
+                    algo_invariants[param] = value if isinstance(value, list) else [value]
+                elif not isinstance(value, list):
                     algo_invariants[param] = value
                 else:
                     algo_variants[param] = value
 
         # Generate all combination of variants. For each combination, generate a hashed ID, and a file with the
         # name pattern as <sweep>-<algo>-<hash>.json. The file should contain the invariants as is, and the variants as the current combination.
         if algo_variants:
-            # Separate efSearch from other variants if it exists
-            efSearch_values = None
-            other_variant_keys = []
-            other_variant_values = []
-
-            for key, value in algo_variants.items():
-                if key == 'efSearch':
-                    efSearch_values = value
-                else:
-                    other_variant_keys.append(key)
-                    other_variant_values.append(value)
-
-            # Generate combinations with efSearch at the beginning (innermost loop)
-            if efSearch_values and other_variant_keys:
-                # Generate combinations of other parameters first
-                for other_combination in itertools.product(*other_variant_values):
-                    other_variants = dict(zip(other_variant_keys, other_combination))
-                    # Then iterate through efSearch values
-                    for ef_index, ef_value in enumerate(efSearch_values):
-                        current_variants = other_variants.copy()
-                        current_variants['efSearch'] = ef_value
-
-                        # Skip if cagraIntermediateDegree < cagraGraphDegree
-                        if 'cagraIntermediateDegree' in current_variants and 'cagraGraphDegree' in current_variants:
-                            if current_variants['cagraIntermediateDegree'] < current_variants['cagraGraphDegree']:
-                                print(f"\t\tSkipping combination: cagraIntermediateDegree ({current_variants['cagraIntermediateDegree']}) < cagraGraphDegree ({current_variants['cagraGraphDegree']})")
-                                continue
-
-                        # Skip if hnswMaxConn > hnswBeamWidth
-                        if 'hnswMaxConn' in current_variants and 'hnswBeamWidth' in current_variants:
-                            if current_variants['hnswMaxConn'] > current_variants['hnswBeamWidth']:
-                                print(f"\t\tSkipping combination: hnswMaxConn ({current_variants['hnswMaxConn']}) > hnswBeamWidth ({current_variants['hnswBeamWidth']})")
-                                continue
-
-                        # Generate hash only from other_variants (excluding efSearch)
-                        base_hash = hashlib.md5(json.dumps(other_variants, sort_keys=True).encode()).hexdigest()[:8]
-                        hash_id = f"{base_hash}-ef{ef_value}"
-
-                        config = algo_invariants.copy()
-                        config.update(current_variants)
-
-                        # For multiple efSearch combinations: subsequent ones skip indexing
-                        if len(efSearch_values) > 1 and ef_index > 0:
-                            config['skipIndexing'] = True
-
-                        # Set cleanIndexDirectory based on position
-                        if ef_index == 0:
-                            config['cleanIndexDirectory'] = False
-                        elif ef_index == len(efSearch_values) - 1:
-                            config['cleanIndexDirectory'] = True
-                        else:
-                            config['cleanIndexDirectory'] = False
-
-                        # Use base_hash for index directory paths
-                        if 'hnswIndexDirPath' in config:
-                            config['hnswIndexDirPath'] = f"hnswIndex-{base_hash}"
-                        if 'cuvsIndexDirPath' in config:
-                            config['cuvsIndexDirPath'] = f"cuvsIndex-{base_hash}"
-
-                        filename = f"{algo}-{hash_id}.json"
-                        sweep_dir = f"{args.configs_dir}/{sweep}"
-                        filepath = f"{sweep_dir}/{filename}"
-                        os.makedirs(sweep_dir, exist_ok=True)
-                        with open(filepath, 'w') as f:
-                            json.dump(config, f, indent=2)
-                        print(f"\tGenerated config file: {filepath}")
-            elif efSearch_values:
-                # Only efSearch values, no other variants
-                for ef_index, ef_value in enumerate(efSearch_values):
-                    current_variants = {'efSearch': ef_value}
-                    # Generate hash from empty dict since no other variants exist
-                    base_hash = hashlib.md5(json.dumps({}, sort_keys=True).encode()).hexdigest()[:8]
-                    hash_id = f"{base_hash}-ef{ef_value}"
-
-                    config = algo_invariants.copy()
-                    config.update(current_variants)
-
-                    # For multiple efSearch combinations: subsequent ones skip indexing
-                    if len(efSearch_values) > 1 and ef_index > 0:
-                        config['skipIndexing'] = True
-
-                    # Set cleanIndexDirectory based on position
-                    if ef_index == 0:
-                        config['cleanIndexDirectory'] = False
-                    elif ef_index == len(efSearch_values) - 1:
-                        config['cleanIndexDirectory'] = True
-                    else:
-                        config['cleanIndexDirectory'] = False
-
-                    # Use base_hash for index directory paths
-                    if 'hnswIndexDirPath' in config:
-                        config['hnswIndexDirPath'] = f"hnswIndex-{base_hash}"
-                    if 'cuvsIndexDirPath' in config:
-                        config['cuvsIndexDirPath'] = f"cuvsIndex-{base_hash}"
-
-                    filename = f"{algo}-{hash_id}.json"
-                    sweep_dir = f"{args.configs_dir}/{sweep}"
-                    filepath = f"{sweep_dir}/{filename}"
-                    os.makedirs(sweep_dir, exist_ok=True)
-                    with open(filepath, 'w') as f:
-                        json.dump(config, f, indent=2)
-                    print(f"\tGenerated config file: {filepath}")
-            else:
-                # No efSearch, use original logic
-                variant_keys = list(algo_variants.keys())
-                variant_values = list(algo_variants.values())
-                for combination in itertools.product(*variant_values):
-                    current_variants = dict(zip(variant_keys, combination))
-
-                    # Skip if cagraIntermediateDegree < cagraGraphDegree
-                    if 'cagraIntermediateDegree' in current_variants and 'cagraGraphDegree' in current_variants:
-                        if current_variants['cagraIntermediateDegree'] < current_variants['cagraGraphDegree']:
-                            print(f"\t\tSkipping combination: cagraIntermediateDegree ({current_variants['cagraIntermediateDegree']}) < cagraGraphDegree ({current_variants['cagraGraphDegree']})")
-                            continue
-
-                    # Skip if hnswMaxConn > hnswBeamWidth
-                    if 'hnswMaxConn' in current_variants and 'hnswBeamWidth' in current_variants:
-                        if current_variants['hnswMaxConn'] > current_variants['hnswBeamWidth']:
-                            print(f"\t\tSkipping combination: hnswMaxConn ({current_variants['hnswMaxConn']}) > hnswBeamWidth ({current_variants['hnswBeamWidth']})")
-                            continue
-
-                    hash_id = hashlib.md5(json.dumps(current_variants, sort_keys=True).encode()).hexdigest()[:8]
-
-                    config = algo_invariants.copy()
-                    config.update(current_variants)
-                    filename = f"{algo}-{hash_id}.json"
-                    sweep_dir = f"{args.configs_dir}/{sweep}"
-                    filepath = f"{sweep_dir}/{filename}"
-                    os.makedirs(sweep_dir, exist_ok=True)
-                    with open(filepath, 'w') as f:
-                        json.dump(config, f, indent=2)
-                    print(f"\tGenerated config file: {filepath}")
+            variant_keys = list(algo_variants.keys())
+            variant_values = list(algo_variants.values())
+            for combination in itertools.product(*variant_values):
+                current_variants = dict(zip(variant_keys, combination))
+
+                config = algo_invariants.copy()
+                config.update(current_variants)
+
+                # Skip if cagraIntermediateGraphDegree < cagraGraphDegree
+                # (CAGRA silently clamps graphDegree down to intermediateGraphDegree,
+                # which produces duplicate test runs)
+                if config.get('cagraIntermediateGraphDegree', float('inf')) < config.get('cagraGraphDegree', 0):
+                    print(f"\t\tSkipping combination: cagraIntermediateGraphDegree ({config['cagraIntermediateGraphDegree']}) < cagraGraphDegree ({config['cagraGraphDegree']})")
+                    continue
+
+                # Skip if hnswMaxConn > hnswBeamWidth
+                if config.get('hnswMaxConn', 0) > config.get('hnswBeamWidth', float('inf')):
+                    print(f"\t\tSkipping combination: hnswMaxConn ({config['hnswMaxConn']}) > hnswBeamWidth ({config['hnswBeamWidth']})")
+                    continue
+
+                # Set indexDirPath based on hash
+                hash_input = {k: v for k, v in config.items() if k != 'indexDirPath'}
+                hash_id = hashlib.md5(json.dumps(hash_input, sort_keys=True, default=str).encode()).hexdigest()[:8]
+                config['indexDirPath'] = f"index-{hash_id}"
+
+                filename = f"{algo}-{hash_id}.json"
+                sweep_dir = f"{args.configs_dir}/{sweep}"
+                filepath = f"{sweep_dir}/{filename}"
+                os.makedirs(sweep_dir, exist_ok=True)
+                with open(filepath, 'w') as f:
+                    json.dump(config, f, indent=2)
+                print(f"\tGenerated config file: {filepath}")
+        else:
+            # No variants at all, just generate a single config
+            hash_id = hashlib.md5(json.dumps(algo_invariants, sort_keys=True).encode()).hexdigest()[:8]
+            config = algo_invariants.copy()
+
+            # Set indexDirPath based on hash
+            config['indexDirPath'] = f"index-{hash_id}"
+
+            filename = f"{algo}-{hash_id}.json"
+            sweep_dir = f"{args.configs_dir}/{sweep}"
+            filepath = f"{sweep_dir}/{filename}"
+            os.makedirs(sweep_dir, exist_ok=True)
+            with open(filepath, 'w') as f:
+                json.dump(config, f, indent=2)
+            print(f"\tGenerated config file: {filepath}")
 
 
     print("----------------------")
2 changes: 1 addition & 1 deletion pom.xml
@@ -35,7 +35,7 @@
     <dependency>
         <groupId>com.nvidia.cuvs.lucene</groupId>
         <artifactId>cuvs-lucene</artifactId>
-        <version>26.04.0</version>
+        <version>26.06.0</version>
     </dependency>
     <dependency>
         <groupId>com.nvidia.cuvs</groupId>
34 changes: 22 additions & 12 deletions run_pareto_analysis.sh
@@ -81,9 +81,14 @@ import csv
 import json
 import glob
 
-def create_index_name_from_config(config):
+def create_index_name_from_results(config, metrics):
+    \"\"\"Create index name from config and metrics.
+
+    efSearch is read from metrics (scalar int per search run)
+    rather than config (now a list of values to sweep).
+    \"\"\"
     algorithm = config.get('algoToRun', 'UNKNOWN')
-    ef_search = config.get('efSearch', 0)
+    ef_search = metrics.get('efSearch', 0)
 
     if algorithm in ['LUCENE_HNSW', 'hnsw']:
         beam_width = config.get('hnswBeamWidth', 0)
@@ -144,13 +149,17 @@ for algorithm, pareto_indices in pareto_runs_by_algo.items():
 
     index_to_dir = {}
     for benchmark_dir in benchmark_dirs:
-        results_json_path = os.path.join(benchmark_dir, 'results.json')
-        if os.path.exists(results_json_path):
+        # Walk into subdirectories to find efSearch_* results
+        for root, dirs, files in os.walk(benchmark_dir):
+            if 'results.json' not in files:
+                continue
+            results_json_path = os.path.join(root, 'results.json')
             try:
                 with open(results_json_path, 'r') as f:
                     results_data = json.load(f)
 
                 config = results_data['configuration']
+                metrics = results_data['metrics']
                 algo_to_run = config.get('algoToRun')
 
                 algorithm_match = False
@@ -160,20 +169,20 @@ for algorithm, pareto_indices in pareto_runs_by_algo.items():
                     algorithm_match = True
 
                 if algorithm_match:
-                    index_name = create_index_name_from_config(config)
+                    index_name = create_index_name_from_results(config, metrics)
                     if index_name not in index_to_dir:
-                        index_to_dir[index_name] = benchmark_dir
+                        index_to_dir[index_name] = root
             except Exception as e:
-                print(f' Error processing {benchmark_dir}: {e}')
+                print(f' Error processing {root}: {e}')
 
     print(f'Mapped {len(index_to_dir)} configurations')
 
     matched = 0
     unmatched = 0
     for index_name, pareto_run in pareto_indices.items():
         if index_name in index_to_dir:
-            benchmark_dir = index_to_dir[index_name]
-            is_pareto_file = os.path.join(benchmark_dir, 'is_pareto')
+            result_dir = index_to_dir[index_name]
+            is_pareto_file = os.path.join(result_dir, 'is_pareto')
 
             with open(is_pareto_file, 'w') as f:
                 f.write(f'Pareto optimal run\\n')
@@ -211,10 +220,11 @@ ls -la "${OUTPUT_DIR}/plots"/*.png
 
 echo ""
 echo "Cleaning up intermediate files..."
-rm -rf "${INTERMEDIATE_DIR}"
-echo "Intermediate files cleaned up!"
+# rm -rf "${INTERMEDIATE_DIR}"
+# echo "Intermediate files cleaned up!"
+echo "Intermediate files left intact!"
 echo ""
 echo "Final output:"
 echo "- Pareto optimal runs marked with is_pareto files"
 echo "- Plots: ${OUTPUT_DIR}/plots/"
-echo "- No intermediate files (completely cleaned up)"
+echo "- Yes intermediate files are still present ;)"
6 changes: 6 additions & 0 deletions run_sweep.sh
@@ -1,5 +1,11 @@
 #!/bin/bash
 
+export MAVEN_OPTS="-Xmx80g -Xms8g \
+    -XX:+UseG1GC \
+    -XX:MinHeapFreeRatio=5 \
+    -XX:MaxHeapFreeRatio=15 \
+    -XX:G1PeriodicGCInterval=5000"
+
 # Parse command-line arguments
 while getopts ":-:" opt; do
     case $OPTARG in
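
The exported MAVEN_OPTS give the forked JVM an 80 GB heap cap and G1 settings (low heap-free ratios plus a 5-second periodic GC) that encourage it to return memory between benchmark runs. The option loop that follows uses the getopts long-option idiom; below is a minimal sketch of how that pattern parses --name=value arguments, with hypothetical flag names rather than the script's real options:

```bash
# Sketch of the long-option idiom: with optstring ":-:", getopts treats "-"
# as an option that takes an argument, so "--name=value" arrives as
# OPTARG="name=value". The flag names below are hypothetical.
while getopts ":-:" opt; do
    case $OPTARG in
        sweep-file=*)
            SWEEP_FILE="${OPTARG#*=}"   # strip everything up to the first '='
            ;;
        *)
            echo "Unknown option: --${OPTARG}" >&2
            exit 1
            ;;
    esac
done
echo "sweep file: ${SWEEP_FILE}"
```
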