17 changes: 11 additions & 6 deletions convert_to_nvidia_format.py
@@ -7,10 +7,14 @@
 from typing import List, Dict, Optional, Tuple
 
 
-def create_index_name(config: Dict) -> str:
-    """Create index name from configuration parameters"""
+def create_index_name(config: Dict, metrics: Dict) -> str:
+    """Create index name from configuration and metrics.
+
+    efSearch is read from metrics (where it's a scalar int per search run)
+    rather than config (where it's now a list of values to sweep).
+    """
     algorithm = config.get('algoToRun', 'UNKNOWN')
-    ef_search = config.get('efSearch', 0)
+    ef_search = metrics.get('efSearch', 0)
 
     if algorithm in ['LUCENE_HNSW', 'hnsw']:
         beam_width = config.get('hnswBeamWidth', 0)
@@ -38,7 +42,7 @@ def convert_results_to_nvidia_format(results_json_path: str, output_dir: str, da
     elif algorithm in ['hnsw', 'LUCENE_HNSW']:
         algorithm = 'LUCENE_HNSW'
 
-    index_name = create_index_name(config)
+    index_name = create_index_name(config, metrics)
 
     recall_key = next((key for key in metrics.keys() if 'recall-accuracy' in key.lower()), None)
     if not recall_key:
@@ -97,12 +101,13 @@ def convert_results_to_nvidia_format(results_json_path: str, output_dir: str, da
 
     if build_time_key:
         build_time_ms = float(metrics[build_time_key])
+        build_time_s = build_time_ms / 1000.0
 
         build_benchmark = {
             "name": f"{algorithm}/{index_name}",
-            "real_time": build_time_ms,
+            "real_time": build_time_s,
             "iterations": 1,
-            "time_unit": "ms",
+            "time_unit": "s",
             "run_name": "run_1",
             "run_type": "iteration",
             "repetitions": 1,
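
A runnable sketch of the two changes above, assuming (as run_pareto_analysis.sh below reads it) that results.json holds separate 'configuration' and 'metrics' objects; all concrete values and the build-time key name are hypothetical:

```python
# Hedged sketch: the 'configuration'/'metrics' split matches what
# run_pareto_analysis.sh reads from results.json; every concrete value and
# the build-time key name here are hypothetical.
results_data = {
    "configuration": {"algoToRun": "LUCENE_HNSW", "hnswBeamWidth": 100,
                      "efSearch": [10, 50, 100]},  # list: the sweep definition
    "metrics": {"efSearch": 50,                    # scalar: this search run
                "index-build-time-ms": 1234.5},    # hypothetical key name
}

config = results_data["configuration"]
metrics = results_data["metrics"]

# The index name is keyed on the per-run scalar, so each efSearch sub-run
# gets a distinct name even though all sub-runs share one config.
ef_search = metrics.get("efSearch", 0)  # -> 50, not the list in config

# Build time is now reported in seconds, matching the emitted time_unit.
build_time_s = float(metrics["index-build-time-ms"]) / 1000.0
build_benchmark = {"real_time": build_time_s, "time_unit": "s"}
```
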
191 changes: 56 additions & 135 deletions generate-combinations.py
@@ -1,4 +1,3 @@
-
 import itertools
 import argparse
 import sys
@@ -42,7 +41,11 @@
     invariants["vectorDimension"] = dataset_info["vector_dimension"]
     print("sweep: " + sweep)
     for param, value in sweeps[sweep].get("common-params", {}).items():
-        if not isinstance(value, list):
+        # efSearch is always passed through as a list — Java handles the iteration
+        if param == 'efSearch':
+            # Ensure it's always a list
+            invariants[param] = value if isinstance(value, list) else [value]
+        elif not isinstance(value, list):
             invariants[param] = value
         else:
             variants[param] = value
@@ -54,146 +57,64 @@
 
         for param, value in algorithms[algo].items():
             if param not in ["params"]:
-                if not isinstance(value, list):
+                # efSearch is always passed through as a list — Java handles the iteration
+                if param == 'efSearch':
+                    algo_invariants[param] = value if isinstance(value, list) else [value]
+                elif not isinstance(value, list):
                     algo_invariants[param] = value
                 else:
                     algo_variants[param] = value
 
         # Generate all combination of variants. For each combination, generate a hashed ID, and a file with the
         # name pattern as <sweep>-<algo>-<hash>.json. The file should contain the invariants as is, and the variants as the current combination.
         if algo_variants:
-            # Separate efSearch from other variants if it exists
-            efSearch_values = None
-            other_variant_keys = []
-            other_variant_values = []
-
-            for key, value in algo_variants.items():
-                if key == 'efSearch':
-                    efSearch_values = value
-                else:
-                    other_variant_keys.append(key)
-                    other_variant_values.append(value)
-
-            # Generate combinations with efSearch at the beginning (innermost loop)
-            if efSearch_values and other_variant_keys:
-                # Generate combinations of other parameters first
-                for other_combination in itertools.product(*other_variant_values):
-                    other_variants = dict(zip(other_variant_keys, other_combination))
-                    # Then iterate through efSearch values
-                    for ef_index, ef_value in enumerate(efSearch_values):
-                        current_variants = other_variants.copy()
-                        current_variants['efSearch'] = ef_value
-
-                        # Skip if cagraIntermediateDegree < cagraGraphDegree
-                        if 'cagraIntermediateDegree' in current_variants and 'cagraGraphDegree' in current_variants:
-                            if current_variants['cagraIntermediateDegree'] < current_variants['cagraGraphDegree']:
-                                print(f"\t\tSkipping combination: cagraIntermediateDegree ({current_variants['cagraIntermediateDegree']}) < cagraGraphDegree ({current_variants['cagraGraphDegree']})")
-                                continue
-
-                        # Skip if hnswMaxConn > hnswBeamWidth
-                        if 'hnswMaxConn' in current_variants and 'hnswBeamWidth' in current_variants:
-                            if current_variants['hnswMaxConn'] > current_variants['hnswBeamWidth']:
-                                print(f"\t\tSkipping combination: hnswMaxConn ({current_variants['hnswMaxConn']}) > hnswBeamWidth ({current_variants['hnswBeamWidth']})")
-                                continue
-
-                        # Generate hash only from other_variants (excluding efSearch)
-                        base_hash = hashlib.md5(json.dumps(other_variants, sort_keys=True).encode()).hexdigest()[:8]
-                        hash_id = f"{base_hash}-ef{ef_value}"
-
-                        config = algo_invariants.copy()
-                        config.update(current_variants)
-
-                        # For multiple efSearch combinations: subsequent ones skip indexing
-                        if len(efSearch_values) > 1 and ef_index > 0:
-                            config['skipIndexing'] = True
-
-                        # Set cleanIndexDirectory based on position
-                        if ef_index == 0:
-                            config['cleanIndexDirectory'] = False
-                        elif ef_index == len(efSearch_values) - 1:
-                            config['cleanIndexDirectory'] = True
-                        else:
-                            config['cleanIndexDirectory'] = False
-
-                        # Use base_hash for index directory paths
-                        if 'hnswIndexDirPath' in config:
-                            config['hnswIndexDirPath'] = f"hnswIndex-{base_hash}"
-                        if 'cuvsIndexDirPath' in config:
-                            config['cuvsIndexDirPath'] = f"cuvsIndex-{base_hash}"
-
-                        filename = f"{algo}-{hash_id}.json"
-                        sweep_dir = f"{args.configs_dir}/{sweep}"
-                        filepath = f"{sweep_dir}/{filename}"
-                        os.makedirs(sweep_dir, exist_ok=True)
-                        with open(filepath, 'w') as f:
-                            json.dump(config, f, indent=2)
-                        print(f"\tGenerated config file: {filepath}")
-            elif efSearch_values:
-                # Only efSearch values, no other variants
-                for ef_index, ef_value in enumerate(efSearch_values):
-                    current_variants = {'efSearch': ef_value}
-                    # Generate hash from empty dict since no other variants exist
-                    base_hash = hashlib.md5(json.dumps({}, sort_keys=True).encode()).hexdigest()[:8]
-                    hash_id = f"{base_hash}-ef{ef_value}"
-
-                    config = algo_invariants.copy()
-                    config.update(current_variants)
-
-                    # For multiple efSearch combinations: subsequent ones skip indexing
-                    if len(efSearch_values) > 1 and ef_index > 0:
-                        config['skipIndexing'] = True
-
-                    # Set cleanIndexDirectory based on position
-                    if ef_index == 0:
-                        config['cleanIndexDirectory'] = False
-                    elif ef_index == len(efSearch_values) - 1:
-                        config['cleanIndexDirectory'] = True
-                    else:
-                        config['cleanIndexDirectory'] = False
-
-                    # Use base_hash for index directory paths
-                    if 'hnswIndexDirPath' in config:
-                        config['hnswIndexDirPath'] = f"hnswIndex-{base_hash}"
-                    if 'cuvsIndexDirPath' in config:
-                        config['cuvsIndexDirPath'] = f"cuvsIndex-{base_hash}"
-
-                    filename = f"{algo}-{hash_id}.json"
-                    sweep_dir = f"{args.configs_dir}/{sweep}"
-                    filepath = f"{sweep_dir}/{filename}"
-                    os.makedirs(sweep_dir, exist_ok=True)
-                    with open(filepath, 'w') as f:
-                        json.dump(config, f, indent=2)
-                    print(f"\tGenerated config file: {filepath}")
-            else:
-                # No efSearch, use original logic
-                variant_keys = list(algo_variants.keys())
-                variant_values = list(algo_variants.values())
-                for combination in itertools.product(*variant_values):
-                    current_variants = dict(zip(variant_keys, combination))
-
-                    # Skip if cagraIntermediateDegree < cagraGraphDegree
-                    if 'cagraIntermediateDegree' in current_variants and 'cagraGraphDegree' in current_variants:
-                        if current_variants['cagraIntermediateDegree'] < current_variants['cagraGraphDegree']:
-                            print(f"\t\tSkipping combination: cagraIntermediateDegree ({current_variants['cagraIntermediateDegree']}) < cagraGraphDegree ({current_variants['cagraGraphDegree']})")
-                            continue
-
-                    # Skip if hnswMaxConn > hnswBeamWidth
-                    if 'hnswMaxConn' in current_variants and 'hnswBeamWidth' in current_variants:
-                        if current_variants['hnswMaxConn'] > current_variants['hnswBeamWidth']:
-                            print(f"\t\tSkipping combination: hnswMaxConn ({current_variants['hnswMaxConn']}) > hnswBeamWidth ({current_variants['hnswBeamWidth']})")
-                            continue
-
-                    hash_id = hashlib.md5(json.dumps(current_variants, sort_keys=True).encode()).hexdigest()[:8]
-
-                    config = algo_invariants.copy()
-                    config.update(current_variants)
-                    filename = f"{algo}-{hash_id}.json"
-                    sweep_dir = f"{args.configs_dir}/{sweep}"
-                    filepath = f"{sweep_dir}/{filename}"
-                    os.makedirs(sweep_dir, exist_ok=True)
-                    with open(filepath, 'w') as f:
-                        json.dump(config, f, indent=2)
-                    print(f"\tGenerated config file: {filepath}")
+            variant_keys = list(algo_variants.keys())
+            variant_values = list(algo_variants.values())
+            for combination in itertools.product(*variant_values):
+                current_variants = dict(zip(variant_keys, combination))
+
+                config = algo_invariants.copy()
+                config.update(current_variants)
+
+                # Skip if cagraIntermediateGraphDegree < cagraGraphDegree
+                # (CAGRA silently clamps graphDegree down to intermediateGraphDegree,
+                # which produces duplicate test runs)
+                if config.get('cagraIntermediateGraphDegree', float('inf')) < config.get('cagraGraphDegree', 0):
+                    print(f"\t\tSkipping combination: cagraIntermediateGraphDegree ({config['cagraIntermediateGraphDegree']}) < cagraGraphDegree ({config['cagraGraphDegree']})")
+                    continue
+
+                # Skip if hnswMaxConn > hnswBeamWidth
+                if config.get('hnswMaxConn', 0) > config.get('hnswBeamWidth', float('inf')):
+                    print(f"\t\tSkipping combination: hnswMaxConn ({config['hnswMaxConn']}) > hnswBeamWidth ({config['hnswBeamWidth']})")
+                    continue
+
+                # Set indexDirPath based on hash
+                hash_input = {k: v for k, v in config.items() if k != 'indexDirPath'}
+                hash_id = hashlib.md5(json.dumps(hash_input, sort_keys=True, default=str).encode()).hexdigest()[:8]
+                config['indexDirPath'] = f"index-{hash_id}"
+
+                filename = f"{algo}-{hash_id}.json"
+                sweep_dir = f"{args.configs_dir}/{sweep}"
+                filepath = f"{sweep_dir}/{filename}"
+                os.makedirs(sweep_dir, exist_ok=True)
+                with open(filepath, 'w') as f:
+                    json.dump(config, f, indent=2)
+                print(f"\tGenerated config file: {filepath}")
+        else:
+            # No variants at all, just generate a single config
+            hash_id = hashlib.md5(json.dumps(algo_invariants, sort_keys=True).encode()).hexdigest()[:8]
+            config = algo_invariants.copy()
+
+            # Set indexDirPath based on hash
+            config['indexDirPath'] = f"index-{hash_id}"
+
+            filename = f"{algo}-{hash_id}.json"
+            sweep_dir = f"{args.configs_dir}/{sweep}"
+            filepath = f"{sweep_dir}/{filename}"
+            os.makedirs(sweep_dir, exist_ok=True)
+            with open(filepath, 'w') as f:
+                json.dump(config, f, indent=2)
+            print(f"\tGenerated config file: {filepath}")
 
 
     print("----------------------")
2 changes: 1 addition & 1 deletion pom.xml
@@ -35,7 +35,7 @@
     <dependency>
         <groupId>com.nvidia.cuvs.lucene</groupId>
         <artifactId>cuvs-lucene</artifactId>
-        <version>26.04.0</version>
+        <version>26.06.0</version>
     </dependency>
     <dependency>
         <groupId>com.nvidia.cuvs</groupId>
34 changes: 22 additions & 12 deletions run_pareto_analysis.sh
@@ -81,9 +81,14 @@ import csv
 import json
 import glob
 
-def create_index_name_from_config(config):
+def create_index_name_from_results(config, metrics):
+    \"\"\"Create index name from config and metrics.
+
+    efSearch is read from metrics (scalar int per search run)
+    rather than config (now a list of values to sweep).
+    \"\"\"
     algorithm = config.get('algoToRun', 'UNKNOWN')
-    ef_search = config.get('efSearch', 0)
+    ef_search = metrics.get('efSearch', 0)
 
     if algorithm in ['LUCENE_HNSW', 'hnsw']:
         beam_width = config.get('hnswBeamWidth', 0)
@@ -144,13 +149,17 @@ for algorithm, pareto_indices in pareto_runs_by_algo.items():
 
     index_to_dir = {}
     for benchmark_dir in benchmark_dirs:
-        results_json_path = os.path.join(benchmark_dir, 'results.json')
-        if os.path.exists(results_json_path):
+        # Walk into subdirectories to find efSearch_* results
+        for root, dirs, files in os.walk(benchmark_dir):
+            if 'results.json' not in files:
+                continue
+            results_json_path = os.path.join(root, 'results.json')
             try:
                 with open(results_json_path, 'r') as f:
                     results_data = json.load(f)
 
                 config = results_data['configuration']
+                metrics = results_data['metrics']
                 algo_to_run = config.get('algoToRun')
 
                 algorithm_match = False
@@ -160,20 +169,20 @@ for algorithm, pareto_indices in pareto_runs_by_algo.items():
                     algorithm_match = True
 
                 if algorithm_match:
-                    index_name = create_index_name_from_config(config)
+                    index_name = create_index_name_from_results(config, metrics)
                     if index_name not in index_to_dir:
-                        index_to_dir[index_name] = benchmark_dir
+                        index_to_dir[index_name] = root
             except Exception as e:
-                print(f' Error processing {benchmark_dir}: {e}')
+                print(f' Error processing {root}: {e}')
 
     print(f'Mapped {len(index_to_dir)} configurations')
 
     matched = 0
     unmatched = 0
     for index_name, pareto_run in pareto_indices.items():
         if index_name in index_to_dir:
-            benchmark_dir = index_to_dir[index_name]
-            is_pareto_file = os.path.join(benchmark_dir, 'is_pareto')
+            result_dir = index_to_dir[index_name]
+            is_pareto_file = os.path.join(result_dir, 'is_pareto')
 
             with open(is_pareto_file, 'w') as f:
                 f.write(f'Pareto optimal run\\n')
@@ -211,10 +220,11 @@ ls -la "${OUTPUT_DIR}/plots"/*.png
 
 echo ""
 echo "Cleaning up intermediate files..."
-rm -rf "${INTERMEDIATE_DIR}"
-echo "Intermediate files cleaned up!"
+# rm -rf "${INTERMEDIATE_DIR}"
+# echo "Intermediate files cleaned up!"
+echo "Intermediate files left intact!"
 echo ""
 echo "Final output:"
 echo "- Pareto optimal runs marked with is_pareto files"
 echo "- Plots: ${OUTPUT_DIR}/plots/"
-echo "- No intermediate files (completely cleaned up)"
+echo "- Yes intermediate files are still present ;)"
6 changes: 6 additions & 0 deletions run_sweep.sh
@@ -1,5 +1,11 @@
 #!/bin/bash
 
+export MAVEN_OPTS="-Xmx80g -Xms8g \
+    -XX:+UseG1GC \
+    -XX:MinHeapFreeRatio=5 \
+    -XX:MaxHeapFreeRatio=15 \
+    -XX:G1PeriodicGCInterval=5000"
+
 # Parse command-line arguments
 while getopts ":-:" opt; do
     case $OPTARG in
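
The exported MAVEN_OPTS give the forked JVM an 80 GB heap cap and G1 settings (low heap-free ratios plus a 5-second periodic GC) that encourage it to return memory between benchmark runs. The option loop that follows uses the getopts long-option idiom; below is a minimal sketch of how that pattern parses --name=value arguments, with hypothetical flag names rather than the script's real options:

```bash
# Sketch of the long-option idiom: with optstring ":-:", getopts treats "-"
# as an option that takes an argument, so "--name=value" arrives as
# OPTARG="name=value". The flag names below are hypothetical.
while getopts ":-:" opt; do
    case $OPTARG in
        sweep-file=*)
            SWEEP_FILE="${OPTARG#*=}"   # strip everything up to the first '='
            ;;
        *)
            echo "Unknown option: --${OPTARG}" >&2
            exit 1
            ;;
    esac
done
echo "sweep file: ${SWEEP_FILE}"
```
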