tugberkcapraz · tugberkcapraz · Mar 13, 2026 · Mar 13, 2026 · Mar 13, 2026 · Mar 13, 2026
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -32,3 +32,41 @@ jobs:
         run: pip install '.[dev]'
       - name: Run tests
         run: pytest tests/ -v
+
+  wheel-build:
+    name: wheel-${{ matrix.os }}-${{ matrix.target }}
+    runs-on: ${{ matrix.runner }}
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - os: linux
+            runner: ubuntu-latest
+            target: x86_64
+          - os: linux
+            runner: ubuntu-latest
+            target: aarch64
+          - os: macos
+            runner: macos-14
+            target: x86_64
+          - os: macos
+            runner: macos-14
+            target: aarch64
+          - os: windows
+            runner: windows-latest
+            target: x86_64
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with:
+          python-version: "3.12"
+      - name: Build wheel
+        uses: PyO3/maturin-action@v1
+        with:
+          target: ${{ matrix.os == 'macos' && format('{0}-apple-darwin', matrix.target) || matrix.target }}
+          args: --release --out dist -i 3.12
+          manylinux: auto
+          before-script-linux: |
+            if ! command -v python3 > /dev/null 2>&1; then
+              apt-get update && apt-get install -y python3-dev || true
+            fi
diff --git a/Cargo.toml b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "incdbscan_rs"
-version = "0.1.0"
+version = "0.2.0"
 edition = "2021"
 
 [lib]
@@ -16,3 +16,12 @@ pyo3 = { version = "0.23", optional = true }
 numpy = { version = "0.23", optional = true }
 petgraph = "0.7"
 xxhash-rust = { version = "0.8", features = ["xxh64"] }
+rayon = "1.10"
+
+[dev-dependencies]
+rand = "0.8"
+rand_distr = "0.4"
+
+[[bench]]
+name = "batch_scaling"
+harness = false
diff --git a/README.md b/README.md
@@ -121,6 +121,25 @@ Deletion involves BFS-based split detection, which has similar algorithmic compl
 | 500 points | 0.592s | 0.017s | **34x** |
 | 1000 points | 2.566s | 0.251s | **10x** |
 
+### High-dimensional scaling (v0.2.0)
+
+Simulates a real embedding workload: 996-dimensional L2-normalized vectors (resembling OpenAI/Gemini text embeddings), `eps=1.2`, `min_pts=5`, inserted in batches of 1,500 points. Measured on a single machine.
+
+| Batch | Total Points | v0.1.0 (s) | v0.2.0 (s) | Speedup |
+|---|---|---|---|---|
+| 1 | 1,500 | 1.617 | 0.547 | **3.0x** |
+| 3 | 4,500 | 8.053 | 1.397 | **5.8x** |
+| 5 | 7,500 | 14.466 | 2.273 | **6.4x** |
+| 7 | 10,500 | 20.898 | 3.039 | **6.9x** |
+| 10 | 15,000 | 29.606 | 4.347 | **6.8x** |
+| **Total** | | **160.2s** | **24.9s** | **6.4x** |
+
+v0.2.0 introduces two optimizations that dramatically improve performance for high-dimensional data:
+
+1. **Early termination in distance computation.** For Euclidean distance (p=2.0), squared differences are accumulated in chunks of 4 dimensions. If the partial sum exceeds eps² at any checkpoint, the remaining dimensions are skipped. This is exact -- no approximation, bit-for-bit identical results to a full computation. For high-dimensional embeddings with a tight eps, most non-neighbor pairs are rejected after computing only 5-15% of dimensions.
+
+2. **Parallel spatial index scan.** When the dataset exceeds 1,000 points, the brute-force neighbor scan is parallelized across CPU cores using [rayon](https://github.com/rayon-rs/rayon). Below 1,000 points, sequential scan avoids thread pool overhead.
+
 ### Stress test: 10 batches of 500 points
 
 Simulates a real workload: insert 500 points per batch, delete 100 per batch, 10 batches total (5000 inserts, 1000 deletes).
@@ -184,9 +203,9 @@ src/
 ├── lib.rs              # PyO3 module + IncrementalDBSCAN pyclass
 ├── engine.rs           # Pure-Rust IncrementalDbscan entry point
 ├── types.rs            # ObjectId (u64), ClusterLabel (i64), constants, hash function
-├── distance.rs         # Minkowski distance family (p=2 optimized with squared distance)
+├── distance.rs         # Minkowski distance family (p=2 optimized with early termination)
 ├── object.rs           # ObjectData struct (id, count, neighbor_count, core status)
-├── spatial_index.rs    # Brute-force spatial index (Vec-based, O(1) insert, O(n) query)
+├── spatial_index.rs    # Brute-force spatial index (Vec-based, O(1) insert, parallel O(n) query)
 ├── labels.rs           # LabelHandler (bidirectional HashMap mapping)
 ├── objects.rs          # Central manager (petgraph StableGraph + spatial index + labels)
 ├── inserter.rs         # Insertion algorithm (creation / absorption / merge)
@@ -199,18 +218,18 @@ src/
 - **`petgraph::StableGraph`** instead of a plain graph. Stable node indices survive node removal, which is critical since we store `NodeIndex` values in hash maps.
 - **No neighbor set on objects.** The Python version stores `obj.neighbors` as a set. Rust queries `graph.neighbors(node_idx)` directly, avoiding duplicated state and circular references.
 - **`DeletedObjectInfo` pattern.** Python accesses a deleted object's neighbors after deletion (the object persists in memory via GC). Rust snapshots neighbor data into a struct before removal.
-- **Brute-force spatial index.** O(1) insert + O(n) query per insert beats Python's O(n log n) tree rebuild + O(log n + k) query, because the rebuild dominates. Upgradeable to grid-based spatial hashing for very large datasets.
+- **Brute-force spatial index with early termination.** O(1) insert + O(n) query per insert beats Python's O(n log n) tree rebuild + O(log n + k) query, because the rebuild dominates. For Euclidean distance, the query uses early termination to skip dimensions once the partial squared distance exceeds eps², and parallelizes across CPU cores via rayon for datasets above 1,000 points.
 - **Feature-gated PyO3.** PyO3 bindings are behind the `extension-module` Cargo feature, so `cargo test` runs pure Rust tests without requiring a Python interpreter.
 
 ## Running tests
 
-### Rust unit tests (24 tests)
+### Rust unit tests (25 tests)
 
 ```bash
 cargo test
 ```
 
-Tests cover: distance calculations, hashing, spatial index operations, label management, object data structures.
+Tests cover: distance calculations, early termination correctness, hashing, spatial index operations, label management, object data structures.
 
 ### Python tests (36 tests)
 
@@ -221,6 +240,14 @@ pytest
 
 Tests cover: construction, noise, cluster creation, absorption, merge, duplicates, deletion, 2-way/3-way splits, reinsert, multi-dimensional (1D-50D), distance metrics, sklearn cross-validation, stress testing.
 
+### Benchmarks
+
+```bash
+cargo bench --bench batch_scaling
+```
+
+Runs the high-dimensional batch insertion benchmark (10 batches of 1,500 points, 996 dimensions).
+
 ## Differences from Python incdbscan
 
 | Feature | Python incdbscan | incdbscan-rs |

diff --git a/benches/batch_scaling.rs b/benches/batch_scaling.rs
@@ -0,0 +1,107 @@
+use std::time::Instant;
+
+use rand::prelude::*;
+use rand::rngs::StdRng;
+use rand_distr::StandardNormal;
+
+use incdbscan_rs::engine::IncrementalDbscan;
+
+/// Generate L2-normalized random vectors resembling real text embeddings
+/// (OpenAI, Gemini style): each component ~ N(0,1), then normalized to unit length.
+///
+/// Points are grouped into clusters to simulate real article embeddings where
+/// articles about similar topics have nearby embeddings.
+fn generate_clustered_embeddings(
+    rng: &mut StdRng,
+    count: usize,
+    dims: usize,
+    num_clusters: usize,
+    noise_scale: f64,
+) -> Vec<Vec<f64>> {
+    // Generate cluster centers (random unit vectors)
+    let centers: Vec<Vec<f64>> = (0..num_clusters)
+        .map(|_| {
+            let v: Vec<f64> = (0..dims).map(|_| rng.sample(StandardNormal)).collect();
+            normalize(&v)
+        })
+        .collect();
+
+    // Generate points around cluster centers
+    (0..count)
+        .map(|_| {
+            let center = &centers[rng.gen_range(0..num_clusters)];
+            let noisy: Vec<f64> = center
+                .iter()
+                .map(|&c| c + noise_scale * rng.sample::<f64, _>(StandardNormal))
+                .collect();
+            normalize(&noisy)
+        })
+        .collect()
+}
+
+fn normalize(v: &[f64]) -> Vec<f64> {
+    let norm: f64 = v.iter().map(|x| x * x).sum::<f64>().sqrt();
+    v.iter().map(|x| x / norm).collect()
+}
+
+fn main() {
+    let dims = 996;
+    let eps = 1.2;
+    let min_pts: u32 = 5;
+    let p = 2.0;
+    let num_batches = 10;
+    let batch_size = 1500;
+    let num_clusters = 50;
+    // noise_scale calibrated so intra-cluster distances are ~0.5-1.0 (within eps=1.2)
+    let noise_scale = 0.02;
+
+    let mut rng = StdRng::seed_from_u64(42);
+
+    // Pre-generate all batches
+    let batches: Vec<Vec<Vec<f64>>> = (0..num_batches)
+        .map(|_| generate_clustered_embeddings(&mut rng, batch_size, dims, num_clusters, noise_scale))
+        .collect();
+
+    println!();
+    println!("IncrementalDBSCAN Batch Insertion Benchmark");
+    println!("============================================");
+    println!(
+        "dims={}, eps={}, min_pts={}, p={}, clusters={}",
+        dims, eps, min_pts, p, num_clusters
+    );
+    println!(
+        "batches={}, points_per_batch={}",
+        num_batches, batch_size
+    );
+    println!("{:-<60}", "");
+    println!(
+        "{:<8} {:>14} {:>14} {:>14}",
+        "Batch", "Total Points", "Batch (s)", "Cumul (s)"
+    );
+    println!("{:-<60}", "");
+
+    let mut db = IncrementalDbscan::new(eps, min_pts, p);
+    let overall_start = Instant::now();
+
+    for (i, batch) in batches.iter().enumerate() {
+        let batch_start = Instant::now();
+        for point in batch {
+            db.insert(point);
+        }
+        let batch_elapsed = batch_start.elapsed().as_secs_f64();
+        let cumul_elapsed = overall_start.elapsed().as_secs_f64();
+
+        println!(
+            "{:<8} {:>14} {:>14.3} {:>14.3}",
+            i + 1,
+            (i + 1) * batch_size,
+            batch_elapsed,
+            cumul_elapsed
+        );
+    }
+
+    let total = overall_start.elapsed();
+    println!("{:-<60}", "");
+    println!("Total: {:.3}s", total.as_secs_f64());
+    println!();
+}
diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "maturin"
 
 [project]
 name = "incdbscan-rs"
-version = "0.1.0"
+version = "0.2.0"
 description = "High-performance Rust implementation of IncrementalDBSCAN with Python bindings."
 readme = "README.md"
 license = {text = "BSD-3-Clause"}

diff --git a/python/incdbscan_rs/__init__.py b/python/incdbscan_rs/__init__.py
@@ -1,4 +1,4 @@
 from ._incdbscan_rs import IncrementalDBSCAN
 
-__version__ = "0.1.0"
+__version__ = "0.2.0"
 __all__ = ["IncrementalDBSCAN"]
diff --git a/src/distance.rs b/src/distance.rs
@@ -26,6 +26,42 @@ pub fn squared_euclidean_distance(a: &[f64], b: &[f64]) -> f64 {
         .sum()
 }
 
+/// Check if squared Euclidean distance is within threshold, with early termination.
+/// Returns true if ||a - b||² <= threshold.
+///
+/// Since squared differences are non-negative, the running sum can only grow.
+/// If the partial sum exceeds the threshold after processing a chunk of dimensions,
+/// the remaining dimensions can only make it larger — so we bail out early.
+/// This is exact — no approximation, bit-for-bit identical results.
+#[inline]
+pub fn squared_euclidean_within(a: &[f64], b: &[f64], threshold: f64) -> bool {
+    debug_assert_eq!(a.len(), b.len());
+    let n = a.len();
+    let mut sum = 0.0;
+
+    // Process in chunks of 4 for better auto-vectorization within each chunk
+    let chunks = n / 4;
+    for chunk in 0..chunks {
+        let base = chunk * 4;
+        let d0 = unsafe { *a.get_unchecked(base) - *b.get_unchecked(base) };
+        let d1 = unsafe { *a.get_unchecked(base + 1) - *b.get_unchecked(base + 1) };
+        let d2 = unsafe { *a.get_unchecked(base + 2) - *b.get_unchecked(base + 2) };
+        let d3 = unsafe { *a.get_unchecked(base + 3) - *b.get_unchecked(base + 3) };
+        sum += d0 * d0 + d1 * d1 + d2 * d2 + d3 * d3;
+        if sum > threshold {
+            return false;
+        }
+    }
+
+    // Handle remaining dimensions
+    for i in (chunks * 4)..n {
+        let d = unsafe { *a.get_unchecked(i) - *b.get_unchecked(i) };
+        sum += d * d;
+    }
+
+    sum <= threshold
+}
+
 #[inline]
 fn euclidean_distance(a: &[f64], b: &[f64]) -> f64 {
     squared_euclidean_distance(a, b).sqrt()