From 490efd78b30d846bbc66a54505cb5f8625cba72f Mon Sep 17 00:00:00 2001 From: tugberkcapraz Date: Fri, 13 Mar 2026 15:35:03 +0100 Subject: [PATCH 1/3] Add early termination and rayon parallelism for high-dimensional performance MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Spatial index query_radius now uses early termination for Euclidean distance: squared differences are accumulated in chunks of 4 dimensions and bailed out early when the partial sum exceeds eps². For 996-dim embeddings with eps=1.2, most non-neighbor pairs are rejected after ~5-15% of dimensions. Above 1000 points, the brute-force scan is parallelized across CPU cores via rayon. Below that threshold, sequential scan avoids thread pool overhead. Benchmark (10 batches x 1500 points, 996-dim, eps=1.2): v0.1.0: 160.2s total v0.2.0: 24.9s total (6.4x faster) Bump version to 0.2.0 across Cargo.toml, pyproject.toml, and __init__.py. Co-Authored-By: Claude Opus 4.6 --- Cargo.toml | 11 +++- README.md | 37 +++++++++-- benches/batch_scaling.rs | 107 ++++++++++++++++++++++++++++++++ pyproject.toml | 2 +- python/incdbscan_rs/__init__.py | 2 +- src/distance.rs | 36 +++++++++++ src/spatial_index.rs | 78 +++++++++++++++++++---- 7 files changed, 254 insertions(+), 19 deletions(-) create mode 100644 benches/batch_scaling.rs diff --git a/Cargo.toml b/Cargo.toml index 884eb1a..9f8a25d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "incdbscan_rs" -version = "0.1.0" +version = "0.2.0" edition = "2021" [lib] @@ -16,3 +16,12 @@ pyo3 = { version = "0.23", optional = true } numpy = { version = "0.23", optional = true } petgraph = "0.7" xxhash-rust = { version = "0.8", features = ["xxh64"] } +rayon = "1.10" + +[dev-dependencies] +rand = "0.8" +rand_distr = "0.4" + +[[bench]] +name = "batch_scaling" +harness = false diff --git a/README.md b/README.md index d592bea..7dae8b4 100644 --- a/README.md +++ b/README.md @@ -121,6 +121,25 @@ Deletion involves BFS-based split detection, which has similar algorithmic compl | 500 points | 0.592s | 0.017s | **34x** | | 1000 points | 2.566s | 0.251s | **10x** | +### High-dimensional scaling (v0.2.0) + +Simulates a real embedding workload: 996-dimensional L2-normalized vectors (resembling OpenAI/Gemini text embeddings), `eps=1.2`, `min_pts=5`, inserted in batches of 1,500 points. Measured on a single machine. + +| Batch | Total Points | v0.1.0 (s) | v0.2.0 (s) | Speedup | +|---|---|---|---|---| +| 1 | 1,500 | 1.617 | 0.547 | **3.0x** | +| 3 | 4,500 | 8.053 | 1.397 | **5.8x** | +| 5 | 7,500 | 14.466 | 2.273 | **6.4x** | +| 7 | 10,500 | 20.898 | 3.039 | **6.9x** | +| 10 | 15,000 | 29.606 | 4.347 | **6.8x** | +| **Total** | | **160.2s** | **24.9s** | **6.4x** | + +v0.2.0 introduces two optimizations that dramatically improve performance for high-dimensional data: + +1. **Early termination in distance computation.** For Euclidean distance (p=2.0), squared differences are accumulated in chunks of 4 dimensions. If the partial sum exceeds eps² at any checkpoint, the remaining dimensions are skipped. This is exact -- no approximation, bit-for-bit identical results to a full computation. For high-dimensional embeddings with a tight eps, most non-neighbor pairs are rejected after computing only 5-15% of dimensions. + +2. **Parallel spatial index scan.** When the dataset exceeds 1,000 points, the brute-force neighbor scan is parallelized across CPU cores using [rayon](https://github.com/rayon-rs/rayon). Below 1,000 points, sequential scan avoids thread pool overhead. + ### Stress test: 10 batches of 500 points Simulates a real workload: insert 500 points per batch, delete 100 per batch, 10 batches total (5000 inserts, 1000 deletes). @@ -184,9 +203,9 @@ src/ ├── lib.rs # PyO3 module + IncrementalDBSCAN pyclass ├── engine.rs # Pure-Rust IncrementalDbscan entry point ├── types.rs # ObjectId (u64), ClusterLabel (i64), constants, hash function -├── distance.rs # Minkowski distance family (p=2 optimized with squared distance) +├── distance.rs # Minkowski distance family (p=2 optimized with early termination) ├── object.rs # ObjectData struct (id, count, neighbor_count, core status) -├── spatial_index.rs # Brute-force spatial index (Vec-based, O(1) insert, O(n) query) +├── spatial_index.rs # Brute-force spatial index (Vec-based, O(1) insert, parallel O(n) query) ├── labels.rs # LabelHandler (bidirectional HashMap mapping) ├── objects.rs # Central manager (petgraph StableGraph + spatial index + labels) ├── inserter.rs # Insertion algorithm (creation / absorption / merge) @@ -199,18 +218,18 @@ src/ - **`petgraph::StableGraph`** instead of a plain graph. Stable node indices survive node removal, which is critical since we store `NodeIndex` values in hash maps. - **No neighbor set on objects.** The Python version stores `obj.neighbors` as a set. Rust queries `graph.neighbors(node_idx)` directly, avoiding duplicated state and circular references. - **`DeletedObjectInfo` pattern.** Python accesses a deleted object's neighbors after deletion (the object persists in memory via GC). Rust snapshots neighbor data into a struct before removal. -- **Brute-force spatial index.** O(1) insert + O(n) query per insert beats Python's O(n log n) tree rebuild + O(log n + k) query, because the rebuild dominates. Upgradeable to grid-based spatial hashing for very large datasets. +- **Brute-force spatial index with early termination.** O(1) insert + O(n) query per insert beats Python's O(n log n) tree rebuild + O(log n + k) query, because the rebuild dominates. For Euclidean distance, the query uses early termination to skip dimensions once the partial squared distance exceeds eps², and parallelizes across CPU cores via rayon for datasets above 1,000 points. - **Feature-gated PyO3.** PyO3 bindings are behind the `extension-module` Cargo feature, so `cargo test` runs pure Rust tests without requiring a Python interpreter. ## Running tests -### Rust unit tests (24 tests) +### Rust unit tests (25 tests) ```bash cargo test ``` -Tests cover: distance calculations, hashing, spatial index operations, label management, object data structures. +Tests cover: distance calculations, early termination correctness, hashing, spatial index operations, label management, object data structures. ### Python tests (36 tests) @@ -221,6 +240,14 @@ pytest Tests cover: construction, noise, cluster creation, absorption, merge, duplicates, deletion, 2-way/3-way splits, reinsert, multi-dimensional (1D-50D), distance metrics, sklearn cross-validation, stress testing. +### Benchmarks + +```bash +cargo bench --bench batch_scaling +``` + +Runs the high-dimensional batch insertion benchmark (10 batches of 1,500 points, 996 dimensions). + ## Differences from Python incdbscan | Feature | Python incdbscan | incdbscan-rs | diff --git a/benches/batch_scaling.rs b/benches/batch_scaling.rs new file mode 100644 index 0000000..d650d7b --- /dev/null +++ b/benches/batch_scaling.rs @@ -0,0 +1,107 @@ +use std::time::Instant; + +use rand::prelude::*; +use rand::rngs::StdRng; +use rand_distr::StandardNormal; + +use incdbscan_rs::engine::IncrementalDbscan; + +/// Generate L2-normalized random vectors resembling real text embeddings +/// (OpenAI, Gemini style): each component ~ N(0,1), then normalized to unit length. +/// +/// Points are grouped into clusters to simulate real article embeddings where +/// articles about similar topics have nearby embeddings. +fn generate_clustered_embeddings( + rng: &mut StdRng, + count: usize, + dims: usize, + num_clusters: usize, + noise_scale: f64, +) -> Vec> { + // Generate cluster centers (random unit vectors) + let centers: Vec> = (0..num_clusters) + .map(|_| { + let v: Vec = (0..dims).map(|_| rng.sample(StandardNormal)).collect(); + normalize(&v) + }) + .collect(); + + // Generate points around cluster centers + (0..count) + .map(|_| { + let center = ¢ers[rng.gen_range(0..num_clusters)]; + let noisy: Vec = center + .iter() + .map(|&c| c + noise_scale * rng.sample::(StandardNormal)) + .collect(); + normalize(&noisy) + }) + .collect() +} + +fn normalize(v: &[f64]) -> Vec { + let norm: f64 = v.iter().map(|x| x * x).sum::().sqrt(); + v.iter().map(|x| x / norm).collect() +} + +fn main() { + let dims = 996; + let eps = 1.2; + let min_pts: u32 = 5; + let p = 2.0; + let num_batches = 10; + let batch_size = 1500; + let num_clusters = 50; + // noise_scale calibrated so intra-cluster distances are ~0.5-1.0 (within eps=1.2) + let noise_scale = 0.02; + + let mut rng = StdRng::seed_from_u64(42); + + // Pre-generate all batches + let batches: Vec>> = (0..num_batches) + .map(|_| generate_clustered_embeddings(&mut rng, batch_size, dims, num_clusters, noise_scale)) + .collect(); + + println!(); + println!("IncrementalDBSCAN Batch Insertion Benchmark"); + println!("============================================"); + println!( + "dims={}, eps={}, min_pts={}, p={}, clusters={}", + dims, eps, min_pts, p, num_clusters + ); + println!( + "batches={}, points_per_batch={}", + num_batches, batch_size + ); + println!("{:-<60}", ""); + println!( + "{:<8} {:>14} {:>14} {:>14}", + "Batch", "Total Points", "Batch (s)", "Cumul (s)" + ); + println!("{:-<60}", ""); + + let mut db = IncrementalDbscan::new(eps, min_pts, p); + let overall_start = Instant::now(); + + for (i, batch) in batches.iter().enumerate() { + let batch_start = Instant::now(); + for point in batch { + db.insert(point); + } + let batch_elapsed = batch_start.elapsed().as_secs_f64(); + let cumul_elapsed = overall_start.elapsed().as_secs_f64(); + + println!( + "{:<8} {:>14} {:>14.3} {:>14.3}", + i + 1, + (i + 1) * batch_size, + batch_elapsed, + cumul_elapsed + ); + } + + let total = overall_start.elapsed(); + println!("{:-<60}", ""); + println!("Total: {:.3}s", total.as_secs_f64()); + println!(); +} diff --git a/pyproject.toml b/pyproject.toml index cc270c4..b6a0496 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "maturin" [project] name = "incdbscan-rs" -version = "0.1.0" +version = "0.2.0" description = "High-performance Rust implementation of IncrementalDBSCAN with Python bindings." readme = "README.md" license = {text = "BSD-3-Clause"} diff --git a/python/incdbscan_rs/__init__.py b/python/incdbscan_rs/__init__.py index c6ce2f2..43d0985 100644 --- a/python/incdbscan_rs/__init__.py +++ b/python/incdbscan_rs/__init__.py @@ -1,4 +1,4 @@ from ._incdbscan_rs import IncrementalDBSCAN -__version__ = "0.1.0" +__version__ = "0.2.0" __all__ = ["IncrementalDBSCAN"] diff --git a/src/distance.rs b/src/distance.rs index a843206..b393176 100644 --- a/src/distance.rs +++ b/src/distance.rs @@ -26,6 +26,42 @@ pub fn squared_euclidean_distance(a: &[f64], b: &[f64]) -> f64 { .sum() } +/// Check if squared Euclidean distance is within threshold, with early termination. +/// Returns true if ||a - b||² <= threshold. +/// +/// Since squared differences are non-negative, the running sum can only grow. +/// If the partial sum exceeds the threshold after processing a chunk of dimensions, +/// the remaining dimensions can only make it larger — so we bail out early. +/// This is exact — no approximation, bit-for-bit identical results. +#[inline] +pub fn squared_euclidean_within(a: &[f64], b: &[f64], threshold: f64) -> bool { + debug_assert_eq!(a.len(), b.len()); + let n = a.len(); + let mut sum = 0.0; + + // Process in chunks of 4 for better auto-vectorization within each chunk + let chunks = n / 4; + for chunk in 0..chunks { + let base = chunk * 4; + let d0 = unsafe { *a.get_unchecked(base) - *b.get_unchecked(base) }; + let d1 = unsafe { *a.get_unchecked(base + 1) - *b.get_unchecked(base + 1) }; + let d2 = unsafe { *a.get_unchecked(base + 2) - *b.get_unchecked(base + 2) }; + let d3 = unsafe { *a.get_unchecked(base + 3) - *b.get_unchecked(base + 3) }; + sum += d0 * d0 + d1 * d1 + d2 * d2 + d3 * d3; + if sum > threshold { + return false; + } + } + + // Handle remaining dimensions + for i in (chunks * 4)..n { + let d = unsafe { *a.get_unchecked(i) - *b.get_unchecked(i) }; + sum += d * d; + } + + sum <= threshold +} + #[inline] fn euclidean_distance(a: &[f64], b: &[f64]) -> f64 { squared_euclidean_distance(a, b).sqrt() diff --git a/src/spatial_index.rs b/src/spatial_index.rs index 46cd0a2..6ac0cf9 100644 --- a/src/spatial_index.rs +++ b/src/spatial_index.rs @@ -1,6 +1,12 @@ -use crate::distance::{minkowski_distance, squared_euclidean_distance}; +use rayon::prelude::*; + +use crate::distance::{minkowski_distance, squared_euclidean_within}; use crate::types::ObjectId; +/// Minimum number of stored points before switching to parallel scan. +/// Below this, rayon's thread pool overhead exceeds the parallelism benefit. +const PARALLEL_THRESHOLD: usize = 1000; + /// Brute-force spatial index with O(1) insert, O(1) delete, O(n) query. /// Trades query speed for incremental insert/delete efficiency. pub struct SpatialIndex { @@ -55,20 +61,43 @@ impl SpatialIndex { pub fn query_radius(&self, query: &[f64]) -> Vec { debug_assert_eq!(query.len(), self.dims); let n = self.ids.len(); - let mut result = Vec::new(); if self.p == 2.0 { - // Optimized: compare squared distances to avoid sqrt let eps_sq = self.eps * self.eps; - for i in 0..n { - let start = i * self.dims; - let end = start + self.dims; - let point = &self.coords[start..end]; - if squared_euclidean_distance(query, point) <= eps_sq { - result.push(self.ids[i]); + + if n >= PARALLEL_THRESHOLD { + // Parallel scan with rayon + early termination + let ids = &self.ids; + let coords = &self.coords; + let dims = self.dims; + + (0..n) + .into_par_iter() + .filter_map(|i| { + let start = i * dims; + let point = &coords[start..start + dims]; + if squared_euclidean_within(query, point, eps_sq) { + Some(ids[i]) + } else { + None + } + }) + .collect() + } else { + // Sequential scan with early termination for small datasets + let mut result = Vec::new(); + for i in 0..n { + let start = i * self.dims; + let point = &self.coords[start..start + self.dims]; + if squared_euclidean_within(query, point, eps_sq) { + result.push(self.ids[i]); + } } + result } } else { + // Non-Euclidean metrics: sequential scan (rayon possible but less common) + let mut result = Vec::new(); for i in 0..n { let start = i * self.dims; let end = start + self.dims; @@ -77,15 +106,15 @@ impl SpatialIndex { result.push(self.ids[i]); } } + result } - - result } } #[cfg(test)] mod tests { use super::*; + use crate::distance::squared_euclidean_distance; #[test] fn test_insert_and_query() { @@ -155,4 +184,31 @@ mod tests { assert!(neighbors.contains(&2)); assert!(!neighbors.contains(&3)); } + + /// Verify that early termination produces identical results to full computation. + #[test] + fn test_early_termination_correctness() { + let mut idx = SpatialIndex::new(1.5, 2.0); + let eps_sq = 1.5_f64 * 1.5; + + // Add some high-dimensional points + let dims = 100; + let p1: Vec = (0..dims).map(|i| i as f64 * 0.01).collect(); + let p2: Vec = (0..dims).map(|i| i as f64 * 0.01 + 0.1).collect(); + let p3: Vec = (0..dims).map(|i| i as f64 * 0.1).collect(); // far away + + idx.insert(1, &p1); + idx.insert(2, &p2); + idx.insert(3, &p3); + + let neighbors = idx.query_radius(&p1); + + // Verify against full distance computation + let d12 = squared_euclidean_distance(&p1, &p2); + let d13 = squared_euclidean_distance(&p1, &p3); + + assert_eq!(neighbors.contains(&2), d12 <= eps_sq); + assert_eq!(neighbors.contains(&3), d13 <= eps_sq); + assert!(neighbors.contains(&1)); // self is always a neighbor + } } From 076ade38b6cc3851915ecde06cd4edaa23299584 Mon Sep 17 00:00:00 2001 From: tugberkcapraz Date: Fri, 13 Mar 2026 15:42:33 +0100 Subject: [PATCH 2/3] Add wheel build verification to CI for all platforms MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Build wheels for Linux (x86_64, aarch64), macOS (x86_64, aarch64), and Windows (x86_64) on every push/PR. This catches packaging failures before merge rather than at release time. No publishing — just build verification. Co-Authored-By: Claude Opus 4.6 --- .github/workflows/ci.yml | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a65b5b3..d320fa4 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -32,3 +32,41 @@ jobs: run: pip install '.[dev]' - name: Run tests run: pytest tests/ -v + + wheel-build: + name: wheel-${{ matrix.os }}-${{ matrix.target }} + runs-on: ${{ matrix.runner }} + strategy: + fail-fast: false + matrix: + include: + - os: linux + runner: ubuntu-latest + target: x86_64 + - os: linux + runner: ubuntu-latest + target: aarch64 + - os: macos + runner: macos-14 + target: x86_64 + - os: macos + runner: macos-14 + target: aarch64 + - os: windows + runner: windows-latest + target: x86_64 + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.12" + - name: Build wheel + uses: PyO3/maturin-action@v1 + with: + target: ${{ matrix.os == 'macos' && format('{0}-apple-darwin', matrix.target) || matrix.target }} + args: --release --out dist + manylinux: auto + before-script-linux: | + if ! command -v python3 > /dev/null 2>&1; then + apt-get update && apt-get install -y python3-dev || true + fi From 8bb70ddc945094cce5a8dff0bdaf5d25f4684b2a Mon Sep 17 00:00:00 2001 From: tugberkcapraz Date: Fri, 13 Mar 2026 15:45:17 +0100 Subject: [PATCH 3/3] Fix CI wheel build: specify Python interpreter for cross-compilation The Linux aarch64 cross-compilation container lacks Python interpreters, causing maturin to fail. Specify -i 3.12 explicitly (matching the release workflow pattern). One version is sufficient to verify the build. Co-Authored-By: Claude Opus 4.6 --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index d320fa4..c4f2457 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -64,7 +64,7 @@ jobs: uses: PyO3/maturin-action@v1 with: target: ${{ matrix.os == 'macos' && format('{0}-apple-darwin', matrix.target) || matrix.target }} - args: --release --out dist + args: --release --out dist -i 3.12 manylinux: auto before-script-linux: | if ! command -v python3 > /dev/null 2>&1; then