xarray-contrib · brendancol · Jul 2, 2026 · Jul 2, 2026 · Jul 2, 2026 · Jul 2, 2026
diff --git a/.claude/sweep-performance-state.csv b/.claude/sweep-performance-state.csv
@@ -2,7 +2,7 @@ module,last_inspected,oom_verdict,bottleneck,high_count,issue,notes
 aspect,2026-05-29,SAFE,compute-bound,1,2688,"dask+cupy geodesic densified full lat/lon on one GPU at graph build (OOM at scale); fixed via per-block map_blocks cupy conversion. planar/numpy/dask SAFE; geodesic GPU kernel ~184 regs, mitigated by 16x16 blocks."
 balanced_allocation,2026-04-16T12:00:00Z,WILL OOM,memory-bound,8,1114,"Re-audit 2026-04-16 after PR 1203 float32 fix. 8 HIGH found (friction.compute L339, argmin.compute in iter loop L182, double all_nan recompute L206, stacked cost_surfaces allocation). Covered by existing documented limitation on #1114. Not refiled."
 bilateral,2026-03-31T18:00:00Z,SAFE,compute-bound,0,,
-bump,2026-04-16T12:00:00Z,SAFE,compute-bound,0,1206,Re-audit 2026-04-16: fix verified SAFE. No HIGH findings. MEDIUM: CuPy backend runs CPU kernel then transfers to GPU (documented limitation).
+bump,2026-07-02,SAFE,graph-bound,0,3612,"Perf re-audit 2026-07-02: MEDIUM (#3612/PR) _partition_bumps rescanned all locs per chunk -> O(n_chunks*count) dask graph build (7.8s@25.6k chunks, count=100k); fixed with searchsorted bucketing, byte-identical partitions, 85x@25.6k chunks. Memory SAFE (per-chunk lazy build + guard). LOW: cupy path runs CPU kernel then single host->device transfer (by design, no GPU kernel); np.random.choice(range) slower than randint; unused rows,cols in _finish_bump. cuda-available, cupy+dask+cupy parity verified."
 classify,2026-06-20,RISKY,graph-bound,1,3412,"Re-audit 2026-06-20 (CUDA host). 1 HIGH: _generate_sample_indices >10M branch used RandomState.choice(replace=False) which builds a full arange(num_data) permutation -> O(num_data) host alloc (160MB for 20M pop, OOM at 30TB) despite docstring claiming O(num_sample). Backed dask/dask+cupy natural_breaks/maximum_breaks/quantile/percentiles/box_plot. Fixed via np.random.default_rng().choice (Floyd, O(num_sample), still deterministic); peak 160MB->0.4MB. Other paths SAFE: head_tail_breaks already persists+fuses; box_plot samples; cupy kernels low-register; no .values/np.asarray-on-dask/.compute-in-loop. 93 classify tests pass incl GPU."
 contour,2026-03-31T18:00:00Z,SAFE,compute-bound,0,,
 convolution,2026-03-31T18:00:00Z,SAFE,compute-bound,0,,

diff --git a/benchmarks/benchmarks/bump.py b/benchmarks/benchmarks/bump.py
@@ -0,0 +1,30 @@
+import numpy as np
+import xarray as xr
+
+from xrspatial import bump
+
+
+class Bump:
+    # Two axes: raster width and backend.  The "dask" case times graph
+    # construction only (no compute), which is where _partition_bumps runs;
+    # it is finely chunked so the per-chunk bump partitioning is exercised.
+    params = ([256, 1024, 2048], ["numpy", "dask"])
+    param_names = ("nx", "type")
+
+    def setup(self, nx, type):
+        ny = nx
+        if type == "numpy":
+            self.agg = xr.DataArray(np.zeros((ny, nx)), dims=["y", "x"])
+        elif type == "dask":
+            import dask.array as da
+            self.agg = xr.DataArray(
+                da.zeros((ny, nx), chunks=(64, 64), dtype=np.float64),
+                dims=["y", "x"],
+            )
+        else:
+            raise NotImplementedError()
+        np.random.seed(71942)
+
+    def time_bump(self, nx, type):
+        # numpy: full eager render.  dask: lazy graph build (no compute).
+        bump(agg=self.agg, count=50000, spread=1)
diff --git a/xrspatial/bump.py b/xrspatial/bump.py
@@ -87,20 +87,32 @@ def _partition_bumps(data, locs, heights, spread):
     ny = len(data.chunks[0])
     nx = len(data.chunks[1])
 
-    partitions = {}
-    for yi in range(ny):
-        y0, y1 = int(y_offsets[yi]), int(y_offsets[yi + 1])
-        for xi in range(nx):
-            x0, x1 = int(x_offsets[xi]), int(x_offsets[xi + 1])
-            mask = ((locs[:, 0] >= x0) & (locs[:, 0] < x1) &
-                    (locs[:, 1] >= y0) & (locs[:, 1] < y1))
-            if np.any(mask):
-                local_locs = locs[mask].copy()
-                local_locs[:, 0] -= x0
-                local_locs[:, 1] -= y0
-                partitions[(yi, xi)] = (local_locs, heights[mask].copy())
-            else:
-                partitions[(yi, xi)] = None
+    # Assign every bump to the chunk that holds its centre pixel in a single
+    # vectorised pass.  Rescanning all of ``locs`` once per chunk would be
+    # O(n_chunks * n_bumps), which runs eagerly during dask graph
+    # construction and dominates the runtime for finely chunked rasters.
+    # Callers pass ``locs`` already clamped to ``[0, width) x [0, height)``
+    # (bump() draws them from ``range(w)`` / ``range(h)``), so searchsorted
+    # always lands inside the chunk grid.
+    xi = np.searchsorted(x_offsets, locs[:, 0], side='right') - 1
+    yi = np.searchsorted(y_offsets, locs[:, 1], side='right') - 1
+
+    partitions = {(a, b): None for a in range(ny) for b in range(nx)}
+    if len(locs):
+        flat = yi.astype(np.intp) * nx + xi
+        order = np.argsort(flat, kind='stable')
+        flat_sorted = flat[order]
+        bounds = np.nonzero(np.diff(flat_sorted))[0] + 1
+        starts = np.concatenate([[0], bounds])
+        ends = np.concatenate([bounds, [len(flat_sorted)]])
+        for s, e in zip(starts, ends):
+            idx = order[s:e]
+            a, b = divmod(int(flat_sorted[s]), nx)
+            x0, y0 = int(x_offsets[b]), int(y_offsets[a])
+            local_locs = locs[idx].copy()
+            local_locs[:, 0] -= x0
+            local_locs[:, 1] -= y0
+            partitions[(a, b)] = (local_locs, heights[idx].copy())
     return partitions
 
 

diff --git a/xrspatial/tests/test_bump.py b/xrspatial/tests/test_bump.py
@@ -271,6 +271,63 @@ def test_bump_dask_partitioned_matches_numpy():
     np.testing.assert_array_equal(result_np.values, result_dask.values)
 
 
+# --- Issue #3612 regression test ---
+
+@dask_array_available
+def test_partition_bumps_matches_bruteforce():
+    """Vectorised _partition_bumps must assign each bump to the chunk that
+    holds its centre pixel, with chunk-local coordinates and original order
+    preserved (#3612).
+
+    The reference is the original per-chunk boolean-mask partition, checked
+    across an irregular chunk layout and empty chunks.
+    """
+    from xrspatial.bump import _partition_bumps
+
+    def _reference(data, locs, heights):
+        y_off = np.concatenate([[0], np.cumsum(data.chunks[0])])
+        x_off = np.concatenate([[0], np.cumsum(data.chunks[1])])
+        ny, nx = len(data.chunks[0]), len(data.chunks[1])
+        parts = {}
+        for yi in range(ny):
+            y0, y1 = int(y_off[yi]), int(y_off[yi + 1])
+            for xi in range(nx):
+                x0, x1 = int(x_off[xi]), int(x_off[xi + 1])
+                mask = ((locs[:, 0] >= x0) & (locs[:, 0] < x1) &
+                        (locs[:, 1] >= y0) & (locs[:, 1] < y1))
+                if np.any(mask):
+                    ll = locs[mask].copy()
+                    ll[:, 0] -= x0
+                    ll[:, 1] -= y0
+                    parts[(yi, xi)] = (ll, heights[mask].copy())
+                else:
+                    parts[(yi, xi)] = None
+        return parts
+
+    n = 400
+    data = da.zeros((n, n), dtype=np.float64).rechunk(
+        {0: (50, 120, 80, 150), 1: (70, 90, 110, 130)}
+    )
+    rng = np.random.default_rng(3612)
+    count = 2000
+    locs = np.empty((count, 2), dtype=np.int32)
+    locs[:, 0] = rng.integers(0, n, count)
+    locs[:, 1] = rng.integers(0, n, count)
+    heights = rng.random(count)
+
+    got = _partition_bumps(data, locs, heights, 1)
+    ref = _reference(data, locs, heights)
+
+    assert set(got) == set(ref)
+    for key in ref:
+        g, r = got[key], ref[key]
+        assert (g is None) == (r is None), key
+        if r is None:
+            continue
+        np.testing.assert_array_equal(g[0], r[0])   # chunk-local locs + order
+        np.testing.assert_array_equal(g[1], r[1])   # heights + order
+
+
 # --- Issue #1231 regression tests ---
 
 def test_bump_raster_memory_guard_rejects_pathological_size():