From 5a333c59987bcca6b494a3513ad9c5eb92810902 Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Thu, 2 Jul 2026 11:48:12 -0400 Subject: [PATCH 1/2] Vectorise _partition_bumps to fix O(n_chunks*count) dask graph build (#3612) _partition_bumps rescanned the full locs array with a boolean mask once per chunk, so dask graph construction ran in O(n_chunks * n_bumps) before any compute. Bucket each bump into its chunk in a single searchsorted pass instead (byte-identical partitions, ~85x faster at 25.6k chunks). Add a benchmark (benchmarks/benchmarks/bump.py) and a regression test that locks the partition against the original boolean-mask reference. Sweep: performance. Backends: dask+numpy, dask+cupy (shared helper). --- .claude/sweep-performance-state.csv | 2 +- benchmarks/benchmarks/bump.py | 30 +++++++++++++++ xrspatial/bump.py | 37 ++++++++++++------- xrspatial/tests/test_bump.py | 57 +++++++++++++++++++++++++++++ 4 files changed, 111 insertions(+), 15 deletions(-) create mode 100644 benchmarks/benchmarks/bump.py diff --git a/.claude/sweep-performance-state.csv b/.claude/sweep-performance-state.csv index ef411e9ea..ec1099b40 100644 --- a/.claude/sweep-performance-state.csv +++ b/.claude/sweep-performance-state.csv @@ -2,7 +2,7 @@ module,last_inspected,oom_verdict,bottleneck,high_count,issue,notes aspect,2026-05-29,SAFE,compute-bound,1,2688,"dask+cupy geodesic densified full lat/lon on one GPU at graph build (OOM at scale); fixed via per-block map_blocks cupy conversion. planar/numpy/dask SAFE; geodesic GPU kernel ~184 regs, mitigated by 16x16 blocks." balanced_allocation,2026-04-16T12:00:00Z,WILL OOM,memory-bound,8,1114,"Re-audit 2026-04-16 after PR 1203 float32 fix. 8 HIGH found (friction.compute L339, argmin.compute in iter loop L182, double all_nan recompute L206, stacked cost_surfaces allocation). Covered by existing documented limitation on #1114. Not refiled." bilateral,2026-03-31T18:00:00Z,SAFE,compute-bound,0,, -bump,2026-04-16T12:00:00Z,SAFE,compute-bound,0,1206,Re-audit 2026-04-16: fix verified SAFE. No HIGH findings. MEDIUM: CuPy backend runs CPU kernel then transfers to GPU (documented limitation). +bump,2026-07-02,SAFE,graph-bound,0,3612,"Perf re-audit 2026-07-02: MEDIUM (#3612/PR) _partition_bumps rescanned all locs per chunk -> O(n_chunks*count) dask graph build (7.8s@25.6k chunks, count=100k); fixed with searchsorted bucketing, byte-identical partitions, 85x@25.6k chunks. Memory SAFE (per-chunk lazy build + guard). LOW: cupy path runs CPU kernel then single host->device transfer (by design, no GPU kernel); np.random.choice(range) slower than randint; unused rows,cols in _finish_bump. cuda-available, cupy+dask+cupy parity verified." classify,2026-06-20,RISKY,graph-bound,1,3412,"Re-audit 2026-06-20 (CUDA host). 1 HIGH: _generate_sample_indices >10M branch used RandomState.choice(replace=False) which builds a full arange(num_data) permutation -> O(num_data) host alloc (160MB for 20M pop, OOM at 30TB) despite docstring claiming O(num_sample). Backed dask/dask+cupy natural_breaks/maximum_breaks/quantile/percentiles/box_plot. Fixed via np.random.default_rng().choice (Floyd, O(num_sample), still deterministic); peak 160MB->0.4MB. Other paths SAFE: head_tail_breaks already persists+fuses; box_plot samples; cupy kernels low-register; no .values/np.asarray-on-dask/.compute-in-loop. 93 classify tests pass incl GPU." contour,2026-03-31T18:00:00Z,SAFE,compute-bound,0,, convolution,2026-03-31T18:00:00Z,SAFE,compute-bound,0,, diff --git a/benchmarks/benchmarks/bump.py b/benchmarks/benchmarks/bump.py new file mode 100644 index 000000000..7961d0c3c --- /dev/null +++ b/benchmarks/benchmarks/bump.py @@ -0,0 +1,30 @@ +import numpy as np +import xarray as xr + +from xrspatial import bump + + +class Bump: + # Two axes: raster width and backend. The "dask" case times graph + # construction only (no compute), which is where _partition_bumps runs; + # it is finely chunked so the per-chunk bump partitioning is exercised. + params = ([256, 1024, 2048], ["numpy", "dask"]) + param_names = ("nx", "type") + + def setup(self, nx, type): + ny = nx + if type == "numpy": + self.agg = xr.DataArray(np.zeros((ny, nx)), dims=["y", "x"]) + elif type == "dask": + import dask.array as da + self.agg = xr.DataArray( + da.zeros((ny, nx), chunks=(64, 64), dtype=np.float64), + dims=["y", "x"], + ) + else: + raise NotImplementedError() + np.random.seed(71942) + + def time_bump(self, nx, type): + # numpy: full eager render. dask: lazy graph build (no compute). + bump(agg=self.agg, count=50000, spread=1) diff --git a/xrspatial/bump.py b/xrspatial/bump.py index c7518b461..aca902c7b 100644 --- a/xrspatial/bump.py +++ b/xrspatial/bump.py @@ -87,20 +87,29 @@ def _partition_bumps(data, locs, heights, spread): ny = len(data.chunks[0]) nx = len(data.chunks[1]) - partitions = {} - for yi in range(ny): - y0, y1 = int(y_offsets[yi]), int(y_offsets[yi + 1]) - for xi in range(nx): - x0, x1 = int(x_offsets[xi]), int(x_offsets[xi + 1]) - mask = ((locs[:, 0] >= x0) & (locs[:, 0] < x1) & - (locs[:, 1] >= y0) & (locs[:, 1] < y1)) - if np.any(mask): - local_locs = locs[mask].copy() - local_locs[:, 0] -= x0 - local_locs[:, 1] -= y0 - partitions[(yi, xi)] = (local_locs, heights[mask].copy()) - else: - partitions[(yi, xi)] = None + # Assign every bump to the chunk that holds its centre pixel in a single + # vectorised pass. Rescanning all of ``locs`` once per chunk would be + # O(n_chunks * n_bumps), which runs eagerly during dask graph + # construction and dominates the runtime for finely chunked rasters. + xi = np.searchsorted(x_offsets, locs[:, 0], side='right') - 1 + yi = np.searchsorted(y_offsets, locs[:, 1], side='right') - 1 + + partitions = {(a, b): None for a in range(ny) for b in range(nx)} + if len(locs): + flat = yi.astype(np.intp) * nx + xi + order = np.argsort(flat, kind='stable') + flat_sorted = flat[order] + bounds = np.nonzero(np.diff(flat_sorted))[0] + 1 + starts = np.concatenate([[0], bounds]) + ends = np.concatenate([bounds, [len(flat_sorted)]]) + for s, e in zip(starts, ends): + idx = order[s:e] + a, b = divmod(int(flat_sorted[s]), nx) + x0, y0 = int(x_offsets[b]), int(y_offsets[a]) + local_locs = locs[idx].copy() + local_locs[:, 0] -= x0 + local_locs[:, 1] -= y0 + partitions[(a, b)] = (local_locs, heights[idx].copy()) return partitions diff --git a/xrspatial/tests/test_bump.py b/xrspatial/tests/test_bump.py index bb9857a32..ea4f0e955 100644 --- a/xrspatial/tests/test_bump.py +++ b/xrspatial/tests/test_bump.py @@ -271,6 +271,63 @@ def test_bump_dask_partitioned_matches_numpy(): np.testing.assert_array_equal(result_np.values, result_dask.values) +# --- Issue #3612 regression test --- + +@dask_array_available +def test_partition_bumps_matches_bruteforce(): + """Vectorised _partition_bumps must assign each bump to the chunk that + holds its centre pixel, with chunk-local coordinates and original order + preserved (#3612). + + The reference is the original per-chunk boolean-mask partition, checked + across an irregular chunk layout and empty chunks. + """ + from xrspatial.bump import _partition_bumps + + def _reference(data, locs, heights): + y_off = np.concatenate([[0], np.cumsum(data.chunks[0])]) + x_off = np.concatenate([[0], np.cumsum(data.chunks[1])]) + ny, nx = len(data.chunks[0]), len(data.chunks[1]) + parts = {} + for yi in range(ny): + y0, y1 = int(y_off[yi]), int(y_off[yi + 1]) + for xi in range(nx): + x0, x1 = int(x_off[xi]), int(x_off[xi + 1]) + mask = ((locs[:, 0] >= x0) & (locs[:, 0] < x1) & + (locs[:, 1] >= y0) & (locs[:, 1] < y1)) + if np.any(mask): + ll = locs[mask].copy() + ll[:, 0] -= x0 + ll[:, 1] -= y0 + parts[(yi, xi)] = (ll, heights[mask].copy()) + else: + parts[(yi, xi)] = None + return parts + + n = 400 + data = da.zeros((n, n), dtype=np.float64).rechunk( + {0: (50, 120, 80, 150), 1: (70, 90, 110, 130)} + ) + rng = np.random.default_rng(3612) + count = 2000 + locs = np.empty((count, 2), dtype=np.int32) + locs[:, 0] = rng.integers(0, n, count) + locs[:, 1] = rng.integers(0, n, count) + heights = rng.random(count) + + got = _partition_bumps(data, locs, heights, 1) + ref = _reference(data, locs, heights) + + assert set(got) == set(ref) + for key in ref: + g, r = got[key], ref[key] + assert (g is None) == (r is None), key + if r is None: + continue + np.testing.assert_array_equal(g[0], r[0]) # chunk-local locs + order + np.testing.assert_array_equal(g[1], r[1]) # heights + order + + # --- Issue #1231 regression tests --- def test_bump_raster_memory_guard_rejects_pathological_size(): From eb32e89693be432ab6a4ba223356f17bb2adfd9d Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Thu, 2 Jul 2026 11:51:33 -0400 Subject: [PATCH 2/2] Document _partition_bumps in-range precondition (review nit, #3612) --- xrspatial/bump.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/xrspatial/bump.py b/xrspatial/bump.py index aca902c7b..ab7188eb3 100644 --- a/xrspatial/bump.py +++ b/xrspatial/bump.py @@ -91,6 +91,9 @@ def _partition_bumps(data, locs, heights, spread): # vectorised pass. Rescanning all of ``locs`` once per chunk would be # O(n_chunks * n_bumps), which runs eagerly during dask graph # construction and dominates the runtime for finely chunked rasters. + # Callers pass ``locs`` already clamped to ``[0, width) x [0, height)`` + # (bump() draws them from ``range(w)`` / ``range(h)``), so searchsorted + # always lands inside the chunk grid. xi = np.searchsorted(x_offsets, locs[:, 0], side='right') - 1 yi = np.searchsorted(y_offsets, locs[:, 1], side='right') - 1