Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .claude/sweep-performance-state.csv
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ module,last_inspected,oom_verdict,bottleneck,high_count,issue,notes
aspect,2026-05-29,SAFE,compute-bound,1,2688,"dask+cupy geodesic densified full lat/lon on one GPU at graph build (OOM at scale); fixed via per-block map_blocks cupy conversion. planar/numpy/dask SAFE; geodesic GPU kernel ~184 regs, mitigated by 16x16 blocks."
balanced_allocation,2026-04-16T12:00:00Z,WILL OOM,memory-bound,8,1114,"Re-audit 2026-04-16 after PR 1203 float32 fix. 8 HIGH found (friction.compute L339, argmin.compute in iter loop L182, double all_nan recompute L206, stacked cost_surfaces allocation). Covered by existing documented limitation on #1114. Not refiled."
bilateral,2026-03-31T18:00:00Z,SAFE,compute-bound,0,,
bump,2026-04-16T12:00:00Z,SAFE,compute-bound,0,1206,Re-audit 2026-04-16: fix verified SAFE. No HIGH findings. MEDIUM: CuPy backend runs CPU kernel then transfers to GPU (documented limitation).
bump,2026-07-02,SAFE,graph-bound,0,3612,"Perf re-audit 2026-07-02: MEDIUM (#3612/PR) _partition_bumps rescanned all locs per chunk -> O(n_chunks*count) dask graph build (7.8s@25.6k chunks, count=100k); fixed with searchsorted bucketing, byte-identical partitions, 85x@25.6k chunks. Memory SAFE (per-chunk lazy build + guard). LOW: cupy path runs CPU kernel then single host->device transfer (by design, no GPU kernel); np.random.choice(range) slower than randint; unused rows,cols in _finish_bump. cuda-available, cupy+dask+cupy parity verified."
classify,2026-06-20,RISKY,graph-bound,1,3412,"Re-audit 2026-06-20 (CUDA host). 1 HIGH: _generate_sample_indices >10M branch used RandomState.choice(replace=False) which builds a full arange(num_data) permutation -> O(num_data) host alloc (160MB for 20M pop, OOM at 30TB) despite docstring claiming O(num_sample). Backed dask/dask+cupy natural_breaks/maximum_breaks/quantile/percentiles/box_plot. Fixed via np.random.default_rng().choice (Floyd, O(num_sample), still deterministic); peak 160MB->0.4MB. Other paths SAFE: head_tail_breaks already persists+fuses; box_plot samples; cupy kernels low-register; no .values/np.asarray-on-dask/.compute-in-loop. 93 classify tests pass incl GPU."
contour,2026-03-31T18:00:00Z,SAFE,compute-bound,0,,
convolution,2026-03-31T18:00:00Z,SAFE,compute-bound,0,,
Expand Down
30 changes: 30 additions & 0 deletions benchmarks/benchmarks/bump.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
import numpy as np
import xarray as xr

from xrspatial import bump


class Bump:
# Two axes: raster width and backend. The "dask" case times graph
# construction only (no compute), which is where _partition_bumps runs;
# it is finely chunked so the per-chunk bump partitioning is exercised.
params = ([256, 1024, 2048], ["numpy", "dask"])
param_names = ("nx", "type")

def setup(self, nx, type):
ny = nx
if type == "numpy":
self.agg = xr.DataArray(np.zeros((ny, nx)), dims=["y", "x"])
elif type == "dask":
import dask.array as da
self.agg = xr.DataArray(
da.zeros((ny, nx), chunks=(64, 64), dtype=np.float64),
dims=["y", "x"],
)
else:
raise NotImplementedError()
np.random.seed(71942)

def time_bump(self, nx, type):
# numpy: full eager render. dask: lazy graph build (no compute).
bump(agg=self.agg, count=50000, spread=1)
40 changes: 26 additions & 14 deletions xrspatial/bump.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,20 +87,32 @@ def _partition_bumps(data, locs, heights, spread):
ny = len(data.chunks[0])
nx = len(data.chunks[1])

partitions = {}
for yi in range(ny):
y0, y1 = int(y_offsets[yi]), int(y_offsets[yi + 1])
for xi in range(nx):
x0, x1 = int(x_offsets[xi]), int(x_offsets[xi + 1])
mask = ((locs[:, 0] >= x0) & (locs[:, 0] < x1) &
(locs[:, 1] >= y0) & (locs[:, 1] < y1))
if np.any(mask):
local_locs = locs[mask].copy()
local_locs[:, 0] -= x0
local_locs[:, 1] -= y0
partitions[(yi, xi)] = (local_locs, heights[mask].copy())
else:
partitions[(yi, xi)] = None
# Assign every bump to the chunk that holds its centre pixel in a single
# vectorised pass. Rescanning all of ``locs`` once per chunk would be
# O(n_chunks * n_bumps), which runs eagerly during dask graph
# construction and dominates the runtime for finely chunked rasters.
# Callers pass ``locs`` already clamped to ``[0, width) x [0, height)``
# (bump() draws them from ``range(w)`` / ``range(h)``), so searchsorted
# always lands inside the chunk grid.
xi = np.searchsorted(x_offsets, locs[:, 0], side='right') - 1
yi = np.searchsorted(y_offsets, locs[:, 1], side='right') - 1

partitions = {(a, b): None for a in range(ny) for b in range(nx)}
if len(locs):
flat = yi.astype(np.intp) * nx + xi
order = np.argsort(flat, kind='stable')
flat_sorted = flat[order]
bounds = np.nonzero(np.diff(flat_sorted))[0] + 1
starts = np.concatenate([[0], bounds])
ends = np.concatenate([bounds, [len(flat_sorted)]])
for s, e in zip(starts, ends):
idx = order[s:e]
a, b = divmod(int(flat_sorted[s]), nx)
x0, y0 = int(x_offsets[b]), int(y_offsets[a])
local_locs = locs[idx].copy()
local_locs[:, 0] -= x0
local_locs[:, 1] -= y0
partitions[(a, b)] = (local_locs, heights[idx].copy())
return partitions


Expand Down
57 changes: 57 additions & 0 deletions xrspatial/tests/test_bump.py
Original file line number Diff line number Diff line change
Expand Up @@ -271,6 +271,63 @@ def test_bump_dask_partitioned_matches_numpy():
np.testing.assert_array_equal(result_np.values, result_dask.values)


# --- Issue #3612 regression test ---

@dask_array_available
def test_partition_bumps_matches_bruteforce():
"""Vectorised _partition_bumps must assign each bump to the chunk that
holds its centre pixel, with chunk-local coordinates and original order
preserved (#3612).

The reference is the original per-chunk boolean-mask partition, checked
across an irregular chunk layout and empty chunks.
"""
from xrspatial.bump import _partition_bumps

def _reference(data, locs, heights):
y_off = np.concatenate([[0], np.cumsum(data.chunks[0])])
x_off = np.concatenate([[0], np.cumsum(data.chunks[1])])
ny, nx = len(data.chunks[0]), len(data.chunks[1])
parts = {}
for yi in range(ny):
y0, y1 = int(y_off[yi]), int(y_off[yi + 1])
for xi in range(nx):
x0, x1 = int(x_off[xi]), int(x_off[xi + 1])
mask = ((locs[:, 0] >= x0) & (locs[:, 0] < x1) &
(locs[:, 1] >= y0) & (locs[:, 1] < y1))
if np.any(mask):
ll = locs[mask].copy()
ll[:, 0] -= x0
ll[:, 1] -= y0
parts[(yi, xi)] = (ll, heights[mask].copy())
else:
parts[(yi, xi)] = None
return parts

n = 400
data = da.zeros((n, n), dtype=np.float64).rechunk(
{0: (50, 120, 80, 150), 1: (70, 90, 110, 130)}
)
rng = np.random.default_rng(3612)
count = 2000
locs = np.empty((count, 2), dtype=np.int32)
locs[:, 0] = rng.integers(0, n, count)
locs[:, 1] = rng.integers(0, n, count)
heights = rng.random(count)

got = _partition_bumps(data, locs, heights, 1)
ref = _reference(data, locs, heights)

assert set(got) == set(ref)
for key in ref:
g, r = got[key], ref[key]
assert (g is None) == (r is None), key
if r is None:
continue
np.testing.assert_array_equal(g[0], r[0]) # chunk-local locs + order
np.testing.assert_array_equal(g[1], r[1]) # heights + order


# --- Issue #1231 regression tests ---

def test_bump_raster_memory_guard_rejects_pathological_size():
Expand Down
Loading