diff --git a/.claude/sweep-performance-state.csv b/.claude/sweep-performance-state.csv index ef411e9ea..9e0969f52 100644 --- a/.claude/sweep-performance-state.csv +++ b/.claude/sweep-performance-state.csv @@ -5,7 +5,7 @@ bilateral,2026-03-31T18:00:00Z,SAFE,compute-bound,0,, bump,2026-04-16T12:00:00Z,SAFE,compute-bound,0,1206,Re-audit 2026-04-16: fix verified SAFE. No HIGH findings. MEDIUM: CuPy backend runs CPU kernel then transfers to GPU (documented limitation). classify,2026-06-20,RISKY,graph-bound,1,3412,"Re-audit 2026-06-20 (CUDA host). 1 HIGH: _generate_sample_indices >10M branch used RandomState.choice(replace=False) which builds a full arange(num_data) permutation -> O(num_data) host alloc (160MB for 20M pop, OOM at 30TB) despite docstring claiming O(num_sample). Backed dask/dask+cupy natural_breaks/maximum_breaks/quantile/percentiles/box_plot. Fixed via np.random.default_rng().choice (Floyd, O(num_sample), still deterministic); peak 160MB->0.4MB. Other paths SAFE: head_tail_breaks already persists+fuses; box_plot samples; cupy kernels low-register; no .values/np.asarray-on-dask/.compute-in-loop. 93 classify tests pass incl GPU." contour,2026-03-31T18:00:00Z,SAFE,compute-bound,0,, -convolution,2026-03-31T18:00:00Z,SAFE,compute-bound,0,, +convolution,2026-07-02,SAFE,compute-bound,1,3615,_convolve_2d_numpy used prange w/o parallel=True -> ran serial (~7-10x slow); fixed via parallel=True + threading.Lock (macOS SIGABRT hazard #3141); cuda kernel 40 regs OK; dask ~20 tasks/chunk corridor,2026-03-31T18:00:00Z,SAFE,compute-bound,0,, cost_distance,2026-06-15,RISKY,memory-bound,1,3342,"Perf sweep 2026-06-15. HIGH: bounded map_overlap branch in _cost_distance_dask gated on full dims (pad>=height/width) not chunk size; pad>chunk collapses to single chunk (#880-class OOM, verified npartitions=1 at chunks=10/pad=96). Fixed: compare pad vs max chunk dim, route to iterative when pad>=chunk (matches GPU path L484). dask+cupy path already correct. 
class Convolve2d:
    """Timing benchmark for ``convolve_2d`` over a grid of parameters.

    The grid is raster width x kernel shape x array backend. The raster
    is half as tall as it is wide, and the kernel is all ones.
    """

    params = (
        [300, 1000, 3000],
        [(5, 5), (25, 25)],
        ["numpy", "cupy", "dask"],
    )
    param_names = ("nx", "kernelsize", "type")

    def setup(self, nx, kernelsize, type):
        """Build the input raster and the kernel for one parameter combo."""
        n_rows, n_cols = kernelsize
        self.kernel = np.ones(shape=(n_rows, n_cols), dtype=np.float64)
        # Raster shape is (ny, nx) with ny = nx // 2.
        self.agg = get_xr_dataarray((nx // 2, nx), type)

    def time_convolve_2d(self, nx, kernelsize, type):
        """Time a single ``convolve_2d`` call on the prepared inputs."""
        # convolve_2d takes the backing array, not the DataArray wrapper.
        backing_array = self.agg.data
        convolve_2d(backing_array, self.kernel)


class CircleKernel:
    """Timing benchmark for ``circle_kernel`` at several radii."""

    params = ([3, 25, 100],)
    param_names = ("radius",)

    def time_circle_kernel(self, radius):
        """Time ``circle_kernel`` with both leading arguments fixed at 1."""
        circle_kernel(1, 1, radius)
def _convolve_2d_numpy_locked(data, kernel):
    """Run ``_convolve_2d_numpy`` while holding ``_PARALLEL_KERNEL_LOCK``.

    The parallel=True kernel launch is serialized across host threads;
    see the comment on ``_PARALLEL_KERNEL_LOCK`` for the reason. A lone
    caller takes the lock uncontended, while concurrent callers (e.g.
    per-chunk dask calls) enter the kernel one at a time.

    Returns whatever ``_convolve_2d_numpy(data, kernel)`` returns.
    """
    _PARALLEL_KERNEL_LOCK.acquire()
    try:
        convolved = _convolve_2d_numpy(data, kernel)
    finally:
        # Always release, even if the kernel raises, so later calls
        # are not blocked forever.
        _PARALLEL_KERNEL_LOCK.release()
    return convolved