xarray-contrib · brendancol · Jun 23, 2026 · Jun 23, 2026 · Jun 23, 2026 · Jun 23, 2026
diff --git a/.claude/sweep-performance-state.csv b/.claude/sweep-performance-state.csv
@@ -32,7 +32,7 @@ morphology,2026-06-20,SAFE,compute-bound,1,3401,memory guard fired on full lazy-
 multispectral,2026-05-02,SAFE,compute-bound,0,,"Re-audit 2026-05-02 after PRs 1292 (true_color memory guard) and 1301 (validate_arrays in true_color). Verified SAFE. No HIGH. MEDIUM: da.stack in _true_color_dask/_true_color_dask_cupy at L1702/L1731 creates (1,1,1,1) chunks along band axis (4 bands so impact is minor, scheduling overhead not OOM). LOW: np.zeros((h,w,4)) at L1681 then full overwrite -- np.empty would suffice. All 17 indices use plain map_blocks with no halo; 8192x8192 ndvi graph is 80 tasks, evi/arvi/ebbi 112 tasks."
 normalize,2026-03-31T18:00:00Z,SAFE,compute-bound,0,1124,Boolean indexing replaced with lazy nanmin/nanmax/nanmean/nanstd.
 pathfinding,2026-04-15T12:00:00Z,SAFE,compute-bound,0,false-positive,Downgraded. CuPy .get() is required -- A* has no GPU kernel. Per-pixel .compute() is only 2 calls for start/goal validation. seg.values in multi_stop_search collects already-computed results for stitching.
-perlin,2026-03-31T18:00:00Z,WILL OOM,memory-bound,0,,
+perlin,2026-06-23,SAFE,memory-bound,1,3469,"Re-audit 2026-06-23 (CUDA host). 1 HIGH: _perlin_dask_numpy (L128) and _perlin_dask_cupy (L279) called dask.persist on the full noise array before min/ptp(max) reductions -> every chunk resident at once, WILL OOM at 30TB. Persist redundant: both reductions share the named noise subgraph in one dask.compute (verified min/ptp graphs ref same 100 noise keys). Fixed by removing persist -> peak mem chunk-bounded, SAFE. map_blocks graph embarrassingly parallel (3.3 tasks/chunk). cupy kernel _perlin_gpu 36 regs/thread (no pressure); cupy path stays on GPU, no host round trip. numpy/cupy single-GPU unaffected. 13 perlin tests pass incl GPU and dask+GPU. PR #3469-fix."
 polygon_clip,2026-06-10,SAFE,graph-bound,0,3191,"crop=True picked tiny leading edge chunk as rasterize mask size -> 13169-task graph; fixed to max(rc),max(cc) -> 1045 tasks. crop=False/numpy/cupy clean. Cat1-5 clean. GPU+dask+cupy run-validated."
 polygonize,2026-06-12,RISKY,compute-bound,0,3303,"Pass 3 (2026-06-12): re-audit after #2817/#2913/#3041. 0 HIGH. 1 MEDIUM fixed (#3303): _compute_region_value_ranges ran a pure-Python per-pixel loop (95% of float chunk time; 0.283s of 0.299s on 1024x1024, float chunks ~30x int) and re-ran _calculate_regions on an already-labelled block; moved to jitted _region_ranges_scan + _polygonize_numpy_regions label reuse (0.299s -> 0.015s/chunk). Side fix: w_match/s_match flags were always-truthy (_is_close numba overload generator called from pure Python returns impl function); output-neutral by chunk geometry, now computed correctly in jit. Cat1/2 clean (dask.compute batching is the documented #2673 design). Cat3 validated on GPU: cupy int/float + dask+cupy run end-to-end, single documented transfer, no round-trip. Cat4/5 LOW unchanged: _calculate_regions_cupy per-unique-value labeling (low impact); per-polygon Python classify loop in _polygonize_chunk dominates only on pathological many-polygon chunks (788K polys -> 7.8s). Cat6 RISKY unchanged: driver accumulates O(total polygons); 32-chunk batches bound transient peak. 527 polygonize tests + 40 new pass."
 proximity,2026-06-09,RISKY,graph-bound,0,3103,"Pass 2 (2026-06-09): re-audit after 16 fix commits since 2026-03-31. 0 HIGH, 2 MEDIUM found and fixed: (1) #3103/PR #3126 line-sweep @ngjit closure inside _process recompiled per call (~0.42s constant overhead; 10x10 warm call 0.44s->1ms after module-level hoist with explicit args, 1000x1000 0.49s->35ms); (2) #3132/PR #3137 dask xs/ys coordinate grids built via da.tile/da.repeat+rechunk cost ~185 tasks/chunk with the ys term scaling O(raster height) (~4.3 tasks/row, 44K tasks at 10240 rows); chunk-aligned da.broadcast_to gives identical values, bounded graph 18535->5554 tasks (3.3x) on 2560^2/256 chunks; regression test bounds tasks/chunk<80 (old 100.4, new 58.7) + ragged-chunk parity. LOW not fixed: zeros+fill(-1) row buffers in line-sweep; numpy backend materializes full float64 xs/ys grids (guarded since #1111); unbounded KDTree streaming count pass computes chunks on driver by design (gh-879). GPU validated on CUDA host: cupy 1024^2 proximity 6ms device-resident with exact numpy parity, dask+cupy bounded parity exact, _proximity_cuda_kernel 56 regs/thread (no register pressure). _halo_depth python loop measured 58ms at 100K coords - not a finding. Verdict RISKY (was WILL OOM): unbounded paths either guarded (MemoryError at 80% mem) or stream via kdtree; bounded map_overlap peak scales with chunk size."

diff --git a/xrspatial/perlin.py b/xrspatial/perlin.py
@@ -124,8 +124,13 @@ def _perlin_dask_numpy(data: da.Array,
     data = da.map_blocks(_func, x, y, meta=np.array((), dtype=np.float32),
                          **_dask_task_name_kwargs('xrspatial.perlin'))
 
-    # persist so min/ptp don't recompute the noise from scratch
-    (data,) = dask.persist(data)
+    # min and ptp go out in one dask.compute call, which shares the noise
+    # subgraph between them, so each chunk is computed once and freed after
+    # both reductions read it. Persisting the whole array first would instead
+    # hold every chunk resident at once and OOM at scale. The returned lazy
+    # array recomputes the noise when the caller materializes it; that extra
+    # pass is intentional -- the noise is point-wise and deterministic, so
+    # recompute is exact and cheap relative to keeping the array resident.
     min_val, ptp_val = dask.compute(da.min(data), da.ptp(data))
     data = (data - min_val) / ptp_val
     return data
@@ -275,8 +280,13 @@ def _chunk_perlin(block, block_info=None):
                          meta=cupy.array((), dtype=cupy.float32),
                          **_dask_task_name_kwargs('xrspatial.perlin'))
 
-    # persist so min/max don't recompute the noise from scratch
-    (data,) = dask.persist(data)
+    # min and max go out in one dask.compute call, which shares the noise
+    # subgraph between them, so each chunk is computed once and freed after
+    # both reductions read it. Persisting the whole array first would instead
+    # hold every chunk resident at once and OOM at scale. The returned lazy
+    # array recomputes the noise when the caller materializes it; that extra
+    # pass is intentional -- the noise is point-wise and deterministic, so
+    # recompute is exact and cheap relative to keeping the array resident.
     min_val, max_val = dask.compute(da.min(data), da.max(data))
     data = (data - min_val) / (max_val - min_val)
     return data

diff --git a/xrspatial/tests/test_perlin.py b/xrspatial/tests/test_perlin.py
@@ -52,6 +52,36 @@ def test_perlin_dask_cpu():
     )
 
 
+@dask_array_available
+def test_perlin_dask_does_not_persist_whole_array(monkeypatch):
+    # Regression for issue #3469: the dask backends used dask.persist() to
+    # cache the noise before reducing it, which forces every chunk resident
+    # at once and OOMs at scale. The min/ptp reductions share the noise
+    # subgraph within a single dask.compute call, so the persist is
+    # unnecessary. Fail loudly if it ever comes back.
+    import dask
+
+    def _no_persist(*args, **kwargs):
+        raise AssertionError(
+            "perlin dask backend called dask.persist(); this materializes "
+            "the whole noise array and reintroduces the OOM from #3469"
+        )
+
+    monkeypatch.setattr(dask, "persist", _no_persist)
+
+    data_numpy = create_test_arr()
+    perlin_numpy = perlin(data_numpy)
+
+    data_dask = create_test_arr(backend='dask')
+    perlin_dask = perlin(data_dask)
+    general_output_checks(data_dask, perlin_dask)
+
+    np.testing.assert_allclose(
+        perlin_numpy.data, perlin_dask.data.compute(),
+        rtol=1e-05, atol=1e-07, equal_nan=True
+    )
+
+
 @cuda_and_cupy_available
 def test_perlin_gpu():
     # vanilla numpy version
@@ -70,7 +100,20 @@ def test_perlin_gpu():
 
 @cuda_and_cupy_available
 @dask_array_available
-def test_perlin_dask_gpu():
+def test_perlin_dask_gpu(monkeypatch):
+    # The dask+cupy path must not call dask.persist either (issue #3469);
+    # guard it here so the GPU backend is covered too.
+    import dask
+
+    def _no_persist(*args, **kwargs):
+        raise AssertionError(
+            "perlin dask+cupy backend called dask.persist(); this "
+            "materializes the whole noise array and reintroduces the "
+            "OOM from #3469"
+        )
+
+    monkeypatch.setattr(dask, "persist", _no_persist)
+
     # numpy baseline
     data_numpy = create_test_arr()
     perlin_numpy = perlin(data_numpy)