From 050039700d62da7928d5920d3b8839d6a1e8e0a7 Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Wed, 17 Jun 2026 14:36:57 -0700 Subject: [PATCH] Vectorize _sort_and_stride 3D reindex, drop deepcopy (#3381) The 3D branch of _sort_and_stride deep-copied the whole values array then reindexed each layer by sorted_indices in a Python loop. Replace both with a single vectorized fancy-index that returns a fresh array, so the input is never mutated and no explicit copy is needed. Removes the now-unused `import copy`. Runs in crosstab() on numpy and once per block on dask+numpy; cupy rejects 3D input so it is unaffected. Adds test_sort_and_stride_3d_no_mutation_and_parity pinning input non-mutation and layer-wise sort parity. --- .claude/sweep-performance-state.csv | 2 +- xrspatial/tests/test_zonal.py | 28 ++++++++++++++++++++++++++++ xrspatial/zonal.py | 12 +++++++----- 3 files changed, 36 insertions(+), 6 deletions(-) diff --git a/.claude/sweep-performance-state.csv b/.claude/sweep-performance-state.csv index a8b0809d7..266f9a8d0 100644 --- a/.claude/sweep-performance-state.csv +++ b/.claude/sweep-performance-state.csv @@ -48,4 +48,4 @@ terrain_metrics,2026-03-31T18:00:00Z,SAFE,memory-bound,0,, viewshed,2026-04-05T12:00:00Z,SAFE,memory-bound,0,fixed-in-tree,Tier B memory estimate tightened from 280 to 368 bytes/pixel (accounts for lexsort double-alloc + computed raster). astype copy=False avoids needless float64 copy. visibility,2026-06-10,RISKY,compute-bound,0,3185,"cumulative_viewshed recomputed dask source per observer (fixed #3185: materialise once when no max_distance); graph grows ~64 tasks/observer with N; line_of_sight single-transect cheap; MEDIUM count temp .astype per observer (LOW, not fixed)" worley,2026-03-31T18:00:00Z,SAFE,compute-bound,0,, -zonal,2026-05-27,SAFE,compute-bound,0,2526,"Pass 2 (2026-05-27): re-audit identified 3 MEDIUM findings. (1) zonal_apply 3D dask path: da.stack(layers, axis=2) left output chunks at size 1 along axis 2 -- filed #2526 and fixed by rechunking back to values_data.chunks[2] in _apply_dask_numpy (zonal.py:1691) and _apply_dask_cupy (zonal.py:1731). Confirmed via graph probe: 256x256 raster chunks=(64,64) 3 bands previously yielded chunks[2]=(1,1,1); now (3,). 1 new test (test_apply_dask_3d_axis2_rechunked_2526). 126 existing zonal tests pass. (2) _stats_cupy (zonal.py:588-608): per-zone x per-stat Python loop with cupy.float_(result) forces O(n_zones * n_stats) GPU<->CPU sync points; not fixed in this pass (CUDA-native rewrite needed, larger refactor). (3) _parallel_variance @delayed reduce iterates over all blocks in driver memory; for very large block counts the single-task merge becomes scheduler-bound but is not OOM since per-block arrays are O(n_zones). Not fixed (algorithmic refactor needed). Dask graph probe: stats(7 stats) on 2560x2560 chunks=256 -> 4449 tasks (44/chunk); stats(mean only) -> 823 tasks (8/chunk); crosstab -> 304 (3/chunk); hypsometric_integral -> 300 (3/chunk). All under 50K cap. SAFE/compute-bound verdict holds. | Fixed-in-tree 2026-04-16: rewrote hypsometric_integral dask path. Eliminated double-compute (_unique_finite_zones removed, each block discovers own zones). Replaced np.stack (O(n_blocks * n_zones) scheduler memory) with streaming dict-merge (O(n_zones)). 29 existing tests pass." +zonal,2026-06-17,SAFE,compute-bound,0,3381,"Pass 3 (2026-06-17, deep-sweep): 0 HIGH, 1 new MEDIUM found and fixed (#3381/PR pending). _sort_and_stride 3D branch (zonal.py:317) did copy.deepcopy(values).reshape(...) then a per-row Python loop reindexing by sorted_indices; replaced with one vectorized fancy-index values.reshape(n,-1)[:, sorted_indices] (fancy index returns fresh array, input unmutated -- verified), dropping a full-array copy and the loop; orphaned 'import copy' removed. Runs in crosstab() numpy + per-dask-block; cupy rejects 3D so unaffected. New test test_sort_and_stride_3d_no_mutation_and_parity; 194 zonal + 43 backend-coverage tests pass. Carry-over MEDIUMs unchanged (not refixed this pass): _stats_cupy per-zone x per-stat loop with cupy.float_() forces O(n_zones*n_stats) device->host syncs (needs CUDA-native rewrite); _parallel_variance @delayed reduce iterates all blocks in driver (scheduler-bound at huge block counts, not OOM since per-block O(n_zones)). #2526 3D-apply rechunk still fixed. Dask graph counts from Pass 2 hold (stats 7-stat 4449 tasks/44 per chunk, stats-mean 823/8, crosstab 304/3, hypsometric_integral 300/3 on 2560x2560 chunks=256; code paths unchanged, _sort_and_stride fix changes per-task work not task count) -- all under 50K cap. This pass's graph re-probe was killed by host memory pressure from parallel sibling sweeps; counts taken from prior recorded run. SAFE/compute-bound holds. CUDA available on host (cupy 13.6) but cupy zonal stats path is 2D-only and the fix is numpy/dask+numpy." diff --git a/xrspatial/tests/test_zonal.py b/xrspatial/tests/test_zonal.py index 337765ca4..5157816b2 100644 --- a/xrspatial/tests/test_zonal.py +++ b/xrspatial/tests/test_zonal.py @@ -1387,6 +1387,34 @@ def test_crosstab_3d_count(backend, data_zones, data_values_3d, result_crosstab_ assert_input_data_unmodified(data_values_3d, copied_data_values_3d) +def test_sort_and_stride_3d_no_mutation_and_parity(): + # Regression for #3381: the 3D branch of _sort_and_stride used to + # copy.deepcopy the values array and reindex it row by row in a Python + # loop. It now reindexes in a single vectorized fancy-index. Pin both + # properties: the input must be left untouched, and the reordered output + # must match the layer-wise sort the loop produced. + from xrspatial.zonal import _sort_and_stride + + rng = np.random.default_rng(3381) + n_layers, h, w = 4, 5, 6 + values = rng.integers(0, 100, (n_layers, h, w)).astype(np.float64) + zones = rng.integers(0, 3, (h, w)).astype(np.float64) + unique_zones = np.unique(zones) + + values_before = values.copy() + sorted_indices, values_by_zones, _ = _sort_and_stride( + zones, values, unique_zones + ) + + # input array is not mutated by the reindex + np.testing.assert_array_equal(values, values_before) + + # each layer is reordered by the same flat zone-sort permutation + expected = values.reshape(n_layers, h * w)[:, sorted_indices] + assert values_by_zones.shape == (n_layers, h * w) + np.testing.assert_array_equal(values_by_zones, expected) + + @pytest.mark.skipif(not dask_array_available(), reason="Requires Dask") def test_crosstab_3d_mixed_backend_raises(): # 3D crosstab() must reject mismatched backends up front with a clear diff --git a/xrspatial/zonal.py b/xrspatial/zonal.py index e1c5e44ae..7167de67e 100644 --- a/xrspatial/zonal.py +++ b/xrspatial/zonal.py @@ -1,7 +1,6 @@ from __future__ import annotations # standard library -import copy from math import sqrt from typing import Callable, Dict, List, Optional, Union @@ -315,10 +314,13 @@ def _sort_and_stride(zones, values, unique_zones): values_shape = values.shape if len(values_shape) == 3: - values_by_zones = copy.deepcopy(values).reshape( - values_shape[0], values_shape[1] * values_shape[2]) - for i in range(values_shape[0]): - values_by_zones[i] = values_by_zones[i][sorted_indices] + # Reindex every layer's flattened cells by the zone sort order in a + # single vectorized fancy-index. Fancy indexing returns a fresh + # array, so the input is never mutated and no explicit copy is + # needed (the old path deep-copied the whole array first, then + # reindexed row by row in a Python loop). + values_by_zones = values.reshape( + values_shape[0], values_shape[1] * values_shape[2])[:, sorted_indices] else: values_by_zones = values.ravel()[sorted_indices]