diff --git a/.claude/sweep-performance-state.csv b/.claude/sweep-performance-state.csv index 2e2af4c76..ef411e9ea 100644 --- a/.claude/sweep-performance-state.csv +++ b/.claude/sweep-performance-state.csv @@ -18,7 +18,7 @@ fire,2026-03-31T18:00:00Z,SAFE,compute-bound,0,, flood,2026-06-25,RISKY,compute-bound,1,3503,"Cat1 HIGH: _validate_mannings_n_dataarray used .values, eagerly materialized dask/cupy roughness raster (OOM); fixed lazy-safe. Core elementwise ops SAFE. LOW: dask+cupy host round-trip (map_blocks b.get) shared with cost_distance/surface_distance, documented not fixed." focal,2026-05-29,SAFE,compute-bound,1,2734,"HIGH: _hotspots_dask_cupy chunk fn round-tripped each chunk host<->GPU (cupy.asnumpy classify cupy.asarray); fixed PR 2739 to reuse _run_gpu_hotspots on device. LOW (not fixed): _apply_numpy/_hotspots_cupy use zeros_like where empty would suffice. CUDA kernels regs<=62, no register-pressure issue." geodesic,2026-03-31T18:00:00Z,N/A,compute-bound,0,, -geotiff,2026-06-11,SAFE,IO-bound,0,3235,"Pass 15 (2026-06-11): 1 MEDIUM found and fixed. _pack (_attrs.py:~1795) guarded the no-sentinel integer restore with an eager bool(out.isnull().any()), which executed the whole upstream dask graph at to_geotiff(pack=True) call time; the streaming writer then executed it again, so every source chunk computed twice (measured 32 decode-task executions for 16 chunks on a 512x512 int16 SCALE/OFFSET no-GDAL_NODATA source; 71->33 total task starts post-fix). Filed #3235, fixed by mapping a per-chunk NaN guard (_pack_guard_no_nan) into the graph for dask-backed data (raises from the write's single compute; numpy keeps the eager call-time check; meta= preserves cupy backing). 9 new tests in test_pack_lazy_nan_guard_3235.py incl. fusion-proof execution counter and cupy-chunk guard unit test (dask+cupy e2e still blocked upstream by #3112). Scrutinised all 16 commits since 2026-06-08 (pack/unpack series #3065/#3075/#3079/#3129/#3174/#3175, VRT placement #3135, compression_level gate #3176, streaming banding #3136, dask+cupy writer order fix #3171): no other regressions; #3171's get-then-asarray order is intentional D2H for gpu=False. GPU validated on-device this pass: eager GPU unpack returns cupy with exact parity (387ms incl warmup, only 0-d scalar .get()s -- no bulk host round trip), dask+GPU unpack lazy (112 tasks/16 chunks, cupy meta, compute returns cupy, parity 0.0), GDS fast path intact without unpack (4 tasks/chunk); unpack disqualifying GDS is documented intentional. Dask CPU probe 4 tasks/chunk, 50k-task cap intact. Note: #1714 (_write_vrt_tiled synchronous scheduler) is now FIXED+CLOSED (scheduler='threads' at _writers/eager.py:1517) -- drop from the open-issue list. LOW noted (no fix): _pack does identity (data-0.0)/1.0 arithmetic allocating two full-array temporaries when scale==1/offset==0 (masked_nodata-only pack); prior deferred LOWs unchanged. SAFE/IO-bound holds. | Pass 14 (2026-06-09): MEDIUM found and fixed -- _write_streaming ran one dask .compute() per 256-row tile-row/strip, so a source chunk taller than the band re-executed once per band it overlapped (measured 2x at chunks=512, 4x at chunks=1024, whole upstream graph re-runs for computed pipelines). Filed #3117, fixed via _stream_row_bands: consecutive tile-rows/strips group into row bands sized by the source chunk-row span (one-chunk halo, #3007 accounting) under streaming_buffer_bytes; each band computes once and tiles/strips are carved from the materialised band. Wide rasters needing column segmentation keep the per-tile-row path. Post-fix per-chunk executions == 1 on the default read->write round trip. 5 new tests (TestRowBandRecompute3117 + _stream_row_bands unit); write/integration/parity suites pass (2195). LOW deferred (no fix): _read_geotiff_gpu_chunked parses header+all IFDs twice at graph build (_backends/gpu.py ~1367-1419, cap check then GDS probe; build-time only). GPU paths validated on-device this pass: eager gpu read returns cupy with parity, dask+GPU chunked read lazy (17 tasks/4 chunks) with parity; GPU writer full materialisation is documented intentional (streaming_buffer_bytes no-op). Read path keeps 50k-task graph cap; dask read probe 4 tasks/chunk. SAFE/IO-bound holds. | Pass 13 (2026-05-20): 1 MEDIUM found and fixed. _nvjpeg_batch_encode (_gpu_decode.py:~L1560) and _nvjpeg2k_batch_encode (~L2958) called cupy.cuda.Device().synchronize() inside the per-tile encode loops, a whole-device fence that blocked every CUDA stream and serialised concurrent work (e.g. predictor encodes on other streams). The decode-side counterpart _try_nvjpeg_batch_decode already used cupy.cuda.Stream.null.synchronize() at L1442; the encoder side was inconsistent. Filed #2212 and fixed both encoders to use Stream.null.synchronize(), scoping the per-tile sync to the default stream the encode/retrieve calls were issued on. nvJPEG / nvJPEG2000 encoders maintain a single shared state per encoder so encodes within a batch are inherently serial; the fix removes the device-wide blocker without changing the API ordering contract. 5 new tests in test_nvjpeg_encode_stream_sync_2212.py (AST checks that neither encoder contains Device().synchronize() inside a for-loop, that both call Stream.null.synchronize() in the loop, and that the decoder reference pattern stays pinned). All 5 new tests + 19 existing related encode/decode tests pass. nvjpeg/nvjpeg2k shared libs not present on this host so end-to-end encode verification is gated; add cuda-unavailable-libs note to re-validate on a host with the RAPIDS conda env. SAFE/IO-bound verdict holds; no change in dask graph cost. Dask probe: 2560x2560 deflate-tiled file via read_geotiff_dask(chunks=256) yields 400 tasks for 100 chunks (4 tasks/chunk), well under the 50K cap. LOW deferred (no fix in this PR): _build_ifd called twice per IFD level in _assemble_standard_layout (_writer.py:1531+1543), _assemble_cog_layout (1582+1625), and the COG overview path (2519+2546+2740) -- the first call's bytes are discarded; only the overflow byte length is used to compute pixel_data_offset. Cost is bounded by IFD count (typically 1-5 overview levels) so absolute impact is minor. Pre-existing pattern. | Pass 12 (2026-05-18): 1 MEDIUM found and fixed. _try_nvjpeg2k_batch_decode at _gpu_decode.py:~L2725-2778 allocated per-tile per-component cupy.empty buffers (N*S round-trips through the cupy memory pool) and called cupy.cuda.Device().synchronize() once per tile, forcing default-stream serialisation that defeats nvJPEG2000's internal pipelining. Filed #2107 and fixed: pre-allocate a single d_comp_pool sized n_tiles*samples*tile_height*pitch under a _check_gpu_memory guard, derive per-tile/per-component views as slab offsets, and replace the per-tile sync with a single batch-end sync. Same pattern as #1659 (_try_nvcomp_from_device_bufs), #1688 (_try_kvikio_read_tiles), #1712 (_nvcomp_batch_compress). 7 new tests in test_nvjpeg2k_single_alloc_2107.py: AST-level structural assertions confirm no cupy.empty inside the for-loop and no Device().synchronize() inside the loop, plus pool/per_tile_comp_bytes presence and _check_gpu_memory guard checks; lib-absent short-circuit; unsupported-dtype cleanup contract; cupy-only pool slab-non-overlap test (gpu-marked). libnvjpeg2k.so not present on this host so the end-to-end nvJPEG2000 decode is gated -- note added to re-validate on a host with the RAPIDS conda env. All 30 jpeg2000/compression tests + 7 new tests pass. SAFE/IO-bound verdict holds (no change in dask graph cost). Dask probe: 4096x4096 deflate-tiled file via read_geotiff_dask(chunks=512) yields 256 tasks for 64 chunks (4 tasks/chunk), well under the 50K cap. | Pass 11 (2026-05-18): 1 MEDIUM found and fixed. _read_strips (_reader.py:~L1972) and _fetch_decode_cog_http_strips (_reader.py:~L2670) decoded strips sequentially in a Python for-loop while the tile counterparts (_read_tiles L2146, _fetch_decode_cog_http_tiles L2898) gated parallel decode on _PARALLEL_DECODE_PIXEL_THRESHOLD via ThreadPoolExecutor. Filed #2100 and fixed: both strip paths now collect jobs, parallel-decode when n_strips > 1 and strip_pixels >= 64K, then place sequentially. Measured (uint16, 4-core): 4096x4096 deflate 130ms->34ms (3.82x), 8192x8192 deflate 531ms->146ms (3.63x), 8192x8192 zstd 211ms->85ms (2.48x), uncompressed 25ms->22ms (1.14x). 5 new tests in test_parallel_strip_decode_2100.py (parallel/serial parity, pool-engaged on multi-strip, serial-path for single-strip, windowed cross-strip read, HTTP COG strip parity). 3998 tests pass; 8 pre-existing failures predating this change (predictor2 BE + size_param_validation_gpu_vrt reference now-private read_to_array attr). SAFE/IO-bound verdict holds. | Pass 10 (2026-05-15): 1 new MEDIUM found and fixed; 2 LOW noted. MEDIUM (_reader.py:2737): _fetch_decode_cog_http_tiles decoded tiles sequentially in a Python for-loop after the concurrent fetch landed (issue #1480). Local _read_tiles parallelises decode whenever tile_pixels >= 64K via ThreadPoolExecutor (_reader.py:2017); the HTTP path was structurally similar but never picked up the same gate, so wide windowed reads of multi-tile COGs left deflate/zstd decode single-threaded. Mirrored the local-path threshold + pool. 5 new tests in test_cog_http_parallel_decode_2026_05_15.py (parallel + serial round-trip correctness, pool-instantiation branch selection above the threshold, single-tile path skips the pool, structural _decode_strip_or_tile call count == n_tiles). All 262 COG/HTTP tests pass; 3162 of 3164 selected geotiff tests pass overall (2 pre-existing failures predating Pass 9 per prior notes -- test_predictor2_big_endian_gpu_1517 references the now-private read_to_array attr, and the test_size_param_validation_gpu_vrt_1776 tile_size=4 validator failure). LOW deferred (no fix in this PR): (1) _block_reduce_2d_gpu (_gpu_decode.py:3142/3163/3189) does bool(mask.any().item()) per overview level when nodata is set, paying one device sync per level; the alternative (unconditional cupy.putmask) always pays the work cost and the short-circuit is correct under the current API. (2) _nvcomp_batch_compress adler32 staging (_gpu_decode.py:2543-2546) issues n_tiles slice-assign kernels into a fresh contig buffer despite all callers passing slices of a single underlying d_tile_buf; an API refactor to accept the source buffer directly would skip the rebuild. SAFE/IO-bound verdict holds. Dask probe: 2560x2560 chunks=256 yields 400 tasks (4 per chunk), well under the 50000 cap. GPU probe: 1024x1024 float32 zstd read returns CuPy-backed in 236 ms with no host round-trip. | Rockout 2026-05-15: LOW filed #1934 -- _apply_nodata_mask_gpu used cupy.where (allocating); switched to cupy.putmask on the already-owned buffer (float path) and on the post-astype float64 buffer (int path). Saves one chunk-sized device allocation per call. 7 new tests in test_apply_nodata_mask_gpu_inplace_1934.py; 52 related nodata tests pass. | Pass 8 (2026-05-12): 1 new MEDIUM found and fixed. _assemble_standard_layout/_assemble_cog_layout returned bytes(bytearray), doubling peak memory transiently during eager writes. Filed #1756, fixed by returning the bytearray directly. Measured: 95 MB uint8 raster peak drops 202 MB -> 107 MB. _write_bytes / parse_header already accepted the buffer protocol so the change is transparent to callers. 6 new tests in test_assemble_layout_no_bytes_copy_1756.py. 2123 existing geotiff tests pass; the 10 unrelated failures (test_no_georef_windowed_coords_1710, test_predictor2_big_endian_gpu_1517) reference the now-private read_to_array attribute (commit 8adb749, issue #1708) and predate this change. SAFE/IO-bound verdict holds. | Pass 7 (2026-05-12): re-audit identified 4 MEDIUM findings, all real, all backed by microbenches. (1) unpack_bits sub-byte loops for bps=2/4/12 in _compression.py:836-878 were 100-200x slower than vectorised numpy (filed #1713, fixed in this branch: bps=4 2M pixels drops from 165ms to 3ms = 55x; bps=2/12 similar). (2) _write_vrt_tiled at __init__.py:1708 uses scheduler='synchronous' on independent tile writes; measured 33% slowdown on 256-tile zstd write vs threads scheduler (filed #1714, no fix yet). (3) _nvcomp_batch_compress at _gpu_decode.py:2522-2526 still does per-tile cupy.get().tobytes() despite #1552 / #1659 fixing the same pattern elsewhere; measured 45% reduction with concat+single get on n=1024 (filed #1712, no fix yet). (4) _nvcomp_batch_compress at _gpu_decode.py:2457 uses per-tile cupy.empty allocations; 1024 tiles 16KB drops from 4.7ms to 1.0ms with single contiguous + views (bundled into #1712). Cat 6 OOM verdict: SAFE/IO-bound holds -- read_geotiff_dask caps task count at _MAX_DASK_CHUNKS=50_000 and per-chunk memory is bounded by chunk size. _inflate_tiles_kernel resource usage on Ampere: 67 regs/thread, 2896B local/thread, 8192B shared/block (LZW kernel: 29 regs, 24576B shared) -- register pressure under control; high local memory in inflate is unavoidable (LZ77 state) but only thread 0 in each block uses it. | Pass 4 (2026-05-10): re-audit after #1559 (centralise attrs across all read backends). New _populate_attrs_from_geo_info helper at __init__.py:301 runs once per read, not per-chunk -- no perf impact. Probe: 2560x2560 deflate-tiled file opened via read_geotiff_dask yields 400 tasks (4 tasks/chunk for 100 chunks), well under 1M cap. read_geotiff_gpu(1024x1024) returns cupy.ndarray end-to-end with no host round-trip (226ms incl. write+decode). No new HIGH/MEDIUM findings. SAFE/IO-bound holds. | Pass 3 (2026-05-10): SAFE/IO-bound. Audited 4 perf commits: #1558 (in-place NaN writes on uniquely-owned buffers correct), #1556 (fp-predictor ngjit ~297us/tile for 256x256 float32), #1552 (single cupy.concatenate + one .get() for batched D2H at _gpu_decode.py:870-913), #1551 (parallel decode threshold >=65536px engages 256x256 default at _reader.py:1121). Bench: 8192x8192 f32 deflate+pred2 256-tile write 782ms; 4096x4096 f32 deflate read 83ms with parallel decode. Deferred LOW (none filed, all <10% MEDIUM threshold): _writer.py:459/1109 redundant .copy() before predictor encode (~1% per tile), _compression.py:280 lzw_decompress dst[:n].copy() (~2% per LZW tile decode), _writer.py:1419 seg_np.copy() before in-place NaN substitution (negligible, conditional path), _CloudSource.read_range opens fresh fsspec handle per range (pre-existing, predates audit scope). nvCOMP per-tile D2H batching break-even confirmed (variable sizes need staging buffer, no win). | Pass 3 (2026-05-10): audited f157746,39322c3,f23ec8f,1aac3b7. All 5 commits correct. Redundant .copy() in _writer.py:459,1109 and _compression.py:280 (1-2% overhead, LOW). _CloudSource.read_range() per-call open is pre-existing arch issue. No HIGH/MEDIUM regressions. SAFE. | re-audit 2026-05-02: 6 commits since 2026-04-16 (predictor=3 CPU encode/decode, GPU predictor stride fix, validate_tile_layout, BigTIFF LONG8 offsets, AREA_OR_POINT VRT, per-tile alloc guard). 1M dask chunk cap intact at __init__.py:948; adler32 batch transfer intact at _gpu_decode.py:1825. New code is metadata validation and dispatcher logic with no extra materialization or per-tile sync points. No HIGH/MEDIUM regressions. | Pass 5 (2026-05-12): re-audit identified MEDIUM in _gpu_decode.py:1577 _try_nvcomp_from_device_bufs: per-tile cupy.empty + trailing cupy.concatenate doubled peak VRAM and added serial concat. Filed #1659 and fixed to single-buffer + pointer offsets (matches LZW/deflate/host-buffer patterns at L1847/L1878/L1114). Microbench (alloc+concat overhead only, not full nvCOMP latency): n=256 tile_bytes=65536 drops 3.66ms->0.69ms, n=256 tile_bytes=262144 drops 8.18ms->0.13ms. Tests: 5 new tests in test_nvcomp_from_device_bufs_single_alloc_1659.py (codec short-circuit, no-lib short-circuit, memory-guard contract, real ZSTD round-trip via nvCOMP, structural single-buffer check). 1458 existing geotiff tests pass, 3 unrelated matplotlib/py3.14 failures pre-existing. SAFE/IO-bound verdict holds. | Pass 6 (2026-05-12): re-audit on top of #1659. New HIGH in _try_kvikio_read_tiles at _gpu_decode.py:941: per-tile cupy.empty() + blocking IOFuture.get() inside loop serialised GDS reads to ~1 outstanding pread, missed parallelism the kvikio worker pool was designed for, paid per-tile cupy.empty setup (matches #1659 anti-pattern in nvCOMP path), and lacked _check_gpu_memory guard. Filed #1688 and fixed to single contiguous buffer + batched submit + guard. Microbench with 8-worker pool simulation: 256 tiles@1ms latency drops 256ms->38.7ms (~6.6x); single-thread simulation 256ms->28.5ms (9x). Tests: 9 new tests in test_kvikio_batched_pread_1688.py (kvikio-absent path, single-buffer pointer arithmetic, submit-before-get ordering, memory guard, partial-read fallback, round-trip data, zero-size/all-sparse tiles). All 1577 geotiff tests pass except pre-existing matplotlib/py3.14 failures." +geotiff,2026-07-01,SAFE,IO-bound,0,3597,"Pass 16 (2026-07-01): 1 MEDIUM found and fixed, 0 HIGH. to_geotiff(dask, color_ramp=...) executed the source graph twice: streaming write computed every chunk, then _write_sidecars -> _finite_stats ran dask.compute over the same source for the PAM/QML statistics (measured 32 chunk executions for a 16-chunk source; color_ramp_range escape hatch stays at 16). Filed #3597, fixed by threading a chunk_observer through _write_streaming's three materialisation sites (row band, wide-raster segment, strip band) into a new StreamingStats accumulator (_symbology.py; Chan mean/M2 combine, float64 accumulators, ddof=0, nodata/finite exclusion matches _finite_stats; all-valid buffers skip the boolean-index copy and reductions use dtype=float64 on the original buffer to avoid an astype copy). Post-fix executions 16/16; round-trip bench 8192x8192 f32 deflate write+stats 1.53s -> 1.44s and total source reads halve (the real win is expensive-to-recompute sources: HTTP COGs, cold storage, long pipelines). GPU writer and VRT paths keep the documented second pass (GPU writer fully materialises anyway); docstrings updated. 13 new tests in test_color_ramp_single_pass_3597.py incl. execution counters (row-band, strip, segmented wide path with full-width chunks), nodata/int/all-NaN/multiband gating, accumulator-vs-_finite_stats parity, and a dask+cupy gpu=False streaming leg (run on-device). Audited all 26 geotiff commits since 2026-06-11: pack range guards #3272/#3277 correctly defer per-chunk scans into the write's single compute (no #3235 regression); #3374 fixed Pass 14's deferred LOW (chunked GPU read now parses header/IFDs once); xarray engine #3375/#3377/#3380 is a thin wrapper (no eager compute); PAM sidecar read on open_geotiff is a local os.path.exists probe (no HTTP cost); _CloudSource cat_file change is neutral-to-better. Probes on-device this pass: dask read 400 tasks/100 chunks (4/chunk, cap intact), eager GPU read cupy parity 0.0 (222ms), dask+GPU lazy with cupy meta, symbology _finite_stats GPU/CPU parity 0.0. LOWs unchanged from prior passes. SAFE/IO-bound holds. | Pass 15 (2026-06-11): 1 MEDIUM found and fixed. _pack (_attrs.py:~1795) guarded the no-sentinel integer restore with an eager bool(out.isnull().any()), which executed the whole upstream dask graph at to_geotiff(pack=True) call time; the streaming writer then executed it again, so every source chunk computed twice (measured 32 decode-task executions for 16 chunks on a 512x512 int16 SCALE/OFFSET no-GDAL_NODATA source; 71->33 total task starts post-fix). Filed #3235, fixed by mapping a per-chunk NaN guard (_pack_guard_no_nan) into the graph for dask-backed data (raises from the write's single compute; numpy keeps the eager call-time check; meta= preserves cupy backing). 9 new tests in test_pack_lazy_nan_guard_3235.py incl. fusion-proof execution counter and cupy-chunk guard unit test (dask+cupy e2e still blocked upstream by #3112). Scrutinised all 16 commits since 2026-06-08 (pack/unpack series #3065/#3075/#3079/#3129/#3174/#3175, VRT placement #3135, compression_level gate #3176, streaming banding #3136, dask+cupy writer order fix #3171): no other regressions; #3171's get-then-asarray order is intentional D2H for gpu=False. GPU validated on-device this pass: eager GPU unpack returns cupy with exact parity (387ms incl warmup, only 0-d scalar .get()s -- no bulk host round trip), dask+GPU unpack lazy (112 tasks/16 chunks, cupy meta, compute returns cupy, parity 0.0), GDS fast path intact without unpack (4 tasks/chunk); unpack disqualifying GDS is documented intentional. Dask CPU probe 4 tasks/chunk, 50k-task cap intact. Note: #1714 (_write_vrt_tiled synchronous scheduler) is now FIXED+CLOSED (scheduler='threads' at _writers/eager.py:1517) -- drop from the open-issue list. LOW noted (no fix): _pack does identity (data-0.0)/1.0 arithmetic allocating two full-array temporaries when scale==1/offset==0 (masked_nodata-only pack); prior deferred LOWs unchanged. SAFE/IO-bound holds. | Pass 14 (2026-06-09): MEDIUM found and fixed -- _write_streaming ran one dask .compute() per 256-row tile-row/strip, so a source chunk taller than the band re-executed once per band it overlapped (measured 2x at chunks=512, 4x at chunks=1024, whole upstream graph re-runs for computed pipelines). Filed #3117, fixed via _stream_row_bands: consecutive tile-rows/strips group into row bands sized by the source chunk-row span (one-chunk halo, #3007 accounting) under streaming_buffer_bytes; each band computes once and tiles/strips are carved from the materialised band. Wide rasters needing column segmentation keep the per-tile-row path. Post-fix per-chunk executions == 1 on the default read->write round trip. 5 new tests (TestRowBandRecompute3117 + _stream_row_bands unit); write/integration/parity suites pass (2195). LOW deferred (no fix): _read_geotiff_gpu_chunked parses header+all IFDs twice at graph build (_backends/gpu.py ~1367-1419, cap check then GDS probe; build-time only). GPU paths validated on-device this pass: eager gpu read returns cupy with parity, dask+GPU chunked read lazy (17 tasks/4 chunks) with parity; GPU writer full materialisation is documented intentional (streaming_buffer_bytes no-op). Read path keeps 50k-task graph cap; dask read probe 4 tasks/chunk. SAFE/IO-bound holds. | Pass 13 (2026-05-20): 1 MEDIUM found and fixed. _nvjpeg_batch_encode (_gpu_decode.py:~L1560) and _nvjpeg2k_batch_encode (~L2958) called cupy.cuda.Device().synchronize() inside the per-tile encode loops, a whole-device fence that blocked every CUDA stream and serialised concurrent work (e.g. predictor encodes on other streams). The decode-side counterpart _try_nvjpeg_batch_decode already used cupy.cuda.Stream.null.synchronize() at L1442; the encoder side was inconsistent. Filed #2212 and fixed both encoders to use Stream.null.synchronize(), scoping the per-tile sync to the default stream the encode/retrieve calls were issued on. nvJPEG / nvJPEG2000 encoders maintain a single shared state per encoder so encodes within a batch are inherently serial; the fix removes the device-wide blocker without changing the API ordering contract. 5 new tests in test_nvjpeg_encode_stream_sync_2212.py (AST checks that neither encoder contains Device().synchronize() inside a for-loop, that both call Stream.null.synchronize() in the loop, and that the decoder reference pattern stays pinned). All 5 new tests + 19 existing related encode/decode tests pass. nvjpeg/nvjpeg2k shared libs not present on this host so end-to-end encode verification is gated; add cuda-unavailable-libs note to re-validate on a host with the RAPIDS conda env. SAFE/IO-bound verdict holds; no change in dask graph cost. Dask probe: 2560x2560 deflate-tiled file via read_geotiff_dask(chunks=256) yields 400 tasks for 100 chunks (4 tasks/chunk), well under the 50K cap. LOW deferred (no fix in this PR): _build_ifd called twice per IFD level in _assemble_standard_layout (_writer.py:1531+1543), _assemble_cog_layout (1582+1625), and the COG overview path (2519+2546+2740) -- the first call's bytes are discarded; only the overflow byte length is used to compute pixel_data_offset. Cost is bounded by IFD count (typically 1-5 overview levels) so absolute impact is minor. Pre-existing pattern. | Pass 12 (2026-05-18): 1 MEDIUM found and fixed. _try_nvjpeg2k_batch_decode at _gpu_decode.py:~L2725-2778 allocated per-tile per-component cupy.empty buffers (N*S round-trips through the cupy memory pool) and called cupy.cuda.Device().synchronize() once per tile, forcing default-stream serialisation that defeats nvJPEG2000's internal pipelining. Filed #2107 and fixed: pre-allocate a single d_comp_pool sized n_tiles*samples*tile_height*pitch under a _check_gpu_memory guard, derive per-tile/per-component views as slab offsets, and replace the per-tile sync with a single batch-end sync. Same pattern as #1659 (_try_nvcomp_from_device_bufs), #1688 (_try_kvikio_read_tiles), #1712 (_nvcomp_batch_compress). 7 new tests in test_nvjpeg2k_single_alloc_2107.py: AST-level structural assertions confirm no cupy.empty inside the for-loop and no Device().synchronize() inside the loop, plus pool/per_tile_comp_bytes presence and _check_gpu_memory guard checks; lib-absent short-circuit; unsupported-dtype cleanup contract; cupy-only pool slab-non-overlap test (gpu-marked). libnvjpeg2k.so not present on this host so the end-to-end nvJPEG2000 decode is gated -- note added to re-validate on a host with the RAPIDS conda env. All 30 jpeg2000/compression tests + 7 new tests pass. SAFE/IO-bound verdict holds (no change in dask graph cost). Dask probe: 4096x4096 deflate-tiled file via read_geotiff_dask(chunks=512) yields 256 tasks for 64 chunks (4 tasks/chunk), well under the 50K cap. | Pass 11 (2026-05-18): 1 MEDIUM found and fixed. _read_strips (_reader.py:~L1972) and _fetch_decode_cog_http_strips (_reader.py:~L2670) decoded strips sequentially in a Python for-loop while the tile counterparts (_read_tiles L2146, _fetch_decode_cog_http_tiles L2898) gated parallel decode on _PARALLEL_DECODE_PIXEL_THRESHOLD via ThreadPoolExecutor. Filed #2100 and fixed: both strip paths now collect jobs, parallel-decode when n_strips > 1 and strip_pixels >= 64K, then place sequentially. Measured (uint16, 4-core): 4096x4096 deflate 130ms->34ms (3.82x), 8192x8192 deflate 531ms->146ms (3.63x), 8192x8192 zstd 211ms->85ms (2.48x), uncompressed 25ms->22ms (1.14x). 5 new tests in test_parallel_strip_decode_2100.py (parallel/serial parity, pool-engaged on multi-strip, serial-path for single-strip, windowed cross-strip read, HTTP COG strip parity). 3998 tests pass; 8 pre-existing failures predating this change (predictor2 BE + size_param_validation_gpu_vrt reference now-private read_to_array attr). SAFE/IO-bound verdict holds. | Pass 10 (2026-05-15): 1 new MEDIUM found and fixed; 2 LOW noted. MEDIUM (_reader.py:2737): _fetch_decode_cog_http_tiles decoded tiles sequentially in a Python for-loop after the concurrent fetch landed (issue #1480). Local _read_tiles parallelises decode whenever tile_pixels >= 64K via ThreadPoolExecutor (_reader.py:2017); the HTTP path was structurally similar but never picked up the same gate, so wide windowed reads of multi-tile COGs left deflate/zstd decode single-threaded. Mirrored the local-path threshold + pool. 5 new tests in test_cog_http_parallel_decode_2026_05_15.py (parallel + serial round-trip correctness, pool-instantiation branch selection above the threshold, single-tile path skips the pool, structural _decode_strip_or_tile call count == n_tiles). All 262 COG/HTTP tests pass; 3162 of 3164 selected geotiff tests pass overall (2 pre-existing failures predating Pass 9 per prior notes -- test_predictor2_big_endian_gpu_1517 references the now-private read_to_array attr, and the test_size_param_validation_gpu_vrt_1776 tile_size=4 validator failure). LOW deferred (no fix in this PR): (1) _block_reduce_2d_gpu (_gpu_decode.py:3142/3163/3189) does bool(mask.any().item()) per overview level when nodata is set, paying one device sync per level; the alternative (unconditional cupy.putmask) always pays the work cost and the short-circuit is correct under the current API. (2) _nvcomp_batch_compress adler32 staging (_gpu_decode.py:2543-2546) issues n_tiles slice-assign kernels into a fresh contig buffer despite all callers passing slices of a single underlying d_tile_buf; an API refactor to accept the source buffer directly would skip the rebuild. SAFE/IO-bound verdict holds. Dask probe: 2560x2560 chunks=256 yields 400 tasks (4 per chunk), well under the 50000 cap. GPU probe: 1024x1024 float32 zstd read returns CuPy-backed in 236 ms with no host round-trip. | Rockout 2026-05-15: LOW filed #1934 -- _apply_nodata_mask_gpu used cupy.where (allocating); switched to cupy.putmask on the already-owned buffer (float path) and on the post-astype float64 buffer (int path). Saves one chunk-sized device allocation per call. 7 new tests in test_apply_nodata_mask_gpu_inplace_1934.py; 52 related nodata tests pass. | Pass 8 (2026-05-12): 1 new MEDIUM found and fixed. _assemble_standard_layout/_assemble_cog_layout returned bytes(bytearray), doubling peak memory transiently during eager writes. Filed #1756, fixed by returning the bytearray directly. Measured: 95 MB uint8 raster peak drops 202 MB -> 107 MB. _write_bytes / parse_header already accepted the buffer protocol so the change is transparent to callers. 6 new tests in test_assemble_layout_no_bytes_copy_1756.py. 2123 existing geotiff tests pass; the 10 unrelated failures (test_no_georef_windowed_coords_1710, test_predictor2_big_endian_gpu_1517) reference the now-private read_to_array attribute (commit 8adb749, issue #1708) and predate this change. SAFE/IO-bound verdict holds. | Pass 7 (2026-05-12): re-audit identified 4 MEDIUM findings, all real, all backed by microbenches. (1) unpack_bits sub-byte loops for bps=2/4/12 in _compression.py:836-878 were 100-200x slower than vectorised numpy (filed #1713, fixed in this branch: bps=4 2M pixels drops from 165ms to 3ms = 55x; bps=2/12 similar). (2) _write_vrt_tiled at __init__.py:1708 uses scheduler='synchronous' on independent tile writes; measured 33% slowdown on 256-tile zstd write vs threads scheduler (filed #1714, no fix yet). (3) _nvcomp_batch_compress at _gpu_decode.py:2522-2526 still does per-tile cupy.get().tobytes() despite #1552 / #1659 fixing the same pattern elsewhere; measured 45% reduction with concat+single get on n=1024 (filed #1712, no fix yet). (4) _nvcomp_batch_compress at _gpu_decode.py:2457 uses per-tile cupy.empty allocations; 1024 tiles 16KB drops from 4.7ms to 1.0ms with single contiguous + views (bundled into #1712). Cat 6 OOM verdict: SAFE/IO-bound holds -- read_geotiff_dask caps task count at _MAX_DASK_CHUNKS=50_000 and per-chunk memory is bounded by chunk size. _inflate_tiles_kernel resource usage on Ampere: 67 regs/thread, 2896B local/thread, 8192B shared/block (LZW kernel: 29 regs, 24576B shared) -- register pressure under control; high local memory in inflate is unavoidable (LZ77 state) but only thread 0 in each block uses it. | Pass 4 (2026-05-10): re-audit after #1559 (centralise attrs across all read backends). New _populate_attrs_from_geo_info helper at __init__.py:301 runs once per read, not per-chunk -- no perf impact. Probe: 2560x2560 deflate-tiled file opened via read_geotiff_dask yields 400 tasks (4 tasks/chunk for 100 chunks), well under 1M cap. read_geotiff_gpu(1024x1024) returns cupy.ndarray end-to-end with no host round-trip (226ms incl. write+decode). No new HIGH/MEDIUM findings. SAFE/IO-bound holds. | Pass 3 (2026-05-10): SAFE/IO-bound. Audited 4 perf commits: #1558 (in-place NaN writes on uniquely-owned buffers correct), #1556 (fp-predictor ngjit ~297us/tile for 256x256 float32), #1552 (single cupy.concatenate + one .get() for batched D2H at _gpu_decode.py:870-913), #1551 (parallel decode threshold >=65536px engages 256x256 default at _reader.py:1121). Bench: 8192x8192 f32 deflate+pred2 256-tile write 782ms; 4096x4096 f32 deflate read 83ms with parallel decode. Deferred LOW (none filed, all <10% MEDIUM threshold): _writer.py:459/1109 redundant .copy() before predictor encode (~1% per tile), _compression.py:280 lzw_decompress dst[:n].copy() (~2% per LZW tile decode), _writer.py:1419 seg_np.copy() before in-place NaN substitution (negligible, conditional path), _CloudSource.read_range opens fresh fsspec handle per range (pre-existing, predates audit scope). nvCOMP per-tile D2H batching break-even confirmed (variable sizes need staging buffer, no win). | Pass 3 (2026-05-10): audited f157746,39322c3,f23ec8f,1aac3b7. All 5 commits correct. Redundant .copy() in _writer.py:459,1109 and _compression.py:280 (1-2% overhead, LOW). _CloudSource.read_range() per-call open is pre-existing arch issue. No HIGH/MEDIUM regressions. SAFE. | re-audit 2026-05-02: 6 commits since 2026-04-16 (predictor=3 CPU encode/decode, GPU predictor stride fix, validate_tile_layout, BigTIFF LONG8 offsets, AREA_OR_POINT VRT, per-tile alloc guard). 1M dask chunk cap intact at __init__.py:948; adler32 batch transfer intact at _gpu_decode.py:1825. New code is metadata validation and dispatcher logic with no extra materialization or per-tile sync points. No HIGH/MEDIUM regressions. | Pass 5 (2026-05-12): re-audit identified MEDIUM in _gpu_decode.py:1577 _try_nvcomp_from_device_bufs: per-tile cupy.empty + trailing cupy.concatenate doubled peak VRAM and added serial concat. Filed #1659 and fixed to single-buffer + pointer offsets (matches LZW/deflate/host-buffer patterns at L1847/L1878/L1114). Microbench (alloc+concat overhead only, not full nvCOMP latency): n=256 tile_bytes=65536 drops 3.66ms->0.69ms, n=256 tile_bytes=262144 drops 8.18ms->0.13ms. Tests: 5 new tests in test_nvcomp_from_device_bufs_single_alloc_1659.py (codec short-circuit, no-lib short-circuit, memory-guard contract, real ZSTD round-trip via nvCOMP, structural single-buffer check). 1458 existing geotiff tests pass, 3 unrelated matplotlib/py3.14 failures pre-existing. SAFE/IO-bound verdict holds. | Pass 6 (2026-05-12): re-audit on top of #1659. New HIGH in _try_kvikio_read_tiles at _gpu_decode.py:941: per-tile cupy.empty() + blocking IOFuture.get() inside loop serialised GDS reads to ~1 outstanding pread, missed parallelism the kvikio worker pool was designed for, paid per-tile cupy.empty setup (matches #1659 anti-pattern in nvCOMP path), and lacked _check_gpu_memory guard. Filed #1688 and fixed to single contiguous buffer + batched submit + guard. Microbench with 8-worker pool simulation: 256 tiles@1ms latency drops 256ms->38.7ms (~6.6x); single-thread simulation 256ms->28.5ms (9x). Tests: 9 new tests in test_kvikio_batched_pread_1688.py (kvikio-absent path, single-buffer pointer arithmetic, submit-before-get ordering, memory guard, partial-read fallback, round-trip data, zero-size/all-sparse tiles). All 1577 geotiff tests pass except pre-existing matplotlib/py3.14 failures." glcm,2026-03-31T18:00:00Z,SAFE,compute-bound,0,,"Downgraded to MEDIUM. da.stack without rechunk is scheduling overhead, not OOM risk." hillshade,2026-04-16T12:00:00Z,SAFE,compute-bound,0,,"Re-audit after Horn's method rewrite (PR 1175): clean stencil, map_overlap depth=(1,1), no materialization. Zero findings." hydro,2026-05-01,RISKY,memory-bound,0,1416,"Fixed-in-tree 2026-05-01: hand_mfd._hand_mfd_dask now assembles via da.map_blocks instead of eager da.block of pre-computed tiles (matches hand_dinf pattern). Remaining MEDIUM: sink_d8 CCL fully materializes labels (inherently global), flow_accumulation_mfd frac_bdry held in driver dict instead of memmap-backed BoundaryStore. D8 iterative paths (flow_accum/fill/watershed/basin/stream_*) use serial-tile sweep with memmap-backed boundary store -- per-tile RAM bounded but driver iterates O(diameter) times. flow_direction_*, flow_path/snap_pour_point/twi/hand_d8/hand_dinf are SAFE." diff --git a/xrspatial/geotiff/_symbology.py b/xrspatial/geotiff/_symbology.py index 9fe11dcaa..4b10c3b02 100644 --- a/xrspatial/geotiff/_symbology.py +++ b/xrspatial/geotiff/_symbology.py @@ -230,7 +230,76 @@ def _dask_finite_stats(arr, nodata): utils._to_float_scalar(mean), utils._to_float_scalar(std)) -def write_symbology_sidecars(path, data, *, stops, nodata=None, ramp_range=None): +class StreamingStats: + """One-pass ``(min, max, mean, stddev)`` accumulator over streamed buffers. + + The streaming dask writer already materialises every pixel exactly once + (issue #3597); feeding those buffers through :meth:`update` yields the + same statistics as :func:`_finite_stats` without a second execution of + the source graph. Matches ``_finite_stats`` semantics: non-finite values + and the *nodata* sentinel (unless it is NaN) are excluded, and the + stddev is the population stddev (``ddof=0``). + + Mean and variance combine across buffers with Chan's parallel formula, + accumulated in float64, so the result is stable regardless of how the + writer bands / segments the raster. + """ + + def __init__(self, nodata=None): + if isinstance(nodata, float) and math.isnan(nodata): + nodata = None + self._nodata = nodata + self._count = 0 + self._mean = 0.0 + self._m2 = 0.0 + self._min = math.inf + self._max = -math.inf + + def update(self, buf): + """Fold one materialised numpy buffer into the running statistics.""" + if buf.dtype.kind == 'f': + mask = np.isfinite(buf) + if self._nodata is not None: + mask &= (buf != self._nodata) + # ``buf[mask]`` copies; skip it for the common all-valid buffer. + vals = buf.ravel() if bool(mask.all()) else buf[mask] + elif self._nodata is not None: + mask = buf != self._nodata + vals = buf.ravel() if bool(mask.all()) else buf[mask] + else: + vals = buf.ravel() + # Plain ``int`` so the running count (and the moment arithmetic + # built on it) stays in unbounded Python ints, not numpy scalars. + n_b = int(vals.size) + if n_b == 0: + return + # Reductions with float64 accumulators on the original buffer: + # an ``astype(float64)`` copy here would double the memory + # traffic of every update for no precision gain. + mean_b = float(vals.mean(dtype=np.float64)) + m2_b = float(vals.var(dtype=np.float64)) * n_b + self._min = min(self._min, float(vals.min())) + self._max = max(self._max, float(vals.max())) + if self._count == 0: + self._count, self._mean, self._m2 = n_b, mean_b, m2_b + else: + total = self._count + n_b + delta = mean_b - self._mean + self._m2 += m2_b + delta * delta * self._count * n_b / total + self._mean += delta * n_b / total + self._count = total + + def result(self): + """Return ``(min, max, mean, stddev)``, or ``None`` if nothing valid + was accumulated -- the same contract as :func:`_finite_stats`.""" + if self._count == 0: + return None + std = math.sqrt(max(self._m2 / self._count, 0.0)) + return (self._min, self._max, self._mean, std) + + +def write_symbology_sidecars(path, data, *, stops, nodata=None, + ramp_range=None, stats=None): """Write continuous-raster symbology sidecars next to *path*. Writes band statistics into the PAM ``.aux.xml`` and, when the data range @@ -241,6 +310,11 @@ def write_symbology_sidecars(path, data, *, stops, nodata=None, ramp_range=None) directly and skips the statistics reduction -- the escape hatch for large dask graphs -- so only ``STATISTICS_MINIMUM`` / ``STATISTICS_MAXIMUM`` are written (mean/stddev would need the pass it avoids). + + *stats* is an optional :class:`StreamingStats` that already accumulated + the statistics during the write (the streaming dask path, issue #3597); + when given, its result replaces the :func:`_finite_stats` reduction so + the source graph is not executed a second time. """ from . import _pam @@ -249,20 +323,21 @@ def write_symbology_sidecars(path, data, *, stops, nodata=None, ramp_range=None) if ramp_range is not None: vmin, vmax = float(ramp_range[0]), float(ramp_range[1]) - stats = {'STATISTICS_MINIMUM': vmin, 'STATISTICS_MAXIMUM': vmax} + stats_dict = {'STATISTICS_MINIMUM': vmin, 'STATISTICS_MAXIMUM': vmax} else: - result = _finite_stats(data, nodata) + result = (stats.result() if stats is not None + else _finite_stats(data, nodata)) if result is None: return vmin, vmax, vmean, vstd = result - stats = { + stats_dict = { 'STATISTICS_MINIMUM': vmin, 'STATISTICS_MAXIMUM': vmax, 'STATISTICS_MEAN': vmean, 'STATISTICS_STDDEV': vstd, } - _pam.write_stats_pam_sidecar(path, stats) + _pam.write_stats_pam_sidecar(path, stats_dict) # A constant raster (vmin == vmax) has no range to ramp across, so the # stats stretch is the useful part and the QML would be a degenerate diff --git a/xrspatial/geotiff/_writer.py b/xrspatial/geotiff/_writer.py index aae2b1d94..63c15281f 100644 --- a/xrspatial/geotiff/_writer.py +++ b/xrspatial/geotiff/_writer.py @@ -779,7 +779,8 @@ def _write_streaming(dask_data, path: str, *, photometric='auto', restore_sentinel: bool = True, allow_internal_only_jpeg: bool = False, - allow_unparseable_crs: bool = False) -> None: + allow_unparseable_crs: bool = False, + chunk_observer=None) -> None: """Write a dask array as a GeoTIFF by streaming pixel data. For tiled output, each tile-row is computed in horizontal segments @@ -818,6 +819,15 @@ def _write_streaming(dask_data, path: str, *, allow_unparseable_crs : bool Opt in to writing an unparseable ``crs_wkt`` string into ``GTCitationGeoKey``. Default ``False``. + chunk_observer : callable, optional + Called once with every buffer the streaming write materialises + (a numpy array of logical values: after the dask compute and any + CuPy transfer, before the output-dtype cast and the + NaN-to-sentinel restore). The buffers partition the raster, so + an observer sees each pixel exactly once -- ``to_geotiff`` uses + this to accumulate the ``color_ramp`` symbology statistics + during the write instead of executing the source graph a second + time (issue #3597). Notes ----- @@ -1215,6 +1225,8 @@ def _write_streaming(dask_data, path: str, *, if hasattr(band_np, 'get'): band_np = band_np.get() # CuPy -> numpy band_np = np.asarray(band_np) + if chunk_observer is not None: + chunk_observer(band_np) if band_np.dtype != out_dtype: band_np = band_np.astype(out_dtype) @@ -1263,6 +1275,8 @@ def _write_streaming(dask_data, path: str, *, if hasattr(seg_np, 'get'): seg_np = seg_np.get() # CuPy -> numpy seg_np = np.asarray(seg_np) + if chunk_observer is not None: + chunk_observer(seg_np) if seg_np.dtype != out_dtype: seg_np = seg_np.astype(out_dtype) @@ -1381,6 +1395,8 @@ def _write_streaming(dask_data, path: str, *, if hasattr(band_np, 'get'): band_np = band_np.get() # CuPy -> numpy band_np = np.asarray(band_np) + if chunk_observer is not None: + chunk_observer(band_np) if band_np.dtype != out_dtype: band_np = band_np.astype(out_dtype) diff --git a/xrspatial/geotiff/_writers/eager.py b/xrspatial/geotiff/_writers/eager.py index dfa148cbe..2dcf4a845 100644 --- a/xrspatial/geotiff/_writers/eager.py +++ b/xrspatial/geotiff/_writers/eager.py @@ -414,17 +414,20 @@ def to_geotiff(data: xr.DataArray | np.ndarray, ``.aux.xml``. No-op for a categorical raster (one with ``attrs['category_names']`` -- those get the RAT sidecar instead), a multiband array, a file-like destination, or data with no finite - values. Computing the statistics is a separate reduction pass over the - data; for a dask source that means reading the graph once more (see - ``color_ramp_range`` to skip it). Ignored when ``pack=True``, whose - on-disk packed values would not match a ramp built from the logical - values. + values. Computing the statistics is an extra reduction pass over the + data. The streaming dask write accumulates them from the buffers it + materialises anyway, so the source graph still runs once; the GPU + (``gpu=True``) and VRT (``.vrt``) write paths execute a dask source + a second time for the statistics (see ``color_ramp_range`` to skip + that). Ignored when ``pack=True``, whose on-disk packed values would + not match a ramp built from the logical values. color_ramp_range : tuple of (float, float) or None, default None [advanced] Explicit ``(min, max)`` for the ``color_ramp`` stretch. - Skips the statistics reduction -- useful for large dask graphs -- so - only ``STATISTICS_MINIMUM`` / ``STATISTICS_MAXIMUM`` are written - (mean/stddev need the pass it avoids). Ignored when ``color_ramp`` is - not set. + Skips the statistics reduction -- useful for a dask source on the + GPU or VRT write paths, which would otherwise read the graph once + more -- so only ``STATISTICS_MINIMUM`` / ``STATISTICS_MAXIMUM`` are + written (mean/stddev need the pass it avoids). Ignored when + ``color_ramp`` is not set. Returns ------- @@ -491,6 +494,7 @@ def to_geotiff(data: xr.DataArray | np.ndarray, _sym_data = None _sym_stops = None _sym_nodata = None + _sym_stream_stats = None if isinstance(path, str) and isinstance(data, xr.DataArray): _cat_names = data.attrs.get('category_names') _cat_colors = data.attrs.get('category_colors') @@ -515,7 +519,8 @@ def _write_sidecars(): from .._symbology import write_symbology_sidecars write_symbology_sidecars( path, _sym_data, stops=_sym_stops, - nodata=_sym_nodata, ramp_range=color_ramp_range) + nodata=_sym_nodata, ramp_range=color_ramp_range, + stats=_sym_stream_stats) # Reject bool / np.bool_ nodata up front. ``bool`` is a subclass of # ``int`` in Python, so a typo like ``nodata=True`` slips past every @@ -1097,6 +1102,18 @@ def _write_sidecars(): # branch needs the guard. if epsg is None: _validate_crs_fallback(wkt_fallback, allow_unparseable_crs) + # ``color_ramp`` statistics: rather than re-executing the + # source graph after the write (a second full read of the + # data, issue #3597), accumulate them from the buffers the + # streaming write materialises anyway. ``color_ramp_range`` + # already skips statistics, and multiband data never gets + # symbology, so neither pays the accumulation cost. + _stream_chunk_observer = None + if _sym_stops is not None and color_ramp_range is None: + from .._symbology import StreamingStats, _is_single_band + if _is_single_band(data): + _sym_stream_stats = StreamingStats(nodata=_sym_nodata) + _stream_chunk_observer = _sym_stream_stats.update write_streaming( dask_arr, path, geo_transform=geo_transform, @@ -1125,6 +1142,7 @@ def _write_sidecars(): # rather than rejecting input the wrapper accepted. allow_internal_only_jpeg=allow_internal_only_jpeg, allow_unparseable_crs=allow_unparseable_crs, + chunk_observer=_stream_chunk_observer, ) _write_sidecars() return path diff --git a/xrspatial/geotiff/tests/write/test_color_ramp_single_pass_3597.py b/xrspatial/geotiff/tests/write/test_color_ramp_single_pass_3597.py new file mode 100644 index 000000000..d1f708ab7 --- /dev/null +++ b/xrspatial/geotiff/tests/write/test_color_ramp_single_pass_3597.py @@ -0,0 +1,258 @@ +"""``color_ramp`` statistics ride along with the streaming dask write (#3597). + +``to_geotiff(dask_data, path, color_ramp=...)`` used to execute the source +graph twice: once in ``_write_streaming`` for the pixels and once more in +``_finite_stats`` for the sidecar statistics. The fix threads a +``chunk_observer`` through the streaming writer so a ``StreamingStats`` +accumulator folds in every buffer the write materialises anyway. These tests +pin the single-execution behaviour with a counting ``map_blocks`` layer and +check the accumulated statistics against ``_finite_stats`` / plain numpy on +the row-band, segmented wide-raster, and strip paths. +""" +import os +import threading +import xml.etree.ElementTree as ET + +import numpy as np +import pytest +import xarray as xr + +from xrspatial.geotiff import to_geotiff +from xrspatial.geotiff._symbology import StreamingStats, _finite_stats + +from .._helpers.markers import requires_gpu + +pytest.importorskip("tifffile") + + +def _counting_da(base, chunks): + """Dask-backed DataArray whose chunks count their own executions.""" + import dask.array as dsa + + counter = {"n": 0} + lock = threading.Lock() + + def _count(block): + if block.size: # skip dask's zero-size meta-inference call + with lock: + counter["n"] += 1 + return block + + arr = dsa.from_array(base, chunks=chunks).map_blocks( + _count, dtype=base.dtype) + n = base.shape[0] + da = xr.DataArray( + arr, dims=("y", "x"), + coords={"y": np.arange(n, dtype="float64"), + "x": np.arange(base.shape[1], dtype="float64")}, + attrs={"crs": 4326}, + ) + return da, counter + + +def _aux_stats(path): + """Parse ``.aux.xml`` into a ``{STATISTICS_*: float}`` dict.""" + band = ET.parse(path + ".aux.xml").getroot().find(".//PAMRasterBand") + return {mdi.get("key"): float(mdi.text) + for mdi in band.findall("./Metadata/MDI")} + + +def _ref_stats(arr, nodata=None): + """(min, max, mean, population std) over finite non-nodata values.""" + a = np.asarray(arr, dtype="float64") + mask = np.isfinite(a) + if nodata is not None: + mask &= (a != nodata) + v = a[mask] + return float(v.min()), float(v.max()), float(v.mean()), float(v.std()) + + +def _assert_aux_matches(path, ref): + stats = _aux_stats(path) + assert stats["STATISTICS_MINIMUM"] == pytest.approx(ref[0], rel=1e-9) + assert stats["STATISTICS_MAXIMUM"] == pytest.approx(ref[1], rel=1e-9) + assert stats["STATISTICS_MEAN"] == pytest.approx(ref[2], rel=1e-9) + assert stats["STATISTICS_STDDEV"] == pytest.approx(ref[3], rel=1e-9) + + +_RNG = np.random.default_rng(3597) +_BASE = _RNG.uniform(-50.0, 150.0, (64, 64)) +_BASE[3, 7] = np.nan +_BASE[40, 2] = np.nan + + +# -------------------------------------------------------------------------- +# single execution of the source graph +# -------------------------------------------------------------------------- + +def test_streaming_color_ramp_executes_source_once(tmp_path): + da, counter = _counting_da(_BASE, chunks=(16, 16)) + path = str(tmp_path / "once_3597.tif") + to_geotiff(da, path, color_ramp="viridis") + assert counter["n"] == 16 # was 32 before the fix + _assert_aux_matches(path, _ref_stats(_BASE)) + assert os.path.exists(str(tmp_path / "once_3597.qml")) + + +def test_streaming_strip_path_executes_source_once(tmp_path): + da, counter = _counting_da(_BASE, chunks=(16, 16)) + path = str(tmp_path / "strip_3597.tif") + to_geotiff(da, path, tiled=False, color_ramp="viridis") + assert counter["n"] == 16 + _assert_aux_matches(path, _ref_stats(_BASE)) + + +def test_color_ramp_range_still_skips_stats(tmp_path): + da, counter = _counting_da(_BASE, chunks=(16, 16)) + path = str(tmp_path / "rng_3597.tif") + to_geotiff(da, path, color_ramp="viridis", color_ramp_range=(0.0, 10.0)) + assert counter["n"] == 16 + aux = open(path + ".aux.xml").read() + assert "STATISTICS_MINIMUM" in aux and "STATISTICS_MEAN" not in aux + + +# -------------------------------------------------------------------------- +# the segmented wide-raster path partitions pixels even when chunks recompute +# -------------------------------------------------------------------------- + +def test_segmented_wide_path_stats_exact(tmp_path): + # Full-width source chunks + a buffer budget of two 16x16 float64 tile + # columns force the column-segmented path, where a source chunk is + # computed once per segment it spans. The observer is fed the segment + # buffers (which partition the raster), so the statistics must stay + # exact even though chunk executions exceed the chunk count. + da, counter = _counting_da(_BASE, chunks=(16, 64)) + path = str(tmp_path / "wide_3597.tif") + to_geotiff(da, path, tiled=True, tile_size=16, + streaming_buffer_bytes=4096, color_ramp="viridis") + assert counter["n"] > 4 # proves the segmented path actually engaged + _assert_aux_matches(path, _ref_stats(_BASE)) + + +# -------------------------------------------------------------------------- +# semantics parity with _finite_stats +# -------------------------------------------------------------------------- + +def test_streaming_nodata_sentinel_excluded(tmp_path): + import dask.array as dsa + + vals = _BASE.copy() + vals[10:20, 10:20] = -9999.0 + da = xr.DataArray( + dsa.from_array(vals, chunks=(16, 16)), dims=("y", "x"), + coords={"y": np.arange(64.0), "x": np.arange(64.0)}, + attrs={"crs": 4326, "nodata": -9999.0}, + ) + path = str(tmp_path / "nd_3597.tif") + to_geotiff(da, path, color_ramp="viridis") + _assert_aux_matches(path, _ref_stats(vals, nodata=-9999.0)) + + +def test_streaming_int_dtype_stats(tmp_path): + import dask.array as dsa + + vals = _RNG.integers(-100, 500, (64, 64)).astype("int32") + vals[0, :] = -32768 + da = xr.DataArray( + dsa.from_array(vals, chunks=(16, 16)), dims=("y", "x"), + coords={"y": np.arange(64.0), "x": np.arange(64.0)}, + attrs={"crs": 4326, "nodata": -32768}, + ) + path = str(tmp_path / "int_3597.tif") + to_geotiff(da, path, color_ramp="viridis") + _assert_aux_matches(path, _ref_stats(vals, nodata=-32768)) + + +def test_all_nan_dask_writes_no_sidecars(tmp_path): + import dask.array as dsa + + da = xr.DataArray( + dsa.full((32, 32), np.nan, chunks=(16, 16)), dims=("y", "x"), + coords={"y": np.arange(32.0), "x": np.arange(32.0)}, + attrs={"crs": 4326}, + ) + path = str(tmp_path / "nan_3597.tif") + to_geotiff(da, path, color_ramp="viridis") + assert os.path.exists(path) + assert not os.path.exists(path + ".aux.xml") + assert not os.path.exists(str(tmp_path / "nan_3597.qml")) + + +def test_multiband_dask_color_ramp_noop(tmp_path): + import dask.array as dsa + + rgb = xr.DataArray( + dsa.zeros((3, 32, 32), chunks=(3, 16, 16), dtype="float32"), + dims=("band", "y", "x"), + coords={"band": [1, 2, 3], "y": np.arange(32.0), + "x": np.arange(32.0)}, + attrs={"crs": 4326}, + ) + path = str(tmp_path / "rgb_3597.tif") + to_geotiff(rgb, path, color_ramp="viridis") + assert not os.path.exists(path + ".aux.xml") + assert not os.path.exists(str(tmp_path / "rgb_3597.qml")) + + +# -------------------------------------------------------------------------- +# StreamingStats unit behaviour +# -------------------------------------------------------------------------- + +def test_streaming_stats_matches_finite_stats_uneven_slabs(): + acc = StreamingStats() + for r0, r1 in [(0, 5), (5, 6), (6, 40), (40, 64)]: + acc.update(_BASE[r0:r1]) + da = xr.DataArray(_BASE, dims=("y", "x")) + assert acc.result() == pytest.approx(_finite_stats(da, None), rel=1e-9) + + +def test_streaming_stats_nan_nodata_treated_as_unset(): + acc = StreamingStats(nodata=float("nan")) + acc.update(_BASE) + da = xr.DataArray(_BASE, dims=("y", "x")) + assert acc.result() == pytest.approx(_finite_stats(da, None), rel=1e-9) + + +def test_streaming_stats_empty_returns_none(): + acc = StreamingStats() + assert acc.result() is None + acc.update(np.full((4, 4), np.nan)) + assert acc.result() is None + + +def test_streaming_stats_constant_buffers(): + acc = StreamingStats() + acc.update(np.full((4, 4), 7.0)) + acc.update(np.full((2, 4), 7.0)) + assert acc.result() == pytest.approx((7.0, 7.0, 7.0, 0.0), abs=1e-12) + + +# -------------------------------------------------------------------------- +# dask+cupy through the CPU streaming writer (gpu=False) +# -------------------------------------------------------------------------- + +@requires_gpu +def test_dask_cupy_streaming_executes_source_once(tmp_path): + import cupy + import dask.array as dsa + + counter = {"n": 0} + lock = threading.Lock() + + def _count(block): + if block.size: # skip dask's zero-size meta-inference call + with lock: + counter["n"] += 1 + return block + + arr = dsa.from_array(cupy.asarray(_BASE), chunks=(16, 16)).map_blocks( + _count, dtype=_BASE.dtype) + da = xr.DataArray( + arr, dims=("y", "x"), + coords={"y": np.arange(64.0), "x": np.arange(64.0)}, + attrs={"crs": 4326}, + ) + path = str(tmp_path / "dgpu_3597.tif") + to_geotiff(da, path, gpu=False, color_ramp="viridis") + assert counter["n"] == 16 + _assert_aux_matches(path, _ref_stats(_BASE))