diff --git a/.claude/sweep-performance-state.csv b/.claude/sweep-performance-state.csv
index 2e2af4c76..ef411e9ea 100644
--- a/.claude/sweep-performance-state.csv
+++ b/.claude/sweep-performance-state.csv
@@ -18,7 +18,7 @@ fire,2026-03-31T18:00:00Z,SAFE,compute-bound,0,,
 flood,2026-06-25,RISKY,compute-bound,1,3503,"Cat1 HIGH: _validate_mannings_n_dataarray used .values, eagerly materialized dask/cupy roughness raster (OOM); fixed lazy-safe. Core elementwise ops SAFE. LOW: dask+cupy host round-trip (map_blocks b.get) shared with cost_distance/surface_distance, documented not fixed."
 focal,2026-05-29,SAFE,compute-bound,1,2734,"HIGH: _hotspots_dask_cupy chunk fn round-tripped each chunk host<->GPU (cupy.asnumpy classify cupy.asarray); fixed PR 2739 to reuse _run_gpu_hotspots on device. LOW (not fixed): _apply_numpy/_hotspots_cupy use zeros_like where empty would suffice. CUDA kernels regs<=62, no register-pressure issue."
 geodesic,2026-03-31T18:00:00Z,N/A,compute-bound,0,,
-geotiff,2026-06-11,SAFE,IO-bound,0,3235,"Pass 15 (2026-06-11): 1 MEDIUM found and fixed. _pack (_attrs.py:~1795) guarded the no-sentinel integer restore with an eager bool(out.isnull().any()), which executed the whole upstream dask graph at to_geotiff(pack=True) call time; the streaming writer then executed it again, so every source chunk computed twice (measured 32 decode-task executions for 16 chunks on a 512x512 int16 SCALE/OFFSET no-GDAL_NODATA source; 71->33 total task starts post-fix). Filed #3235, fixed by mapping a per-chunk NaN guard (_pack_guard_no_nan) into the graph for dask-backed data (raises from the write's single compute; numpy keeps the eager call-time check; meta= preserves cupy backing). 9 new tests in test_pack_lazy_nan_guard_3235.py incl. fusion-proof execution counter and cupy-chunk guard unit test (dask+cupy e2e still blocked upstream by #3112). Scrutinised all 16 commits since 2026-06-08 (pack/unpack series #3065/#3075/#3079/#3129/#3174/#3175, VRT placement #3135, compression_level gate #3176, streaming banding #3136, dask+cupy writer order fix #3171): no other regressions; #3171's get-then-asarray order is intentional D2H for gpu=False. GPU validated on-device this pass: eager GPU unpack returns cupy with exact parity (387ms incl warmup, only 0-d scalar .get()s -- no bulk host round trip), dask+GPU unpack lazy (112 tasks/16 chunks, cupy meta, compute returns cupy, parity 0.0), GDS fast path intact without unpack (4 tasks/chunk); unpack disqualifying GDS is documented intentional. Dask CPU probe 4 tasks/chunk, 50k-task cap intact. Note: #1714 (_write_vrt_tiled synchronous scheduler) is now FIXED+CLOSED (scheduler='threads' at _writers/eager.py:1517) -- drop from the open-issue list. LOW noted (no fix): _pack does identity (data-0.0)/1.0 arithmetic allocating two full-array temporaries when scale==1/offset==0 (masked_nodata-only pack); prior deferred LOWs unchanged. SAFE/IO-bound holds. | Pass 14 (2026-06-09): MEDIUM found and fixed -- _write_streaming ran one dask .compute() per 256-row tile-row/strip, so a source chunk taller than the band re-executed once per band it overlapped (measured 2x at chunks=512, 4x at chunks=1024, whole upstream graph re-runs for computed pipelines). Filed #3117, fixed via _stream_row_bands: consecutive tile-rows/strips group into row bands sized by the source chunk-row span (one-chunk halo, #3007 accounting) under streaming_buffer_bytes; each band computes once and tiles/strips are carved from the materialised band. Wide rasters needing column segmentation keep the per-tile-row path. Post-fix per-chunk executions == 1 on the default read->write round trip. 5 new tests (TestRowBandRecompute3117 + _stream_row_bands unit); write/integration/parity suites pass (2195). LOW deferred (no fix): _read_geotiff_gpu_chunked parses header+all IFDs twice at graph build (_backends/gpu.py ~1367-1419, cap check then GDS probe; build-time only). GPU paths validated on-device this pass: eager gpu read returns cupy with parity, dask+GPU chunked read lazy (17 tasks/4 chunks) with parity; GPU writer full materialisation is documented intentional (streaming_buffer_bytes no-op). Read path keeps 50k-task graph cap; dask read probe 4 tasks/chunk. SAFE/IO-bound holds. | Pass 13 (2026-05-20): 1 MEDIUM found and fixed. _nvjpeg_batch_encode (_gpu_decode.py:~L1560) and _nvjpeg2k_batch_encode (~L2958) called cupy.cuda.Device().synchronize() inside the per-tile encode loops, a whole-device fence that blocked every CUDA stream and serialised concurrent work (e.g. predictor encodes on other streams). The decode-side counterpart _try_nvjpeg_batch_decode already used cupy.cuda.Stream.null.synchronize() at L1442; the encoder side was inconsistent. Filed #2212 and fixed both encoders to use Stream.null.synchronize(), scoping the per-tile sync to the default stream the encode/retrieve calls were issued on. nvJPEG / nvJPEG2000 encoders maintain a single shared state per encoder so encodes within a batch are inherently serial; the fix removes the device-wide blocker without changing the API ordering contract. 5 new tests in test_nvjpeg_encode_stream_sync_2212.py (AST checks that neither encoder contains Device().synchronize() inside a for-loop, that both call Stream.null.synchronize() in the loop, and that the decoder reference pattern stays pinned). All 5 new tests + 19 existing related encode/decode tests pass. nvjpeg/nvjpeg2k shared libs not present on this host so end-to-end encode verification is gated; add cuda-unavailable-libs note to re-validate on a host with the RAPIDS conda env. SAFE/IO-bound verdict holds; no change in dask graph cost. Dask probe: 2560x2560 deflate-tiled file via read_geotiff_dask(chunks=256) yields 400 tasks for 100 chunks (4 tasks/chunk), well under the 50K cap. LOW deferred (no fix in this PR): _build_ifd called twice per IFD level in _assemble_standard_layout (_writer.py:1531+1543), _assemble_cog_layout (1582+1625), and the COG overview path (2519+2546+2740) -- the first call's bytes are discarded; only the overflow byte length is used to compute pixel_data_offset. Cost is bounded by IFD count (typically 1-5 overview levels) so absolute impact is minor. Pre-existing pattern. | Pass 12 (2026-05-18): 1 MEDIUM found and fixed. _try_nvjpeg2k_batch_decode at _gpu_decode.py:~L2725-2778 allocated per-tile per-component cupy.empty buffers (N*S round-trips through the cupy memory pool) and called cupy.cuda.Device().synchronize() once per tile, forcing default-stream serialisation that defeats nvJPEG2000's internal pipelining. Filed #2107 and fixed: pre-allocate a single d_comp_pool sized n_tiles*samples*tile_height*pitch under a _check_gpu_memory guard, derive per-tile/per-component views as slab offsets, and replace the per-tile sync with a single batch-end sync. Same pattern as #1659 (_try_nvcomp_from_device_bufs), #1688 (_try_kvikio_read_tiles), #1712 (_nvcomp_batch_compress). 7 new tests in test_nvjpeg2k_single_alloc_2107.py: AST-level structural assertions confirm no cupy.empty inside the for-loop and no Device().synchronize() inside the loop, plus pool/per_tile_comp_bytes presence and _check_gpu_memory guard checks; lib-absent short-circuit; unsupported-dtype cleanup contract; cupy-only pool slab-non-overlap test (gpu-marked). libnvjpeg2k.so not present on this host so the end-to-end nvJPEG2000 decode is gated -- note added to re-validate on a host with the RAPIDS conda env. All 30 jpeg2000/compression tests + 7 new tests pass. SAFE/IO-bound verdict holds (no change in dask graph cost). Dask probe: 4096x4096 deflate-tiled file via read_geotiff_dask(chunks=512) yields 256 tasks for 64 chunks (4 tasks/chunk), well under the 50K cap. | Pass 11 (2026-05-18): 1 MEDIUM found and fixed. _read_strips (_reader.py:~L1972) and _fetch_decode_cog_http_strips (_reader.py:~L2670) decoded strips sequentially in a Python for-loop while the tile counterparts (_read_tiles L2146, _fetch_decode_cog_http_tiles L2898) gated parallel decode on _PARALLEL_DECODE_PIXEL_THRESHOLD via ThreadPoolExecutor. Filed #2100 and fixed: both strip paths now collect jobs, parallel-decode when n_strips > 1 and strip_pixels >= 64K, then place sequentially. Measured (uint16, 4-core): 4096x4096 deflate 130ms->34ms (3.82x), 8192x8192 deflate 531ms->146ms (3.63x), 8192x8192 zstd 211ms->85ms (2.48x), uncompressed 25ms->22ms (1.14x). 5 new tests in test_parallel_strip_decode_2100.py (parallel/serial parity, pool-engaged on multi-strip, serial-path for single-strip, windowed cross-strip read, HTTP COG strip parity). 3998 tests pass; 8 pre-existing failures predating this change (predictor2 BE + size_param_validation_gpu_vrt reference now-private read_to_array attr). SAFE/IO-bound verdict holds. | Pass 10 (2026-05-15): 1 new MEDIUM found and fixed; 2 LOW noted. MEDIUM (_reader.py:2737): _fetch_decode_cog_http_tiles decoded tiles sequentially in a Python for-loop after the concurrent fetch landed (issue #1480). Local _read_tiles parallelises decode whenever tile_pixels >= 64K via ThreadPoolExecutor (_reader.py:2017); the HTTP path was structurally similar but never picked up the same gate, so wide windowed reads of multi-tile COGs left deflate/zstd decode single-threaded. Mirrored the local-path threshold + pool. 5 new tests in test_cog_http_parallel_decode_2026_05_15.py (parallel + serial round-trip correctness, pool-instantiation branch selection above the threshold, single-tile path skips the pool, structural _decode_strip_or_tile call count == n_tiles). All 262 COG/HTTP tests pass; 3162 of 3164 selected geotiff tests pass overall (2 pre-existing failures predating Pass 9 per prior notes -- test_predictor2_big_endian_gpu_1517 references the now-private read_to_array attr, and the test_size_param_validation_gpu_vrt_1776 tile_size=4 validator failure). LOW deferred (no fix in this PR): (1) _block_reduce_2d_gpu (_gpu_decode.py:3142/3163/3189) does bool(mask.any().item()) per overview level when nodata is set, paying one device sync per level; the alternative (unconditional cupy.putmask) always pays the work cost and the short-circuit is correct under the current API. (2) _nvcomp_batch_compress adler32 staging (_gpu_decode.py:2543-2546) issues n_tiles slice-assign kernels into a fresh contig buffer despite all callers passing slices of a single underlying d_tile_buf; an API refactor to accept the source buffer directly would skip the rebuild. SAFE/IO-bound verdict holds. Dask probe: 2560x2560 chunks=256 yields 400 tasks (4 per chunk), well under the 50000 cap. GPU probe: 1024x1024 float32 zstd read returns CuPy-backed in 236 ms with no host round-trip. | Rockout 2026-05-15: LOW filed #1934 -- _apply_nodata_mask_gpu used cupy.where (allocating); switched to cupy.putmask on the already-owned buffer (float path) and on the post-astype float64 buffer (int path). Saves one chunk-sized device allocation per call. 7 new tests in test_apply_nodata_mask_gpu_inplace_1934.py; 52 related nodata tests pass. | Pass 8 (2026-05-12): 1 new MEDIUM found and fixed. _assemble_standard_layout/_assemble_cog_layout returned bytes(bytearray), doubling peak memory transiently during eager writes. Filed #1756, fixed by returning the bytearray directly. Measured: 95 MB uint8 raster peak drops 202 MB -> 107 MB. _write_bytes / parse_header already accepted the buffer protocol so the change is transparent to callers. 6 new tests in test_assemble_layout_no_bytes_copy_1756.py. 2123 existing geotiff tests pass; the 10 unrelated failures (test_no_georef_windowed_coords_1710, test_predictor2_big_endian_gpu_1517) reference the now-private read_to_array attribute (commit 8adb749, issue #1708) and predate this change. SAFE/IO-bound verdict holds. | Pass 7 (2026-05-12): re-audit identified 4 MEDIUM findings, all real, all backed by microbenches. (1) unpack_bits sub-byte loops for bps=2/4/12 in _compression.py:836-878 were 100-200x slower than vectorised numpy (filed #1713, fixed in this branch: bps=4 2M pixels drops from 165ms to 3ms = 55x; bps=2/12 similar). (2) _write_vrt_tiled at __init__.py:1708 uses scheduler='synchronous' on independent tile writes; measured 33% slowdown on 256-tile zstd write vs threads scheduler (filed #1714, no fix yet). (3) _nvcomp_batch_compress at _gpu_decode.py:2522-2526 still does per-tile cupy.get().tobytes() despite #1552 / #1659 fixing the same pattern elsewhere; measured 45% reduction with concat+single get on n=1024 (filed #1712, no fix yet). (4) _nvcomp_batch_compress at _gpu_decode.py:2457 uses per-tile cupy.empty allocations; 1024 tiles 16KB drops from 4.7ms to 1.0ms with single contiguous + views (bundled into #1712). Cat 6 OOM verdict: SAFE/IO-bound holds -- read_geotiff_dask caps task count at _MAX_DASK_CHUNKS=50_000 and per-chunk memory is bounded by chunk size. _inflate_tiles_kernel resource usage on Ampere: 67 regs/thread, 2896B local/thread, 8192B shared/block (LZW kernel: 29 regs, 24576B shared) -- register pressure under control; high local memory in inflate is unavoidable (LZ77 state) but only thread 0 in each block uses it. | Pass 4 (2026-05-10): re-audit after #1559 (centralise attrs across all read backends). New _populate_attrs_from_geo_info helper at __init__.py:301 runs once per read, not per-chunk -- no perf impact. Probe: 2560x2560 deflate-tiled file opened via read_geotiff_dask yields 400 tasks (4 tasks/chunk for 100 chunks), well under 1M cap. read_geotiff_gpu(1024x1024) returns cupy.ndarray end-to-end with no host round-trip (226ms incl. write+decode). No new HIGH/MEDIUM findings. SAFE/IO-bound holds. | Pass 3 (2026-05-10): SAFE/IO-bound. Audited 4 perf commits: #1558 (in-place NaN writes on uniquely-owned buffers correct), #1556 (fp-predictor ngjit ~297us/tile for 256x256 float32), #1552 (single cupy.concatenate + one .get() for batched D2H at _gpu_decode.py:870-913), #1551 (parallel decode threshold >=65536px engages 256x256 default at _reader.py:1121). Bench: 8192x8192 f32 deflate+pred2 256-tile write 782ms; 4096x4096 f32 deflate read 83ms with parallel decode. Deferred LOW (none filed, all <10% MEDIUM threshold): _writer.py:459/1109 redundant .copy() before predictor encode (~1% per tile), _compression.py:280 lzw_decompress dst[:n].copy() (~2% per LZW tile decode), _writer.py:1419 seg_np.copy() before in-place NaN substitution (negligible, conditional path), _CloudSource.read_range opens fresh fsspec handle per range (pre-existing, predates audit scope). nvCOMP per-tile D2H batching break-even confirmed (variable sizes need staging buffer, no win). | Pass 3 (2026-05-10): audited f157746,39322c3,f23ec8f,1aac3b7. All 5 commits correct. Redundant .copy() in _writer.py:459,1109 and _compression.py:280 (1-2% overhead, LOW). _CloudSource.read_range() per-call open is pre-existing arch issue. No HIGH/MEDIUM regressions. SAFE. | re-audit 2026-05-02: 6 commits since 2026-04-16 (predictor=3 CPU encode/decode, GPU predictor stride fix, validate_tile_layout, BigTIFF LONG8 offsets, AREA_OR_POINT VRT, per-tile alloc guard). 1M dask chunk cap intact at __init__.py:948; adler32 batch transfer intact at _gpu_decode.py:1825. New code is metadata validation and dispatcher logic with no extra materialization or per-tile sync points. No HIGH/MEDIUM regressions. | Pass 5 (2026-05-12): re-audit identified MEDIUM in _gpu_decode.py:1577 _try_nvcomp_from_device_bufs: per-tile cupy.empty + trailing cupy.concatenate doubled peak VRAM and added serial concat. Filed #1659 and fixed to single-buffer + pointer offsets (matches LZW/deflate/host-buffer patterns at L1847/L1878/L1114). Microbench (alloc+concat overhead only, not full nvCOMP latency): n=256 tile_bytes=65536 drops 3.66ms->0.69ms, n=256 tile_bytes=262144 drops 8.18ms->0.13ms. Tests: 5 new tests in test_nvcomp_from_device_bufs_single_alloc_1659.py (codec short-circuit, no-lib short-circuit, memory-guard contract, real ZSTD round-trip via nvCOMP, structural single-buffer check). 1458 existing geotiff tests pass, 3 unrelated matplotlib/py3.14 failures pre-existing. SAFE/IO-bound verdict holds. | Pass 6 (2026-05-12): re-audit on top of #1659. New HIGH in _try_kvikio_read_tiles at _gpu_decode.py:941: per-tile cupy.empty() + blocking IOFuture.get() inside loop serialised GDS reads to ~1 outstanding pread, missed parallelism the kvikio worker pool was designed for, paid per-tile cupy.empty setup (matches #1659 anti-pattern in nvCOMP path), and lacked _check_gpu_memory guard. Filed #1688 and fixed to single contiguous buffer + batched submit + guard. Microbench with 8-worker pool simulation: 256 tiles@1ms latency drops 256ms->38.7ms (~6.6x); single-thread simulation 256ms->28.5ms (9x). Tests: 9 new tests in test_kvikio_batched_pread_1688.py (kvikio-absent path, single-buffer pointer arithmetic, submit-before-get ordering, memory guard, partial-read fallback, round-trip data, zero-size/all-sparse tiles). All 1577 geotiff tests pass except pre-existing matplotlib/py3.14 failures."
+geotiff,2026-07-01,SAFE,IO-bound,0,3597,"Pass 16 (2026-07-01): 1 MEDIUM found and fixed, 0 HIGH. to_geotiff(dask, color_ramp=...) executed the source graph twice: streaming write computed every chunk, then _write_sidecars -> _finite_stats ran dask.compute over the same source for the PAM/QML statistics (measured 32 chunk executions for a 16-chunk source; color_ramp_range escape hatch stays at 16). Filed #3597, fixed by threading a chunk_observer through _write_streaming's three materialisation sites (row band, wide-raster segment, strip band) into a new StreamingStats accumulator (_symbology.py; Chan mean/M2 combine, float64 accumulators, ddof=0, nodata/finite exclusion matches _finite_stats; all-valid buffers skip the boolean-index copy and reductions use dtype=float64 on the original buffer to avoid an astype copy). Post-fix executions 16/16; round-trip bench 8192x8192 f32 deflate write+stats 1.53s -> 1.44s and total source reads halve (the real win is expensive-to-recompute sources: HTTP COGs, cold storage, long pipelines). GPU writer and VRT paths keep the documented second pass (GPU writer fully materialises anyway); docstrings updated. 13 new tests in test_color_ramp_single_pass_3597.py incl. execution counters (row-band, strip, segmented wide path with full-width chunks), nodata/int/all-NaN/multiband gating, accumulator-vs-_finite_stats parity, and a dask+cupy gpu=False streaming leg (run on-device). Audited all 26 geotiff commits since 2026-06-11: pack range guards #3272/#3277 correctly defer per-chunk scans into the write's single compute (no #3235 regression); #3374 fixed Pass 14's deferred LOW (chunked GPU read now parses header/IFDs once); xarray engine #3375/#3377/#3380 is a thin wrapper (no eager compute); PAM sidecar read on open_geotiff is a local os.path.exists probe (no HTTP cost); _CloudSource cat_file change is neutral-to-better. Probes on-device this pass: dask read 400 tasks/100 chunks (4/chunk, cap intact), eager GPU read cupy parity 0.0 (222ms), dask+GPU lazy with cupy meta, symbology _finite_stats GPU/CPU parity 0.0. LOWs unchanged from prior passes. SAFE/IO-bound holds. | Pass 15 (2026-06-11): 1 MEDIUM found and fixed. _pack (_attrs.py:~1795) guarded the no-sentinel integer restore with an eager bool(out.isnull().any()), which executed the whole upstream dask graph at to_geotiff(pack=True) call time; the streaming writer then executed it again, so every source chunk computed twice (measured 32 decode-task executions for 16 chunks on a 512x512 int16 SCALE/OFFSET no-GDAL_NODATA source; 71->33 total task starts post-fix). Filed #3235, fixed by mapping a per-chunk NaN guard (_pack_guard_no_nan) into the graph for dask-backed data (raises from the write's single compute; numpy keeps the eager call-time check; meta= preserves cupy backing). 9 new tests in test_pack_lazy_nan_guard_3235.py incl. fusion-proof execution counter and cupy-chunk guard unit test (dask+cupy e2e still blocked upstream by #3112). Scrutinised all 16 commits since 2026-06-08 (pack/unpack series #3065/#3075/#3079/#3129/#3174/#3175, VRT placement #3135, compression_level gate #3176, streaming banding #3136, dask+cupy writer order fix #3171): no other regressions; #3171's get-then-asarray order is intentional D2H for gpu=False. GPU validated on-device this pass: eager GPU unpack returns cupy with exact parity (387ms incl warmup, only 0-d scalar .get()s -- no bulk host round trip), dask+GPU unpack lazy (112 tasks/16 chunks, cupy meta, compute returns cupy, parity 0.0), GDS fast path intact without unpack (4 tasks/chunk); unpack disqualifying GDS is documented intentional. Dask CPU probe 4 tasks/chunk, 50k-task cap intact. Note: #1714 (_write_vrt_tiled synchronous scheduler) is now FIXED+CLOSED (scheduler='threads' at _writers/eager.py:1517) -- drop from the open-issue list. LOW noted (no fix): _pack does identity (data-0.0)/1.0 arithmetic allocating two full-array temporaries when scale==1/offset==0 (masked_nodata-only pack); prior deferred LOWs unchanged. SAFE/IO-bound holds. | Pass 14 (2026-06-09): MEDIUM found and fixed -- _write_streaming ran one dask .compute() per 256-row tile-row/strip, so a source chunk taller than the band re-executed once per band it overlapped (measured 2x at chunks=512, 4x at chunks=1024, whole upstream graph re-runs for computed pipelines). Filed #3117, fixed via _stream_row_bands: consecutive tile-rows/strips group into row bands sized by the source chunk-row span (one-chunk halo, #3007 accounting) under streaming_buffer_bytes; each band computes once and tiles/strips are carved from the materialised band. Wide rasters needing column segmentation keep the per-tile-row path. Post-fix per-chunk executions == 1 on the default read->write round trip. 5 new tests (TestRowBandRecompute3117 + _stream_row_bands unit); write/integration/parity suites pass (2195). LOW deferred (no fix): _read_geotiff_gpu_chunked parses header+all IFDs twice at graph build (_backends/gpu.py ~1367-1419, cap check then GDS probe; build-time only). GPU paths validated on-device this pass: eager gpu read returns cupy with parity, dask+GPU chunked read lazy (17 tasks/4 chunks) with parity; GPU writer full materialisation is documented intentional (streaming_buffer_bytes no-op). Read path keeps 50k-task graph cap; dask read probe 4 tasks/chunk. SAFE/IO-bound holds. | Pass 13 (2026-05-20): 1 MEDIUM found and fixed. _nvjpeg_batch_encode (_gpu_decode.py:~L1560) and _nvjpeg2k_batch_encode (~L2958) called cupy.cuda.Device().synchronize() inside the per-tile encode loops, a whole-device fence that blocked every CUDA stream and serialised concurrent work (e.g. predictor encodes on other streams). The decode-side counterpart _try_nvjpeg_batch_decode already used cupy.cuda.Stream.null.synchronize() at L1442; the encoder side was inconsistent. Filed #2212 and fixed both encoders to use Stream.null.synchronize(), scoping the per-tile sync to the default stream the encode/retrieve calls were issued on. nvJPEG / nvJPEG2000 encoders maintain a single shared state per encoder so encodes within a batch are inherently serial; the fix removes the device-wide blocker without changing the API ordering contract. 5 new tests in test_nvjpeg_encode_stream_sync_2212.py (AST checks that neither encoder contains Device().synchronize() inside a for-loop, that both call Stream.null.synchronize() in the loop, and that the decoder reference pattern stays pinned). All 5 new tests + 19 existing related encode/decode tests pass. nvjpeg/nvjpeg2k shared libs not present on this host so end-to-end encode verification is gated; add cuda-unavailable-libs note to re-validate on a host with the RAPIDS conda env. SAFE/IO-bound verdict holds; no change in dask graph cost. Dask probe: 2560x2560 deflate-tiled file via read_geotiff_dask(chunks=256) yields 400 tasks for 100 chunks (4 tasks/chunk), well under the 50K cap. LOW deferred (no fix in this PR): _build_ifd called twice per IFD level in _assemble_standard_layout (_writer.py:1531+1543), _assemble_cog_layout (1582+1625), and the COG overview path (2519+2546+2740) -- the first call's bytes are discarded; only the overflow byte length is used to compute pixel_data_offset. Cost is bounded by IFD count (typically 1-5 overview levels) so absolute impact is minor. Pre-existing pattern. | Pass 12 (2026-05-18): 1 MEDIUM found and fixed. _try_nvjpeg2k_batch_decode at _gpu_decode.py:~L2725-2778 allocated per-tile per-component cupy.empty buffers (N*S round-trips through the cupy memory pool) and called cupy.cuda.Device().synchronize() once per tile, forcing default-stream serialisation that defeats nvJPEG2000's internal pipelining. Filed #2107 and fixed: pre-allocate a single d_comp_pool sized n_tiles*samples*tile_height*pitch under a _check_gpu_memory guard, derive per-tile/per-component views as slab offsets, and replace the per-tile sync with a single batch-end sync. Same pattern as #1659 (_try_nvcomp_from_device_bufs), #1688 (_try_kvikio_read_tiles), #1712 (_nvcomp_batch_compress). 7 new tests in test_nvjpeg2k_single_alloc_2107.py: AST-level structural assertions confirm no cupy.empty inside the for-loop and no Device().synchronize() inside the loop, plus pool/per_tile_comp_bytes presence and _check_gpu_memory guard checks; lib-absent short-circuit; unsupported-dtype cleanup contract; cupy-only pool slab-non-overlap test (gpu-marked). libnvjpeg2k.so not present on this host so the end-to-end nvJPEG2000 decode is gated -- note added to re-validate on a host with the RAPIDS conda env. All 30 jpeg2000/compression tests + 7 new tests pass. SAFE/IO-bound verdict holds (no change in dask graph cost). Dask probe: 4096x4096 deflate-tiled file via read_geotiff_dask(chunks=512) yields 256 tasks for 64 chunks (4 tasks/chunk), well under the 50K cap. | Pass 11 (2026-05-18): 1 MEDIUM found and fixed. _read_strips (_reader.py:~L1972) and _fetch_decode_cog_http_strips (_reader.py:~L2670) decoded strips sequentially in a Python for-loop while the tile counterparts (_read_tiles L2146, _fetch_decode_cog_http_tiles L2898) gated parallel decode on _PARALLEL_DECODE_PIXEL_THRESHOLD via ThreadPoolExecutor. Filed #2100 and fixed: both strip paths now collect jobs, parallel-decode when n_strips > 1 and strip_pixels >= 64K, then place sequentially. Measured (uint16, 4-core): 4096x4096 deflate 130ms->34ms (3.82x), 8192x8192 deflate 531ms->146ms (3.63x), 8192x8192 zstd 211ms->85ms (2.48x), uncompressed 25ms->22ms (1.14x). 5 new tests in test_parallel_strip_decode_2100.py (parallel/serial parity, pool-engaged on multi-strip, serial-path for single-strip, windowed cross-strip read, HTTP COG strip parity). 3998 tests pass; 8 pre-existing failures predating this change (predictor2 BE + size_param_validation_gpu_vrt reference now-private read_to_array attr). SAFE/IO-bound verdict holds. | Pass 10 (2026-05-15): 1 new MEDIUM found and fixed; 2 LOW noted. MEDIUM (_reader.py:2737): _fetch_decode_cog_http_tiles decoded tiles sequentially in a Python for-loop after the concurrent fetch landed (issue #1480). Local _read_tiles parallelises decode whenever tile_pixels >= 64K via ThreadPoolExecutor (_reader.py:2017); the HTTP path was structurally similar but never picked up the same gate, so wide windowed reads of multi-tile COGs left deflate/zstd decode single-threaded. Mirrored the local-path threshold + pool. 5 new tests in test_cog_http_parallel_decode_2026_05_15.py (parallel + serial round-trip correctness, pool-instantiation branch selection above the threshold, single-tile path skips the pool, structural _decode_strip_or_tile call count == n_tiles). All 262 COG/HTTP tests pass; 3162 of 3164 selected geotiff tests pass overall (2 pre-existing failures predating Pass 9 per prior notes -- test_predictor2_big_endian_gpu_1517 references the now-private read_to_array attr, and the test_size_param_validation_gpu_vrt_1776 tile_size=4 validator failure). LOW deferred (no fix in this PR): (1) _block_reduce_2d_gpu (_gpu_decode.py:3142/3163/3189) does bool(mask.any().item()) per overview level when nodata is set, paying one device sync per level; the alternative (unconditional cupy.putmask) always pays the work cost and the short-circuit is correct under the current API. (2) _nvcomp_batch_compress adler32 staging (_gpu_decode.py:2543-2546) issues n_tiles slice-assign kernels into a fresh contig buffer despite all callers passing slices of a single underlying d_tile_buf; an API refactor to accept the source buffer directly would skip the rebuild. SAFE/IO-bound verdict holds. Dask probe: 2560x2560 chunks=256 yields 400 tasks (4 per chunk), well under the 50000 cap. GPU probe: 1024x1024 float32 zstd read returns CuPy-backed in 236 ms with no host round-trip. | Rockout 2026-05-15: LOW filed #1934 -- _apply_nodata_mask_gpu used cupy.where (allocating); switched to cupy.putmask on the already-owned buffer (float path) and on the post-astype float64 buffer (int path). Saves one chunk-sized device allocation per call. 7 new tests in test_apply_nodata_mask_gpu_inplace_1934.py; 52 related nodata tests pass. | Pass 8 (2026-05-12): 1 new MEDIUM found and fixed. _assemble_standard_layout/_assemble_cog_layout returned bytes(bytearray), doubling peak memory transiently during eager writes. Filed #1756, fixed by returning the bytearray directly. Measured: 95 MB uint8 raster peak drops 202 MB -> 107 MB. _write_bytes / parse_header already accepted the buffer protocol so the change is transparent to callers. 6 new tests in test_assemble_layout_no_bytes_copy_1756.py. 2123 existing geotiff tests pass; the 10 unrelated failures (test_no_georef_windowed_coords_1710, test_predictor2_big_endian_gpu_1517) reference the now-private read_to_array attribute (commit 8adb749, issue #1708) and predate this change. SAFE/IO-bound verdict holds. | Pass 7 (2026-05-12): re-audit identified 4 MEDIUM findings, all real, all backed by microbenches. (1) unpack_bits sub-byte loops for bps=2/4/12 in _compression.py:836-878 were 100-200x slower than vectorised numpy (filed #1713, fixed in this branch: bps=4 2M pixels drops from 165ms to 3ms = 55x; bps=2/12 similar). (2) _write_vrt_tiled at __init__.py:1708 uses scheduler='synchronous' on independent tile writes; measured 33% slowdown on 256-tile zstd write vs threads scheduler (filed #1714, no fix yet). (3) _nvcomp_batch_compress at _gpu_decode.py:2522-2526 still does per-tile cupy.get().tobytes() despite #1552 / #1659 fixing the same pattern elsewhere; measured 45% reduction with concat+single get on n=1024 (filed #1712, no fix yet). (4) _nvcomp_batch_compress at _gpu_decode.py:2457 uses per-tile cupy.empty allocations; 1024 tiles 16KB drops from 4.7ms to 1.0ms with single contiguous + views (bundled into #1712). Cat 6 OOM verdict: SAFE/IO-bound holds -- read_geotiff_dask caps task count at _MAX_DASK_CHUNKS=50_000 and per-chunk memory is bounded by chunk size. _inflate_tiles_kernel resource usage on Ampere: 67 regs/thread, 2896B local/thread, 8192B shared/block (LZW kernel: 29 regs, 24576B shared) -- register pressure under control; high local memory in inflate is unavoidable (LZ77 state) but only thread 0 in each block uses it. | Pass 4 (2026-05-10): re-audit after #1559 (centralise attrs across all read backends). New _populate_attrs_from_geo_info helper at __init__.py:301 runs once per read, not per-chunk -- no perf impact. Probe: 2560x2560 deflate-tiled file opened via read_geotiff_dask yields 400 tasks (4 tasks/chunk for 100 chunks), well under 1M cap. read_geotiff_gpu(1024x1024) returns cupy.ndarray end-to-end with no host round-trip (226ms incl. write+decode). No new HIGH/MEDIUM findings. SAFE/IO-bound holds. | Pass 3 (2026-05-10): SAFE/IO-bound. Audited 4 perf commits: #1558 (in-place NaN writes on uniquely-owned buffers correct), #1556 (fp-predictor ngjit ~297us/tile for 256x256 float32), #1552 (single cupy.concatenate + one .get() for batched D2H at _gpu_decode.py:870-913), #1551 (parallel decode threshold >=65536px engages 256x256 default at _reader.py:1121). Bench: 8192x8192 f32 deflate+pred2 256-tile write 782ms; 4096x4096 f32 deflate read 83ms with parallel decode. Deferred LOW (none filed, all <10% MEDIUM threshold): _writer.py:459/1109 redundant .copy() before predictor encode (~1% per tile), _compression.py:280 lzw_decompress dst[:n].copy() (~2% per LZW tile decode), _writer.py:1419 seg_np.copy() before in-place NaN substitution (negligible, conditional path), _CloudSource.read_range opens fresh fsspec handle per range (pre-existing, predates audit scope). nvCOMP per-tile D2H batching break-even confirmed (variable sizes need staging buffer, no win). | Pass 3 (2026-05-10): audited f157746,39322c3,f23ec8f,1aac3b7. All 5 commits correct. Redundant .copy() in _writer.py:459,1109 and _compression.py:280 (1-2% overhead, LOW). _CloudSource.read_range() per-call open is pre-existing arch issue. No HIGH/MEDIUM regressions. SAFE. | re-audit 2026-05-02: 6 commits since 2026-04-16 (predictor=3 CPU encode/decode, GPU predictor stride fix, validate_tile_layout, BigTIFF LONG8 offsets, AREA_OR_POINT VRT, per-tile alloc guard). 1M dask chunk cap intact at __init__.py:948; adler32 batch transfer intact at _gpu_decode.py:1825. New code is metadata validation and dispatcher logic with no extra materialization or per-tile sync points. No HIGH/MEDIUM regressions. | Pass 5 (2026-05-12): re-audit identified MEDIUM in _gpu_decode.py:1577 _try_nvcomp_from_device_bufs: per-tile cupy.empty + trailing cupy.concatenate doubled peak VRAM and added serial concat. Filed #1659 and fixed to single-buffer + pointer offsets (matches LZW/deflate/host-buffer patterns at L1847/L1878/L1114). Microbench (alloc+concat overhead only, not full nvCOMP latency): n=256 tile_bytes=65536 drops 3.66ms->0.69ms, n=256 tile_bytes=262144 drops 8.18ms->0.13ms. Tests: 5 new tests in test_nvcomp_from_device_bufs_single_alloc_1659.py (codec short-circuit, no-lib short-circuit, memory-guard contract, real ZSTD round-trip via nvCOMP, structural single-buffer check). 1458 existing geotiff tests pass, 3 unrelated matplotlib/py3.14 failures pre-existing. SAFE/IO-bound verdict holds. | Pass 6 (2026-05-12): re-audit on top of #1659. New HIGH in _try_kvikio_read_tiles at _gpu_decode.py:941: per-tile cupy.empty() + blocking IOFuture.get() inside loop serialised GDS reads to ~1 outstanding pread, missed parallelism the kvikio worker pool was designed for, paid per-tile cupy.empty setup (matches #1659 anti-pattern in nvCOMP path), and lacked _check_gpu_memory guard. Filed #1688 and fixed to single contiguous buffer + batched submit + guard. Microbench with 8-worker pool simulation: 256 tiles@1ms latency drops 256ms->38.7ms (~6.6x); single-thread simulation 256ms->28.5ms (9x). Tests: 9 new tests in test_kvikio_batched_pread_1688.py (kvikio-absent path, single-buffer pointer arithmetic, submit-before-get ordering, memory guard, partial-read fallback, round-trip data, zero-size/all-sparse tiles). All 1577 geotiff tests pass except pre-existing matplotlib/py3.14 failures."
 glcm,2026-03-31T18:00:00Z,SAFE,compute-bound,0,,"Downgraded to MEDIUM. da.stack without rechunk is scheduling overhead, not OOM risk."
 hillshade,2026-04-16T12:00:00Z,SAFE,compute-bound,0,,"Re-audit after Horn's method rewrite (PR 1175): clean stencil, map_overlap depth=(1,1), no materialization. Zero findings."
 hydro,2026-05-01,RISKY,memory-bound,0,1416,"Fixed-in-tree 2026-05-01: hand_mfd._hand_mfd_dask now assembles via da.map_blocks instead of eager da.block of pre-computed tiles (matches hand_dinf pattern). Remaining MEDIUM: sink_d8 CCL fully materializes labels (inherently global), flow_accumulation_mfd frac_bdry held in driver dict instead of memmap-backed BoundaryStore. D8 iterative paths (flow_accum/fill/watershed/basin/stream_*) use serial-tile sweep with memmap-backed boundary store -- per-tile RAM bounded but driver iterates O(diameter) times. flow_direction_*, flow_path/snap_pour_point/twi/hand_d8/hand_dinf are SAFE."
diff --git a/xrspatial/geotiff/_symbology.py b/xrspatial/geotiff/_symbology.py
index 9fe11dcaa..4b10c3b02 100644
--- a/xrspatial/geotiff/_symbology.py
+++ b/xrspatial/geotiff/_symbology.py
@@ -230,7 +230,76 @@ def _dask_finite_stats(arr, nodata):
             utils._to_float_scalar(mean), utils._to_float_scalar(std))
 
 
-def write_symbology_sidecars(path, data, *, stops, nodata=None, ramp_range=None):
+class StreamingStats:
+    """One-pass ``(min, max, mean, stddev)`` accumulator over streamed buffers.
+
+    The streaming dask writer already materialises every pixel exactly once
+    (issue #3597); feeding those buffers through :meth:`update` yields the
+    same statistics as :func:`_finite_stats` without a second execution of
+    the source graph. Matches ``_finite_stats`` semantics: non-finite values
+    and the *nodata* sentinel (unless it is NaN) are excluded, and the
+    stddev is the population stddev (``ddof=0``).
+
+    Mean and variance combine across buffers with Chan's parallel formula,
+    accumulated in float64, so the result is stable regardless of how the
+    writer bands / segments the raster.
+    """
+
+    def __init__(self, nodata=None):
+        if isinstance(nodata, float) and math.isnan(nodata):
+            nodata = None
+        self._nodata = nodata
+        self._count = 0
+        self._mean = 0.0
+        self._m2 = 0.0
+        self._min = math.inf
+        self._max = -math.inf
+
+    def update(self, buf):
+        """Fold one materialised numpy buffer into the running statistics."""
+        if buf.dtype.kind == 'f':
+            mask = np.isfinite(buf)
+            if self._nodata is not None:
+                mask &= (buf != self._nodata)
+            # ``buf[mask]`` copies; skip it for the common all-valid buffer.
+            vals = buf.ravel() if bool(mask.all()) else buf[mask]
+        elif self._nodata is not None:
+            mask = buf != self._nodata
+            vals = buf.ravel() if bool(mask.all()) else buf[mask]
+        else:
+            vals = buf.ravel()
+        # Plain ``int`` so the running count (and the moment arithmetic
+        # built on it) stays in unbounded Python ints, not numpy scalars.
+        n_b = int(vals.size)
+        if n_b == 0:
+            return
+        # Reductions with float64 accumulators on the original buffer:
+        # an ``astype(float64)`` copy here would double the memory
+        # traffic of every update for no precision gain.
+        mean_b = float(vals.mean(dtype=np.float64))
+        m2_b = float(vals.var(dtype=np.float64)) * n_b
+        self._min = min(self._min, float(vals.min()))
+        self._max = max(self._max, float(vals.max()))
+        if self._count == 0:
+            self._count, self._mean, self._m2 = n_b, mean_b, m2_b
+        else:
+            total = self._count + n_b
+            delta = mean_b - self._mean
+            self._m2 += m2_b + delta * delta * self._count * n_b / total
+            self._mean += delta * n_b / total
+            self._count = total
+
+    def result(self):
+        """Return ``(min, max, mean, stddev)``, or ``None`` if nothing valid
+        was accumulated -- the same contract as :func:`_finite_stats`."""
+        if self._count == 0:
+            return None
+        std = math.sqrt(max(self._m2 / self._count, 0.0))
+        return (self._min, self._max, self._mean, std)
+
+
+def write_symbology_sidecars(path, data, *, stops, nodata=None,
+                             ramp_range=None, stats=None):
     """Write continuous-raster symbology sidecars next to *path*.
 
     Writes band statistics into the PAM ``.aux.xml`` and, when the data range
@@ -241,6 +310,11 @@ def write_symbology_sidecars(path, data, *, stops, nodata=None, ramp_range=None)
     directly and skips the statistics reduction -- the escape hatch for large
     dask graphs -- so only ``STATISTICS_MINIMUM`` / ``STATISTICS_MAXIMUM`` are
     written (mean/stddev would need the pass it avoids).
+
+    *stats* is an optional :class:`StreamingStats` that already accumulated
+    the statistics during the write (the streaming dask path, issue #3597);
+    when given, its result replaces the :func:`_finite_stats` reduction so
+    the source graph is not executed a second time.
     """
     from . import _pam
 
@@ -249,20 +323,21 @@ def write_symbology_sidecars(path, data, *, stops, nodata=None, ramp_range=None)
 
     if ramp_range is not None:
         vmin, vmax = float(ramp_range[0]), float(ramp_range[1])
-        stats = {'STATISTICS_MINIMUM': vmin, 'STATISTICS_MAXIMUM': vmax}
+        stats_dict = {'STATISTICS_MINIMUM': vmin, 'STATISTICS_MAXIMUM': vmax}
     else:
-        result = _finite_stats(data, nodata)
+        result = (stats.result() if stats is not None
+                  else _finite_stats(data, nodata))
         if result is None:
             return
         vmin, vmax, vmean, vstd = result
-        stats = {
+        stats_dict = {
             'STATISTICS_MINIMUM': vmin,
             'STATISTICS_MAXIMUM': vmax,
             'STATISTICS_MEAN': vmean,
             'STATISTICS_STDDEV': vstd,
         }
 
-    _pam.write_stats_pam_sidecar(path, stats)
+    _pam.write_stats_pam_sidecar(path, stats_dict)
 
     # A constant raster (vmin == vmax) has no range to ramp across, so the
     # stats stretch is the useful part and the QML would be a degenerate
diff --git a/xrspatial/geotiff/_writer.py b/xrspatial/geotiff/_writer.py
index aae2b1d94..63c15281f 100644
--- a/xrspatial/geotiff/_writer.py
+++ b/xrspatial/geotiff/_writer.py
@@ -779,7 +779,8 @@ def _write_streaming(dask_data, path: str, *,
                      photometric='auto',
                      restore_sentinel: bool = True,
                      allow_internal_only_jpeg: bool = False,
-                     allow_unparseable_crs: bool = False) -> None:
+                     allow_unparseable_crs: bool = False,
+                     chunk_observer=None) -> None:
     """Write a dask array as a GeoTIFF by streaming pixel data.
 
     For tiled output, each tile-row is computed in horizontal segments
@@ -818,6 +819,15 @@ def _write_streaming(dask_data, path: str, *,
     allow_unparseable_crs : bool
         Opt in to writing an unparseable ``crs_wkt`` string into
         ``GTCitationGeoKey``. Default ``False``.
+    chunk_observer : callable, optional
+        Called once with every buffer the streaming write materialises
+        (a numpy array of logical values: after the dask compute and any
+        CuPy transfer, before the output-dtype cast and the
+        NaN-to-sentinel restore). The buffers partition the raster, so
+        an observer sees each pixel exactly once -- ``to_geotiff`` uses
+        this to accumulate the ``color_ramp`` symbology statistics
+        during the write instead of executing the source graph a second
+        time (issue #3597).
 
     Notes
     -----
@@ -1215,6 +1225,8 @@ def _write_streaming(dask_data, path: str, *,
                             if hasattr(band_np, 'get'):
                                 band_np = band_np.get()  # CuPy -> numpy
                             band_np = np.asarray(band_np)
+                            if chunk_observer is not None:
+                                chunk_observer(band_np)
 
                             if band_np.dtype != out_dtype:
                                 band_np = band_np.astype(out_dtype)
@@ -1263,6 +1275,8 @@ def _write_streaming(dask_data, path: str, *,
                                     if hasattr(seg_np, 'get'):
                                         seg_np = seg_np.get()  # CuPy -> numpy
                                     seg_np = np.asarray(seg_np)
+                                    if chunk_observer is not None:
+                                        chunk_observer(seg_np)
 
                                     if seg_np.dtype != out_dtype:
                                         seg_np = seg_np.astype(out_dtype)
@@ -1381,6 +1395,8 @@ def _write_streaming(dask_data, path: str, *,
                     if hasattr(band_np, 'get'):
                         band_np = band_np.get()  # CuPy -> numpy
                     band_np = np.asarray(band_np)
+                    if chunk_observer is not None:
+                        chunk_observer(band_np)
 
                     if band_np.dtype != out_dtype:
                         band_np = band_np.astype(out_dtype)
diff --git a/xrspatial/geotiff/_writers/eager.py b/xrspatial/geotiff/_writers/eager.py
index dfa148cbe..2dcf4a845 100644
--- a/xrspatial/geotiff/_writers/eager.py
+++ b/xrspatial/geotiff/_writers/eager.py
@@ -414,17 +414,20 @@ def to_geotiff(data: xr.DataArray | np.ndarray,
         ``<file>.aux.xml``. No-op for a categorical raster (one with
         ``attrs['category_names']`` -- those get the RAT sidecar instead), a
         multiband array, a file-like destination, or data with no finite
-        values. Computing the statistics is a separate reduction pass over the
-        data; for a dask source that means reading the graph once more (see
-        ``color_ramp_range`` to skip it). Ignored when ``pack=True``, whose
-        on-disk packed values would not match a ramp built from the logical
-        values.
+        values. Computing the statistics is an extra reduction pass over the
+        data. The streaming dask write accumulates them from the buffers it
+        materialises anyway, so the source graph still runs once; the GPU
+        (``gpu=True``) and VRT (``.vrt``) write paths execute a dask source
+        a second time for the statistics (see ``color_ramp_range`` to skip
+        that). Ignored when ``pack=True``, whose on-disk packed values would
+        not match a ramp built from the logical values.
     color_ramp_range : tuple of (float, float) or None, default None
         [advanced] Explicit ``(min, max)`` for the ``color_ramp`` stretch.
-        Skips the statistics reduction -- useful for large dask graphs -- so
-        only ``STATISTICS_MINIMUM`` / ``STATISTICS_MAXIMUM`` are written
-        (mean/stddev need the pass it avoids). Ignored when ``color_ramp`` is
-        not set.
+        Skips the statistics reduction -- useful for a dask source on the
+        GPU or VRT write paths, which would otherwise read the graph once
+        more -- so only ``STATISTICS_MINIMUM`` / ``STATISTICS_MAXIMUM`` are
+        written (mean/stddev need the pass it avoids). Ignored when
+        ``color_ramp`` is not set.
 
     Returns
     -------
@@ -491,6 +494,7 @@ def to_geotiff(data: xr.DataArray | np.ndarray,
     _sym_data = None
     _sym_stops = None
     _sym_nodata = None
+    _sym_stream_stats = None
     if isinstance(path, str) and isinstance(data, xr.DataArray):
         _cat_names = data.attrs.get('category_names')
         _cat_colors = data.attrs.get('category_colors')
@@ -515,7 +519,8 @@ def _write_sidecars():
             from .._symbology import write_symbology_sidecars
             write_symbology_sidecars(
                 path, _sym_data, stops=_sym_stops,
-                nodata=_sym_nodata, ramp_range=color_ramp_range)
+                nodata=_sym_nodata, ramp_range=color_ramp_range,
+                stats=_sym_stream_stats)
 
     # Reject bool / np.bool_ nodata up front. ``bool`` is a subclass of
     # ``int`` in Python, so a typo like ``nodata=True`` slips past every
@@ -1097,6 +1102,18 @@ def _write_sidecars():
             # branch needs the guard.
             if epsg is None:
                 _validate_crs_fallback(wkt_fallback, allow_unparseable_crs)
+            # ``color_ramp`` statistics: rather than re-executing the
+            # source graph after the write (a second full read of the
+            # data, issue #3597), accumulate them from the buffers the
+            # streaming write materialises anyway. ``color_ramp_range``
+            # already skips statistics, and multiband data never gets
+            # symbology, so neither pays the accumulation cost.
+            _stream_chunk_observer = None
+            if _sym_stops is not None and color_ramp_range is None:
+                from .._symbology import StreamingStats, _is_single_band
+                if _is_single_band(data):
+                    _sym_stream_stats = StreamingStats(nodata=_sym_nodata)
+                    _stream_chunk_observer = _sym_stream_stats.update
             write_streaming(
                 dask_arr, path,
                 geo_transform=geo_transform,
@@ -1125,6 +1142,7 @@ def _write_sidecars():
                 # rather than rejecting input the wrapper accepted.
                 allow_internal_only_jpeg=allow_internal_only_jpeg,
                 allow_unparseable_crs=allow_unparseable_crs,
+                chunk_observer=_stream_chunk_observer,
             )
             _write_sidecars()
             return path
diff --git a/xrspatial/geotiff/tests/write/test_color_ramp_single_pass_3597.py b/xrspatial/geotiff/tests/write/test_color_ramp_single_pass_3597.py
new file mode 100644
index 000000000..d1f708ab7
--- /dev/null
+++ b/xrspatial/geotiff/tests/write/test_color_ramp_single_pass_3597.py
@@ -0,0 +1,258 @@
+"""``color_ramp`` statistics ride along with the streaming dask write (#3597).
+
+``to_geotiff(dask_data, path, color_ramp=...)`` used to execute the source
+graph twice: once in ``_write_streaming`` for the pixels and once more in
+``_finite_stats`` for the sidecar statistics. The fix threads a
+``chunk_observer`` through the streaming writer so a ``StreamingStats``
+accumulator folds in every buffer the write materialises anyway. These tests
+pin the single-execution behaviour with a counting ``map_blocks`` layer and
+check the accumulated statistics against ``_finite_stats`` / plain numpy on
+the row-band, segmented wide-raster, and strip paths.
+"""
+import os
+import threading
+import xml.etree.ElementTree as ET
+
+import numpy as np
+import pytest
+import xarray as xr
+
+from xrspatial.geotiff import to_geotiff
+from xrspatial.geotiff._symbology import StreamingStats, _finite_stats
+
+from .._helpers.markers import requires_gpu
+
+pytest.importorskip("tifffile")
+
+
+def _counting_da(base, chunks):
+    """Dask-backed DataArray whose chunks count their own executions."""
+    import dask.array as dsa
+
+    counter = {"n": 0}
+    lock = threading.Lock()
+
+    def _count(block):
+        if block.size:  # skip dask's zero-size meta-inference call
+            with lock:
+                counter["n"] += 1
+        return block
+
+    arr = dsa.from_array(base, chunks=chunks).map_blocks(
+        _count, dtype=base.dtype)
+    n = base.shape[0]
+    da = xr.DataArray(
+        arr, dims=("y", "x"),
+        coords={"y": np.arange(n, dtype="float64"),
+                "x": np.arange(base.shape[1], dtype="float64")},
+        attrs={"crs": 4326},
+    )
+    return da, counter
+
+
+def _aux_stats(path):
+    """Parse ``<path>.aux.xml`` into a ``{STATISTICS_*: float}`` dict."""
+    band = ET.parse(path + ".aux.xml").getroot().find(".//PAMRasterBand")
+    return {mdi.get("key"): float(mdi.text)
+            for mdi in band.findall("./Metadata/MDI")}
+
+
+def _ref_stats(arr, nodata=None):
+    """(min, max, mean, population std) over finite non-nodata values."""
+    a = np.asarray(arr, dtype="float64")
+    mask = np.isfinite(a)
+    if nodata is not None:
+        mask &= (a != nodata)
+    v = a[mask]
+    return float(v.min()), float(v.max()), float(v.mean()), float(v.std())
+
+
+def _assert_aux_matches(path, ref):
+    stats = _aux_stats(path)
+    assert stats["STATISTICS_MINIMUM"] == pytest.approx(ref[0], rel=1e-9)
+    assert stats["STATISTICS_MAXIMUM"] == pytest.approx(ref[1], rel=1e-9)
+    assert stats["STATISTICS_MEAN"] == pytest.approx(ref[2], rel=1e-9)
+    assert stats["STATISTICS_STDDEV"] == pytest.approx(ref[3], rel=1e-9)
+
+
+_RNG = np.random.default_rng(3597)
+_BASE = _RNG.uniform(-50.0, 150.0, (64, 64))
+_BASE[3, 7] = np.nan
+_BASE[40, 2] = np.nan
+
+
+# --------------------------------------------------------------------------
+# single execution of the source graph
+# --------------------------------------------------------------------------
+
+def test_streaming_color_ramp_executes_source_once(tmp_path):
+    da, counter = _counting_da(_BASE, chunks=(16, 16))
+    path = str(tmp_path / "once_3597.tif")
+    to_geotiff(da, path, color_ramp="viridis")
+    assert counter["n"] == 16  # was 32 before the fix
+    _assert_aux_matches(path, _ref_stats(_BASE))
+    assert os.path.exists(str(tmp_path / "once_3597.qml"))
+
+
+def test_streaming_strip_path_executes_source_once(tmp_path):
+    da, counter = _counting_da(_BASE, chunks=(16, 16))
+    path = str(tmp_path / "strip_3597.tif")
+    to_geotiff(da, path, tiled=False, color_ramp="viridis")
+    assert counter["n"] == 16
+    _assert_aux_matches(path, _ref_stats(_BASE))
+
+
+def test_color_ramp_range_still_skips_stats(tmp_path):
+    da, counter = _counting_da(_BASE, chunks=(16, 16))
+    path = str(tmp_path / "rng_3597.tif")
+    to_geotiff(da, path, color_ramp="viridis", color_ramp_range=(0.0, 10.0))
+    assert counter["n"] == 16
+    aux = open(path + ".aux.xml").read()
+    assert "STATISTICS_MINIMUM" in aux and "STATISTICS_MEAN" not in aux
+
+
+# --------------------------------------------------------------------------
+# the segmented wide-raster path partitions pixels even when chunks recompute
+# --------------------------------------------------------------------------
+
+def test_segmented_wide_path_stats_exact(tmp_path):
+    # Full-width source chunks + a buffer budget of two 16x16 float64 tile
+    # columns force the column-segmented path, where a source chunk is
+    # computed once per segment it spans. The observer is fed the segment
+    # buffers (which partition the raster), so the statistics must stay
+    # exact even though chunk executions exceed the chunk count.
+    da, counter = _counting_da(_BASE, chunks=(16, 64))
+    path = str(tmp_path / "wide_3597.tif")
+    to_geotiff(da, path, tiled=True, tile_size=16,
+               streaming_buffer_bytes=4096, color_ramp="viridis")
+    assert counter["n"] > 4  # proves the segmented path actually engaged
+    _assert_aux_matches(path, _ref_stats(_BASE))
+
+
+# --------------------------------------------------------------------------
+# semantics parity with _finite_stats
+# --------------------------------------------------------------------------
+
+def test_streaming_nodata_sentinel_excluded(tmp_path):
+    import dask.array as dsa
+
+    vals = _BASE.copy()
+    vals[10:20, 10:20] = -9999.0
+    da = xr.DataArray(
+        dsa.from_array(vals, chunks=(16, 16)), dims=("y", "x"),
+        coords={"y": np.arange(64.0), "x": np.arange(64.0)},
+        attrs={"crs": 4326, "nodata": -9999.0},
+    )
+    path = str(tmp_path / "nd_3597.tif")
+    to_geotiff(da, path, color_ramp="viridis")
+    _assert_aux_matches(path, _ref_stats(vals, nodata=-9999.0))
+
+
+def test_streaming_int_dtype_stats(tmp_path):
+    import dask.array as dsa
+
+    vals = _RNG.integers(-100, 500, (64, 64)).astype("int32")
+    vals[0, :] = -32768
+    da = xr.DataArray(
+        dsa.from_array(vals, chunks=(16, 16)), dims=("y", "x"),
+        coords={"y": np.arange(64.0), "x": np.arange(64.0)},
+        attrs={"crs": 4326, "nodata": -32768},
+    )
+    path = str(tmp_path / "int_3597.tif")
+    to_geotiff(da, path, color_ramp="viridis")
+    _assert_aux_matches(path, _ref_stats(vals, nodata=-32768))
+
+
+def test_all_nan_dask_writes_no_sidecars(tmp_path):
+    import dask.array as dsa
+
+    da = xr.DataArray(
+        dsa.full((32, 32), np.nan, chunks=(16, 16)), dims=("y", "x"),
+        coords={"y": np.arange(32.0), "x": np.arange(32.0)},
+        attrs={"crs": 4326},
+    )
+    path = str(tmp_path / "nan_3597.tif")
+    to_geotiff(da, path, color_ramp="viridis")
+    assert os.path.exists(path)
+    assert not os.path.exists(path + ".aux.xml")
+    assert not os.path.exists(str(tmp_path / "nan_3597.qml"))
+
+
+def test_multiband_dask_color_ramp_noop(tmp_path):
+    import dask.array as dsa
+
+    rgb = xr.DataArray(
+        dsa.zeros((3, 32, 32), chunks=(3, 16, 16), dtype="float32"),
+        dims=("band", "y", "x"),
+        coords={"band": [1, 2, 3], "y": np.arange(32.0),
+                "x": np.arange(32.0)},
+        attrs={"crs": 4326},
+    )
+    path = str(tmp_path / "rgb_3597.tif")
+    to_geotiff(rgb, path, color_ramp="viridis")
+    assert not os.path.exists(path + ".aux.xml")
+    assert not os.path.exists(str(tmp_path / "rgb_3597.qml"))
+
+
+# --------------------------------------------------------------------------
+# StreamingStats unit behaviour
+# --------------------------------------------------------------------------
+
+def test_streaming_stats_matches_finite_stats_uneven_slabs():
+    acc = StreamingStats()
+    for r0, r1 in [(0, 5), (5, 6), (6, 40), (40, 64)]:
+        acc.update(_BASE[r0:r1])
+    da = xr.DataArray(_BASE, dims=("y", "x"))
+    assert acc.result() == pytest.approx(_finite_stats(da, None), rel=1e-9)
+
+
+def test_streaming_stats_nan_nodata_treated_as_unset():
+    acc = StreamingStats(nodata=float("nan"))
+    acc.update(_BASE)
+    da = xr.DataArray(_BASE, dims=("y", "x"))
+    assert acc.result() == pytest.approx(_finite_stats(da, None), rel=1e-9)
+
+
+def test_streaming_stats_empty_returns_none():
+    acc = StreamingStats()
+    assert acc.result() is None
+    acc.update(np.full((4, 4), np.nan))
+    assert acc.result() is None
+
+
+def test_streaming_stats_constant_buffers():
+    acc = StreamingStats()
+    acc.update(np.full((4, 4), 7.0))
+    acc.update(np.full((2, 4), 7.0))
+    assert acc.result() == pytest.approx((7.0, 7.0, 7.0, 0.0), abs=1e-12)
+
+
+# --------------------------------------------------------------------------
+# dask+cupy through the CPU streaming writer (gpu=False)
+# --------------------------------------------------------------------------
+
+@requires_gpu
+def test_dask_cupy_streaming_executes_source_once(tmp_path):
+    import cupy
+    import dask.array as dsa
+
+    counter = {"n": 0}
+    lock = threading.Lock()
+
+    def _count(block):
+        if block.size:  # skip dask's zero-size meta-inference call
+            with lock:
+                counter["n"] += 1
+        return block
+
+    arr = dsa.from_array(cupy.asarray(_BASE), chunks=(16, 16)).map_blocks(
+        _count, dtype=_BASE.dtype)
+    da = xr.DataArray(
+        arr, dims=("y", "x"),
+        coords={"y": np.arange(64.0), "x": np.arange(64.0)},
+        attrs={"crs": 4326},
+    )
+    path = str(tmp_path / "dgpu_3597.tif")
+    to_geotiff(da, path, gpu=False, color_ramp="viridis")
+    assert counter["n"] == 16
+    _assert_aux_matches(path, _ref_stats(_BASE))