From 46e61560050894c070f8640f2d0f8a9e0e8cf18b Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Sat, 27 Jun 2026 09:54:52 -0700 Subject: [PATCH 1/2] from_template: guard dask grids against task-graph explosion (#3557) Supplying a fixed chunk size with a typo-level resolution builds one task per chunk, so the block count grows with the cell count and the graph can bog down the client during construction even though no data is computed. Guard on the estimated block count (via dask's own normalize_chunks, so 'auto'/int/tuple/dict all resolve to the real grid) rather than cell count, and raise when it exceeds 1,000,000 chunks. A legitimate large grid with sensible chunks (new_england@10m ~26k) or 'auto' sizing still passes; the runaway fixed-chunk case raises before da.full builds the graph. Claude-Session: https://claude.ai/code/session_01YUNyNJkagqnTT3qEc7bt81 --- xrspatial/templates.py | 46 ++++++++++++++++++++++++++++--- xrspatial/tests/test_templates.py | 43 +++++++++++++++++++++++++++++ 2 files changed, 85 insertions(+), 4 deletions(-) diff --git a/xrspatial/templates.py b/xrspatial/templates.py index a380351c2..16c4c6eb2 100644 --- a/xrspatial/templates.py +++ b/xrspatial/templates.py @@ -28,6 +28,14 @@ # there -- a large chunked study area is a legitimate workflow. _MAX_CELLS = 500_000_000 +# A lazy dask grid never materializes, but its task graph does: with a fixed +# chunk size the block count grows with the cell count, and a typo-level +# resolution can build a graph large enough to bog down the client during +# construction. Guard on the estimated block count (not cells) so a legitimate +# large grid with sensible chunks still passes -- the new_england@10m repro is +# ~12k blocks, so 1e6 leaves wide headroom while catching the runaway case. +_MAX_CHUNKS = 1_000_000 + def _resolve(name): """Resolve a name to a spec dict. @@ -180,6 +188,23 @@ def _make_data(shape, fill, backend, chunks): ) +def _estimate_n_chunks(shape, chunks): + """Number of blocks ``da.full`` would build for ``shape`` and ``chunks``. + + Uses dask's own ``normalize_chunks`` so every ``chunks`` form -- ``'auto'`` + (where dask picks the block shape from the chunk-size config), an int, a + tuple, or a dict -- resolves to the same block grid the real array would + have. dtype is ``float32`` to match the grid ``_make_data`` builds. The call + only sizes the block tuples, so it stays cheap even when the count is huge. + """ + from dask.array.core import normalize_chunks + normalized = normalize_chunks(chunks, shape=shape, dtype="float32") + n = 1 + for dim in normalized: + n *= len(dim) + return n + + def _cf_crs_attrs(crs): """CF Conventions grid-mapping attributes for an EPSG code. @@ -316,8 +341,9 @@ def from_template(name: str, longer applies. When omitted, the dask backends use ``'auto'``. The data stays lazy, but a very fine resolution still builds one task per chunk, so an extreme shape with small chunks can make a task graph - large enough to bog down the client; coarsen the resolution or use - larger chunks if that happens. + large enough to bog down the client. To prevent that, a grid that would + split into more than 1,000,000 chunks raises ``ValueError``; coarsen the + resolution or use larger chunks. Returns ------- @@ -410,6 +436,19 @@ def from_template(name: str, f"Use a coarser resolution, or pass chunks=... for a lazy dask grid." ) + effective_chunks = "auto" if chunks is None else chunks + if is_dask: + n_chunks = _estimate_n_chunks((height, width), effective_chunks) + if n_chunks > _MAX_CHUNKS: + raise ValueError( + f"resolution {(res_x, res_y)} produces a {height} x {width} " + f"grid that splits into {n_chunks:,} chunks with " + f"chunks={effective_chunks!r}, exceeding the " + f"{_MAX_CHUNKS:,}-chunk limit. A task graph this large can bog " + f"down the client even though no data is computed. Use a " + f"coarser resolution or larger chunks." + ) + # Honor the requested resolution exactly: anchor the lower-left corner and # nudge the far edges to an exact multiple of the cell size, so res comes # back as (res_x, res_y) instead of drifting when the bbox extent isn't a @@ -419,8 +458,7 @@ def from_template(name: str, top = bottom + height * res_y ys, xs = _make_output_coords((left, bottom, right, top), (height, width)) - data = _make_data((height, width), fill, backend, - "auto" if chunks is None else chunks) + data = _make_data((height, width), fill, backend, effective_chunks) attrs = {"res": (res_x, res_y), "crs": crs} attrs.update(_cf_crs_attrs(crs)) diff --git a/xrspatial/tests/test_templates.py b/xrspatial/tests/test_templates.py index 1c72be3d4..36bfb36c3 100644 --- a/xrspatial/tests/test_templates.py +++ b/xrspatial/tests/test_templates.py @@ -413,6 +413,49 @@ def test_chunks_promotes_cupy_to_dask_cupy(): assert isinstance(block, cupy.ndarray) +@dask_array_available +def test_over_fine_dask_chunk_count_raises(): + # The dask path skips the cell cap, but a typo-level resolution with a fixed + # chunk size builds a runaway task graph. This is the issue #3557 repro: + # conus at 1 m / chunks=512 is ~7e13 cells / 512^2 ~= 7e7 chunks. The guard + # must raise from the estimate, BEFORE da.full builds the graph. + with pytest.raises(ValueError, match="chunk"): + from_template("conus", resolution=1, chunks=512) + + +@dask_array_available +def test_auto_chunks_exempt_from_chunk_cap(): + import dask.array as da + # 'auto' sizes blocks to the dask chunk-size config (~128 MB), so even a very + # fine resolution stays well under the chunk cap and builds fine. The guard + # keys on the real block count, so the auto path is not falsely tripped. + agg = from_template("conus", resolution=1, chunks="auto") + assert isinstance(agg.data, da.Array) + from xrspatial.templates import _MAX_CHUNKS + assert agg.data.npartitions <= _MAX_CHUNKS + + +@dask_array_available +def test_chunk_count_estimate_matches_dask(): + import dask.array as da + from xrspatial.templates import _estimate_n_chunks + # The estimate must agree with the block count dask actually builds, across + # chunk forms, so the guard fires on the real graph size. + for chunks in (256, 512, "auto", (300, 400)): + built = da.full((4000, 5000), np.nan, dtype="float32", chunks=chunks) + assert _estimate_n_chunks((4000, 5000), chunks) == built.npartitions + + +@dask_array_available +def test_legit_large_dask_grid_passes(): + import dask.array as da + # The headroom case: new_england @ 10 m / chunks=512 is past the eager cell + # cap but only ~26k chunks, far below the 1e6 chunk cap, so it must build. + agg = from_template("new_england", resolution=10, chunks=512) + assert isinstance(agg.data, da.Array) + assert agg.data.npartitions < 1_000_000 + + def test_single_pixel_grid(): # a resolution coarser than the whole study-area box clamps width and height # to the max(1, ...) floor, giving a 1x1 grid that still obeys the contract. From 980e76b8663c3c3c514b707277af22c0283e1bd9 Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Sat, 27 Jun 2026 09:57:35 -0700 Subject: [PATCH 2/2] Address review nits: fix block-count comment, tighten test matches (#3557) - Correct the _MAX_CHUNKS comment: new_england@10m is ~26k blocks, not ~12k (the test comment and PR body already said ~26k). - Tighten the runaway-guard tests to match "chunk limit" so they cannot pass on the eager cell-cap message, which also mentions chunks. - Add an explicit dask+numpy (non-promotion) raise test. Claude-Session: https://claude.ai/code/session_01YUNyNJkagqnTT3qEc7bt81 --- xrspatial/templates.py | 2 +- xrspatial/tests/test_templates.py | 14 ++++++++++++-- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/xrspatial/templates.py b/xrspatial/templates.py index 16c4c6eb2..bcd9f9b03 100644 --- a/xrspatial/templates.py +++ b/xrspatial/templates.py @@ -33,7 +33,7 @@ # resolution can build a graph large enough to bog down the client during # construction. Guard on the estimated block count (not cells) so a legitimate # large grid with sensible chunks still passes -- the new_england@10m repro is -# ~12k blocks, so 1e6 leaves wide headroom while catching the runaway case. +# ~26k blocks, so 1e6 leaves wide headroom while catching the runaway case. _MAX_CHUNKS = 1_000_000 diff --git a/xrspatial/tests/test_templates.py b/xrspatial/tests/test_templates.py index 36bfb36c3..110a87f14 100644 --- a/xrspatial/tests/test_templates.py +++ b/xrspatial/tests/test_templates.py @@ -418,11 +418,21 @@ def test_over_fine_dask_chunk_count_raises(): # The dask path skips the cell cap, but a typo-level resolution with a fixed # chunk size builds a runaway task graph. This is the issue #3557 repro: # conus at 1 m / chunks=512 is ~7e13 cells / 512^2 ~= 7e7 chunks. The guard - # must raise from the estimate, BEFORE da.full builds the graph. - with pytest.raises(ValueError, match="chunk"): + # must raise from the estimate, BEFORE da.full builds the graph. Match the + # chunk-count cap text specifically so this can't pass on the eager + # cell-cap message (which also mentions "chunks"). + with pytest.raises(ValueError, match="chunk limit"): from_template("conus", resolution=1, chunks=512) +@dask_array_available +def test_explicit_dask_backend_chunk_count_raises(): + # Same guard via the non-promotion path: an explicit dask backend with a + # fixed small chunk size on a typo-fine resolution must raise too. + with pytest.raises(ValueError, match="chunk limit"): + from_template("conus", resolution=1, backend="dask+numpy", chunks=512) + + @dask_array_available def test_auto_chunks_exempt_from_chunk_cap(): import dask.array as da