Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 33 additions & 9 deletions xrspatial/templates.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,10 @@
_PRESERVE_OPTIONS = ("area", "shape")

# Guard against a stray fine resolution allocating an enormous array. Applied to
# every backend, including dask: a lazy grid this large is almost always a typo,
# and the uniform cap keeps the error identical across backends.
# the eager backends (numpy, cupy) only: those materialize the whole grid up
# front, so a huge shape is an immediate out-of-memory risk. A dask grid is
# lazy (da.full builds a chunk graph, never the array), so the cap is skipped
# there -- a large chunked study area is a legitimate workflow.
_MAX_CELLS = 500_000_000


Expand Down Expand Up @@ -254,7 +256,7 @@ def from_template(name: str,
resolution: Optional[Union[float, Tuple[float, float]]] = None,
*, preserve: Optional[str] = None, backend: str = "numpy",
fill: float = np.nan,
chunks: Union[int, str, Tuple] = "auto") -> xr.DataArray:
chunks: Optional[Union[int, str, Tuple]] = None) -> xr.DataArray:
"""Create an empty DataArray for a common study area.

The returned raster is NaN-filled and obeys the xarray-spatial array
Expand Down Expand Up @@ -301,11 +303,21 @@ def from_template(name: str,
the chosen EPSG int. Requires pyproj.
backend : str, default='numpy'
Array backend: ``'numpy'``, ``'dask+numpy'`` (alias ``'dask'``),
``'cupy'``, or ``'dask+cupy'``.
``'cupy'``, or ``'dask+cupy'``. The eager backends (``'numpy'``,
``'cupy'``) materialize the whole grid, so they are subject to a
500-million-cell cap; the dask backends build a lazy chunk graph and
are not.
fill : float, default=numpy.nan
Value the grid is filled with. The dtype is always ``float32``.
chunks : int, str, or tuple, default='auto'
Dask chunk specification; only used by the dask backends.
chunks : int, str, or tuple, optional
Dask chunk specification. Supplying it returns a lazy, chunked grid:
an eager backend is promoted to its dask variant (``'numpy'`` to
``'dask+numpy'``, ``'cupy'`` to ``'dask+cupy'``), and the cell cap no
longer applies. When omitted, the dask backends use ``'auto'``. The
data stays lazy, but a very fine resolution still builds one task per
chunk, so an extreme shape with small chunks can make a task graph
large enough to bog down the client; coarsen the resolution or use
larger chunks if that happens.

Returns
-------
Expand Down Expand Up @@ -351,6 +363,10 @@ def from_template(name: str,
32630
>>> from_template("FRA", preserve="area").attrs["crs"] # Equal Earth
8857
>>> # passing chunks returns a lazy dask grid, exempt from the cell cap
>>> agg = from_template("new_england", resolution=10, chunks=512)
>>> type(agg.data).__name__
'Array'
"""
spec = _resolve(name)
key = spec["key"]
Expand Down Expand Up @@ -379,12 +395,19 @@ def from_template(name: str,
width = max(1, int(round((right - left) / res_x)))
height = max(1, int(round((top - bottom) / res_y)))

# Supplying `chunks` signals intent for a lazy, chunked grid: promote an
# eager backend to its dask variant so the result never materializes.
backend = backend.lower()
if chunks is not None and backend in ("numpy", "cupy"):
backend = "dask+numpy" if backend == "numpy" else "dask+cupy"
is_dask = backend in ("dask", "dask+numpy", "dask+cupy")

n_cells = width * height
if n_cells > _MAX_CELLS:
if not is_dask and n_cells > _MAX_CELLS:
raise ValueError(
f"resolution {(res_x, res_y)} produces a {height} x {width} grid "
f"({n_cells:,} cells), exceeding the {_MAX_CELLS:,}-cell limit. "
f"Use a coarser resolution."
f"Use a coarser resolution, or pass chunks=... for a lazy dask grid."
)

# Honor the requested resolution exactly: anchor the lower-left corner and
Expand All @@ -396,7 +419,8 @@ def from_template(name: str,
top = bottom + height * res_y
ys, xs = _make_output_coords((left, bottom, right, top), (height, width))

data = _make_data((height, width), fill, backend, chunks)
data = _make_data((height, width), fill, backend,
"auto" if chunks is None else chunks)

attrs = {"res": (res_x, res_y), "crs": crs}
attrs.update(_cf_crs_attrs(crs))
Expand Down
46 changes: 46 additions & 0 deletions xrspatial/tests/test_templates.py
Original file line number Diff line number Diff line change
Expand Up @@ -367,6 +367,52 @@ def test_over_fine_resolution_raises():
from_template("conus", resolution=1)


# new_england at 10 m is ~6.9e9 cells, well past the eager cap, but only ~26k
# chunks at 512 -- a lazy grid that builds and indexes instantly. This is the
# repro from the issue.
_OVER_CAP = dict(name="new_england", resolution=10)


@dask_array_available
def test_chunks_promotes_eager_to_lazy_dask():
import dask.array as da
from xrspatial.templates import _MAX_CELLS
# Supplying chunks promotes the default numpy backend to dask and skips the
# cap, returning a lazy array that never materializes the full shape.
agg = from_template(_OVER_CAP["name"], resolution=_OVER_CAP["resolution"],
chunks=512)
assert isinstance(agg.data, da.Array)
assert agg.size > _MAX_CELLS
assert agg.data.chunksize == (512, 512)
assert agg.attrs["res"] == (10.0, 10.0)
# computing a single cell stays cheap and yields the NaN fill
assert np.isnan(float(agg.data[0, 0].compute()))


@dask_array_available
def test_dask_backend_skips_cell_cap_without_chunks():
import dask.array as da
from xrspatial.templates import _MAX_CELLS
# An explicit dask backend is lazy too, so the cap is skipped even when no
# chunks are passed (the dask path falls back to 'auto').
agg = from_template(_OVER_CAP["name"], resolution=_OVER_CAP["resolution"],
backend="dask+numpy")
assert isinstance(agg.data, da.Array)
assert agg.size > _MAX_CELLS


@cuda_and_cupy_available
@dask_array_available
def test_chunks_promotes_cupy_to_dask_cupy():
import cupy
import dask.array as da
agg = from_template(_OVER_CAP["name"], resolution=_OVER_CAP["resolution"],
backend="cupy", chunks=512)
assert isinstance(agg.data, da.Array)
block = agg.data.blocks[0, 0].compute()
assert isinstance(block, cupy.ndarray)


def test_single_pixel_grid():
# a resolution coarser than the whole study-area box clamps width and height
# to the max(1, ...) floor, giving a 1x1 grid that still obeys the contract.
Expand Down
Loading