Skip to content

Commit 63c3f2e

Browse files
authored
Refuse pack=True values the packed integer dtype cannot hold (#3272)
* Refuse pack=True values the packed integer dtype cannot hold (#3260) The integer restore in _pack cast unguarded: finite values whose packed form fell outside the dtype range wrapped in the astype (40000 -> -25536 in int16) and +/-Inf cast to a platform-defined integer, all silently. A range/finiteness guard now runs after the round and before the cast, raising ValueError. numpy and cupy buffers raise at call time; dask backings map the guard into the graph so it raises from the write's single compute, same shape as the #3235 NaN guard. The exclusive upper bound is iinfo.max + 1 (a power of two, exact in float64) so 64-bit dtypes reject exactly-2**63 instead of letting it wrap. Also routes the eager NaN-no-sentinel scan through _pack_guard_no_nan: the old bool(out.isnull().any()) materialised via np.asarray and raised TypeError on cupy-backed eager input before the scan could answer. * Record accuracy-sweep pass 27 for geotiff (#3260) * Address review: assert on the packed output file, drop redundant attrs copy (#3260)
1 parent 3b74a15 commit 63c3f2e

4 files changed

Lines changed: 273 additions & 10 deletions

File tree

.claude/sweep-accuracy-state.csv

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

xrspatial/geotiff/_attrs.py

Lines changed: 72 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1730,6 +1730,44 @@ def _pack_fill_nan(chunk, fill):
17301730
return out
17311731

17321732

1733+
def _pack_guard_int_range(chunk, tgt_name, lo, hi_excl):
1734+
"""Per-chunk range / finiteness guard for ``_pack``'s integer restore.
1735+
1736+
Runs after the round and before the ``astype``. Raises ``ValueError``
1737+
when the chunk holds non-finite values or values outside the target
1738+
integer dtype's range -- the cast would silently wrap them into
1739+
valid-looking integers (issue #3260). Returns the chunk unchanged
1740+
when clean.
1741+
1742+
``lo`` is ``iinfo.min`` and ``hi_excl`` is ``iinfo.max + 1``, both as
1743+
floats. The exclusive upper bound matters for the 64-bit dtypes:
1744+
``float(iinfo.max)`` rounds up to ``2**63`` / ``2**64``, so a
1745+
``chunk > float(iinfo.max)`` test would pass a value that still wraps
1746+
in the cast. ``iinfo.max + 1`` is a power of two for every integer
1747+
dtype and therefore exact in float64. ``iinfo.min`` is likewise a
1748+
power of two (or zero) and exact.
1749+
1750+
Mapped over dask blocks so the scan fuses with the write's single
1751+
compute (same shape as :func:`_pack_guard_no_nan`); numpy and cupy
1752+
buffers are checked eagerly at ``_pack`` call time. ``np.isfinite``
1753+
and the comparisons dispatch on the array type, so the one function
1754+
covers all four backends.
1755+
"""
1756+
if chunk.dtype.kind == 'f':
1757+
bad = ~np.isfinite(chunk)
1758+
bad |= chunk < lo
1759+
bad |= chunk >= hi_excl
1760+
if bool(bad.any()):
1761+
raise ValueError(
1762+
f"pack=True: data contains values that the packed "
1763+
f"integer dtype {tgt_name} cannot represent (valid "
1764+
f"range {int(lo)}..{int(hi_excl) - 1} after the "
1765+
f"scale/offset reversal) or that are not finite; the "
1766+
f"cast would silently wrap them. Clip the data to the "
1767+
f"packed range or rescale before writing.")
1768+
return chunk
1769+
1770+
17331771
def _pack_guard_no_nan(chunk, tgt_name):
17341772
"""Per-chunk NaN guard for ``_pack``'s no-sentinel integer restore.
17351773
@@ -1785,12 +1823,16 @@ def _pack(data, nodata=None):
17851823
17861824
Raises ``ValueError`` when ``data`` carries no unpack state to
17871825
reverse, when an integer dtype must be restored but NaN pixels are
1788-
present with no declared sentinel to fill them, or when the ``nodata``
1826+
present with no declared sentinel to fill them, when the ``nodata``
17891827
kwarg cannot be represented in the packed integer dtype (out of range
17901828
or fractional) -- the cast would wrap or round the fill value away
1791-
from what the GDAL_NODATA tag declares.
1829+
from what the GDAL_NODATA tag declares -- or when any pixel's packed
1830+
value is non-finite or falls outside the packed integer dtype's
1831+
range, where the cast would silently wrap it into a valid-looking
1832+
integer (issue #3260).
17921833
1793-
Error timing for the NaN-with-no-sentinel case depends on the
1834+
Error timing for the NaN-with-no-sentinel and packed-range cases
1835+
depends on the
17941836
backing: numpy-backed data raises here, at call time. dask-backed
17951837
data defers the scan into the graph (one per-chunk check fused with
17961838
the write's single compute) so the upstream graph is not executed a
@@ -1887,14 +1929,36 @@ def _pack(data, nodata=None):
18871929
out = out.copy(data=out.data.map_blocks(
18881930
_pack_guard_no_nan, tgt.name,
18891931
dtype=out.dtype, meta=out.data._meta))
1890-
elif bool(out.isnull().any()):
1891-
raise ValueError(
1892-
f"pack=True: cannot restore integer dtype {tgt.name}: "
1893-
"NaN pixels are present but no nodata sentinel is "
1894-
"declared to fill them.")
1932+
else:
1933+
# Buffer-level scan, not ``bool(out.isnull().any())``: the
1934+
# xarray reduction materialises through ``np.asarray`` and
1935+
# raised TypeError on cupy-backed eager input, crashing
1936+
# every no-sentinel integer pack on the gpu backend before
1937+
# the scan could answer (#3260). ``_pack_guard_no_nan``
1938+
# dispatches ``np.isnan`` on the array type, so numpy and
1939+
# cupy take the same path here.
1940+
_pack_guard_no_nan(out.data, tgt.name)
18951941

18961942
if tgt.kind in ('i', 'u'):
18971943
out = out.round()
1944+
# Out-of-range and non-finite packed values would silently wrap
1945+
# in the astype below, exactly the corruption the NaN guard
1946+
# above exists to prevent: a finite value whose packed form
1947+
# exceeds the dtype range wraps (40000 -> -25536 in int16), and
1948+
# +/-Inf casts to a platform-defined integer (issue #3260).
1949+
# Runs after the round so a value like 32767.4 that rounds back
1950+
# into range is not rejected. Same eager/lazy split as the NaN
1951+
# guard: numpy / cupy raise here, dask raises from the write's
1952+
# single compute.
1953+
info = np.iinfo(tgt)
1954+
lo = float(info.min)
1955+
hi_excl = float(int(info.max) + 1)
1956+
if hasattr(out.data, 'dask'):
1957+
out = out.copy(data=out.data.map_blocks(
1958+
_pack_guard_int_range, tgt.name, lo, hi_excl,
1959+
dtype=out.dtype, meta=out.data._meta))
1960+
else:
1961+
_pack_guard_int_range(out.data, tgt.name, lo, hi_excl)
18981962

18991963
out = out.astype(tgt)
19001964

xrspatial/geotiff/_writers/eager.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -390,7 +390,11 @@ def to_geotiff(data: xr.DataArray | np.ndarray,
390390
integer dtype must be restored, the ``ValueError`` is raised at
391391
call time for numpy-backed data; for dask-backed data the check
392392
runs per chunk inside the write's compute (issue #3235), so the
393-
error surfaces during the write. The write itself stays atomic
393+
error surfaces during the write. The same timing applies to
394+
pixels whose packed value is non-finite or falls outside the
395+
packed integer dtype's range: the cast would silently wrap
396+
them, so the write is refused with ``ValueError`` instead
397+
(issue #3260). The write itself stays atomic
394398
(temp file plus rename), so no partial output is left at the
395399
destination path.
396400
Lines changed: 195 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,195 @@
1+
"""``to_geotiff(pack=True)`` refuses packed values the dtype cannot hold (#3260).
2+
3+
``_pack`` reverses the unpack transform and casts to the integer dtype
4+
recorded on ``attrs['mask_and_scale_dtype']``. Before #3260 the cast ran
5+
unguarded: a finite value whose packed form fell outside the dtype range
6+
wrapped (40000 -> -25536 in int16) and +/-Inf cast to a platform-defined
7+
integer, both silently. The guard raises ``ValueError`` instead -- at
8+
call time for numpy / cupy buffers, from the write's single compute for
9+
dask backings (same timing as the #3235 NaN guard).
10+
"""
11+
import numpy as np
12+
import pytest
13+
import xarray as xr
14+
15+
from xrspatial.geotiff import open_geotiff, to_geotiff
16+
from xrspatial.geotiff._attrs import _pack_guard_int_range
17+
18+
from .._helpers.markers import requires_gpu
19+
20+
21+
def _write_scaled_int16(path, *, nodata=None):
22+
"""int16 source with SCALE=0.1 so unpacked values are data * 0.1."""
23+
data = np.array([[100, 200], [300, 32000]], dtype=np.int16)
24+
attrs = {
25+
"crs": 4326,
26+
"gdal_metadata": {"SCALE": "0.1", "OFFSET": "0.0"},
27+
}
28+
if nodata is not None:
29+
attrs["nodata"] = nodata
30+
da = xr.DataArray(
31+
data,
32+
dims=("y", "x"),
33+
coords={"y": [1.5, 0.5], "x": [0.5, 1.5]},
34+
attrs=attrs,
35+
)
36+
to_geotiff(da, str(path), nodata=nodata)
37+
return str(path)
38+
39+
40+
def _unpacked(path, *, gpu=False):
41+
kwargs = {"unpack": True}
42+
if gpu:
43+
kwargs["gpu"] = True
44+
return open_geotiff(path, **kwargs).copy()
45+
46+
47+
# ---------------------------------------------------------------------------
48+
# Finite overflow wraps in the cast: refused on every backend
49+
# ---------------------------------------------------------------------------
50+
51+
52+
@pytest.mark.parametrize("chunks,gpu", [
53+
pytest.param(None, False, id="numpy"),
54+
pytest.param(1, False, id="dask"),
55+
pytest.param(None, True, marks=requires_gpu, id="gpu"),
56+
pytest.param(1, True, marks=requires_gpu, id="dask-gpu"),
57+
])
58+
def test_pack_rejects_finite_overflow(tmp_path, chunks, gpu):
59+
src = _write_scaled_int16(tmp_path / "src_overflow_3260.tif")
60+
mod = _unpacked(src, gpu=gpu)
61+
# Packs to 40000 > int16 max 32767; pre-#3260 this wrapped to -25536.
62+
mod.data[0, 0] = 4000.0
63+
if chunks is not None:
64+
mod = mod.chunk({"y": chunks})
65+
out = str(tmp_path / "out_overflow_3260.tif")
66+
with pytest.raises(ValueError, match="cannot represent"):
67+
to_geotiff(mod, out, pack=True)
68+
69+
70+
@pytest.mark.parametrize("chunks", [None, 1], ids=["numpy", "dask"])
71+
def test_pack_rejects_underflow_unsigned(tmp_path, chunks):
72+
"""A negative packed value must not wrap into the top of a uint range."""
73+
data = np.array([[10, 20], [30, 40]], dtype=np.uint16)
74+
da = xr.DataArray(
75+
data,
76+
dims=("y", "x"),
77+
coords={"y": [1.5, 0.5], "x": [0.5, 1.5]},
78+
attrs={"crs": 4326,
79+
"gdal_metadata": {"SCALE": "0.5", "OFFSET": "0.0"}},
80+
)
81+
src = str(tmp_path / "src_u16_underflow_3260.tif")
82+
to_geotiff(da, src)
83+
84+
mod = _unpacked(src)
85+
mod.data[0, 0] = -1.0 # packs to -2, below uint16 min
86+
if chunks is not None:
87+
mod = mod.chunk({"y": chunks})
88+
out = str(tmp_path / "out_u16_underflow_3260.tif")
89+
with pytest.raises(ValueError, match="cannot represent"):
90+
to_geotiff(mod, out, pack=True)
91+
92+
93+
# ---------------------------------------------------------------------------
94+
# +/-Inf passes the NaN fill / NaN guard but must not reach the cast
95+
# ---------------------------------------------------------------------------
96+
97+
98+
@pytest.mark.parametrize("chunks", [None, 1], ids=["numpy", "dask"])
99+
@pytest.mark.parametrize("value", [np.inf, -np.inf], ids=["inf", "neg-inf"])
100+
def test_pack_rejects_inf(tmp_path, chunks, value):
101+
# A declared sentinel routes Inf past the NaN fill (isnan only), so
102+
# this leg pins the path where Inf used to reach the cast directly.
103+
src = _write_scaled_int16(tmp_path / "src_inf_3260.tif", nodata=-32768)
104+
mod = _unpacked(src)
105+
mod.data[0, 0] = value
106+
if chunks is not None:
107+
mod = mod.chunk({"y": chunks})
108+
out = str(tmp_path / "out_inf_3260.tif")
109+
with pytest.raises(ValueError, match="not finite|cannot represent"):
110+
to_geotiff(mod, out, pack=True)
111+
112+
113+
# ---------------------------------------------------------------------------
114+
# No false positives: boundary and round-back-into-range values pass
115+
# ---------------------------------------------------------------------------
116+
117+
118+
@pytest.mark.parametrize("chunks", [None, 1], ids=["numpy", "dask"])
119+
def test_pack_accepts_full_dtype_range(tmp_path, chunks):
120+
"""Exact iinfo.min / iinfo.max packed values are representable."""
121+
data = np.array([[-32768, 0], [100, 32767]], dtype=np.int16)
122+
da = xr.DataArray(
123+
data,
124+
dims=("y", "x"),
125+
coords={"y": [1.5, 0.5], "x": [0.5, 1.5]},
126+
attrs={"crs": 4326,
127+
"gdal_metadata": {"SCALE": "0.1", "OFFSET": "0.0"}},
128+
)
129+
src = str(tmp_path / "src_bounds_3260.tif")
130+
to_geotiff(da, src)
131+
132+
mod = _unpacked(src)
133+
if chunks is not None:
134+
mod = mod.chunk({"y": chunks})
135+
out = str(tmp_path / "out_bounds_3260.tif")
136+
to_geotiff(mod, out, pack=True)
137+
138+
back = open_geotiff(out)
139+
assert str(back.dtype) == "int16"
140+
np.testing.assert_array_equal(np.asarray(back.data), data)
141+
142+
143+
def test_pack_accepts_value_that_rounds_back_into_range(tmp_path):
144+
"""The guard runs after the round: 3276.74 packs to 32767.4 which
145+
rounds to 32767 and fits."""
146+
src = _write_scaled_int16(tmp_path / "src_round_3260.tif")
147+
mod = _unpacked(src)
148+
mod.data[0, 0] = 3276.74
149+
out = str(tmp_path / "out_round_3260.tif")
150+
to_geotiff(mod, out, pack=True)
151+
back = open_geotiff(out)
152+
assert np.asarray(back.data)[0, 0] == 32767
153+
154+
155+
def test_pack_float_target_not_range_guarded(tmp_path):
156+
"""Float packed dtypes have no wrap problem; large values pass."""
157+
data = np.array([[1.0, 2.0], [3.0, -9999.0]], dtype=np.float32)
158+
da = xr.DataArray(
159+
data,
160+
dims=("y", "x"),
161+
coords={"y": [1.5, 0.5], "x": [0.5, 1.5]},
162+
attrs={"crs": 4326, "nodata": -9999.0,
163+
"gdal_metadata": {"SCALE": "2.0", "OFFSET": "0.0"}},
164+
)
165+
src = str(tmp_path / "src_float_3260.tif")
166+
to_geotiff(da, src, nodata=-9999.0)
167+
168+
mod = _unpacked(src)
169+
mod.data[0, 0] = 1e30
170+
out = str(tmp_path / "out_float_3260.tif")
171+
to_geotiff(mod, out, pack=True)
172+
back = open_geotiff(out)
173+
assert str(back.dtype) == "float32"
174+
# Packed value is 1e30 / SCALE = 5e29, well inside float32 range.
175+
assert np.asarray(back.data)[0, 0] == np.float32(5e29)
176+
177+
178+
# ---------------------------------------------------------------------------
179+
# Unit: the 64-bit exclusive upper bound is exact
180+
# ---------------------------------------------------------------------------
181+
182+
183+
def test_guard_rejects_two_pow_63_for_int64():
184+
"""float64 cannot hold INT64_MAX; float(iinfo.max) rounds up to 2**63.
185+
The guard's exclusive bound must reject exactly-2**63, which an
186+
inclusive ``> float(iinfo.max)`` test would let through to wrap."""
187+
info = np.iinfo(np.int64)
188+
chunk = np.array([float(2 ** 63)])
189+
with pytest.raises(ValueError, match="cannot represent"):
190+
_pack_guard_int_range(
191+
chunk, "int64", float(info.min), float(int(info.max) + 1))
192+
# One ULP below 2**63 is representable in int64 and passes.
193+
ok = np.array([np.nextafter(float(2 ** 63), 0.0)])
194+
_pack_guard_int_range(
195+
ok, "int64", float(info.min), float(int(info.max) + 1))

0 commit comments

Comments
 (0)