From ec9ae5865960d1303ae553678b76e58d37da907e Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Tue, 16 Jun 2026 18:10:49 -0700 Subject: [PATCH] Reject predictor with jpeg2000/lerc/jpeg compression (#3371) The TIFF PREDICTOR tag only applies to byte-oriented entropy codecs (deflate/lzw/zstd/lz4/packbits). jpeg/jpeg2000/lerc do their own pixel modelling and GDAL ignores the tag, so applying horizontal/floating-point differencing before these codecs wrote a file whose pixels were corrupted to every spec-compliant reader (GDAL read lerc+predictor=2 with a maxdiff of 64537; jpeg2000+predictor=2 would not open at all). normalize_predictor already received the compression tag but never used it. Gate there so every writer leg (eager strip + tiled, GPU) rejects the combination up front with a clear ValueError. Byte-oriented codecs keep honouring the predictor. --- xrspatial/geotiff/_encode.py | 49 ++++++++-- .../geotiff/tests/unit/test_predictor.py | 89 +++++++++++++++++++ 2 files changed, 129 insertions(+), 9 deletions(-) diff --git a/xrspatial/geotiff/_encode.py b/xrspatial/geotiff/_encode.py index c6c6719a6..23008f276 100644 --- a/xrspatial/geotiff/_encode.py +++ b/xrspatial/geotiff/_encode.py @@ -249,27 +249,58 @@ def _reject_disagreeing_photometric_override( # Predictor helpers # --------------------------------------------------------------------------- +# Compression codecs that do NOT honor the TIFF PREDICTOR tag. The +# horizontal / floating-point predictor rearranges pixel bytes assuming +# the downstream codec is a byte-oriented entropy coder (deflate, lzw, +# zstd, lz4, packbits) that a spec-compliant reader will run the inverse +# predictor over. JPEG, JPEG2000 and LERC do their own pixel modelling, +# so GDAL never applies PREDICTOR for them. Differencing the bytes anyway +# writes a file whose pixels are corrupted to every reader except one that +# happens to invert the predictor symmetrically (see issue #3371). +_PREDICTOR_INCOMPATIBLE_COMPRESSION = { + COMPRESSION_JPEG: 'jpeg', + COMPRESSION_JPEG2000: 'jpeg2000', + COMPRESSION_LERC: 'lerc', +} + + def normalize_predictor(predictor, dtype, compression: int) -> int: """Normalize a user-supplied predictor value to a TIFF predictor int. Accepts ``False``/``True`` (legacy) and integers ``1``/``2``/``3``. Returns ``1`` (no predictor), ``2`` (horizontal differencing), or ``3`` (floating-point predictor). + + A predictor other than 1 combined with a codec that does not honor the + PREDICTOR tag (jpeg / jpeg2000 / lerc) is rejected with ``ValueError``. """ if predictor is False or predictor == 0: - return 1 - if predictor is True or predictor == 2: - return 2 - if predictor == 1: - return 1 - if predictor == 3: + resolved = 1 + elif predictor is True or predictor == 2: + resolved = 2 + elif predictor == 1: + resolved = 1 + elif predictor == 3: if np.dtype(dtype).kind != 'f': raise ValueError( "predictor=3 (floating-point) requires float data, " f"got dtype={np.dtype(dtype)}") - return 3 - raise ValueError( - f"predictor must be False/True or 1/2/3, got {predictor!r}") + resolved = 3 + else: + raise ValueError( + f"predictor must be False/True or 1/2/3, got {predictor!r}") + + if resolved != 1 and compression in _PREDICTOR_INCOMPATIBLE_COMPRESSION: + codec = _PREDICTOR_INCOMPATIBLE_COMPRESSION[compression] + raise ValueError( + f"predictor={resolved} is not supported with " + f"compression={codec!r}. The TIFF PREDICTOR tag only applies to " + f"byte-oriented codecs (deflate, lzw, zstd, lz4, packbits); " + f"{codec!r} does its own pixel modelling and GDAL ignores the " + f"tag, so differencing the bytes corrupts the pixels for " + f"spec-compliant readers. Drop predictor= (or set predictor=1) " + f"for this codec.") + return resolved def _apply_predictor_encode(buf: np.ndarray, predictor: int, diff --git a/xrspatial/geotiff/tests/unit/test_predictor.py b/xrspatial/geotiff/tests/unit/test_predictor.py index bc1e9a037..9bc9cd11f 100644 --- a/xrspatial/geotiff/tests/unit/test_predictor.py +++ b/xrspatial/geotiff/tests/unit/test_predictor.py @@ -1234,3 +1234,92 @@ def test_normalize_predictor_table(): with pytest.raises(ValueError, match="predictor must be"): normalize_predictor(99, f32, deflate) + + +# --------------------------------------------------------------------------- +# Predictor vs. codec compatibility (issue #3371). The TIFF PREDICTOR tag +# only applies to byte-oriented entropy codecs. jpeg / jpeg2000 / lerc do +# their own pixel modelling and ignore the tag, so differencing the bytes +# first writes a file that no spec-compliant reader can recover. +# --------------------------------------------------------------------------- + + +def test_normalize_predictor_rejects_predictor_codec_mismatch(): + """predictor 2/3 + jpeg/jpeg2000/lerc raises; byte codecs are fine.""" + from xrspatial.geotiff._compression import (COMPRESSION_DEFLATE, COMPRESSION_JPEG, + COMPRESSION_JPEG2000, COMPRESSION_LERC, + COMPRESSION_LZ4, COMPRESSION_LZW, + COMPRESSION_PACKBITS, COMPRESSION_ZSTD) + f32 = np.dtype("float32") + u16 = np.dtype("uint16") + + incompatible = { + "jpeg": COMPRESSION_JPEG, + "jpeg2000": COMPRESSION_JPEG2000, + "lerc": COMPRESSION_LERC, + } + for name, tag in incompatible.items(): + with pytest.raises(ValueError, match=f"compression={name!r}"): + normalize_predictor(2, u16, tag) + with pytest.raises(ValueError, match=f"compression={name!r}"): + normalize_predictor(3, f32, tag) + + # predictor=1 (the default for these codecs) is always allowed. + for tag in incompatible.values(): + assert normalize_predictor(1, u16, tag) == 1 + assert normalize_predictor(False, u16, tag) == 1 + + # Byte-oriented codecs keep honouring the predictor. + for tag in (COMPRESSION_DEFLATE, COMPRESSION_LZW, COMPRESSION_ZSTD, + COMPRESSION_LZ4, COMPRESSION_PACKBITS): + assert normalize_predictor(2, u16, tag) == 2 + + +@pytest.mark.parametrize("codec", ["jpeg2000", "j2k", "lerc"]) +def test_to_geotiff_rejects_predictor2_with_codec(tmp_path, codec): + """Eager writer refuses predictor=2 with jpeg2000/lerc (issue #3371).""" + arr = (np.arange(64 * 64, dtype=np.uint16).reshape(64, 64) % 1000) + da = _da_xy(arr) + path = tmp_path / f"pred2_{codec}_3371.tif" + with pytest.raises(ValueError, match="PREDICTOR"): + to_geotiff(da, str(path), compression=codec, predictor=2, + allow_experimental_codecs=True) + + +@pytest.mark.parametrize("codec", ["jpeg2000", "lerc"]) +def test_to_geotiff_rejects_predictor3_with_codec(tmp_path, codec): + """Eager writer refuses predictor=3 (float) with jpeg2000/lerc.""" + arr = _smooth_float((64, 64), np.float32) + da = _da_xy(arr) + path = tmp_path / f"pred3_{codec}_3371.tif" + with pytest.raises(ValueError, match="PREDICTOR"): + to_geotiff(da, str(path), compression=codec, predictor=3, + allow_experimental_codecs=True) + + +@pytest.mark.parametrize("codec", ["deflate", "lzw", "zstd"]) +def test_to_geotiff_predictor2_byte_codec_round_trips(tmp_path, codec): + """The gate is codec-scoped: byte-oriented codecs still round-trip.""" + arr = (np.arange(64 * 64, dtype=np.uint16).reshape(64, 64) % 1000) + da = _da_xy(arr) + path = tmp_path / f"pred2_{codec}_3371.tif" + to_geotiff(da, str(path), compression=codec, predictor=2) + rt, _ = read_to_array(str(path)) + np.testing.assert_array_equal(rt, arr) + + +@_gpu_only +@pytest.mark.parametrize("codec", ["jpeg2000", "lerc"]) +def test_gpu_writer_rejects_predictor2_with_codec(tmp_path, codec): + """GPU writer shares normalize_predictor, so the gate fires there too.""" + import cupy + + from xrspatial.geotiff import _write_geotiff_gpu + + arr = (np.arange(64 * 64, dtype=np.uint16).reshape(64, 64) % 1000) + da = _da_xy(arr.copy()) + da = da.copy(data=cupy.asarray(da.data)) + path = tmp_path / f"gpu_pred2_{codec}_3371.tif" + with pytest.raises(ValueError, match="PREDICTOR"): + _write_geotiff_gpu(da, str(path), compression=codec, predictor=2, + allow_experimental_codecs=True)