From 22f160c870a00178aae6f6c0a7ffc69cd042ce8d Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Tue, 16 Jun 2026 06:39:46 -0700 Subject: [PATCH 1/5] Add free-threaded Python 3.14 (3.14t) to CI matrix (#3359) Add 3.14t to both the PR fast lane and the full push/nightly lane so the no-GIL build gets exercised. Mark the free-threaded job continue-on-error (keyed on the 't' suffix) so it reports signal while numba and the rest of the stack finish their free-threaded support, without blocking merges. --- .github/workflows/test.yml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 89030d22e..6975dd142 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -23,10 +23,15 @@ on: jobs: run: runs-on: ${{ matrix.os }} + # The free-threaded build (3.14t) is allowed to fail while numba and the + # rest of the stack finish their no-GIL support, so a broken install or a + # thread-safety failure reports signal without turning the PR check red or + # blocking a merge. Promote it to a required job once it stays green. + continue-on-error: ${{ endsWith(matrix.python, 't') }} strategy: matrix: os: ['ubuntu-latest', 'macos-latest', 'windows-latest'] - python: ${{ github.event_name == 'pull_request' && fromJson('["3.14"]') || fromJson('["3.12", "3.13", "3.14"]') }} + python: ${{ github.event_name == 'pull_request' && fromJson('["3.14", "3.14t"]') || fromJson('["3.12", "3.13", "3.14", "3.14t"]') }} env: OS: ${{ matrix.os }} PYTHON: ${{ matrix.python }} From 93571b980ed2553546e9a3bc9b4f2f5c038fc037 Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Tue, 16 Jun 2026 06:43:16 -0700 Subject: [PATCH 2/5] Cap free-threaded job runtime with timeout-minutes (#3359) Address review suggestion: a no-GIL deadlock on the 3.14t job would otherwise run to the 360-minute default. Cap the free-threaded entry at 30 minutes (keyed on the 't' suffix); other versions keep the default. --- .github/workflows/test.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 6975dd142..ae27da33c 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -28,6 +28,9 @@ jobs: # thread-safety failure reports signal without turning the PR check red or # blocking a merge. Promote it to a required job once it stays green. continue-on-error: ${{ endsWith(matrix.python, 't') }} + # Cap the free-threaded job so a no-GIL deadlock can't sit on a runner for + # the 360-minute default; other versions keep that default. + timeout-minutes: ${{ endsWith(matrix.python, 't') && 30 || 360 }} strategy: matrix: os: ['ubuntu-latest', 'macos-latest', 'windows-latest'] From 5c66215760c24cf53996ebdb8ef9c0df4a299252 Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Tue, 16 Jun 2026 07:11:47 -0700 Subject: [PATCH 3/5] Fix free-threaded data race in _CloudSource ranged reads (#3361) fsspec's MemoryFileSystem (and any backend whose open() returns a shared file object) handed every _CloudSource.read_range/read_all call the same process-global handle. Concurrent windowed reads -- dask chunk tasks plus read_ranges' own worker pool -- raced on that single cursor, corrupting tile bytes (zlib 'incorrect header check') or reading a just-closed handle ('I/O operation on closed file'). The GIL serialized the reads enough to mask it; the free-threaded 3.14t lane exposed it. Route both reads through the stateless cat_file ranged API, which returns a fresh byte slice per call with no shared cursor. cat_file matches seek+read semantics including the EOF clamp the COG header prefetch needs. Add a deterministic regression test that forces the shared-cursor interleaving with a seek barrier, so it fails on the old implementation even under the GIL, plus a live memory:// round-trip check. Surfaced by the 3.14t CI lane added in this PR (#3359). --- xrspatial/geotiff/_sources.py | 26 +++- .../test_cloud_source_concurrency_3361.py | 142 ++++++++++++++++++ 2 files changed, 163 insertions(+), 5 deletions(-) create mode 100644 xrspatial/geotiff/tests/read/test_cloud_source_concurrency_3361.py diff --git a/xrspatial/geotiff/_sources.py b/xrspatial/geotiff/_sources.py index b7c801dc3..f6101f5fe 100644 --- a/xrspatial/geotiff/_sources.py +++ b/xrspatial/geotiff/_sources.py @@ -1680,9 +1680,22 @@ def __init__(self, url: str, **storage_options): self._size = self._fs.size(self._path) def read_range(self, start: int, length: int) -> bytes: - with self._fs.open(self._path, 'rb') as f: - f.seek(start) - return f.read(length) + # Use the stateless ``cat_file`` ranged-read API rather than + # ``open()`` + ``seek()`` + ``read()``. For some backends + # ``fs.open(path, 'rb')`` hands back a shared, process-global file + # object -- notably fsspec's ``MemoryFileSystem``, where every + # open of a path returns the same stored buffer. Concurrent + # windowed reads (e.g. dask chunk tasks under the free-threaded + # interpreter, or ``read_ranges``' own worker pool) then race on + # that single handle's cursor: one thread's ``seek`` lands under + # another's ``read``, corrupting the returned bytes (surfacing as + # ``zlib.error: incorrect header check`` when the bytes feed a + # tile decoder) or reading from a handle the context manager just + # closed. ``cat_file`` returns a fresh byte slice per call with no + # shared cursor, so it is safe under true parallelism. See issue + # #3361. ``end`` is exclusive and clamps at EOF, matching + # ``read(length)``. + return self._fs.cat_file(self._path, start=start, end=start + length) def read_ranges( self, @@ -1740,8 +1753,11 @@ def read_ranges_coalesced( return split_coalesced_bytes(merged_bytes, mapping) def read_all(self) -> bytes: - with self._fs.open(self._path, 'rb') as f: - return f.read() + # ``cat_file`` with no range reads the whole object and, like + # ``read_range`` above, avoids the shared-handle seek/close race + # that ``open()`` exposes on backends such as fsspec's + # ``MemoryFileSystem`` under concurrent reads (issue #3361). + return self._fs.cat_file(self._path) @property def size(self) -> int: diff --git a/xrspatial/geotiff/tests/read/test_cloud_source_concurrency_3361.py b/xrspatial/geotiff/tests/read/test_cloud_source_concurrency_3361.py new file mode 100644 index 000000000..d3d3af3ec --- /dev/null +++ b/xrspatial/geotiff/tests/read/test_cloud_source_concurrency_3361.py @@ -0,0 +1,142 @@ +"""Regression tests for the _CloudSource shared-handle data race (#3361). + +``_CloudSource.read_range`` used to do ``fs.open(path, 'rb')`` followed by +``seek`` + ``read``. For backends whose ``open`` returns a shared, +process-global file object -- notably fsspec's ``MemoryFileSystem``, where +every open of a path hands back the same stored buffer -- concurrent +windowed reads raced on that single cursor. Under the free-threaded +interpreter (3.14t) this corrupted tile bytes (``zlib.error: incorrect +header check``) or read from a handle another thread had just closed +(``ValueError: I/O operation on closed file``). The GIL masked the race, +so the integration repro (``chunks=`` open of a ``memory://`` COG) only +fails on the free-threaded lane. + +The fix routes reads through the stateless ``cat_file`` ranged API. These +tests pin that contract deterministically (without depending on the +free-threaded interpreter) by driving ``read_range`` against a filesystem +whose ``open`` returns a shared handle and whose ``seek`` is barrier- +synchronised, so a shared-cursor implementation provably returns the wrong +bytes while the ``cat_file`` implementation does not. +""" +import threading + +import pytest + +fsspec = pytest.importorskip("fsspec") + +from xrspatial.geotiff._sources import _CloudSource # noqa: E402 + + +class _SharedHandle: + """A single buffer with one shared cursor, like an fsspec MemoryFile.""" + + def __init__(self, buf, seek_barrier=None): + self._buf = buf + self.pos = 0 + self._seek_barrier = seek_barrier + + def seek(self, pos, whence=0): + assert whence == 0 + self.pos = pos + # Force every concurrent reader to finish seeking before any of + # them reads. On a shared cursor this guarantees at least one read + # observes another thread's offset -- the exact race the fix + # removes -- making the test deterministic under the GIL. + if self._seek_barrier is not None: + self._seek_barrier.wait() + + def read(self, n=-1): + return self._buf[self.pos:self.pos + n] if n >= 0 else self._buf[self.pos:] + + def close(self): + pass + + def __enter__(self): + return self + + def __exit__(self, *exc): + return False + + +class _SharedHandleFS: + """Filesystem stub: ``open`` returns one shared handle; ``cat_file`` + is the stateless ranged read that the fix relies on.""" + + def __init__(self, buf, seek_barrier=None): + self._data = buf + self._handle = _SharedHandle(buf, seek_barrier=seek_barrier) + + def open(self, path, mode='rb'): + return self._handle + + def cat_file(self, path, start=None, end=None): + return self._data[start:end] + + def size(self, path): + return len(self._data) + + +def _cloud_source_over(fs): + src = _CloudSource.__new__(_CloudSource) + src._url = "memory://stub/x.bin" + src._fs = fs + src._path = "stub/x.bin" + src._size = len(fs._data) + return src + + +def test_cloud_source_concurrent_read_range_no_shared_cursor_3361(): + """Concurrent ``read_range`` calls must each return their own bytes + even when the backend's ``open`` exposes a shared cursor.""" + data = bytes((i % 251) for i in range(1000)) + barrier = threading.Barrier(2) + src = _cloud_source_over(_SharedHandleFS(data, seek_barrier=barrier)) + + results = {} + + def worker(start): + results[start] = src.read_range(start, 100) + + threads = [threading.Thread(target=worker, args=(s,)) for s in (0, 500)] + for t in threads: + t.start() + for t in threads: + t.join() + + # A shared-cursor implementation makes both reads land on whichever + # seek ran last, so at least one of these is wrong. ``cat_file`` keeps + # them independent. + assert results[0] == data[0:100] + assert results[500] == data[500:600] + + +def test_cloud_source_read_range_matches_read_semantics_3361(): + """``cat_file``-based ``read_range`` keeps ``seek``+``read`` semantics, + including the EOF clamp the COG header prefetch relies on.""" + data = b"ABCDE" + src = _cloud_source_over(_SharedHandleFS(data)) + assert src.read_range(0, 5) == b"ABCDE" + assert src.read_range(0, 100) == b"ABCDE" # length past EOF clamps + assert src.read_range(3, 100) == b"DE" + assert src.read_range(2, 0) == b"" + assert src.read_range(5, 10) == b"" # start at EOF + + +def test_cloud_source_read_range_real_memory_fs_3361(): + """End-to-end against a real fsspec ``memory://`` source: every range + reads correctly through the live ``MemoryFileSystem`` (whose ``open`` + really does return a shared handle).""" + import os + import uuid + + fs = fsspec.filesystem("memory") + data = bytes((i % 256) for i in range(4096)) + key = f"test3361-{os.getpid()}-{uuid.uuid4().hex[:8]}/x.bin" + fs.pipe_file(key, data) + try: + src = _CloudSource("memory://" + key) + for start, length in [(0, 16), (100, 200), (4000, 500), (4096, 8)]: + assert src.read_range(start, length) == data[start:start + length] + assert src.read_all() == data + finally: + fs.rm_file(key) From 8de98a15b9942baaef90573ffd5aabb7bb1891a6 Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Tue, 16 Jun 2026 07:24:20 -0700 Subject: [PATCH 4/5] Force PYTHON_GIL=0 on the free-threaded CI job (#3359) Set PYTHON_GIL=0 only on the 3.14t matrix entry so the tests exercise the real no-GIL path instead of letting CPython silently re-enable the GIL for a C extension that hasn't declared free-threaded support (which would hide thread-safety bugs behind a green run). Keyed on the 't' suffix with an empty fallback for the GIL builds, where PYTHON_GIL=0 is a fatal error ('Disabling the GIL is not supported by this build'). --- .github/workflows/test.yml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index ae27da33c..a9e63cdc6 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -38,6 +38,14 @@ jobs: env: OS: ${{ matrix.os }} PYTHON: ${{ matrix.python }} + # Force the GIL off on the free-threaded (3.14t) job so the tests + # exercise the real no-GIL path instead of letting CPython silently + # re-enable the GIL for a C extension that hasn't declared + # free-threaded support (which would hide thread-safety bugs behind + # a green run). Empty on the GIL builds, where it is ignored -- + # PYTHON_GIL=0 there is a fatal error ("Disabling the GIL is not + # supported by this build"). + PYTHON_GIL: ${{ endsWith(matrix.python, 't') && '0' || '' }} steps: - uses: actions/checkout@v4 - name: Setup Python From daceb5f3e6b372a34b68e232a101e689b228727f Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Tue, 16 Jun 2026 07:33:27 -0700 Subject: [PATCH 5/5] Scope PYTHON_GIL=0 to the pytest steps, not the whole job (#3359) Setting PYTHON_GIL=0 at the job level crashed the Setup Python step on macOS: actions/setup-python invokes a standard (GIL) bootstrap interpreter while installing 3.14t, and PYTHON_GIL=0 is a fatal error there ('config_read_gil: Disabling the GIL is not supported by this build'). Move PYTHON_GIL=0 onto the two pytest run steps so it only applies to the already-installed free-threaded interpreter under test, never to setup-python or pip. Still keyed on the 't' suffix with an empty fallback for the GIL builds. --- .github/workflows/test.yml | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index a9e63cdc6..e98b8cd5b 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -38,14 +38,6 @@ jobs: env: OS: ${{ matrix.os }} PYTHON: ${{ matrix.python }} - # Force the GIL off on the free-threaded (3.14t) job so the tests - # exercise the real no-GIL path instead of letting CPython silently - # re-enable the GIL for a C extension that hasn't declared - # free-threaded support (which would hide thread-safety bugs behind - # a green run). Empty on the GIL builds, where it is ignored -- - # PYTHON_GIL=0 there is a fatal error ("Disabling the GIL is not - # supported by this build"). - PYTHON_GIL: ${{ endsWith(matrix.python, 't') && '0' || '' }} steps: - uses: actions/checkout@v4 - name: Setup Python @@ -62,7 +54,18 @@ jobs: # (today: the six compression fixtures). push-to-main and the # nightly schedule run the full set with no filter. if: github.event_name == 'pull_request' + # Force the GIL off only for the free-threaded (3.14t) test run so the + # suite exercises the real no-GIL path instead of letting CPython + # silently re-enable the GIL for a C extension that hasn't declared + # free-threaded support. Scoped to the pytest step (not the job) so it + # never reaches setup-python's bootstrap interpreter, where + # PYTHON_GIL=0 is a fatal error on a GIL build. Empty (ignored) on the + # GIL versions. + env: + PYTHON_GIL: ${{ endsWith(matrix.python, 't') && '0' || '' }} run: pytest -m "not slow" - name: Run pytest (full) if: github.event_name != 'pull_request' + env: + PYTHON_GIL: ${{ endsWith(matrix.python, 't') && '0' || '' }} run: pytest