From 166324aeab5cbb82a49bfae9b892d0eecde07965 Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Thu, 12 Feb 2026 13:20:08 -0500 Subject: [PATCH 1/2] bf(test): add timeout to test_nwb2asset_remote_asset to prevent CI hang When dandi-cli tests are run from dandi-archive CI via `pytest --pyargs dandi`, the rootdir is dandi-archive, so dandi-cli's tox.ini [pytest] config (which has --timeout=300) is NOT read. This means the test has no timeout protection. With dandischema 0.12.0, the test no longer fails quickly (the model validation path changed), so it now reaches the fsspec HTTP read which can hang indefinitely. The xfail marker alone doesn't help -- xfail only catches exceptions, not hangs. Adding @pytest.mark.timeout(120) ensures the test is killed after 2 minutes regardless of which project runs it. The xfail marker then catches the timeout as an expected failure. Fixes: https://github.com/dandi/dandi-cli/issues/1762 Co-Authored-By: Claude Opus 4.6 --- dandi/tests/test_metadata.py | 1 + 1 file changed, 1 insertion(+) diff --git a/dandi/tests/test_metadata.py b/dandi/tests/test_metadata.py index 1170e67af..1d0b8697e 100644 --- a/dandi/tests/test_metadata.py +++ b/dandi/tests/test_metadata.py @@ -1145,6 +1145,7 @@ def test_nwb2asset(simple2_nwb: Path) -> None: ) +@pytest.mark.timeout(120) @pytest.mark.xfail(reason="https://github.com/dandi/dandi-cli/issues/1450") def test_nwb2asset_remote_asset(nwb_dandiset: SampleDandiset) -> None: pytest.importorskip("fsspec") From a6109befcc353d4f47f115c91c06e8cdf93b088c Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Thu, 12 Feb 2026 13:46:10 -0500 Subject: [PATCH 2/2] bf: pass aiohttp timeouts to fsspec in RemoteReadableAsset.open() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The dandi-archive CLI integration tests started hanging for 6 hours on November 24, 2025 (dandi/dandi-cli#1762). 
Investigation of tinuous CI logs showed: - Nov 21 (last success): test_nwb2asset_remote_asset XFAIL'd in 0.4s - Nov 24 (first hang): same test hung for 6 hours until GitHub killed it - ALL test runs on Nov 24-25 hung, across unrelated PRs The code flow that hangs is: nwb2asset() → get_metadata() → _get_pynwb_metadata() → open_readable() → RemoteReadableAsset.open() → fsspec.open(url).open() → aiohttp HTTP read from minio in Docker → fsspec sync() blocks in threading.Event.wait() The key environmental change between Nov 21 and Nov 24 was dandi-cli PR #1744 updating dandischema from <0.12.0 to ~=0.12.0. With dandischema 0.11.x, the test hit a quick model validation mismatch (completing as XFAIL in 0.4s before reaching the fsspec read). With dandischema 0.12.0 (vendor-configurable models, schema 0.7.0), that mismatch no longer occurs, so the test now proceeds to the actual fsspec HTTP read — which hangs. The hang itself is a known interaction between h5py, fsspec, and GC: - h5py holds a global lock while reading from Python file objects - fsspec's sync() runs async aiohttp coroutines on a background thread and blocks the calling thread in threading.Event.wait() - Without socket-level timeouts, aiohttp blocks forever on stalled connections (aio-libs/aiohttp#11740) - GC running during this window can deadlock with h5py's lock (h5py/h5py#2019) The fix: pass explicit ClientTimeout to aiohttp via fsspec's client_kwargs so that stalled connections raise TimeoutError instead of blocking indefinitely. Additionally, the dandi-archive CI never had a pytest --timeout because dandi-cli's tox.ini [pytest] addopts (--timeout=300) are not read when pytest runs from the dandi-archive rootdir via `pytest --pyargs dandi`. 
References: - https://github.com/fsspec/filesystem_spec/issues/1666 - https://github.com/h5py/h5py/issues/2019 - https://github.com/aio-libs/aiohttp/issues/11740 - https://github.com/dandi/dandi-cli/issues/1762 - https://github.com/dandi/dandi-cli/issues/1450 Co-Authored-By: Claude Opus 4.6 --- dandi/misctypes.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/dandi/misctypes.py b/dandi/misctypes.py index 9ed2dcf07..b50cffc42 100644 --- a/dandi/misctypes.py +++ b/dandi/misctypes.py @@ -345,10 +345,27 @@ def open(self) -> IO[bytes]: # Optional dependency: import fsspec + from aiohttp import ClientTimeout + # We need to call open() on the return value of fsspec.open() because # otherwise the filehandle will only be opened when used to enter a # context manager. - return cast(IO[bytes], fsspec.open(self.url, mode="rb").open()) + # + # Pass explicit timeouts to aiohttp to prevent indefinite hangs in + # fsspec's sync() wrapper. Without these, a stalled connection to S3 + # (or minio in tests) causes fsspec's background IO thread to block + # forever, which in turn blocks the calling thread in + # threading.Event.wait() — see https://github.com/fsspec/filesystem_spec/issues/1666 + return cast( + IO[bytes], + fsspec.open( + self.url, + mode="rb", + client_kwargs={ + "timeout": ClientTimeout(total=120, sock_read=60, sock_connect=30) + }, + ).open(), + ) def get_size(self) -> int: return self.size