From 38d99924c063905e0f352256b5f1ad7f3800ad21 Mon Sep 17 00:00:00 2001
From: Anas
Date: Sun, 21 Jun 2026 15:42:09 -0400
Subject: [PATCH] Single-pass MemoryFileSystem.find() to avoid quadratic listing

MemoryFileSystem inherited the generic AbstractFileSystem.find(), which
walks the tree calling ls() once per directory. Each ls() scans the whole
global store, so listing a tree is O(n_dirs * n_entries).

Override find() with a single pass over the flat store, producing output
identical to the generic implementation across roots, maxdepth, withdirs
and detail.

On a 10k-file / 200-directory tree this is ~20x faster and the gain grows
with the tree size; find(), du(), expand_path() and glob() all benefit
since they delegate to find().

Add a differential test against AbstractFileSystem.find() and a
regression test asserting find() no longer calls ls() per directory.
---
 docs/source/changelog.rst                   |   7 +++
 fsspec/implementations/memory.py            | 102 +++++++++++++++++++++
 fsspec/implementations/tests/test_memory.py |  51 +++++++++++++++
 3 files changed, 160 insertions(+)

diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst
index ebdf40a09..c86dd7c7b 100644
--- a/docs/source/changelog.rst
+++ b/docs/source/changelog.rst
@@ -1,6 +1,13 @@
 Changelog
 =========
 
+Dev
+---
+
+Enhancements
+
+- Single-pass ``MemoryFileSystem.find()`` to avoid O(n_dirs * n_entries) listing; ``ls()`` previously scanned the whole store once per directory, now one pass over the flat store suffices (#2055)
+
 2026.6.0
 --------
 
diff --git a/fsspec/implementations/memory.py b/fsspec/implementations/memory.py
index f6b67bbc8..ad0b05fec 100644
--- a/fsspec/implementations/memory.py
+++ b/fsspec/implementations/memory.py
@@ -40,6 +40,108 @@ def _strip_protocol(cls, path):
         path = path.lstrip("/").rstrip("/")
         return "/" + path if path else ""
 
+    def find(self, path, maxdepth=None, withdirs=False, detail=False, **kwargs):
+        """Return every file (optionally directory) below ``path`` in one pass.
+
+        Overrides the generic ``find()`` so that the flat store is scanned once
+        instead of once per directory through ``ls()``.
+
+        Parameters
+        ----------
+        path: str
+            Search root, normalised with ``_strip_protocol``. If it names a file,
+            only that file is returned.
+        maxdepth: int or None
+            Deepest level to report, counted from ``path`` (1 means its direct
+            children only). ``None`` means no limit.
+        withdirs: bool
+            Also report directories: those implied by stored file names, those
+            listed in ``pseudo_dirs`` and, unless it is the filesystem root,
+            ``path`` itself when it is a directory.
+        detail: bool
+            Return ``{name: info}`` if True, otherwise a list of names.
+        **kwargs
+            Accepted for signature compatibility; not used by this method.
+
+        Returns
+        -------
+        list of str or dict
+            Names in sorted order, or an info dict keyed by those names.
+
+        Raises
+        ------
+        ValueError
+            If ``maxdepth`` is given and is smaller than 1.
+        """
+        # The base implementation calls ls() once per directory, and each ls()
+        # scans the whole (global) store, giving O(n_dirs * n_entries) behaviour
+        # for a tree. Since the store is a flat mapping of every path, the same
+        # result can be produced with a single pass over it.
+        if maxdepth is not None and maxdepth < 1:
+            raise ValueError("maxdepth must be at least 1")
+        path = self._strip_protocol(path)
+        if path in self.store:
+            # path is itself a file
+            if not detail:
+                return [path]
+            filelike = self.store[path]
+            return {
+                path: {
+                    "name": path,
+                    "size": filelike.size,
+                    "type": "file",
+                    "created": filelike.created.timestamp(),
+                }
+            }
+
+        # Uniform prefix so that the search root "" (the filesystem root) and a
+        # nested path are handled the same way; rel depth is rel.count("/") + 1.
+        prefix = path + "/" if path else "/"
+        out = {}
+        dirs = {}
+
+        def add_ancestor_dirs(name):
+            # Register every directory implied between ``path`` and ``name`` that
+            # is within maxdepth, mirroring how walk() surfaces implied dirs.
+            idx = name.rfind("/")
+            while idx > len(path):
+                parent = name[:idx]
+                if parent in dirs:
+                    break
+                rel = parent[len(prefix) :]
+                if maxdepth is None or rel.count("/") + 1 <= maxdepth:
+                    dirs[parent] = {"name": parent, "size": 0, "type": "directory"}
+                idx = parent.rfind("/")
+
+        for name, filelike in self.store.items():
+            if not name.startswith(prefix):
+                continue
+            rel = name[len(prefix) :]
+            if withdirs:
+                add_ancestor_dirs(name)
+            if maxdepth is not None and rel.count("/") + 1 > maxdepth:
+                continue
+            out[name] = {
+                "name": name,
+                "size": filelike.size,
+                "type": "file",
+                "created": filelike.created.timestamp(),
+            }
+
+        if withdirs:
+            # Explicitly-created (possibly empty) directories live in pseudo_dirs.
+            for pdir in self.pseudo_dirs:
+                if pdir and pdir.startswith(prefix):
+                    add_ancestor_dirs(pdir + "/")
+            out.update(dirs)
+            # Mirror the base find(): include the search root itself when it is
+            # a directory (needed for posix glob compliance).
+            if path != "" and self.isdir(path):
+                out[path] = self.info(path)
+
+        names = sorted(out)
+        return {name: out[name] for name in names} if detail else names
+
     def ls(self, path, detail=True, **kwargs):
         path = self._strip_protocol(path)
         if path in self.store:
diff --git a/fsspec/implementations/tests/test_memory.py b/fsspec/implementations/tests/test_memory.py
index 2fb02c774..e6e3b6f15 100644
--- a/fsspec/implementations/tests/test_memory.py
+++ b/fsspec/implementations/tests/test_memory.py
@@ -4,6 +4,7 @@
 import pytest
 
 from fsspec.implementations.local import LocalFileSystem, make_path_posix
+from fsspec.implementations.memory import MemoryFileSystem
 
 
 def test_1(m):
@@ -394,3 +395,53 @@ def test_open_path_windows(m):
         f.write(b"some\nlines\nof\ntext")
 
     assert m.read_text(path) == "some\nlines\nof\ntext"
+
+
+def test_find_matches_generic(m):
+    # MemoryFileSystem overrides find() with a single-pass implementation; make
+    # sure it agrees with the generic ls()-based AbstractFileSystem.find() across
+    # roots, maxdepth, withdirs and detail.
+    from fsspec.spec import AbstractFileSystem
+
+    for path in [
+        "/data/a/f1.txt",
+        "/data/a/f2.txt",
+        "/data/a/b/deep.txt",
+        "/data/a/b/c/deepest.txt",
+        "/data/x.txt",
+        "/data/y/z.txt",
+        "/other/o.txt",
+    ]:
+        m.pipe_file(path, b"hello")
+    m.mkdir("/data/emptydir")  # empty (pseudo) directory
+    m.mkdir("/data/a/b/emptysub")
+
+    for root in ["", "/data", "/data/a", "/data/a/b", "/data/x.txt", "/nope"]:
+        for maxdepth in [None, 1, 2, 3]:
+            for withdirs in [False, True]:
+                for detail in [False, True]:
+                    got = m.find(
+                        root, maxdepth=maxdepth, withdirs=withdirs, detail=detail
+                    )
+                    expected = AbstractFileSystem.find(
+                        m, root, maxdepth=maxdepth, withdirs=withdirs, detail=detail
+                    )
+                    assert got == expected, (root, maxdepth, withdirs, detail)
+
+
+def test_find_does_not_scan_per_directory(m):
+    # Regression guard: the old find() called ls() once per directory and each
+    # ls() re-scanned the whole (global) store, giving O(n_dirs * n_files) work.
+    # The single-pass implementation must not call ls() at all, so total work
+    # stays O(n_files) regardless of how many directories there are.
+    from unittest import mock
+
+    for d in range(20):
+        for f in range(5):
+            m.pipe_file(f"/data/dir{d}/file{f}.txt", b"x")
+
+    with mock.patch.object(MemoryFileSystem, "ls", wraps=m.ls) as spy:
+        out = m.find("/data")
+
+    assert len(out) == 100
+    assert spy.call_count == 0