Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions docs/source/changelog.rst
Original file line number Diff line number Diff line change
@@ -1,6 +1,13 @@
Changelog
=========

Dev
---

Enhancements

- Single-pass ``MemoryFileSystem.find()``: the generic implementation called ``ls()`` once per directory and each call scanned the whole store, giving O(n_dirs * n_entries) work; a single pass over the flat store now suffices (#2055)

2026.6.0
--------

Expand Down
70 changes: 70 additions & 0 deletions fsspec/implementations/memory.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,76 @@ def _strip_protocol(cls, path):
path = path.lstrip("/").rstrip("/")
return "/" + path if path else ""

def find(self, path, maxdepth=None, withdirs=False, detail=False, **kwargs):
    """List everything below ``path`` with one pass over the flat store.

    The generic implementation issues one ``ls()`` per directory, and every
    ``ls()`` walks the complete store, so a tree costs
    O(n_dirs * n_entries).  Because ``self.store`` already maps every full
    path to its file object, a single scan yields the same answer.

    Parameters
    ----------
    path: str
        Search root; normalised with ``_strip_protocol``.
    maxdepth: int or None
        Deepest level to report, counted from ``path`` (direct children are
        level 1).  ``None`` means unlimited.
    withdirs: bool
        Also report directories: those implied by file paths, those listed
        in ``self.pseudo_dirs``, and the search root itself when it is a
        non-root directory.
    detail: bool
        If True return ``{name: info}``; otherwise a list of names.
    **kwargs:
        Accepted for signature compatibility; not used here.

    Returns
    -------
    Sorted list of names, or a dict keyed by name in sorted order.

    Raises
    ------
    ValueError
        If ``maxdepth`` is given and is smaller than 1.
    """
    if maxdepth is not None and maxdepth < 1:
        raise ValueError("maxdepth must be at least 1")

    def describe_file(key, entry):
        # Info record for one stored file object.
        return {
            "name": key,
            "size": entry.size,
            "type": "file",
            "created": entry.created.timestamp(),
        }

    def within_depth(relative):
        # ``relative`` is a path relative to the search root, so its level
        # is the number of separators plus one.
        return maxdepth is None or relative.count("/") + 1 <= maxdepth

    path = self._strip_protocol(path)
    if path in self.store:
        # The search root is itself a file: it is the only result.
        if detail:
            return {path: describe_file(path, self.store[path])}
        return [path]

    # The stripped root is "" for the filesystem root, so appending the
    # separator gives "/" there and "<path>/" everywhere else.
    prefix = path + "/"
    root_len = len(path)
    skip = len(prefix)
    found = {}
    directories = {}

    def register_parents(key):
        # Record each directory lying strictly between the search root and
        # ``key``, nearest first, keeping only those within maxdepth.
        cut = key.rfind("/")
        while cut > root_len:
            ancestor = key[:cut]
            if ancestor in directories:
                # Already recorded, and therefore so is everything above it.
                return
            if within_depth(ancestor[skip:]):
                directories[ancestor] = {
                    "name": ancestor,
                    "size": 0,
                    "type": "directory",
                }
            cut = ancestor.rfind("/")

    for key, entry in self.store.items():
        if key.startswith(prefix):
            if withdirs:
                # Parents are registered even for files that are too deep,
                # since a shallow directory may only be implied by them.
                register_parents(key)
            if within_depth(key[skip:]):
                found[key] = describe_file(key, entry)

    if withdirs:
        # Directories created explicitly (possibly empty) are only known
        # through pseudo_dirs; the trailing separator makes the directory
        # itself the first "parent" visited.
        for pseudo in self.pseudo_dirs:
            if pseudo and pseudo.startswith(prefix):
                register_parents(pseudo + "/")
        found.update(directories)
        # As in the generic find(), report the search root too when it is a
        # directory (needed for posix glob compliance).
        if path != "" and self.isdir(path):
            found[path] = self.info(path)

    ordered = sorted(found)
    if detail:
        return {key: found[key] for key in ordered}
    return ordered

def ls(self, path, detail=True, **kwargs):
path = self._strip_protocol(path)
if path in self.store:
Expand Down
51 changes: 51 additions & 0 deletions fsspec/implementations/tests/test_memory.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import pytest

from fsspec.implementations.local import LocalFileSystem, make_path_posix
from fsspec.implementations.memory import MemoryFileSystem


def test_1(m):
Expand Down Expand Up @@ -394,3 +395,53 @@ def test_open_path_windows(m):
f.write(b"some\nlines\nof\ntext")

assert m.read_text(path) == "some\nlines\nof\ntext"


def test_find_matches_generic(m):
    # The single-pass MemoryFileSystem.find() must be indistinguishable from
    # the generic, ls()-driven AbstractFileSystem.find() for every combination
    # of search root, maxdepth, withdirs and detail exercised below.
    from fsspec.spec import AbstractFileSystem

    files = (
        "/data/a/f1.txt",
        "/data/a/f2.txt",
        "/data/a/b/deep.txt",
        "/data/a/b/c/deepest.txt",
        "/data/x.txt",
        "/data/y/z.txt",
        "/other/o.txt",
    )
    for target in files:
        m.pipe_file(target, b"hello")
    # Explicitly created directories with no files inside them.
    m.mkdir("/data/emptydir")
    m.mkdir("/data/a/b/emptysub")

    # Roots cover: filesystem root, nested directories, a plain file, and a
    # path that does not exist.
    roots = ("", "/data", "/data/a", "/data/a/b", "/data/x.txt", "/nope")
    combos = [
        (root, maxdepth, withdirs, detail)
        for root in roots
        for maxdepth in (None, 1, 2, 3)
        for withdirs in (False, True)
        for detail in (False, True)
    ]

    for root, maxdepth, withdirs, detail in combos:
        options = {"maxdepth": maxdepth, "withdirs": withdirs, "detail": detail}
        got = m.find(root, **options)
        expected = AbstractFileSystem.find(m, root, **options)
        assert got == expected, (root, maxdepth, withdirs, detail)


def test_find_does_not_scan_per_directory(m):
    # Regression guard for the former behaviour, where find() issued one ls()
    # per directory and each ls() re-scanned the whole (global) store, i.e.
    # O(n_dirs * n_files) work.  The single-pass version must never reach
    # ls(), which keeps the cost at O(n_files) however many directories exist.
    from unittest import mock

    n_dirs, files_per_dir = 20, 5
    targets = [
        f"/data/dir{d}/file{f}.txt"
        for d in range(n_dirs)
        for f in range(files_per_dir)
    ]
    for target in targets:
        m.pipe_file(target, b"x")

    # Wrap rather than replace ls() so that any accidental call would still
    # behave normally while being counted.
    with mock.patch.object(MemoryFileSystem, "ls", wraps=m.ls) as ls_spy:
        found = m.find("/data")

    assert len(found) == n_dirs * files_per_dir
    assert ls_spy.call_count == 0