Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 40 additions & 12 deletions s3fs/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -946,25 +946,53 @@ async def _find(
when used by glob, but users usually only want files.
prefix: str
Only return files that match ``^{path}/{prefix}`` (if there is an
exact match ``filename == {path}/{prefix}``, it also will be included)
exact match ``filename == {path}/{prefix}``, it also will be included).
Can be combined with ``withdirs`` and ``maxdepth``.
"""
path = self._strip_protocol(path)
bucket, key, _ = self.split_path(path)
if not bucket:
raise ValueError("Cannot traverse all of S3")
if (withdirs or maxdepth) and prefix:
# TODO: perhaps propagate these to a glob(f"path/{prefix}*") call
raise ValueError(
"Can not specify 'prefix' option alongside 'withdirs'/'maxdepth' options."
)
if maxdepth:
return await super()._find(
bucket + "/" + key,
maxdepth=maxdepth,
withdirs=withdirs,
detail=detail,
**kwargs,
if not prefix:
return await super()._find(
bucket + "/" + key,
maxdepth=maxdepth,
withdirs=withdirs,
detail=detail,
**kwargs,
)
# maxdepth + prefix: one delimiter-based listing for the first level
# (server-side prefix filter), then recurse into matching subdirs
# normally — avoids fetching all nested objects up front.
first_level = await self._lsdir(
path, delimiter="/", prefix=prefix, **kwargs
)
files = [o for o in first_level if o["type"] != "directory"]
dirs = [o for o in first_level if o["type"] == "directory"]
out = list(files)
out_dirs = list(dirs)
if maxdepth > 1:
for d in dirs:
sub = await self._find(
d["name"],
maxdepth=maxdepth - 1,
withdirs=withdirs,
detail=True,
**kwargs,
)
for name, info in sub.items():
if name == d["name"]:
continue # root dir already in out_dirs
if info["type"] == "directory":
out_dirs.append(info)
else:
out.append(info)
if withdirs:
out = sorted(out + out_dirs, key=lambda x: x["name"])
if detail:
return {o["name"]: o for o in out}
return [o["name"] for o in out]
# TODO: implement find from dircache, if all listings are present
# if refresh is False:
# out = incomplete_tree_dirs(self.dircache, path)
Expand Down
40 changes: 40 additions & 0 deletions s3fs/tests/test_s3fs.py
Original file line number Diff line number Diff line change
Expand Up @@ -2625,6 +2625,46 @@ def test_find_with_prefix(s3):
)


def test_find_with_prefix_and_withdirs(s3):
    """``find`` must accept ``prefix`` together with ``withdirs`` (issue #1013).

    ``_glob`` relies on this combination internally, so it must not raise and
    must still honour the prefix filter.
    """
    for idx in range(10):
        s3.touch(test_bucket_name + f"/wdpfx/sub/file_{idx}")

    # With withdirs=True the listing also carries the synthesised directory
    # entry for the intermediate "sub" level.
    listing = s3.find(test_bucket_name, prefix="wdpfx", withdirs=True)
    assert test_bucket_name + "/wdpfx/sub" in listing
    expected_root = test_bucket_name + "/wdpfx"
    assert all(
        name.startswith(expected_root) for name in listing
    ), "prefix filter must be respected"
    matching_files = [name for name in listing if "file_" in name]
    assert len(matching_files) == 10

    # Regression guard: prefix on its own (withdirs left at its default)
    # keeps working.
    plain_listing = s3.find(test_bucket_name + "/wdpfx/sub/", prefix="file_")
    assert len(plain_listing) == 10


def test_find_with_prefix_and_maxdepth(s3):
    """``find`` must accept ``prefix`` together with ``maxdepth`` (issue #1013).

    Layout created under the bucket:
      * ``mxpfx_top``         -- file at depth 1
      * ``mxpfx/sub``         -- directory at depth 2
      * ``mxpfx/sub/file_*``  -- files at depth 3
    """
    for idx in range(5):
        s3.touch(test_bucket_name + f"/mxpfx/sub/file_{idx}")
    s3.touch(test_bucket_name + "/mxpfx_top")

    # Depth 1: only direct children of the bucket are visible.
    depth_one = s3.find(test_bucket_name, prefix="mxpfx", maxdepth=1)
    assert test_bucket_name + "/mxpfx_top" in depth_one
    assert all(
        "file_" not in name for name in depth_one
    ), "depth-2+ files must be excluded"

    # Depth 2: the "sub" directory shows up (withdirs=True), its files do not.
    depth_two = s3.find(
        test_bucket_name, prefix="mxpfx", maxdepth=2, withdirs=True
    )
    assert test_bucket_name + "/mxpfx/sub" in depth_two
    assert all(
        "file_" not in name for name in depth_two
    ), "depth-3 files must be excluded"

    # Depth 3: every nested file is now reachable.
    depth_three = s3.find(test_bucket_name, prefix="mxpfx", maxdepth=3)
    nested_files = [name for name in depth_three if "file_" in name]
    assert len(nested_files) == 5


def test_list_after_find(s3):
before = s3.ls("s3://test")
s3.invalidate_cache("s3://test/2014-01-01.csv")
Expand Down
Loading