diff --git a/s3fs/core.py b/s3fs/core.py
index 190f9604..30a005b8 100644
--- a/s3fs/core.py
+++ b/s3fs/core.py
@@ -946,25 +946,59 @@ async def _find(
             when used by glob, but users usually only want files.
         prefix: str
             Only return files that match ``^{path}/{prefix}`` (if there is an
-            exact match ``filename == {path}/{prefix}``, it also will be included)
+            exact match ``filename == {path}/{prefix}``, it also will be included).
+            Can be combined with ``withdirs`` and ``maxdepth``.
         """
         path = self._strip_protocol(path)
         bucket, key, _ = self.split_path(path)
         if not bucket:
             raise ValueError("Cannot traverse all of S3")
-        if (withdirs or maxdepth) and prefix:
-            # TODO: perhaps propagate these to a glob(f"path/{prefix}*") call
-            raise ValueError(
-                "Can not specify 'prefix' option alongside 'withdirs'/'maxdepth' options."
-            )
         if maxdepth:
-            return await super()._find(
-                bucket + "/" + key,
-                maxdepth=maxdepth,
-                withdirs=withdirs,
-                detail=detail,
-                **kwargs,
+            if not prefix:
+                return await super()._find(
+                    bucket + "/" + key,
+                    maxdepth=maxdepth,
+                    withdirs=withdirs,
+                    detail=detail,
+                    **kwargs,
+                )
+            # maxdepth + prefix: one delimiter-based listing for the first level
+            # (server-side prefix filter), then recurse into matching subdirs
+            # normally — avoids fetching all nested objects up front.
+            first_level = await self._lsdir(
+                path, delimiter="/", prefix=prefix, **kwargs
             )
+            # NOTE(review): `_lsdir` may serve or store this listing via the
+            # directory cache (presumably keyed by path only; confirm), so
+            # re-apply the prefix filter client-side instead of trusting it.
+            wanted = f"{bucket}/{key}".rstrip("/") + "/" + prefix
+            first_level = [o for o in first_level if o["name"].startswith(wanted)]
+            dirs = [o for o in first_level if o["type"] == "directory"]
+            out = [o for o in first_level if o["type"] != "directory"]
+            nested_dirs = []
+            if maxdepth > 1:
+                for d in dirs:
+                    sub = await self._find(
+                        d["name"],
+                        maxdepth=maxdepth - 1,
+                        withdirs=withdirs,
+                        detail=True,
+                        **kwargs,
+                    )
+                    for name, info in sub.items():
+                        if name == d["name"]:
+                            continue  # root dir already in dirs
+                        if info["type"] == "directory":
+                            nested_dirs.append(info)
+                        else:
+                            out.append(info)
+            if withdirs:
+                out += dirs + nested_dirs
+            # Always sort so the order is deterministic regardless of withdirs.
+            out.sort(key=lambda o: o["name"])
+            if detail:
+                return {o["name"]: o for o in out}
+            return [o["name"] for o in out]
         # TODO: implement find from dircache, if all listings are present
         # if refresh is False:
         #     out = incomplete_tree_dirs(self.dircache, path)
diff --git a/s3fs/tests/test_s3fs.py b/s3fs/tests/test_s3fs.py
index 4195531c..b2822511 100644
--- a/s3fs/tests/test_s3fs.py
+++ b/s3fs/tests/test_s3fs.py
@@ -2625,6 +2625,50 @@ def test_find_with_prefix(s3):
     )
 
 
+def test_find_with_prefix_and_withdirs(s3):
+    # Issue #1013: prefix must be combinable with withdirs (used by _glob internally)
+    for cursor in range(10):
+        s3.touch(test_bucket_name + f"/wdpfx/sub/file_{cursor}")
+
+    # withdirs=True + prefix should work and include synthesised directory entries
+    result = s3.find(test_bucket_name, prefix="wdpfx", withdirs=True)
+    assert test_bucket_name + "/wdpfx/sub" in result
+    assert all(
+        r.startswith(test_bucket_name + "/wdpfx") for r in result
+    ), "prefix filter must be respected"
+    assert len([r for r in result if "file_" in r]) == 10
+
+    # prefix alone (withdirs=False default) must still work
+    files_only = s3.find(test_bucket_name + "/wdpfx/sub/", prefix="file_")
+    assert len(files_only) == 10
+
+
+def test_find_with_prefix_and_maxdepth(s3):
+    # Issue #1013: prefix must be combinable with maxdepth
+    for cursor in range(5):
+        s3.touch(test_bucket_name + f"/mxpfx/sub/file_{cursor}")
+    s3.touch(test_bucket_name + "/mxpfx_top")
+    # Warm the listing cache: the prefix filter must hold even when the
+    # first-level listing could be answered from cache.
+    s3.ls(test_bucket_name)
+
+    # maxdepth=1 from test_bucket_name: only direct children (depth 1) are returned
+    # test_bucket_name/mxpfx_top is at depth 1, test_bucket_name/mxpfx/sub/file_* are at depth 3
+    result = s3.find(test_bucket_name, prefix="mxpfx", maxdepth=1)
+    assert test_bucket_name + "/mxpfx_top" in result
+    assert all(r.startswith(test_bucket_name + "/mxpfx") for r in result)
+    assert not any("file_" in r for r in result), "depth-2+ files must be excluded"
+
+    # maxdepth=2: picks up test_bucket_name/mxpfx/sub (depth 2 dir) but not files inside
+    result2 = s3.find(test_bucket_name, prefix="mxpfx", maxdepth=2, withdirs=True)
+    assert test_bucket_name + "/mxpfx/sub" in result2
+    assert not any("file_" in r for r in result2), "depth-3 files must be excluded"
+
+    # maxdepth=3: all files now reachable
+    result3 = s3.find(test_bucket_name, prefix="mxpfx", maxdepth=3)
+    assert len([r for r in result3 if "file_" in r]) == 5
+
+
 def test_list_after_find(s3):
     before = s3.ls("s3://test")
     s3.invalidate_cache("s3://test/2014-01-01.csv")