diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 8b9c0200d..52d7a483b 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -35,10 +35,12 @@ jobs: - name: install run: | pip install -e . - - name: Run Standard Tests + - name: Run Standard Bucket Tests with extended feature support turned OFF run: | export GOOGLE_APPLICATION_CREDENTIALS=$(pwd)/gcsfs/tests/fake-secret.json + export GCSFS_EXPERIMENTAL_ZB_HNS_SUPPORT="false" pytest -vv -s \ + --cov=gcsfs --cov-report=xml \ --log-format="%(asctime)s %(levelname)s %(message)s" \ --log-date-format="%H:%M:%S" \ gcsfs/ @@ -46,15 +48,28 @@ jobs: run: | pip install -r gcsfs/tests/perf/microbenchmarks/requirements.txt pytest gcsfs/tests/perf/microbenchmarks --run-benchmarks-infra - - name: Run Tests with experimental support + - name: Run Standard Bucket Tests with default ON extended feature support run: | export GOOGLE_APPLICATION_CREDENTIALS=$(pwd)/gcsfs/tests/fake-secret.json - export GCSFS_EXPERIMENTAL_ZB_HNS_SUPPORT="true" pytest -vv -s \ - --cov=gcsfs --cov-report=xml \ + --cov=gcsfs --cov-append --cov-report=xml \ --log-format="%(asctime)s %(levelname)s %(message)s" \ --log-date-format="%H:%M:%S" \ gcsfs/ + - name: Run Extended tests (Zonal & HNS Enabled) + run: | + export GOOGLE_APPLICATION_CREDENTIALS=$(pwd)/gcsfs/tests/fake-secret.json + # TODO: Use dedicated test variables to decouple specialised test execution from the GCSFS_EXPERIMENTAL_ZB_HNS_SUPPORT feature flag. 
+ export GCSFS_EXPERIMENTAL_ZB_HNS_SUPPORT="true" + pytest -vv -s \ + --cov=gcsfs --cov-append --cov-report=xml \ + --log-format="%(asctime)s %(levelname)s %(message)s" \ + --log-date-format="%H:%M:%S" \ + gcsfs/tests/test_extended_gcsfs.py \ + gcsfs/tests/test_extended_hns_gcsfs.py \ + gcsfs/tests/test_extended_gcsfs_unit.py \ + gcsfs/tests/test_zb_hns_utils.py \ + gcsfs/tests/test_zonal_file.py - name: Upload coverage to Codecov uses: codecov/codecov-action@v5 with: diff --git a/docs/source/hns_buckets.rst b/docs/source/hns_buckets.rst new file mode 100644 index 000000000..b4b97f61d --- /dev/null +++ b/docs/source/hns_buckets.rst @@ -0,0 +1,89 @@ +Hierarchical Namespace (HNS) +============================================== + +To train, checkpoint, and serve AI models at peak efficiency, Google Cloud Storage (GCS) offers **Hierarchical Namespace (HNS)**. + +``gcsfs`` provides full support for all data and metadata operations on HNS buckets. + +What is a Hierarchical Namespace (HNS)? +--------------------------------------- + +Historically, GCS buckets have utilized a **flat namespace**. In a flat +namespace, directories do not exist as distinct physical entities; they are +simulated by 0-byte objects ending in a slash (``/``) or by filtering object +prefixes during list operations. + +A `Hierarchical Namespace (HNS) <https://cloud.google.com/storage/docs/hns-overview>`_ introduces true, logical directories as first-class resources to GCS. + +Under the Hood: The ``ExtendedGcsFileSystem`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +``gcsfs`` utilizes the ``ExtendedGcsFileSystem`` class under the hood (implemented in `gcsfs/extended_gcsfs.py <https://github.com/fsspec/gcsfs/blob/main/gcsfs/extended_gcsfs.py>`_). + +Importantly, ``ExtendedGcsFileSystem`` is designed to be fully backward-compatible. Before executing directory operations, it automatically identifies the underlying bucket type. If it detects a standard flat-namespace bucket, it routes the request back to standard object-level operations, ensuring your existing buckets continue to work without issue. 
+ +The fundamental architectural shift is that ``ExtendedGcsFileSystem`` actively routes directory-level operations to the **GCS Folders gRPC API** instead of relying solely on the Objects API. + +.. list-table:: **Operation Semantics: Flat Namespace vs. HNS** + :widths: 15 40 45 + :header-rows: 1 + + * - Operation + - Flat Namespace (Standard ``gcsfs``) + - Hierarchical Namespace (``ExtendedGcsFileSystem``) + * - ``mkdir`` + - Only used for creating buckets, since a GCS flat namespace doesn't have real directories. + - Calls the native GCS Folders API, creating a physical GCS folder resource instead of simulating one with a 0-byte object or an object prefix. + * - ``rmdir`` + - Primarily used to delete buckets, as directories do not exist as distinct physical entities. + - Used to delete empty folders natively via the GCS Folders API, in addition to deleting buckets. + * - ``rm`` + - Paginates through the listing and issues an individual delete request for every object matching the prefix. + - Deletes the folder resource and its contents, issuing folder-delete requests for folders and object-delete requests for files. + * - ``rename`` / ``mv`` + - Issues a ``Copy`` request for each object under the prefix, followed by ``Delete``. Non-atomic, ``O(N)``. + - Triggers a single native metadata-only rename on the folder. **Atomic** and more performant, ``O(1)``, which is helpful for checkpointing. + * - ``info`` + - Infers directory existence by checking for child objects, returning mocked 0-byte metadata. + - Uses ``get_folder_metadata`` to explicitly query the Folders API, returning accurate metadata (creation time, resource IDs). + +Important Differences to Keep in Mind +------------------------------------- + +While ``gcsfs`` aims to abstract the differences via the ``fsspec`` API, you should be aware of standard HNS limitations imposed by the Google Cloud Storage API: + +1. 
**Implicit directories:** In standard GCS, you can create an object ``a/b/c.txt`` without the directories ``a/`` or ``a/b/`` physically existing. In HNS, the parent folder resources must exist (or be created) before the object can be written. ``gcsfs`` handles parent folder creation natively under the hood. +2. ``mkdir`` **behavior:** Previously, in a flat namespace, calling ``mkdir`` on a path could only ensure the underlying bucket exists. With HNS enabled, calling ``mkdir`` will create an actual folder resource in GCS. Furthermore, to create nested folders (e.g. ``bucket/a/b/c/d``), pass ``create_parents=True``; this physically creates all intermediate folder resources along the specified path. +3. **No mixing or toggling:** You cannot toggle HNS on an existing flat-namespace bucket. You must create a new HNS bucket and migrate your data. +4. **Object naming:** Object names in HNS buckets cannot end with a slash (``/``) without a physical folder resource being created for that path. +5. **Rename performance:** Folder renames are considerably faster on HNS buckets, as the benchmarks below show. + +The following benchmarks show the time taken (in seconds) to rename a directory containing a large number of files (spread across 256 folders and 8 levels) in a standard Regional bucket versus an HNS bucket (can be replicated using ``gcsfs/tests/perf/microbenchmarks/rename``): + +.. list-table:: + :header-rows: 1 + + * - File Count + - Standard Regional (seconds) + - HNS (seconds) + * - 65K Files + - 75.69 + - 15.4 + * - 100K Files + - 170.6 + - 23.2 + +For more details on managing these buckets, refer to the official documentation for `Hierarchical Namespace <https://cloud.google.com/storage/docs/hns-overview>`_. + +Disabling HNS Support +------------------------------ + +You can disable these features by explicitly setting the ``GCSFS_EXPERIMENTAL_ZB_HNS_SUPPORT`` environment variable to ``false``. + +**Code Example** + +.. 
code-block:: bash + + export GCSFS_EXPERIMENTAL_ZB_HNS_SUPPORT=false + +**Note:** *The choice of which filesystem class to use is made at import time based on the GCSFS_EXPERIMENTAL_ZB_HNS_SUPPORT environment variable, and cannot be controlled via constructor arguments passed to GCSFileSystem (but you can still import each class explicitly, if needed).* diff --git a/docs/source/index.rst b/docs/source/index.rst index ca788676f..a018bfd33 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -14,6 +14,15 @@ objects of the type used by zarr. .. _fsspec: https://filesystem-spec.readthedocs.io/en/latest/ +.. warning:: + **Default Filesystem Implementation Change:** + ``gcsfs`` now uses ``ExtendedGcsFileSystem`` as the default + entry point for all bucket types to support specialised storage buckets like HNS out of the box. + While all operations on standard buckets will route to the ``core.GCSFileSystem`` (pre-existing implementation) under the hood, + this represents a change in the default flow. If you experience any unexpected behavior due to this change, + you can revert to the previous implementation by setting the environment variable + ``GCSFS_EXPERIMENTAL_ZB_HNS_SUPPORT=false`` before importing ``gcsfs``. + Installation ------------ @@ -228,6 +237,7 @@ Contents .. 
toctree:: api developer + hns_buckets fuse changelog code-of-conduct diff --git a/gcsfs/__init__.py b/gcsfs/__init__.py index f22d6ba13..d9a728890 100644 --- a/gcsfs/__init__.py +++ b/gcsfs/__init__.py @@ -9,7 +9,7 @@ from .core import GCSFileSystem from .mapping import GCSMap -if os.getenv("GCSFS_EXPERIMENTAL_ZB_HNS_SUPPORT", "false").lower() in ("true", "1"): +if os.getenv("GCSFS_EXPERIMENTAL_ZB_HNS_SUPPORT", "true").lower() in ("true", "1"): try: from .extended_gcsfs import ExtendedGcsFileSystem as GCSFileSystem diff --git a/gcsfs/tests/test_init.py b/gcsfs/tests/test_init.py index 4619de4f6..25d8ade33 100644 --- a/gcsfs/tests/test_init.py +++ b/gcsfs/tests/test_init.py @@ -36,10 +36,10 @@ def teardown_method(self, method): # affecting other tests sys.modules.update(self.original_modules) - def test_experimental_env_unset(self): + def test_experimental_env_is_set_by_default(self): """ - Tests gcsfs.GCSFileSystem is core.GCSFileSystem when - GCSFS_EXPERIMENTAL_ZB_HNS_SUPPORT is NOT set. + Tests gcsfs.GCSFileSystem is extended_gcsfs.ExtendedGcsFileSystem when + GCSFS_EXPERIMENTAL_ZB_HNS_SUPPORT is NOT set and uses default value. """ if "GCSFS_EXPERIMENTAL_ZB_HNS_SUPPORT" in os.environ: del os.environ["GCSFS_EXPERIMENTAL_ZB_HNS_SUPPORT"] @@ -47,16 +47,13 @@ def test_experimental_env_unset(self): import gcsfs assert ( - gcsfs.GCSFileSystem is gcsfs.core.GCSFileSystem - ), "Should be core.GCSFileSystem" - assert not hasattr( - gcsfs, "ExtendedGcsFileSystem" - ), "ExtendedGcsFileSystem should not be imported directly on gcsfs" + gcsfs.GCSFileSystem is gcsfs.extended_gcsfs.ExtendedGcsFileSystem + ), "Should be ExtendedGcsFileSystem" - def test_experimental_env_set(self): + def test_experimental_env_set_to_true(self): """ Tests gcsfs.GCSFileSystem is extended_gcsfs.ExtendedGcsFileSystem when - GCSFS_EXPERIMENTAL_ZB_HNS_SUPPORT IS set. + GCSFS_EXPERIMENTAL_ZB_HNS_SUPPORT IS set to true. 
""" os.environ["GCSFS_EXPERIMENTAL_ZB_HNS_SUPPORT"] = "true" @@ -65,3 +62,19 @@ def test_experimental_env_set(self): assert ( gcsfs.GCSFileSystem is gcsfs.extended_gcsfs.ExtendedGcsFileSystem ), "Should be ExtendedGcsFileSystem" + + def test_experimental_env_set_to_false(self): + """ + Tests gcsfs.GCSFileSystem is core.GCSFileSystem when + GCSFS_EXPERIMENTAL_ZB_HNS_SUPPORT IS set to false. + """ + os.environ["GCSFS_EXPERIMENTAL_ZB_HNS_SUPPORT"] = "false" + + import gcsfs + + assert ( + gcsfs.GCSFileSystem is gcsfs.core.GCSFileSystem + ), "Should be core.GCSFileSystem" + assert not hasattr( + gcsfs, "ExtendedGcsFileSystem" + ), "ExtendedGcsFileSystem should not be imported directly on gcsfs"