diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 22e1edc..55d3ad7 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -8,7 +8,7 @@ version: 2 build: os: ubuntu-24.04 tools: - python: "3.11" + python: "3.12" # Build documentation in the "docs/" directory with Sphinx sphinx: diff --git a/contributing.md b/contributing.md index 500c52b..04b17ab 100644 --- a/contributing.md +++ b/contributing.md @@ -29,7 +29,7 @@ Guidelines](https://opensource.google/conduct/). 1. It is recommended to do development in a separate virtual environment ```shell - python3.11 -m venv + python3.12 -m venv .venv ``` 2. Install all the build, dev, test and docs dependencies diff --git a/google/cloud/dataproc_ml/__init__.py b/google/cloud/dataproc_ml/__init__.py index 1eff3f4..88af3fa 100644 --- a/google/cloud/dataproc_ml/__init__.py +++ b/google/cloud/dataproc_ml/__init__.py @@ -12,4 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""A subpackage for ml.""" +"""A python library to ease MLOps for Dataproc customers""" + +try: + import pyspark +except ImportError: + raise ImportError( + "PySpark is not installed. The `dataproc-ml` library requires a Spark " + "environment.\n" + "Please install one of the following packages:\n" + "1. For standard Spark: pip install dataproc-ml[spark]\n" + "2. 
For Spark Connect: pip install dataproc-ml[spark-connect]" + ) from None diff --git a/pyproject.toml b/pyproject.toml index 2ec9fbd..c3301fc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,29 +4,28 @@ build-backend = "setuptools.build_meta" [project] name = "dataproc-ml" -version = "0.1.0" +version = "1.0.0-rc" authors = [{ name = "Google LLC" }] classifiers = [ "Intended Audience :: Developers", "Operating System :: OS Independent", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", "Topic :: Software Development :: Libraries :: Python Modules", ] description = "A python library to ease MLOps for Dataproc customers" license = "Apache-2.0" license-files = ["LICENSE*"] readme = "README.md" -requires-python = ">=3.11" +requires-python = ">=3.12" # Aligned with environment of Cloud Dataproc 2.3 for GCE-ML/serverless image dependencies = [ - "google-cloud-aiplatform>=1.88.0, <2.0.0", + "google-cloud-aiplatform>=1.121.0, <2.0.0", "google-cloud-storage>=2.19.0, <3.0.0", - "pandas>=2.1.4, <3.0.0", - "pyarrow>=16.1.0, <17.0.0", - "pyspark>=3.5.3, <4.0.0", + "pandas>=2.2.2, <3.0.0", + "pyarrow>=18.1.0, <20.0.0", "tenacity>=8.5.0, <9.0.0", - "tensorflow>=2.17.0, <2.20.0", + "tensorflow>=2.18.0, <2.20.0", "torch>=2.6.0, <3.0.0" ] @@ -40,6 +39,12 @@ test = [ "pytest", "torchvision>=0.21.0, <1.0.0", "pillow>=11.3.0, <12.0.0", + # For running tests, a full pyspark installation is needed. + "pyspark~=4.0.0", +] +spark = ["pyspark~=4.0.0"] +spark-connect = [ + "pyspark-client~=4.0.0", ] dev = [ "pyink", @@ -54,5 +59,9 @@ docs = [ [tool.pyink] line-length = 80 # Default is 88 +[tool.setuptools.packages.find] +include = ["google*"] +namespaces = true + [tool.pytest.ini_options] addopts = "--ignore=load/"