2 changes: 1 addition & 1 deletion .readthedocs.yaml
@@ -8,7 +8,7 @@ version: 2
 build:
   os: ubuntu-24.04
   tools:
-    python: "3.11"
+    python: "3.12"
 
 # Build documentation in the "docs/" directory with Sphinx
 sphinx:
2 changes: 1 addition & 1 deletion contributing.md
@@ -29,7 +29,7 @@ Guidelines](https://opensource.google/conduct/).
 1. It is recommended to do development in a separate virtual environment
 
    ```shell
-   python3.11 -m venv <your-env>
+   python3.12 -m venv <your-env>
    ```
 
 2. Install all the build, dev, test and docs dependencies
13 changes: 12 additions & 1 deletion google/cloud/dataproc_ml/__init__.py
@@ -12,4 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-"""A subpackage for ml."""
+"""A python library to ease MLOps for Dataproc customers"""
+
+try:
+    import pyspark
+except ImportError:
+    raise ImportError(
+        "PySpark is not installed. The `dataproc-ml` library requires a Spark "
+        "environment.\n"
+        "Please install one of the following packages:\n"
+        "1. For standard Spark: pip install dataproc-ml[spark]\n"
+        "2. For Spark Connect: pip install dataproc-ml[spark-connect]"
+    ) from None
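The new import guard follows the standard optional-dependency pattern: catch the bare `ImportError` and re-raise it with an actionable install hint, using `from None` to suppress the chained "During handling of the above exception..." traceback. A standalone sketch of the same pattern (the `guarded_import` helper is illustrative, not part of the library):

```python
def guarded_import(module_name: str, extra: str):
    """Import a module or fail with an actionable install hint."""
    try:
        return __import__(module_name)
    except ImportError:
        # 'from None' suppresses the chained traceback so users only
        # see the friendly message, not the raw import failure.
        raise ImportError(
            f"{module_name} is not installed. "
            f"Try: pip install dataproc-ml[{extra}]"
        ) from None

# A present module imports normally; a missing one raises the hint.
math_mod = guarded_import("math", "spark")
print(math_mod.sqrt(9))  # → 3.0
```

Putting the guard in `__init__.py` makes the failure happen at first import of the package, which surfaces the missing extra immediately rather than deep inside a Spark call.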
25 changes: 17 additions & 8 deletions pyproject.toml
@@ -4,29 +4,28 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "dataproc-ml"
-version = "0.1.0"
+version = "1.0.0-rc"
 authors = [{ name = "Google LLC" }]
 classifiers = [
     "Intended Audience :: Developers",
     "Operating System :: OS Independent",
     "Programming Language :: Python :: 3",
-    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
     "Topic :: Software Development :: Libraries :: Python Modules",
 ]
 description = "A python library to ease MLOps for Dataproc customers"
 license = "Apache-2.0"
 license-files = ["LICENSE*"]
 readme = "README.md"
-requires-python = ">=3.11"
+requires-python = ">=3.12"
 # Aligned with environment of Cloud Dataproc 2.3 for GCE-ML/serverless image
 dependencies = [
-    "google-cloud-aiplatform>=1.88.0, <2.0.0",
+    "google-cloud-aiplatform>=1.121.0, <2.0.0",
     "google-cloud-storage>=2.19.0, <3.0.0",
-    "pandas>=2.1.4, <3.0.0",
-    "pyarrow>=16.1.0, <17.0.0",
-    "pyspark>=3.5.3, <4.0.0",
+    "pandas>=2.2.2, <3.0.0",
+    "pyarrow>=18.1, <20.0.0",

[Review comment, severity: medium]
For dependency versioning, it is good practice to be explicit and include the full patch version. While "18.1" is a valid version specifier, "18.1.0" is clearer and avoids potential ambiguity with some package management tools. This small change improves the long-term maintainability and readability of the dependency list.

Suggested change:
-    "pyarrow>=18.1, <20.0.0",
+    "pyarrow>=18.1.0, <20.0.0",
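For context on the reviewer's point: under PEP 440, release segments compare as if zero-padded, so `>=18.1` and `>=18.1.0` admit the same versions for compliant tools, and the explicit patch version mainly aids readability. A toy illustration of that padding rule (the `parse_release` helper is hand-rolled for illustration, not a real version parser):

```python
def parse_release(version: str) -> tuple:
    """Zero-pad a dotted release version to 3 parts, mirroring how
    PEP 440 compares release segments of different lengths."""
    parts = [int(p) for p in version.split(".")]
    while len(parts) < 3:
        parts.append(0)
    return tuple(parts)

# 18.1 and 18.1.0 compare equal once zero-padded.
print(parse_release("18.1") == parse_release("18.1.0"))  # → True
```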

     "tenacity>=8.5.0, <9.0.0",
-    "tensorflow>=2.17.0, <2.20.0",
+    "tensorflow>=2.18.0, <2.20.0",
     "torch>=2.6.0, <3.0.0"
 ]

@@ -40,6 +39,12 @@ test = [
     "pytest",
     "torchvision>=0.21.0, <1.0.0",
     "pillow>=11.3.0, <12.0.0",
+    # For running tests, a full pyspark installation is needed.
+    "pyspark~=4.0.0",
 ]
+spark = ["pyspark~=4.0.0"]
+spark-connect = [
+    "pyspark-client~=4.0.0",
+]
 dev = [
     "pyink",
@@ -54,5 +59,9 @@ docs = [
 [tool.pyink]
 line-length = 80 # Default is 88
 
+[tool.setuptools.packages.find]
+include = ["google*"]
+namespaces = true
+
 [tool.pytest.ini_options]
 addopts = "--ignore=load/"