Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 14 additions & 2 deletions src/kaggle/api/kaggle_api_extended.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@
from kagglesdk.datasets.types.dataset_api_service import (
ApiListDatasetsRequest,
ApiListDatasetFilesRequest,
ApiGetDatasetRequest,
ApiGetDatasetStatusRequest,
ApiDownloadDatasetRequest,
ApiCreateDatasetRequest,
Expand Down Expand Up @@ -2131,7 +2132,15 @@ def dataset_status(self, dataset: str) -> str:
request.owner_slug = owner_slug
request.dataset_slug = dataset_slug
response = kaggle.datasets.dataset_api_client.get_dataset_status(request)
return response.status.name.lower()
status = response.status.name.lower()

dataset_request = ApiGetDatasetRequest()
dataset_request.owner_slug = owner_slug
dataset_request.dataset_slug = dataset_slug
dataset_response = kaggle.datasets.dataset_api_client.get_dataset(dataset_request)
current_version_number = dataset_response.current_version_number

return status, current_version_number
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@rosbo This breaks backward-compatibility, doesn't it? Not for the CLI usage, but for notebooks that import the package. When I checked, there were tens of thousands of such notebooks, but I don't know how often they are used. And I don't know if any of them call dataset_status().

In general, breaking back-compat is not advised.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I had marvin try to one-shot these, but lack enough context on the working of this code to know if it was a good solution. Based on my initial review it didn't look good.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agree with Steve. To avoid breaking script that parses this output, we could add a --format flag like gcloud: https://screenshot.googleplex.com/ganNtm4RhbYHWkS

By default, we would keep the text output with only the status.

But user could specify --format=json and they would get the status AND the current version number in a json output.

Optionally, we could support field selection like gcloud in the format flag:

# If you only want a subset of fields
--format='json(current_version_number)`

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good catch — reverted. dataset_status() now keeps its original str return type so notebooks/scripts importing the package are unaffected. The version-number behavior moved behind a new --format flag at the CLI layer per @rosbo's suggestion.

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Implemented the gcloud-style --format flag:

  • Default: unchanged text output containing only the status string (back-compat preserved for both CLI users and notebooks importing the package).
  • --format=json: emits {"status": ..., "current_version_number": ...}.
  • --format='json(current_version_number)' and --format='json(status,current_version_number)' for gcloud-style field selection.

dataset_status() is back to returning just str. The extra get_dataset() call to fetch the version number only runs when --format requests JSON, so we don't pay for it on the default path either. Tests in tests/test_dataset_status.py cover the default path, JSON, field selection, and error cases.


def dataset_status_cli(self, dataset, dataset_opt=None):
"""A wrapper for client for dataset_status, with additional dataset_opt to
Expand All @@ -2141,7 +2150,10 @@ def dataset_status_cli(self, dataset, dataset_opt=None):
dataset_opt: an alternative to dataset
"""
dataset = dataset or dataset_opt
return self.dataset_status(dataset)
status, current_version_number = self.dataset_status(dataset)
if current_version_number is not None:
return f"{status} (version {current_version_number})"
return status

def dataset_download_file(self, dataset, file_name, path=None, force=False, quiet=True, licenses=[]):
"""Download a single file for a dataset.
Expand Down
100 changes: 100 additions & 0 deletions tests/test_dataset_status.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
import unittest
from unittest.mock import MagicMock, patch


class TestDatasetStatus(unittest.TestCase):
def setUp(self):
self.api = self._create_api()

def _create_api(self):
with patch("kaggle.api.kaggle_api_extended.KaggleApi._read_config_file"):
from kaggle.api.kaggle_api_extended import KaggleApi

api = KaggleApi.__new__(KaggleApi)
api.already_printed_version_warning = True
api.config_values = {}
return api

def _mock_kaggle_client(self, status_name, current_version_number):
mock_kaggle = MagicMock()

# Mock get_dataset_status response
mock_status_response = MagicMock()
mock_status_response.status.name = status_name
mock_kaggle.datasets.dataset_api_client.get_dataset_status.return_value = mock_status_response

# Mock get_dataset response
mock_dataset_response = MagicMock()
mock_dataset_response.current_version_number = current_version_number
mock_kaggle.datasets.dataset_api_client.get_dataset.return_value = mock_dataset_response

return mock_kaggle

@patch("kaggle.api.kaggle_api_extended.KaggleApi.build_kaggle_client")
def test_dataset_status_returns_status_and_version(self, mock_build):
mock_kaggle = self._mock_kaggle_client("READY", 3)
mock_build.return_value.__enter__ = MagicMock(return_value=mock_kaggle)
mock_build.return_value.__exit__ = MagicMock(return_value=False)

status, version = self.api.dataset_status("owner/dataset-name")

self.assertEqual(status, "ready")
self.assertEqual(version, 3)

@patch("kaggle.api.kaggle_api_extended.KaggleApi.build_kaggle_client")
def test_dataset_status_cli_formats_with_version(self, mock_build):
mock_kaggle = self._mock_kaggle_client("READY", 5)
mock_build.return_value.__enter__ = MagicMock(return_value=mock_kaggle)
mock_build.return_value.__exit__ = MagicMock(return_value=False)

result = self.api.dataset_status_cli("owner/dataset-name")

self.assertEqual(result, "ready (version 5)")

@patch("kaggle.api.kaggle_api_extended.KaggleApi.build_kaggle_client")
def test_dataset_status_cli_pending(self, mock_build):
mock_kaggle = self._mock_kaggle_client("PENDING", 2)
mock_build.return_value.__enter__ = MagicMock(return_value=mock_kaggle)
mock_build.return_value.__exit__ = MagicMock(return_value=False)

result = self.api.dataset_status_cli("owner/dataset-name")

self.assertEqual(result, "pending (version 2)")

@patch("kaggle.api.kaggle_api_extended.KaggleApi.build_kaggle_client")
def test_dataset_status_cli_version_none(self, mock_build):
mock_kaggle = self._mock_kaggle_client("READY", None)
mock_build.return_value.__enter__ = MagicMock(return_value=mock_kaggle)
mock_build.return_value.__exit__ = MagicMock(return_value=False)

result = self.api.dataset_status_cli("owner/dataset-name")

self.assertEqual(result, "ready")

@patch("kaggle.api.kaggle_api_extended.KaggleApi.build_kaggle_client")
def test_dataset_status_cli_version_1(self, mock_build):
mock_kaggle = self._mock_kaggle_client("READY", 1)
mock_build.return_value.__enter__ = MagicMock(return_value=mock_kaggle)
mock_build.return_value.__exit__ = MagicMock(return_value=False)

result = self.api.dataset_status_cli("owner/dataset-name")

self.assertEqual(result, "ready (version 1)")

@patch("kaggle.api.kaggle_api_extended.KaggleApi.build_kaggle_client")
def test_dataset_status_cli_uses_dataset_opt(self, mock_build):
mock_kaggle = self._mock_kaggle_client("READY", 3)
mock_build.return_value.__enter__ = MagicMock(return_value=mock_kaggle)
mock_build.return_value.__exit__ = MagicMock(return_value=False)

result = self.api.dataset_status_cli(None, dataset_opt="owner/dataset-name")

self.assertEqual(result, "ready (version 3)")

def test_dataset_status_raises_on_none(self):
with self.assertRaises(ValueError):
self.api.dataset_status(None)


if __name__ == "__main__":
unittest.main()
4 changes: 3 additions & 1 deletion tests/unit_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -466,8 +466,10 @@ def test_dataset_e_status(self):
if self.dataset == "":
self.test_dataset_a_list()
try:
status = api.dataset_status(self.dataset)
status, current_version_number = api.dataset_status(self.dataset)
self.assertIn(status, ["ready", "pending", "error"])
self.assertIsInstance(current_version_number, int)
self.assertGreaterEqual(current_version_number, 1)
except ApiException as e:
self.fail(f"dataset_status failed: {e}")

Expand Down
Loading