diff --git a/CHANGES.md b/CHANGES.md index 3d06e5c04..09fbe5a40 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -3,8 +3,9 @@ ## New Features - Declare support for python 3.13 `PR #1848` +- Allow using the PEP 691 Simple API to fetch package lists and changed packages `PR #2075` -## Big Fixes +## Bug Fixes - Support reading HTTP proxy URLs from environment variables, and SOCKS proxy URLs from the 'mirror.proxy' config option `PR #1861` diff --git a/src/bandersnatch/configuration.py b/src/bandersnatch/configuration.py index 246e4c088..fe883ee3e 100644 --- a/src/bandersnatch/configuration.py +++ b/src/bandersnatch/configuration.py @@ -37,6 +37,7 @@ class SetConfigValues(NamedTuple): download_mirror: str download_mirror_no_fallback: bool simple_format: SimpleFormat + api_method: str class Singleton(type): # pragma: no cover @@ -218,6 +219,14 @@ def validate_config_values( # noqa: C901 cleanup = config.getboolean("mirror", "cleanup", fallback=False) + api_method = config.get("mirror", "api-method", fallback="xmlrpc") + if api_method not in ("xmlrpc", "simple"): + raise ValueError( + f"Supplied api-method {api_method} is not supported! Please " + + "update api-method to one of ('xmlrpc', 'simple') in the [mirror] " + + "section." 
+ ) + return SetConfigValues( json_save, root_uri, @@ -231,4 +240,5 @@ def validate_config_values( # noqa: C901 download_mirror, download_mirror_no_fallback, simple_format, + api_method, ) diff --git a/src/bandersnatch/defaults.conf b/src/bandersnatch/defaults.conf index ee041dbf4..99568bb8a 100644 --- a/src/bandersnatch/defaults.conf +++ b/src/bandersnatch/defaults.conf @@ -33,3 +33,7 @@ diff-file = diff-append-epoch = false log-config = + +; API method to use for fetching package lists and changelogs +; Options: xmlrpc, simple +api-method = xmlrpc diff --git a/src/bandersnatch/example.conf b/src/bandersnatch/example.conf index acf3dc5f3..1f5a74463 100644 --- a/src/bandersnatch/example.conf +++ b/src/bandersnatch/example.conf @@ -116,3 +116,11 @@ compare-method = hash ; be appended to the filename (i.e. /path/to/diff-1568129735) ; diff-file = /srv/pypi/mirrored-files ; diff-append-epoch = true + +; Configure the API method to use for fetching package lists and changelogs. +; Options are: +; - xmlrpc: Use the XML-RPC API (default, traditional method) +; - simple: Use the Simple (PEP 691 v1) API (newer method) +; The xmlrpc option is the default and more stable; simple avoids XML-RPC but +; re-fetches the full project index whenever it checks for changed packages. 
+; api-method = xmlrpc diff --git a/src/bandersnatch/main.py b/src/bandersnatch/main.py index 5b165699c..0fbb6bd5d 100644 --- a/src/bandersnatch/main.py +++ b/src/bandersnatch/main.py @@ -152,10 +152,12 @@ def _make_parser() -> argparse.ArgumentParser: async def async_main(args: argparse.Namespace, config: ConfigParser) -> int: if args.op.lower() == "delete": + config_values = bandersnatch.configuration.validate_config_values(config) async with bandersnatch.master.Master( config.get("mirror", "master"), config.getfloat("mirror", "timeout"), config.getfloat("mirror", "global-timeout", fallback=None), + api_method=config_values.api_method, ) as master: return await bandersnatch.delete.delete_packages(config, args, master) elif args.op.lower() == "verify": diff --git a/src/bandersnatch/master.py b/src/bandersnatch/master.py index e1d781d23..9c483a779 100644 --- a/src/bandersnatch/master.py +++ b/src/bandersnatch/master.py @@ -40,10 +40,12 @@ def __init__( global_timeout: float | None = FIVE_HOURS_FLOAT, proxy: str | None = None, allow_non_https: bool = False, + api_method: str = "xmlrpc", ) -> None: self.url = url self.timeout = timeout self.global_timeout = global_timeout or FIVE_HOURS_FLOAT + self.api_method = api_method proxy_url = proxy if proxy else proxy_address_from_env() self.proxy_kwargs = get_aiohttp_proxy_kwargs(proxy_url) if proxy_url else {} @@ -144,6 +146,10 @@ async def url_fetch( def xmlrpc_url(self) -> str: return f"{self.url}/pypi" + @property + def simple_url(self) -> str: + return f"{self.url}/simple" + # TODO: Potentially make USER_AGENT more accessible from aiohttp-xmlrpc async def _gen_custom_headers(self) -> dict[str, str]: # Create dummy client so we can copy the USER_AGENT + prepend bandersnatch info @@ -177,13 +183,59 @@ async def rpc(self, method_name: str, serial: int = 0) -> Any: except TimeoutError as te: logger.error(f"Call to {method_name} @ {self.xmlrpc_url} timed out: {te}") + async def fetch_simple_index(self) -> Any: + """Return a 
mapping of all project data from the PyPI Index API""" + custom_headers = await self._gen_custom_headers() + custom_headers["Accept"] = "application/vnd.pypi.simple.v1+json" + logger.debug( + f"Fetching simple JSON index from {self.simple_url} " + f"w/headers {custom_headers}" + ) + async with self.session.get( + self.simple_url, headers=custom_headers + ) as response: + simple_index = await response.json() + return simple_index + async def all_packages(self) -> Any: + if self.api_method == "simple": + return await self._all_packages_simple() + else: + return await self._all_packages_xmlrpc() + + async def _all_packages_xmlrpc(self) -> Any: all_packages_with_serial = await self.rpc("list_packages_with_serial") if not all_packages_with_serial: raise XmlRpcError("Unable to get full list of packages") return all_packages_with_serial + async def _all_packages_simple(self) -> dict[str, int]: + """ + Fetch all packages using the PEP 691 Simple API JSON endpoint. + Returns a dict mapping package names to their serial numbers. 
+ """ + logger.info("Fetching all packages via Simple (PEP 691 v1) API") + simple_index = await self.fetch_simple_index() + if not simple_index: + return {} + all_packages = {} + for project in simple_index.get("projects", []): + name = project.get("name") + serial = project.get("_last-serial") + if name is not None and serial is not None: + all_packages[name] = serial + else: + logger.warning(f"Skipping malformed project entry in simple index: {project}") + logger.debug(f"Fetched #{len(all_packages)} from simple JSON index") + return all_packages + async def changed_packages(self, last_serial: int) -> dict[str, int]: + if self.api_method == "simple": + return await self._changed_packages_simple(last_serial) + else: + return await self._changed_packages_xmlrpc(last_serial) + + async def _changed_packages_xmlrpc(self, last_serial: int) -> dict[str, int]: changelog = await self.rpc("changelog_since_serial", last_serial) if changelog is None: changelog = [] @@ -194,6 +246,26 @@ async def changed_packages(self, last_serial: int) -> dict[str, int]: packages[package] = serial return packages + async def _changed_packages_simple(self, last_serial: int) -> dict[str, int]: + """ + For the Simple (PEP 691 v1) API, we need to fetch all packages and compare serials. + The Simple API doesn't have a direct "changelog since serial" equivalent, + so we fetch all packages and return those with serial > last_serial. + + Note: This is less efficient than XML-RPC changelog, but works with Simple API. 
+ """ + logger.info( + f"Fetching changed packages since serial {last_serial} via Simple (PEP 691 v1) API" + ) + + # Get all packages with current serial + all_packages = await self._all_packages_simple() + changed_packages = { + pkg: ser for pkg, ser in all_packages.items() if ser > last_serial + } + logger.debug(f"Fetched #{len(changed_packages)} changed packages") + return changed_packages + async def get_package_metadata(self, package_name: str, serial: int = 0) -> Any: try: metadata_generator = self.get(f"/pypi/{package_name}/json", serial) diff --git a/src/bandersnatch/mirror.py b/src/bandersnatch/mirror.py index 519167268..85756899c 100644 --- a/src/bandersnatch/mirror.py +++ b/src/bandersnatch/mirror.py @@ -980,7 +980,12 @@ async def mirror( # Always reference those classes here with the fully qualified name to # allow them being patched by mock libraries! async with Master( - mirror_url, timeout, global_timeout, proxy, allow_non_https + mirror_url, + timeout, + global_timeout, + proxy, + allow_non_https, + config_values.api_method, ) as master: mirror = BandersnatchMirror( homedir, diff --git a/src/bandersnatch/tests/test_configuration.py b/src/bandersnatch/tests/test_configuration.py index b4969b96a..f26ae2e38 100644 --- a/src/bandersnatch/tests/test_configuration.py +++ b/src/bandersnatch/tests/test_configuration.py @@ -60,6 +60,7 @@ def test_single_config__default__mirror__setting_attributes(self) -> None: options, { "allow-non-https", + "api-method", "cleanup", "compare-method", "diff-append-epoch", @@ -101,6 +102,7 @@ def test_single_config__default__mirror__setting__types(self) -> None: ("global-timeout", int), ("workers", int), ("compare-method", str), + ("api-method", str), ]: self.assertIsInstance( option_type(instance["mirror"].get(option)), option_type @@ -146,6 +148,7 @@ def test_validate_config_values(self) -> None: "", False, SimpleFormat.ALL, + "xmlrpc", ) no_options_configparser = BandersnatchConfig(load_defaults=True) self.assertEqual( @@ 
-166,6 +169,7 @@ def test_validate_config_values_release_files_false_sets_root_uri(self) -> None: "", False, SimpleFormat.ALL, + "xmlrpc", ) release_files_false_configparser = BandersnatchConfig(load_defaults=True) release_files_false_configparser["mirror"].update({"release-files": "false"}) @@ -189,6 +193,7 @@ def test_validate_config_values_download_mirror_false_sets_no_fallback( "", False, SimpleFormat.ALL, + "xmlrpc", ) release_files_false_configparser = BandersnatchConfig(load_defaults=True) release_files_false_configparser["mirror"].update( @@ -200,6 +205,66 @@ def test_validate_config_values_download_mirror_false_sets_no_fallback( default_values, validate_config_values(release_files_false_configparser) ) + def test_validate_config_values_api_method_simple(self) -> None: + """Test that api_method='simple' is accepted and validated.""" + simple_api_values = SetConfigValues( + False, + "", + "", + False, + SimpleDigest.SHA256, + "filesystem", + False, + True, + "hash", + "", + False, + SimpleFormat.ALL, + "simple", + ) + simple_api_config = BandersnatchConfig(load_defaults=True) + simple_api_config["mirror"].update({"api-method": "simple"}) + self.assertEqual(simple_api_values, validate_config_values(simple_api_config)) + + def test_validate_config_values_api_method_xmlrpc(self) -> None: + """Test that api_method='xmlrpc' is accepted and validated.""" + xmlrpc_api_values = SetConfigValues( + False, + "", + "", + False, + SimpleDigest.SHA256, + "filesystem", + False, + True, + "hash", + "", + False, + SimpleFormat.ALL, + "xmlrpc", + ) + xmlrpc_api_config = BandersnatchConfig(load_defaults=True) + xmlrpc_api_config["mirror"].update({"api-method": "xmlrpc"}) + self.assertEqual(xmlrpc_api_values, validate_config_values(xmlrpc_api_config)) + + def test_validate_config_values_api_method_invalid(self) -> None: + """Test that invalid api_method raises ValueError.""" + invalid_api_config = BandersnatchConfig(load_defaults=True) + 
invalid_api_config["mirror"].update({"api-method": "invalid"}) + with self.assertRaises(ValueError) as context: + validate_config_values(invalid_api_config) + self.assertIn("api-method invalid is not supported", str(context.exception)) + self.assertIn("('xmlrpc', 'simple')", str(context.exception)) + + def test_validate_config_values_api_method_defaults_to_xmlrpc(self) -> None: + """Test that api_method defaults to 'xmlrpc' when not specified.""" + config = BandersnatchConfig(load_defaults=True) + # Remove the api-method config if it exists + if config.has_option("mirror", "api-method"): + config.remove_option("mirror", "api-method") + result = validate_config_values(config) + self.assertEqual(result.api_method, "xmlrpc") + def test_validate_config_diff_file_reference(self) -> None: diff_file_test_cases = [ ( diff --git a/src/bandersnatch/tests/test_master.py b/src/bandersnatch/tests/test_master.py index debbaa02a..73fb6f05d 100644 --- a/src/bandersnatch/tests/test_master.py +++ b/src/bandersnatch/tests/test_master.py @@ -1,6 +1,7 @@ import concurrent.futures from pathlib import Path from tempfile import gettempdir +from typing import Any from unittest.mock import AsyncMock, patch import pytest @@ -91,3 +92,140 @@ async def test_session_raise_for_status(master: Master) -> None: pass assert len(create_session.call_args_list) == 1 assert create_session.call_args_list[0][1]["raise_for_status"] + + +# Tests for Simple API (PEP 691 v1) + + +@pytest.mark.asyncio +async def test_all_packages_simple_api() -> None: + """Test fetching all packages using the Simple (PEP 691 v1) API.""" + master = Master("https://pypi.example.com", api_method="simple") + + # Mock fetch_simple_index to return Simple API response + async def mock_fetch_simple_index() -> dict[str, Any]: + return { + "meta": {"api-version": "1.0"}, + "projects": [ + {"name": "aiohttp", "_last-serial": 12345}, + {"name": "requests", "_last-serial": 12346}, + {"name": "django", "_last-serial": 12347}, + ], + } + + 
master.fetch_simple_index = mock_fetch_simple_index # type: ignore + + packages = await master.all_packages() + + # Verify response parsing + assert packages == {"aiohttp": 12345, "requests": 12346, "django": 12347} + + +@pytest.mark.asyncio +async def test_all_packages_simple_api_empty_response() -> None: + """Test Simple API handling of empty package list.""" + master = Master("https://pypi.example.com", api_method="simple") + + # Mock fetch_simple_index to return empty response + async def mock_fetch_simple_index() -> dict[str, Any]: + return {"meta": {"api-version": "1.0"}, "projects": []} + + master.fetch_simple_index = mock_fetch_simple_index # type: ignore + + # Should return empty dict, not raise exception + packages = await master.all_packages() + assert packages == {} + + +@pytest.mark.asyncio +async def test_all_packages_xmlrpc_api() -> None: + """Test fetching all packages using XML-RPC API (default).""" + master = Master("https://pypi.example.com", api_method="xmlrpc") + + expected = {"aiohttp": 69, "requests": 70} + master.rpc = AsyncMock(return_value=expected) # type: ignore + + packages = await master.all_packages() + + master.rpc.assert_called_once_with("list_packages_with_serial") + assert packages == expected + + +@pytest.mark.asyncio +async def test_changed_packages_simple_api() -> None: + """Test fetching changed packages using Simple (PEP 691 v1) API.""" + master = Master("https://pypi.example.com", api_method="simple") + + # Mock fetch_simple_index to return Simple API response with different serials + async def mock_fetch_simple_index() -> dict[str, Any]: + return { + "meta": {"api-version": "1.0"}, + "projects": [ + {"name": "aiohttp", "_last-serial": 12345}, + {"name": "requests", "_last-serial": 12346}, + {"name": "django", "_last-serial": 12347}, + ], + } + + master.fetch_simple_index = mock_fetch_simple_index # type: ignore + + # Request changes since serial 10000 + changes = await master.changed_packages(10000) + + # Should return all 
packages with serial > 10000 + assert changes == {"aiohttp": 12345, "requests": 12346, "django": 12347} + + +@pytest.mark.asyncio +async def test_changed_packages_simple_api_no_changes() -> None: + """Test Simple API when no changes occurred (current serial <= last serial).""" + master = Master("https://pypi.example.com", api_method="simple") + + # Mock fetch_simple_index to return packages with lower serials + async def mock_fetch_simple_index() -> dict[str, Any]: + return { + "meta": {"api-version": "1.0"}, + "projects": [ + {"name": "aiohttp", "_last-serial": 12340}, + {"name": "requests", "_last-serial": 12345}, + ], + } + + master.fetch_simple_index = mock_fetch_simple_index # type: ignore + + # Request changes since serial 12345 (same as or higher than current) + changes = await master.changed_packages(12345) + + # Should return empty dict when no packages have serial > 12345 + assert changes == {} + + +@pytest.mark.asyncio +async def test_changed_packages_xmlrpc_api() -> None: + """Test fetching changed packages using XML-RPC API (default).""" + master = Master("https://pypi.example.com", api_method="xmlrpc") + + list_of_changes = [ + ("aiohttp", "1.0", 0, "added", 17), + ("requests", "2.0", 1, "updated", 18), + ] + master.rpc = AsyncMock(return_value=list_of_changes) # type: ignore + + changes = await master.changed_packages(10) + + master.rpc.assert_called_once_with("changelog_since_serial", 10) + assert changes == {"aiohttp": 17, "requests": 18} + + +@pytest.mark.asyncio +async def test_master_defaults_to_xmlrpc() -> None: + """Test that Master defaults to xmlrpc when api_method is not specified.""" + master = Master("https://pypi.example.com") + assert master.api_method == "xmlrpc" + + +@pytest.mark.asyncio +async def test_master_accepts_simple_api_method() -> None: + """Test that Master accepts 'simple' as api_method.""" + master = Master("https://pypi.example.com", api_method="simple") + assert master.api_method == "simple" diff --git 
a/src/bandersnatch/unittest.conf b/src/bandersnatch/unittest.conf index 9e7f63c83..209c10675 100644 --- a/src/bandersnatch/unittest.conf +++ b/src/bandersnatch/unittest.conf @@ -92,6 +92,10 @@ diff-append-epoch = false ; Possible values are: hash (default), stat compare-method = hash +; API method to use for fetching package lists and changelogs +; Options: xmlrpc, simple +api-method = xmlrpc + ; Enable filtering plugins [plugins] ; Enable all or specific plugins - e.g. allowlist_project