diff --git a/.github/workflows/build-wheels-defined.yml b/.github/workflows/build-wheels-defined.yml index 3d55e76..7817a71 100644 --- a/.github/workflows/build-wheels-defined.yml +++ b/.github/workflows/build-wheels-defined.yml @@ -254,6 +254,8 @@ jobs: -w /work \ -e GH_TOKEN="${GH_TOKEN}" \ -e PIP_NO_CACHE_DIR=1 \ + -e PIP_INDEX_URL=https://www.piwheels.org/simple \ + -e PIP_EXTRA_INDEX_URL=https://pypi.org/simple \ python:${{ matrix.python-version }}-bookworm \ bash -c " set -e @@ -339,6 +341,8 @@ jobs: -w /work \ -e GH_TOKEN="${GH_TOKEN}" \ -e PIP_NO_CACHE_DIR=1 \ + -e PIP_INDEX_URL=https://www.piwheels.org/simple \ + -e PIP_EXTRA_INDEX_URL=https://pypi.org/simple \ python:${{ matrix.python-version }}-bullseye \ bash -c " set -e diff --git a/.github/workflows/build-wheels-platforms.yml b/.github/workflows/build-wheels-platforms.yml index 50965b1..45f77b8 100644 --- a/.github/workflows/build-wheels-platforms.yml +++ b/.github/workflows/build-wheels-platforms.yml @@ -127,6 +127,8 @@ jobs: -e MIN_IDF_MINOR_VERSION=${{ needs.get-supported-versions.outputs.min_idf_minor_version }} \ -e GH_TOKEN="${GH_TOKEN}" \ -e PIP_NO_CACHE_DIR=1 \ + -e PIP_INDEX_URL=https://www.piwheels.org/simple \ + -e PIP_EXTRA_INDEX_URL=https://pypi.org/simple \ python:${{ matrix.python-version }}-bookworm \ bash -c " set -e @@ -152,6 +154,8 @@ jobs: -e MIN_IDF_MINOR_VERSION=${{ needs.get-supported-versions.outputs.min_idf_minor_version }} \ -e GH_TOKEN="${GH_TOKEN}" \ -e PIP_NO_CACHE_DIR=1 \ + -e PIP_INDEX_URL=https://www.piwheels.org/simple \ + -e PIP_EXTRA_INDEX_URL=https://pypi.org/simple \ python:${{ matrix.python-version }}-bullseye \ bash -c " set -e diff --git a/.github/workflows/build-wheels-python-dependent.yml b/.github/workflows/build-wheels-python-dependent.yml index 89e0973..3630d7a 100644 --- a/.github/workflows/build-wheels-python-dependent.yml +++ b/.github/workflows/build-wheels-python-dependent.yml @@ -140,6 +140,8 @@ jobs: -e PYO3_USE_ABI3_FORWARD_COMPATIBILITY=1 \ -e 
GH_TOKEN="${GH_TOKEN}" \ -e PIP_NO_CACHE_DIR=1 \ + -e PIP_INDEX_URL=https://www.piwheels.org/simple \ + -e PIP_EXTRA_INDEX_URL=https://pypi.org/simple \ python:${{ matrix.python-version }}-bookworm \ bash -c " set -e @@ -163,6 +165,8 @@ jobs: -e PYO3_USE_ABI3_FORWARD_COMPATIBILITY=1 \ -e GH_TOKEN="${GH_TOKEN}" \ -e PIP_NO_CACHE_DIR=1 \ + -e PIP_INDEX_URL=https://www.piwheels.org/simple \ + -e PIP_EXTRA_INDEX_URL=https://pypi.org/simple \ python:${{ matrix.python-version }}-bullseye \ bash -c " set -e diff --git a/.github/workflows/test-wheels-install.yml b/.github/workflows/test-wheels-install.yml index 7a854ef..a387ca4 100644 --- a/.github/workflows/test-wheels-install.yml +++ b/.github/workflows/test-wheels-install.yml @@ -113,6 +113,8 @@ jobs: python test_wheels_install.py " + # After test_wheels_install.py, ./downloaded_wheels contains only wheels for this + # matrix Python + platform (see prune step in test_wheels_install.main). - name: Upload tested wheels uses: actions/upload-artifact@v4 with: diff --git a/.github/workflows/upload-python-wheels.yml b/.github/workflows/upload-python-wheels.yml index 6b84bb3..126ec0e 100644 --- a/.github/workflows/upload-python-wheels.yml +++ b/.github/workflows/upload-python-wheels.yml @@ -41,11 +41,11 @@ jobs: path: ./downloaded_wheels merge-multiple: true - - name: Upload release asset to S3 bucket - run: | - python upload_wheels.py $AWS_BUCKET - python create_index_pages.py $AWS_BUCKET + #- name: Upload release asset to S3 bucket + # run: | + # python upload_wheels.py $AWS_BUCKET + # python create_index_pages.py $AWS_BUCKET - - name: Drop AWS cache - id: invalidate-index-cache - run: aws cloudfront create-invalidation --distribution-id ${{ secrets.AWS_CACHE_INVALIDATION }} --paths "/pypi/*" + #- name: Drop AWS cache + # id: invalidate-index-cache + # run: aws cloudfront create-invalidation --distribution-id ${{ secrets.AWS_CACHE_INVALIDATION }} --paths "/pypi/*" diff --git a/.github/workflows/wheels-repair.yml 
b/.github/workflows/wheels-repair.yml index cf936f7..cf8f9d0 100644 --- a/.github/workflows/wheels-repair.yml +++ b/.github/workflows/wheels-repair.yml @@ -161,6 +161,9 @@ jobs: run: | docker run --rm \ --platform ${{ matrix.docker_platform }} \ + -e AUDITWHEEL_PLAT=manylinux_2_36_armv7l \ + -e AUDITWHEEL_ONLY_PLAT=1 \ + -e AUDITWHEEL_ALLOW_LINUX_TAG=1 \ -v $(pwd):/work \ -w /work \ ${{ matrix.docker_image }} \ @@ -177,6 +180,9 @@ jobs: run: | docker run --rm \ --platform ${{ matrix.docker_platform }} \ + -e AUDITWHEEL_PLAT=manylinux_2_31_armv7l \ + -e AUDITWHEEL_ONLY_PLAT=1 \ + -e AUDITWHEEL_ALLOW_LINUX_TAG=1 \ -v $(pwd):/work \ -w /work \ ${{ matrix.docker_image }} \ @@ -201,12 +207,26 @@ jobs: needs: repair-wheels runs-on: ubuntu-latest steps: - - name: Download all repaired wheels + - name: Checkout repository + uses: actions/checkout@v4 + + # Download each wheels-repaired-* artifact into its own subdirectory so + # same-named wheels from ARMv7 vs ARMv7 Legacy are not silently overwritten + # before collision detection or S3 upload (see README: ARMv7 wheel collisions). 
+ - name: Download all repaired wheels (per-artifact subdirectories) uses: actions/download-artifact@v4 with: pattern: wheels-repaired-* - path: ./all_wheels - merge-multiple: true + path: ./all_wheels_staging + merge-multiple: false + + - name: Check for duplicate wheel basenames across lineages + run: python3 check_wheel_collisions.py ./all_wheels_staging + + - name: Flatten merged wheels directory + run: | + mkdir -p ./all_wheels + find ./all_wheels_staging -type f -name '*.whl' -exec cp -f {} ./all_wheels/ \; - name: List merged wheels run: | diff --git a/README.md b/README.md index 8fc3678..385e1da 100644 --- a/README.md +++ b/README.md @@ -146,6 +146,18 @@ The repair tools are used after build to link and bundle all the needed librarie This logic is done by the [repair workflow](./.github/workflows/wheels-repair.yml) and the [`repair_wheels.py` script](./repair_wheels.py) +### ARMv7 vs ARMv7 Legacy: same wheel filename, different binaries + +`Linux ARMv7` and `Linux ARMv7 Legacy` can both produce a wheel whose **filename is identical** (same PEP 425 tags) while the **ELF contents differ** (different glibc/OpenSSL/Rust toolchain lineage). **Note:** `wheels-download-directory-*` CI artifacts are the **pre-repair** build outputs; comparing those can still show identical names until the [repair workflow](./.github/workflows/wheels-repair.yml) runs. Two bad outcomes follow if that is not handled after repair/merge: + +1. **Artifact merge / local flatten** — downloading multiple `wheels-repaired-*` artifacts into one directory with `merge-multiple: true` can make the second file **silently overwrite** the first on disk before any upload runs. +2. **S3 upload** — [`upload_wheels.py`](./upload_wheels.py) publishes to `pypi/<normalized-name>/<wheel-filename>`. Uploading a second wheel with the **same key** replaces the object; clients then see whichever build ran last, which can surface as import crashes or segfaults. 
+ +Mitigations in this repo: + +- Repair sets **`AUDITWHEEL_PLAT`** and **`AUDITWHEEL_ONLY_PLAT`** per lineage (`manylinux_2_36_armv7l` vs `manylinux_2_31_armv7l`) so [`repair_wheels.py`](./repair_wheels.py) runs `auditwheel repair --plat ... --only-plat` and emitted wheels get **distinct single-tag filenames** when auditwheel supports it. If **`AUDITWHEEL_PLAT` is set**, ARMv7 “libc detection failed” outcomes are **not** treated as non-fatal skips (that would leave identical filenames across lineages). +- The repair workflow merges repaired artifacts using **per-artifact subdirectories**, then runs [`check_wheel_collisions.py`](./check_wheel_collisions.py) to **fail CI** if the same `*.whl` basename appears with **different contents** across lineages, before flattening for tests/upload. + ## Activity Diagram The main file is `build-wheels-platforms.yml` which is scheduled to run periodically to build Python wheels for any requirement of all [ESP-IDF]-supported versions. @@ -167,4 +179,17 @@ Docker files are in its own repository where there are build and published from. - For older ARMv7 operating systems - For packages requiring glibc 2.31 +> [!NOTE] +### ARMv7: prefer piwheels for resolution + +For ARMv7 (and ARMv7 Legacy) environments, you may want to prefer [piwheels](https://www.piwheels.org/) as the primary index and use Espressif's index as a secondary source: + +```bash +python -m pip install --index-url https://www.piwheels.org/simple --extra-index-url https://dl.espressif.com/pypi/ <package> +``` + +This repository's ARMv7 CI workflows also set these as `PIP_INDEX_URL` / `PIP_EXTRA_INDEX_URL` inside the ARMv7 Docker builds. + +**Warning:** piwheels wheels may rely on system-provided shared libraries (i.e. may not bundle `.libs/`). If a target OS is missing those libraries or has an incompatible version, imports may fail at runtime. 
+ [ESP-IDF]: https://github.com/espressif/esp-idf diff --git a/check_wheel_collisions.py b/check_wheel_collisions.py new file mode 100644 index 0000000..7246bce --- /dev/null +++ b/check_wheel_collisions.py @@ -0,0 +1,78 @@ +# +# SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD +# +# SPDX-License-Identifier: Apache-2.0 +# +"""Detect duplicate *.whl basenames with different file contents under a tree. + +Used after downloading per-arch ``wheels-repaired-*`` artifacts into separate +subdirectories (``merge-multiple: false``) so a filesystem flatten step cannot +hide ARMv7 vs ARMv7 Legacy collisions before upload to S3. +""" + +from __future__ import annotations + +import hashlib +import sys + +from collections import defaultdict +from pathlib import Path + + +def _sha256_file(path: Path, chunk_size: int = 1024 * 1024) -> str: + h = hashlib.sha256() + with path.open("rb") as f: + while True: + b = f.read(chunk_size) + if not b: + break + h.update(b) + return h.hexdigest() + + +def collect_collision_errors(root: Path) -> list[str]: + """Return human-readable error lines; empty if OK.""" + wheels: list[Path] = [] + for p in sorted(root.rglob("*.whl")): + if p.is_file(): + wheels.append(p) + + by_name: defaultdict[str, list[Path]] = defaultdict(list) + for p in wheels: + by_name[p.name].append(p) + + errors: list[str] = [] + for name, paths in sorted(by_name.items()): + if len(paths) < 2: + continue + by_digest: defaultdict[str, list[Path]] = defaultdict(list) + for p in paths: + by_digest[_sha256_file(p)].append(p) + if len(by_digest) == 1: + # Identical content in multiple artifact trees — unusual but safe. 
+ continue + lines = [f"Duplicate wheel basename with different contents: {name}"] + for p in paths: + lines.append(f" - {p} sha256={_sha256_file(p)}") + errors.append("\n".join(lines)) + return errors + + +def main(argv: list[str]) -> int: + root = Path(argv[1] if len(argv) > 1 else ".").resolve() + if not root.is_dir(): + print(f"Error: not a directory: {root}", file=sys.stderr) + return 2 + + errors = collect_collision_errors(root) + if errors: + print("Wheel basename collision check failed:\n", file=sys.stderr) + for block in errors: + print(block, file=sys.stderr) + print(file=sys.stderr) + return 1 + return 0 + + +if __name__ == "__main__": + raise SystemExit(main(sys.argv)) diff --git a/repair_wheels.py b/repair_wheels.py index 6ab90bd..fdb5863 100644 --- a/repair_wheels.py +++ b/repair_wheels.py @@ -12,6 +12,7 @@ - Linux: auditwheel (bundles SOs) """ +import os import platform import subprocess @@ -94,6 +95,46 @@ def get_wheel_arch(wheel_name: str) -> Union[str, None]: return None +def _only_plat_env_enabled() -> bool: + return os.environ.get("AUDITWHEEL_ONLY_PLAT", "").strip().lower() in ("1", "true", "yes") + + +def _allow_linux_tag_env_enabled() -> bool: + """When true, allow keeping linux-tag wheels on ARMv7 even if --plat is set. + + This is useful when resolution prefers piwheels, which may provide wheels + tagged as ``linux_armv7l`` that are not repairable to the desired manylinux + tag in our repair containers. + """ + return os.environ.get("AUDITWHEEL_ALLOW_LINUX_TAG", "").strip().lower() in ("1", "true", "yes") + + +def _is_linux_tag_wheel(wheel_name: str) -> bool: + wn = wheel_name.lower() + return "-linux_" in wn and "manylinux" not in wn and "musllinux" not in wn + + +def _armv7_forced_plat_filename_ok(wheel_name: str, plat: str) -> bool: + """True if ``wheel_name`` matches ``AUDITWHEEL_PLAT`` for ARMv7 / ARMv7 Legacy splits. 
+ + When ``AUDITWHEEL_ONLY_PLAT`` is set, legacy wheels must not carry a ``manylinux_2_36`` + tag (auditwheel dual-tag would collide with the standard lineage again). + """ + plat_l = plat.lower() + wn = wheel_name.lower() + if _allow_linux_tag_env_enabled() and _is_linux_tag_wheel(wn): + return True + if "manylinux_2_36" in plat_l: + return "manylinux_2_36" in wn + if "manylinux_2_31" in plat_l and "manylinux_2_36" not in plat_l: + if "manylinux_2_31" not in wn: + return False + if _only_plat_env_enabled() and "manylinux_2_36" in wn: + return False + return True + return True + + def repair_wheel_windows(wheel_path: Path, temp_dir: Path) -> subprocess.CompletedProcess[str]: """Repair Windows wheel using delvewheel.""" result = subprocess.run( @@ -157,12 +198,28 @@ def repair_wheel_linux(wheel_path: Path, temp_dir: Path) -> subprocess.Completed Uses --strip option to strip debugging symbols which can help with ELF alignment issues on ARM (fixes "ELF load command address/offset not properly aligned" errors). + + If ``AUDITWHEEL_PLAT`` is set (e.g. in CI for ARMv7 vs ARMv7 Legacy), it is passed as + ``auditwheel repair --plat ...`` so repaired wheels get distinct PEP 425 platform tags + when build lineages would otherwise emit the same filename. """ - result = subprocess.run( - ["auditwheel", "repair", str(wheel_path), "-w", str(temp_dir), "--strip"], - capture_output=True, - text=True, - ) + plat = os.environ.get("AUDITWHEEL_PLAT", "").strip() + only_plat = os.environ.get("AUDITWHEEL_ONLY_PLAT", "").strip().lower() in ("1", "true", "yes") + + cmd = ["auditwheel", "repair", str(wheel_path), "-w", str(temp_dir), "--strip"] + if plat: + cmd.extend(["--plat", plat]) + if only_plat: + cmd.append("--only-plat") + + result = subprocess.run(cmd, capture_output=True, text=True) + + # Older auditwheel versions may not support --only-plat. If requested, retry once without it. 
+ combined_err = (result.stderr or "") + (result.stdout or "") + if only_plat and result.returncode != 0 and "unrecognized arguments: --only-plat" in combined_err: + cmd_no_only = [c for c in cmd if c != "--only-plat"] + result = subprocess.run(cmd_no_only, capture_output=True, text=True) + return result @@ -250,7 +307,8 @@ def main() -> None: print_color(f" {result.stderr.strip()}", Fore.RED) # Check for errors - error_msg = result.stderr.strip() if result.stderr else "" + # auditwheel may log failures on stdout or stderr depending on version / logging. + error_msg = ((result.stderr or "") + "\n" + (result.stdout or "")).strip() # Corrupt zip / bad central directory (delocate opens the wheel as a zip) if _stderr_indicates_bad_zip(error_msg): @@ -289,7 +347,7 @@ def main() -> None: # Update wheel reference and error message for subsequent checks wheel = Path(renamed_wheel) - error_msg = result.stderr.strip() if result.stderr else "" + error_msg = ((result.stderr or "") + "\n" + (result.stdout or "")).strip() # Special handling forLinux ARMv7 broken wheels if ( @@ -302,16 +360,33 @@ def main() -> None: deleted_count += 1 continue + plat_env = os.environ.get("AUDITWHEEL_PLAT", "").strip() + allow_linux_tag = _allow_linux_tag_env_enabled() + is_linux_tag = _is_linux_tag_wheel(wheel.name) + # Check for non-critical errors (keep original wheel) is_noncritical = ( "too-recent versioned symbols" in error_msg # manylinux wheel can't find its libraries # it means it was already properly repaired - or ("manylinux" in wheel.name and "could not be located" in error_msg) + or (("manylinux" in wheel.name and "could not be located" in error_msg) and not plat_env) + # When allowing linux-tag wheels (piwheels), treat missing graft libs as non-fatal + # and keep the original linux-tag wheel rather than failing the whole repair job. 
+ or ( + plat_env + and allow_linux_tag + and is_linux_tag + and ( + "Cannot repair wheel, because required library" in error_msg or "could not be located" in error_msg + ) + ) # ARMv7 CI runs under QEMU; auditwheel may fail libc detection on abi3/native .so + # When AUDITWHEEL_PLAT is set (ARMv7 vs ARMv7 Legacy), skipping repair would keep + # identical wheel filenames across lineages — do not treat libc detection as non-critical. or ( current_platform == "Linux" and current_arch == "armv7l" + and not plat_env and ("InvalidLibc" in error_msg or "couldn't detect libc" in error_msg) ) ) @@ -335,15 +410,37 @@ def main() -> None: print_color(" -> Keeping original wheel (build issue: needs older toolchain)", Fore.YELLOW) elif "manylinux" in wheel.name and "could not be located" in error_msg: print_color(" -> Keeping original wheel (already bundled from PyPI)", Fore.GREEN) + elif plat_env and allow_linux_tag and is_linux_tag: + print_color( + " -> Keeping original wheel (linux-tag wheel; not forcing manylinux under current policy)", + Fore.YELLOW, + ) elif ( current_platform == "Linux" and current_arch == "armv7l" + and not plat_env and ("InvalidLibc" in error_msg or "couldn't detect libc" in error_msg) ): print_color( " -> Keeping original wheel (auditwheel libc detection failed on ARMv7 runner; often QEMU)", Fore.YELLOW, ) + if ( + plat_env + and current_platform == "Linux" + and current_arch == "armv7l" + and not _armv7_forced_plat_filename_ok(wheel.name, plat_env) + and not (allow_linux_tag and is_linux_tag) + ): + msg = ( + f"Wheel filename does not match forced AUDITWHEEL_PLAT={plat_env!r} " + f"after non-fatal repair path: {wheel.name}" + ) + print_color(f" -> ERROR: {msg}", Fore.RED) + errors.append(f"{wheel.name}: {msg}") + wheel.unlink(missing_ok=True) + error_count += 1 + continue skipped_count += 1 elif has_error: # Actual error occurred (even if a wheel was created, it may be broken) @@ -374,10 +471,40 @@ def main() -> None: print_color(" -> Deleting 
repaired output (not a valid / readable zip archive)", Fore.RED) final_path.unlink(missing_ok=True) deleted_count += 1 + elif ( + plat_env + and current_platform == "Linux" + and current_arch == "armv7l" + and not _armv7_forced_plat_filename_ok(final_path.name, plat_env) + and not (allow_linux_tag and _is_linux_tag_wheel(final_path.name)) + ): + msg = ( + f"Repaired wheel filename does not match forced AUDITWHEEL_PLAT={plat_env!r}: {final_path.name}" + ) + print_color(f" -> ERROR: {msg}", Fore.RED) + errors.append(f"{final_path.name}: {msg}") + final_path.unlink(missing_ok=True) + error_count += 1 else: repaired_count += 1 elif result.returncode == 0: # No repaired wheel created, but command succeeded (already compatible) + if ( + plat_env + and current_platform == "Linux" + and current_arch == "armv7l" + and not _armv7_forced_plat_filename_ok(wheel.name, plat_env) + and not (allow_linux_tag and is_linux_tag) + ): + msg = ( + "auditwheel reported success but left the wheel unchanged with a filename " + f"that does not match AUDITWHEEL_PLAT={plat_env!r}: {wheel.name}" + ) + print_color(f" -> ERROR: {msg}", Fore.RED) + errors.append(f"{wheel.name}: {msg}") + wheel.unlink(missing_ok=True) + error_count += 1 + continue print_color(" -> Keeping original wheel (already compatible)", Fore.GREEN) skipped_count += 1 else: diff --git a/test_build_wheels.py b/test_build_wheels.py index ba6af76..1f6ed4f 100644 --- a/test_build_wheels.py +++ b/test_build_wheels.py @@ -222,6 +222,28 @@ def test_abi3_wheel(self): self.assertTrue(self.is_wheel_compatible(f"cryptography-41.0.0-cp39-abi3-{tag}.whl", "39")) +class TestPruneWheelsForArtifact(unittest.TestCase): + """``prune_wheels_not_for_current_python`` keeps per-matrix wheels for CI artifacts.""" + + def test_prune_removes_other_python_same_platform(self): + import tempfile + + from test_wheels_install import prune_wheels_not_for_current_python + + tag = _current_platform_wheel_tag() + with tempfile.TemporaryDirectory() as tmp: + d 
= Path(tmp) + (d / f"drop-1.0-cp310-cp310-{tag}.whl").write_bytes(b"a") + (d / f"keep-1.0-cp311-cp311-{tag}.whl").write_bytes(b"b") + (d / "universal-1.0-py3-none-any.whl").write_bytes(b"c") + + removed = prune_wheels_not_for_current_python("311", d) + self.assertEqual(removed, 1) + self.assertFalse((d / f"drop-1.0-cp310-cp310-{tag}.whl").exists()) + self.assertTrue((d / f"keep-1.0-cp311-cp311-{tag}.whl").exists()) + self.assertTrue((d / "universal-1.0-py3-none-any.whl").exists()) + + class TestParseWheelName(unittest.TestCase): """Test the parse_wheel_name function from _helper_functions.py.""" diff --git a/test_check_wheel_collisions.py b/test_check_wheel_collisions.py new file mode 100644 index 0000000..ad08152 --- /dev/null +++ b/test_check_wheel_collisions.py @@ -0,0 +1,60 @@ +# +# SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD +# +# SPDX-License-Identifier: Apache-2.0 +# + +import tempfile +import unittest + +from pathlib import Path + +import check_wheel_collisions as cwc + + +class TestCheckWheelCollisions(unittest.TestCase): + def test_no_collision_unique_basenames(self) -> None: + with tempfile.TemporaryDirectory() as tmp: + root = Path(tmp) + (root / "a").mkdir() + (root / "b").mkdir() + (root / "a" / "foo-1.0-py3-none-any.whl").write_bytes(b"a") + (root / "b" / "bar-1.0-py3-none-any.whl").write_bytes(b"b") + self.assertEqual(cwc.collect_collision_errors(root), []) + + def test_collision_same_basename_different_bytes(self) -> None: + with tempfile.TemporaryDirectory() as tmp: + root = Path(tmp) + (root / "linux-armv7").mkdir() + (root / "linux-armv7legacy").mkdir() + name = "pkg-1.0-cp39-cp39-linux_armv7l.whl" + (root / "linux-armv7" / name).write_bytes(b"v1") + (root / "linux-armv7legacy" / name).write_bytes(b"v2-different") + errs = cwc.collect_collision_errors(root) + self.assertEqual(len(errs), 1) + self.assertIn(name, errs[0]) + + def test_same_basename_identical_bytes_allowed(self) -> None: + with tempfile.TemporaryDirectory() as 
tmp: + root = Path(tmp) + (root / "x").mkdir() + (root / "y").mkdir() + name = "same-1.0-py3-none-any.whl" + payload = b"identical" + (root / "x" / name).write_bytes(payload) + (root / "y" / name).write_bytes(payload) + self.assertEqual(cwc.collect_collision_errors(root), []) + + def test_main_returns_one_on_collision(self) -> None: + with tempfile.TemporaryDirectory() as tmp: + root = Path(tmp) + (root / "a").mkdir() + (root / "b").mkdir() + name = "dup-1.0-py3-none-any.whl" + (root / "a" / name).write_bytes(b"1") + (root / "b" / name).write_bytes(b"2") + self.assertEqual(cwc.main(["_", str(root)]), 1) + + +if __name__ == "__main__": + unittest.main() diff --git a/test_wheels_install.py b/test_wheels_install.py index 5ac182b..bfc0533 100644 --- a/test_wheels_install.py +++ b/test_wheels_install.py @@ -9,6 +9,10 @@ verifying that wheel files are valid and platform-compatible. It also checks wheels against exclude_list.yaml and removes incompatible ones. +After a successful run, wheels that do not match this job's Python version and host +platform are deleted from ``downloaded_wheels`` so CI ``wheels-tested-*`` artifacts +do not carry the full multi-Python merge (only ``wheels-repaired-all`` is merged). + Wheels are ZIP archives (PEP 427). pip opens them with the zipfile module; a BadZipFile / "Bad magic number" error means the bytes on disk are not a valid ZIP (truncated, corrupted, or not a wheel), not that ".whl" was mistaken for ".zip". @@ -109,6 +113,29 @@ def find_compatible_wheels(python_version: str) -> list[Path]: return sorted(wheels) +def prune_wheels_not_for_current_python( + python_version_tag: str, + wheels_dir: Path | None = None, +) -> int: + """Remove ``*.whl`` files that are not compatible with this Python + platform. + + CI downloads the full merged ``wheels-repaired-all`` tree into ``downloaded_wheels``, + then tests only compatible wheels. 
Without pruning, the subsequent + ``wheels-tested-<python>-<platform>`` artifact would still contain every cp/py tag from the + merge, which is misleading and huge. ``wheels_dir`` defaults to ``WHEELS_DIR`` for + production; tests may pass a temporary directory. + """ + base = wheels_dir if wheels_dir is not None else WHEELS_DIR + if not base.exists(): + return 0 + removed = 0 + for wheel_path in base.glob("*.whl"): + if not is_wheel_compatible(wheel_path.name, python_version_tag): + wheel_path.unlink(missing_ok=True) + removed += 1 + return removed + + def install_wheel(wheel_path: Path) -> tuple[bool, str]: """ Install a wheel with --no-deps to verify wheel validity. @@ -277,6 +304,14 @@ def main() -> int: print(f" - {wheel_name}") return 1 + pruned = prune_wheels_not_for_current_python(python_version_tag) + if pruned: + print_color( + f"Pruned {pruned} wheel(s) not for this matrix (Python {python_version} / " + f"current platform) before artifact upload", + Fore.YELLOW, + ) + print_color("\nAll compatible wheels processed successfully!", Fore.GREEN) return 0 diff --git a/upload_wheels.py b/upload_wheels.py index 270bf3f..ca94459 100644 --- a/upload_wheels.py +++ b/upload_wheels.py @@ -7,12 +7,16 @@ - argument S3 bucket """ +import hashlib import os import re import sys +from typing import Optional + import boto3 +from botocore.exceptions import ClientError from colorama import Fore from _helper_functions import print_color @@ -32,6 +36,44 @@ def normalize(name): return re.sub(r"[-_.]+", "-", name).lower() +def _file_md5_hex(path: str, chunk_size: int = 1024 * 1024) -> str: + h = hashlib.md5() + with open(path, "rb") as f: + while True: + chunk = f.read(chunk_size) + if not chunk: + break + h.update(chunk) + return h.hexdigest() + + +def _overwrite_would_hide_different_wheel(s3_key: str, local_path: str) -> Optional[str]: + """Return an error message if an existing object differs from local_path; else None.""" + obj = BUCKET.Object(s3_key) + try: + obj.load() + except ClientError 
as e: + code = e.response.get("Error", {}).get("Code", "") + if code in ("404", "NoSuchKey"): + return None + raise + remote_size = obj.content_length + local_size = os.path.getsize(local_path) + if remote_size != local_size: + return f"Refusing to overwrite {s3_key}: remote size {remote_size} != local size {local_size}" + etag = (obj.e_tag or "").strip('"') + if not etag or "-" in etag: + # Multipart upload ETag is not a raw MD5; size match is the best check here. + return None + local_md5 = _file_md5_hex(local_path) + if etag != local_md5: + return ( + f"Refusing to overwrite {s3_key}: remote ETag {etag!r} != local MD5 {local_md5!r}. " + "Same wheel filename would publish different bytes (e.g. ARMv7 vs ARMv7 Legacy collision)." + ) + return None + + def get_existing_wheels(): """Get set of S3 keys for wheels currently on server.""" existing = set() @@ -76,9 +118,14 @@ def collect_wheel_paths(): wheel_name = match.group(1) wheel_name = normalize(wheel_name) - is_new = f"pypi/{wheel_name}/{wheel}" not in existing_wheels + s3_key = f"pypi/{wheel_name}/{wheel}" + is_new = s3_key not in existing_wheels + + conflict = _overwrite_would_hide_different_wheel(s3_key, full_path) + if conflict: + raise SystemExit(conflict) - BUCKET.upload_file(full_path, f"pypi/{wheel_name}/{wheel}", ExtraArgs={"ACL": "public-read"}) + BUCKET.upload_file(full_path, s3_key, ExtraArgs={"ACL": "public-read"}) if is_new: new_wheels += 1 diff --git a/verify_s3_wheels.py b/verify_s3_wheels.py index 839b32f..acf3ffd 100644 --- a/verify_s3_wheels.py +++ b/verify_s3_wheels.py @@ -15,6 +15,8 @@ import re import sys +from collections import defaultdict + import boto3 from colorama import Fore @@ -36,6 +38,16 @@ ] +def _normalize_pkg_dir(name: str) -> str: + """Normalize S3 package directory naming differences. + + Historically, this repo used both underscore and dash package directories on S3 + (e.g. ``flask_compress`` vs ``flask-compress``). Those can legitimately contain the + same wheel basenames. 
We treat that as a warning, not a violation. + """ + return re.sub(r"[-_.]+", "-", name).lower() + + def get_supported_python_versions(supported_python_json: str) -> list[str]: """Parse supported_python from get-supported-versions output (jq -c .supported_python).""" try: @@ -103,26 +115,49 @@ def main(): # Get all wheels from S3 print_color("---------- SCANNING S3 WHEELS ----------") - wheels = [] + basename_to_keys: defaultdict[str, list[str]] = defaultdict(list) for obj in bucket.objects.filter(Prefix="pypi/"): if obj.key.endswith(".whl"): wheel_name = obj.key.split("/")[-1] - wheels.append(wheel_name) + basename_to_keys[wheel_name].append(obj.key) + + wheel_names = sorted(basename_to_keys.keys()) + wheels_on_s3_count = sum(len(v) for v in basename_to_keys.values()) - print(f"Found {len(wheels)} wheels on S3\n") + print(f"Found {wheels_on_s3_count} wheel objects ({len(wheel_names)} unique filenames) on S3\n") # Check each wheel print_color("---------- CHECKING WHEELS ----------") violations = [] old_python_wheels = [] - for wheel in wheels: + for wheel in wheel_names: # Check for unsupported Python versions (warning only, not a violation) is_old, reason = is_unsupported_python(wheel, oldest_supported_python) if is_old: old_python_wheels.append((wheel, reason)) continue + keys_for_name = basename_to_keys[wheel] + if len(keys_for_name) > 1: + # Determine whether the duplicate keys are only due to directory normalization + # differences (underscore vs dash). Those are historical and are not treated as + # violations (we cannot infer which object is authoritative without comparing bytes). 
+ pkg_dirs = [] + for k in keys_for_name: + parts = k.split("/") + pkg_dirs.append(parts[1] if len(parts) >= 3 else "") + normalized = {_normalize_pkg_dir(d) for d in pkg_dirs if d} + reason_dup = "Duplicate wheel basename across multiple S3 keys: " + ", ".join(sorted(keys_for_name)) + if len(normalized) <= 1: + print_color(f"-- {wheel}", Fore.YELLOW) + print(f" {reason_dup}") + print(f" Note: directories normalize to {next(iter(normalized), '')!r}; treated as warning") + else: + violations.append((wheel, reason_dup)) + print_color(f"-- {wheel}", Fore.RED) + print(f" {reason_dup}") + # Check against exclude_list (actual violations) should_exclude, reason = should_exclude_wheel_s3( wheel, exclude_requirements, supported_python_versions=supported_python_versions @@ -138,7 +173,7 @@ def main(): # Statistics print_color("---------- STATISTICS ----------") - print(f"Checked: {len(wheels)} wheels") + print(f"Checked: {wheels_on_s3_count} wheel objects ({len(wheel_names)} unique filenames)") if old_python_wheels: print_color(f"Old Python wheels: {len(old_python_wheels)} (warning only)", Fore.YELLOW) if violations: