Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changelog.d/data-build-fingerprint.changed.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Expose build metadata helpers for US data artifacts, including a stable data-build fingerprint and build provenance metadata.
78 changes: 78 additions & 0 deletions policyengine_us/build_metadata.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
from __future__ import annotations

from functools import lru_cache
import hashlib
from importlib import metadata
from pathlib import Path
import subprocess

# Distribution name looked up through importlib.metadata and reported under
# the "name" key of the build metadata.
PACKAGE_NAME: str = "policyengine-us"

# Absolute directory containing this module; every surface entry below is
# resolved against it.
PACKAGE_ROOT: Path = Path(__file__).resolve().parent

# Files and directories (relative to PACKAGE_ROOT) whose contents are hashed
# into the data-build fingerprint. Directories are walked recursively.
DATA_BUILD_SURFACE: tuple[str, ...] = (
    "entities.py",
    "parameters",
    "programs.yaml",
    "system.py",
    "variables",
)


def _iter_surface_files() -> list[Path]:
files: list[Path] = []
for relative_path in DATA_BUILD_SURFACE:
path = PACKAGE_ROOT / relative_path
if path.is_file():
files.append(path)
continue
if path.is_dir():
files.extend(
child
for child in sorted(path.rglob("*"))
if child.is_file()
and "__pycache__" not in child.parts
and child.suffix not in {".pyc", ".pyo"}
)
return files


def _get_package_version() -> str | None:
    """Look up the installed version of the ``PACKAGE_NAME`` distribution.

    Returns:
        The version string reported by ``importlib.metadata``, or ``None``
        when no distribution with that name is installed.
    """
    try:
        installed_version = metadata.version(PACKAGE_NAME)
    except metadata.PackageNotFoundError:
        installed_version = None
    return installed_version


def _get_git_sha() -> str | None:
for candidate in (PACKAGE_ROOT, *PACKAGE_ROOT.parents):
if not (candidate / ".git").exists():
continue
try:
return subprocess.check_output(
["git", "-C", str(candidate), "rev-parse", "HEAD"],
stderr=subprocess.DEVNULL,
text=True,
).strip()
except Exception:
return None
return None


@lru_cache(maxsize=1)
def get_data_build_fingerprint() -> str:
    """Compute a SHA-256 fingerprint over the data-build surface.

    For each file returned by ``_iter_surface_files`` the hash is fed, in
    order: the file's POSIX-style path relative to ``PACKAGE_ROOT`` (UTF-8),
    a NUL byte, the file's raw bytes, and another NUL byte.

    The result is memoized, so files are read once per process unless
    ``get_data_build_fingerprint.cache_clear()`` is called.

    Returns:
        The digest formatted as ``"sha256:<hex digest>"``.
    """
    separator = b"\0"
    hasher = hashlib.sha256()
    for surface_file in _iter_surface_files():
        posix_name = surface_file.relative_to(PACKAGE_ROOT).as_posix()
        chunks = (
            posix_name.encode("utf-8"),
            separator,
            surface_file.read_bytes(),
            separator,
        )
        for chunk in chunks:
            hasher.update(chunk)
    return "sha256:" + hasher.hexdigest()


def get_data_build_metadata() -> dict[str, str | None]:
    """Assemble provenance metadata for a data build.

    Returns:
        A dict with the keys ``"name"`` (``PACKAGE_NAME``), ``"version"``
        (installed package version or ``None``), ``"git_sha"`` (commit SHA
        or ``None``) and ``"data_build_fingerprint"`` (the memoized
        surface fingerprint).
    """
    provenance: dict[str, str | None] = {"name": PACKAGE_NAME}
    provenance["version"] = _get_package_version()
    provenance["git_sha"] = _get_git_sha()
    provenance["data_build_fingerprint"] = get_data_build_fingerprint()
    return provenance
43 changes: 43 additions & 0 deletions policyengine_us/tests/test_build_metadata.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
from unittest.mock import patch

from policyengine_us.build_metadata import (
get_data_build_fingerprint,
get_data_build_metadata,
)


def test_data_build_fingerprint_is_stable_within_process():
    """Recomputing the fingerprint in one process yields the same value."""
    get_data_build_fingerprint.cache_clear()
    first = get_data_build_fingerprint()

    # Drop the memoized value so the second call actually re-hashes the
    # files; without this the comparison would only exercise lru_cache.
    get_data_build_fingerprint.cache_clear()
    second = get_data_build_fingerprint()

    assert first.startswith("sha256:")
    # A SHA-256 hex digest is always 64 characters long.
    assert len(first.removeprefix("sha256:")) == 64
    assert first == second


def test_get_data_build_metadata_includes_version_git_sha_and_fingerprint():
    """The metadata dict carries name, version, git SHA and fingerprint."""
    get_data_build_fingerprint.cache_clear()

    # Stub each lookup so the assertion does not depend on the environment.
    version_stub = patch(
        "policyengine_us.build_metadata._get_package_version",
        return_value="1.602.0",
    )
    git_sha_stub = patch(
        "policyengine_us.build_metadata._get_git_sha",
        return_value="deadbeef",
    )
    fingerprint_stub = patch(
        "policyengine_us.build_metadata.get_data_build_fingerprint",
        return_value="sha256:fingerprint",
    )
    expected = {
        "name": "policyengine-us",
        "version": "1.602.0",
        "git_sha": "deadbeef",
        "data_build_fingerprint": "sha256:fingerprint",
    }

    with version_stub, git_sha_stub, fingerprint_stub:
        observed = get_data_build_metadata()

    assert observed == expected
Loading