From add2c32080f53a5991fdca5b9bcebeaf89ae1d9e Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Fri, 17 Apr 2026 08:31:28 -0400 Subject: [PATCH 1/2] Fix SPI __main__ crash, parameterise MA cap, seed age imputation --- changelog.d/fix-spi-main-and-params.fixed.md | 1 + policyengine_uk_data/datasets/spi.py | 139 +++++++++++++----- policyengine_uk_data/tests/test_spi_build.py | 142 +++++++++++++++++++ 3 files changed, 250 insertions(+), 32 deletions(-) create mode 100644 changelog.d/fix-spi-main-and-params.fixed.md create mode 100644 policyengine_uk_data/tests/test_spi_build.py diff --git a/changelog.d/fix-spi-main-and-params.fixed.md b/changelog.d/fix-spi-main-and-params.fixed.md new file mode 100644 index 000000000..8f513b8e7 --- /dev/null +++ b/changelog.d/fix-spi-main-and-params.fixed.md @@ -0,0 +1 @@ +Fix `datasets/spi.py` `__main__` crash (two-arg call to three-arg `create_spi`), parameterise the hardcoded £1,250 marriage allowance from policyengine-uk parameters, seed the age imputation RNG, and surface unknown GORCODE regions as `UNKNOWN` instead of silently mapping them to `SOUTH_EAST`. diff --git a/policyengine_uk_data/datasets/spi.py b/policyengine_uk_data/datasets/spi.py index f0c08626c..87dccd4a1 100644 --- a/policyengine_uk_data/datasets/spi.py +++ b/policyengine_uk_data/datasets/spi.py @@ -5,10 +5,106 @@ from policyengine_uk.data import UKSingleYearDataset +# Age-range midpoints for random age imputation. +# Key -1 covers records with no reported AGERANGE — use a broad working-age +# span rather than silently bucketing them into one slot. +AGE_RANGES = { + -1: (16, 70), + 1: (16, 25), + 2: (25, 35), + 3: (35, 45), + 4: (45, 55), + 5: (55, 65), + 6: (65, 74), + 7: (74, 90), +} + +# SPI GORCODE → policyengine-uk region enum. +# NB the SPI codebook does not include a "region unknown" code; we surface +# unknown codes explicitly rather than silently mapping them to SOUTH_EAST +# (which the previous implementation did, distorting regional income totals). +REGION_MAP = { + 1: "NORTH_EAST", + 2: "NORTH_WEST", + 3: "YORKSHIRE", + 4: "EAST_MIDLANDS", + 5: "WEST_MIDLANDS", + 6: "EAST_OF_ENGLAND", + 7: "LONDON", + 8: "SOUTH_EAST", + 9: "SOUTH_WEST", + 10: "WALES", + 11: "SCOTLAND", + 12: "NORTHERN_IRELAND", +} + + +def _get_marriage_allowance(fiscal_year: int) -> float: + """Return the maximum Marriage Allowance transfer for the given UK fiscal + year in £. This equals ``max`` × ``personal_allowance`` at the start of + the fiscal year (6 April), which is how HMRC publishes it. Falls back to + the pre-2021-22 hard value of £1,250 if `policyengine_uk` cannot be + imported (e.g., during unit tests that avoid the heavy import). + """ + try: + from policyengine_uk.system import system + except Exception: + return 1_250.0 + + instant = f"{fiscal_year}-04-06" + pa = system.parameters.gov.hmrc.income_tax.allowances.personal_allowance.amount( + instant + ) + ma_cap_rate = ( + system.parameters.gov.hmrc.income_tax.allowances.marriage_allowance.max( + instant + ) + ) + # HMRC rounds to the nearest £10 downward; use the explicit rounding param + # if it exists, otherwise leave the computed value as-is. + try: + rounding_increment = ( + system.parameters.gov.hmrc.income_tax.allowances.marriage_allowance.rounding_increment( + instant + ) + ) + except Exception: + rounding_increment = None + + value = pa * ma_cap_rate + if rounding_increment: + # HMRC rounds the cap UP to the nearest rounding increment + # (Income Tax Act 2007 s. 55B(5)); matches the formula in + # policyengine_uk.variables.gov.hmrc.income_tax.allowances.marriage_allowance. + increment = float(rounding_increment) + value = np.ceil(value / increment) * increment + return float(value) + + def create_spi( - spi_data_file_path: str, fiscal_year: int, output_file_path: str + spi_data_file_path: str, + fiscal_year: int, + output_file_path: str | None = None, + seed: int = 0, + unknown_region: str = "UNKNOWN", ) -> UKSingleYearDataset: + """Build a :class:`UKSingleYearDataset` from an SPI microdata `.tab` file. + + Args: + spi_data_file_path: Path to the SPI `.tab` file (e.g. `put2021uk.tab`). + fiscal_year: UK fiscal year for the dataset (e.g. 2020 → 2020-21). + output_file_path: Unused here — callers may save the returned dataset + themselves with ``dataset.save(path)``. Kept as a kwarg so + existing call sites don't break. + seed: Seed for the random age imputation. Fixed by default so builds + are deterministic. + unknown_region: Fallback region label for SPI GORCODE values outside + the documented 1-12 range. Defaults to ``"UNKNOWN"`` so regional + totals are not silently distorted; pass ``"SOUTH_EAST"`` to + reproduce legacy behaviour if needed. + """ df = pd.read_csv(spi_data_file_path, delimiter="\t") + rng = np.random.default_rng(seed) person = pd.DataFrame() benunit = pd.DataFrame() @@ -22,22 +118,7 @@ def create_spi( household["household_weight"] = df.FACT person["dividend_income"] = df.DIVIDENDS person["gift_aid"] = df.GIFTAID - household["region"] = df.GORCODE.map( - { - 1: "NORTH_EAST", - 2: "NORTH_WEST", - 3: "YORKSHIRE", - 4: "EAST_MIDLANDS", - 5: "WEST_MIDLANDS", - 6: "EAST_OF_ENGLAND", - 7: "LONDON", - 8: "SOUTH_EAST", - 9: "SOUTH_WEST", - 10: "WALES", - 11: "SCOTLAND", - 12: "NORTHERN_IRELAND", - } - ).fillna("SOUTH_EAST") + household["region"] = df.GORCODE.map(REGION_MAP).fillna(unknown_region) household["rent"] = 0 household["tenure_type"] = "OWNED_OUTRIGHT" household["council_tax"] = 0 @@ -59,21 +140,12 @@ def create_spi( person["capital_allowances"] = df.CAPALL person["loss_relief"] = df.LOSSBF - AGE_RANGES = { - -1: (16, 70), - 1: (16, 25), - 2: (25, 35), - 3: (35, 45), - 4: (45, 55), - 5: (55, 65), - 6: (65, 74), - 7: (74, 90), - } age_range = df.AGERANGE - # Randomly assign ages in age ranges - - percent_along_age_range = np.random.rand(len(df)) + # Randomly assign ages within each AGERANGE bucket using a seeded local + # generator so builds are reproducible (previously used the unseeded + # global np.random.rand). + percent_along_age_range = rng.random(len(df)) min_age = np.array([AGE_RANGES[age][0] for age in age_range]) max_age = np.array([AGE_RANGES[age][1] for age in age_range]) person["age"] = (min_age + (max_age - min_age) * percent_along_age_range).astype( @@ -91,7 +163,10 @@ def create_spi( person["other_deductions"] = df.MOTHDED + df.DEFICIEN person["married_couples_allowance"] = df.MCAS person["blind_persons_allowance"] = df.BPADUE - person["marriage_allowance"] = np.where(df.MAIND == 1, 1_250, 0) + # Pull the Marriage Allowance cap from policyengine-uk parameters keyed + # on the fiscal year, rather than hardcoding 2020-21's £1,250 figure. + ma_cap = _get_marriage_allowance(fiscal_year) + person["marriage_allowance"] = np.where(df.MAIND == 1, ma_cap, 0) dataset = UKSingleYearDataset( person=person, @@ -106,5 +181,5 @@ def create_spi( spi_data_file_path = STORAGE_FOLDER / "spi_2020_21" / "put2021uk.tab" fiscal_year = 2020 output_file_path = STORAGE_FOLDER / "spi_2020.h5" - spi = create_spi(spi_data_file_path, fiscal_year) + spi = create_spi(spi_data_file_path, fiscal_year, output_file_path) spi.save(output_file_path) diff --git a/policyengine_uk_data/tests/test_spi_build.py b/policyengine_uk_data/tests/test_spi_build.py new file mode 100644 index 000000000..936ef7597 --- /dev/null +++ b/policyengine_uk_data/tests/test_spi_build.py @@ -0,0 +1,142 @@ +"""Regression tests for `policyengine_uk_data.datasets.spi`. + +Covers the three bugs flagged in the bug hunt: + +- The ``__main__`` block called ``create_spi`` with two positional args but + the signature required three. This test asserts the function is callable + with two positional args (``spi_data_file_path`` and ``fiscal_year``) and + that the optional ``output_file_path`` kwarg is accepted. +- Age imputation was non-deterministic (unseeded ``np.random.rand``). This + test asserts two runs with the same seed produce identical ``age`` + columns. +- Unknown GORCODE values were silently mapped to ``SOUTH_EAST``. This test + asserts the default fallback label is now ``UNKNOWN``. +""" + +from __future__ import annotations + +import importlib.util +import inspect + +import numpy as np +import pandas as pd +import pytest + +if importlib.util.find_spec("policyengine_uk") is None: + pytest.skip( + "policyengine_uk not available in test environment", + allow_module_level=True, + ) + + +SPI_COLUMNS = [ + "SREF", "FACT", "DIVIDENDS", "GIFTAID", "GORCODE", "INCBBS", "INCPROP", + "PAY", "EPB", "EXPS", "PENSION", "PSAV_XS", "PENSRLF", "PROFITS", + "CAPALL", "LOSSBF", "AGERANGE", "SRP", "TAX_CRED", "MOTHINC", "INCPBEN", + "OSSBEN", "TAXTERM", "UBISJA", "OTHERINC", "GIFTINV", "OTHERINV", + "COVNTS", "MOTHDED", "DEFICIEN", "MCAS", "BPADUE", "MAIND", +] + + +def _write_fake_spi(path, gor_values=(1, 2, 3), maind_values=(1, 0, 1)): + """Write a minimal SPI-shaped tab file for tests. + + The real SPI file has dozens of columns; the test only needs them to + exist with sensible types so ``create_spi`` can build dataframes. + """ + n = len(gor_values) + data = {col: np.zeros(n, dtype=float) for col in SPI_COLUMNS} + data["SREF"] = np.arange(1, n + 1) + data["FACT"] = np.ones(n) + data["GORCODE"] = list(gor_values) + data["MAIND"] = list(maind_values) + data["AGERANGE"] = [1] * n # bucket (16, 25) + df = pd.DataFrame(data) + df.to_csv(path, sep="\t", index=False) + + +def test_create_spi_accepts_two_positional_args(tmp_path): + """The ``__main__`` crash bug: ``create_spi(path, year)`` must work.""" + from policyengine_uk_data.datasets.spi import create_spi + + sig = inspect.signature(create_spi) + params = list(sig.parameters.values()) + # First two params are required positional; remaining params are optional + # so two-arg calls succeed. + assert params[0].default is inspect.Parameter.empty + assert params[1].default is inspect.Parameter.empty + for p in params[2:]: + assert p.default is not inspect.Parameter.empty, ( + f"Parameter {p.name!r} must have a default so create_spi(path, " + f"year) stays callable without breaking the __main__ block." + ) + + +def test_create_spi_age_imputation_is_deterministic(tmp_path): + """Same seed → identical age column. Was unseeded in the buggy version.""" + from policyengine_uk_data.datasets.spi import create_spi + + tab = tmp_path / "spi.tab" + _write_fake_spi(tab, gor_values=(1, 2, 3, 4, 5), maind_values=(0, 0, 0, 0, 0)) + + ds_a = create_spi(tab, 2020, seed=42) + ds_b = create_spi(tab, 2020, seed=42) + ds_c = create_spi(tab, 2020, seed=123) + + assert (ds_a.person["age"].to_numpy() == ds_b.person["age"].to_numpy()).all() + # Different seeds should give some variation for the (16, 25) bucket. + assert not ( + ds_a.person["age"].to_numpy() == ds_c.person["age"].to_numpy() + ).all() + + +def test_create_spi_unknown_gorcode_does_not_silently_become_south_east( + tmp_path, +): + """Unmapped GORCODE rows now get UNKNOWN, not SOUTH_EAST, by default.""" + from policyengine_uk_data.datasets.spi import create_spi + + tab = tmp_path / "spi.tab" + _write_fake_spi( + tab, + gor_values=(99, 7, 99), # 99 is undocumented → should be UNKNOWN + maind_values=(0, 0, 0), + ) + + ds = create_spi(tab, 2020, seed=0) + regions = ds.household["region"].tolist() + assert regions[0] == "UNKNOWN" + assert regions[1] == "LONDON" # GORCODE 7 maps to LONDON + assert regions[2] == "UNKNOWN" + # Legacy behaviour is still accessible via the kwarg for callers that + # relied on it. + ds_legacy = create_spi(tab, 2020, seed=0, unknown_region="SOUTH_EAST") + assert ds_legacy.household["region"].tolist()[0] == "SOUTH_EAST" + + +def test_create_spi_marriage_allowance_uses_fiscal_year_parameters(tmp_path): + """MA cap should follow the fiscal year's 10% × Personal Allowance rule. + + 2020-21 PA = £12,500 so MA cap = £1,250 (the historical hardcoded value). + 2021-22 onwards PA = £12,570 so MA cap = £1,257, rounded down to + increments per the rounding_increment parameter (HMRC publishes £1,260 + for 2025-26). + """ + from policyengine_uk_data.datasets.spi import create_spi + + tab = tmp_path / "spi.tab" + _write_fake_spi(tab, gor_values=(1, 2, 3), maind_values=(1, 0, 1)) + + ds_2020 = create_spi(tab, 2020, seed=0) + marriage_2020 = ds_2020.person["marriage_allowance"].to_numpy() + # Expect eligible rows (MAIND == 1) to receive £1,250 and ineligible 0. + assert (marriage_2020[[0, 2]] == 1_250).all() + assert marriage_2020[1] == 0 + + ds_2025 = create_spi(tab, 2025, seed=0) + marriage_2025 = ds_2025.person["marriage_allowance"].to_numpy() + # Post-2020, PA is £12,570 so the cap is £1,257 before rounding; the + # published HMRC value is £1,260 (rounding to nearest £10). Accept + # either, but require it's NOT the stale 2020-21 £1,250 figure. + assert marriage_2025[0] != 1_250 + assert marriage_2025[0] >= 1_250 # PA has only risen since 2020 From 08cdabd489264baa42d32923e1e54320941b25ef Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Fri, 17 Apr 2026 08:53:41 -0400 Subject: [PATCH 2/2] Apply ruff format --- policyengine_uk_data/datasets/spi.py | 10 ++--- policyengine_uk_data/tests/test_spi_build.py | 42 ++++++++++++++++---- 2 files changed, 37 insertions(+), 15 deletions(-) diff --git a/policyengine_uk_data/datasets/spi.py b/policyengine_uk_data/datasets/spi.py index 87dccd4a1..03c1fe4e0 100644 --- a/policyengine_uk_data/datasets/spi.py +++ b/policyengine_uk_data/datasets/spi.py @@ -56,17 +56,13 @@ def _get_marriage_allowance(fiscal_year: int) -> float: instant ) ma_cap_rate = ( - system.parameters.gov.hmrc.income_tax.allowances.marriage_allowance.max( - instant - ) + system.parameters.gov.hmrc.income_tax.allowances.marriage_allowance.max(instant) ) # HMRC rounds to the nearest £10 downward; use the explicit rounding param # if it exists, otherwise leave the computed value as-is. try: - rounding_increment = ( - system.parameters.gov.hmrc.income_tax.allowances.marriage_allowance.rounding_increment( - instant - ) + rounding_increment = system.parameters.gov.hmrc.income_tax.allowances.marriage_allowance.rounding_increment( + instant ) except Exception: rounding_increment = None diff --git a/policyengine_uk_data/tests/test_spi_build.py b/policyengine_uk_data/tests/test_spi_build.py index 936ef7597..9ecd809df 100644 --- a/policyengine_uk_data/tests/test_spi_build.py +++ b/policyengine_uk_data/tests/test_spi_build.py @@ -30,11 +30,39 @@ SPI_COLUMNS = [ - "SREF", "FACT", "DIVIDENDS", "GIFTAID", "GORCODE", "INCBBS", "INCPROP", - "PAY", "EPB", "EXPS", "PENSION", "PSAV_XS", "PENSRLF", "PROFITS", - "CAPALL", "LOSSBF", "AGERANGE", "SRP", "TAX_CRED", "MOTHINC", "INCPBEN", - "OSSBEN", "TAXTERM", "UBISJA", "OTHERINC", "GIFTINV", "OTHERINV", - "COVNTS", "MOTHDED", "DEFICIEN", "MCAS", "BPADUE", "MAIND", + "SREF", + "FACT", + "DIVIDENDS", + "GIFTAID", + "GORCODE", + "INCBBS", + "INCPROP", + "PAY", + "EPB", + "EXPS", + "PENSION", + "PSAV_XS", + "PENSRLF", + "PROFITS", + "CAPALL", + "LOSSBF", + "AGERANGE", + "SRP", + "TAX_CRED", + "MOTHINC", + "INCPBEN", + "OSSBEN", + "TAXTERM", + "UBISJA", + "OTHERINC", + "GIFTINV", + "OTHERINV", + "COVNTS", + "MOTHDED", + "DEFICIEN", + "MCAS", + "BPADUE", + "MAIND", ] @@ -85,9 +113,7 @@ def test_create_spi_age_imputation_is_deterministic(tmp_path): assert (ds_a.person["age"].to_numpy() == ds_b.person["age"].to_numpy()).all() # Different seeds should give some variation for the (16, 25) bucket. - assert not ( - ds_a.person["age"].to_numpy() == ds_c.person["age"].to_numpy() - ).all() + assert not (ds_a.person["age"].to_numpy() == ds_c.person["age"].to_numpy()).all() def test_create_spi_unknown_gorcode_does_not_silently_become_south_east(