Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changelog.d/fix-spi-main-and-params.fixed.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Fix `datasets/spi.py` `__main__` crash (two-arg call to three-arg `create_spi`), parameterise the hardcoded £1,250 marriage allowance from policyengine-uk parameters, seed the age imputation RNG, and surface unknown GORCODE regions as `UNKNOWN` instead of silently mapping them to `SOUTH_EAST`.
135 changes: 103 additions & 32 deletions policyengine_uk_data/datasets/spi.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,102 @@
from policyengine_uk.data import UKSingleYearDataset


# Age-range midpoints for random age imputation.
# Key -1 covers records with no reported AGERANGE — use a broad working-age
# span rather than silently bucketing them into one slot.
AGE_RANGES = {
-1: (16, 70),
1: (16, 25),
2: (25, 35),
3: (35, 45),
4: (45, 55),
5: (55, 65),
6: (65, 74),
7: (74, 90),
}

# SPI GORCODE → policyengine-uk region enum.
# NB the SPI codebook does not include a "region unknown" code; we surface
# unknown codes explicitly rather than silently mapping them to SOUTH_EAST
# (which the previous implementation did, distorting regional income totals).
REGION_MAP = {
1: "NORTH_EAST",
2: "NORTH_WEST",
3: "YORKSHIRE",
4: "EAST_MIDLANDS",
5: "WEST_MIDLANDS",
6: "EAST_OF_ENGLAND",
7: "LONDON",
8: "SOUTH_EAST",
9: "SOUTH_WEST",
10: "WALES",
11: "SCOTLAND",
12: "NORTHERN_IRELAND",
}


def _get_marriage_allowance(fiscal_year: int) -> float:
"""Return the maximum Marriage Allowance transfer for the given UK fiscal
year in £. This equals ``max`` × ``personal_allowance`` at the start of
the fiscal year (6 April), which is how HMRC publishes it. Falls back to
the pre-2021-22 hard value of £1,250 if `policyengine_uk` cannot be
imported (e.g., during unit tests that avoid the heavy import).
"""
try:
from policyengine_uk.system import system
except Exception:
return 1_250.0

instant = f"{fiscal_year}-04-06"
pa = system.parameters.gov.hmrc.income_tax.allowances.personal_allowance.amount(
instant
)
ma_cap_rate = (
system.parameters.gov.hmrc.income_tax.allowances.marriage_allowance.max(instant)
)
# HMRC rounds to the nearest £10 downward; use the explicit rounding param
# if it exists, otherwise leave the computed value as-is.
try:
rounding_increment = system.parameters.gov.hmrc.income_tax.allowances.marriage_allowance.rounding_increment(
instant
)
except Exception:
rounding_increment = None

value = pa * ma_cap_rate
if rounding_increment:
# HMRC rounds the cap UP to the nearest rounding increment
# (Income Tax Act 2007 s. 55B(5)); matches the formula in
# policyengine_uk.variables.gov.hmrc.income_tax.allowances.marriage_allowance.
increment = float(rounding_increment)
value = np.ceil(value / increment) * increment
return float(value)


def create_spi(
spi_data_file_path: str, fiscal_year: int, output_file_path: str
spi_data_file_path: str,
fiscal_year: int,
output_file_path: str | None = None,
seed: int = 0,
unknown_region: str = "UNKNOWN",
) -> UKSingleYearDataset:
"""Build a :class:`UKSingleYearDataset` from an SPI microdata `.tab` file.

Args:
spi_data_file_path: Path to the SPI `.tab` file (e.g. `put2021uk.tab`).
fiscal_year: UK fiscal year for the dataset (e.g. 2020 → 2020-21).
output_file_path: Unused here — callers may save the returned dataset
themselves with ``dataset.save(path)``. Kept as a kwarg so
existing call sites don't break.
seed: Seed for the random age imputation. Fixed by default so builds
are deterministic.
unknown_region: Fallback region label for SPI GORCODE values outside
the documented 1-12 range. Defaults to ``"UNKNOWN"`` so regional
totals are not silently distorted; pass ``"SOUTH_EAST"`` to
reproduce legacy behaviour if needed.
"""
df = pd.read_csv(spi_data_file_path, delimiter="\t")
rng = np.random.default_rng(seed)

person = pd.DataFrame()
benunit = pd.DataFrame()
Expand All @@ -22,22 +114,7 @@ def create_spi(
household["household_weight"] = df.FACT
person["dividend_income"] = df.DIVIDENDS
person["gift_aid"] = df.GIFTAID
household["region"] = df.GORCODE.map(
{
1: "NORTH_EAST",
2: "NORTH_WEST",
3: "YORKSHIRE",
4: "EAST_MIDLANDS",
5: "WEST_MIDLANDS",
6: "EAST_OF_ENGLAND",
7: "LONDON",
8: "SOUTH_EAST",
9: "SOUTH_WEST",
10: "WALES",
11: "SCOTLAND",
12: "NORTHERN_IRELAND",
}
).fillna("SOUTH_EAST")
household["region"] = df.GORCODE.map(REGION_MAP).fillna(unknown_region)
household["rent"] = 0
household["tenure_type"] = "OWNED_OUTRIGHT"
household["council_tax"] = 0
Expand All @@ -59,21 +136,12 @@ def create_spi(
person["capital_allowances"] = df.CAPALL
person["loss_relief"] = df.LOSSBF

AGE_RANGES = {
-1: (16, 70),
1: (16, 25),
2: (25, 35),
3: (35, 45),
4: (45, 55),
5: (55, 65),
6: (65, 74),
7: (74, 90),
}
age_range = df.AGERANGE

# Randomly assign ages in age ranges

percent_along_age_range = np.random.rand(len(df))
# Randomly assign ages within each AGERANGE bucket using a seeded local
# generator so builds are reproducible (previously used the unseeded
# global np.random.rand).
percent_along_age_range = rng.random(len(df))
min_age = np.array([AGE_RANGES[age][0] for age in age_range])
max_age = np.array([AGE_RANGES[age][1] for age in age_range])
person["age"] = (min_age + (max_age - min_age) * percent_along_age_range).astype(
Expand All @@ -91,7 +159,10 @@ def create_spi(
person["other_deductions"] = df.MOTHDED + df.DEFICIEN
person["married_couples_allowance"] = df.MCAS
person["blind_persons_allowance"] = df.BPADUE
person["marriage_allowance"] = np.where(df.MAIND == 1, 1_250, 0)
# Pull the Marriage Allowance cap from policyengine-uk parameters keyed
# on the fiscal year, rather than hardcoding 2020-21's £1,250 figure.
ma_cap = _get_marriage_allowance(fiscal_year)
person["marriage_allowance"] = np.where(df.MAIND == 1, ma_cap, 0)

dataset = UKSingleYearDataset(
person=person,
Expand All @@ -106,5 +177,5 @@ def create_spi(
spi_data_file_path = STORAGE_FOLDER / "spi_2020_21" / "put2021uk.tab"
fiscal_year = 2020
output_file_path = STORAGE_FOLDER / "spi_2020.h5"
spi = create_spi(spi_data_file_path, fiscal_year)
spi = create_spi(spi_data_file_path, fiscal_year, output_file_path)
spi.save(output_file_path)
168 changes: 168 additions & 0 deletions policyengine_uk_data/tests/test_spi_build.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,168 @@
"""Regression tests for `policyengine_uk_data.datasets.spi`.

Covers the three bugs flagged in the bug hunt:

- The ``__main__`` block called ``create_spi`` with two positional args but
the signature required three. This test asserts the function is callable
with two positional args (``spi_data_file_path`` and ``fiscal_year``) and
that the optional ``output_file_path`` kwarg is accepted.
- Age imputation was non-deterministic (unseeded ``np.random.rand``). This
test asserts two runs with the same seed produce identical ``age``
columns.
- Unknown GORCODE values were silently mapped to ``SOUTH_EAST``. This test
asserts the default fallback label is now ``UNKNOWN``.
"""

from __future__ import annotations

import importlib.util
import inspect

import numpy as np
import pandas as pd
import pytest

if importlib.util.find_spec("policyengine_uk") is None:
pytest.skip(
"policyengine_uk not available in test environment",
allow_module_level=True,
)


SPI_COLUMNS = [
"SREF",
"FACT",
"DIVIDENDS",
"GIFTAID",
"GORCODE",
"INCBBS",
"INCPROP",
"PAY",
"EPB",
"EXPS",
"PENSION",
"PSAV_XS",
"PENSRLF",
"PROFITS",
"CAPALL",
"LOSSBF",
"AGERANGE",
"SRP",
"TAX_CRED",
"MOTHINC",
"INCPBEN",
"OSSBEN",
"TAXTERM",
"UBISJA",
"OTHERINC",
"GIFTINV",
"OTHERINV",
"COVNTS",
"MOTHDED",
"DEFICIEN",
"MCAS",
"BPADUE",
"MAIND",
]


def _write_fake_spi(path, gor_values=(1, 2, 3), maind_values=(1, 0, 1)):
"""Write a minimal SPI-shaped tab file for tests.

The real SPI file has dozens of columns; the test only needs them to
exist with sensible types so ``create_spi`` can build dataframes.
"""
n = len(gor_values)
data = {col: np.zeros(n, dtype=float) for col in SPI_COLUMNS}
data["SREF"] = np.arange(1, n + 1)
data["FACT"] = np.ones(n)
data["GORCODE"] = list(gor_values)
data["MAIND"] = list(maind_values)
data["AGERANGE"] = [1] * n # bucket (16, 25)
df = pd.DataFrame(data)
df.to_csv(path, sep="\t", index=False)


def test_create_spi_accepts_two_positional_args(tmp_path):
"""The ``__main__`` crash bug: ``create_spi(path, year)`` must work."""
from policyengine_uk_data.datasets.spi import create_spi

sig = inspect.signature(create_spi)
params = list(sig.parameters.values())
# First two params are required positional; remaining params are optional
# so two-arg calls succeed.
assert params[0].default is inspect.Parameter.empty
assert params[1].default is inspect.Parameter.empty
for p in params[2:]:
assert p.default is not inspect.Parameter.empty, (
f"Parameter {p.name!r} must have a default so create_spi(path, "
f"year) stays callable without breaking the __main__ block."
)


def test_create_spi_age_imputation_is_deterministic(tmp_path):
"""Same seed → identical age column. Was unseeded in the buggy version."""
from policyengine_uk_data.datasets.spi import create_spi

tab = tmp_path / "spi.tab"
_write_fake_spi(tab, gor_values=(1, 2, 3, 4, 5), maind_values=(0, 0, 0, 0, 0))

ds_a = create_spi(tab, 2020, seed=42)
ds_b = create_spi(tab, 2020, seed=42)
ds_c = create_spi(tab, 2020, seed=123)

assert (ds_a.person["age"].to_numpy() == ds_b.person["age"].to_numpy()).all()
# Different seeds should give some variation for the (16, 25) bucket.
assert not (ds_a.person["age"].to_numpy() == ds_c.person["age"].to_numpy()).all()


def test_create_spi_unknown_gorcode_does_not_silently_become_south_east(
tmp_path,
):
"""Unmapped GORCODE rows now get UNKNOWN, not SOUTH_EAST, by default."""
from policyengine_uk_data.datasets.spi import create_spi

tab = tmp_path / "spi.tab"
_write_fake_spi(
tab,
gor_values=(99, 7, 99), # 99 is undocumented → should be UNKNOWN
maind_values=(0, 0, 0),
)

ds = create_spi(tab, 2020, seed=0)
regions = ds.household["region"].tolist()
assert regions[0] == "UNKNOWN"
assert regions[1] == "LONDON" # GORCODE 7 maps to LONDON
assert regions[2] == "UNKNOWN"
# Legacy behaviour is still accessible via the kwarg for callers that
# relied on it.
ds_legacy = create_spi(tab, 2020, seed=0, unknown_region="SOUTH_EAST")
assert ds_legacy.household["region"].tolist()[0] == "SOUTH_EAST"


def test_create_spi_marriage_allowance_uses_fiscal_year_parameters(tmp_path):
"""MA cap should follow the fiscal year's 10% × Personal Allowance rule.

2020-21 PA = £12,500 so MA cap = £1,250 (the historical hardcoded value).
2021-22 onwards PA = £12,570 so MA cap = £1,257, rounded down to
increments per the rounding_increment parameter (HMRC publishes £1,260
for 2025-26).
"""
from policyengine_uk_data.datasets.spi import create_spi

tab = tmp_path / "spi.tab"
_write_fake_spi(tab, gor_values=(1, 2, 3), maind_values=(1, 0, 1))

ds_2020 = create_spi(tab, 2020, seed=0)
marriage_2020 = ds_2020.person["marriage_allowance"].to_numpy()
# Expect eligible rows (MAIND == 1) to receive £1,250 and ineligible 0.
assert (marriage_2020[[0, 2]] == 1_250).all()
assert marriage_2020[1] == 0

ds_2025 = create_spi(tab, 2025, seed=0)
marriage_2025 = ds_2025.person["marriage_allowance"].to_numpy()
# Post-2020, PA is £12,570 so the cap is £1,257 before rounding; the
# published HMRC value is £1,260 (rounding to nearest £10). Accept
# either, but require it's NOT the stale 2020-21 £1,250 figure.
assert marriage_2025[0] != 1_250
assert marriage_2025[0] >= 1_250 # PA has only risen since 2020
Loading