Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changelog.d/fix-vat-parameterize.fixed.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Parameterise the VAT standard rate and reduced-rate share in the ETB-based VAT imputation by reading `gov.hmrc.vat.standard_rate` and `gov.hmrc.vat.reduced_rate_share` from the `policyengine_uk` parameter tree for the training year, with a `VAT_RATE_BY_YEAR` fallback for offline use. Promote the hardcoded `etb.year == 2020` filter to a `year` argument that defaults to `DEFAULT_ETB_YEAR`.
90 changes: 70 additions & 20 deletions policyengine_uk_data/datasets/imputations/vat.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,13 @@

This module imputes household VAT expenditure rates based on demographic
characteristics using machine learning models trained on ETB survey data.

The ETB VAT columns report the standard-rate VAT actually paid plus a
reduced-rate share of expenditure. To back out the underlying
full-rate-taxable expenditure we scale the VAT paid by one minus an
OBR-published reduced-rate share of consumption and divide by the
statutory VAT standard rate. Both values are parameterised per year, so
later years (or forthcoming rate changes) don't need a code edit.
"""

import pandas as pd
Expand All @@ -14,39 +21,82 @@

ETB_TAB_FOLDER = STORAGE_FOLDER / "etb_1977_21"

CONSUMPTION_PCT_REDUCED_RATE = 0.03 # From OBR's VAT page
CURRENT_VAT_RATE = 0.2
# ETB survey vintage the imputation model is trained on by default. It stays
# at 2020 so the checked-in vat.pkl fingerprint remains valid; keeping it in
# one named constant means a future vintage bump is a single-line change
# instead of a hunt for inline `etb.year == 2020` comparisons.
DEFAULT_ETB_YEAR = 2020

# Last-resort VAT parameters for environments where `policyengine_uk` is
# unavailable (e.g. unit tests). They reflect the 2020-21 UK statutory
# position.
_FALLBACK_VAT_STANDARD_RATE = 0.20
_FALLBACK_REDUCED_RATE_SHARE = 0.03

# Hand-maintained fallback table: year → (standard rate, reduced-rate share).
# Consulted only when the `policyengine_uk` parameters cannot be read. Keep it
# short; add a year only when the team agrees a VAT change needs a hardcoded
# value until the upstream parameter file catches up.
VAT_RATE_BY_YEAR: dict[int, tuple[float, float]] = {
    2020: (0.20, 0.03),
    2021: (0.20, 0.03),
}

PREDICTORS = ["is_adult", "is_child", "is_SP_age", "household_net_income"]
IMPUTATIONS = ["full_rate_vat_expenditure_rate"]


def generate_etb_table(etb: pd.DataFrame):
def _get_vat_parameters(year: int) -> tuple[float, float]:
    """Return ``(standard_rate, reduced_rate_share)`` for the given calendar year.

    Resolution order:

    1. Live `policyengine_uk` parameters (``gov.hmrc.vat.standard_rate`` and
       ``gov.hmrc.vat.reduced_rate_share``), evaluated at ``str(year)``.
    2. The module-level ``VAT_RATE_BY_YEAR`` table.
    3. The 2020-21 statutory values held in ``_FALLBACK_VAT_STANDARD_RATE``
       and ``_FALLBACK_REDUCED_RATE_SHARE``.

    Steps 2 and 3 are best-effort so offline environments keep working. For a
    year missing from ``VAT_RATE_BY_YEAR`` the 2020-21 values are returned
    even though they may not apply to that year, so every fallback is logged
    (info for a tabulated year, warning for the last-resort values) rather
    than happening silently.

    Args:
        year: Calendar year the parameters should apply to.

    Returns:
        Tuple of ``(standard_rate, reduced_rate_share)`` as floats.
    """
    import logging

    logger = logging.getLogger(__name__)

    try:
        from policyengine_uk.system import system

        vat = system.parameters.gov.hmrc.vat
        return (
            float(vat.standard_rate(str(year))),
            float(vat.reduced_rate_share(str(year))),
        )
    except Exception as exc:
        # Deliberately broad: a missing package, a missing parameter node and
        # a failed evaluation should all degrade to the fallback values
        # instead of aborting the imputation.
        if year in VAT_RATE_BY_YEAR:
            logger.info(
                "policyengine_uk VAT parameters unavailable for %s (%s: %s); "
                "using VAT_RATE_BY_YEAR entry.",
                year,
                type(exc).__name__,
                exc,
            )
            return VAT_RATE_BY_YEAR[year]

        logger.warning(
            "policyengine_uk VAT parameters unavailable for %s (%s: %s) and "
            "no VAT_RATE_BY_YEAR entry exists; using 2020-21 fallback values, "
            "which may not apply to that year.",
            year,
            type(exc).__name__,
            exc,
        )
        return _FALLBACK_VAT_STANDARD_RATE, _FALLBACK_REDUCED_RATE_SHARE


def generate_etb_table(etb: pd.DataFrame, year: int = DEFAULT_ETB_YEAR) -> pd.DataFrame:
"""
Clean and transform ETB data for VAT imputation model training.

Args:
etb: Raw ETB survey data DataFrame.
year: ETB survey year to filter to. Defaults to ``DEFAULT_ETB_YEAR``.

Returns:
Cleaned DataFrame with VAT expenditure rates calculated.
"""
etb_2020 = etb[etb.year == 2020].dropna()
for col in etb_2020:
etb_2020[col] = pd.to_numeric(etb_2020[col], errors="coerce")

etb_2020_df = pd.DataFrame()
etb_2020_df["is_adult"] = etb_2020.adults
etb_2020_df["is_child"] = etb_2020.childs
etb_2020_df["is_SP_age"] = etb_2020.noretd
etb_2020_df["household_net_income"] = etb_2020.disinc * 52
etb_2020_df["full_rate_vat_expenditure_rate"] = (
etb_2020.totvat * (1 - CONSUMPTION_PCT_REDUCED_RATE) / CURRENT_VAT_RATE
) / (etb_2020.expdis - etb_2020.totvat)
return etb_2020_df[~etb_2020_df.full_rate_vat_expenditure_rate.isna()]


def save_imputation_models():
standard_rate, reduced_rate_share = _get_vat_parameters(year)

etb_year = etb[etb.year == year].dropna()
for col in etb_year:
etb_year[col] = pd.to_numeric(etb_year[col], errors="coerce")

etb_year_df = pd.DataFrame()
etb_year_df["is_adult"] = etb_year.adults
etb_year_df["is_child"] = etb_year.childs
etb_year_df["is_SP_age"] = etb_year.noretd
etb_year_df["household_net_income"] = etb_year.disinc * 52
etb_year_df["full_rate_vat_expenditure_rate"] = (
etb_year.totvat * (1 - reduced_rate_share) / standard_rate
) / (etb_year.expdis - etb_year.totvat)
return etb_year_df[~etb_year_df.full_rate_vat_expenditure_rate.isna()]


def save_imputation_models(year: int = DEFAULT_ETB_YEAR):
"""
Train and save VAT imputation model.

Expand All @@ -61,7 +111,7 @@ def save_imputation_models():
delimiter="\t",
low_memory=False,
)
etb = generate_etb_table(etb)
etb = generate_etb_table(etb, year=year)
etb = etb[PREDICTORS + IMPUTATIONS]
vat.fit(etb[PREDICTORS], etb[IMPUTATIONS])
vat.save(STORAGE_FOLDER / "vat.pkl")
Expand Down
106 changes: 106 additions & 0 deletions policyengine_uk_data/tests/test_vat_parameters.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
"""Tests for parameterised VAT constants in `datasets/imputations/vat.py`.

Covers bug-hunt finding U7: the original code hardcoded
``CURRENT_VAT_RATE = 0.2``, ``CONSUMPTION_PCT_REDUCED_RATE = 0.03`` and
the ``etb.year == 2020`` filter inline, so any change to VAT rates,
reduced-rate share, or training vintage required a code edit across
multiple scattered lines.
"""

from __future__ import annotations

import pandas as pd
import pytest


def test_get_vat_parameters_reads_from_policyengine_uk():
    """`_get_vat_parameters(2020)` must match the live `policyengine_uk` values.

    Skipped when `policyengine_uk` cannot be imported.
    """
    try:
        from policyengine_uk.system import system
    except Exception:
        pytest.skip("policyengine_uk not available")

    from policyengine_uk_data.datasets.imputations import vat as vat_module

    # Read the reference values straight from the parameter tree.
    vat_node = system.parameters.gov.hmrc.vat
    expected = (
        float(vat_node.standard_rate("2020")),
        float(vat_node.reduced_rate_share("2020")),
    )

    standard, reduced = vat_module._get_vat_parameters(2020)
    assert (standard, reduced) == pytest.approx(expected)


def test_vat_rate_by_year_fallback_matches_2020_statute():
    """The offline table's 2020 entry must equal the statutory 2020-21 rates."""
    from policyengine_uk_data.datasets.imputations import vat as vat_module

    statutory_2020 = (0.2, 0.03)  # (standard rate, reduced-rate share)
    assert vat_module.VAT_RATE_BY_YEAR[2020] == statutory_2020


def test_generate_etb_table_uses_year_param():
    """The `year` argument selects which ETB rows are kept.

    Before the fix the filter was a hardcoded ``etb.year == 2020``; now the
    year is a parameter with a default.
    """
    from policyengine_uk_data.datasets.imputations import vat as vat_module

    etb = pd.DataFrame(
        {
            "year": [2020, 2020, 2021, 2021],
            "adults": [1, 2, 1, 2],
            "childs": [0, 1, 0, 1],
            "noretd": [0, 0, 1, 1],
            "disinc": [500.0, 800.0, 600.0, 900.0],
            "totvat": [50.0, 80.0, 60.0, 90.0],
            "expdis": [500.0, 800.0, 600.0, 900.0],
        }
    )

    # household_net_income is disinc * 52. The two years have disjoint income
    # sets, so matching them proves the rows came from the requested year.
    expected_incomes = {
        2020: {500 * 52, 800 * 52},
        2021: {600 * 52, 900 * 52},
    }

    for survey_year, incomes in expected_incomes.items():
        table = vat_module.generate_etb_table(etb, year=survey_year)
        assert len(table) == 2
        assert set(table["household_net_income"].to_numpy()) == incomes


def test_generate_etb_table_uses_year_specific_vat_rate(monkeypatch):
    """A higher standard rate must lower ``full_rate_vat_expenditure_rate``."""
    from policyengine_uk_data.datasets.imputations import vat as vat_module

    # Two otherwise identical households that differ only in survey year.
    etb = pd.DataFrame(
        {
            "year": [2020, 2030],
            "adults": [1, 1],
            "childs": [0, 0],
            "noretd": [0, 0],
            "disinc": [1000.0, 1000.0],
            "totvat": [100.0, 100.0],
            "expdis": [1000.0, 1000.0],
        }
    )

    def _fake_params(year: int):
        # 20% for the real vintage, 25% for the hypothetical one; no
        # reduced-rate share in either case.
        if year == 2020:
            return (0.2, 0.0)
        return (0.25, 0.0)

    monkeypatch.setattr(vat_module, "_get_vat_parameters", _fake_params)

    column = "full_rate_vat_expenditure_rate"
    rate_2020 = vat_module.generate_etb_table(etb, year=2020)[column].iloc[0]
    rate_hypothetical = vat_module.generate_etb_table(etb, year=2030)[column].iloc[0]

    # totvat is divided by the standard rate, so a bigger rate means a
    # smaller implied full-rate expenditure.
    assert rate_hypothetical < rate_2020
Loading