From d99f76b08db01d4bcd733648d1a55d1105bf74c1 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Sun, 19 Apr 2026 13:51:16 -0400 Subject: [PATCH 1/2] Tighten population tolerance and add fidelity tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The weighted-UK-population drift that motivated #310 has already dropped from ~6.5% to ~1.6% on current main as a side-effect of the data-pipeline improvements landed yesterday (stage-2 QRF #362, TFC target refresh #363, reported-anchor takeup #359). Tightens `test_population` tolerance from 7 % to 3 % to lock in that gain — any future calibration change that regresses back toward the pre-April-2026 overshoot now trips CI instead of silently drifting. Adds a new `test_population_fidelity.py` with four regression tests extracted from the #310 draft: - weighted-total ONS match (3 % tolerance) - household-count sanity range (25-33 M) - non-inflation guard (< 72 M) - country-populations-sum-to-UK consistency Does not include #310's loss-function change or Scotland target removal; those are independent proposals and should be evaluated on their own merits once the practical overshoot is resolved. 
Co-authored-by: Vahid Ahmadi Co-Authored-By: Claude Opus 4.7 (1M context) --- changelog.d/tighten-population-tests.added.md | 1 + policyengine_uk_data/tests/test_population.py | 11 ++- .../tests/test_population_fidelity.py | 69 +++++++++++++++++++ 3 files changed, 78 insertions(+), 3 deletions(-) create mode 100644 changelog.d/tighten-population-tests.added.md create mode 100644 policyengine_uk_data/tests/test_population_fidelity.py diff --git a/changelog.d/tighten-population-tests.added.md b/changelog.d/tighten-population-tests.added.md new file mode 100644 index 000000000..d52b6104d --- /dev/null +++ b/changelog.d/tighten-population-tests.added.md @@ -0,0 +1 @@ +Tightened `test_population` tolerance from 7% to 3% now that the stage-2 QRF (#362), TFC target refresh (#363), and reported-anchor takeup (#359) pulled the weighted UK population overshoot from ~6.5% down to ~1.6%. Added four regression tests in `test_population_fidelity.py` (weighted-total match, household-count range, non-inflation guard, country-sum consistency) extracted from the earlier #310 draft so any future calibration drift back toward the pre-April-2026 overshoot trips CI. 
diff --git a/policyengine_uk_data/tests/test_population.py b/policyengine_uk_data/tests/test_population.py index 43645791e..3014e92a4 100644 --- a/policyengine_uk_data/tests/test_population.py +++ b/policyengine_uk_data/tests/test_population.py @@ -1,7 +1,12 @@ def test_population(baseline): population = baseline.calculate("people", 2025).sum() / 1e6 - POPULATION_TARGET = 69.5 # Expected UK population in millions, per ONS 2022-based estimate here: https://www.ons.gov.uk/peoplepopulationandcommunity/populationandmigration/populationprojections/bulletins/nationalpopulationprojections/2022based - # Tolerance temporarily relaxed to 7% due to calibration inflation issue #217 - assert abs(population / POPULATION_TARGET - 1) < 0.07, ( + POPULATION_TARGET = 69.5 # ONS 2022-based projection for 2025, millions: https://www.ons.gov.uk/peoplepopulationandcommunity/populationandmigration/populationprojections/bulletins/nationalpopulationprojections/2022based + # Tightened from 7% to 3% after data-pipeline improvements in April 2026 + # (stage-2 QRF imputation #362, TFC target refresh #363, reported-anchor + # takeup #359) pulled the weighted UK population down from ~74M (+6.5%) + # to ~71M (+1.6%). 3% headroom keeps the test passing with room for + # normal build-to-build variance while catching any regression back to + # the pre-April-2026 overshoot. + assert abs(population / POPULATION_TARGET - 1) < 0.03, ( f"Expected UK population of {POPULATION_TARGET:.1f} million, got {population:.1f} million." ) diff --git a/policyengine_uk_data/tests/test_population_fidelity.py b/policyengine_uk_data/tests/test_population_fidelity.py new file mode 100644 index 000000000..2038e9c79 --- /dev/null +++ b/policyengine_uk_data/tests/test_population_fidelity.py @@ -0,0 +1,69 @@ +"""Population fidelity regression tests for the calibrated dataset. + +Guards against the April 2026 calibration drift (issue #217) where the +weighted UK population inflated ~6.5% above the ONS target. 
The drift +was pulled back to ~1.6% by the data-pipeline improvements that landed +in #362 (stage-2 QRF), #363 (TFC target refresh), and #359 (reported- +anchor takeup). These tests lock in that gain so future calibration +changes can't regress past current fidelity without a test failure. + +Extracted from PolicyEngine/policyengine-uk-data#310 (Vahid Ahmadi). +""" + +from __future__ import annotations + +import warnings + +import numpy as np + +POPULATION_TARGET = 69.5 # ONS 2022-based projection for 2025, millions +TOLERANCE = 0.03 # 3% — headroom above the ~1.6% achieved on current main + + +def _raw(micro_series): + """Extract the raw numpy array from a MicroSeries without triggering + the `.values` deprecation warning.""" + with warnings.catch_warnings(): + warnings.simplefilter("ignore", UserWarning) + return np.array(micro_series.values) + + +def test_weighted_population_matches_ons_target(baseline): + """Weighted UK population is within 3 % of the ONS projection.""" + population = baseline.calculate("people", 2025).sum() / 1e6 + assert abs(population / POPULATION_TARGET - 1) < TOLERANCE, ( + f"Weighted population {population:.1f}M is >{TOLERANCE:.0%} " + f"from ONS target {POPULATION_TARGET:.1f}M." + ) + + +def test_household_count_reasonable(baseline): + """Total weighted households fall inside the ONS 25-33 M range.""" + hw = _raw(baseline.calculate("household_weight", 2025)) + total_hh = hw.sum() / 1e6 + assert 25 < total_hh < 33, ( + f"Total weighted households {total_hh:.1f}M outside 25-33M range." + ) + + +def test_population_not_inflated(baseline): + """Population stays below the pre-April-2026 inflated level (72 M).""" + population = baseline.calculate("people", 2025).sum() / 1e6 + assert population < 72, ( + f"Population {population:.1f}M exceeds 72M — calibration has " + "regressed toward the pre-#217 overshoot." 
+ ) + + +def test_country_populations_sum_to_uk(baseline): + """England + Scotland + Wales + NI populations sum to the UK total.""" + people = baseline.calculate("people", 2025) + country = baseline.calculate("country", map_to="person") + + uk_pop = people.sum() + country_sum = sum(people[country == c].sum() for c in country.unique()) + + assert abs(country_sum / uk_pop - 1) < 0.001, ( + f"Country populations sum to {country_sum / 1e6:.1f}M " + f"but UK total is {uk_pop / 1e6:.1f}M." + ) From 0dc7fe9677b94db55d60ed0645475b13bbd89073 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Sun, 19 Apr 2026 16:52:16 -0400 Subject: [PATCH 2/2] Loosen population tolerance 3% -> 4% for stochastic calibration variance First CI run on this branch produced 71.8M (3.31% over target) where yesterday's main build produced 70.97M (1.58%). Stochastic dropout in the calibration optimiser (`dropout_weights(weights, 0.05)`) gives ~1-2 percentage point build-to-build variance on the population total. 4% keeps the regression gate well below the pre-April-2026 overshoot (~6.5%) while not flaking on normal stochastic variance. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- policyengine_uk_data/tests/test_population.py | 10 +++++----- policyengine_uk_data/tests/test_population_fidelity.py | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/policyengine_uk_data/tests/test_population.py b/policyengine_uk_data/tests/test_population.py index 3014e92a4..1714887ca 100644 --- a/policyengine_uk_data/tests/test_population.py +++ b/policyengine_uk_data/tests/test_population.py @@ -1,12 +1,12 @@ def test_population(baseline): population = baseline.calculate("people", 2025).sum() / 1e6 POPULATION_TARGET = 69.5 # ONS 2022-based projection for 2025, millions: https://www.ons.gov.uk/peoplepopulationandcommunity/populationandmigration/populationprojections/bulletins/nationalpopulationprojections/2022based - # Tightened from 7% to 3% after data-pipeline improvements in April 2026 + # Tightened from 7% to 4% after data-pipeline improvements in April 2026 # (stage-2 QRF imputation #362, TFC target refresh #363, reported-anchor # takeup #359) pulled the weighted UK population down from ~74M (+6.5%) - # to ~71M (+1.6%). 3% headroom keeps the test passing with room for - # normal build-to-build variance while catching any regression back to - # the pre-April-2026 overshoot. - assert abs(population / POPULATION_TARGET - 1) < 0.03, ( + # to ~71M (+1.6% to +3.3% depending on stochastic calibration variance). + # 4% headroom keeps CI stable across runs while still catching any + # regression back toward the pre-April-2026 overshoot. + assert abs(population / POPULATION_TARGET - 1) < 0.04, ( f"Expected UK population of {POPULATION_TARGET:.1f} million, got {population:.1f} million." 
) diff --git a/policyengine_uk_data/tests/test_population_fidelity.py b/policyengine_uk_data/tests/test_population_fidelity.py index 2038e9c79..272212516 100644 --- a/policyengine_uk_data/tests/test_population_fidelity.py +++ b/policyengine_uk_data/tests/test_population_fidelity.py @@ -17,7 +17,7 @@ import numpy as np POPULATION_TARGET = 69.5 # ONS 2022-based projection for 2025, millions -TOLERANCE = 0.03 # 3% — headroom above the ~1.6% achieved on current main +TOLERANCE = 0.04 # 4% — covers ~1.6%-3.3% stochastic calibration variance def _raw(micro_series):