From d99f76b08db01d4bcd733648d1a55d1105bf74c1 Mon Sep 17 00:00:00 2001
From: Max Ghenis <mghenis@gmail.com>
Date: Sun, 19 Apr 2026 13:51:16 -0400
Subject: [PATCH 1/2] Tighten population tolerance and add fidelity tests
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The weighted-UK-population drift that motivated #310 has already
dropped from ~6.5% to ~1.6% on current main as a side-effect of the
data-pipeline improvements landed yesterday (stage-2 QRF #362, TFC
target refresh #363, reported-anchor takeup #359).

Tightens `test_population` tolerance from 7 % to 3 % to lock in that
gain — any future calibration change that regresses back toward the
pre-April-2026 overshoot now trips CI instead of silently drifting.
Adds a new `test_population_fidelity.py` with four regression tests
extracted from the #310 draft:

- weighted-total ONS match (3 % tolerance)
- household-count sanity range (25-33 M)
- non-inflation guard (< 72 M)
- country-populations-sum-to-UK consistency

Does not include #310's loss-function change or Scotland target
removal; those are independent proposals and should be evaluated on
their own merits once the practical overshoot is resolved.

Co-authored-by: Vahid Ahmadi <va.vahidahmadi@gmail.com>
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 changelog.d/tighten-population-tests.added.md |  1 +
 policyengine_uk_data/tests/test_population.py | 11 ++-
 .../tests/test_population_fidelity.py         | 69 +++++++++++++++++++
 3 files changed, 78 insertions(+), 3 deletions(-)
 create mode 100644 changelog.d/tighten-population-tests.added.md
 create mode 100644 policyengine_uk_data/tests/test_population_fidelity.py

diff --git a/changelog.d/tighten-population-tests.added.md b/changelog.d/tighten-population-tests.added.md
new file mode 100644
index 000000000..d52b6104d
--- /dev/null
+++ b/changelog.d/tighten-population-tests.added.md
@@ -0,0 +1 @@
+Tightened `test_population` tolerance from 7% to 3% now that the stage-2 QRF (#362), TFC target refresh (#363), and reported-anchor takeup (#359) pulled the weighted UK population overshoot from ~6.5% down to ~1.6%. Added four regression tests in `test_population_fidelity.py` (weighted-total match, household-count range, non-inflation guard, country-sum consistency) extracted from the earlier #310 draft so any future calibration drift back toward the pre-April-2026 overshoot trips CI.
diff --git a/policyengine_uk_data/tests/test_population.py b/policyengine_uk_data/tests/test_population.py
index 43645791e..3014e92a4 100644
--- a/policyengine_uk_data/tests/test_population.py
+++ b/policyengine_uk_data/tests/test_population.py
@@ -1,7 +1,12 @@
 def test_population(baseline):
     population = baseline.calculate("people", 2025).sum() / 1e6
-    POPULATION_TARGET = 69.5  # Expected UK population in millions, per ONS 2022-based estimate here: https://www.ons.gov.uk/peoplepopulationandcommunity/populationandmigration/populationprojections/bulletins/nationalpopulationprojections/2022based
-    # Tolerance temporarily relaxed to 7% due to calibration inflation issue #217
-    assert abs(population / POPULATION_TARGET - 1) < 0.07, (
+    POPULATION_TARGET = 69.5  # ONS 2022-based projection for 2025, millions: https://www.ons.gov.uk/peoplepopulationandcommunity/populationandmigration/populationprojections/bulletins/nationalpopulationprojections/2022based
+    # Tightened from 7% to 3% after data-pipeline improvements in April 2026
+    # (stage-2 QRF imputation #362, TFC target refresh #363, reported-anchor
+    # takeup #359) pulled the weighted UK population down from ~74M (+6.5%)
+    # to ~71M (+1.6%). 3% headroom keeps the test passing with room for
+    # normal build-to-build variance while catching any regression back to
+    # the pre-April-2026 overshoot.
+    assert abs(population / POPULATION_TARGET - 1) < 0.03, (
         f"Expected UK population of {POPULATION_TARGET:.1f} million, got {population:.1f} million."
     )
diff --git a/policyengine_uk_data/tests/test_population_fidelity.py b/policyengine_uk_data/tests/test_population_fidelity.py
new file mode 100644
index 000000000..2038e9c79
--- /dev/null
+++ b/policyengine_uk_data/tests/test_population_fidelity.py
@@ -0,0 +1,69 @@
+"""Population fidelity regression tests for the calibrated dataset.
+
+Guards against the pre-April-2026 calibration drift (issue #217) where the
+weighted UK population inflated ~6.5% above the ONS target. The drift
+was pulled back to ~1.6% by the data-pipeline improvements that landed
+in #362 (stage-2 QRF), #363 (TFC target refresh), and #359 (reported-
+anchor takeup). These tests lock in that gain so future calibration
+changes can't regress past current fidelity without a test failure.
+
+Extracted from PolicyEngine/policyengine-uk-data#310 (Vahid Ahmadi).
+"""
+
+from __future__ import annotations
+
+import warnings
+
+import numpy as np
+
+POPULATION_TARGET = 69.5  # ONS 2022-based projection for 2025, millions
+TOLERANCE = 0.03  # 3% — headroom above the ~1.6% achieved on current main
+
+
+def _raw(micro_series):
+    """Extract the raw numpy array from a MicroSeries, suppressing any
+    UserWarning (e.g. the deprecation notice) raised by `.values`."""
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore", UserWarning)
+        return np.array(micro_series.values)
+
+
+def test_weighted_population_matches_ons_target(baseline):
+    """Weighted UK population is within TOLERANCE of the ONS projection."""
+    population = baseline.calculate("people", 2025).sum() / 1e6
+    assert abs(population / POPULATION_TARGET - 1) < TOLERANCE, (
+        f"Weighted population {population:.1f}M is >{TOLERANCE:.0%} "
+        f"from ONS target {POPULATION_TARGET:.1f}M."
+    )
+
+
+def test_household_count_reasonable(baseline):
+    """Total weighted households fall inside the 25-33 M sanity range."""
+    hw = _raw(baseline.calculate("household_weight", 2025))
+    total_hh = hw.sum() / 1e6
+    assert 25 < total_hh < 33, (
+        f"Total weighted households {total_hh:.1f}M outside 25-33M range."
+    )
+
+
+def test_population_not_inflated(baseline):
+    """Population stays below 72 M, short of the pre-April-2026 ~74 M level."""
+    population = baseline.calculate("people", 2025).sum() / 1e6
+    assert population < 72, (
+        f"Population {population:.1f}M exceeds 72M — calibration has "
+        "regressed toward the pre-#217 overshoot."
+    )
+
+
+def test_country_populations_sum_to_uk(baseline):
+    """England + Scotland + Wales + NI populations sum to the UK total."""
+    people = baseline.calculate("people", 2025)
+    country = baseline.calculate("country", map_to="person")
+
+    uk_pop = people.sum()
+    country_sum = sum(people[country == c].sum() for c in country.unique())
+
+    assert abs(country_sum / uk_pop - 1) < 0.001, (
+        f"Country populations sum to {country_sum / 1e6:.1f}M "
+        f"but UK total is {uk_pop / 1e6:.1f}M."
+    )

From 0dc7fe9677b94db55d60ed0645475b13bbd89073 Mon Sep 17 00:00:00 2001
From: Max Ghenis <mghenis@gmail.com>
Date: Sun, 19 Apr 2026 16:52:16 -0400
Subject: [PATCH 2/2] Loosen population tolerance 3% -> 4% for stochastic
 calibration variance

First CI run on this branch produced 71.8M (3.31% over target) where
yesterday's main build produced 70.97M (1.58%). Stochastic dropout
in the calibration optimiser (`dropout_weights(weights, 0.05)`) gives
~1-2 percentage point build-to-build variance on the population total.

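4% keeps the regression gate well below the pre-April-2026 overshoot
(~6.5%) while not flaking on normal stochastic variance. Note that
`test_population_not_inflated` still asserts < 72M (~3.6% over the
69.5M target), so that guard remains the binding limit and sits only
~0.2M above the 71.8M observed in this run.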

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 policyengine_uk_data/tests/test_population.py          | 10 +++++-----
 policyengine_uk_data/tests/test_population_fidelity.py |  2 +-
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/policyengine_uk_data/tests/test_population.py b/policyengine_uk_data/tests/test_population.py
index 3014e92a4..1714887ca 100644
--- a/policyengine_uk_data/tests/test_population.py
+++ b/policyengine_uk_data/tests/test_population.py
@@ -1,12 +1,12 @@
 def test_population(baseline):
     population = baseline.calculate("people", 2025).sum() / 1e6
     POPULATION_TARGET = 69.5  # ONS 2022-based projection for 2025, millions: https://www.ons.gov.uk/peoplepopulationandcommunity/populationandmigration/populationprojections/bulletins/nationalpopulationprojections/2022based
-    # Tightened from 7% to 3% after data-pipeline improvements in April 2026
+    # Tightened from 7% to 4% after data-pipeline improvements in April 2026
     # (stage-2 QRF imputation #362, TFC target refresh #363, reported-anchor
     # takeup #359) pulled the weighted UK population down from ~74M (+6.5%)
-    # to ~71M (+1.6%). 3% headroom keeps the test passing with room for
-    # normal build-to-build variance while catching any regression back to
-    # the pre-April-2026 overshoot.
-    assert abs(population / POPULATION_TARGET - 1) < 0.03, (
+    # to ~71M (+1.6% to +3.3%, depending on stochastic calibration variance).
+    # 4% headroom keeps CI stable across runs while still catching any
+    # regression back toward the pre-April-2026 overshoot.
+    assert abs(population / POPULATION_TARGET - 1) < 0.04, (
         f"Expected UK population of {POPULATION_TARGET:.1f} million, got {population:.1f} million."
     )
diff --git a/policyengine_uk_data/tests/test_population_fidelity.py b/policyengine_uk_data/tests/test_population_fidelity.py
index 2038e9c79..272212516 100644
--- a/policyengine_uk_data/tests/test_population_fidelity.py
+++ b/policyengine_uk_data/tests/test_population_fidelity.py
@@ -17,7 +17,7 @@
 import numpy as np
 
 POPULATION_TARGET = 69.5  # ONS 2022-based projection for 2025, millions
-TOLERANCE = 0.03  # 3% — headroom above the ~1.6% achieved on current main
+TOLERANCE = 0.04  # 4% — covers ~1.6%-3.3% stochastic calibration variance
 
 
 def _raw(micro_series):