Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changelog.d/fix-consumption-weeks-rng.fixed.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Replace bare `* 52` weekly-to-annual conversion in LCFS imputation with the shared `WEEKS_IN_YEAR = 365.25 / 7` constant used by `datasets/frs.py`, and replace two `np.random.seed(42)` calls with local `np.random.default_rng(42)` so consumption imputation stops mutating the process-wide RNG state.
7 changes: 7 additions & 0 deletions policyengine_uk_data/datasets/frs.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,13 @@
from policyengine_uk_data.parameters import load_take_up_rate, load_parameter


# Canonical weeks-per-year conversion factor for annualising weekly survey
# variables. 365.25 / 7 ≈ 52.1786 accounts for leap years; using the rounded
# integer 52 would under-count by ~0.34%. Exposed at module level so sibling
# loaders (e.g. LCFS/ETB in `datasets/imputations/consumption.py`) can import
# the same value rather than re-defining `* 52` locally and drifting.
WEEKS_IN_YEAR = 365.25 / 7

LEGACY_JOBSEEKER_MIN_AGE = 18
HOURS_WORKED_WEEKS_PER_YEAR = 52
ESA_MIN_AGE = 16
Expand Down
28 changes: 21 additions & 7 deletions policyengine_uk_data/datasets/imputations/consumption.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,16 @@
from policyengine_uk_data.storage import STORAGE_FOLDER
from policyengine_uk.data import UKSingleYearDataset
from policyengine_uk import Microsimulation
from policyengine_uk_data.datasets.frs import WEEKS_IN_YEAR

LCFS_TAB_FOLDER = STORAGE_FOLDER / "lcfs_2021_22"

# Default seed for the stochastic ICE-vehicle flag drawn from
# `NTS_2024_ICE_VEHICLE_SHARE`. Kept at 42 for backward compatibility with
# existing artefact fingerprints; callers can override via the fixture's
# local RNG rather than the process-wide np.random global.
_HAS_FUEL_SEED = 42

# EV/ICE vehicle mix from NTS 2024
NTS_2024_ICE_VEHICLE_SHARE = 0.90

Expand Down Expand Up @@ -406,9 +413,12 @@ def create_has_fuel_model():

num_vehicles = was["vcarnr7"].fillna(0).clip(lower=0)
has_vehicle = num_vehicles > 0
np.random.seed(42)
# Use a local RNG so we don't mutate the global np.random state (which
# would silently change any unrelated consumer of np.random that runs
# after this function).
rng = np.random.default_rng(_HAS_FUEL_SEED)
has_fuel = (
has_vehicle & (np.random.random(len(was)) < NTS_2024_ICE_VEHICLE_SHARE)
has_vehicle & (rng.random(len(was)) < NTS_2024_ICE_VEHICLE_SHARE)
).astype(float)

was_df = pd.DataFrame(
Expand Down Expand Up @@ -481,18 +491,21 @@ def generate_lcfs_table(lcfs_person: pd.DataFrame, lcfs_household: pd.DataFrame)

household = household.rename(columns=CONSUMPTION_VARIABLE_RENAMES)

# Annualise weekly LCFS values (× 52)
# Annualise weekly LCFS values. Use the same WEEKS_IN_YEAR constant
# (365.25 / 7 ≈ 52.1786) as `datasets/frs.py` rather than a bare `* 52`,
# which underestimates annual totals by ~0.34% and skews VAT / energy
# imputation targets against FRS income.
annualise = list(CONSUMPTION_VARIABLE_RENAMES.values()) + [
"hbai_household_net_income",
"household_gross_income",
"electricity_consumption",
"gas_consumption",
]
for variable in annualise:
household[variable] = household[variable] * 52
household[variable] = household[variable] * WEEKS_IN_YEAR
for variable in PERSON_LCF_RENAMES.values():
household[variable] = (
person[variable].groupby(person.case).sum()[household.case] * 52
person[variable].groupby(person.case).sum()[household.case] * WEEKS_IN_YEAR
)
household.household_weight *= 1_000

Expand Down Expand Up @@ -577,9 +590,10 @@ def impute_consumption(dataset: UKSingleYearDataset) -> UKSingleYearDataset:
sim = Microsimulation(dataset=dataset)
num_vehicles = sim.calculate("num_vehicles", map_to="household").values

np.random.seed(42)
# Local RNG — see note at module level (_HAS_FUEL_SEED).
rng = np.random.default_rng(_HAS_FUEL_SEED)
has_vehicle = num_vehicles > 0
is_ice = np.random.random(len(num_vehicles)) < NTS_2024_ICE_VEHICLE_SHARE
is_ice = rng.random(len(num_vehicles)) < NTS_2024_ICE_VEHICLE_SHARE
has_fuel_consumption = (has_vehicle & is_ice).astype(float)
dataset.household["has_fuel_consumption"] = has_fuel_consumption

Expand Down
76 changes: 76 additions & 0 deletions policyengine_uk_data/tests/test_consumption_weeks_rng.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
"""Regression tests for `datasets/imputations/consumption.py`.

Covers the two sub-fixes in bug-hunt finding U6:

- Weekly → annual conversion used a bare ``* 52`` while the sibling FRS
loader uses ``WEEKS_IN_YEAR = 365.25 / 7 ≈ 52.1786``. The ~0.34% gap
skews VAT / energy imputation targets against FRS income.
- Two ``np.random.seed(42)`` calls mutated the global RNG state, so any
unrelated numpy random call after consumption imputation changed output.
"""

from __future__ import annotations

import importlib.util

import numpy as np
import pytest

if importlib.util.find_spec("policyengine_uk") is None:
pytest.skip(
"policyengine_uk not available in test environment",
allow_module_level=True,
)


def test_consumption_imports_shared_weeks_in_year():
"""consumption.py should use the same WEEKS_IN_YEAR as frs.py."""
from policyengine_uk_data.datasets.frs import WEEKS_IN_YEAR as frs_value
from policyengine_uk_data.datasets.imputations import (
consumption as consumption_module,
)

assert frs_value == 365.25 / 7
# Imported at module scope, not re-computed locally with a different value.
assert consumption_module.WEEKS_IN_YEAR == frs_value


def test_consumption_module_no_longer_calls_np_random_seed():
"""The fixed module must not reach into the process-wide RNG state.

The buggy version called `np.random.seed(42)` in two places, which
mutated the global RNG and silently changed output for any later
numpy random consumer in the same process.
"""
import inspect

from policyengine_uk_data.datasets.imputations import (
consumption as consumption_module,
)

src = inspect.getsource(consumption_module)
assert "np.random.seed(" not in src, (
"consumption.py still mutates the global RNG state via "
"np.random.seed(...) — use np.random.default_rng(seed) instead."
)


def test_consumption_does_not_perturb_global_rng_on_import():
"""Importing consumption.py must not change global RNG state.

Regression against the original bug where running module-level code
paths (e.g. during test collection) could have reseeded the global RNG.
"""
np.random.seed(1234)
before = np.random.random(3)

# Force reimport to exercise module-level side effects.
import importlib

from policyengine_uk_data.datasets.imputations import consumption

importlib.reload(consumption)

np.random.seed(1234)
after = np.random.random(3)
np.testing.assert_allclose(before, after)
Loading