diff --git a/policyengine_us_data/datasets/puf/uprate_puf.py b/policyengine_us_data/datasets/puf/uprate_puf.py index 961446156..73611ca33 100644 --- a/policyengine_us_data/datasets/puf/uprate_puf.py +++ b/policyengine_us_data/datasets/puf/uprate_puf.py @@ -1,6 +1,9 @@ -import pandas as pd +from functools import lru_cache + import numpy as np -from policyengine_us_data.storage import STORAGE_FOLDER +import pandas as pd + +from policyengine_us_data.storage import CALIBRATION_FOLDER ITMDED_GROW_RATE = 0.02 # annual growth rate in itemized deduction amounts @@ -87,11 +90,19 @@ "E09800", ] -if (STORAGE_FOLDER / "soi.csv").exists(): - soi = pd.read_csv(STORAGE_FOLDER / "soi.csv") + +@lru_cache(maxsize=1) +def load_soi_aggregates() -> pd.DataFrame: + path = CALIBRATION_FOLDER / "soi_targets.csv" + if not path.exists(): + raise FileNotFoundError(f"No SOI aggregate file found at {path}") + soi = pd.read_csv(path) + soi["Value"] = soi["Value"].astype(float) + return soi def get_soi_aggregate(variable, year, is_count): + soi = load_soi_aggregates() if variable == "adjusted_gross_income" and is_count: # AGI isn't treated like the other variables return get_soi_aggregate("count", year, True) @@ -101,7 +112,7 @@ def get_soi_aggregate(variable, year, is_count): agi_lower = soi["AGI lower bound"] == -np.inf agi_upper = soi["AGI upper bound"] == np.inf count_status = soi["Count"] == is_count - non_taxable_only = soi["Taxable only"] == False + non_taxable_only = ~soi["Taxable only"] return ( soi[ diff --git a/policyengine_us_data/storage/download_private_prerequisites.py b/policyengine_us_data/storage/download_private_prerequisites.py index 8a4240cf2..755d1e7cf 100644 --- a/policyengine_us_data/storage/download_private_prerequisites.py +++ b/policyengine_us_data/storage/download_private_prerequisites.py @@ -20,12 +20,6 @@ local_folder=FOLDER, version=None, ) -download( - repo="policyengine/irs-soi-puf", - repo_filename="soi.csv", - local_folder=FOLDER, - version=None, -) download( repo="policyengine/irs-soi-puf", repo_filename="np2023_d5_mid.csv", diff --git a/tests/unit/datasets/test_uprate_puf.py b/tests/unit/datasets/test_uprate_puf.py new file mode 100644 index 000000000..b4cccca3c --- /dev/null +++ b/tests/unit/datasets/test_uprate_puf.py @@ -0,0 +1,110 @@ +import importlib.util +import sys +import types +from contextlib import contextmanager +from pathlib import Path + +import pandas as pd +import pytest + + +REPO_ROOT = Path(__file__).resolve().parent.parent.parent.parent +PACKAGE_ROOT = REPO_ROOT / "policyengine_us_data" + + +@contextmanager +def load_uprate_puf_module(storage_root: Path): + module_names = [ + "policyengine_us_data.datasets.puf.uprate_puf", + "policyengine_us_data.datasets.puf", + "policyengine_us_data.datasets", + "policyengine_us_data.storage", + "policyengine_us_data", + ] + original_modules = {name: sys.modules.get(name) for name in module_names} + for name in module_names: + sys.modules.pop(name, None) + + try: + package = types.ModuleType("policyengine_us_data") + package.__path__ = [str(PACKAGE_ROOT)] + sys.modules["policyengine_us_data"] = package + + datasets_package = types.ModuleType("policyengine_us_data.datasets") + datasets_package.__path__ = [str(PACKAGE_ROOT / "datasets")] + sys.modules["policyengine_us_data.datasets"] = datasets_package + + puf_package = types.ModuleType("policyengine_us_data.datasets.puf") + puf_package.__path__ = [str(PACKAGE_ROOT / "datasets" / "puf")] + sys.modules["policyengine_us_data.datasets.puf"] = puf_package + + storage_spec = importlib.util.spec_from_file_location( + "policyengine_us_data.storage", + PACKAGE_ROOT / "storage" / "__init__.py", + submodule_search_locations=[str(PACKAGE_ROOT / "storage")], + ) + storage_module = importlib.util.module_from_spec(storage_spec) + assert storage_spec.loader is not None + sys.modules["policyengine_us_data.storage"] = storage_module + storage_spec.loader.exec_module(storage_module) + storage_module.STORAGE_FOLDER = storage_root + storage_module.CALIBRATION_FOLDER = storage_root / "calibration_targets" + + uprate_spec = importlib.util.spec_from_file_location( + "policyengine_us_data.datasets.puf.uprate_puf", + PACKAGE_ROOT / "datasets" / "puf" / "uprate_puf.py", + ) + uprate_module = importlib.util.module_from_spec(uprate_spec) + assert uprate_spec.loader is not None + sys.modules["policyengine_us_data.datasets.puf.uprate_puf"] = uprate_module + uprate_spec.loader.exec_module(uprate_module) + yield uprate_module + finally: + for name in module_names: + sys.modules.pop(name, None) + for name, module in original_modules.items(): + if module is not None: + sys.modules[name] = module + + +def write_soi_targets(path: Path) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + pd.DataFrame( + [ + { + "Year": 2021, + "Variable": "employment_income", + "Filing status": "All", + "AGI lower bound": float("-inf"), + "AGI upper bound": float("inf"), + "Count": False, + "Taxable only": False, + "Full population": True, + "Value": 200.0, + }, + { + "Year": 2021, + "Variable": "count", + "Filing status": "All", + "AGI lower bound": float("-inf"), + "AGI upper bound": float("inf"), + "Count": True, + "Taxable only": False, + "Full population": True, + "Value": 100.0, + }, + ] + ).to_csv(path, index=False) + + +def test_get_soi_aggregate_reads_tracked_soi_targets(tmp_path: Path): + write_soi_targets(tmp_path / "calibration_targets" / "soi_targets.csv") + with load_uprate_puf_module(tmp_path) as module: + assert module.get_soi_aggregate("employment_income", 2021, False) == 200.0 + assert module.get_soi_aggregate("count", 2021, True) == 100.0 + + +def test_get_soi_aggregate_raises_clear_error_when_missing(tmp_path: Path): + with load_uprate_puf_module(tmp_path) as module: + with pytest.raises(FileNotFoundError, match="No SOI aggregate file found at"): + module.load_soi_aggregates()