Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 16 additions & 5 deletions policyengine_us_data/datasets/puf/uprate_puf.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
import pandas as pd
from functools import lru_cache

import numpy as np
from policyengine_us_data.storage import STORAGE_FOLDER
import pandas as pd

from policyengine_us_data.storage import CALIBRATION_FOLDER

ITMDED_GROW_RATE = 0.02 # annual growth rate in itemized deduction amounts

Expand Down Expand Up @@ -87,11 +90,19 @@
"E09800",
]

if (STORAGE_FOLDER / "soi.csv").exists():
soi = pd.read_csv(STORAGE_FOLDER / "soi.csv")

@lru_cache(maxsize=1)
def load_soi_aggregates() -> pd.DataFrame:
    """Load the SOI aggregate targets table from the calibration folder.

    The table is read from ``CALIBRATION_FOLDER / "soi_targets.csv"`` and its
    ``Value`` column is cast to ``float``. The result is memoized with
    ``lru_cache(maxsize=1)``, so after the first successful call every caller
    receives the same ``DataFrame`` object without re-reading the file.

    Returns:
        The parsed SOI targets table.

    Raises:
        FileNotFoundError: If the targets CSV does not exist. Exceptions are
            not cached by ``lru_cache``, so a later call checks again.
    """
    targets_file = CALIBRATION_FOLDER / "soi_targets.csv"
    if targets_file.exists():
        table = pd.read_csv(targets_file)
        table["Value"] = table["Value"].astype(float)
        return table
    raise FileNotFoundError(f"No SOI aggregate file found at {targets_file}")


def get_soi_aggregate(variable, year, is_count):
soi = load_soi_aggregates()
if variable == "adjusted_gross_income" and is_count:
# AGI isn't treated like the other variables
return get_soi_aggregate("count", year, True)
Expand All @@ -101,7 +112,7 @@ def get_soi_aggregate(variable, year, is_count):
agi_lower = soi["AGI lower bound"] == -np.inf
agi_upper = soi["AGI upper bound"] == np.inf
count_status = soi["Count"] == is_count
non_taxable_only = soi["Taxable only"] == False
non_taxable_only = ~soi["Taxable only"]

return (
soi[
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,6 @@
local_folder=FOLDER,
version=None,
)
download(
repo="policyengine/irs-soi-puf",
repo_filename="soi.csv",
local_folder=FOLDER,
version=None,
)
download(
repo="policyengine/irs-soi-puf",
repo_filename="np2023_d5_mid.csv",
Expand Down
110 changes: 110 additions & 0 deletions tests/unit/datasets/test_uprate_puf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
import importlib.util
import sys
import types
from contextlib import contextmanager
from pathlib import Path

import pandas as pd
import pytest


# This file lives at tests/unit/datasets/, so four ``.parent`` hops from the
# resolved file path land on the repository root.
REPO_ROOT = Path(__file__).resolve().parent.parent.parent.parent
# Directory of the ``policyengine_us_data`` package sources inside the repo.
PACKAGE_ROOT = REPO_ROOT / "policyengine_us_data"


@contextmanager
def load_uprate_puf_module(storage_root: Path):
    """Yield a freshly executed ``uprate_puf`` module wired to *storage_root*.

    While the context is active, the ``policyengine_us_data`` package, its
    ``datasets`` and ``datasets.puf`` subpackages are replaced in
    ``sys.modules`` by empty stub packages whose ``__path__`` points at the
    real source directories. The ``storage`` module is executed from its real
    ``__init__.py`` and then has ``STORAGE_FOLDER`` and ``CALIBRATION_FOLDER``
    overwritten so they point inside *storage_root*. Finally ``uprate_puf.py``
    is executed from source and yielded.

    On exit (normal or exceptional) every touched ``sys.modules`` entry is
    removed and whatever was registered before entry is put back.

    Args:
        storage_root: Directory that stands in for the package's storage
            folder; calibration targets are looked up under
            ``storage_root / "calibration_targets"``.

    Yields:
        The executed ``policyengine_us_data.datasets.puf.uprate_puf`` module.
    """
    module_names = [
        "policyengine_us_data.datasets.puf.uprate_puf",
        "policyengine_us_data.datasets.puf",
        "policyengine_us_data.datasets",
        "policyengine_us_data.storage",
        "policyengine_us_data",
    ]
    # Remember what was registered (None when absent) while evicting it.
    previously_loaded = {}
    for dotted_name in module_names:
        previously_loaded[dotted_name] = sys.modules.pop(dotted_name, None)

    try:
        # Stub packages: importable namespaces with a real __path__ but no
        # executed __init__ code.
        stub_locations = (
            ("policyengine_us_data", PACKAGE_ROOT),
            ("policyengine_us_data.datasets", PACKAGE_ROOT / "datasets"),
            (
                "policyengine_us_data.datasets.puf",
                PACKAGE_ROOT / "datasets" / "puf",
            ),
        )
        for dotted_name, folder in stub_locations:
            stub = types.ModuleType(dotted_name)
            stub.__path__ = [str(folder)]
            sys.modules[dotted_name] = stub

        storage_dir = PACKAGE_ROOT / "storage"
        storage_spec = importlib.util.spec_from_file_location(
            "policyengine_us_data.storage",
            storage_dir / "__init__.py",
            submodule_search_locations=[str(storage_dir)],
        )
        storage_module = importlib.util.module_from_spec(storage_spec)
        assert storage_spec.loader is not None
        # Register before executing so imports triggered during execution
        # resolve to this module object.
        sys.modules["policyengine_us_data.storage"] = storage_module
        storage_spec.loader.exec_module(storage_module)
        # Redirect the folders only after the real module body has run.
        storage_module.STORAGE_FOLDER = storage_root
        storage_module.CALIBRATION_FOLDER = storage_root / "calibration_targets"

        uprate_spec = importlib.util.spec_from_file_location(
            "policyengine_us_data.datasets.puf.uprate_puf",
            PACKAGE_ROOT / "datasets" / "puf" / "uprate_puf.py",
        )
        uprate_module = importlib.util.module_from_spec(uprate_spec)
        assert uprate_spec.loader is not None
        sys.modules["policyengine_us_data.datasets.puf.uprate_puf"] = uprate_module
        uprate_spec.loader.exec_module(uprate_module)
        yield uprate_module
    finally:
        # Drop everything this context registered, then reinstate the
        # modules that existed beforehand.
        for dotted_name in module_names:
            sys.modules.pop(dotted_name, None)
            earlier = previously_loaded[dotted_name]
            if earlier is not None:
                sys.modules[dotted_name] = earlier


def write_soi_targets(path: Path) -> None:
    """Write a two-row SOI targets CSV fixture to *path*.

    Parent directories are created when missing. Both rows are for year 2021,
    filing status ``"All"``, an unbounded AGI range, not taxable-only, and the
    full population: one ``employment_income`` amount row (value 200.0) and
    one ``count`` row (value 100.0).

    Args:
        path: Destination CSV file.
    """

    def build_row(variable: str, is_count: bool, value: float) -> dict:
        # Key order fixes the CSV column order.
        return {
            "Year": 2021,
            "Variable": variable,
            "Filing status": "All",
            "AGI lower bound": float("-inf"),
            "AGI upper bound": float("inf"),
            "Count": is_count,
            "Taxable only": False,
            "Full population": True,
            "Value": value,
        }

    path.parent.mkdir(parents=True, exist_ok=True)
    rows = [
        build_row("employment_income", False, 200.0),
        build_row("count", True, 100.0),
    ]
    fixture = pd.DataFrame(rows)
    fixture.to_csv(path, index=False)


def test_get_soi_aggregate_reads_tracked_soi_targets(tmp_path: Path):
    """``get_soi_aggregate`` returns values from the calibration targets CSV."""
    targets_csv = tmp_path / "calibration_targets" / "soi_targets.csv"
    write_soi_targets(targets_csv)
    with load_uprate_puf_module(tmp_path) as uprate_puf:
        income_total = uprate_puf.get_soi_aggregate("employment_income", 2021, False)
        assert income_total == 200.0
        count_total = uprate_puf.get_soi_aggregate("count", 2021, True)
        assert count_total == 100.0


def test_get_soi_aggregate_raises_clear_error_when_missing(tmp_path: Path):
    """Loading without a targets CSV fails with a descriptive error."""
    expected_message = "No SOI aggregate file found at"
    # No fixture is written, so the calibration folder under tmp_path is empty.
    with load_uprate_puf_module(tmp_path) as uprate_puf:
        with pytest.raises(FileNotFoundError, match=expected_message):
            uprate_puf.load_soi_aggregates()
Loading