From a4b880c89882856f6861a99533ea9535cb90d09f Mon Sep 17 00:00:00 2001 From: SvenKlaassen Date: Sat, 31 Jan 2026 09:58:03 +0100 Subject: [PATCH 01/38] first iteration of scalar implementation --- doubleml/__init__.py | 6 + doubleml/double_ml_base.py | 389 +++++++++++++++++++++++++ doubleml/double_ml_framework.py | 2 +- doubleml/double_ml_linear_score.py | 157 ++++++++++ doubleml/double_ml_scalar.py | 452 +++++++++++++++++++++++++++++ 5 files changed, 1005 insertions(+), 1 deletion(-) create mode 100644 doubleml/double_ml_base.py create mode 100644 doubleml/double_ml_linear_score.py create mode 100644 doubleml/double_ml_scalar.py diff --git a/doubleml/__init__.py b/doubleml/__init__.py index d4cbb943..0d046c4f 100644 --- a/doubleml/__init__.py +++ b/doubleml/__init__.py @@ -1,7 +1,10 @@ from .data import DoubleMLClusterData, DoubleMLData, DoubleMLDIDData, DoubleMLPanelData, DoubleMLRDDData, DoubleMLSSMData from .did.did import DoubleMLDID from .did.did_cs import DoubleMLDIDCS +from .double_ml_base import DoubleMLBase from .double_ml_framework import DoubleMLCore, DoubleMLFramework, concat +from .double_ml_linear_score import LinearScoreMixin +from .double_ml_scalar import DoubleMLScalar from .irm.apo import DoubleMLAPO from .irm.apos import DoubleMLAPOS from .irm.cvar import DoubleMLCVAR @@ -20,7 +23,10 @@ __all__ = [ "concat", + "DoubleMLBase", "DoubleMLCore", + "DoubleMLScalar", + "LinearScoreMixin", "DoubleMLFramework", "DoubleMLPLR", "DoubleMLPLIV", diff --git a/doubleml/double_ml_base.py b/doubleml/double_ml_base.py new file mode 100644 index 00000000..19eac58a --- /dev/null +++ b/doubleml/double_ml_base.py @@ -0,0 +1,389 @@ +""" +Abstract base class for Double Machine Learning estimators. 
+""" + +from abc import ABC, abstractmethod +from typing import Dict, List, Optional + +import numpy as np +import pandas as pd + +from .data.base_data import DoubleMLBaseData +from .double_ml_framework import DoubleMLFramework + + +class DoubleMLBase(ABC): + """ + Abstract base class for Double Machine Learning. + + Provides basic properties and abstract methods, e.g. the fit() method. Mainly handles + properties and methods which rely on an initialized DoubleMLFramework object. + + This class serves as the foundation for both DoubleMLScalar (single parameter estimation) + and DoubleMLVector (parameter vector estimation). + + Parameters + ---------- + obj_dml_data : DoubleMLBaseData + The data object for the double machine learning model. + + Attributes + ---------- + framework : DoubleMLFramework + The DoubleMLFramework object containing estimation results and providing inference methods. + thetas : np.ndarray + Estimated parameter values (aggregated across repetitions, shape: (n_thetas,)). + all_thetas : np.ndarray + Estimated parameter values for each repetition (shape: (n_thetas, n_rep)). + ses : np.ndarray + Standard errors of parameter estimates (aggregated across repetitions, shape: (n_thetas,)). + all_ses : np.ndarray + Standard errors for each repetition (shape: (n_thetas, n_rep)). + summary : pd.DataFrame + Summary table with estimates, standard errors, confidence intervals, and p-values. + psi : np.ndarray + Influence function values (shape: (n_obs, n_thetas, n_rep)). + smpls : list + Sample splitting indices used for cross-fitting. + n_folds : int + Number of folds used for cross-fitting. + n_rep : int + Number of repetitions for sample splitting. + """ + + def __init__( + self, + obj_dml_data: DoubleMLBaseData, + ): + """ + Initialize DoubleMLBase base class. + + Parameters + ---------- + obj_dml_data : DoubleMLBaseData + The data object for the double machine learning model. 
+ """ + # Validate and store data + if not isinstance(obj_dml_data, DoubleMLBaseData): + raise TypeError(f"obj_dml_data must be a DoubleMLBaseData instance. " f"Got {type(obj_dml_data)}.") + + self._dml_data = obj_dml_data + self._n_obs = obj_dml_data.n_obs + + # Framework is initialized after fit() + self._framework: Optional[DoubleMLFramework] = None + + # Sample splits are initialized via draw_sample_splitting() + self._smpls: Optional[List] = None + + # ==================== Properties (Delegating to Framework) ==================== + + @property + def framework(self) -> DoubleMLFramework: + """ + The DoubleMLFramework object containing estimation results. + + This object is created after calling fit() and provides methods for + statistical inference (confidence intervals, bootstrap, sensitivity analysis). + + Returns + ------- + DoubleMLFramework + The framework object with estimation results. + + Raises + ------ + ValueError + If framework is not yet initialized (fit() has not been called). + """ + if self._framework is None: + raise ValueError("The framework is not yet initialized. " "Call fit() before accessing estimation results.") + return self._framework + + @property + def thetas(self) -> np.ndarray: + """ + Estimated parameter values (aggregated across repetitions). + + Returns + ------- + np.ndarray + Parameter estimates (shape: (n_thetas,)). + """ + return self.framework.thetas + + @property + def coef(self) -> np.ndarray: + """ + Alias for thetas. Estimated parameter values (aggregated across repetitions). + + Returns + ------- + np.ndarray + Parameter estimates (shape: (n_thetas,)). + """ + return self.thetas + + @property + def all_thetas(self) -> np.ndarray: + """ + Estimated parameter values for each repetition. + + Returns + ------- + np.ndarray + Parameter estimates for all repetitions (shape: (n_thetas, n_rep)). + """ + return self.framework.all_thetas + + @property + def all_coef(self) -> np.ndarray: + """ + Alias for all_thetas. 
Estimated parameter values for each repetition. + + Returns + ------- + np.ndarray + Parameter estimates for all repetitions (shape: (n_thetas, n_rep)). + """ + return self.all_thetas + + @property + def se(self) -> np.ndarray: + """ + Standard errors of parameter estimates (aggregated across repetitions). + + Returns + ------- + np.ndarray + Standard errors (shape: (n_thetas,)). + """ + return self.framework.ses + + @property + def all_ses(self) -> np.ndarray: + """ + Standard errors for each repetition. + + Returns + ------- + np.ndarray + Standard errors for all repetitions (shape: (n_thetas, n_rep)). + """ + return self.framework.all_ses + + @property + def summary(self) -> pd.DataFrame: + """ + Summary table with estimates, standard errors, confidence intervals, and p-values. + + Returns + ------- + pd.DataFrame + Summary statistics for all parameters. + """ + return self.framework.summary + + @property + def psi(self) -> np.ndarray: + """ + Normalized influence function values (scaled score function). + + Returns + ------- + np.ndarray + Influence function values (shape: (n_obs, n_thetas, n_rep)). + """ + return self.framework.scaled_psi + + @property + def smpls(self) -> List: + """ + Sample splitting indices used for cross-fitting. + + Returns + ------- + list + List of sample splitting indices for each repetition. + """ + if self._smpls is None: + raise ValueError("Sample splitting has not been performed. " "Call draw_sample_splitting() first.") + return self._smpls + + @property + def n_obs(self) -> int: + """ + Number of observations. + + Returns + ------- + int + Number of observations in the dataset. + """ + return self._n_obs + + # ==================== Concrete Methods (Delegating to Framework) ==================== + + def confint(self, joint: bool = False, level: float = 0.95) -> pd.DataFrame: + """ + Confidence intervals for DoubleML models. 
+ + Parameters + ---------- + joint : bool, optional + Indicates whether joint confidence intervals are computed. + Default is False. + level : float, optional + The confidence level for the confidence interval. + Default is 0.95. + + Returns + ------- + pd.DataFrame + A DataFrame with confidence intervals. + """ + return self.framework.confint(joint=joint, level=level) + + def bootstrap(self, method: str = "normal", n_rep_boot: int = 500) -> "DoubleMLBase": + """ + Multiplier bootstrap for DoubleML models. + + Parameters + ---------- + method : str, optional + The bootstrap method ('normal', 'Bayes', or 'wild'). + Default is 'normal'. + n_rep_boot : int, optional + The number of bootstrap replications. + Default is 500. + + Returns + ------- + self : DoubleMLBase + The DoubleML estimator with bootstrap results. + """ + self.framework.bootstrap(method=method, n_rep_boot=n_rep_boot) + return self + + def p_adjust(self, method: str = "romano-wolf") -> pd.DataFrame: + """ + Multiple testing adjustment of p-values. + + Parameters + ---------- + method : str, optional + The p-value adjustment method. Default is 'romano-wolf'. + + Returns + ------- + pd.DataFrame + A DataFrame with adjusted p-values. + """ + return self.framework.p_adjust(method=method) + + def sensitivity_analysis( + self, + cf_y: float = 0.03, + cf_d: float = 0.03, + rho: float = 1.0, + level: float = 0.95, + null_hypothesis: float = 0.0, + ) -> Dict: + """ + Sensitivity analysis for DoubleML models. + + Parameters + ---------- + cf_y : float, optional + Percentage of residual variation in outcome explained by unobserved confounders. + Default is 0.03. + cf_d : float, optional + Percentage of residual variation in treatment explained by unobserved confounders. + Default is 0.03. + rho : float, optional + Correlation between unobserved confounders affecting outcome and treatment. + Default is 1.0. + level : float, optional + The confidence level for robustness analysis. + Default is 0.95. 
+ null_hypothesis : float, optional + The null hypothesis value for the parameter. + Default is 0.0. + + Returns + ------- + dict + A dictionary with sensitivity analysis results. + """ + return self.framework.sensitivity_analysis( + cf_y=cf_y, + cf_d=cf_d, + rho=rho, + level=level, + null_hypothesis=null_hypothesis, + ) + + # ==================== Abstract Methods ==================== + + @abstractmethod + def fit(self, **kwargs) -> "DoubleMLBase": + """ + Estimate the DoubleML model. + + This method must be implemented by subclasses (DoubleMLScalar or DoubleMLVector). + + Parameters + ---------- + **kwargs : dict + Additional keyword arguments for fitting. + + Returns + ------- + self : DoubleMLBase + The fitted DoubleML estimator. + """ + pass + + @abstractmethod + def draw_sample_splitting(self) -> "DoubleMLBase": + """ + Draw sample splitting for cross-fitting. + + This method must be implemented by subclasses to generate sample splits + using an appropriate resampling strategy. + + Returns + ------- + self : DoubleMLBase + The DoubleML estimator with initialized sample splits. + """ + pass + + def __str__(self) -> str: + """ + String representation of the DoubleMLBase object. + + Returns + ------- + str + A formatted string summary of the model. + """ + class_name = self.__class__.__name__ + header = f"{'=' * 20} {class_name} Object {'=' * 20}" + + if self._framework is not None: + summary_str = str(self.summary) + return f"{header}\n\n{summary_str}" + else: + return f"{header}\n\nModel not yet fitted. Call fit() first." + + def __repr__(self) -> str: + """ + Representation of the DoubleMLBase object. + + Returns + ------- + str + A string representation of the object. 
+ """ + return self.__str__() diff --git a/doubleml/double_ml_framework.py b/doubleml/double_ml_framework.py index 99941c07..c82ad206 100644 --- a/doubleml/double_ml_framework.py +++ b/doubleml/double_ml_framework.py @@ -32,7 +32,7 @@ class DoubleMLCore: cluster_dict: Optional[Dict] = None sensitivity_elements: Optional[Dict[str, np.ndarray]] = None """ - Core container for DoubleML results . + Internal container for DoubleML raw estimation results. This class stores the main results and diagnostics from a DoubleML estimation, including parameter estimates, standard errors, normalized scores, and (optionally) sensitivity and clustering information. It performs diff --git a/doubleml/double_ml_linear_score.py b/doubleml/double_ml_linear_score.py new file mode 100644 index 00000000..640e031d --- /dev/null +++ b/doubleml/double_ml_linear_score.py @@ -0,0 +1,157 @@ +""" +Mixin for DoubleML models with linear score functions. +""" + +from typing import Dict + +import numpy as np + +from .double_ml_scalar import DoubleMLScalar + + +class LinearScoreMixin(DoubleMLScalar): + """ + Mixin for score functions linear in the target parameter. + + This class extends DoubleMLScalar and implements the _est_causal_pars_and_se() method + for score functions that are linear in the target parameter θ. 
+ + Score form: + ψ(W; θ, η) = θ · ψ_a(W; η) + ψ_b(W; η) + + The solution has a closed form: + θ̂ = -E[ψ_b] / E[ψ_a] + + This applies to many common DoubleML models including: + - Partially Linear Regression (PLR) + - Partially Linear IV Regression (PLIV) + - Interactive Regression Model (IRM) + - Difference-in-Differences (DID) + - and others + + Notes + ----- + Subclasses must implement: + - _nuisance_est(): Estimate nuisance parameters for one fold + - _get_score_elements(): Return dict with 'psi_a' and 'psi_b' arrays of shape (n_obs, n_rep) + """ + + def _est_causal_pars_and_se(self, psi_elements: Dict[str, np.ndarray]) -> None: + """ + Estimate causal parameters and standard errors for linear score. + + This method implements the closed-form solution for linear score functions + and computes standard errors using the influence function. + + All computations use framework convention: (n_obs, n_thetas, n_rep). + + Parameters + ---------- + psi_elements : dict + Dictionary with score elements. Must contain: + - 'psi_a': np.ndarray of shape (n_obs, n_rep) + - 'psi_b': np.ndarray of shape (n_obs, n_rep) + + Notes + ----- + Updates the following attributes (all in framework convention): + - self._all_thetas: Parameter estimates for each repetition (n_thetas=1, n_rep) + - self._all_ses: Standard errors for each repetition (n_thetas=1, n_rep) + - self._psi: Influence function values (n_obs, n_thetas=1, n_rep) + - self._psi_deriv: Score derivative w.r.t. θ (n_obs, n_thetas=1, n_rep) + - self._var_scaling_factors: Variance scaling factors (n_thetas=1,) + """ + # Extract score elements + if "psi_a" not in psi_elements or "psi_b" not in psi_elements: + raise ValueError( + "LinearScoreMixin requires 'psi_a' and 'psi_b' in psi_elements. 
" f"Got keys: {list(psi_elements.keys())}" + ) + + psi_a = psi_elements["psi_a"] # Shape: (n_obs, n_rep) + psi_b = psi_elements["psi_b"] # Shape: (n_obs, n_rep) + + # Validate shapes + if psi_a.shape != psi_b.shape: + raise ValueError(f"psi_a and psi_b must have the same shape. " f"Got psi_a: {psi_a.shape}, psi_b: {psi_b.shape}") + + n_obs, n_rep = psi_a.shape + + if n_rep != self.n_rep: + raise ValueError(f"Score elements have {n_rep} repetitions, but model expects {self.n_rep}.") + + # Compute parameter estimates using closed-form solution + # θ̂ = -E[ψ_b] / E[ψ_a] + mean_psi_a = np.mean(psi_a, axis=0) # (n_rep,) + mean_psi_b = np.mean(psi_b, axis=0) # (n_rep,) + + # Check for zero denominator + if np.any(np.abs(mean_psi_a) < 1e-12): + raise ValueError( + "Division by near-zero detected in linear score estimation. " + "E[psi_a] is very close to zero. This may indicate issues with " + "the nuisance models or data." + ) + + thetas = -mean_psi_b / mean_psi_a # (n_rep,) + + # Store parameter estimates in framework shape: (n_thetas=1, n_rep) + self._all_thetas = thetas[np.newaxis, :] # (1, n_rep) + + # Compute influence function (score evaluated at θ̂) + # ψ(W; θ̂, η) = θ̂ · ψ_a + ψ_b + # Shape: (n_obs, n_rep) + psi = thetas[np.newaxis, :] * psi_a + psi_b # Broadcasting: (1, n_rep) * (n_obs, n_rep) + + # Store influence function in framework shape: (n_obs, n_thetas=1, n_rep) + self._psi = psi[:, np.newaxis, :] # (n_obs, 1, n_rep) + + # Compute score derivative w.r.t. 
θ + # ∂ψ/∂θ = ψ_a + # Store in framework shape: (n_obs, n_thetas=1, n_rep) + self._psi_deriv = psi_a[:, np.newaxis, :] # (n_obs, 1, n_rep) + + # Compute standard errors + # SE = std(ψ) / sqrt(n) + se = np.std(psi, axis=0) / np.sqrt(n_obs) # (n_rep,) + self._all_ses = se[np.newaxis, :] # (1, n_rep) + + # Compute variance scaling factors + # This is 1 / E[∂ψ/∂θ]^2 = 1 / E[ψ_a]^2 + var_scaling_factors = 1.0 / (mean_psi_a**2) # (n_rep,) + + # Take mean across repetitions and store in framework shape: (n_thetas=1,) + self._var_scaling_factors = np.array([np.mean(var_scaling_factors)]) # (1,) + + def _compute_score(self, psi_elements: Dict[str, np.ndarray], coef: float) -> np.ndarray: + """ + Compute the score function value for a given coefficient. + + This is primarily used for verification and diagnostic purposes. + + Parameters + ---------- + psi_elements : dict + Dictionary with 'psi_a' and 'psi_b' of shape (n_obs, n_rep). + coef : float + The coefficient value at which to evaluate the score. + + Returns + ------- + np.ndarray + Score function values, shape (n_obs, n_rep). + """ + psi_a = psi_elements["psi_a"] + psi_b = psi_elements["psi_b"] + + return coef * psi_a + psi_b + + def _score_element_names(self) -> list: + """ + Get the names of score elements for this model. + + Returns + ------- + list + List of score element names: ['psi_a', 'psi_b'] + """ + return ["psi_a", "psi_b"] diff --git a/doubleml/double_ml_scalar.py b/doubleml/double_ml_scalar.py new file mode 100644 index 00000000..5cd4a381 --- /dev/null +++ b/doubleml/double_ml_scalar.py @@ -0,0 +1,452 @@ +""" +Abstract base class for scalar DoubleML models (single parameter estimation). 
+""" + +from abc import ABC, abstractmethod +from typing import Dict, Optional + +import numpy as np + +from .data.base_data import DoubleMLBaseData +from .double_ml_base import DoubleMLBase +from .double_ml_framework import DoubleMLCore as DoubleMLCoreData +from .double_ml_framework import DoubleMLFramework +from .utils.resampling import DoubleMLResampling + + +class DoubleMLScalar(DoubleMLBase, ABC): + """ + Abstract base class for scalar DoubleML models. + + Defines the fit() method for a single parameter based on abstract private methods + such as nuisance_est(). Solves either linear or non-linear score functions. + Requires a single treatment column in DoubleMLData. + + This class implements the template method pattern: the fit() method orchestrates + the estimation process by calling abstract methods that subclasses must implement. + + Parameters + ---------- + obj_dml_data : DoubleMLBaseData + The data object for the double machine learning model. + Must contain exactly one treatment variable. + n_folds : int, optional + Number of folds for cross-fitting. Default is 5. + n_rep : int, optional + Number of repetitions for sample splitting. Default is 1. + score : str, optional + The score function to use. Default is model-specific. + draw_sample_splitting : bool, optional + Whether to draw sample splits on initialization. Default is True. + + Attributes + ---------- + n_folds : int + Number of folds for cross-fitting. + n_rep : int + Number of repetitions for sample splitting. + score : str + The score function being used. + """ + + def __init__( + self, + obj_dml_data: DoubleMLBaseData, + n_folds: int = 5, + n_rep: int = 1, + score: str = "default", + draw_sample_splitting: bool = True, + ): + """ + Initialize DoubleMLScalar. + + Parameters + ---------- + obj_dml_data : DoubleMLBaseData + The data object. Must have exactly one treatment column. + n_folds : int, optional + Number of folds for cross-fitting. Default is 5. 
+ n_rep : int, optional + Number of repetitions for sample splitting. Default is 1. + score : str, optional + The score function to use. Default is 'default'. + draw_sample_splitting : bool, optional + Whether to draw sample splits on initialization. Default is True. + + Raises + ------ + ValueError + If obj_dml_data contains more than one treatment column. + TypeError + If parameters have incorrect types. + """ + # Validate single treatment column + if len(obj_dml_data.d_cols) != 1: + raise ValueError( + f"DoubleMLScalar requires exactly one treatment column. " + f"Got {len(obj_dml_data.d_cols)}: {obj_dml_data.d_cols}. " + f"For multiple treatments, use DoubleMLVector." + ) + + # Call parent constructor + super().__init__(obj_dml_data) + + # Validate and store resampling parameters + if not isinstance(n_folds, int) or n_folds < 2: + raise ValueError(f"n_folds must be an integer >= 2. Got {n_folds}.") + if not isinstance(n_rep, int) or n_rep < 1: + raise ValueError(f"n_rep must be an integer >= 1. Got {n_rep}.") + if not isinstance(draw_sample_splitting, bool): + raise TypeError(f"draw_sample_splitting must be bool. Got {type(draw_sample_splitting)}.") + + self._n_folds = n_folds + self._n_rep = n_rep + self._score = score + + # Initialize storage for predictions and results + self._predictions: Optional[Dict[str, np.ndarray]] = None + self._all_thetas: Optional[np.ndarray] = None + self._all_ses: Optional[np.ndarray] = None + self._psi: Optional[np.ndarray] = None + self._psi_deriv: Optional[np.ndarray] = None + self._var_scaling_factors: Optional[np.ndarray] = None + + # For iteration (used during fit) + self._i_rep: Optional[int] = None + self._i_fold: Optional[int] = None + + # Draw sample splitting if requested + if draw_sample_splitting: + self.draw_sample_splitting() + + # ==================== Properties ==================== + + @property + def n_folds(self) -> int: + """ + Number of folds for cross-fitting. + + Returns + ------- + int + Number of folds. 
+ """ + return self._n_folds + + @property + def n_rep(self) -> int: + """ + Number of repetitions for sample splitting. + + Returns + ------- + int + Number of repetitions. + """ + return self._n_rep + + @property + def score(self) -> str: + """ + The score function being used. + + Returns + ------- + str + Score function name. + """ + return self._score + + @property + def predictions(self) -> Dict[str, np.ndarray]: + """ + Predictions from nuisance models (if stored during fit). + + Returns + ------- + dict + Dictionary with predictions for each nuisance component. + + Raises + ------ + ValueError + If predictions were not stored during fit. + """ + if self._predictions is None: + raise ValueError("Predictions not available. Call fit() with store_predictions=True.") + return self._predictions + + # ==================== Concrete fit() Method (Template) ==================== + + def fit(self, n_jobs_cv: Optional[int] = None, store_predictions: bool = True, **kwargs) -> "DoubleMLScalar": + """ + Estimate the DoubleML model. + + This is the concrete implementation of the fit() method using the template method pattern. + It orchestrates the estimation by: + 1. Ensuring sample splitting is initialized + 2. Initializing storage arrays + 3. Looping over repetitions and folds + 4. Calling abstract _nuisance_est() for each fold (implemented by subclasses) + 5. Computing score elements via _get_score_elements() (implemented by subclasses) + 6. Estimating parameters via _est_causal_pars_and_se() (from score mixin) + 7. Constructing the DoubleMLFramework + + Parameters + ---------- + n_jobs_cv : int, optional + Number of jobs for parallel processing during cross-validation. + Currently not used (reserved for future parallelization). + store_predictions : bool, optional + Whether to store predictions from nuisance models. Default is True. + **kwargs : dict + Additional keyword arguments (for future extensibility). 
+ + Returns + ------- + self : DoubleMLScalar + The fitted estimator. + """ + # Step 1: Ensure sample splitting is initialized + if self._smpls is None: + self.draw_sample_splitting() + + # Step 2: Initialize storage arrays + self._initialize_arrays(store_predictions=store_predictions) + + # Step 3: Cross-fitting loop over repetitions and folds + for i_rep in range(self.n_rep): + self._i_rep = i_rep + + for i_fold in range(self.n_folds): + self._i_fold = i_fold + + # Get train/test indices for this fold + train_idx, test_idx = self._smpls[i_rep][i_fold] + + # Step 4: Call abstract method - subclass implements nuisance estimation + self._nuisance_est( + train_idx=train_idx, + test_idx=test_idx, + i_rep=i_rep, + i_fold=i_fold, + ) + + # Step 5: Get score elements - subclass implements + psi_elements = self._get_score_elements() + + # Step 6: Estimate causal parameters - from score mixin + self._est_causal_pars_and_se(psi_elements) + + # Step 7: Construct framework + self._framework = self._construct_framework() + + return self + + def draw_sample_splitting(self) -> "DoubleMLScalar": + """ + Draw sample splitting for cross-fitting. + + Uses DoubleMLResampling to generate K-fold cross-validation splits + with multiple repetitions. + + Returns + ------- + self : DoubleMLScalar + The estimator with initialized sample splits. + """ + # Create resampler + resampler = DoubleMLResampling( + n_folds=self.n_folds, + n_rep=self.n_rep, + n_obs=self._n_obs, + ) + + # Generate splits + self._smpls = resampler.split_samples() + + return self + + # ==================== Private Helper Methods ==================== + + def _initialize_arrays(self, store_predictions: bool = True) -> None: + """ + Initialize storage arrays for predictions and results. + + Parameters + ---------- + store_predictions : bool + Whether to allocate arrays for storing predictions. 
+ """ + n_obs = self._n_obs + n_rep = self.n_rep + n_thetas = 1 # Scalar model estimates single parameter + + # Initialize predictions storage if requested + if store_predictions: + self._predictions = self._initialize_predictions_dict() + + # Initialize result arrays using framework convention + # These will be filled by _est_causal_pars_and_se() + # Shapes follow framework: (n_thetas, n_rep) for params, (n_obs, n_thetas, n_rep) for scores + self._all_thetas = np.zeros((n_thetas, n_rep)) # (n_thetas=1, n_rep) + self._all_ses = np.zeros((n_thetas, n_rep)) + self._psi = np.zeros((n_obs, n_thetas, n_rep)) # (n_obs, n_thetas=1, n_rep) + self._psi_deriv = np.zeros((n_obs, n_thetas, n_rep)) + + def _initialize_predictions_dict(self) -> Dict[str, np.ndarray]: + """ + Initialize dictionary for storing predictions. + + Subclasses can override this to define their specific prediction storage structure. + + Returns + ------- + dict + Empty dictionary (subclasses should override). + """ + # Default: return empty dict + # Subclasses should override to create arrays for their specific nuisance components + return {} + + def _construct_framework(self) -> DoubleMLFramework: + """ + Construct DoubleMLFramework from estimation results. + + Returns + ------- + DoubleMLFramework + The framework object with estimation results. + """ + # Standardize the score function: psi / E[psi_deriv] + # Both already in framework shape: (n_obs, n_thetas, n_rep) + scaled_psi = np.divide(self._psi, np.mean(self._psi_deriv, axis=0, keepdims=True)) + + # Create data container (no transpose needed - already in framework convention!) 
+ framework_data = DoubleMLCoreData( + all_thetas=self._all_thetas, # (n_thetas, n_rep) + all_ses=self._all_ses, # (n_thetas, n_rep) + var_scaling_factors=self._var_scaling_factors, # (n_thetas,) + scaled_psi=scaled_psi, # (n_obs, n_thetas, n_rep) + is_cluster_data=False, # TODO: Add cluster data support + ) + + # Create and return framework + return DoubleMLFramework( + dml_core=framework_data, + treatment_names=self._dml_data.d_cols, + ) + + # ==================== Abstract Methods (Must be Implemented by Subclasses) ==================== + + @abstractmethod + def _nuisance_est( + self, + train_idx: np.ndarray, + test_idx: np.ndarray, + i_rep: int, + i_fold: int, + ) -> None: + """ + Estimate nuisance parameters for one fold. + + This is the main method subclasses must implement. It should: + 1. Extract training and test data using train_idx and test_idx + 2. Fit nuisance models (e.g., outcome model, treatment model) on training data + 3. Predict on test data + 4. Store predictions in self._predictions + + Parameters + ---------- + train_idx : np.ndarray + Indices of training observations for this fold. + test_idx : np.ndarray + Indices of test observations for this fold. + i_rep : int + Repetition index (0 to n_rep-1). + i_fold : int + Fold index (0 to n_folds-1). + + Notes + ----- + Subclasses should store predictions in self._predictions, for example: + self._predictions['ml_l'][test_idx, i_rep] = l_hat + self._predictions['ml_m'][test_idx, i_rep] = m_hat + """ + pass + + @abstractmethod + def _get_score_elements(self) -> Dict[str, np.ndarray]: + """ + Compute score function elements from nuisance predictions. + + This method should use the predictions stored in self._predictions + to compute the components of the score function. + + Returns + ------- + dict + Dictionary with score elements. 
+ For LinearScoreMixin: {'psi_a': array, 'psi_b': array} + For NonLinearScoreMixin: model-specific elements + + Notes + ----- + The score elements should have shape (n_obs, n_rep) for scalar models. + + Example for PLR (linear score): + psi_a = (D - m_hat) ** 2 # shape: (n_obs, n_rep) + psi_b = (D - m_hat) * (Y - l_hat) # shape: (n_obs, n_rep) + return {'psi_a': psi_a, 'psi_b': psi_b} + """ + pass + + @abstractmethod + def _est_causal_pars_and_se(self, psi_elements: Dict[str, np.ndarray]) -> None: + """ + Estimate causal parameters and standard errors from score elements. + + This method is implemented by score mixins (LinearScoreMixin or NonLinearScoreMixin). + It should: + 1. Compute parameter estimates (self._all_thetas) + 2. Compute standard errors (self._all_ses) + 3. Compute influence function (self._psi) + 4. Compute score derivative (self._psi_deriv) + 5. Compute variance scaling factors (self._var_scaling_factors) + + Parameters + ---------- + psi_elements : dict + Dictionary with score function elements from _get_score_elements(). + + Notes + ----- + After this method, all arrays must follow framework convention: + - self._all_thetas should have shape (n_thetas, n_rep) + - self._all_ses should have shape (n_thetas, n_rep) + - self._psi should have shape (n_obs, n_thetas, n_rep) + - self._psi_deriv should have shape (n_obs, n_thetas, n_rep) + - self._var_scaling_factors should have shape (n_thetas,) + """ + pass + + def __str__(self) -> str: + """ + String representation of the DoubleMLScalar object. + + Returns + ------- + str + A formatted string summary of the model. 
+ """ + class_name = self.__class__.__name__ + header = f"{'=' * 20} {class_name} Object {'=' * 20}" + + info = f"Score function: {self.score}\n" + info += f"Resampling: {self.n_folds}-fold CV, {self.n_rep} repetitions\n" + + if self._framework is not None: + summary_str = str(self.summary) + return f"{header}\n\n{info}\n{summary_str}" + else: + return f"{header}\n\n{info}\nModel not yet fitted. Call fit() first." From 4f4c25574b49a13b7c3e049c5a9800f8523dc866 Mon Sep 17 00:00:00 2001 From: SvenKlaassen Date: Sun, 1 Feb 2026 10:26:37 +0100 Subject: [PATCH 02/38] refactor DoubleMLScalar to split fit() into separate parts --- doubleml/double_ml_base.py | 41 +---- doubleml/double_ml_linear_score.py | 16 +- doubleml/double_ml_scalar.py | 270 +++++++++++++++++++---------- 3 files changed, 193 insertions(+), 134 deletions(-) diff --git a/doubleml/double_ml_base.py b/doubleml/double_ml_base.py index 19eac58a..645e3ed6 100644 --- a/doubleml/double_ml_base.py +++ b/doubleml/double_ml_base.py @@ -3,7 +3,7 @@ """ from abc import ABC, abstractmethod -from typing import Dict, List, Optional +from typing import Dict, Optional, Self import numpy as np import pandas as pd @@ -43,10 +43,6 @@ class DoubleMLBase(ABC): Summary table with estimates, standard errors, confidence intervals, and p-values. psi : np.ndarray Influence function values (shape: (n_obs, n_thetas, n_rep)). - smpls : list - Sample splitting indices used for cross-fitting. - n_folds : int - Number of folds used for cross-fitting. n_rep : int Number of repetitions for sample splitting. 
""" @@ -73,9 +69,6 @@ def __init__( # Framework is initialized after fit() self._framework: Optional[DoubleMLFramework] = None - # Sample splits are initialized via draw_sample_splitting() - self._smpls: Optional[List] = None - # ==================== Properties (Delegating to Framework) ==================== @property @@ -197,18 +190,17 @@ def psi(self) -> np.ndarray: return self.framework.scaled_psi @property - def smpls(self) -> List: + @abstractmethod + def n_rep(self) -> int: """ - Sample splitting indices used for cross-fitting. + Number of repetitions for sample splitting. Returns ------- - list - List of sample splitting indices for each repetition. + int + Number of repetitions. """ - if self._smpls is None: - raise ValueError("Sample splitting has not been performed. " "Call draw_sample_splitting() first.") - return self._smpls + pass @property def n_obs(self) -> int: @@ -244,7 +236,7 @@ def confint(self, joint: bool = False, level: float = 0.95) -> pd.DataFrame: """ return self.framework.confint(joint=joint, level=level) - def bootstrap(self, method: str = "normal", n_rep_boot: int = 500) -> "DoubleMLBase": + def bootstrap(self, method: str = "normal", n_rep_boot: int = 500) -> Self: """ Multiplier bootstrap for DoubleML models. @@ -326,7 +318,7 @@ def sensitivity_analysis( # ==================== Abstract Methods ==================== @abstractmethod - def fit(self, **kwargs) -> "DoubleMLBase": + def fit(self, **kwargs) -> Self: """ Estimate the DoubleML model. @@ -344,21 +336,6 @@ def fit(self, **kwargs) -> "DoubleMLBase": """ pass - @abstractmethod - def draw_sample_splitting(self) -> "DoubleMLBase": - """ - Draw sample splitting for cross-fitting. - - This method must be implemented by subclasses to generate sample splits - using an appropriate resampling strategy. - - Returns - ------- - self : DoubleMLBase - The DoubleML estimator with initialized sample splits. 
- """ - pass - def __str__(self) -> str: """ String representation of the DoubleMLBase object. diff --git a/doubleml/double_ml_linear_score.py b/doubleml/double_ml_linear_score.py index 640e031d..4bada4d8 100644 --- a/doubleml/double_ml_linear_score.py +++ b/doubleml/double_ml_linear_score.py @@ -110,17 +110,15 @@ def _est_causal_pars_and_se(self, psi_elements: Dict[str, np.ndarray]) -> None: # Store in framework shape: (n_obs, n_thetas=1, n_rep) self._psi_deriv = psi_a[:, np.newaxis, :] # (n_obs, 1, n_rep) - # Compute standard errors - # SE = std(ψ) / sqrt(n) - se = np.std(psi, axis=0) / np.sqrt(n_obs) # (n_rep,) + # Compute standard errors using sandwich variance estimator + # Var(θ̂) = E[ψ²] / (n · J²), where J = E[ψ_a] + # SE = sqrt(E[ψ²]) / (|J| · sqrt(n)) + gamma_hat = np.mean(psi**2, axis=0) # (n_rep,) + se = np.sqrt(gamma_hat) / (np.abs(mean_psi_a) * np.sqrt(n_obs)) # (n_rep,) self._all_ses = se[np.newaxis, :] # (1, n_rep) - # Compute variance scaling factors - # This is 1 / E[∂ψ/∂θ]^2 = 1 / E[ψ_a]^2 - var_scaling_factors = 1.0 / (mean_psi_a**2) # (n_rep,) - - # Take mean across repetitions and store in framework shape: (n_thetas=1,) - self._var_scaling_factors = np.array([np.mean(var_scaling_factors)]) # (1,) + # Variance scaling factor: n / J² (used by framework for aggregation) + self._var_scaling_factors = np.array([n_obs]) # (1,) def _compute_score(self, psi_elements: Dict[str, np.ndarray], coef: float) -> np.ndarray: """ diff --git a/doubleml/double_ml_scalar.py b/doubleml/double_ml_scalar.py index 5cd4a381..d4b3cf36 100644 --- a/doubleml/double_ml_scalar.py +++ b/doubleml/double_ml_scalar.py @@ -3,7 +3,7 @@ """ from abc import ABC, abstractmethod -from typing import Dict, Optional +from typing import Dict, List, Optional, Self import numpy as np @@ -30,21 +30,15 @@ class DoubleMLScalar(DoubleMLBase, ABC): obj_dml_data : DoubleMLBaseData The data object for the double machine learning model. Must contain exactly one treatment variable. 
- n_folds : int, optional - Number of folds for cross-fitting. Default is 5. - n_rep : int, optional - Number of repetitions for sample splitting. Default is 1. score : str, optional The score function to use. Default is model-specific. - draw_sample_splitting : bool, optional - Whether to draw sample splits on initialization. Default is True. Attributes ---------- n_folds : int - Number of folds for cross-fitting. + Number of folds for cross-fitting (set via draw_sample_splitting). n_rep : int - Number of repetitions for sample splitting. + Number of repetitions for sample splitting (set via draw_sample_splitting). score : str The score function being used. """ @@ -52,10 +46,7 @@ class DoubleMLScalar(DoubleMLBase, ABC): def __init__( self, obj_dml_data: DoubleMLBaseData, - n_folds: int = 5, - n_rep: int = 1, score: str = "default", - draw_sample_splitting: bool = True, ): """ Initialize DoubleMLScalar. @@ -64,21 +55,13 @@ def __init__( ---------- obj_dml_data : DoubleMLBaseData The data object. Must have exactly one treatment column. - n_folds : int, optional - Number of folds for cross-fitting. Default is 5. - n_rep : int, optional - Number of repetitions for sample splitting. Default is 1. score : str, optional The score function to use. Default is 'default'. - draw_sample_splitting : bool, optional - Whether to draw sample splits on initialization. Default is True. Raises ------ ValueError If obj_dml_data contains more than one treatment column. - TypeError - If parameters have incorrect types. """ # Validate single treatment column if len(obj_dml_data.d_cols) != 1: @@ -91,18 +74,13 @@ def __init__( # Call parent constructor super().__init__(obj_dml_data) - # Validate and store resampling parameters - if not isinstance(n_folds, int) or n_folds < 2: - raise ValueError(f"n_folds must be an integer >= 2. Got {n_folds}.") - if not isinstance(n_rep, int) or n_rep < 1: - raise ValueError(f"n_rep must be an integer >= 1. 
Got {n_rep}.") - if not isinstance(draw_sample_splitting, bool): - raise TypeError(f"draw_sample_splitting must be bool. Got {type(draw_sample_splitting)}.") - - self._n_folds = n_folds - self._n_rep = n_rep self._score = score + # Resampling parameters (set via draw_sample_splitting) + self._n_folds: Optional[int] = None + self._n_rep: Optional[int] = None + self._smpls: Optional[List] = None + # Initialize storage for predictions and results self._predictions: Optional[Dict[str, np.ndarray]] = None self._all_thetas: Optional[np.ndarray] = None @@ -115,10 +93,6 @@ def __init__( self._i_rep: Optional[int] = None self._i_fold: Optional[int] = None - # Draw sample splitting if requested - if draw_sample_splitting: - self.draw_sample_splitting() - # ==================== Properties ==================== @property @@ -130,7 +104,14 @@ def n_folds(self) -> int: ------- int Number of folds. + + Raises + ------ + ValueError + If sample splitting has not been performed yet. """ + if self._n_folds is None: + raise ValueError("n_folds not set. Call draw_sample_splitting() first.") return self._n_folds @property @@ -142,7 +123,14 @@ def n_rep(self) -> int: ------- int Number of repetitions. + + Raises + ------ + ValueError + If sample splitting has not been performed yet. """ + if self._n_rep is None: + raise ValueError("n_rep not set. Call draw_sample_splitting() first.") return self._n_rep @property @@ -160,7 +148,7 @@ def score(self) -> str: @property def predictions(self) -> Dict[str, np.ndarray]: """ - Predictions from nuisance models (if stored during fit). + Predictions from nuisance models. Returns ------- @@ -170,51 +158,115 @@ def predictions(self) -> Dict[str, np.ndarray]: Raises ------ ValueError - If predictions were not stored during fit. + If the model has not been fitted yet. """ if self._predictions is None: - raise ValueError("Predictions not available. Call fit() with store_predictions=True.") + raise ValueError("Predictions not available. 
Call fit() first.") return self._predictions + @property + def smpls(self) -> List: + """ + Sample splitting indices used for cross-fitting. + + Returns + ------- + list + List of sample splitting indices for each repetition. + """ + if self._smpls is None: + raise ValueError("Sample splitting has not been performed. Call draw_sample_splitting() first.") + return self._smpls + # ==================== Concrete fit() Method (Template) ==================== - def fit(self, n_jobs_cv: Optional[int] = None, store_predictions: bool = True, **kwargs) -> "DoubleMLScalar": + def fit( + self, + n_folds: int = 5, + n_rep: int = 1, + n_jobs_cv: Optional[int] = None, + external_predictions: Optional[Dict[str, np.ndarray]] = None, + **kwargs, + ) -> Self: """ Estimate the DoubleML model. - This is the concrete implementation of the fit() method using the template method pattern. - It orchestrates the estimation by: - 1. Ensuring sample splitting is initialized - 2. Initializing storage arrays - 3. Looping over repetitions and folds - 4. Calling abstract _nuisance_est() for each fold (implemented by subclasses) - 5. Computing score elements via _get_score_elements() (implemented by subclasses) - 6. Estimating parameters via _est_causal_pars_and_se() (from score mixin) - 7. Constructing the DoubleMLFramework + Calls :meth:`draw_sample_splitting` (if not yet done), + :meth:`fit_nuisance_models`, and :meth:`estimate_causal_parameters`. Parameters ---------- + n_folds : int, optional + Number of folds for cross-fitting. Default is 5. + Only used if sample splitting has not been drawn yet. + n_rep : int, optional + Number of repetitions for sample splitting. Default is 1. + Only used if sample splitting has not been drawn yet. n_jobs_cv : int, optional Number of jobs for parallel processing during cross-validation. Currently not used (reserved for future parallelization). - store_predictions : bool, optional - Whether to store predictions from nuisance models. Default is True. 
+ external_predictions : dict or None, optional + Dictionary of pre-computed nuisance predictions to use instead of fitting + learners. Keys are learner names (e.g., ``'ml_l'``, ``'ml_m'``), values are + arrays of shape ``(n_obs, n_rep)``. Learners not in the dict are fitted normally. + Default is ``None``. **kwargs : dict Additional keyword arguments (for future extensibility). Returns ------- - self : DoubleMLScalar + self : Self The fitted estimator. """ - # Step 1: Ensure sample splitting is initialized if self._smpls is None: - self.draw_sample_splitting() + self.draw_sample_splitting(n_folds=n_folds, n_rep=n_rep) + self.fit_nuisance_models(n_jobs_cv=n_jobs_cv, external_predictions=external_predictions) + self.estimate_causal_parameters() + return self + + def fit_nuisance_models( + self, + n_jobs_cv: Optional[int] = None, + external_predictions: Optional[Dict[str, np.ndarray]] = None, + ) -> Self: + """ + Fit nuisance models via cross-fitting. + + Requires sample splitting to be initialized via :meth:`draw_sample_splitting` + before calling this method. + + Parameters + ---------- + n_jobs_cv : int, optional + Number of jobs for parallel processing during cross-validation. + Currently not used (reserved for future parallelization). + external_predictions : dict or None, optional + Dictionary of pre-computed nuisance predictions. Keys are learner names, + values are arrays of shape ``(n_obs, n_rep)``. Default is ``None``. + + Returns + ------- + self : Self + The estimator with fitted nuisance models and stored predictions. + + Raises + ------ + ValueError + If sample splitting has not been initialized. + """ + if self._smpls is None: + raise ValueError("Sample splitting has not been initialized. 
Call draw_sample_splitting() first.") + + # Initialize prediction arrays + self._predictions = self._initialize_predictions_dict() - # Step 2: Initialize storage arrays - self._initialize_arrays(store_predictions=store_predictions) + # Pre-fill external predictions + if external_predictions is not None: + for key, values in external_predictions.items(): + if key in self._predictions: + self._predictions[key][:] = values - # Step 3: Cross-fitting loop over repetitions and folds + # Cross-fitting loop over repetitions and folds for i_rep in range(self.n_rep): self._i_rep = i_rep @@ -224,41 +276,87 @@ def fit(self, n_jobs_cv: Optional[int] = None, store_predictions: bool = True, * # Get train/test indices for this fold train_idx, test_idx = self._smpls[i_rep][i_fold] - # Step 4: Call abstract method - subclass implements nuisance estimation + # Call abstract method - subclass implements nuisance estimation self._nuisance_est( train_idx=train_idx, test_idx=test_idx, i_rep=i_rep, i_fold=i_fold, + external_predictions=external_predictions, ) - # Step 5: Get score elements - subclass implements + return self + + def estimate_causal_parameters(self) -> Self: + """ + Estimate causal parameters from nuisance predictions. + + Computes score elements, estimates parameters and standard errors, and + constructs the DoubleMLFramework. Must be called after :meth:`fit_nuisance_models`. + + Returns + ------- + self : Self + The estimator with estimated causal parameters. + + Raises + ------ + ValueError + If nuisance models have not been fitted yet. + """ + if self._predictions is None: + raise ValueError("Predictions not available. 
Call fit_nuisance_models() first.") + + # Initialize result arrays + self._initialize_result_arrays() + + # Get score elements - subclass implements psi_elements = self._get_score_elements() - # Step 6: Estimate causal parameters - from score mixin + # Estimate causal parameters - from score mixin self._est_causal_pars_and_se(psi_elements) - # Step 7: Construct framework + # Construct framework self._framework = self._construct_framework() return self - def draw_sample_splitting(self) -> "DoubleMLScalar": + def draw_sample_splitting(self, n_folds: int = 5, n_rep: int = 1) -> Self: """ Draw sample splitting for cross-fitting. Uses DoubleMLResampling to generate K-fold cross-validation splits with multiple repetitions. + Parameters + ---------- + n_folds : int, optional + Number of folds for cross-fitting. Default is 5. + n_rep : int, optional + Number of repetitions for sample splitting. Default is 1. + Returns ------- - self : DoubleMLScalar + self : Self The estimator with initialized sample splits. + + Raises + ------ + ValueError + If n_folds or n_rep have invalid values. """ + if not isinstance(n_folds, int) or n_folds < 2: + raise ValueError(f"n_folds must be an integer >= 2. Got {n_folds}.") + if not isinstance(n_rep, int) or n_rep < 1: + raise ValueError(f"n_rep must be an integer >= 1. Got {n_rep}.") + + self._n_folds = n_folds + self._n_rep = n_rep + # Create resampler resampler = DoubleMLResampling( - n_folds=self.n_folds, - n_rep=self.n_rep, + n_folds=n_folds, + n_rep=n_rep, n_obs=self._n_obs, ) @@ -269,29 +367,16 @@ def draw_sample_splitting(self) -> "DoubleMLScalar": # ==================== Private Helper Methods ==================== - def _initialize_arrays(self, store_predictions: bool = True) -> None: - """ - Initialize storage arrays for predictions and results. - - Parameters - ---------- - store_predictions : bool - Whether to allocate arrays for storing predictions. 
- """ + def _initialize_result_arrays(self) -> None: + """Initialize storage arrays for causal parameter estimation results.""" n_obs = self._n_obs n_rep = self.n_rep n_thetas = 1 # Scalar model estimates single parameter - # Initialize predictions storage if requested - if store_predictions: - self._predictions = self._initialize_predictions_dict() - - # Initialize result arrays using framework convention - # These will be filled by _est_causal_pars_and_se() # Shapes follow framework: (n_thetas, n_rep) for params, (n_obs, n_thetas, n_rep) for scores - self._all_thetas = np.zeros((n_thetas, n_rep)) # (n_thetas=1, n_rep) + self._all_thetas = np.zeros((n_thetas, n_rep)) self._all_ses = np.zeros((n_thetas, n_rep)) - self._psi = np.zeros((n_obs, n_thetas, n_rep)) # (n_obs, n_thetas=1, n_rep) + self._psi = np.zeros((n_obs, n_thetas, n_rep)) self._psi_deriv = np.zeros((n_obs, n_thetas, n_rep)) def _initialize_predictions_dict(self) -> Dict[str, np.ndarray]: @@ -305,8 +390,6 @@ def _initialize_predictions_dict(self) -> Dict[str, np.ndarray]: dict Empty dictionary (subclasses should override). """ - # Default: return empty dict - # Subclasses should override to create arrays for their specific nuisance components return {} def _construct_framework(self) -> DoubleMLFramework: @@ -346,15 +429,17 @@ def _nuisance_est( test_idx: np.ndarray, i_rep: int, i_fold: int, + external_predictions: Optional[Dict[str, np.ndarray]] = None, ) -> None: """ Estimate nuisance parameters for one fold. This is the main method subclasses must implement. It should: - 1. Extract training and test data using train_idx and test_idx - 2. Fit nuisance models (e.g., outcome model, treatment model) on training data - 3. Predict on test data - 4. Store predictions in self._predictions + 1. Check external_predictions for pre-computed values (skip fitting if present) + 2. Extract training and test data using train_idx and test_idx + 3. Fit nuisance models on training data + 4. Predict on test data + 5. 
Store predictions in self._predictions Parameters ---------- @@ -366,12 +451,10 @@ def _nuisance_est( Repetition index (0 to n_rep-1). i_fold : int Fold index (0 to n_folds-1). - - Notes - ----- - Subclasses should store predictions in self._predictions, for example: - self._predictions['ml_l'][test_idx, i_rep] = l_hat - self._predictions['ml_m'][test_idx, i_rep] = m_hat + external_predictions : dict or None, optional + If provided, a dictionary of external predictions. Learners whose names + appear as keys should not be fitted; their predictions are already + pre-filled in self._predictions. """ pass @@ -443,7 +526,8 @@ def __str__(self) -> str: header = f"{'=' * 20} {class_name} Object {'=' * 20}" info = f"Score function: {self.score}\n" - info += f"Resampling: {self.n_folds}-fold CV, {self.n_rep} repetitions\n" + if self._n_folds is not None: + info += f"Resampling: {self._n_folds}-fold CV, {self._n_rep} repetitions\n" if self._framework is not None: summary_str = str(self.summary) From ae2e5be2a1a36b35403aa2a08bfba50160fc86bb Mon Sep 17 00:00:00 2001 From: SvenKlaassen Date: Sun, 1 Feb 2026 10:26:48 +0100 Subject: [PATCH 03/38] add plr_scalar implementation --- doubleml/plm/plr_scalar.py | 151 ++++++++++++++++++ doubleml/plm/tests/test_plr_scalar.py | 86 ++++++++++ .../plm/tests/test_plr_scalar_exceptions.py | 88 ++++++++++ .../plm/tests/test_plr_scalar_return_types.py | 123 ++++++++++++++ doubleml/plm/tests/test_plr_scalar_vs_plr.py | 82 ++++++++++ 5 files changed, 530 insertions(+) create mode 100644 doubleml/plm/plr_scalar.py create mode 100644 doubleml/plm/tests/test_plr_scalar.py create mode 100644 doubleml/plm/tests/test_plr_scalar_exceptions.py create mode 100644 doubleml/plm/tests/test_plr_scalar_return_types.py create mode 100644 doubleml/plm/tests/test_plr_scalar_vs_plr.py diff --git a/doubleml/plm/plr_scalar.py b/doubleml/plm/plr_scalar.py new file mode 100644 index 00000000..2a5259f6 --- /dev/null +++ b/doubleml/plm/plr_scalar.py @@ -0,0 +1,151 @@ 
+""" +Partially Linear Regression (PLR) model based on the new DoubleMLScalar hierarchy. +""" + +import warnings + +import numpy as np +from sklearn.base import clone + +from ..data.base_data import DoubleMLData +from ..double_ml_linear_score import LinearScoreMixin + + +class PLR(LinearScoreMixin): + """Double machine learning for partially linear regression models. + + Based on the DoubleMLScalar + LinearScoreMixin hierarchy. + + Parameters + ---------- + obj_dml_data : DoubleMLData + The data object providing the data and specifying the variables for the causal model. + ml_l : estimator + A machine learner implementing ``fit()`` and ``predict()`` for the nuisance + function :math:`\\ell_0(X) = E[Y|X]`. + ml_m : estimator + A machine learner implementing ``fit()`` and ``predict()`` for the nuisance + function :math:`m_0(X) = E[D|X]`. + ml_g : estimator, optional + A machine learner implementing ``fit()`` and ``predict()`` for the nuisance + function :math:`g_0(X) = E[Y - D\\theta_0|X]`. + Only required for ``score='IV-type'``. + score : str, optional + The score function (``'partialling out'`` or ``'IV-type'``). + Default is ``'partialling out'``. + """ + + def __init__( + self, + obj_dml_data, + ml_l, + ml_m, + ml_g=None, + score="partialling out", + ): + # Validate data + self._check_data(obj_dml_data) + + # Validate score + valid_scores = ["partialling out", "IV-type"] + if score not in valid_scores: + raise ValueError(f"Invalid score '{score}'. Valid scores: {valid_scores}.") + + # Store learners + self._learner = {"ml_l": clone(ml_l), "ml_m": clone(ml_m)} + + if ml_g is not None: + if score == "IV-type": + self._learner["ml_g"] = clone(ml_g) + else: + warnings.warn( + "A learner ml_g has been provided for score = 'partialling out' but will be ignored. " + "A learner ml_g is not required for estimation." + ) + elif score == "IV-type": + warnings.warn("For score = 'IV-type', learners ml_l and ml_g should be specified. 
Set ml_g = clone(ml_l).") + self._learner["ml_g"] = clone(ml_l) + + super().__init__( + obj_dml_data=obj_dml_data, + score=score, + ) + + @staticmethod + def _check_data(obj_dml_data): + if not isinstance(obj_dml_data, DoubleMLData): + raise TypeError( + f"The data must be of DoubleMLData type. " f"{str(obj_dml_data)} of type {str(type(obj_dml_data))} was passed." + ) + if obj_dml_data.z_cols is not None: + raise ValueError( + "Incompatible data. " + " and ".join(obj_dml_data.z_cols) + " have been set as instrumental variable(s). " + "To fit a partially linear IV regression model use DoubleMLPLIV instead of DoubleMLPLR." + ) + + def _initialize_predictions_dict(self): + n_obs = self._n_obs + n_rep = self.n_rep + preds = { + "ml_l": np.full((n_obs, n_rep), np.nan), + "ml_m": np.full((n_obs, n_rep), np.nan), + } + if "ml_g" in self._learner: + preds["ml_g"] = np.full((n_obs, n_rep), np.nan) + return preds + + def _nuisance_est(self, train_idx, test_idx, i_rep, i_fold): + x = self._dml_data.x + y = self._dml_data.y + d = self._dml_data.d + + x_train, x_test = x[train_idx], x[test_idx] + y_train = y[train_idx] + d_train = d[train_idx] + + # Fit and predict ml_l: E[Y|X] + ml_l = clone(self._learner["ml_l"]) + ml_l.fit(x_train, y_train) + self._predictions["ml_l"][test_idx, i_rep] = ml_l.predict(x_test) + + # Fit and predict ml_m: E[D|X] + ml_m = clone(self._learner["ml_m"]) + ml_m.fit(x_train, d_train) + self._predictions["ml_m"][test_idx, i_rep] = ml_m.predict(x_test) + + # For IV-type: fit ml_g after last fold when all ml_l/ml_m predictions are available + is_last_fold = i_fold == self.n_folds - 1 + if is_last_fold and "ml_g" in self._learner and "ml_g" in self._predictions: + # Compute initial theta from full cross-fitted predictions + l_hat = self._predictions["ml_l"][:, i_rep] + m_hat = self._predictions["ml_m"][:, i_rep] + psi_a = -(d - m_hat) * (d - m_hat) + psi_b = (d - m_hat) * (y - l_hat) + theta_initial = -np.nanmean(psi_b) / np.nanmean(psi_a) + + # Second 
pass: fit ml_g with cross-fitting across all folds + for j_fold in range(self.n_folds): + train_j, test_j = self._smpls[i_rep][j_fold] + ml_g = clone(self._learner["ml_g"]) + ml_g.fit(x[train_j], y[train_j] - theta_initial * d[train_j]) + self._predictions["ml_g"][test_j, i_rep] = ml_g.predict(x[test_j]) + + def _get_score_elements(self): + y = self._dml_data.y + d = self._dml_data.d + + m_hat = self._predictions["ml_m"] # (n_obs, n_rep) + v_hat = d[:, np.newaxis] - m_hat # (n_obs, n_rep) + + if self.score == "partialling out": + l_hat = self._predictions["ml_l"] + u_hat = y[:, np.newaxis] - l_hat + psi_a = -v_hat * v_hat + psi_b = v_hat * u_hat + else: + assert self.score == "IV-type" + g_hat = self._predictions["ml_g"] + psi_a = -v_hat * d[:, np.newaxis] + psi_b = v_hat * (y[:, np.newaxis] - g_hat) + + return {"psi_a": psi_a, "psi_b": psi_b} diff --git a/doubleml/plm/tests/test_plr_scalar.py b/doubleml/plm/tests/test_plr_scalar.py new file mode 100644 index 00000000..581121b2 --- /dev/null +++ b/doubleml/plm/tests/test_plr_scalar.py @@ -0,0 +1,86 @@ +import numpy as np +import pytest +from sklearn.base import clone +from sklearn.linear_model import Lasso, LinearRegression + +from doubleml.plm.datasets import make_plr_CCDDHNR2018 +from doubleml.plm.plr_scalar import PLR + + +@pytest.fixture(scope="module", params=[LinearRegression(), Lasso(alpha=0.1)]) +def learner(request): + return request.param + + +@pytest.fixture(scope="module", params=["IV-type", "partialling out"]) +def score(request): + return request.param + + +@pytest.fixture(scope="module") +def dml_plr_scalar_fixture(learner, score): + n_folds = 5 + theta = 0.5 + + ml_l = clone(learner) + ml_m = clone(learner) + ml_g = clone(learner) + + np.random.seed(3141) + obj_dml_data = make_plr_CCDDHNR2018(n_obs=500, dim_x=20, alpha=theta) + + dml_obj = PLR(obj_dml_data, ml_l, ml_m, ml_g, score=score) + dml_obj.draw_sample_splitting(n_folds=n_folds) + dml_obj.fit() + + res_dict = { + "coef": dml_obj.coef[0], + 
"se": dml_obj.se[0], + "true_coef": theta, + } + + return res_dict + + +@pytest.mark.ci +def test_dml_plr_scalar_coef(dml_plr_scalar_fixture): + coef = dml_plr_scalar_fixture["coef"] + se = dml_plr_scalar_fixture["se"] + true_coef = dml_plr_scalar_fixture["true_coef"] + assert abs(coef - true_coef) <= 3.0 * se + + +@pytest.fixture(scope="module") +def dml_plr_scalar_rep_fixture(): + """Test with multiple repetitions.""" + n_folds = 3 + n_rep = 3 + theta = 0.5 + + np.random.seed(3141) + obj_dml_data = make_plr_CCDDHNR2018(n_obs=500, dim_x=20, alpha=theta) + + dml_obj = PLR(obj_dml_data, LinearRegression(), LinearRegression()) + dml_obj.draw_sample_splitting(n_folds=n_folds, n_rep=n_rep) + dml_obj.fit() + + return { + "dml_obj": dml_obj, + "true_coef": theta, + "n_rep": n_rep, + } + + +@pytest.mark.ci +def test_dml_plr_scalar_rep_coef(dml_plr_scalar_rep_fixture): + dml_obj = dml_plr_scalar_rep_fixture["dml_obj"] + true_coef = dml_plr_scalar_rep_fixture["true_coef"] + assert abs(dml_obj.coef[0] - true_coef) <= 3.0 * dml_obj.se[0] + + +@pytest.mark.ci +def test_dml_plr_scalar_rep_shapes(dml_plr_scalar_rep_fixture): + dml_obj = dml_plr_scalar_rep_fixture["dml_obj"] + n_rep = dml_plr_scalar_rep_fixture["n_rep"] + assert dml_obj.all_thetas.shape == (1, n_rep) + assert dml_obj.all_ses.shape == (1, n_rep) diff --git a/doubleml/plm/tests/test_plr_scalar_exceptions.py b/doubleml/plm/tests/test_plr_scalar_exceptions.py new file mode 100644 index 00000000..5797191a --- /dev/null +++ b/doubleml/plm/tests/test_plr_scalar_exceptions.py @@ -0,0 +1,88 @@ +import numpy as np +import pandas as pd +import pytest +from sklearn.linear_model import Lasso + +import doubleml as dml +from doubleml.plm.datasets import make_plr_CCDDHNR2018 +from doubleml.plm.plr_scalar import PLR + +np.random.seed(3141) +obj_dml_data = make_plr_CCDDHNR2018(n_obs=100, dim_x=10, alpha=0.5) + +# Create data with instruments for IV check +df = obj_dml_data.data.copy() +x_cols = [c for c in df.columns if 
c.startswith("X")] +dml_data_iv = dml.DoubleMLData(df, y_col="y", d_cols="d", x_cols=x_cols[:-1], z_cols=x_cols[-1]) + +ml_l = Lasso(alpha=0.1) +ml_m = Lasso(alpha=0.1) +ml_g = Lasso(alpha=0.1) + + +@pytest.mark.ci +def test_plr_scalar_exception_data(): + msg = r"The data must be of DoubleMLData type\." + with pytest.raises(TypeError, match=msg): + PLR(pd.DataFrame(), ml_l, ml_m) + + +@pytest.mark.ci +def test_plr_scalar_exception_instrument(): + msg = r"Incompatible data\. .* have been set as instrumental variable\(s\)\." + with pytest.raises(ValueError, match=msg): + PLR(dml_data_iv, ml_l, ml_m) + + +@pytest.mark.ci +def test_plr_scalar_exception_score(): + msg = r"Invalid score 'invalid'\." + with pytest.raises(ValueError, match=msg): + PLR(obj_dml_data, ml_l, ml_m, score="invalid") + + +@pytest.mark.ci +def test_plr_scalar_exception_n_folds(): + dml_obj = PLR(obj_dml_data, ml_l, ml_m) + msg = r"n_folds must be an integer >= 2\." + with pytest.raises(ValueError, match=msg): + dml_obj.draw_sample_splitting(n_folds=1) + with pytest.raises(ValueError, match=msg): + dml_obj.draw_sample_splitting(n_folds=0) + + +@pytest.mark.ci +def test_plr_scalar_exception_n_rep(): + dml_obj = PLR(obj_dml_data, ml_l, ml_m) + msg = r"n_rep must be an integer >= 1\." + with pytest.raises(ValueError, match=msg): + dml_obj.draw_sample_splitting(n_rep=0) + + +@pytest.mark.ci +def test_plr_scalar_exception_fit_nuisance_without_smpls(): + dml_obj = PLR(obj_dml_data, ml_l, ml_m) + msg = r"Sample splitting has not been initialized\." + with pytest.raises(ValueError, match=msg): + dml_obj.fit_nuisance_models() + + +@pytest.mark.ci +def test_plr_scalar_exception_estimate_causal_without_predictions(): + dml_obj = PLR(obj_dml_data, ml_l, ml_m) + dml_obj.draw_sample_splitting() + msg = r"Predictions not available\." 
+ with pytest.raises(ValueError, match=msg): + dml_obj.estimate_causal_parameters() + + +@pytest.mark.ci +def test_plr_scalar_warning_ml_g_partialling_out(): + with pytest.warns(UserWarning, match="will be ignored"): + PLR(obj_dml_data, ml_l, ml_m, ml_g, score="partialling out") + + +@pytest.mark.ci +def test_plr_scalar_warning_ml_g_iv_type_missing(): + with pytest.warns(UserWarning, match="ml_l and ml_g should be specified"): + PLR(obj_dml_data, ml_l, ml_m, score="IV-type") diff --git a/doubleml/plm/tests/test_plr_scalar_return_types.py b/doubleml/plm/tests/test_plr_scalar_return_types.py new file mode 100644 index 00000000..b6f25a71 --- /dev/null +++ b/doubleml/plm/tests/test_plr_scalar_return_types.py @@ -0,0 +1,123 @@ +import numpy as np +import pandas as pd +import pytest +from sklearn.linear_model import LinearRegression + +from doubleml.plm.datasets import make_plr_CCDDHNR2018 +from doubleml.plm.plr_scalar import PLR + +N_OBS = 200 +N_FOLDS = 3 +N_REP = 2 +N_REP_BOOT = 314 + +np.random.seed(3141) +obj_dml_data = make_plr_CCDDHNR2018(n_obs=N_OBS, dim_x=10, alpha=0.5) + + +@pytest.fixture(scope="module") +def fitted_dml_obj(): + np.random.seed(3141) + dml_obj = PLR(obj_dml_data, LinearRegression(), LinearRegression()) + dml_obj.draw_sample_splitting(n_folds=N_FOLDS, n_rep=N_REP) + dml_obj.fit() + dml_obj.bootstrap(n_rep_boot=N_REP_BOOT) + return dml_obj + + +@pytest.mark.ci +def test_coef_type_and_shape(fitted_dml_obj): + assert isinstance(fitted_dml_obj.coef, np.ndarray) + assert fitted_dml_obj.coef.shape == (1,) + + +@pytest.mark.ci +def test_se_type_and_shape(fitted_dml_obj): + assert isinstance(fitted_dml_obj.se, np.ndarray) + assert fitted_dml_obj.se.shape == (1,) + + +@pytest.mark.ci +def test_all_thetas_shape(fitted_dml_obj): + assert isinstance(fitted_dml_obj.all_thetas, np.ndarray) + assert fitted_dml_obj.all_thetas.shape == (1, N_REP) + + +@pytest.mark.ci +def test_all_coef_shape(fitted_dml_obj): + assert isinstance(fitted_dml_obj.all_coef, 
np.ndarray) + assert fitted_dml_obj.all_coef.shape == (1, N_REP) + + +@pytest.mark.ci +def test_all_ses_shape(fitted_dml_obj): + assert isinstance(fitted_dml_obj.all_ses, np.ndarray) + assert fitted_dml_obj.all_ses.shape == (1, N_REP) + + +@pytest.mark.ci +def test_summary_type(fitted_dml_obj): + assert isinstance(fitted_dml_obj.summary, pd.DataFrame) + assert fitted_dml_obj.summary.shape[0] == 1 + + +@pytest.mark.ci +def test_confint_type_and_shape(fitted_dml_obj): + ci = fitted_dml_obj.confint() + assert isinstance(ci, pd.DataFrame) + assert ci.shape == (1, 2) + + +@pytest.mark.ci +def test_confint_joint(fitted_dml_obj): + ci_joint = fitted_dml_obj.confint(joint=True) + assert isinstance(ci_joint, pd.DataFrame) + assert ci_joint.shape == (1, 2) + + +@pytest.mark.ci +def test_psi_shape(fitted_dml_obj): + assert isinstance(fitted_dml_obj.psi, np.ndarray) + assert fitted_dml_obj.psi.shape == (N_OBS, 1, N_REP) + + +@pytest.mark.ci +def test_predictions_type(fitted_dml_obj): + preds = fitted_dml_obj.predictions + assert isinstance(preds, dict) + assert "ml_l" in preds + assert "ml_m" in preds + assert preds["ml_l"].shape == (N_OBS, N_REP) + assert preds["ml_m"].shape == (N_OBS, N_REP) + + +@pytest.mark.ci +def test_smpls_type(fitted_dml_obj): + smpls = fitted_dml_obj.smpls + assert isinstance(smpls, list) + assert len(smpls) == N_REP + assert len(smpls[0]) == N_FOLDS + + +@pytest.mark.ci +def test_n_properties(fitted_dml_obj): + assert fitted_dml_obj.n_obs == N_OBS + assert fitted_dml_obj.n_folds == N_FOLDS + assert fitted_dml_obj.n_rep == N_REP + assert fitted_dml_obj.score == "partialling out" + + +@pytest.mark.ci +def test_str_repr(fitted_dml_obj): + assert isinstance(str(fitted_dml_obj), str) + assert isinstance(repr(fitted_dml_obj), str) + + +@pytest.mark.ci +def test_before_fit_raises(): + np.random.seed(3141) + dml_obj = PLR(obj_dml_data, LinearRegression(), LinearRegression()) + with pytest.raises(ValueError, match="framework is not yet initialized"): + _ = 
dml_obj.coef + with pytest.raises(ValueError, match="Predictions not available. Call fit"): + _ = dml_obj.predictions diff --git a/doubleml/plm/tests/test_plr_scalar_vs_plr.py b/doubleml/plm/tests/test_plr_scalar_vs_plr.py new file mode 100644 index 00000000..f87a1af5 --- /dev/null +++ b/doubleml/plm/tests/test_plr_scalar_vs_plr.py @@ -0,0 +1,82 @@ +"""Compare PLR against the existing DoubleMLPLR implementation.""" + +import numpy as np +import pytest +from sklearn.linear_model import Lasso, LinearRegression + +import doubleml as dml +from doubleml.plm.datasets import make_plr_CCDDHNR2018 +from doubleml.plm.plr_scalar import PLR + + +@pytest.fixture(scope="module", params=[LinearRegression(), Lasso(alpha=0.1)]) +def learner(request): + return request.param + + +@pytest.fixture(scope="module", params=["partialling out", "IV-type"]) +def score(request): + return request.param + + +@pytest.fixture(scope="module", params=[1, 3]) +def n_rep(request): + return request.param + + +@pytest.fixture(scope="module") +def comparison_fixture(learner, score, n_rep): + n_folds = 5 + seed = 3141 + + np.random.seed(42) + obj_dml_data = make_plr_CCDDHNR2018(n_obs=500, dim_x=20, alpha=0.5) + + # Old PLR + np.random.seed(seed) + dml_old = dml.DoubleMLPLR( + obj_dml_data, + learner, + learner, + learner, + n_folds=n_folds, + n_rep=n_rep, + score=score, + ) + dml_old.fit() + + # New PLR + np.random.seed(seed) + dml_new = PLR(obj_dml_data, learner, learner, learner, score=score) + dml_new.draw_sample_splitting(n_folds=n_folds, n_rep=n_rep) + dml_new.fit() + + return {"old": dml_old, "new": dml_new} + + +@pytest.mark.ci +def test_coef_equal(comparison_fixture): + old = comparison_fixture["old"] + new = comparison_fixture["new"] + np.testing.assert_allclose(new.coef, old.coef, rtol=1e-9) + + +@pytest.mark.ci +def test_se_equal(comparison_fixture): + old = comparison_fixture["old"] + new = comparison_fixture["new"] + np.testing.assert_allclose(new.se, old.se, rtol=1e-9) + + +@pytest.mark.ci 
+def test_all_coef_equal(comparison_fixture): + old = comparison_fixture["old"] + new = comparison_fixture["new"] + np.testing.assert_allclose(new.all_thetas, old.all_coef, rtol=1e-9) + + +@pytest.mark.ci +def test_all_se_equal(comparison_fixture): + old = comparison_fixture["old"] + new = comparison_fixture["new"] + np.testing.assert_allclose(new.all_ses, old.all_se, rtol=1e-9) From dad5e4c8a6851c316e6f92818b22ef6a4e8cda90 Mon Sep 17 00:00:00 2001 From: SvenKlaassen Date: Sun, 1 Feb 2026 15:42:19 +0100 Subject: [PATCH 04/38] fix external predictions for doublemlscalar --- doubleml/plm/plr_scalar.py | 23 ++-- .../test_plr_scalar_external_predictions.py | 103 ++++++++++++++++++ 2 files changed, 118 insertions(+), 8 deletions(-) create mode 100644 doubleml/plm/tests/test_plr_scalar_external_predictions.py diff --git a/doubleml/plm/plr_scalar.py b/doubleml/plm/plr_scalar.py index 2a5259f6..9a3b181c 100644 --- a/doubleml/plm/plr_scalar.py +++ b/doubleml/plm/plr_scalar.py @@ -94,7 +94,7 @@ def _initialize_predictions_dict(self): preds["ml_g"] = np.full((n_obs, n_rep), np.nan) return preds - def _nuisance_est(self, train_idx, test_idx, i_rep, i_fold): + def _nuisance_est(self, train_idx, test_idx, i_rep, i_fold, external_predictions=None): x = self._dml_data.x y = self._dml_data.y d = self._dml_data.d @@ -103,19 +103,26 @@ def _nuisance_est(self, train_idx, test_idx, i_rep, i_fold): y_train = y[train_idx] d_train = d[train_idx] + # Check which learners have external predictions + l_external = external_predictions is not None and "ml_l" in external_predictions + m_external = external_predictions is not None and "ml_m" in external_predictions + g_external = external_predictions is not None and "ml_g" in external_predictions + # Fit and predict ml_l: E[Y|X] - ml_l = clone(self._learner["ml_l"]) - ml_l.fit(x_train, y_train) - self._predictions["ml_l"][test_idx, i_rep] = ml_l.predict(x_test) + if not l_external: + ml_l = clone(self._learner["ml_l"]) + ml_l.fit(x_train, 
y_train) + self._predictions["ml_l"][test_idx, i_rep] = ml_l.predict(x_test) # Fit and predict ml_m: E[D|X] - ml_m = clone(self._learner["ml_m"]) - ml_m.fit(x_train, d_train) - self._predictions["ml_m"][test_idx, i_rep] = ml_m.predict(x_test) + if not m_external: + ml_m = clone(self._learner["ml_m"]) + ml_m.fit(x_train, d_train) + self._predictions["ml_m"][test_idx, i_rep] = ml_m.predict(x_test) # For IV-type: fit ml_g after last fold when all ml_l/ml_m predictions are available is_last_fold = i_fold == self.n_folds - 1 - if is_last_fold and "ml_g" in self._learner and "ml_g" in self._predictions: + if is_last_fold and "ml_g" in self._learner and "ml_g" in self._predictions and not g_external: # Compute initial theta from full cross-fitted predictions l_hat = self._predictions["ml_l"][:, i_rep] m_hat = self._predictions["ml_m"][:, i_rep] diff --git a/doubleml/plm/tests/test_plr_scalar_external_predictions.py b/doubleml/plm/tests/test_plr_scalar_external_predictions.py new file mode 100644 index 00000000..da6ac9ce --- /dev/null +++ b/doubleml/plm/tests/test_plr_scalar_external_predictions.py @@ -0,0 +1,103 @@ +import math + +import numpy as np +import pytest +from sklearn.linear_model import LinearRegression + +from doubleml import DoubleMLData +from doubleml.plm.datasets import make_plr_CCDDHNR2018 +from doubleml.plm.plr_scalar import PLR + + +@pytest.fixture(scope="module", params=["IV-type", "partialling out"]) +def plr_score(request): + return request.param + + +@pytest.fixture(scope="module", params=[1, 3]) +def n_rep(request): + return request.param + + +@pytest.fixture(scope="module", params=[True, False]) +def set_ml_m_ext(request): + return request.param + + +@pytest.fixture(scope="module", params=[True, False]) +def set_ml_l_ext(request): + return request.param + + +@pytest.fixture(scope="module", params=[True, False]) +def set_ml_g_ext(request): + return request.param + + +@pytest.fixture(scope="module") +def doubleml_plr_scalar_fixture(plr_score, n_rep, 
set_ml_m_ext, set_ml_l_ext, set_ml_g_ext): + n_folds = 3 + ext_predictions = {} + + x, y, d = make_plr_CCDDHNR2018(n_obs=500, dim_x=20, alpha=0.5, return_type="np.array") + + np.random.seed(3141) + dml_data = DoubleMLData.from_arrays(x=x, y=y, d=d) + + kwargs = {"obj_dml_data": dml_data, "score": plr_score} + if plr_score == "IV-type": + kwargs["ml_g"] = LinearRegression() + + # Fit reference model + dml_plr = PLR(ml_m=LinearRegression(), ml_l=LinearRegression(), **kwargs) + np.random.seed(3141) + dml_plr.draw_sample_splitting(n_folds=n_folds, n_rep=n_rep) + dml_plr.fit() + + # Build external predictions dict + if set_ml_m_ext: + ext_predictions["ml_m"] = dml_plr.predictions["ml_m"] + + if set_ml_l_ext: + ext_predictions["ml_l"] = dml_plr.predictions["ml_l"] + + if plr_score == "IV-type" and set_ml_g_ext: + ext_predictions["ml_g"] = dml_plr.predictions["ml_g"] + kwargs["ml_g"] = LinearRegression() + elif plr_score == "IV-type": + kwargs["ml_g"] = LinearRegression() + + # Fit model with external predictions + dml_plr_ext = PLR(ml_m=LinearRegression(), ml_l=LinearRegression(), **kwargs) + np.random.seed(3141) + dml_plr_ext.draw_sample_splitting(n_folds=n_folds, n_rep=n_rep) + dml_plr_ext.fit(external_predictions=ext_predictions if ext_predictions else None) + + res_dict = { + "coef_normal": dml_plr.coef[0], + "coef_ext": dml_plr_ext.coef[0], + "se_normal": dml_plr.se[0], + "se_ext": dml_plr_ext.se[0], + } + + return res_dict + + +@pytest.mark.ci +def test_doubleml_plr_scalar_coef(doubleml_plr_scalar_fixture): + assert math.isclose( + doubleml_plr_scalar_fixture["coef_normal"], + doubleml_plr_scalar_fixture["coef_ext"], + rel_tol=1e-9, + abs_tol=1e-4, + ) + + +@pytest.mark.ci +def test_doubleml_plr_scalar_se(doubleml_plr_scalar_fixture): + assert math.isclose( + doubleml_plr_scalar_fixture["se_normal"], + doubleml_plr_scalar_fixture["se_ext"], + rel_tol=1e-9, + abs_tol=1e-4, + ) From 5f0a1378599dc256713e71f88890d145931cf36c Mon Sep 17 00:00:00 2001 From: SvenKlaassen 
Date: Sun, 1 Feb 2026 19:07:06 +0100 Subject: [PATCH 05/38] Enhance PLR and DoubleMLScalar with learner management and validation - Introduced learner management in DoubleMLScalar with properties for learner names and instances. - Added abstract method `set_learners` to enforce learner setting in subclasses. - Updated PLR to utilize the new learner management system, including validation checks for learner instances. - Refactored tests to align with the new learner management approach, ensuring proper exception handling and validation. --- doubleml/double_ml_scalar.py | 84 ++++++++++++++++- doubleml/plm/plr_scalar.py | 92 +++++++++++-------- doubleml/plm/tests/test_plr_scalar.py | 9 +- .../plm/tests/test_plr_scalar_exceptions.py | 46 +++++++--- .../test_plr_scalar_external_predictions.py | 27 ++++-- .../plm/tests/test_plr_scalar_return_types.py | 12 ++- doubleml/plm/tests/test_plr_scalar_vs_plr.py | 3 +- doubleml/utils/_checks.py | 73 +++++++++++++++ 8 files changed, 278 insertions(+), 68 deletions(-) diff --git a/doubleml/double_ml_scalar.py b/doubleml/double_ml_scalar.py index d4b3cf36..c969ec84 100644 --- a/doubleml/double_ml_scalar.py +++ b/doubleml/double_ml_scalar.py @@ -76,6 +76,10 @@ def __init__( self._score = score + # Learner names (set by subclass) and learner storage (set via set_learners) + self._learner_names: List[str] = [] + self._learners: Dict[str, object] = {} + # Resampling parameters (set via draw_sample_splitting) self._n_folds: Optional[int] = None self._n_rep: Optional[int] = None @@ -178,6 +182,50 @@ def smpls(self) -> List: raise ValueError("Sample splitting has not been performed. Call draw_sample_splitting() first.") return self._smpls + @property + def learner_names(self) -> List[str]: + """ + Names of the required learners for this model. + + Returns + ------- + list of str + List of required learner names. 
+ """ + return self._learner_names + + @property + def learners(self) -> Dict[str, object]: + """ + The learners used for nuisance estimation. + + Returns + ------- + dict + Dictionary mapping learner names to estimator instances. + """ + return self._learners + + @abstractmethod + def set_learners(self, **kwargs) -> Self: + """ + Set the learners for nuisance estimation. + + Subclasses must implement this method with explicit keyword arguments + for each learner (e.g., ``ml_l``, ``ml_m``, ``ml_g`` for PLR). + + Parameters + ---------- + **kwargs + Learner keyword arguments specific to the subclass. + + Returns + ------- + self : Self + The estimator with learners set. + """ + pass + # ==================== Concrete fit() Method (Template) ==================== def fit( @@ -257,6 +305,9 @@ def fit_nuisance_models( if self._smpls is None: raise ValueError("Sample splitting has not been initialized. Call draw_sample_splitting() first.") + # Validate that all required learners are available + self._check_learners_available(external_predictions) + # Initialize prediction arrays self._predictions = self._initialize_predictions_dict() @@ -383,14 +434,41 @@ def _initialize_predictions_dict(self) -> Dict[str, np.ndarray]: """ Initialize dictionary for storing predictions. - Subclasses can override this to define their specific prediction storage structure. + Creates a prediction array of shape ``(n_obs, n_rep)`` for each learner + in :attr:`learner_names`, filled with ``NaN``. Subclasses can override + this for custom prediction storage. Returns ------- dict - Empty dictionary (subclasses should override). + Dictionary mapping learner names to NaN-filled arrays. + """ + n_obs = self._n_obs + n_rep = self.n_rep + return {name: np.full((n_obs, n_rep), np.nan) for name in self._learner_names} + + def _check_learners_available(self, external_predictions=None) -> None: """ - return {} + Validate that all required learners are set or covered by external predictions. 
+ + Parameters + ---------- + external_predictions : dict or None + External predictions that may cover some learners. + + Raises + ------ + ValueError + If a required learner is missing and not covered by external predictions. + """ + ext_keys = set(external_predictions.keys()) if external_predictions is not None else set() + + for name in self._learner_names: + if name not in self._learners and name not in ext_keys: + raise ValueError( + f"Learner '{name}' is required but not set and no external predictions provided for it. " + f"Call set_learners({name}=...) or provide external_predictions." + ) def _construct_framework(self) -> DoubleMLFramework: """ diff --git a/doubleml/plm/plr_scalar.py b/doubleml/plm/plr_scalar.py index 9a3b181c..b915bd76 100644 --- a/doubleml/plm/plr_scalar.py +++ b/doubleml/plm/plr_scalar.py @@ -9,6 +9,7 @@ from ..data.base_data import DoubleMLData from ..double_ml_linear_score import LinearScoreMixin +from ..utils._checks import _check_learner class PLR(LinearScoreMixin): @@ -20,16 +21,6 @@ class PLR(LinearScoreMixin): ---------- obj_dml_data : DoubleMLData The data object providing the data and specifying the variables for the causal model. - ml_l : estimator - A machine learner implementing ``fit()`` and ``predict()`` for the nuisance - function :math:`\\ell_0(X) = E[Y|X]`. - ml_m : estimator - A machine learner implementing ``fit()`` and ``predict()`` for the nuisance - function :math:`m_0(X) = E[D|X]`. - ml_g : estimator, optional - A machine learner implementing ``fit()`` and ``predict()`` for the nuisance - function :math:`g_0(X) = E[Y - D\\theta_0|X]`. - Only required for ``score='IV-type'``. score : str, optional The score function (``'partialling out'`` or ``'IV-type'``). Default is ``'partialling out'``. 
@@ -38,9 +29,6 @@ class PLR(LinearScoreMixin): def __init__( self, obj_dml_data, - ml_l, - ml_m, - ml_g=None, score="partialling out", ): # Validate data @@ -51,25 +39,57 @@ def __init__( if score not in valid_scores: raise ValueError(f"Invalid score '{score}'. Valid scores: {valid_scores}.") - # Store learners - self._learner = {"ml_l": clone(ml_l), "ml_m": clone(ml_m)} + super().__init__( + obj_dml_data=obj_dml_data, + score=score, + ) + + # Set required learner names based on score + self._learner_names = ["ml_l", "ml_m"] + if score == "IV-type": + self._learner_names.append("ml_g") + + def set_learners(self, ml_l=None, ml_m=None, ml_g=None): + """ + Set the learners for nuisance estimation. + + Parameters + ---------- + ml_l : estimator or None, optional + A machine learner implementing ``fit()`` and ``predict()`` for the nuisance + function :math:`\\ell_0(X) = E[Y|X]`. + ml_m : estimator or None, optional + A machine learner implementing ``fit()`` and ``predict()`` for the nuisance + function :math:`m_0(X) = E[D|X]`. + ml_g : estimator or None, optional + A machine learner implementing ``fit()`` and ``predict()`` for the nuisance + function :math:`g_0(X) = E[Y - D\\theta_0|X]`. + Only required for ``score='IV-type'``. + + Returns + ------- + self : PLR + The estimator with learners set. + """ + if ml_l is not None: + _check_learner(ml_l, "ml_l", regressor=True, classifier=True) + self._learners["ml_l"] = clone(ml_l) + + if ml_m is not None: + _check_learner(ml_m, "ml_m", regressor=True, classifier=True) + self._learners["ml_m"] = clone(ml_m) if ml_g is not None: - if score == "IV-type": - self._learner["ml_g"] = clone(ml_g) + if self.score == "IV-type": + _check_learner(ml_g, "ml_g", regressor=True, classifier=False) + self._learners["ml_g"] = clone(ml_g) else: warnings.warn( "A learner ml_g has been provided for score = 'partialling out' but will be ignored. " "A learner ml_g is not required for estimation." 
) - elif score == "IV-type": - warnings.warn("For score = 'IV-type', learners ml_l and ml_g should be specified. Set ml_g = clone(ml_l).") - self._learner["ml_g"] = clone(ml_l) - super().__init__( - obj_dml_data=obj_dml_data, - score=score, - ) + return self @staticmethod def _check_data(obj_dml_data): @@ -83,17 +103,6 @@ def _check_data(obj_dml_data): "To fit a partially linear IV regression model use DoubleMLPLIV instead of DoubleMLPLR." ) - def _initialize_predictions_dict(self): - n_obs = self._n_obs - n_rep = self.n_rep - preds = { - "ml_l": np.full((n_obs, n_rep), np.nan), - "ml_m": np.full((n_obs, n_rep), np.nan), - } - if "ml_g" in self._learner: - preds["ml_g"] = np.full((n_obs, n_rep), np.nan) - return preds - def _nuisance_est(self, train_idx, test_idx, i_rep, i_fold, external_predictions=None): x = self._dml_data.x y = self._dml_data.y @@ -110,19 +119,24 @@ def _nuisance_est(self, train_idx, test_idx, i_rep, i_fold, external_predictions # Fit and predict ml_l: E[Y|X] if not l_external: - ml_l = clone(self._learner["ml_l"]) + ml_l = clone(self._learners["ml_l"]) ml_l.fit(x_train, y_train) self._predictions["ml_l"][test_idx, i_rep] = ml_l.predict(x_test) # Fit and predict ml_m: E[D|X] if not m_external: - ml_m = clone(self._learner["ml_m"]) + ml_m = clone(self._learners["ml_m"]) ml_m.fit(x_train, d_train) self._predictions["ml_m"][test_idx, i_rep] = ml_m.predict(x_test) # For IV-type: fit ml_g after last fold when all ml_l/ml_m predictions are available is_last_fold = i_fold == self.n_folds - 1 - if is_last_fold and "ml_g" in self._learner and "ml_g" in self._predictions and not g_external: + if is_last_fold and self.score == "IV-type" and not g_external: + # If ml_g not explicitly set, default to clone of ml_l + if "ml_g" not in self._learners: + warnings.warn("For score = 'IV-type', learners ml_l and ml_g should be specified. 
Set ml_g = clone(ml_l).") + self._learners["ml_g"] = clone(self._learners["ml_l"]) + # Compute initial theta from full cross-fitted predictions l_hat = self._predictions["ml_l"][:, i_rep] m_hat = self._predictions["ml_m"][:, i_rep] @@ -133,7 +147,7 @@ def _nuisance_est(self, train_idx, test_idx, i_rep, i_fold, external_predictions # Second pass: fit ml_g with cross-fitting across all folds for j_fold in range(self.n_folds): train_j, test_j = self._smpls[i_rep][j_fold] - ml_g = clone(self._learner["ml_g"]) + ml_g = clone(self._learners["ml_g"]) ml_g.fit(x[train_j], y[train_j] - theta_initial * d[train_j]) self._predictions["ml_g"][test_j, i_rep] = ml_g.predict(x[test_j]) diff --git a/doubleml/plm/tests/test_plr_scalar.py b/doubleml/plm/tests/test_plr_scalar.py index 581121b2..db9eed6e 100644 --- a/doubleml/plm/tests/test_plr_scalar.py +++ b/doubleml/plm/tests/test_plr_scalar.py @@ -29,7 +29,11 @@ def dml_plr_scalar_fixture(learner, score): np.random.seed(3141) obj_dml_data = make_plr_CCDDHNR2018(n_obs=500, dim_x=20, alpha=theta) - dml_obj = PLR(obj_dml_data, ml_l, ml_m, ml_g, score=score) + dml_obj = PLR(obj_dml_data, score=score) + if score == "IV-type": + dml_obj.set_learners(ml_l=ml_l, ml_m=ml_m, ml_g=ml_g) + else: + dml_obj.set_learners(ml_l=ml_l, ml_m=ml_m) dml_obj.draw_sample_splitting(n_folds=n_folds) dml_obj.fit() @@ -60,7 +64,8 @@ def dml_plr_scalar_rep_fixture(): np.random.seed(3141) obj_dml_data = make_plr_CCDDHNR2018(n_obs=500, dim_x=20, alpha=theta) - dml_obj = PLR(obj_dml_data, LinearRegression(), LinearRegression()) + dml_obj = PLR(obj_dml_data) + dml_obj.set_learners(ml_l=LinearRegression(), ml_m=LinearRegression()) dml_obj.draw_sample_splitting(n_folds=n_folds, n_rep=n_rep) dml_obj.fit() diff --git a/doubleml/plm/tests/test_plr_scalar_exceptions.py b/doubleml/plm/tests/test_plr_scalar_exceptions.py index 5797191a..7cc74aac 100644 --- a/doubleml/plm/tests/test_plr_scalar_exceptions.py +++ b/doubleml/plm/tests/test_plr_scalar_exceptions.py @@ -24,26 
+24,26 @@ def test_plr_scalar_exception_data(): msg = r"The data must be of DoubleMLData type\." with pytest.raises(TypeError, match=msg): - PLR(pd.DataFrame(), ml_l, ml_m) + PLR(pd.DataFrame()) @pytest.mark.ci def test_plr_scalar_exception_instrument(): msg = r"Incompatible data\. .* have been set as instrumental variable\(s\)\." with pytest.raises(ValueError, match=msg): - PLR(dml_data_iv, ml_l, ml_m) + PLR(dml_data_iv) @pytest.mark.ci def test_plr_scalar_exception_score(): msg = r"Invalid score 'invalid'\." with pytest.raises(ValueError, match=msg): - PLR(obj_dml_data, ml_l, ml_m, score="invalid") + PLR(obj_dml_data, score="invalid") @pytest.mark.ci def test_plr_scalar_exception_n_folds(): - dml_obj = PLR(obj_dml_data, ml_l, ml_m) + dml_obj = PLR(obj_dml_data) msg = r"n_folds must be an integer >= 2\." with pytest.raises(ValueError, match=msg): dml_obj.draw_sample_splitting(n_folds=1) @@ -53,7 +53,7 @@ def test_plr_scalar_exception_n_folds(): @pytest.mark.ci def test_plr_scalar_exception_n_rep(): - dml_obj = PLR(obj_dml_data, ml_l, ml_m) + dml_obj = PLR(obj_dml_data) msg = r"n_rep must be an integer >= 1\." with pytest.raises(ValueError, match=msg): dml_obj.draw_sample_splitting(n_rep=0) @@ -61,7 +61,8 @@ def test_plr_scalar_exception_n_rep(): @pytest.mark.ci def test_plr_scalar_exception_fit_nuisance_without_smpls(): - dml_obj = PLR(obj_dml_data, ml_l, ml_m) + dml_obj = PLR(obj_dml_data) + dml_obj.set_learners(ml_l=ml_l, ml_m=ml_m) msg = r"Sample splitting has not been initialized\." with pytest.raises(ValueError, match=msg): dml_obj.fit_nuisance_models() @@ -69,7 +70,8 @@ def test_plr_scalar_exception_fit_nuisance_without_smpls(): @pytest.mark.ci def test_plr_scalar_exception_estimate_causal_without_predictions(): - dml_obj = PLR(obj_dml_data, ml_l, ml_m) + dml_obj = PLR(obj_dml_data) + dml_obj.set_learners(ml_l=ml_l, ml_m=ml_m) dml_obj.draw_sample_splitting() msg = r"Predictions not available\." 
with pytest.raises(ValueError, match=msg): @@ -78,11 +80,33 @@ def test_plr_scalar_exception_estimate_causal_without_predictions(): @pytest.mark.ci def test_plr_scalar_warning_ml_g_partialling_out(): + dml_obj = PLR(obj_dml_data, score="partialling out") with pytest.warns(UserWarning, match="will be ignored"): - PLR(obj_dml_data, ml_l, ml_m, ml_g, score="partialling out") + dml_obj.set_learners(ml_l=ml_l, ml_m=ml_m, ml_g=ml_g) @pytest.mark.ci -def test_plr_scalar_warning_ml_g_iv_type_missing(): - with pytest.warns(UserWarning, match="ml_l and ml_g should be specified"): - PLR(obj_dml_data, ml_l, ml_m, score="IV-type") +def test_plr_scalar_exception_missing_learner(): + dml_obj = PLR(obj_dml_data) + dml_obj.draw_sample_splitting() + msg = r"Learner 'ml_l' is required but not set" + with pytest.raises(ValueError, match=msg): + dml_obj.fit() + + +@pytest.mark.ci +def test_plr_scalar_exception_missing_learner_partial(): + dml_obj = PLR(obj_dml_data) + dml_obj.set_learners(ml_l=ml_l) + dml_obj.draw_sample_splitting() + msg = r"Learner 'ml_m' is required but not set" + with pytest.raises(ValueError, match=msg): + dml_obj.fit() + + +@pytest.mark.ci +def test_plr_scalar_exception_invalid_learner(): + dml_obj = PLR(obj_dml_data) + msg = r"Invalid learner provided for ml_l: provide an instance" + with pytest.raises(TypeError, match=msg): + dml_obj.set_learners(ml_l=Lasso) # class instead of instance diff --git a/doubleml/plm/tests/test_plr_scalar_external_predictions.py b/doubleml/plm/tests/test_plr_scalar_external_predictions.py index da6ac9ce..693d3b73 100644 --- a/doubleml/plm/tests/test_plr_scalar_external_predictions.py +++ b/doubleml/plm/tests/test_plr_scalar_external_predictions.py @@ -44,12 +44,12 @@ def doubleml_plr_scalar_fixture(plr_score, n_rep, set_ml_m_ext, set_ml_l_ext, se np.random.seed(3141) dml_data = DoubleMLData.from_arrays(x=x, y=y, d=d) - kwargs = {"obj_dml_data": dml_data, "score": plr_score} - if plr_score == "IV-type": - kwargs["ml_g"] = 
LinearRegression() - # Fit reference model - dml_plr = PLR(ml_m=LinearRegression(), ml_l=LinearRegression(), **kwargs) + dml_plr = PLR(dml_data, score=plr_score) + if plr_score == "IV-type": + dml_plr.set_learners(ml_l=LinearRegression(), ml_m=LinearRegression(), ml_g=LinearRegression()) + else: + dml_plr.set_learners(ml_l=LinearRegression(), ml_m=LinearRegression()) np.random.seed(3141) dml_plr.draw_sample_splitting(n_folds=n_folds, n_rep=n_rep) dml_plr.fit() @@ -63,12 +63,19 @@ def doubleml_plr_scalar_fixture(plr_score, n_rep, set_ml_m_ext, set_ml_l_ext, se if plr_score == "IV-type" and set_ml_g_ext: ext_predictions["ml_g"] = dml_plr.predictions["ml_g"] - kwargs["ml_g"] = LinearRegression() - elif plr_score == "IV-type": - kwargs["ml_g"] = LinearRegression() - # Fit model with external predictions - dml_plr_ext = PLR(ml_m=LinearRegression(), ml_l=LinearRegression(), **kwargs) + # Fit model with external predictions — only set learners that are needed + dml_plr_ext = PLR(dml_data, score=plr_score) + learner_kwargs = {} + if not set_ml_l_ext: + learner_kwargs["ml_l"] = LinearRegression() + if not set_ml_m_ext: + learner_kwargs["ml_m"] = LinearRegression() + if plr_score == "IV-type" and not set_ml_g_ext: + learner_kwargs["ml_g"] = LinearRegression() + if learner_kwargs: + dml_plr_ext.set_learners(**learner_kwargs) + np.random.seed(3141) dml_plr_ext.draw_sample_splitting(n_folds=n_folds, n_rep=n_rep) dml_plr_ext.fit(external_predictions=ext_predictions if ext_predictions else None) diff --git a/doubleml/plm/tests/test_plr_scalar_return_types.py b/doubleml/plm/tests/test_plr_scalar_return_types.py index b6f25a71..63e06cdd 100644 --- a/doubleml/plm/tests/test_plr_scalar_return_types.py +++ b/doubleml/plm/tests/test_plr_scalar_return_types.py @@ -18,7 +18,8 @@ @pytest.fixture(scope="module") def fitted_dml_obj(): np.random.seed(3141) - dml_obj = PLR(obj_dml_data, LinearRegression(), LinearRegression()) + dml_obj = PLR(obj_dml_data) + 
dml_obj.set_learners(ml_l=LinearRegression(), ml_m=LinearRegression()) dml_obj.draw_sample_splitting(n_folds=N_FOLDS, n_rep=N_REP) dml_obj.fit() dml_obj.bootstrap(n_rep_boot=N_REP_BOOT) @@ -107,6 +108,13 @@ def test_n_properties(fitted_dml_obj): assert fitted_dml_obj.score == "partialling out" +@pytest.mark.ci +def test_learner_names(fitted_dml_obj): + assert fitted_dml_obj.learner_names == ["ml_l", "ml_m"] + assert "ml_l" in fitted_dml_obj.learners + assert "ml_m" in fitted_dml_obj.learners + + @pytest.mark.ci def test_str_repr(fitted_dml_obj): assert isinstance(str(fitted_dml_obj), str) @@ -116,7 +124,7 @@ def test_str_repr(fitted_dml_obj): @pytest.mark.ci def test_before_fit_raises(): np.random.seed(3141) - dml_obj = PLR(obj_dml_data, LinearRegression(), LinearRegression()) + dml_obj = PLR(obj_dml_data) with pytest.raises(ValueError, match="framework is not yet initialized"): _ = dml_obj.coef with pytest.raises(ValueError, match="Predictions not available. Call fit"): diff --git a/doubleml/plm/tests/test_plr_scalar_vs_plr.py b/doubleml/plm/tests/test_plr_scalar_vs_plr.py index f87a1af5..15453c12 100644 --- a/doubleml/plm/tests/test_plr_scalar_vs_plr.py +++ b/doubleml/plm/tests/test_plr_scalar_vs_plr.py @@ -47,7 +47,8 @@ def comparison_fixture(learner, score, n_rep): # New PLR np.random.seed(seed) - dml_new = PLR(obj_dml_data, learner, learner, learner, score=score) + dml_new = PLR(obj_dml_data, score=score) + dml_new.set_learners(ml_l=learner, ml_m=learner, ml_g=learner) dml_new.draw_sample_splitting(n_folds=n_folds, n_rep=n_rep) dml_new.fit() diff --git a/doubleml/utils/_checks.py b/doubleml/utils/_checks.py index edc828fb..7db749dc 100644 --- a/doubleml/utils/_checks.py +++ b/doubleml/utils/_checks.py @@ -1,6 +1,7 @@ import warnings import numpy as np +from sklearn.base import is_classifier, is_regressor from sklearn.utils.multiclass import type_of_target from sklearn.utils.validation import has_fit_parameter @@ -513,6 +514,78 @@ def 
_check_sample_splitting(all_smpls, all_smpls_cluster, dml_data, is_cluster_d return smpls, smpls_cluster, n_rep, n_folds +def _check_learner(learner, learner_name, regressor=True, classifier=True): + """ + Validate that a learner has the required interface for DoubleML estimation. + + Parameters + ---------- + learner : object + The learner to validate. + learner_name : str + Name of the learner (for error messages). + regressor : bool + Whether regressors are accepted. Default is ``True``. + classifier : bool + Whether classifiers are accepted. Default is ``True``. + + Returns + ------- + bool + ``True`` if the learner is a classifier, ``False`` otherwise. + + Raises + ------ + TypeError + If the learner is a class instead of an instance, or lacks + required methods (fit, set_params, get_params, predict/predict_proba). + """ + err_msg_prefix = f"Invalid learner provided for {learner_name}: " + warn_msg_prefix = f"Learner provided for {learner_name} is probably invalid: " + + if isinstance(learner, type): + raise TypeError(err_msg_prefix + "provide an instance of a learner instead of a class.") + + if not hasattr(learner, "fit"): + raise TypeError(err_msg_prefix + f"{str(learner)} has no method .fit().") + if not hasattr(learner, "set_params"): + raise TypeError(err_msg_prefix + f"{str(learner)} has no method .set_params().") + if not hasattr(learner, "get_params"): + raise TypeError(err_msg_prefix + f"{str(learner)} has no method .get_params().") + + if regressor & classifier: + if is_classifier(learner): + learner_is_classifier = True + elif is_regressor(learner): + learner_is_classifier = False + else: + warnings.warn( + warn_msg_prefix + + f"{str(learner)} is (probably) neither a regressor nor a classifier. " + + "Method predict is used for prediction." 
+ ) + learner_is_classifier = False + elif classifier: + if not is_classifier(learner): + warnings.warn(warn_msg_prefix + f"{str(learner)} is (probably) no classifier.") + learner_is_classifier = True + else: + assert regressor # classifier, regressor or both must be True + if not is_regressor(learner): + warnings.warn(warn_msg_prefix + f"{str(learner)} is (probably) no regressor.") + learner_is_classifier = False + + # check existence of the prediction method + if learner_is_classifier: + if not hasattr(learner, "predict_proba"): + raise TypeError(err_msg_prefix + f"{str(learner)} has no method .predict_proba().") + else: + if not hasattr(learner, "predict"): + raise TypeError(err_msg_prefix + f"{str(learner)} has no method .predict().") + + return learner_is_classifier + + def _check_supports_sample_weights(learner, learner_name): if not has_fit_parameter(learner, "sample_weight"): raise ValueError( From 384beba220bc3a6d4bb7395d79ea98f62ecb529a Mon Sep 17 00:00:00 2001 From: SvenKlaassen Date: Sun, 1 Feb 2026 19:36:51 +0100 Subject: [PATCH 06/38] Add architecture documentation for DoubleMLScalar and class hierarchy --- doc/diagrams/architecture.md | 177 +++++++++++++++++++++++++++++++++++ 1 file changed, 177 insertions(+) create mode 100644 doc/diagrams/architecture.md diff --git a/doc/diagrams/architecture.md b/doc/diagrams/architecture.md new file mode 100644 index 00000000..4e531e9d --- /dev/null +++ b/doc/diagrams/architecture.md @@ -0,0 +1,177 @@ +# DoubleML Scalar Architecture + +## Class Hierarchy + +``` +DoubleMLBase (ABC) +│ Data storage, framework delegation (coef, se, summary, confint, bootstrap, ...) 
+│ +└── DoubleMLScalar (ABC) + │ Single-parameter estimation: fit(), draw_sample_splitting(), + │ fit_nuisance_models(), estimate_causal_parameters() + │ Learner management: set_learners(), _check_learners_available() + │ Prediction storage: _initialize_predictions_dict() + │ + ├── LinearScoreMixin + │ │ Implements _est_causal_pars_and_se() for linear scores + │ │ θ̂ = -E[ψ_b] / E[ψ_a] + │ │ + │ ├── PLR (partialling out, IV-type) + │ ├── PLIV (planned) + │ ├── IRM (planned) + │ └── DID (planned) + │ + └── NonLinearScoreMixin (planned) + │ Implements _est_causal_pars_and_se() via numerical root-finding + │ + └── ... +``` + +## UML Class Diagram + +``` +┌─────────────────────────────────────────┐ +│ DoubleMLBase (ABC) │ +├─────────────────────────────────────────┤ +│ - _dml_data: DoubleMLBaseData │ +│ - _n_obs: int │ +│ - _framework: DoubleMLFramework | None │ +├─────────────────────────────────────────┤ +│ + framework: DoubleMLFramework │ +│ + thetas / coef: np.ndarray │ +│ + all_thetas / all_coef: np.ndarray │ +│ + se: np.ndarray │ +│ + all_ses: np.ndarray │ +│ + summary: pd.DataFrame │ +│ + psi: np.ndarray │ +│ + n_obs: int │ +│ + confint() │ +│ + bootstrap() │ +│ + p_adjust() │ +│ + sensitivity_analysis() │ +│ «abstract» + fit() │ +│ «abstract» + n_rep: int │ +└─────────────────┬───────────────────────┘ + │ inherits +┌─────────────────▼───────────────────────┐ +│ DoubleMLScalar (ABC) │ +├─────────────────────────────────────────┤ +│ - _score: str │ +│ - _learner_names: List[str] │ +│ - _learners: Dict[str, object] │ +│ - _n_folds: int | None │ +│ - _n_rep: int | None │ +│ - _smpls: List | None │ +│ - _predictions: Dict | None │ +│ - _all_thetas: np.ndarray | None │ +│ - _all_ses: np.ndarray | None │ +│ - _psi: np.ndarray | None │ +│ - _psi_deriv: np.ndarray | None │ +│ - _var_scaling_factors: np.ndarray|None │ +├─────────────────────────────────────────┤ +│ + score: str │ +│ + n_folds: int │ +│ + n_rep: int │ +│ + predictions: Dict │ +│ + smpls: List │ +│ + 
learner_names: List[str] │ +│ + learners: Dict[str, object] │ +│ + fit(n_folds, n_rep, external_preds) │ +│ + fit_nuisance_models(external_preds) │ +│ + estimate_causal_parameters() │ +│ + draw_sample_splitting(n_folds, n_rep) │ +│ + _initialize_predictions_dict() │ +│ + _check_learners_available() │ +│ + _initialize_result_arrays() │ +│ + _construct_framework() │ +│ «abstract» + set_learners() │ +│ «abstract» + _nuisance_est() │ +│ «abstract» + _get_score_elements() │ +│ «abstract» + _est_causal_pars_and_se() │ +└──────────┬──────────────────────────────┘ + │ inherits +┌──────────▼──────────────────────────────┐ +│ LinearScoreMixin │ +├─────────────────────────────────────────┤ +│ (no additional state) │ +├─────────────────────────────────────────┤ +│ + _est_causal_pars_and_se(psi_elements) │ +│ → closed-form: θ̂ = -E[ψ_b]/E[ψ_a] │ +│ + _compute_score(psi_elements, coef) │ +│ + _score_element_names() → [psi_a,b] │ +└──────────┬──────────────────────────────┘ + │ inherits +┌──────────▼──────────────────────────────┐ +│ PLR │ +├─────────────────────────────────────────┤ +│ _learner_names = [ml_l, ml_m(, ml_g)] │ +├─────────────────────────────────────────┤ +│ + __init__(obj_dml_data, score) │ +│ + set_learners(ml_l, ml_m, ml_g) │ +│ + _check_data() │ +│ + _nuisance_est(train, test, i_rep, ..) 
│ +│ + _get_score_elements() → {psi_a,psi_b}│ +└─────────────────────────────────────────┘ +``` + +## Method Resolution & Workflow + +The `fit()` call follows the template method pattern: + +``` +PLR.fit() + │ + ├─ DoubleMLScalar.draw_sample_splitting() ← if not already done + │ └─ DoubleMLResampling.split_samples() + │ + ├─ DoubleMLScalar.fit_nuisance_models() + │ ├─ DoubleMLScalar._check_learners_available() + │ ├─ DoubleMLScalar._initialize_predictions_dict() ← uses _learner_names + │ └─ loop(n_rep × n_folds): + │ └─ PLR._nuisance_est() ← subclass implements + │ + └─ DoubleMLScalar.estimate_causal_parameters() + ├─ DoubleMLScalar._initialize_result_arrays() + ├─ PLR._get_score_elements() ← subclass implements + ├─ LinearScoreMixin._est_causal_pars_and_se() ← mixin implements + └─ DoubleMLScalar._construct_framework() + └─ DoubleMLFramework(...) +``` + +## Typical User Workflow + +```python +# 1. Define model (data + score) +plr = PLR(obj_dml_data, score="partialling out") + +# 2. Set learners +plr.set_learners(ml_l=RandomForestRegressor(), ml_m=RandomForestRegressor()) + +# 3. Draw sample splitting +plr.draw_sample_splitting(n_folds=5, n_rep=1) + +# 4. Fit +plr.fit() + +# 5. 
Results (delegated to DoubleMLFramework via DoubleMLBase) +print(plr.summary) +plr.confint() +plr.bootstrap() +``` + +## What Each Layer Provides + +| Layer | Responsibilities | +|---|---| +| **DoubleMLBase** | Data storage, framework delegation (coef, se, summary, confint, bootstrap, p_adjust, sensitivity_analysis) | +| **DoubleMLScalar** | Single-parameter fit orchestration, sample splitting, learner management (`_learner_names`, `_learners`, `set_learners`, `_check_learners_available`), prediction storage, result array initialization, framework construction | +| **LinearScoreMixin** | Closed-form parameter estimation for linear scores: `θ̂ = -E[ψ_b]/E[ψ_a]`, SE computation, influence function | +| **PLR** | PLR-specific: data validation, learner names (`ml_l`, `ml_m`, `ml_g`), nuisance estimation logic, score element computation | + +## Key Design Decisions + +- **Learners separated from constructor**: `__init__` takes only data + score; learners are set via `set_learners()` with explicit kwargs per subclass +- **`_learner_names` as single source of truth**: Drives `_initialize_predictions_dict()` and `_check_learners_available()` — subclasses just set the list +- **Resampling separated from constructor**: `draw_sample_splitting()` is a separate step, can be called independently +- **External predictions**: Passed to `fit()` / `fit_nuisance_models()`, validated against `_learner_names`, pre-filled before cross-fitting loop +- **Template method pattern**: `fit()` orchestrates; subclasses implement `_nuisance_est()` and `_get_score_elements()`; mixin implements `_est_causal_pars_and_se()` From 838d0ca742f7122fc326d17db2da648eb2027279 Mon Sep 17 00:00:00 2001 From: SvenKlaassen Date: Tue, 3 Feb 2026 18:47:55 +0100 Subject: [PATCH 07/38] Add code simplifier and technical debt finder documentation --- .claude/CLAUDE.md | 222 ++++++++++++++++++++++++ .claude/skills/code-simplifier/SKILL.md | 178 +++++++++++++++++++ .claude/skills/techdebt/SKILL.md | 125 +++++++++++++ 
3 files changed, 525 insertions(+) create mode 100644 .claude/CLAUDE.md create mode 100644 .claude/skills/code-simplifier/SKILL.md create mode 100644 .claude/skills/techdebt/SKILL.md diff --git a/.claude/CLAUDE.md b/.claude/CLAUDE.md new file mode 100644 index 00000000..6c302c11 --- /dev/null +++ b/.claude/CLAUDE.md @@ -0,0 +1,222 @@ +# DoubleML for Python - Claude Code Memory + +## Project Purpose + +DoubleML is a Python package implementing Double/Debiased Machine Learning (DML) methods for causal inference. The package provides: +- Partially Linear Models (PLR, PLIV, PLPR, LPLR) +- Interactive Regression Models (IRM, IIVM, APO, QTE, CVAR, SSM) +- Difference-in-Differences estimators (DID, DIDCSBinary, DIDMulti) +- Regression Discontinuity Design (RDD) + +**Documentation**: https://docs.doubleml.org + +## Coding Standards + +### Python +- **Version**: Python 3.11+ (supports 3.11, 3.12, 3.13) +- **Formatter**: black with line-length 127 +- **Linter**: ruff (rules: E, F, W, I) +- **Type Checker**: mypy with `disallow_untyped_defs = true` +- **Type hints**: Required for all functions +- **Docstrings**: NumPy-style (see example below) +- **Max line length**: 127 characters + +### NumPy Docstring Style +```python +def example_function(param1: int, param2: str) -> bool: + """ + Short description of the function. + + Parameters + ---------- + param1 : int + Description of param1. + param2 : str + Description of param2. + + Returns + ------- + bool + Description of return value. + + Raises + ------ + ValueError + If param1 is negative. + """ +``` + +### Code Quality Commands +```bash +# Format code +black . + +# Lint code +ruff check . + +# Fix linting issues +ruff check --fix . 
+ +# Type check +mypy doubleml +``` + +### Pre-commit Hooks +Pre-commit is configured with: +- File format checks (yaml, toml) +- Debug statement detection +- Large file checks +- Trailing whitespace and line ending fixes +- black formatting +- ruff linting with auto-fix + +Run pre-commit manually: `pre-commit run --all-files` + +## Architecture Overview + +### Class Hierarchy +``` +DoubleMLBase (ABC) +└─> DoubleMLScalar (ABC) - single-parameter models + ├─> LinearScoreMixin - closed-form solver + │ ├─> DoubleMLPLR + │ ├─> DoubleMLIRM + │ ├─> DoubleMLPLIV + │ ├─> DoubleMLIIVM + │ └─> DoubleML DID variants + └─> NonLinearScoreMixin - numerical solver (planned) + +DoubleML - multi-parameter estimation (extends DoubleMLScalar) +``` + +### Key Design Patterns +- **Template Method**: `fit()` orchestrates; subclasses implement abstract methods +- **Mixin Pattern**: LinearScoreMixin provides closed-form θ = -E[ψ_b]/E[ψ_a] +- **Delegation**: DoubleMLBase delegates inference to DoubleMLFramework + +### Core Files +| File | Purpose | +|------|---------| +| `doubleml/double_ml_base.py` | Abstract base with properties (coef, se, summary) and inference methods | +| `doubleml/double_ml_scalar.py` | Single-parameter estimation orchestrator | +| `doubleml/double_ml.py` | Multi-parameter estimation with sample splitting | +| `doubleml/double_ml_framework.py` | Statistical inference (confint, bootstrap, sensitivity) | +| `doubleml/double_ml_linear_score.py` | Linear score mixin | + +### Package Structure +``` +doubleml/ +├── data/ # Data containers (DoubleMLData, DoubleMLDIDData, etc.) +├── plm/ # Partially Linear Models (PLR, PLIV, PLPR, LPLR) +├── irm/ # Interactive Regression Models (IRM, IIVM, APO, QTE, etc.) 
+├── did/ # Difference-in-Differences estimators +├── rdd/ # Regression Discontinuity Design +├── utils/ # Helpers (_checks, _estimation, resampling, tuning) +└── tests/ # Main test directory +``` + +## Testing + +### Run Tests +```bash +# Run all tests +pytest + +# Run with coverage +pytest --cov + +# Run specific marker (CI tests) +pytest -m ci + +# Run specific test file +pytest doubleml/tests/test_framework.py + +# Run tests for a specific module +pytest doubleml/plm/tests/ +``` + +### Test Markers +- `ci`: Continuous integration tests for GitHub Actions +- `ci_rdd`: RDD-specific CI tests + +### Test Organization +- Each module (plm, irm, did) has its own `tests/` subdirectory +- Test utilities in `doubleml/tests/_utils*.py` +- Manual computation helpers verify results independently + +## Git Workflow + +### Branches +- `main`: Main development branch +- Feature branches for new work + +### Commit Format +Use Conventional Commits: +- `feat:` new feature +- `fix:` bug fix +- `docs:` documentation +- `refactor:` code refactoring +- `test:` adding tests +- `chore:` maintenance + +## Key Dependencies + +### Core +- numpy>=2.0.0, pandas>=2.0.0, scipy>=1.7.0 +- scikit-learn>=1.6.0, statsmodels>=0.14.0 + +### ML/Tuning +- optuna>=4.6.0 (hyperparameter tuning) +- joblib>=1.2.0 (parallelization) + +### Visualization +- matplotlib>=3.9.0, seaborn>=0.13, plotly>=5.0.0 + +### Development +- pytest>=8.3.0, pytest-cov>=6.0.0 +- black>=25.1.0, ruff>=0.11.1, mypy>=1.18.0 +- xgboost>=2.1.0, lightgbm>=4.6.0 (for testing) + +## Known Pitfalls + +### Type Annotations +- MyPy is strict: `disallow_untyped_defs = true` +- All functions need full type hints including return types +- Use `from __future__ import annotations` for forward references + +### Learner Validation +- Learners must be scikit-learn compatible (fit/predict interface) +- Use `_check_learner()` from `doubleml/utils/_checks.py` for validation +- Classifiers need `predict_proba()` for propensity scores + +### Sample 
Splitting +- Cross-fitting uses `DoubleMLResampling` from `doubleml/utils/resampling.py` +- Default is 5-fold cross-fitting with 1 repetition +- Cluster-robust resampling available for clustered data + +### Score Functions +- Linear scores use closed-form: θ = -E[ψ_b]/E[ψ_a] +- Custom scores can be passed as callables +- Score elements: `psi_a` (derivative), `psi_b` (moment) + +### External Predictions +- Models accept external predictions via the `external_predictions` argument of `fit()` +- Predictions must match sample splitting structure + +## Verification + +Before completing any task: +1. Run `ruff check .` to check for linting issues +2. Run `mypy doubleml` for type checking +3. Run relevant tests: `pytest doubleml/path/to/tests/` +4. Format code: `black .` + +## Useful Links + +- **Documentation**: https://docs.doubleml.org +- **Source**: https://github.com/DoubleML/doubleml-for-py +- **Bug Tracker**: https://github.com/DoubleML/doubleml-for-py/issues +- **Architecture Docs**: [doc/diagrams/architecture.md](doc/diagrams/architecture.md) + +--- +*Update this file when Claude makes mistakes to prevent future issues.* diff --git a/.claude/skills/code-simplifier/SKILL.md b/.claude/skills/code-simplifier/SKILL.md new file mode 100644 index 00000000..583b70ae --- /dev/null +++ b/.claude/skills/code-simplifier/SKILL.md @@ -0,0 +1,178 @@ +--- +name: code-simplifier +description: Simplify and clean up DoubleML code after changes. Reduces complexity, improves readability, ensures NumPy-style docstrings and type hints. +--- + +# Code Simplifier for DoubleML + +Clean up and simplify code after making changes. + +## When to Use + +Run after completing a feature or fix to ensure code is clean, readable, and follows DoubleML patterns.
+ +## Simplification Goals + +### Reduce Complexity +- Break long functions into smaller, focused ones (target: <50 lines) +- Reduce nesting depth (max 3 levels) +- Simplify complex conditionals +- Extract magic numbers to named constants (e.g., `DEFAULT_N_FOLDS = 5`) + +### Improve Readability +- Use descriptive variable and function names +- Add clarifying comments for non-obvious logic +- Ensure consistent formatting (127 char line limit) +- Remove unnecessary comments + +### Apply Pythonic Patterns +- Use list/dict/set comprehensions where appropriate +- Use `with` statements for resource management +- Use `enumerate()` instead of manual indexing +- Use `zip()` for parallel iteration +- Use f-strings for formatting +- Use `pathlib` for file paths +- Use `is None` / `is not None` instead of `== None` + +### DoubleML-Specific Patterns +- Use `clone()` for sklearn learners instead of direct copy +- Use `_check_learner()` for learner validation +- Use `_check_score()` for score function validation +- Consistent `psi_a`/`psi_b` naming for score elements +- Use `DoubleMLResampling` for sample splitting logic +- Prefer numpy operations over Python loops for arrays + +### Type Hints (Python 3.11+) +- Use built-in generics: `list[int]` not `typing.List[int]` +- Use `X | None` instead of `Optional[X]` +- Use `X | Y` instead of `Union[X, Y]` +- Add `from __future__ import annotations` for forward references +- Ensure all public functions have complete type hints + +### NumPy-Style Docstrings +- Ensure `Parameters` section lists all arguments +- Ensure `Returns` section describes return value +- Add `Raises` section for exceptions +- Use `:class:` references for DoubleML types + +### Clean Up +- Remove unused imports +- Remove unused variables +- Remove commented-out code +- Remove redundant code paths +- Consolidate duplicate logic + +## Workflow + +1. 
**Identify Changed Files** + ```bash + git diff --name-only HEAD~1 # Recent changes + git status --short # Uncommitted changes + ``` + +2. **Analyze Each File** + - Check for simplification opportunities + - Prioritize high-impact improvements + +3. **Apply Simplifications** + - Make incremental changes + - Preserve original behavior + - Run tests after each change + +4. **Format and Lint** + ```bash + black . + ruff check --fix . + ``` + +5. **Type Check** + ```bash + mypy doubleml + ``` + +6. **Verify** + ```bash + pytest -m ci + ``` + +## Arguments + +Optionally specify files or directories to simplify. + +Usage: +- `/code-simplifier` - Simplify recently changed files +- `/code-simplifier doubleml/plm/plr.py` - Simplify specific file +- `/code-simplifier doubleml/utils/` - Simplify entire directory + +## Example Transformations + +### Loop to Comprehension +```python +# Before +result = [] +for i in range(len(items)): + if items[i].is_valid == True: + result.append(items[i].value) + +# After +result = [item.value for item in items if item.is_valid] +``` + +### Flatten Nesting +```python +# Before +if x != None: + if y != None: + if z != None: + process(x, y, z) + +# After +if all(v is not None for v in (x, y, z)): + process(x, y, z) +``` + +### Modern Type Hints +```python +# Before +from typing import List, Optional, Union, Dict + +def process(items: List[int], config: Optional[Dict[str, object]] = None) -> Union[int, None]: + ... + +# After +def process(items: list[int], config: dict[str, object] | None = None) -> int | None: + ...
+``` + +### NumPy Operations +```python +# Before +result = [] +for i in range(len(predictions)): + result.append(predictions[i] - true_values[i]) +result = np.array(result) + +# After +result = predictions - true_values +``` + +### DoubleML Learner Pattern +```python +# Before +ml_l_copy = copy.deepcopy(ml_l) + +# After +from sklearn.base import clone +ml_l_copy = clone(ml_l) +``` + +### Score Element Naming +```python +# Before +def _get_score_elements(self, ...): + return {"a": psi_derivative, "b": psi_moment} + +# After +def _get_score_elements(self, ...): + return {"psi_a": psi_derivative, "psi_b": psi_moment} +``` diff --git a/.claude/skills/techdebt/SKILL.md b/.claude/skills/techdebt/SKILL.md new file mode 100644 index 00000000..af0e4b6a --- /dev/null +++ b/.claude/skills/techdebt/SKILL.md @@ -0,0 +1,125 @@ +--- +name: techdebt +description: Find and fix technical debt in DoubleML codebase. Checks for code smells, type issues, style violations, and outdated patterns. +--- + +# Technical Debt Finder for DoubleML + +Identify and fix technical debt aligned with project standards. 
+ +## Project-Specific Checks + +### Type Annotations (MyPy Strict Mode) +- Missing type hints on functions (`disallow_untyped_defs = true`) +- Missing return type annotations +- Use of `Any` where specific types are possible +- Old-style typing (`typing.List` → `list`, `typing.Dict` → `dict`) +- Missing `from __future__ import annotations` for forward references + +### Docstrings (NumPy Style) +- Missing docstrings on public functions/classes +- Incorrect docstring format (must be NumPy-style) +- Missing `Parameters`, `Returns`, or `Raises` sections +- Outdated parameter documentation + +### Code Style (Black + Ruff) +- Lines exceeding 127 characters +- Import ordering issues (ruff rule I) +- Unused imports (ruff rule F401) +- Undefined names (ruff rule F821) +- Old-style string formatting (use f-strings) + +### Scikit-learn Compatibility +- Learners missing `fit()`/`predict()` interface +- Classifiers missing `predict_proba()` for propensity scores +- Missing `clone()` compatibility + +### DoubleML Patterns +- Inconsistent use of `_check_learner()` for validation +- Missing score function validation with `_check_score()` +- Incorrect sample splitting structure +- Missing `psi_a`/`psi_b` score elements + +## Workflow + +1. **Run Automated Checks** + ```bash + # Type checking + mypy doubleml + + # Linting + ruff check . + + # Format check (dry-run) + black --check . + ``` + +2. **Scan for Code Smells** + - Functions longer than 50 lines + - More than 5 parameters + - Deep nesting (> 3 levels) + - Duplicate code blocks + - Magic numbers without constants + +3. **Check for Dead Code** + - Unused imports + - Unused functions/classes + - Commented-out code blocks + - Unreachable code paths + +4. **Report Findings** + Format: `file_path:line_number - [severity] description` + +5. **Fix Issues** + - Auto-fix with `ruff check --fix .` + - Auto-format with `black .` + - Manual fixes for type hints and docstrings + +6. **Verify** + ```bash + ruff check . 
+ mypy doubleml + pytest -m ci # Run CI tests + ``` + +## Severity Levels + +| Severity | Description | Examples | +|----------|-------------|----------| +| **high** | Breaks CI or type safety | Missing type hints, mypy errors | +| **medium** | Style violations | Line length, import order | +| **low** | Code smells | Long functions, magic numbers | + +## Arguments + +Specify scope to focus the scan: + +- `/techdebt` - Scan entire `doubleml/` package +- `/techdebt doubleml/plm/` - Scan PLM module +- `/techdebt doubleml/utils/_checks.py` - Scan specific file + +## Output Format + +```markdown +## Technical Debt Report + +### High Severity +- `doubleml/plm/plr.py:45` - Missing return type annotation +- `doubleml/utils/_checks.py:123` - Type hint uses `typing.List` + +### Medium Severity +- `doubleml/did/did.py:89` - Line exceeds 127 characters +- `doubleml/irm/irm.py:12` - Unused import `warnings` + +### Low Severity +- `doubleml/double_ml.py:234` - Function has 67 lines (>50) +- `doubleml/utils/resampling.py:45` - Magic number `5` should be constant + +### Fixed +- ✓ Auto-fixed 3 import ordering issues +- ✓ Auto-formatted 2 files with black + +### Remaining +- 2 high severity items need manual fixes +- Consider refactoring `_nuisance_est()` in next session +``` From 54e9eb469c452bb7c515ba4784e17d0e0f184d1d Mon Sep 17 00:00:00 2001 From: SvenKlaassen Date: Fri, 6 Feb 2026 11:29:36 +0100 Subject: [PATCH 08/38] Enhance DoubleMLScalar and PLR with learner management, validation, and utility functions --- doubleml/double_ml_scalar.py | 112 ++++++++-- doubleml/plm/plr_scalar.py | 157 ++++++++++---- .../plm/tests/test_plr_scalar_exceptions.py | 2 +- .../plm/tests/test_plr_scalar_return_types.py | 29 ++- doubleml/utils/_learner.py | 201 ++++++++++++++++++ 5 files changed, 448 insertions(+), 53 deletions(-) create mode 100644 doubleml/utils/_learner.py diff --git a/doubleml/double_ml_scalar.py b/doubleml/double_ml_scalar.py index c969ec84..a0010005 100644 --- 
a/doubleml/double_ml_scalar.py +++ b/doubleml/double_ml_scalar.py @@ -3,7 +3,7 @@ """ from abc import ABC, abstractmethod -from typing import Dict, List, Optional, Self +from typing import ClassVar, Dict, List, Optional, Self import numpy as np @@ -11,6 +11,7 @@ from .double_ml_base import DoubleMLBase from .double_ml_framework import DoubleMLCore as DoubleMLCoreData from .double_ml_framework import DoubleMLFramework +from .utils._learner import LearnerInfo, LearnerSpec, validate_learner from .utils.resampling import DoubleMLResampling @@ -43,6 +44,9 @@ class DoubleMLScalar(DoubleMLBase, ABC): The score function being used. """ + # Subclasses define all possible learners for the model + _LEARNER_SPECS: ClassVar[Dict[str, LearnerSpec]] + def __init__( self, obj_dml_data: DoubleMLBaseData, @@ -76,9 +80,8 @@ def __init__( self._score = score - # Learner names (set by subclass) and learner storage (set via set_learners) - self._learner_names: List[str] = [] - self._learners: Dict[str, object] = {} + # Learner storage: single dict for all learner state + self._learners: Dict[str, LearnerInfo] = {} # Resampling parameters (set via draw_sample_splitting) self._n_folds: Optional[int] = None @@ -183,31 +186,112 @@ def smpls(self) -> List: return self._smpls @property - def learner_names(self) -> List[str]: + @abstractmethod + def required_learners(self) -> List[str]: """ - Names of the required learners for this model. + Names of the required learners for current configuration. + + Subclasses implement this as a property that returns the learner names + needed based on the current score function or model configuration. Returns ------- list of str List of required learner names. """ - return self._learner_names + pass @property def learners(self) -> Dict[str, object]: """ - The learners used for nuisance estimation. + Access registered learner objects by name. Returns ------- dict Dictionary mapping learner names to estimator instances. 
""" - return self._learners + return {name: info.learner for name, info in self._learners.items()} + + def get_params(self, learner_name: str) -> Dict: + """ + Get parameters of a registered learner. + + Parameters + ---------- + learner_name : str + Name of the learner. + + Returns + ------- + dict + Dictionary of learner parameters. + + Raises + ------ + ValueError + If the learner is not registered. + """ + if learner_name not in self._learners: + raise ValueError(f"Learner '{learner_name}' not registered.") + return self._learners[learner_name].learner.get_params() + + def set_params(self, learner_name: str, **params: object) -> Self: + """ + Set parameters of a registered learner. + + Parameters + ---------- + learner_name : str + Name of the learner. + **params + Parameters to set on the learner. + + Returns + ------- + self : Self + The estimator with updated learner parameters. + + Raises + ------ + ValueError + If the learner is not registered. + """ + if learner_name not in self._learners: + raise ValueError(f"Learner '{learner_name}' not registered.") + self._learners[learner_name].learner.set_params(**params) + return self + + def _register_learner(self, name: str, learner: object) -> None: + """ + Validate and register a single learner. + + Parameters + ---------- + name : str + Name of the learner (must be in _LEARNER_SPECS). + learner : object + The learner instance to register. + + Raises + ------ + ValueError + If the learner name is not defined in _LEARNER_SPECS. + """ + if name not in self._LEARNER_SPECS: + raise ValueError(f"Learner '{name}' not defined for this model.") + + spec = self._LEARNER_SPECS[name] + info = validate_learner( + learner, + spec, + binary_outcome=self._dml_data.binary_outcome, + binary_treatment=self._dml_data.binary_treats.all(), + ) + self._learners[name] = info @abstractmethod - def set_learners(self, **kwargs) -> Self: + def set_learners(self, **kwargs: object) -> Self: """ Set the learners for nuisance estimation. 
@@ -435,7 +519,7 @@ def _initialize_predictions_dict(self) -> Dict[str, np.ndarray]: Initialize dictionary for storing predictions. Creates a prediction array of shape ``(n_obs, n_rep)`` for each learner - in :attr:`learner_names`, filled with ``NaN``. Subclasses can override + in :attr:`required_learners`, filled with ``NaN``. Subclasses can override this for custom prediction storage. Returns @@ -445,9 +529,9 @@ def _initialize_predictions_dict(self) -> Dict[str, np.ndarray]: """ n_obs = self._n_obs n_rep = self.n_rep - return {name: np.full((n_obs, n_rep), np.nan) for name in self._learner_names} + return {name: np.full((n_obs, n_rep), np.nan) for name in self.required_learners} - def _check_learners_available(self, external_predictions=None) -> None: + def _check_learners_available(self, external_predictions: Optional[Dict[str, np.ndarray]] = None) -> None: """ Validate that all required learners are set or covered by external predictions. @@ -463,7 +547,7 @@ def _check_learners_available(self, external_predictions=None) -> None: """ ext_keys = set(external_predictions.keys()) if external_predictions is not None else set() - for name in self._learner_names: + for name in self.required_learners: if name not in self._learners and name not in ext_keys: raise ValueError( f"Learner '{name}' is required but not set and no external predictions provided for it. " diff --git a/doubleml/plm/plr_scalar.py b/doubleml/plm/plr_scalar.py index b915bd76..ef18fb68 100644 --- a/doubleml/plm/plr_scalar.py +++ b/doubleml/plm/plr_scalar.py @@ -2,14 +2,17 @@ Partially Linear Regression (PLR) model based on the new DoubleMLScalar hierarchy. 
""" +from __future__ import annotations + import warnings +from typing import Dict, List, Optional, Self import numpy as np from sklearn.base import clone from ..data.base_data import DoubleMLData from ..double_ml_linear_score import LinearScoreMixin -from ..utils._checks import _check_learner +from ..utils._learner import LearnerSpec, predict_nuisance class PLR(LinearScoreMixin): @@ -21,16 +24,48 @@ class PLR(LinearScoreMixin): ---------- obj_dml_data : DoubleMLData The data object providing the data and specifying the variables for the causal model. - score : str, optional + score : str The score function (``'partialling out'`` or ``'IV-type'``). Default is ``'partialling out'``. + ml_l : estimator, optional + Learner for E[Y|X]. Can be regressor or classifier. + ml_m : estimator, optional + Learner for E[D|X]. Can be regressor or classifier. + ml_g : estimator, optional + Learner for E[Y - D*theta|X]. Only for IV-type. Must be regressor. """ + # Define learner specifications for PLR + _LEARNER_SPECS: Dict[str, LearnerSpec] = { + "ml_l": LearnerSpec("ml_l", allow_regressor=True, allow_classifier=True, binary_data_check="outcome"), + "ml_m": LearnerSpec("ml_m", allow_regressor=True, allow_classifier=True, binary_data_check="treatment"), + "ml_g": LearnerSpec("ml_g", allow_regressor=True, allow_classifier=False), + } + def __init__( self, - obj_dml_data, - score="partialling out", + obj_dml_data: DoubleMLData, + score: str = "partialling out", + ml_l: Optional[object] = None, + ml_m: Optional[object] = None, + ml_g: Optional[object] = None, ): + """ + Initialize PLR model. + + Parameters + ---------- + obj_dml_data : DoubleMLData + The data object. + score : str + Score function ('partialling out' or 'IV-type'). + ml_l : estimator, optional + Learner for E[Y|X]. Can be regressor or classifier. + ml_m : estimator, optional + Learner for E[D|X]. Can be regressor or classifier. + ml_g : estimator, optional + Learner for E[Y - D*theta|X]. Only for IV-type. 
Must be regressor. + """ # Validate data self._check_data(obj_dml_data) @@ -44,12 +79,24 @@ def __init__( score=score, ) - # Set required learner names based on score - self._learner_names = ["ml_l", "ml_m"] - if score == "IV-type": - self._learner_names.append("ml_g") + # Set learners if provided + if any(learner is not None for learner in [ml_l, ml_m, ml_g]): + self.set_learners(ml_l=ml_l, ml_m=ml_m, ml_g=ml_g) + + @property + def required_learners(self) -> List[str]: + """Required learners for current score.""" + names = ["ml_l", "ml_m"] + if self.score == "IV-type": + names.append("ml_g") + return names - def set_learners(self, ml_l=None, ml_m=None, ml_g=None): + def set_learners( + self, + ml_l: Optional[object] = None, + ml_m: Optional[object] = None, + ml_g: Optional[object] = None, + ) -> Self: """ Set the learners for nuisance estimation. @@ -71,26 +118,48 @@ def set_learners(self, ml_l=None, ml_m=None, ml_g=None): self : PLR The estimator with learners set. """ - if ml_l is not None: - _check_learner(ml_l, "ml_l", regressor=True, classifier=True) - self._learners["ml_l"] = clone(ml_l) - - if ml_m is not None: - _check_learner(ml_m, "ml_m", regressor=True, classifier=True) - self._learners["ml_m"] = clone(ml_m) - - if ml_g is not None: - if self.score == "IV-type": - _check_learner(ml_g, "ml_g", regressor=True, classifier=False) - self._learners["ml_g"] = clone(ml_g) - else: - warnings.warn( - "A learner ml_g has been provided for score = 'partialling out' but will be ignored. " - "A learner ml_g is not required for estimation." 
- ) + for name, learner in [("ml_l", ml_l), ("ml_m", ml_m), ("ml_g", ml_g)]: + if learner is None: + continue + if name not in self.required_learners: + warnings.warn(f"Learner '{name}' not required for score='{self.score}', ignored.") + continue + self._register_learner(name, learner) + # IV-type: clone ml_l to ml_g if only one provided + self._handle_iv_cloning() return self + def _handle_iv_cloning(self) -> None: + """For IV-type score: clone ml_l to ml_g or vice versa if one is missing.""" + if self.score != "IV-type": + return + if "ml_g" not in self.required_learners: + return + + has_l = "ml_l" in self._learners + has_g = "ml_g" in self._learners + + if has_l and not has_g: + warnings.warn("For score='IV-type', ml_g not set. Cloning ml_l to ml_g.") + # Clone the learner and register with same info + from ..utils._learner import LearnerInfo + + ml_l_info = self._learners["ml_l"] + self._learners["ml_g"] = LearnerInfo( + learner=clone(ml_l_info.learner), + is_classifier=ml_l_info.is_classifier, + ) + elif has_g and not has_l: + warnings.warn("For score='IV-type', ml_l not set. Cloning ml_g to ml_l.") + from ..utils._learner import LearnerInfo + + ml_g_info = self._learners["ml_g"] + self._learners["ml_l"] = LearnerInfo( + learner=clone(ml_g_info.learner), + is_classifier=ml_g_info.is_classifier, + ) + @staticmethod def _check_data(obj_dml_data): if not isinstance(obj_dml_data, DoubleMLData): @@ -103,7 +172,14 @@ def _check_data(obj_dml_data): "To fit a partially linear IV regression model use DoubleMLPLIV instead of DoubleMLPLR." 
) - def _nuisance_est(self, train_idx, test_idx, i_rep, i_fold, external_predictions=None): + def _nuisance_est( + self, + train_idx: np.ndarray, + test_idx: np.ndarray, + i_rep: int, + i_fold: int, + external_predictions: Optional[Dict[str, np.ndarray]] = None, + ) -> None: x = self._dml_data.x y = self._dml_data.y d = self._dml_data.d @@ -119,23 +195,31 @@ def _nuisance_est(self, train_idx, test_idx, i_rep, i_fold, external_predictions # Fit and predict ml_l: E[Y|X] if not l_external: - ml_l = clone(self._learners["ml_l"]) + ml_l_info = self._learners["ml_l"] + ml_l = clone(ml_l_info.learner) ml_l.fit(x_train, y_train) - self._predictions["ml_l"][test_idx, i_rep] = ml_l.predict(x_test) + self._predictions["ml_l"][test_idx, i_rep] = predict_nuisance(ml_l, x_test, ml_l_info.is_classifier) # Fit and predict ml_m: E[D|X] if not m_external: - ml_m = clone(self._learners["ml_m"]) + ml_m_info = self._learners["ml_m"] + ml_m = clone(ml_m_info.learner) ml_m.fit(x_train, d_train) - self._predictions["ml_m"][test_idx, i_rep] = ml_m.predict(x_test) + self._predictions["ml_m"][test_idx, i_rep] = predict_nuisance(ml_m, x_test, ml_m_info.is_classifier) # For IV-type: fit ml_g after last fold when all ml_l/ml_m predictions are available is_last_fold = i_fold == self.n_folds - 1 if is_last_fold and self.score == "IV-type" and not g_external: - # If ml_g not explicitly set, default to clone of ml_l + # If ml_g not explicitly set, clone ml_l (already handled in _handle_iv_cloning) if "ml_g" not in self._learners: warnings.warn("For score = 'IV-type', learners ml_l and ml_g should be specified. 
Set ml_g = clone(ml_l).") - self._learners["ml_g"] = clone(self._learners["ml_l"]) + from ..utils._learner import LearnerInfo + + ml_l_info = self._learners["ml_l"] + self._learners["ml_g"] = LearnerInfo( + learner=clone(ml_l_info.learner), + is_classifier=ml_l_info.is_classifier, + ) # Compute initial theta from full cross-fitted predictions l_hat = self._predictions["ml_l"][:, i_rep] @@ -145,13 +229,14 @@ def _nuisance_est(self, train_idx, test_idx, i_rep, i_fold, external_predictions theta_initial = -np.nanmean(psi_b) / np.nanmean(psi_a) # Second pass: fit ml_g with cross-fitting across all folds + ml_g_info = self._learners["ml_g"] for j_fold in range(self.n_folds): train_j, test_j = self._smpls[i_rep][j_fold] - ml_g = clone(self._learners["ml_g"]) + ml_g = clone(ml_g_info.learner) ml_g.fit(x[train_j], y[train_j] - theta_initial * d[train_j]) - self._predictions["ml_g"][test_j, i_rep] = ml_g.predict(x[test_j]) + self._predictions["ml_g"][test_j, i_rep] = predict_nuisance(ml_g, x[test_j], ml_g_info.is_classifier) - def _get_score_elements(self): + def _get_score_elements(self) -> Dict[str, np.ndarray]: y = self._dml_data.y d = self._dml_data.d diff --git a/doubleml/plm/tests/test_plr_scalar_exceptions.py b/doubleml/plm/tests/test_plr_scalar_exceptions.py index 7cc74aac..fb1ba7a9 100644 --- a/doubleml/plm/tests/test_plr_scalar_exceptions.py +++ b/doubleml/plm/tests/test_plr_scalar_exceptions.py @@ -81,7 +81,7 @@ def test_plr_scalar_exception_estimate_causal_without_predictions(): @pytest.mark.ci def test_plr_scalar_warning_ml_g_partialling_out(): dml_obj = PLR(obj_dml_data, score="partialling out") - with pytest.warns(UserWarning, match="will be ignored"): + with pytest.warns(UserWarning, match="not required for score.*ignored"): dml_obj.set_learners(ml_l=ml_l, ml_m=ml_m, ml_g=ml_g) diff --git a/doubleml/plm/tests/test_plr_scalar_return_types.py b/doubleml/plm/tests/test_plr_scalar_return_types.py index 63e06cdd..09832931 100644 --- 
a/doubleml/plm/tests/test_plr_scalar_return_types.py +++ b/doubleml/plm/tests/test_plr_scalar_return_types.py @@ -109,8 +109,8 @@ def test_n_properties(fitted_dml_obj): @pytest.mark.ci -def test_learner_names(fitted_dml_obj): - assert fitted_dml_obj.learner_names == ["ml_l", "ml_m"] +def test_required_learners(fitted_dml_obj): + assert fitted_dml_obj.required_learners == ["ml_l", "ml_m"] assert "ml_l" in fitted_dml_obj.learners assert "ml_m" in fitted_dml_obj.learners @@ -121,6 +121,31 @@ def test_str_repr(fitted_dml_obj): assert isinstance(repr(fitted_dml_obj), str) +@pytest.mark.ci +def test_get_params(fitted_dml_obj): + params = fitted_dml_obj.get_params("ml_l") + assert isinstance(params, dict) + # LinearRegression has 'fit_intercept' param + assert "fit_intercept" in params + + +@pytest.mark.ci +def test_set_params(fitted_dml_obj): + # Note: This modifies the fitted object, but we're just testing the method works + result = fitted_dml_obj.set_params("ml_l", fit_intercept=False) + assert result is fitted_dml_obj # Returns self + params = fitted_dml_obj.get_params("ml_l") + assert params["fit_intercept"] is False + # Reset for other tests + fitted_dml_obj.set_params("ml_l", fit_intercept=True) + + +@pytest.mark.ci +def test_get_params_invalid_learner(fitted_dml_obj): + with pytest.raises(ValueError, match="not registered"): + fitted_dml_obj.get_params("ml_invalid") + + @pytest.mark.ci def test_before_fit_raises(): np.random.seed(3141) diff --git a/doubleml/utils/_learner.py b/doubleml/utils/_learner.py new file mode 100644 index 00000000..04659c98 --- /dev/null +++ b/doubleml/utils/_learner.py @@ -0,0 +1,201 @@ +""" +Learner specification and validation utilities for DoubleML. 
+""" + +from __future__ import annotations + +import warnings +from dataclasses import dataclass +from typing import Any, Literal, Optional + +import numpy as np +from sklearn.base import clone, is_classifier, is_regressor + + +@dataclass(frozen=True) +class LearnerSpec: + """ + Immutable specification for a learner requirement. + + Parameters + ---------- + name : str + Name of the learner (e.g., "ml_l", "ml_m"). + allow_regressor : bool + Whether regressors are allowed. Default is ``True``. + allow_classifier : bool + Whether classifiers are allowed. Default is ``True``. + binary_data_check : {"outcome", "treatment"} or None + If specified, warns when using regressor with binary data. + "outcome" checks binary_outcome, "treatment" checks binary_treatment. + Default is ``None``. + """ + + name: str + allow_regressor: bool = True + allow_classifier: bool = True + binary_data_check: Optional[Literal["outcome", "treatment"]] = None + + +@dataclass +class LearnerInfo: + """ + Mutable info about a registered learner. + + Parameters + ---------- + learner : object + The learner object (already cloned). + is_classifier : bool + Whether the learner is a classifier. + """ + + learner: Any + is_classifier: bool + + @property + def predict_method(self) -> str: + """Return the appropriate prediction method name.""" + return "predict_proba" if self.is_classifier else "predict" + + +def validate_learner( + learner: Any, + spec: LearnerSpec, + binary_outcome: bool = False, + binary_treatment: bool = False, +) -> LearnerInfo: + """ + Validate learner against specification and data properties. + + Parameters + ---------- + learner : object + The learner to validate. + spec : LearnerSpec + Specification for this learner. + binary_outcome : bool + Whether the outcome variable is binary. + binary_treatment : bool + Whether the treatment variable is binary. + + Returns + ------- + LearnerInfo + Information about the validated learner. 
+ + Raises + ------ + TypeError + If the learner is a class instead of an instance, or lacks + required methods (fit, set_params, get_params, predict/predict_proba). + ValueError + If the learner type is not allowed by the specification. + If a classifier is used with non-binary data when required. + """ + err_msg_prefix = f"Invalid learner provided for {spec.name}: " + warn_msg_prefix = f"Learner provided for {spec.name} is probably invalid: " + + # Check it's an instance, not a class + if isinstance(learner, type): + raise TypeError(err_msg_prefix + "provide an instance of a learner instead of a class.") + + # Check required methods + if not hasattr(learner, "fit"): + raise TypeError(err_msg_prefix + f"{str(learner)} has no method .fit().") + if not hasattr(learner, "set_params"): + raise TypeError(err_msg_prefix + f"{str(learner)} has no method .set_params().") + if not hasattr(learner, "get_params"): + raise TypeError(err_msg_prefix + f"{str(learner)} has no method .get_params().") + + # Determine learner type + learner_is_classifier: bool + if spec.allow_regressor and spec.allow_classifier: + if is_classifier(learner): + learner_is_classifier = True + elif is_regressor(learner): + learner_is_classifier = False + else: + warnings.warn( + warn_msg_prefix + + f"{str(learner)} is (probably) neither a regressor nor a classifier. " + + "Method predict is used for prediction." + ) + learner_is_classifier = False + elif spec.allow_classifier: + if not is_classifier(learner): + warnings.warn(warn_msg_prefix + f"{str(learner)} is (probably) no classifier.") + learner_is_classifier = True + else: + assert spec.allow_regressor # At least one must be True + if not is_regressor(learner): + warnings.warn(warn_msg_prefix + f"{str(learner)} is (probably) no regressor.") + learner_is_classifier = False + + # Check type is allowed + if learner_is_classifier and not spec.allow_classifier: + raise ValueError(f"Classifier not allowed for {spec.name}. 
Use a regressor instead.") + if not learner_is_classifier and not spec.allow_regressor: + raise ValueError(f"Regressor not allowed for {spec.name}. Use a classifier instead.") + + # Check prediction method exists + if learner_is_classifier: + if not hasattr(learner, "predict_proba"): + raise TypeError(err_msg_prefix + f"{str(learner)} has no method .predict_proba().") + else: + if not hasattr(learner, "predict"): + raise TypeError(err_msg_prefix + f"{str(learner)} has no method .predict().") + + # Check binary data compatibility for classifiers + if learner_is_classifier and spec.binary_data_check: + if spec.binary_data_check == "outcome" and not binary_outcome: + raise ValueError( + f"The {spec.name} learner {str(learner)} was identified as classifier " + "but the outcome variable is not binary with values 0 and 1." + ) + if spec.binary_data_check == "treatment" and not binary_treatment: + raise ValueError( + f"The {spec.name} learner {str(learner)} was identified as classifier " + "but the treatment variable is not binary with values 0 and 1." + ) + + # Warn if regressor used with binary data + if not learner_is_classifier and spec.binary_data_check: + if spec.binary_data_check == "outcome" and binary_outcome: + warnings.warn( + f"Binary outcome detected. Consider using a classifier for {spec.name} " + "with predict_proba() to fit an additive probability model." + ) + elif spec.binary_data_check == "treatment" and binary_treatment: + warnings.warn( + f"Binary treatment detected. Consider using a classifier for {spec.name} " + "with predict_proba() to estimate propensity scores." + ) + + return LearnerInfo( + learner=clone(learner), + is_classifier=learner_is_classifier, + ) + + +def predict_nuisance(learner: Any, X: np.ndarray, is_classifier: bool) -> np.ndarray: + """ + Predict using the appropriate method based on learner type. + + Parameters + ---------- + learner : object + Fitted learner with predict() or predict_proba() method. 
+ X : np.ndarray + Features to predict on. + is_classifier : bool + Whether the learner is a classifier. + + Returns + ------- + np.ndarray + Predictions. For classifiers, returns probability of class 1. + """ + if is_classifier: + return learner.predict_proba(X)[:, 1] + return learner.predict(X) From aa2cffadbe8a461c853e6edb10386c1bb6bd6113 Mon Sep 17 00:00:00 2001 From: SvenKlaassen Date: Fri, 6 Feb 2026 18:18:33 +0100 Subject: [PATCH 09/38] Add Interactive Regression Model (IRM) implementation and tests - Implemented the IRM class for double machine learning with interactive regression models in irm_scalar.py. - Added core estimation tests for IRM scalar in test_irm_scalar.py. - Created exception handling tests for IRM scalar in test_irm_scalar_exceptions.py. - Developed tests for handling external predictions in test_irm_scalar_external_predictions.py. - Added return type validation tests for IRM scalar in test_irm_scalar_return_types.py. - Compared the new IRM scalar implementation against the existing DoubleMLIRM in test_irm_scalar_vs_irm.py. 
--- doc/diagrams/architecture.md | 2 +- doc/diagrams/testing_structure.md | 321 +++++++++++++ doubleml/irm/irm_scalar.py | 421 ++++++++++++++++++ doubleml/irm/tests/test_irm_scalar.py | 66 +++ .../irm/tests/test_irm_scalar_exceptions.py | 133 ++++++ .../test_irm_scalar_external_predictions.py | 105 +++++ .../irm/tests/test_irm_scalar_return_types.py | 170 +++++++ doubleml/irm/tests/test_irm_scalar_vs_irm.py | 82 ++++ 8 files changed, 1299 insertions(+), 1 deletion(-) create mode 100644 doc/diagrams/testing_structure.md create mode 100644 doubleml/irm/irm_scalar.py create mode 100644 doubleml/irm/tests/test_irm_scalar.py create mode 100644 doubleml/irm/tests/test_irm_scalar_exceptions.py create mode 100644 doubleml/irm/tests/test_irm_scalar_external_predictions.py create mode 100644 doubleml/irm/tests/test_irm_scalar_return_types.py create mode 100644 doubleml/irm/tests/test_irm_scalar_vs_irm.py diff --git a/doc/diagrams/architecture.md b/doc/diagrams/architecture.md index 4e531e9d..5081f641 100644 --- a/doc/diagrams/architecture.md +++ b/doc/diagrams/architecture.md @@ -17,8 +17,8 @@ DoubleMLBase (ABC) │ │ θ̂ = -E[ψ_b] / E[ψ_a] │ │ │ ├── PLR (partialling out, IV-type) + │ ├── IRM (ATE, ATTE) │ ├── PLIV (planned) - │ ├── IRM (planned) │ └── DID (planned) │ └── NonLinearScoreMixin (planned) diff --git a/doc/diagrams/testing_structure.md b/doc/diagrams/testing_structure.md new file mode 100644 index 00000000..e6383c6d --- /dev/null +++ b/doc/diagrams/testing_structure.md @@ -0,0 +1,321 @@ +# Testing Structure for DoubleML Scalar Models + +This document defines the testing standard for all new models built on the `DoubleMLScalar` hierarchy. Each model should have a consistent set of test files covering estimation accuracy, return types, input validation, backward compatibility, and external predictions. 
+ +## Test File Convention + +For a model `<model>` in module `<module>/` (e.g., `plr` in `plm/`, `irm` in `irm/`): + +| File | Purpose | +|------|---------| +| `test_<model>_scalar.py` | Core estimation accuracy | +| `test_<model>_scalar_return_types.py` | Property types and shapes after fitting | +| `test_<model>_scalar_exceptions.py` | Input validation and error handling | +| `test_<model>_scalar_vs_<model>.py` | Comparison with old `DoubleML<MODEL>` implementation | +| `test_<model>_scalar_external_predictions.py` | External predictions workflow | + +All test files live in `doubleml/<module>/tests/`. + +All test functions should be marked with `@pytest.mark.ci`. + +--- + +## 1. Core Estimation Tests (`test_<model>_scalar.py`) + +Verify that the model produces statistically reasonable estimates. + +### Fixture Pattern + +```python +@pytest.fixture(scope="module", params=[...]) # score variants +def score(request): + return request.param + +@pytest.fixture(scope="module", params=[True, False]) # model-specific options +def option(request): + return request.param + +@pytest.fixture(scope="module") +def fitted_fixture(score, option): + np.random.seed(3141) + data = make_<model>_data(theta=true_theta, n_obs=500, ...) + dml_obj = <Model>(data, score=score, option=option) + dml_obj.set_learners(...) + dml_obj.draw_sample_splitting(n_folds=5, n_rep=1) + dml_obj.fit() + return {"coef": dml_obj.coef[0], "se": dml_obj.se[0], "true_theta": true_theta, "score": score} +``` + +### Required Tests + +- **`test_coef`**: For scores where the DGP theta equals the target parameter, check the 3-sigma rule: `abs(coef - true_theta) <= 3.0 * se`. For scores where the true parameter differs from the DGP theta (e.g., ATTE), check `np.isfinite(coef)` and `abs(coef) < 10.0`. +- **`test_se`**: `se > 0` + +### Assertion Pattern + +```python +# When true parameter matches DGP theta +assert abs(coef - true_theta) <= 3.0 * se + +# When true parameter is unknown (e.g., ATTE with heterogeneous effects) +assert np.isfinite(coef) +assert abs(coef) < 10.0 +``` + +--- + +## 2.
Return Types Tests (`test_<model>_scalar_return_types.py`) + +Verify that all properties have the correct types and shapes after fitting. + +### Constants + +```python +N_OBS = 200 # small for speed +N_FOLDS = 3 +N_REP = 2 +``` + +### Fixture Pattern + +```python +@pytest.fixture(scope="module") +def fitted_model(): + np.random.seed(42) + data = make_<model>_data(n_obs=N_OBS, ...) + dml_obj = <Model>(data, score=<score>) + dml_obj.set_learners(...) + dml_obj.draw_sample_splitting(n_folds=N_FOLDS, n_rep=N_REP) + dml_obj.fit() + return dml_obj +``` + +### Required Tests + +| Test | Assertion | +|------|-----------| +| `test_coef_type_and_shape` | `isinstance(coef, np.ndarray)`, `shape == (1,)` | +| `test_se_type_and_shape` | `isinstance(se, np.ndarray)`, `shape == (1,)` | +| `test_all_thetas_shape` | `shape == (1, N_REP)` | +| `test_all_ses_shape` | `shape == (1, N_REP)` | +| `test_summary_type` | `isinstance(summary, pd.DataFrame)`, `len == 1` | +| `test_confint_type_and_shape` | `isinstance(ci, pd.DataFrame)`, `shape == (1, 2)` | +| `test_psi_shape` | `shape == (N_OBS, 1, N_REP)` | +| `test_predictions_type` | `isinstance(predictions, dict)`, correct keys, each `shape == (N_OBS, N_REP)` | +| `test_smpls_type` | `len(smpls) == N_REP`, each has `N_FOLDS` tuples of `(train, test)` arrays | +| `test_n_properties` | `n_obs == N_OBS`, `n_folds == N_FOLDS`, `n_rep == N_REP`, `score == <score>` | +| `test_required_learners` | Returns expected list of learner names | +| `test_str_repr` | `str(model)` and `repr(model)` return `str` | +| `test_get_params` | Returns dict with expected learner keys | +| `test_set_params` | Modifies and confirms learner parameter change | +| `test_before_fit_raises` | Accessing `coef` / `se` before `fit()` raises appropriate error | + +--- + +## 3. Exception Tests (`test_<model>_scalar_exceptions.py`) + +Verify that invalid inputs produce clear error messages.
+ +### Required Tests (Common to All Models) + +| Test | Input | Expected | +|------|-------|----------| +| `test_exception_data` | Non-DoubleMLData | `TypeError` | +| `test_exception_score` | Invalid score string | `ValueError` | +| `test_exception_n_folds` | `n_folds < 2` | `ValueError` | +| `test_exception_n_rep` | `n_rep < 1` | `ValueError` | +| `test_exception_fit_nuisance_without_smpls` | Call `fit_nuisance_models()` before `draw_sample_splitting()` | `ValueError` | +| `test_exception_estimate_causal_without_predictions` | Call `estimate_causal_parameters()` before `fit_nuisance_models()` | `ValueError` | +| `test_exception_missing_learner` | Call `fit()` without setting required learners | `ValueError` | +| `test_exception_invalid_learner` | Pass a class instead of an instance | `TypeError` | + +### Model-Specific Exception Tests + +Add tests for model-specific constraints: +- **PLR**: Instrumental variables check (`z_cols`), `ml_g` warning for partialling out +- **IRM**: Binary treatment check, instruments check, `normalize_ipw` type check, `ml_m` must be classifier + +### Assertion Pattern + +```python +@pytest.mark.ci +def test_exception_data(): + msg = r"The data must be of DoubleMLData type\." + with pytest.raises(TypeError, match=msg): + <Model>(pd.DataFrame()) +``` + +Always use `match=` with regex patterns to verify error messages. + +--- + +## 4. Comparison Tests (`test_<model>_scalar_vs_<model>.py`) + +Verify exact numerical equivalence with the old `DoubleML<MODEL>` implementation. + +### Fixture Pattern + +```python +@pytest.fixture(scope="module", params=[...]) +def score(request): + return request.param + +@pytest.fixture(scope="module", params=[1, 3]) +def n_rep(request): + return request.param + +@pytest.fixture(scope="module") +def comparison_fixture(score, n_rep): + n_folds = 5 + seed = 3141 + + np.random.seed(42) + data = make_<model>_data(...)
+ + # Old model + np.random.seed(seed) + dml_old = dml.DoubleML<MODEL>(data, learner1, learner2, n_folds=n_folds, n_rep=n_rep, score=score) + dml_old.fit() + + # New model — share sample splits from old model + dml_new = <Model>(data, score=score) + dml_new.set_learners(...) + dml_new._n_folds = n_folds + dml_new._n_rep = n_rep + dml_new._smpls = dml_old.smpls + dml_new.fit() + + return {"old": dml_old, "new": dml_new} +``` + +**Key**: Share sample splits from the old model directly (`dml_new._smpls = dml_old.smpls`) because the old and new implementations consume random state differently during `__init__`. + +### Required Tests + +```python +def test_coef_equal(comparison_fixture): + np.testing.assert_allclose(new.coef, old.coef, rtol=1e-9) + +def test_se_equal(comparison_fixture): + np.testing.assert_allclose(new.se, old.se, rtol=1e-9) + +def test_all_coef_equal(comparison_fixture): + np.testing.assert_allclose(new.all_thetas, old.all_coef, rtol=1e-9) + +def test_all_se_equal(comparison_fixture): + np.testing.assert_allclose(new.all_ses, old.all_se, rtol=1e-9) +``` + +Note the property name differences: new uses `all_thetas`/`all_ses`, old uses `all_coef`/`all_se`. + +--- + +## 5. External Predictions Tests (`test_<model>_scalar_external_predictions.py`) + +Verify that providing pre-computed predictions produces equivalent results. + +### Fixture Pattern + +```python +@pytest.fixture(scope="module", params=[...]) +def score(request): + return request.param + +@pytest.fixture(scope="module", params=[1, 3]) +def n_rep(request): + return request.param + +@pytest.fixture(scope="module", params=[True, False]) +def set_ml_x_ext(request): # one fixture per learner + return request.param + +@pytest.fixture(scope="module") +def ext_pred_fixture(score, n_rep, set_ml_x_ext, ...): + # 1. Fit reference model + dml_ref = <Model>(data, score=score) + dml_ref.set_learners(...) + dml_ref.draw_sample_splitting(n_folds=n_folds, n_rep=n_rep) + dml_ref.fit() + + # 2.
Build external_predictions dict from reference model + external_predictions = {} + if set_ml_x_ext: + external_predictions["ml_x"] = dml_ref.predictions["ml_x"] + + # 3. Fit new model with shared splits and external predictions + dml_ext = <Model>(data, score=score) + dml_ext.set_learners(...) # set non-external learners + dml_ext._n_folds = n_folds + dml_ext._n_rep = n_rep + dml_ext._smpls = dml_ref.smpls + dml_ext.fit(external_predictions=external_predictions) + + return {"ref": dml_ref, "ext": dml_ext} +``` + +### Required Tests + +```python +import math + +def test_coef(ext_pred_fixture): + assert math.isclose(ref.coef[0], ext.coef[0], rel_tol=1e-9, abs_tol=1e-4) + +def test_se(ext_pred_fixture): + assert math.isclose(ref.se[0], ext.se[0], rel_tol=1e-9, abs_tol=1e-4) +``` + +Use `math.isclose` with `abs_tol=1e-4` instead of `np.testing.assert_allclose` because small numerical differences can accumulate when mixing external and fitted predictions. + +--- + +## Assertion Patterns Summary + +| Context | Assertion | Tolerance | +|---------|-----------|-----------| +| Comparison with old model | `np.testing.assert_allclose(new, old, rtol=1e-9)` | Exact match | +| External predictions | `math.isclose(a, b, rel_tol=1e-9, abs_tol=1e-4)` | Small tolerance | +| Statistical accuracy | `abs(coef - true) <= 3.0 * se` | 3-sigma rule | +| Exception handling | `pytest.raises(Error, match=r"regex pattern")` | Exact message match | + +--- + +## Fixture Scope Guidelines + +| Scope | Use Case | +|-------|----------| +| `module` | Parametrized fixtures that fit models (expensive). Each parameter combination creates one instance shared across tests in the module. | +| `session` | Data generation that should be shared across all test modules (not typically needed for scalar model tests). | +| `function` | Only when test modifies state (rare for read-only assertion tests).
| + +--- + +## Checklist for New Scalar Models + +When adding a new scalar model `<model>` to the `DoubleMLScalar` hierarchy: + +- [ ] **Implementation**: `doubleml/<module>/<model>_scalar.py` + - [ ] Class inherits from `LinearScoreMixin` (or `NonLinearScoreMixin`) + - [ ] `_LEARNER_SPECS` class variable defined + - [ ] `required_learners` property returns score-dependent list + - [ ] `set_learners()` with model-specific kwargs + - [ ] `_check_data()` static method + - [ ] `draw_sample_splitting()` (override if stratification needed) + - [ ] `_nuisance_est()` per-fold estimation + - [ ] `_get_score_elements()` returns `{psi_a, psi_b}` + +- [ ] **Tests**: `doubleml/<module>/tests/` + - [ ] `test_<model>_scalar.py` — core estimation + - [ ] `test_<model>_scalar_return_types.py` — property shapes/types + - [ ] `test_<model>_scalar_exceptions.py` — input validation + - [ ] `test_<model>_scalar_vs_<model>.py` — comparison with old implementation + - [ ] `test_<model>_scalar_external_predictions.py` — external predictions + +- [ ] **Verification** + - [ ] All new tests pass: `pytest doubleml/<module>/tests/test_<model>_scalar*.py -v -m ci` + - [ ] Lint: `ruff check doubleml/<module>/<model>_scalar.py` + - [ ] Format: `black doubleml/<module>/<model>_scalar.py` + - [ ] Type check: `mypy doubleml/<module>/<model>_scalar.py` + - [ ] Old tests still pass: `pytest doubleml/<module>/tests/ -v` + +- [ ] **Documentation**: Update `doc/diagrams/architecture.md` class hierarchy diff --git a/doubleml/irm/irm_scalar.py b/doubleml/irm/irm_scalar.py new file mode 100644 index 00000000..f8c0ee53 --- /dev/null +++ b/doubleml/irm/irm_scalar.py @@ -0,0 +1,421 @@ +""" +Interactive Regression Model (IRM) based on the new DoubleMLScalar hierarchy.
+""" + +from __future__ import annotations + +from typing import ClassVar, Dict, List, Optional, Self, Union + +import numpy as np +from sklearn.base import clone +from sklearn.utils.multiclass import type_of_target + +from ..data.base_data import DoubleMLData +from ..double_ml_linear_score import LinearScoreMixin +from ..utils._checks import _check_score, _check_weights +from ..utils._learner import LearnerSpec, predict_nuisance +from ..utils._propensity_score import _propensity_score_adjustment +from ..utils.propensity_score_processing import PSProcessor, PSProcessorConfig +from ..utils.resampling import DoubleMLResampling + + +class IRM(LinearScoreMixin): + """Double machine learning for interactive regression models. + + Based on the DoubleMLScalar + LinearScoreMixin hierarchy. + + Parameters + ---------- + obj_dml_data : DoubleMLData + The data object providing the data and specifying the variables for the causal model. + Must contain exactly one binary treatment variable with values 0 and 1. + score : str + The score function (``'ATE'`` or ``'ATTE'``). + Default is ``'ATE'``. + ml_g : estimator, optional + A machine learner implementing ``fit()`` and ``predict()`` for the nuisance + function :math:`g_0(D, X) = E[Y|X, D]`. Cloned to ``ml_g0`` and ``ml_g1`` + internally. For a binary outcome, a classifier implementing ``fit()`` and + ``predict_proba()`` can also be specified. + ml_m : classifier, optional + A machine learner implementing ``fit()`` and ``predict_proba()`` for the + nuisance function :math:`m_0(X) = E[D|X]`. Must be a classifier. + normalize_ipw : bool + Indicates whether the inverse probability weights are normalized. + Default is ``False``. + weights : array, dict or None + Weights for each individual observation. If ``None``, uniform weights are used + (corresponds to standard ATE). Can only be used with ``score='ATE'``. + An array must have shape ``(n,)``. A dictionary must contain keys ``'weights'`` + and ``'weights_bar'``. 
+ Default is ``None``. + ps_processor_config : PSProcessorConfig, optional + Configuration for propensity score processing (clipping, calibration, etc.). + Default is ``None`` (uses default clipping threshold of 0.01). + + Notes + ----- + **Interactive regression (IRM)** models take the form + + .. math:: + + Y = g_0(D, X) + U, & &\\mathbb{E}(U | X, D) = 0, + + D = m_0(X) + V, & &\\mathbb{E}(V | X) = 0, + + where the treatment variable is binary, :math:`D \\in \\lbrace 0,1 \\rbrace`. + Target parameters of interest are the average treatment effect (ATE), + + .. math:: + + \\theta_0 = \\mathbb{E}[g_0(1, X) - g_0(0, X)] + + and the average treatment effect of the treated (ATTE), + + .. math:: + + \\theta_0 = \\mathbb{E}[g_0(1, X) - g_0(0, X) | D=1]. + """ + + # Define learner specifications for IRM + _LEARNER_SPECS: ClassVar[Dict[str, LearnerSpec]] = { + "ml_g0": LearnerSpec("ml_g0", allow_regressor=True, allow_classifier=True, binary_data_check="outcome"), + "ml_g1": LearnerSpec("ml_g1", allow_regressor=True, allow_classifier=True, binary_data_check="outcome"), + "ml_m": LearnerSpec("ml_m", allow_regressor=False, allow_classifier=True), + } + + def __init__( + self, + obj_dml_data: DoubleMLData, + score: str = "ATE", + ml_g: Optional[object] = None, + ml_m: Optional[object] = None, + normalize_ipw: bool = False, + weights: Optional[Union[np.ndarray, Dict]] = None, + ps_processor_config: Optional[PSProcessorConfig] = None, + ): + """ + Initialize IRM model. + + Parameters + ---------- + obj_dml_data : DoubleMLData + The data object. Must have exactly one binary treatment variable. + score : str + Score function (``'ATE'`` or ``'ATTE'``). + ml_g : estimator, optional + Learner for E[Y|X, D]. Cloned to ml_g0 and ml_g1. + ml_m : classifier, optional + Learner for E[D|X]. Must be a classifier. + normalize_ipw : bool + Whether to normalize inverse probability weights. + weights : array, dict or None, optional + Weights for weighted ATE. 
+ ps_processor_config : PSProcessorConfig, optional + Configuration for propensity score processing. + """ + # Validate data + self._check_data(obj_dml_data) + + # Validate score + valid_scores = ["ATE", "ATTE"] + _check_score(score, valid_scores, allow_callable=False) + + super().__init__( + obj_dml_data=obj_dml_data, + score=score, + ) + + # Normalize IPW + if not isinstance(normalize_ipw, bool): + raise TypeError("Normalization indicator has to be boolean. " f"Object of type {str(type(normalize_ipw))} passed.") + self._normalize_ipw = normalize_ipw + + # Propensity score processing + if ps_processor_config is not None: + self._ps_processor_config = ps_processor_config + self._ps_processor = PSProcessor.from_config(ps_processor_config) + else: + self._ps_processor_config = PSProcessorConfig() + self._ps_processor = PSProcessor.from_config(self._ps_processor_config) + + # Weights + _check_weights(weights, score, obj_dml_data.n_obs, n_rep=1) + self._initialize_weights(weights) + + # Set learners if provided + if any(learner is not None for learner in [ml_g, ml_m]): + self.set_learners(ml_g=ml_g, ml_m=ml_m) + + # ==================== Properties ==================== + + @property + def normalize_ipw(self) -> bool: + """Indicates whether the inverse probability weights are normalized.""" + return self._normalize_ipw + + @property + def ps_processor_config(self) -> PSProcessorConfig: + """Configuration for propensity score processing.""" + return self._ps_processor_config + + @property + def ps_processor(self) -> PSProcessor: + """Propensity score processor.""" + return self._ps_processor + + @property + def weights(self) -> Dict: + """Weights for weighted ATE/ATTE.""" + return self._weights + + @property + def required_learners(self) -> List[str]: + """Required learners for IRM: ml_g0, ml_g1, and ml_m.""" + return ["ml_g0", "ml_g1", "ml_m"] + + # ==================== Learner Management ==================== + + def set_learners( + self, + ml_g: Optional[object] = None, 
+ ml_g0: Optional[object] = None, + ml_g1: Optional[object] = None, + ml_m: Optional[object] = None, + ) -> Self: + """ + Set the learners for nuisance estimation. + + Parameters + ---------- + ml_g : estimator or None, optional + A machine learner for the outcome regression :math:`g_0(D, X) = E[Y|X, D]`. + Cloned to ``ml_g0`` and ``ml_g1`` if they are not explicitly set. + ml_g0 : estimator or None, optional + A machine learner for :math:`E[Y|X, D=0]`. Takes precedence over ``ml_g``. + ml_g1 : estimator or None, optional + A machine learner for :math:`E[Y|X, D=1]`. Takes precedence over ``ml_g``. + ml_m : classifier or None, optional + A machine learner for the propensity score :math:`m_0(X) = E[D|X]`. + Must be a classifier with ``predict_proba()`` method. + + Returns + ------- + self : IRM + The estimator with learners set. + """ + # ml_g convenience: clone to ml_g0/ml_g1 if not explicitly set + if ml_g is not None: + # Validate ml_g is an instance (not a class) before cloning + if isinstance(ml_g, type): + raise TypeError("Invalid learner provided for ml_g: provide an instance of a learner instead of a class.") + if ml_g0 is None: + ml_g0 = clone(ml_g) + if ml_g1 is None: + ml_g1 = clone(ml_g) + + # Register each learner + for name, learner in [("ml_g0", ml_g0), ("ml_g1", ml_g1), ("ml_m", ml_m)]: + if learner is not None: + self._register_learner(name, learner) + + return self + + # ==================== Sample Splitting ==================== + + def draw_sample_splitting(self, n_folds: int = 5, n_rep: int = 1) -> Self: + """ + Draw stratified sample splitting for cross-fitting. + + Uses stratified K-fold splitting to ensure each fold contains both + treatment groups (D=0 and D=1). + + Parameters + ---------- + n_folds : int, optional + Number of folds for cross-fitting. Default is 5. + n_rep : int, optional + Number of repetitions for sample splitting. Default is 1. + + Returns + ------- + self : IRM + The estimator with initialized sample splits. 
+ """ + if not isinstance(n_folds, int) or n_folds < 2: + raise ValueError(f"n_folds must be an integer >= 2. Got {n_folds}.") + if not isinstance(n_rep, int) or n_rep < 1: + raise ValueError(f"n_rep must be an integer >= 1. Got {n_rep}.") + + self._n_folds = n_folds + self._n_rep = n_rep + + # Create stratified resampler + resampler = DoubleMLResampling( + n_folds=n_folds, + n_rep=n_rep, + n_obs=self._n_obs, + stratify=self._dml_data.d, + ) + + self._smpls = resampler.split_samples() + return self + + # ==================== Nuisance Estimation ==================== + + def _nuisance_est( + self, + train_idx: np.ndarray, + test_idx: np.ndarray, + i_rep: int, + i_fold: int, + external_predictions: Optional[Dict[str, np.ndarray]] = None, + ) -> None: + x = self._dml_data.x + y = self._dml_data.y + d = self._dml_data.d + + x_train, x_test = x[train_idx], x[test_idx] + d_train = d[train_idx] + + # Check which learners have external predictions + g0_external = external_predictions is not None and "ml_g0" in external_predictions + g1_external = external_predictions is not None and "ml_g1" in external_predictions + m_external = external_predictions is not None and "ml_m" in external_predictions + + # ml_g0: fit on d==0 subset of training data, predict on ALL test observations + if not g0_external: + train_d0 = train_idx[d[train_idx] == 0] + ml_g0_info = self._learners["ml_g0"] + ml_g0 = clone(ml_g0_info.learner) + ml_g0.fit(x[train_d0], y[train_d0]) + self._predictions["ml_g0"][test_idx, i_rep] = predict_nuisance(ml_g0, x_test, ml_g0_info.is_classifier) + + # ml_g1: fit on d==1 subset of training data, predict on ALL test observations + if not g1_external: + train_d1 = train_idx[d[train_idx] == 1] + ml_g1_info = self._learners["ml_g1"] + ml_g1 = clone(ml_g1_info.learner) + ml_g1.fit(x[train_d1], y[train_d1]) + self._predictions["ml_g1"][test_idx, i_rep] = predict_nuisance(ml_g1, x_test, ml_g1_info.is_classifier) + + # ml_m: fit on ALL training data, predict on test + if 
not m_external: + ml_m_info = self._learners["ml_m"] + ml_m = clone(ml_m_info.learner) + ml_m.fit(x_train, d_train) + self._predictions["ml_m"][test_idx, i_rep] = predict_nuisance(ml_m, x_test, ml_m_info.is_classifier) + + # ==================== Score Elements ==================== + + def _get_score_elements(self) -> Dict[str, np.ndarray]: + y = self._dml_data.y + d = self._dml_data.d + + g_hat0 = self._predictions["ml_g0"] # (n_obs, n_rep) + g_hat1 = self._predictions["ml_g1"] # (n_obs, n_rep) + m_hat_raw = self._predictions["ml_m"] # (n_obs, n_rep) + + # Apply PS processing per repetition + m_hat = np.zeros_like(m_hat_raw) + for i_rep in range(self.n_rep): + m_hat[:, i_rep] = self._ps_processor.adjust_ps(m_hat_raw[:, i_rep], d, cv=self._smpls[i_rep], learner_name="ml_m") + + # Apply IPW normalization per repetition + m_hat_adj = np.zeros_like(m_hat) + for i_rep in range(self.n_rep): + m_hat_adj[:, i_rep] = _propensity_score_adjustment( + propensity_score=m_hat[:, i_rep], + treatment_indicator=d, + normalize_ipw=self.normalize_ipw, + ) + + # Residuals: (n_obs, n_rep) + u_hat0 = y[:, np.newaxis] - g_hat0 + u_hat1 = y[:, np.newaxis] - g_hat1 + + d_col = d[:, np.newaxis] # (n_obs, 1) for broadcasting + + if self.score == "ATE" or self.score == "ATTE": + weights, weights_bar = self._get_weights(m_hat_adj) + + psi_b = weights * (g_hat1 - g_hat0) + weights_bar * ( + np.divide(d_col * u_hat1, m_hat_adj) - np.divide((1.0 - d_col) * u_hat0, 1.0 - m_hat_adj) + ) + psi_a = -1.0 * np.divide(weights, np.mean(weights, axis=0, keepdims=True)) + + return {"psi_a": psi_a, "psi_b": psi_b} + + # ==================== Private Helpers ==================== + + @staticmethod + def _check_data(obj_dml_data: object) -> None: + """Validate that the data is compatible with IRM.""" + if not isinstance(obj_dml_data, DoubleMLData): + raise TypeError( + f"The data must be of DoubleMLData type. " f"{str(obj_dml_data)} of type {str(type(obj_dml_data))} was passed." 
+ ) + if obj_dml_data.z_cols is not None: + raise ValueError( + "Incompatible data. " + " and ".join(obj_dml_data.z_cols) + " have been set as instrumental variable(s). " + "To fit an interactive IV regression model use DoubleMLIIVM instead of IRM." + ) + one_treat = obj_dml_data.n_treat == 1 + binary_treat = type_of_target(obj_dml_data.d) == "binary" + zero_one_treat = np.all((np.power(obj_dml_data.d, 2) - obj_dml_data.d) == 0) + if not (one_treat & binary_treat & zero_one_treat): + raise ValueError( + "Incompatible data. " + "To fit an IRM model with DML " + "exactly one binary variable with values 0 and 1 " + "needs to be specified as treatment variable." + ) + + def _initialize_weights(self, weights: Optional[Union[np.ndarray, Dict]]) -> None: + """Initialize weights storage.""" + if weights is None: + weights = np.ones(self._dml_data.n_obs) + if isinstance(weights, np.ndarray): + self._weights = {"weights": weights} + else: + assert isinstance(weights, dict) + self._weights = weights + + def _get_weights(self, m_hat: np.ndarray) -> tuple: + """ + Compute weights and weights_bar for score computation. + + Parameters + ---------- + m_hat : np.ndarray + Adjusted propensity scores, shape (n_obs, n_rep). + + Returns + ------- + weights : np.ndarray + Shape (n_obs, n_rep) or broadcastable. + weights_bar : np.ndarray + Shape (n_obs, n_rep) or broadcastable. 
+ """ + d = self._dml_data.d + + if self.score == "ATE": + w = self._weights["weights"] + weights = w[:, np.newaxis] * np.ones((1, self.n_rep)) # (n_obs, n_rep) + if "weights_bar" in self._weights: + # weights_bar has shape (n_obs, n_rep) already + weights_bar = self._weights["weights_bar"] + else: + weights_bar = weights.copy() + else: + # ATTE + assert self.score == "ATTE" + w = self._weights["weights"] + subgroup = w * d + subgroup_probability = np.mean(subgroup) + weights = np.divide(subgroup, subgroup_probability)[:, np.newaxis] * np.ones((1, self.n_rep)) + + # weights_bar depends on m_hat per repetition + weights_bar = np.divide(m_hat * w[:, np.newaxis], subgroup_probability) + + return weights, weights_bar diff --git a/doubleml/irm/tests/test_irm_scalar.py b/doubleml/irm/tests/test_irm_scalar.py new file mode 100644 index 00000000..91a8e3c9 --- /dev/null +++ b/doubleml/irm/tests/test_irm_scalar.py @@ -0,0 +1,66 @@ +"""Core estimation tests for IRM scalar.""" + +import numpy as np +import pytest +from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor + +from doubleml.irm.datasets import make_irm_data +from doubleml.irm.irm_scalar import IRM + + +@pytest.fixture(scope="module", params=["ATE", "ATTE"]) +def score(request): + return request.param + + +@pytest.fixture(scope="module", params=[True, False]) +def normalize_ipw(request): + return request.param + + +@pytest.fixture(scope="module") +def dml_irm_scalar_fixture(score, normalize_ipw): + n_folds = 5 + true_theta = 0.5 + + np.random.seed(3141) + data = make_irm_data(theta=true_theta, n_obs=500, dim_x=20, return_type="DoubleMLData") + + ml_g = RandomForestRegressor(n_estimators=100, max_features=10, max_depth=5, min_samples_leaf=2, random_state=42) + ml_m = RandomForestClassifier(n_estimators=100, max_features=10, max_depth=5, min_samples_leaf=2, random_state=42) + + np.random.seed(3141) + dml_obj = IRM(data, score=score, normalize_ipw=normalize_ipw) + dml_obj.set_learners(ml_g=ml_g, 
ml_m=ml_m) + dml_obj.draw_sample_splitting(n_folds=n_folds, n_rep=1) + dml_obj.fit() + + return { + "coef": dml_obj.coef[0], + "se": dml_obj.se[0], + "true_theta": true_theta, + "score": score, + } + + +@pytest.mark.ci +def test_dml_irm_scalar_coef(dml_irm_scalar_fixture): + coef = dml_irm_scalar_fixture["coef"] + se = dml_irm_scalar_fixture["se"] + true_theta = dml_irm_scalar_fixture["true_theta"] + score = dml_irm_scalar_fixture["score"] + + # For ATE, the DGP theta is the true ATE parameter + # For ATTE, the true ATTE differs from theta due to heterogeneous effects in the DGP + if score == "ATE": + assert abs(coef - true_theta) <= 3.0 * se + else: + # ATTE: just check estimate is finite and reasonable + assert np.isfinite(coef) + assert abs(coef) < 10.0 + + +@pytest.mark.ci +def test_dml_irm_scalar_se(dml_irm_scalar_fixture): + se = dml_irm_scalar_fixture["se"] + assert se > 0 diff --git a/doubleml/irm/tests/test_irm_scalar_exceptions.py b/doubleml/irm/tests/test_irm_scalar_exceptions.py new file mode 100644 index 00000000..df0aab60 --- /dev/null +++ b/doubleml/irm/tests/test_irm_scalar_exceptions.py @@ -0,0 +1,133 @@ +import numpy as np +import pandas as pd +import pytest +from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor +from sklearn.linear_model import LinearRegression + +from doubleml.irm.datasets import make_irm_data +from doubleml.irm.irm_scalar import IRM +from doubleml.plm.datasets import make_plr_CCDDHNR2018 + +np.random.seed(3141) +obj_dml_data = make_irm_data(theta=0.5, n_obs=100, dim_x=10, return_type="DoubleMLData") + +ml_g = RandomForestRegressor(n_estimators=10, max_depth=3, random_state=42) +ml_m = RandomForestClassifier(n_estimators=10, max_depth=3, random_state=42) + + +@pytest.mark.ci +def test_irm_scalar_exception_data(): + msg = r"The data must be of DoubleMLData type\." 
+ with pytest.raises(TypeError, match=msg): + IRM(pd.DataFrame()) + + +@pytest.mark.ci +def test_irm_scalar_exception_instrument(): + # Create data with instruments + np.random.seed(3141) + plr_data = make_plr_CCDDHNR2018(n_obs=100, dim_x=10, alpha=0.5) + df = plr_data.data.copy() + x_cols = [c for c in df.columns if c.startswith("X")] + + import doubleml as dml + + dml_data_iv = dml.DoubleMLData(df, y_col="y", d_cols="d", x_cols=x_cols[:-1], z_cols=x_cols[-1]) + + msg = r"Incompatible data\. .* have been set as instrumental variable\(s\)\." + with pytest.raises(ValueError, match=msg): + IRM(dml_data_iv) + + +@pytest.mark.ci +def test_irm_scalar_exception_non_binary_treatment(): + # Create data with continuous treatment + np.random.seed(3141) + plr_data = make_plr_CCDDHNR2018(n_obs=100, dim_x=10, alpha=0.5) + msg = r"Incompatible data.*exactly one binary variable" + with pytest.raises(ValueError, match=msg): + IRM(plr_data) + + +@pytest.mark.ci +def test_irm_scalar_exception_score(): + msg = r"Invalid score" + with pytest.raises(ValueError, match=msg): + IRM(obj_dml_data, score="invalid") + + +@pytest.mark.ci +def test_irm_scalar_exception_n_folds(): + dml_obj = IRM(obj_dml_data) + msg = r"n_folds must be an integer >= 2\." + with pytest.raises(ValueError, match=msg): + dml_obj.draw_sample_splitting(n_folds=1) + with pytest.raises(ValueError, match=msg): + dml_obj.draw_sample_splitting(n_folds=0) + + +@pytest.mark.ci +def test_irm_scalar_exception_n_rep(): + dml_obj = IRM(obj_dml_data) + msg = r"n_rep must be an integer >= 1\." + with pytest.raises(ValueError, match=msg): + dml_obj.draw_sample_splitting(n_rep=0) + + +@pytest.mark.ci +def test_irm_scalar_exception_fit_nuisance_without_smpls(): + dml_obj = IRM(obj_dml_data, ml_g=ml_g, ml_m=ml_m) + msg = r"Sample splitting has not been initialized\." 
+ with pytest.raises(ValueError, match=msg): + dml_obj.fit_nuisance_models() + + +@pytest.mark.ci +def test_irm_scalar_exception_estimate_causal_without_predictions(): + dml_obj = IRM(obj_dml_data, ml_g=ml_g, ml_m=ml_m) + dml_obj.draw_sample_splitting() + msg = r"Predictions not available\." + with pytest.raises(ValueError, match=msg): + dml_obj.estimate_causal_parameters() + + +@pytest.mark.ci +def test_irm_scalar_exception_missing_learner(): + dml_obj = IRM(obj_dml_data) + dml_obj.draw_sample_splitting() + msg = r"Learner 'ml_g0' is required but not set" + with pytest.raises(ValueError, match=msg): + dml_obj.fit() + + +@pytest.mark.ci +def test_irm_scalar_exception_missing_learner_partial(): + dml_obj = IRM(obj_dml_data) + dml_obj.set_learners(ml_g=ml_g) + dml_obj.draw_sample_splitting() + msg = r"Learner 'ml_m' is required but not set" + with pytest.raises(ValueError, match=msg): + dml_obj.fit() + + +@pytest.mark.ci +def test_irm_scalar_exception_invalid_learner(): + dml_obj = IRM(obj_dml_data) + msg = r"Invalid learner provided for ml_g: provide an instance" + with pytest.raises(TypeError, match=msg): + dml_obj.set_learners(ml_g=RandomForestRegressor) # class instead of instance + + +@pytest.mark.ci +def test_irm_scalar_exception_ml_m_regressor(): + dml_obj = IRM(obj_dml_data) + # LinearRegression is a regressor, not allowed for ml_m; warns then raises TypeError (no predict_proba) + with pytest.raises(TypeError, match=r"has no method .predict_proba"): + dml_obj.set_learners(ml_m=LinearRegression()) + + +@pytest.mark.ci +def test_irm_scalar_exception_normalize_ipw_type(): + msg = r"Normalization indicator has to be boolean" + with pytest.raises(TypeError, match=msg): + IRM(obj_dml_data, normalize_ipw="True") diff --git a/doubleml/irm/tests/test_irm_scalar_external_predictions.py b/doubleml/irm/tests/test_irm_scalar_external_predictions.py new file mode 100644 index 00000000..a7ea60c5 --- /dev/null +++ b/doubleml/irm/tests/test_irm_scalar_external_predictions.py 
@@ -0,0 +1,105 @@ +import math + +import numpy as np +import pytest +from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor + +from doubleml.irm.datasets import make_irm_data +from doubleml.irm.irm_scalar import IRM + + +@pytest.fixture(scope="module", params=["ATE", "ATTE"]) +def irm_score(request): + return request.param + + +@pytest.fixture(scope="module", params=[1, 3]) +def n_rep(request): + return request.param + + +@pytest.fixture(scope="module", params=[True, False]) +def set_ml_g0_ext(request): + return request.param + + +@pytest.fixture(scope="module", params=[True, False]) +def set_ml_g1_ext(request): + return request.param + + +@pytest.fixture(scope="module", params=[True, False]) +def set_ml_m_ext(request): + return request.param + + +@pytest.fixture(scope="module") +def doubleml_irm_scalar_fixture(irm_score, n_rep, set_ml_g0_ext, set_ml_g1_ext, set_ml_m_ext): + n_folds = 3 + ext_predictions = {} + + np.random.seed(42) + data = make_irm_data(theta=0.5, n_obs=500, dim_x=20, return_type="DoubleMLData") + + ml_g = RandomForestRegressor(n_estimators=10, max_depth=3, random_state=42) + ml_m = RandomForestClassifier(n_estimators=10, max_depth=3, random_state=42) + + # Fit reference model + dml_irm = IRM(data, score=irm_score) + dml_irm.set_learners(ml_g=ml_g, ml_m=ml_m) + np.random.seed(3141) + dml_irm.draw_sample_splitting(n_folds=n_folds, n_rep=n_rep) + dml_irm.fit() + + # Build external predictions dict + if set_ml_g0_ext: + ext_predictions["ml_g0"] = dml_irm.predictions["ml_g0"] + + if set_ml_g1_ext: + ext_predictions["ml_g1"] = dml_irm.predictions["ml_g1"] + + if set_ml_m_ext: + ext_predictions["ml_m"] = dml_irm.predictions["ml_m"] + + # Fit model with external predictions — only set learners that are needed + dml_irm_ext = IRM(data, score=irm_score) + learner_kwargs = {} + if not (set_ml_g0_ext and set_ml_g1_ext): + learner_kwargs["ml_g"] = RandomForestRegressor(n_estimators=10, max_depth=3, random_state=42) + if not set_ml_m_ext: + 
learner_kwargs["ml_m"] = RandomForestClassifier(n_estimators=10, max_depth=3, random_state=42) + if learner_kwargs: + dml_irm_ext.set_learners(**learner_kwargs) + + np.random.seed(3141) + dml_irm_ext.draw_sample_splitting(n_folds=n_folds, n_rep=n_rep) + dml_irm_ext.fit(external_predictions=ext_predictions if ext_predictions else None) + + res_dict = { + "coef_normal": dml_irm.coef[0], + "coef_ext": dml_irm_ext.coef[0], + "se_normal": dml_irm.se[0], + "se_ext": dml_irm_ext.se[0], + } + + return res_dict + + +@pytest.mark.ci +def test_doubleml_irm_scalar_coef(doubleml_irm_scalar_fixture): + assert math.isclose( + doubleml_irm_scalar_fixture["coef_normal"], + doubleml_irm_scalar_fixture["coef_ext"], + rel_tol=1e-9, + abs_tol=1e-4, + ) + + +@pytest.mark.ci +def test_doubleml_irm_scalar_se(doubleml_irm_scalar_fixture): + assert math.isclose( + doubleml_irm_scalar_fixture["se_normal"], + doubleml_irm_scalar_fixture["se_ext"], + rel_tol=1e-9, + abs_tol=1e-4, + ) diff --git a/doubleml/irm/tests/test_irm_scalar_return_types.py b/doubleml/irm/tests/test_irm_scalar_return_types.py new file mode 100644 index 00000000..15eaae82 --- /dev/null +++ b/doubleml/irm/tests/test_irm_scalar_return_types.py @@ -0,0 +1,170 @@ +import numpy as np +import pandas as pd +import pytest +from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor + +from doubleml.irm.datasets import make_irm_data +from doubleml.irm.irm_scalar import IRM + +N_OBS = 200 +N_FOLDS = 3 +N_REP = 2 +N_REP_BOOT = 314 + +np.random.seed(3141) +obj_dml_data = make_irm_data(theta=0.5, n_obs=N_OBS, dim_x=10, return_type="DoubleMLData") + + +@pytest.fixture(scope="module") +def fitted_dml_obj(): + np.random.seed(3141) + dml_obj = IRM(obj_dml_data) + dml_obj.set_learners( + ml_g=RandomForestRegressor(n_estimators=10, max_depth=3, random_state=42), + ml_m=RandomForestClassifier(n_estimators=10, max_depth=3, random_state=42), + ) + dml_obj.draw_sample_splitting(n_folds=N_FOLDS, n_rep=N_REP) + dml_obj.fit() + 
dml_obj.bootstrap(n_rep_boot=N_REP_BOOT) + return dml_obj + + +@pytest.mark.ci +def test_coef_type_and_shape(fitted_dml_obj): + assert isinstance(fitted_dml_obj.coef, np.ndarray) + assert fitted_dml_obj.coef.shape == (1,) + + +@pytest.mark.ci +def test_se_type_and_shape(fitted_dml_obj): + assert isinstance(fitted_dml_obj.se, np.ndarray) + assert fitted_dml_obj.se.shape == (1,) + + +@pytest.mark.ci +def test_all_thetas_shape(fitted_dml_obj): + assert isinstance(fitted_dml_obj.all_thetas, np.ndarray) + assert fitted_dml_obj.all_thetas.shape == (1, N_REP) + + +@pytest.mark.ci +def test_all_coef_shape(fitted_dml_obj): + assert isinstance(fitted_dml_obj.all_coef, np.ndarray) + assert fitted_dml_obj.all_coef.shape == (1, N_REP) + + +@pytest.mark.ci +def test_all_ses_shape(fitted_dml_obj): + assert isinstance(fitted_dml_obj.all_ses, np.ndarray) + assert fitted_dml_obj.all_ses.shape == (1, N_REP) + + +@pytest.mark.ci +def test_summary_type(fitted_dml_obj): + assert isinstance(fitted_dml_obj.summary, pd.DataFrame) + assert fitted_dml_obj.summary.shape[0] == 1 + + +@pytest.mark.ci +def test_confint_type_and_shape(fitted_dml_obj): + ci = fitted_dml_obj.confint() + assert isinstance(ci, pd.DataFrame) + assert ci.shape == (1, 2) + + +@pytest.mark.ci +def test_confint_joint(fitted_dml_obj): + ci_joint = fitted_dml_obj.confint(joint=True) + assert isinstance(ci_joint, pd.DataFrame) + assert ci_joint.shape == (1, 2) + + +@pytest.mark.ci +def test_psi_shape(fitted_dml_obj): + assert isinstance(fitted_dml_obj.psi, np.ndarray) + assert fitted_dml_obj.psi.shape == (N_OBS, 1, N_REP) + + +@pytest.mark.ci +def test_predictions_type(fitted_dml_obj): + preds = fitted_dml_obj.predictions + assert isinstance(preds, dict) + assert "ml_g0" in preds + assert "ml_g1" in preds + assert "ml_m" in preds + assert preds["ml_g0"].shape == (N_OBS, N_REP) + assert preds["ml_g1"].shape == (N_OBS, N_REP) + assert preds["ml_m"].shape == (N_OBS, N_REP) + + +@pytest.mark.ci +def 
test_smpls_type(fitted_dml_obj): + smpls = fitted_dml_obj.smpls + assert isinstance(smpls, list) + assert len(smpls) == N_REP + assert len(smpls[0]) == N_FOLDS + + +@pytest.mark.ci +def test_n_properties(fitted_dml_obj): + assert fitted_dml_obj.n_obs == N_OBS + assert fitted_dml_obj.n_folds == N_FOLDS + assert fitted_dml_obj.n_rep == N_REP + assert fitted_dml_obj.score == "ATE" + + +@pytest.mark.ci +def test_required_learners(fitted_dml_obj): + assert fitted_dml_obj.required_learners == ["ml_g0", "ml_g1", "ml_m"] + assert "ml_g0" in fitted_dml_obj.learners + assert "ml_g1" in fitted_dml_obj.learners + assert "ml_m" in fitted_dml_obj.learners + + +@pytest.mark.ci +def test_str_repr(fitted_dml_obj): + assert isinstance(str(fitted_dml_obj), str) + assert isinstance(repr(fitted_dml_obj), str) + + +@pytest.mark.ci +def test_get_params(fitted_dml_obj): + params = fitted_dml_obj.get_params("ml_g0") + assert isinstance(params, dict) + assert "n_estimators" in params + + +@pytest.mark.ci +def test_set_params(fitted_dml_obj): + result = fitted_dml_obj.set_params("ml_g0", n_estimators=5) + assert result is fitted_dml_obj + params = fitted_dml_obj.get_params("ml_g0") + assert params["n_estimators"] == 5 + # Reset + fitted_dml_obj.set_params("ml_g0", n_estimators=10) + + +@pytest.mark.ci +def test_get_params_invalid_learner(fitted_dml_obj): + with pytest.raises(ValueError, match="not registered"): + fitted_dml_obj.get_params("ml_invalid") + + +@pytest.mark.ci +def test_before_fit_raises(): + np.random.seed(3141) + dml_obj = IRM(obj_dml_data) + with pytest.raises(ValueError, match="framework is not yet initialized"): + _ = dml_obj.coef + with pytest.raises(ValueError, match="Predictions not available. 
Call fit"): + _ = dml_obj.predictions + + +@pytest.mark.ci +def test_irm_properties(fitted_dml_obj): + assert isinstance(fitted_dml_obj.normalize_ipw, bool) + assert fitted_dml_obj.normalize_ipw is False + assert isinstance(fitted_dml_obj.weights, dict) + assert "weights" in fitted_dml_obj.weights + assert fitted_dml_obj.ps_processor is not None + assert fitted_dml_obj.ps_processor_config is not None diff --git a/doubleml/irm/tests/test_irm_scalar_vs_irm.py b/doubleml/irm/tests/test_irm_scalar_vs_irm.py new file mode 100644 index 00000000..196385e8 --- /dev/null +++ b/doubleml/irm/tests/test_irm_scalar_vs_irm.py @@ -0,0 +1,82 @@ +"""Compare IRM scalar against the existing DoubleMLIRM implementation.""" + +import numpy as np +import pytest +from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor + +import doubleml as dml +from doubleml.irm.datasets import make_irm_data +from doubleml.irm.irm_scalar import IRM + + +@pytest.fixture(scope="module", params=["ATE", "ATTE"]) +def score(request): + return request.param + + +@pytest.fixture(scope="module", params=[1, 3]) +def n_rep(request): + return request.param + + +@pytest.fixture(scope="module") +def comparison_fixture(score, n_rep): + n_folds = 5 + seed = 3141 + + np.random.seed(42) + obj_dml_data = make_irm_data(theta=0.5, n_obs=500, dim_x=20, return_type="DoubleMLData") + + ml_g = RandomForestRegressor(n_estimators=10, max_depth=3, random_state=42) + ml_m = RandomForestClassifier(n_estimators=10, max_depth=3, random_state=42) + + # Old IRM + np.random.seed(seed) + dml_old = dml.DoubleMLIRM( + obj_dml_data, + ml_g, + ml_m, + n_folds=n_folds, + n_rep=n_rep, + score=score, + ) + dml_old.fit() + + # New IRM scalar — share sample splits from old model for exact comparison + dml_new = IRM(obj_dml_data, score=score) + dml_new.set_learners(ml_g=ml_g, ml_m=ml_m) + # Copy sample splits directly to ensure identical cross-fitting structure + dml_new._n_folds = n_folds + dml_new._n_rep = n_rep + dml_new._smpls 
= dml_old.smpls + dml_new.fit() + + return {"old": dml_old, "new": dml_new} + + +@pytest.mark.ci +def test_coef_equal(comparison_fixture): + old = comparison_fixture["old"] + new = comparison_fixture["new"] + np.testing.assert_allclose(new.coef, old.coef, rtol=1e-9) + + +@pytest.mark.ci +def test_se_equal(comparison_fixture): + old = comparison_fixture["old"] + new = comparison_fixture["new"] + np.testing.assert_allclose(new.se, old.se, rtol=1e-9) + + +@pytest.mark.ci +def test_all_coef_equal(comparison_fixture): + old = comparison_fixture["old"] + new = comparison_fixture["new"] + np.testing.assert_allclose(new.all_thetas, old.all_coef, rtol=1e-9) + + +@pytest.mark.ci +def test_all_se_equal(comparison_fixture): + old = comparison_fixture["old"] + new = comparison_fixture["new"] + np.testing.assert_allclose(new.all_ses, old.all_se, rtol=1e-9) From 0947c9d19e640266360b9d088b5508f833f6c80d Mon Sep 17 00:00:00 2001 From: SvenKlaassen Date: Sat, 7 Feb 2026 10:25:42 +0100 Subject: [PATCH 10/38] Refactor documentation and guidelines for DoubleML, including coding standards, error handling, performance guidelines, and testing conventions. 
--- .claude/CLAUDE.md | 199 ++++----------------- .claude/agents/py-reviewer.md | 66 +++++++ .claude/rules/dml-scalar-test-structure.md | 135 ++++++++++++++ .claude/rules/error-handling.md | 91 ++++++++++ .claude/rules/performance-guidelines.md | 67 +++++++ .claude/rules/py-code-conventions.md | 196 ++++++++++++++++++++ .claude/rules/testing-conventions.md | 104 +++++++++++ 7 files changed, 689 insertions(+), 169 deletions(-) create mode 100644 .claude/agents/py-reviewer.md create mode 100644 .claude/rules/dml-scalar-test-structure.md create mode 100644 .claude/rules/error-handling.md create mode 100644 .claude/rules/performance-guidelines.md create mode 100644 .claude/rules/py-code-conventions.md create mode 100644 .claude/rules/testing-conventions.md diff --git a/.claude/CLAUDE.md b/.claude/CLAUDE.md index 6c302c11..0dc51dca 100644 --- a/.claude/CLAUDE.md +++ b/.claude/CLAUDE.md @@ -1,84 +1,20 @@ -# DoubleML for Python - Claude Code Memory +# DoubleML for Python -## Project Purpose - -DoubleML is a Python package implementing Double/Debiased Machine Learning (DML) methods for causal inference. 
The package provides: +DoubleML is a Python package implementing Double/Debiased Machine Learning (DML) methods for causal inference: - Partially Linear Models (PLR, PLIV, PLPR, LPLR) - Interactive Regression Models (IRM, IIVM, APO, QTE, CVAR, SSM) - Difference-in-Differences estimators (DID, DIDCSBinary, DIDMulti) - Regression Discontinuity Design (RDD) -**Documentation**: https://docs.doubleml.org - -## Coding Standards - -### Python -- **Version**: Python 3.11+ (supports 3.11, 3.12, 3.13) -- **Formatter**: black with line-length 127 -- **Linter**: ruff (rules: E, F, W, I) -- **Type Checker**: mypy with `disallow_untyped_defs = true` -- **Type hints**: Required for all functions -- **Docstrings**: NumPy-style (see example below) -- **Max line length**: 127 characters - -### NumPy Docstring Style -```python -def example_function(param1: int, param2: str) -> bool: - """ - Short description of the function. - - Parameters - ---------- - param1 : int - Description of param1. - param2 : str - Description of param2. - - Returns - ------- - bool - Description of return value. - - Raises - ------ - ValueError - If param1 is negative. - """ -``` - -### Code Quality Commands -```bash -# Format code -black . - -# Lint code -ruff check . - -# Fix linting issues -ruff check --fix . 
- -# Type check -mypy doubleml -``` - -### Pre-commit Hooks -Pre-commit is configured with: -- File format checks (yaml, toml) -- Debug statement detection -- Large file checks -- Trailing whitespace and line ending fixes -- black formatting -- ruff linting with auto-fix - -Run pre-commit manually: `pre-commit run --all-files` +**Docs**: https://docs.doubleml.org | **Source**: https://github.com/DoubleML/doubleml-for-py -## Architecture Overview +## Architecture ### Class Hierarchy ``` DoubleMLBase (ABC) └─> DoubleMLScalar (ABC) - single-parameter models - ├─> LinearScoreMixin - closed-form solver + ├─> LinearScoreMixin - closed-form solver (θ = -E[ψ_b]/E[ψ_a]) │ ├─> DoubleMLPLR │ ├─> DoubleMLIRM │ ├─> DoubleMLPLIV @@ -89,15 +25,15 @@ DoubleMLBase (ABC) DoubleML - multi-parameter estimation (extends DoubleMLScalar) ``` -### Key Design Patterns -- **Template Method**: `fit()` orchestrates; subclasses implement abstract methods -- **Mixin Pattern**: LinearScoreMixin provides closed-form θ = -E[ψ_b]/E[ψ_a] -- **Delegation**: DoubleMLBase delegates inference to DoubleMLFramework +### Design Patterns +- **Template Method**: `fit()` orchestrates; subclasses implement `_nuisance_est()`, `_get_score_elements()` +- **Mixin Pattern**: `LinearScoreMixin` provides closed-form coefficient estimation +- **Delegation**: `DoubleMLBase` delegates inference to `DoubleMLFramework` ### Core Files | File | Purpose | |------|---------| -| `doubleml/double_ml_base.py` | Abstract base with properties (coef, se, summary) and inference methods | +| `doubleml/double_ml_base.py` | Abstract base with properties (coef, se, summary) and inference | | `doubleml/double_ml_scalar.py` | Single-parameter estimation orchestrator | | `doubleml/double_ml.py` | Multi-parameter estimation with sample splitting | | `doubleml/double_ml_framework.py` | Statistical inference (confint, bootstrap, sensitivity) | @@ -115,108 +51,33 @@ doubleml/ └── tests/ # Main test directory ``` -## Testing - -### Run Tests 
-```bash -# Run all tests -pytest - -# Run with coverage -pytest --cov - -# Run specific marker (CI tests) -pytest -m ci - -# Run specific test file -pytest doubleml/tests/test_framework.py - -# Run tests for a specific module -pytest doubleml/plm/tests/ -``` - -### Test Markers -- `ci`: Continuous integration tests for GitHub Actions -- `ci_rdd`: RDD-specific CI tests - -### Test Organization -- Each module (plm, irm, did) has its own `tests/` subdirectory -- Test utilities in `doubleml/tests/_utils*.py` -- Manual computation helpers verify results independently - -## Git Workflow - -### Branches -- `main`: Main development branch -- Feature branches for new work - -### Commit Format -Use Conventional Commits: -- `feat:` new feature -- `fix:` bug fix -- `docs:` documentation -- `refactor:` code refactoring -- `test:` adding tests -- `chore:` maintenance - ## Key Dependencies -### Core -- numpy>=2.0.0, pandas>=2.0.0, scipy>=1.7.0 -- scikit-learn>=1.6.0, statsmodels>=0.14.0 - -### ML/Tuning -- optuna>=4.6.0 (hyperparameter tuning) -- joblib>=1.2.0 (parallelization) - -### Visualization -- matplotlib>=3.9.0, seaborn>=0.13, plotly>=5.0.0 - -### Development -- pytest>=8.3.0, pytest-cov>=6.0.0 -- black>=25.1.0, ruff>=0.11.1, mypy>=1.18.0 -- xgboost>=2.1.0, lightgbm>=4.6.0 (for testing) +**Core**: numpy>=2.0.0, pandas>=2.0.0, scipy>=1.7.0, scikit-learn>=1.6.0, statsmodels>=0.14.0 +**ML/Tuning**: optuna>=4.6.0, joblib>=1.2.0 +**Visualization**: matplotlib>=3.9.0, seaborn>=0.13, plotly>=5.0.0 +**Dev**: pytest>=8.3.0, black>=25.1.0, ruff>=0.11.1, mypy>=1.18.0, xgboost>=2.1.0, lightgbm>=4.6.0 -## Known Pitfalls - -### Type Annotations -- MyPy is strict: `disallow_untyped_defs = true` -- All functions need full type hints including return types -- Use `from __future__ import annotations` for forward references - -### Learner Validation -- Learners must be scikit-learn compatible (fit/predict interface) -- Use `_check_learner()` from `doubleml/utils/_checks.py` for validation 
-- Classifiers need `predict_proba()` for propensity scores - -### Sample Splitting -- Cross-fitting uses `DoubleMLResampling` from `doubleml/utils/resampling.py` -- Default is 5-fold cross-fitting with 1 repetition -- Cluster-robust resampling available for clustered data - -### Score Functions -- Linear scores use closed-form: θ = -E[ψ_b]/E[ψ_a] -- Custom scores can be passed as callables -- Score elements: `psi_a` (derivative), `psi_b` (moment) +## Git Workflow -### External Predictions -- Models support external predictions via `set_external_predictions()` -- Predictions must match sample splitting structure +- **Main branch**: `main` +- **Commits**: Conventional Commits — `feat:`, `fix:`, `docs:`, `refactor:`, `test:`, `chore:` ## Verification Before completing any task: -1. Run `ruff check .` to check for linting issues -2. Run `mypy doubleml` for type checking -3. Run relevant tests: `pytest doubleml/path/to/tests/` -4. Format code: `black .` - -## Useful Links +```bash +black . # Format +ruff check --fix . # Lint +mypy doubleml # Type check +pytest -m ci # Tests +``` -- **Documentation**: https://docs.doubleml.org -- **Source**: https://github.com/DoubleML/doubleml-for-py -- **Bug Tracker**: https://github.com/DoubleML/doubleml-for-py/issues -- **Architecture Docs**: [doc/diagrams/architecture.md](doc/diagrams/architecture.md) +## Coding Standards ---- -*Update this file when Claude makes mistakes to prevent future issues.* +Detailed conventions are in `.claude/rules/`: +- **py-code-conventions.md** — Formatting, type hints, docstrings, naming, DML-specific patterns +- **error-handling.md** — Exception types, validation patterns, warnings vs. 
errors +- **performance-guidelines.md** — Vectorization, pre-allocation, DML computation patterns +- **testing-conventions.md** — Markers, fixtures, assertion patterns +- **dml-scalar-test-structure.md** — Mandatory 5-file test structure for scalar models diff --git a/.claude/agents/py-reviewer.md b/.claude/agents/py-reviewer.md new file mode 100644 index 00000000..dece1193 --- /dev/null +++ b/.claude/agents/py-reviewer.md @@ -0,0 +1,66 @@ +--- +name: py-reviewer +description: Python code reviewer for DoubleML. Checks type safety, learner handling, score contracts, and test coverage. Use after writing or modifying Python files. +tools: Read, Grep, Glob, Bash +model: inherit +--- + +Review Python code changes against DoubleML project conventions. Report issues only — never edit source files. + +## Workflow + +1. Run `git diff --name-only HEAD~1` to identify changed files (use Bash) +2. Read each changed `.py` file +3. Review against the checklist below +4. Output findings in the format specified + +## Review Checklist + +### Critical (must fix — blocks merge) +- **Type hints**: All functions have parameter types and return types. Missing `-> None` counts. +- **`from __future__ import annotations`**: Present when class methods reference their own type (forward refs) +- **Learner validation**: `_check_learner()` called for every user-provided learner +- **Learner cloning**: `clone(learner)` before `.fit()` — learners are mutable +- **Score contract**: `_get_score_elements()` returns `{'psi_a': ..., 'psi_b': ...}` with shape `(n_obs,)` +- **Sample splitting**: Uses `DoubleMLResampling`, never raw `KFold` +- **Test markers**: Every test function has `@pytest.mark.ci` +- **Exception messages**: Include expected vs. 
actual values (`got {value}`) + +### Warnings (should fix) +- **Module docstring**: File starts with `"""..."""` describing the module +- **NumPy-style docstrings**: Public functions/classes have Parameters + Returns sections +- **Naming**: Classes use `DoubleML` prefix, score elements use `psi_a`/`psi_b`, stats use `theta`/`se`/`n_obs` +- **Magic numbers**: Unexplained numeric literals (should be named constants) +- **Vectorization**: Python loops over `n_obs`-sized arrays (should be NumPy ops) +- **Error handling**: `_check_*` helpers from `doubleml/utils/_checks.py` used where applicable + +### Suggestions (nice to have) +- **Property vs. method**: Cheap computed attributes should be `@property`, side effects should be methods +- **Decorator usage**: `@staticmethod` for `_check_data()`, `@abstractmethod` for template hooks +- **Class vs. instance variables**: `_LEARNER_SPECS`/`_VALID_SCORES` should be class-level + +### Intentionally Acceptable (do NOT flag) +- `Any` type for scikit-learn estimators and learner objects +- `E721` type comparisons (`type(x) == Y`) — intentionally allowed by ruff config +- Test files without type annotations — excluded from mypy +- `# type: ignore` when suppressing third-party library issues (not own code) + +## Output Format + +```markdown +## Code Review: `<file path>` + +### Critical +- **line N**: [issue description]. Fix: `<suggested fix>` + +### Warnings +- **line N**: [issue description]. Consider: `<suggested alternative>` + +### Suggestions +- **line N**: [issue description] + +### Summary +[1-2 sentences: overall assessment, number of issues by severity] +``` + +Review each changed file separately. If no issues found, state "No issues found" for that file.
diff --git a/.claude/rules/dml-scalar-test-structure.md b/.claude/rules/dml-scalar-test-structure.md new file mode 100644 index 00000000..8ee372ae --- /dev/null +++ b/.claude/rules/dml-scalar-test-structure.md @@ -0,0 +1,135 @@ +# DoubleMLScalar Test Structure + +> **Apply when**: Implementing a new model in the `DoubleMLScalar` hierarchy. +> **Source**: Derived from `doc/diagrams/testing_structure.md`. + +## Required Test Files + +Every scalar model `<Model>` in module `<module>/` requires **5 test files** in `doubleml/<module>/tests/`: + +| File | Purpose | +|------|---------| +| `test_<model>_scalar.py` | Core estimation accuracy (3-sigma rule) | +| `test_<model>_scalar_return_types.py` | Property types, shapes, API contracts | +| `test_<model>_scalar_exceptions.py` | Input validation, error messages | +| `test_<model>_scalar_vs_<model>.py` | Exact match with old `DoubleML<Model>` | +| `test_<model>_scalar_external_predictions.py` | External predictions equivalence | + +All test functions must be marked `@pytest.mark.ci`. + +--- + +## 1. Core Estimation (`test_<model>_scalar.py`) + +**Fixture**: Parametrize over `score` variants and model-specific options. Use `scope="module"`, `np.random.seed(3141)`, `n_obs=500`, `n_folds=5`, `n_rep=1`. + +**Required tests**: +- `test_coef`: `abs(coef - true_theta) <= 3.0 * se` (when true theta matches DGP) + - For unknown true params (e.g., ATTE): `np.isfinite(coef)` and `abs(coef) < 10.0` +- `test_se`: `se > 0` + +## 2. Return Types (`test_<model>_scalar_return_types.py`) + +**Constants**: `N_OBS=200`, `N_FOLDS=3`, `N_REP=2`. Single fixture fitting one model.
+ +**Required tests**: + +| Test | Assertion | +|------|-----------| +| `test_coef_type_and_shape` | `isinstance(coef, np.ndarray)`, `shape == (1,)` | +| `test_se_type_and_shape` | `isinstance(se, np.ndarray)`, `shape == (1,)` | +| `test_all_thetas_shape` | `shape == (1, N_REP)` | +| `test_all_ses_shape` | `shape == (1, N_REP)` | +| `test_summary_type` | `isinstance(summary, pd.DataFrame)`, `len == 1` | +| `test_confint_type_and_shape` | `isinstance(ci, pd.DataFrame)`, `shape == (1, 2)` | +| `test_psi_shape` | `shape == (N_OBS, 1, N_REP)` | +| `test_predictions_type` | `isinstance(predictions, dict)`, each value `shape == (N_OBS, N_REP)` | +| `test_smpls_type` | `len(smpls) == N_REP`, each has `N_FOLDS` tuples of `(train, test)` arrays | +| `test_n_properties` | `n_obs == N_OBS`, `n_folds == N_FOLDS`, `n_rep == N_REP`, `score == expected` | +| `test_required_learners` | Returns list of expected learner names | +| `test_str_repr` | `str(model)` and `repr(model)` return `str` | +| `test_get_params` | Returns dict with learner keys | +| `test_set_params` | Modifies and confirms learner parameter change | +| `test_before_fit_raises` | `coef`/`se` before `fit()` raises error | + +## 3. 
Exceptions (`test_<model>_scalar_exceptions.py`) + +**Common exception tests** (required for all models): + +| Test | Input | Expected | +|------|-------|----------| +| `test_exception_data` | Non-DoubleMLData | `TypeError` | +| `test_exception_score` | Invalid score string | `ValueError` | +| `test_exception_n_folds` | `n_folds < 2` | `ValueError` | +| `test_exception_n_rep` | `n_rep < 1` | `ValueError` | +| `test_exception_fit_nuisance_without_smpls` | Fit before `draw_sample_splitting()` | `ValueError` | +| `test_exception_estimate_causal_without_predictions` | Estimate before `fit_nuisance_models()` | `ValueError` | +| `test_exception_missing_learner` | `fit()` without required learners | `ValueError` | +| `test_exception_invalid_learner` | Class instead of instance | `TypeError` | + +**Model-specific exceptions** to add per model: +- PLR: multiple treatments, `ml_g` warning for partialling out +- IRM: non-binary treatment, `ml_m` must be classifier, `normalize_ipw` type + +Always use `pytest.raises(Error, match=r"regex pattern")`. + +## 4. Comparison (`test_<model>_scalar_vs_<model>.py`) + +**Fixture**: Parametrize `score` and `n_rep` (use `[1, 3]`). + +**Critical pattern**: Share sample splits from old model: +```python +dml_new._smpls = dml_old.smpls # Old/new consume random state differently +``` + +**Required tests** — all use `np.testing.assert_allclose(..., rtol=1e-9)`: +- `test_coef_equal`: `new.coef` vs `old.coef` +- `test_se_equal`: `new.se` vs `old.se` +- `test_all_coef_equal`: `new.all_thetas` vs `old.all_coef` (note: property name differs!) +- `test_all_se_equal`: `new.all_ses` vs `old.all_se` + +## 5. External Predictions (`test_<model>_scalar_external_predictions.py`) + +**Fixture**: Parametrize `score`, `n_rep` (`[1, 3]`), and one `set_ml_x_ext` bool fixture per learner. + +**Pattern**: +1. Fit reference model normally +2. Extract `dml_ref.predictions['ml_x']` for external learners +3. 
Fit test model with `dml_ext._smpls = dml_ref.smpls` and `fit(external_predictions=...)` + +**Required tests** — use `math.isclose(a, b, rel_tol=1e-9, abs_tol=1e-4)` (not `assert_allclose`): +- `test_coef`: Reference vs. external +- `test_se`: Reference vs. external + +`math.isclose` with `abs_tol=1e-4` is used because small numerical differences accumulate when mixing external and fitted predictions. + +--- + +## Assertion Tolerance Summary + +| Context | Method | Why | +|---------|--------|-----| +| Core estimation | `abs(coef - true) <= 3.0 * se` | Statistical 3-sigma | +| Backward compatibility | `assert_allclose(rtol=1e-9)` | Must be identical | +| External predictions | `math.isclose(rel_tol=1e-9, abs_tol=1e-4)` | Numerical accumulation | + +## New Model Checklist + +### Implementation +- [ ] Inherits from `LinearScoreMixin` (or `NonLinearScoreMixin`) +- [ ] `_LEARNER_SPECS` class variable defined +- [ ] `required_learners` property (score-dependent list) +- [ ] `set_learners()` with model-specific kwargs +- [ ] `_check_data()` static method +- [ ] `draw_sample_splitting()` (override if stratification needed) +- [ ] `_nuisance_est()` per-fold estimation +- [ ] `_get_score_elements()` returns `{'psi_a': ..., 'psi_b': ...}` + +### Tests +- [ ] All 5 test files created and pass: `pytest doubleml/<module>/tests/test_<model>_scalar*.py -v -m ci` +- [ ] Old tests still pass: `pytest doubleml/<module>/tests/ -v` + +### Quality +- [ ] `black doubleml/<module>/` +- [ ] `ruff check doubleml/<module>/` +- [ ] `mypy doubleml/<module>/` diff --git a/.claude/rules/error-handling.md b/.claude/rules/error-handling.md new file mode 100644 index 00000000..285be7c0 --- /dev/null +++ b/.claude/rules/error-handling.md @@ -0,0 +1,91 @@ +# Error Handling — DoubleML + +> **Apply when**: Adding input validation, raising exceptions, or writing `pytest.raises` tests. 
+ +## Exception Type Mapping + +| Situation | Exception | Example | +|-----------|-----------|---------| +| Invalid parameter value | `ValueError` | `n_folds < 2`, unknown `score` | +| Wrong argument type | `TypeError` | Non-`DoubleMLData` passed, class instead of instance | +| Property accessed before `fit()` | `ValueError` | `model.coef` before fitting | +| Wrong method call order | `ValueError` | `fit_nuisance_models()` before `draw_sample_splitting()` | + +## Validation Patterns + +### Use Project Helpers + +Always use validation functions from `doubleml/utils/_checks.py`: + +```python +from doubleml.utils._checks import _check_learner, _check_score, _check_finite_predictions + +# Learner validation (checks sklearn compatibility, instance vs class) +self._learner_ml_l = _check_learner(ml_l, 'ml_l', regressor=True, classifier=False) + +# Score validation +_check_score(score, valid_scores=['IV-type', 'partialling out'], allow_callable=True) +``` + +### Fail Fast — Validate in Constructor and Setters + +```python +def __init__(self, data: DoubleMLData, score: str = "ATE") -> None: + self._check_data(data) # Validate immediately + + if score not in self._VALID_SCORES: + raise ValueError(f"score must be one of {self._VALID_SCORES}, got '{score}'") +``` + +### Error Messages Must Include Expected vs. Actual + +```python +# Good: specific and actionable +raise ValueError(f"n_folds must be at least 2, got {n_folds}") +raise TypeError( + f"ml_m must be a classifier with predict_proba(). " + f"Got {type(ml_m).__name__}. Did you pass a class instead of an instance?" +) + +# Bad: vague +raise ValueError("Invalid input") +``` + +### Method Call Order Validation + +```python +def fit_nuisance_models(self) -> None: + if self._smpls is None: + raise ValueError("Sample splitting has not been drawn. Call draw_sample_splitting() first.") + +def estimate_causal_parameters(self) -> None: + if self._predictions is None: + raise ValueError("Nuisance models not fitted. 
Call fit_nuisance_models() first, or use fit().") +``` + +## Warnings vs. Exceptions + +- **Exception**: Input is invalid, execution cannot continue +- **`warnings.warn()`**: Input is valid but may cause poor results + +```python +# Warn on extreme propensity scores (valid but risky) +if np.any((propensity < 1e-12) | (propensity > 1 - 1e-12)): + warnings.warn( + f"Propensity scores close to 0 or 1 (eps=1e-12). " + f"Trimming at {self._trimming_threshold}.", + UserWarning + ) +``` + +## Testing Exceptions + +Always use `match=` with regex to verify the error message: + +```python +@pytest.mark.ci +def test_exception_invalid_score(): + msg = r"score must be one of .*, got 'invalid'" + with pytest.raises(ValueError, match=msg): + DoubleMLPLR(data, score='invalid') +``` diff --git a/.claude/rules/performance-guidelines.md b/.claude/rules/performance-guidelines.md new file mode 100644 index 00000000..6854eaba --- /dev/null +++ b/.claude/rules/performance-guidelines.md @@ -0,0 +1,67 @@ +# Performance Guidelines — DoubleML + +> **Apply when**: Writing nuisance estimation, score computation, or any code operating on `(n_obs,)` or `(n_obs, n_rep)` arrays. + +## Core Rules + +1. **Vectorize** — Use NumPy array operations, never Python loops over observations +2. **Pre-allocate** — Create output arrays at full size before filling per-fold +3. **Clone before fit** — `clone(learner).fit(X, y)` — learners are mutable +4. 
**Profile first** — Don't optimize without measuring + +## DoubleML-Specific Patterns + +### Nuisance Estimation (Per-Fold) + +```python +# Pre-allocate prediction arrays +predictions = { + 'ml_l': np.zeros((n_obs, n_rep)), + 'ml_m': np.zeros((n_obs, n_rep)), +} + +for i_rep, smpl in enumerate(smpls): + for train_idx, test_idx in smpl: + # Clone learner (mutable!), fit, predict in one chain + predictions['ml_l'][test_idx, i_rep] = ( + clone(self._learner_ml_l).fit(X[train_idx], y[train_idx]).predict(X[test_idx]) + ) +``` + +### Score Computation + +```python +# Vectorized — operates on full arrays +psi_a = -d_res * d_res # (n_obs,) +psi_b = d_res * (y - ml_g_hat) # (n_obs,) +theta = -np.mean(psi_b) / np.mean(psi_a) +``` + +### Propensity Scores + +```python +# predict_proba returns (n_obs, 2) — take column 1 +propensity = clone(self._learner_ml_m).fit(X_train, d_train).predict_proba(X_test)[:, 1] + +# Clip in one vectorized operation +propensity = np.clip(propensity, self._trimming_threshold, 1 - self._trimming_threshold) +``` + +### Matrix Operations + +```python +# Use lstsq, not manual inversion +beta = np.linalg.lstsq(X, y, rcond=None)[0] + +# Not: beta = np.linalg.inv(X.T @ X) @ X.T @ y (numerically unstable) +``` + +## Anti-Patterns + +| Don't | Do Instead | +|-------|-----------| +| `for i in range(n_obs): result[i] = ...` | `result = vectorized_op(array)` | +| `np.append(result, value)` in a loop | Pre-allocate `np.zeros(n)`, fill by index | +| `df.apply(lambda x: ...)` | `df['col'] ** 2` or `np.log(df['col'])` | +| `KFold(n_splits=5)` | `DoubleMLResampling(n_folds=5, ...)` | +| `np.linalg.inv(X.T @ X) @ X.T @ y` | `np.linalg.lstsq(X, y, rcond=None)[0]` | diff --git a/.claude/rules/py-code-conventions.md b/.claude/rules/py-code-conventions.md new file mode 100644 index 00000000..4344a3a0 --- /dev/null +++ b/.claude/rules/py-code-conventions.md @@ -0,0 +1,196 @@ +# Python Code Conventions — DoubleML + +> **Apply when**: Writing or modifying any Python file 
in `doubleml/`. + +## Tooling (from `pyproject.toml`) + +| Tool | Config | Command | +|------|--------|---------| +| **black** | line-length=127, preview=true, target py310-313 | `black .` | +| **ruff** | rules E,F,W,I; ignores E721; target py312 | `ruff check .` / `ruff check --fix .` | +| **mypy** | `disallow_untyped_defs=true`, `no_implicit_optional=true`, excludes tests | `mypy doubleml` | +| **pre-commit** | black + ruff + trailing whitespace + debug statements | `pre-commit run --all-files` | + +## File Structure + +Every new or modified Python file must start with a **module-level docstring**. Do not add copyright headers, author/date stamps, or file paths — git tracks all of that. + +### Module Docstring Patterns + +Match the existing codebase style depending on file type: + +```python +# Implementation files — one sentence: what the module contains +"""Partially Linear Regression (PLR) model based on the DoubleMLScalar hierarchy.""" + +# __init__.py files — Sphinx :mod: reference +"""The :mod:`doubleml.plm` module implements double machine learning estimates based on partially linear models.""" + +# Test files — one sentence: what is being tested +"""Compare PLR scalar against the existing DoubleMLPLR implementation.""" +``` + +### Full File Header (implementation files) + +```python +"""Partially Linear Regression (PLR) model based on the DoubleMLScalar hierarchy.""" +from __future__ import annotations # needed when class methods return Self/own type + +from typing import Any, Optional + +import numpy as np +import pandas as pd +from sklearn.base import clone + +from doubleml.double_ml_scalar import DoubleMLScalar +from doubleml.utils._checks import _check_learner +``` + +Import order (enforced by ruff/isort): standard library, third-party, local. + +### `from __future__ import annotations` + +Not required in every file. Use it when a class references its own type in annotations (forward reference). 
Since the project targets Python 3.10+, `list[int]`, `dict[str, T]`, and `X | Y` unions work natively without it. + +## Type Hints + +All functions require complete type annotations including return types. + +```python +def _nuisance_est(self, smpls: list[tuple[np.ndarray, np.ndarray]], n_rep: int = 1) -> dict[str, np.ndarray]: +``` + +- Use `-> None` for functions without return value +- Use `Optional[X]` or `X | None` (native on Python 3.10+, no `__future__` import needed) for nullable params +- `Any` is acceptable for scikit-learn estimators and dynamic learner objects +- Never suppress valid errors with `# type: ignore` — fix the type instead + +## Docstrings (NumPy Style) + +Required sections: **summary**, **Parameters**, **Returns**. Optional: **Raises**, **Examples**, **Notes**. + +```python +def _get_score_elements(self, psi_predictions: dict[str, np.ndarray]) -> dict[str, np.ndarray]: + """ + Compute score elements from nuisance predictions. + + Parameters + ---------- + psi_predictions : dict[str, np.ndarray] + Dictionary with keys ``'ml_l'``, ``'ml_m'`` containing predictions of shape ``(n_obs,)``. + + Returns + ------- + dict[str, np.ndarray] + Dictionary with keys ``'psi_a'`` (derivative) and ``'psi_b'`` (moment condition). + """ +``` + +Use `` :class:`~doubleml.DoubleMLData` `` for Sphinx cross-references. Use `.. math::` blocks for formulas. + +## Naming Conventions + +| Element | Convention | Example | +|---------|-----------|---------| +| Modules | `snake_case` | `double_ml_plr.py` | +| Classes | `PascalCase` with `DoubleML` prefix | `DoubleMLPLR` | +| Methods/functions | `snake_case` | `fit_nuisance_models()` | +| Private methods | `_leading_underscore` | `_nuisance_est()` | +| Class variables | `_UPPER_SNAKE` | `_LEARNER_SPECS` | +| Constants | `UPPER_SNAKE` | `DEFAULT_N_FOLDS` | +| Statistical notation | Conventional names | `theta`, `se`, `psi_a`, `psi_b`, `n_obs`, `n_folds` | + +## Class Design Patterns + +### Property vs.
Method + +- **`@property`**: Computed attributes that are cheap and feel like data — `coef`, `se`, `summary`, `predictions`, `n_obs`, `n_folds`, `n_rep`, `score` +- **Methods**: Actions with side effects or expensive computation — `fit()`, `confint()`, `bootstrap()`, `draw_sample_splitting()` +- **`fit()` returns `self`** to enable chaining + +### Class Variables vs. Instance Variables + +- **Class variable**: Shared metadata — `_LEARNER_SPECS`, `_VALID_SCORES` +- **Instance variable**: Per-object state — `_dml_data`, `_smpls`, `_predictions` + +### Decorators + +- `@staticmethod` for stateless validation: `_check_data()` +- `@property` for computed attributes: `coef`, `se` +- `@abstractmethod` for template method hooks: `_nuisance_est()`, `_get_score_elements()` + +### Score Function Contract + +`_get_score_elements()` must return `dict[str, np.ndarray]` with: +- `'psi_a'`: Score derivative, shape `(n_obs,)` +- `'psi_b'`: Moment condition, shape `(n_obs,)` + +Linear scores use closed-form: `theta = -mean(psi_b) / mean(psi_a)`. 
+ +## DoubleML-Specific Patterns + +### Learner Handling + +```python +# Always validate learners with _check_learner +self._learner_ml_l = _check_learner(ml_l, 'ml_l', regressor=True, classifier=False) + +# Always clone before fitting (learners are mutable) +fitted_learner = clone(self._learner_ml_l).fit(X_train, y_train) + +# Classifiers need predict_proba for propensity scores +propensity = fitted_learner.predict_proba(X_test)[:, 1] +``` + +### Sample Splitting + +Always use `DoubleMLResampling`, never raw `KFold`: + +```python +from doubleml.utils.resampling import DoubleMLResampling +resampling = DoubleMLResampling(n_folds=5, n_repeats=1, n_obs=n_obs) +``` + +### Vectorized Score Computation + +```python +# Correct: vectorized NumPy operations +psi_a = -d_res * d_res # shape: (n_obs,) +psi_b = d_res * (y - ml_g_hat) # shape: (n_obs,) + +# Wrong: Python loops over observations +``` + +Pre-allocate prediction arrays: `np.zeros((n_obs, n_rep))`. + +### Error Messages + +Include expected vs. actual values. Use `_check_*` helpers from `doubleml/utils/_checks.py`. + +```python +if score not in self._VALID_SCORES: + raise ValueError(f"score must be one of {self._VALID_SCORES}, got '{score}'") +``` + +Use `warnings.warn()` for non-fatal issues (e.g., extreme propensity scores), exceptions for invalid input. + +## Verification Checklist + +Before completing any task, run: + +```bash +black . # Format +ruff check --fix . 
# Lint + auto-fix +mypy doubleml # Type check +pytest -m ci # Tests +``` + +Check: +- [ ] All functions have type hints and return types +- [ ] File starts with a module-level docstring (one sentence, matching file type pattern) +- [ ] Public functions/classes have NumPy-style docstrings +- [ ] Learners validated with `_check_learner()`, cloned with `clone()` before fitting +- [ ] Score elements named `psi_a`/`psi_b`, shapes are `(n_obs,)` +- [ ] No `print()`, `breakpoint()`, or debug statements +- [ ] No magic numbers — use named constants +- [ ] Sample splitting uses `DoubleMLResampling`, not raw `KFold` diff --git a/.claude/rules/testing-conventions.md b/.claude/rules/testing-conventions.md new file mode 100644 index 00000000..0508ae42 --- /dev/null +++ b/.claude/rules/testing-conventions.md @@ -0,0 +1,104 @@ +# Testing Conventions — DoubleML + +> **Apply when**: Writing or modifying test files in `doubleml/**/tests/`. + +## Test Organization + +``` +doubleml//tests/ +├── __init__.py +├── conftest.py # Shared fixtures +├── test_.py # Legacy model tests +├── test__scalar.py # Scalar model tests (see dml-scalar-test-structure.md) +└── ... +``` + +Package-level tests and utilities live in `doubleml/tests/` (with `_utils*.py` helpers). + +## Markers + +**All test functions must be marked `@pytest.mark.ci`** — this is the CI gate. + +```python +@pytest.mark.ci +def test_coef_accuracy(fitted_model): + ... + +@pytest.mark.ci +@pytest.mark.parametrize("score", ["IV-type", "partialling out"]) +def test_score_variants(score): + ... +``` + +Other markers: `@pytest.mark.ci_rdd` for RDD-specific tests. + +Run: `pytest -m ci` (CI), `pytest doubleml/plm/tests/` (module), `pytest -k "plr and scalar"` (pattern). + +## Fixtures + +### Use `scope="module"` for Expensive Operations + +Model fitting is expensive. 
Fit once, share across tests: + +```python +@pytest.fixture(scope="module") +def fitted_model(): + np.random.seed(42) + data = make_plr_data(n_obs=200) + dml_obj = DoubleMLPLRScalar(data, score="IV-type") + dml_obj.set_learners(ml_l=Lasso(), ml_m=Lasso()) + dml_obj.draw_sample_splitting(n_folds=3, n_rep=2) + dml_obj.fit() + return dml_obj +``` + +### Parametrize for Multiple Scenarios + +```python +@pytest.fixture(scope="module", params=["IV-type", "partialling out"]) +def score(request): + return request.param +``` + +Each combination creates one fixture instance shared across all tests in the module. + +## Assertion Patterns + +| Context | Pattern | Tolerance | +|---------|---------|-----------| +| Statistical accuracy | `abs(coef - true_theta) <= 3.0 * se` | 3-sigma rule | +| Backward compatibility | `np.testing.assert_allclose(new, old, rtol=1e-9)` | Exact match | +| External predictions | `math.isclose(a, b, rel_tol=1e-9, abs_tol=1e-4)` | Small tolerance | +| Exception messages | `pytest.raises(ValueError, match=r"regex")` | Exact message | +| Types and shapes | `isinstance(x, np.ndarray)`, `x.shape == (n,)` | Exact | + +### Key: Always Use `match=` for Exception Tests + +```python +msg = r"score must be one of .*, got 'invalid'" +with pytest.raises(ValueError, match=msg): + DoubleMLPLR(data, score='invalid') +``` + +## Reproducibility + +- **Always seed**: `np.random.seed(42)` at the start of data generation +- **Share sample splits** in comparison tests: `dml_new._smpls = dml_old.smpls` + (Old and new implementations consume random state differently during `__init__`) +- **Small data for speed**: `n_obs=200`, `n_folds=3` for return type / exception tests +- **Larger data for accuracy**: `n_obs=500`, `n_folds=5` for estimation tests + +## Naming + +- Files: `test_.py`, `test__scalar.py`, `test__scalar_exceptions.py` +- Functions: `test_` — e.g., `test_coef_within_3_sigma`, `test_exception_invalid_score` +- Docstrings: Every test function gets a one-line 
docstring explaining what it verifies + +## Checklist + +- [ ] All tests marked `@pytest.mark.ci` +- [ ] Fixtures use `scope="module"` for model fitting +- [ ] Exception tests use `match=` with regex +- [ ] Seeds set for reproducibility +- [ ] Test functions have descriptive names and docstrings +- [ ] New scalar models have all 5 required test files (see `dml-scalar-test-structure.md`) From 48ae3a9d2004dfc33fed40a43e6ae41b313a91a9 Mon Sep 17 00:00:00 2001 From: SvenKlaassen Date: Sat, 7 Feb 2026 13:16:57 +0100 Subject: [PATCH 11/38] Refactor IRM class type hints to use built-in types and improve code clarity --- doubleml/irm/irm_scalar.py | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/doubleml/irm/irm_scalar.py b/doubleml/irm/irm_scalar.py index f8c0ee53..83a7f47b 100644 --- a/doubleml/irm/irm_scalar.py +++ b/doubleml/irm/irm_scalar.py @@ -4,7 +4,7 @@ from __future__ import annotations -from typing import ClassVar, Dict, List, Optional, Self, Union +from typing import ClassVar, Self import numpy as np from sklearn.base import clone @@ -78,7 +78,7 @@ class IRM(LinearScoreMixin): """ # Define learner specifications for IRM - _LEARNER_SPECS: ClassVar[Dict[str, LearnerSpec]] = { + _LEARNER_SPECS: ClassVar[dict[str, LearnerSpec]] = { "ml_g0": LearnerSpec("ml_g0", allow_regressor=True, allow_classifier=True, binary_data_check="outcome"), "ml_g1": LearnerSpec("ml_g1", allow_regressor=True, allow_classifier=True, binary_data_check="outcome"), "ml_m": LearnerSpec("ml_m", allow_regressor=False, allow_classifier=True), @@ -88,11 +88,11 @@ def __init__( self, obj_dml_data: DoubleMLData, score: str = "ATE", - ml_g: Optional[object] = None, - ml_m: Optional[object] = None, + ml_g: object | None = None, + ml_m: object | None = None, normalize_ipw: bool = False, - weights: Optional[Union[np.ndarray, Dict]] = None, - ps_processor_config: Optional[PSProcessorConfig] = None, + weights: np.ndarray | dict | None = None, + 
ps_processor_config: PSProcessorConfig | None = None, ): """ Initialize IRM model. @@ -165,12 +165,12 @@ def ps_processor(self) -> PSProcessor: return self._ps_processor @property - def weights(self) -> Dict: + def weights(self) -> dict: """Weights for weighted ATE/ATTE.""" return self._weights @property - def required_learners(self) -> List[str]: + def required_learners(self) -> list[str]: """Required learners for IRM: ml_g0, ml_g1, and ml_m.""" return ["ml_g0", "ml_g1", "ml_m"] @@ -178,10 +178,10 @@ def required_learners(self) -> List[str]: def set_learners( self, - ml_g: Optional[object] = None, - ml_g0: Optional[object] = None, - ml_g1: Optional[object] = None, - ml_m: Optional[object] = None, + ml_g: object | None = None, + ml_g0: object | None = None, + ml_g1: object | None = None, + ml_m: object | None = None, ) -> Self: """ Set the learners for nuisance estimation. @@ -269,7 +269,7 @@ def _nuisance_est( test_idx: np.ndarray, i_rep: int, i_fold: int, - external_predictions: Optional[Dict[str, np.ndarray]] = None, + external_predictions: dict[str, np.ndarray] | None = None, ) -> None: x = self._dml_data.x y = self._dml_data.y @@ -308,7 +308,7 @@ def _nuisance_est( # ==================== Score Elements ==================== - def _get_score_elements(self) -> Dict[str, np.ndarray]: + def _get_score_elements(self) -> dict[str, np.ndarray]: y = self._dml_data.y d = self._dml_data.d @@ -371,17 +371,18 @@ def _check_data(obj_dml_data: object) -> None: "needs to be specified as treatment variable." 
) - def _initialize_weights(self, weights: Optional[Union[np.ndarray, Dict]]) -> None: + def _initialize_weights(self, weights: np.ndarray | dict | None) -> None: """Initialize weights storage.""" if weights is None: weights = np.ones(self._dml_data.n_obs) if isinstance(weights, np.ndarray): self._weights = {"weights": weights} else: - assert isinstance(weights, dict) + if not isinstance(weights, dict): + raise TypeError(f"weights must be np.ndarray or dict, got {type(weights).__name__}") self._weights = weights - def _get_weights(self, m_hat: np.ndarray) -> tuple: + def _get_weights(self, m_hat: np.ndarray) -> tuple[np.ndarray, np.ndarray]: """ Compute weights and weights_bar for score computation. @@ -408,8 +409,7 @@ def _get_weights(self, m_hat: np.ndarray) -> tuple: else: weights_bar = weights.copy() else: - # ATTE - assert self.score == "ATTE" + # ATTE (score validated in __init__) w = self._weights["weights"] subgroup = w * d subgroup_probability = np.mean(subgroup) From 1886a7db00a21c3caecdf76b1f890b253de1cfea Mon Sep 17 00:00:00 2001 From: SvenKlaassen Date: Mon, 9 Feb 2026 09:13:23 +0100 Subject: [PATCH 12/38] Refactor DoubleMLScalar to enhance sample splitting functionality; add tests for cluster-based sample splitting and external prediction validation. 
--- doubleml/double_ml_scalar.py | 228 +++++++++++++++--- doubleml/tests/test_scalar_cluster.py | 134 ++++++++++ doubleml/tests/test_scalar_ext_predictions.py | 45 ++++ .../tests/test_scalar_set_sample_splitting.py | 62 +++++ 4 files changed, 431 insertions(+), 38 deletions(-) create mode 100644 doubleml/tests/test_scalar_cluster.py create mode 100644 doubleml/tests/test_scalar_ext_predictions.py create mode 100644 doubleml/tests/test_scalar_set_sample_splitting.py diff --git a/doubleml/double_ml_scalar.py b/doubleml/double_ml_scalar.py index a0010005..0f69a3e1 100644 --- a/doubleml/double_ml_scalar.py +++ b/doubleml/double_ml_scalar.py @@ -3,7 +3,7 @@ """ from abc import ABC, abstractmethod -from typing import ClassVar, Dict, List, Optional, Self +from typing import ClassVar, Self import numpy as np @@ -11,8 +11,9 @@ from .double_ml_base import DoubleMLBase from .double_ml_framework import DoubleMLCore as DoubleMLCoreData from .double_ml_framework import DoubleMLFramework +from .utils._checks import _check_sample_splitting from .utils._learner import LearnerInfo, LearnerSpec, validate_learner -from .utils.resampling import DoubleMLResampling +from .utils.resampling import DoubleMLClusterResampling, DoubleMLResampling class DoubleMLScalar(DoubleMLBase, ABC): @@ -45,7 +46,7 @@ class DoubleMLScalar(DoubleMLBase, ABC): """ # Subclasses define all possible learners for the model - _LEARNER_SPECS: ClassVar[Dict[str, LearnerSpec]] + _LEARNER_SPECS: ClassVar[dict[str, LearnerSpec]] def __init__( self, @@ -81,24 +82,26 @@ def __init__( self._score = score # Learner storage: single dict for all learner state - self._learners: Dict[str, LearnerInfo] = {} + self._learners: dict[str, LearnerInfo] = {} # Resampling parameters (set via draw_sample_splitting) - self._n_folds: Optional[int] = None - self._n_rep: Optional[int] = None - self._smpls: Optional[List] = None + self._n_folds: int | None = None + self._n_folds_per_cluster: int | None = None + self._n_rep: int | None = 
None + self._smpls: list | None = None + self._smpls_cluster: list | None = None # Initialize storage for predictions and results - self._predictions: Optional[Dict[str, np.ndarray]] = None - self._all_thetas: Optional[np.ndarray] = None - self._all_ses: Optional[np.ndarray] = None - self._psi: Optional[np.ndarray] = None - self._psi_deriv: Optional[np.ndarray] = None - self._var_scaling_factors: Optional[np.ndarray] = None + self._predictions: dict[str, np.ndarray] | None = None + self._all_thetas: np.ndarray | None = None + self._all_ses: np.ndarray | None = None + self._psi: np.ndarray | None = None + self._psi_deriv: np.ndarray | None = None + self._var_scaling_factors: np.ndarray | None = None # For iteration (used during fit) - self._i_rep: Optional[int] = None - self._i_fold: Optional[int] = None + self._i_rep: int | None = None + self._i_fold: int | None = None # ==================== Properties ==================== @@ -153,7 +156,7 @@ def score(self) -> str: return self._score @property - def predictions(self) -> Dict[str, np.ndarray]: + def predictions(self) -> dict[str, np.ndarray]: """ Predictions from nuisance models. @@ -172,7 +175,7 @@ def predictions(self) -> Dict[str, np.ndarray]: return self._predictions @property - def smpls(self) -> List: + def smpls(self) -> list: """ Sample splitting indices used for cross-fitting. @@ -185,9 +188,28 @@ def smpls(self) -> List: raise ValueError("Sample splitting has not been performed. Call draw_sample_splitting() first.") return self._smpls + @property + def smpls_cluster(self) -> list | None: + """ + Cluster-based sample splitting indices used for cross-fitting. + + Returns + ------- + list or None + List of cluster sample splitting indices for each repetition, or None. + + Raises + ------ + ValueError + If cluster data is used but cluster splitting is not available. + """ + if self._dml_data.is_cluster_data and self._smpls_cluster is None: + raise ValueError("Cluster sample splitting has not been provided. 
Call set_sample_splitting() first.") + return self._smpls_cluster + @property @abstractmethod - def required_learners(self) -> List[str]: + def required_learners(self) -> list[str]: """ Names of the required learners for current configuration. @@ -202,7 +224,7 @@ def required_learners(self) -> List[str]: pass @property - def learners(self) -> Dict[str, object]: + def learners(self) -> dict[str, object]: """ Access registered learner objects by name. @@ -213,7 +235,7 @@ def learners(self) -> Dict[str, object]: """ return {name: info.learner for name, info in self._learners.items()} - def get_params(self, learner_name: str) -> Dict: + def get_params(self, learner_name: str) -> dict: """ Get parameters of a registered learner. @@ -316,8 +338,8 @@ def fit( self, n_folds: int = 5, n_rep: int = 1, - n_jobs_cv: Optional[int] = None, - external_predictions: Optional[Dict[str, np.ndarray]] = None, + n_jobs_cv: int | None = None, + external_predictions: dict[str, np.ndarray] | None = None, **kwargs, ) -> Self: """ @@ -358,8 +380,8 @@ def fit( def fit_nuisance_models( self, - n_jobs_cv: Optional[int] = None, - external_predictions: Optional[Dict[str, np.ndarray]] = None, + n_jobs_cv: int | None = None, + external_predictions: dict[str, np.ndarray] | None = None, ) -> Self: """ Fit nuisance models via cross-fitting. @@ -389,6 +411,9 @@ def fit_nuisance_models( if self._smpls is None: raise ValueError("Sample splitting has not been initialized. Call draw_sample_splitting() first.") + if external_predictions is not None: + self._check_external_predictions(external_predictions) + # Validate that all required learners are available self._check_learners_available(external_predictions) @@ -485,18 +510,93 @@ def draw_sample_splitting(self, n_folds: int = 5, n_rep: int = 1) -> Self: if not isinstance(n_rep, int) or n_rep < 1: raise ValueError(f"n_rep must be an integer >= 1. 
Got {n_rep}.") - self._n_folds = n_folds - self._n_rep = n_rep + if self._dml_data.is_cluster_data: + self._n_folds_per_cluster = n_folds + self._n_rep = n_rep + self._n_folds = n_folds**self._dml_data.n_cluster_vars + + resampler = DoubleMLClusterResampling( + n_folds=n_folds, + n_rep=n_rep, + n_obs=self._n_obs, + n_cluster_vars=self._dml_data.n_cluster_vars, + cluster_vars=self._dml_data.cluster_vars, + ) + self._smpls, self._smpls_cluster = resampler.split_samples() + else: + self._n_folds = n_folds + self._n_folds_per_cluster = None + self._n_rep = n_rep + + # Create resampler + resampler = DoubleMLResampling( + n_folds=n_folds, + n_rep=n_rep, + n_obs=self._n_obs, + ) + + # Generate splits + self._smpls = resampler.split_samples() + self._smpls_cluster = None + + self._reset_fit_state() + + return self + + def set_sample_splitting(self, all_smpls: list, all_smpls_cluster: list | None = None) -> Self: + """ + Set the sample splitting for DoubleMLScalar models. + + Parameters + ---------- + all_smpls : list + List of tuples (train_ind, test_ind) per fold, or list of lists of tuples + for repeated sample splitting. + all_smpls_cluster : list or None + Nested list for cluster sample splitting. Required for cluster data. + Default is ``None``. + + Returns + ------- + self : Self - # Create resampler - resampler = DoubleMLResampling( - n_folds=n_folds, - n_rep=n_rep, + Raises + ------ + TypeError + If ``all_smpls`` is not a list or if tuple shorthand is used. + ValueError + If the partition is invalid or cluster splitting is missing. + """ + if isinstance(all_smpls, tuple): + raise TypeError("all_smpls must be a list of folds; tuple shorthand is not supported for DoubleMLScalar.") + if not isinstance(all_smpls, list): + raise TypeError(f"all_smpls must be of list type. 
{str(all_smpls)} of type {str(type(all_smpls))} was passed.") + + smpls, smpls_cluster, n_rep, n_folds = _check_sample_splitting( + all_smpls, + all_smpls_cluster, + self._dml_data, + self._dml_data.is_cluster_data, n_obs=self._n_obs, ) - # Generate splits - self._smpls = resampler.split_samples() + self._smpls = smpls + self._smpls_cluster = smpls_cluster + self._n_rep = n_rep + self._n_folds = n_folds + if self._dml_data.is_cluster_data: + n_cluster_vars = self._dml_data.n_cluster_vars + n_folds_per_cluster = int(round(n_folds ** (1.0 / n_cluster_vars))) + if n_folds_per_cluster**n_cluster_vars != n_folds: + raise ValueError( + "Invalid cluster sample splitting. n_folds must be a power of n_folds_per_cluster " + "for the number of cluster variables." + ) + self._n_folds_per_cluster = n_folds_per_cluster + else: + self._n_folds_per_cluster = None + + self._reset_fit_state() return self @@ -514,7 +614,7 @@ def _initialize_result_arrays(self) -> None: self._psi = np.zeros((n_obs, n_thetas, n_rep)) self._psi_deriv = np.zeros((n_obs, n_thetas, n_rep)) - def _initialize_predictions_dict(self) -> Dict[str, np.ndarray]: + def _initialize_predictions_dict(self) -> dict[str, np.ndarray]: """ Initialize dictionary for storing predictions. @@ -531,7 +631,37 @@ def _initialize_predictions_dict(self) -> Dict[str, np.ndarray]: n_rep = self.n_rep return {name: np.full((n_obs, n_rep), np.nan) for name in self.required_learners} - def _check_learners_available(self, external_predictions: Optional[Dict[str, np.ndarray]] = None) -> None: + def _check_external_predictions(self, external_predictions: dict[str, np.ndarray]) -> None: + """ + Validate external prediction arrays. + + Parameters + ---------- + external_predictions : dict + Dictionary of external predictions keyed by learner name. + + Raises + ------ + TypeError + If a value is not a numpy array. + ValueError + If a value does not match shape (n_obs, n_rep). 
+ """ + n_obs = self._n_obs + n_rep = self.n_rep + required = set(self.required_learners) + + for key, values in external_predictions.items(): + if key not in required: + raise ValueError( + f"External predictions provided for unknown learner '{key}'. " f"Allowed learners: {sorted(required)}." + ) + if not isinstance(values, np.ndarray): + raise TypeError(f"External predictions for '{key}' must be a numpy array. Got {type(values).__name__}.") + if values.shape != (n_obs, n_rep): + raise ValueError(f"External predictions for '{key}' must have shape ({n_obs}, {n_rep}). Got {values.shape}.") + + def _check_learners_available(self, external_predictions: dict[str, np.ndarray] | None = None) -> None: """ Validate that all required learners are set or covered by external predictions. @@ -567,13 +697,23 @@ def _construct_framework(self) -> DoubleMLFramework: # Both already in framework shape: (n_obs, n_thetas, n_rep) scaled_psi = np.divide(self._psi, np.mean(self._psi_deriv, axis=0, keepdims=True)) + cluster_dict = None + if self._dml_data.is_cluster_data: + cluster_dict = { + "smpls": self.smpls, + "smpls_cluster": self.smpls_cluster, + "cluster_vars": self._dml_data.cluster_vars, + "n_folds_per_cluster": self._n_folds_per_cluster, + } + # Create data container (no transpose needed - already in framework convention!) 
framework_data = DoubleMLCoreData( all_thetas=self._all_thetas, # (n_thetas, n_rep) all_ses=self._all_ses, # (n_thetas, n_rep) var_scaling_factors=self._var_scaling_factors, # (n_thetas,) scaled_psi=scaled_psi, # (n_obs, n_thetas, n_rep) - is_cluster_data=False, # TODO: Add cluster data support + is_cluster_data=self._dml_data.is_cluster_data, + cluster_dict=cluster_dict, ) # Create and return framework @@ -582,6 +722,18 @@ def _construct_framework(self) -> DoubleMLFramework: treatment_names=self._dml_data.d_cols, ) + def _reset_fit_state(self) -> None: + """Clear fit-dependent state after changing the sample splitting.""" + self._predictions = None + self._framework = None + self._all_thetas = None + self._all_ses = None + self._psi = None + self._psi_deriv = None + self._var_scaling_factors = None + self._i_rep = None + self._i_fold = None + # ==================== Abstract Methods (Must be Implemented by Subclasses) ==================== @abstractmethod @@ -591,7 +743,7 @@ def _nuisance_est( test_idx: np.ndarray, i_rep: int, i_fold: int, - external_predictions: Optional[Dict[str, np.ndarray]] = None, + external_predictions: dict[str, np.ndarray] | None = None, ) -> None: """ Estimate nuisance parameters for one fold. @@ -621,7 +773,7 @@ def _nuisance_est( pass @abstractmethod - def _get_score_elements(self) -> Dict[str, np.ndarray]: + def _get_score_elements(self) -> dict[str, np.ndarray]: """ Compute score function elements from nuisance predictions. @@ -647,7 +799,7 @@ def _get_score_elements(self) -> Dict[str, np.ndarray]: pass @abstractmethod - def _est_causal_pars_and_se(self, psi_elements: Dict[str, np.ndarray]) -> None: + def _est_causal_pars_and_se(self, psi_elements: dict[str, np.ndarray]) -> None: """ Estimate causal parameters and standard errors from score elements. 
diff --git a/doubleml/tests/test_scalar_cluster.py b/doubleml/tests/test_scalar_cluster.py new file mode 100644 index 00000000..0ad05f68 --- /dev/null +++ b/doubleml/tests/test_scalar_cluster.py @@ -0,0 +1,134 @@ +"""Test cluster-based sample splitting for scalar PLR models.""" + +import numpy as np +import pytest +from sklearn.ensemble import RandomForestRegressor +from sklearn.linear_model import Lasso, LinearRegression + +from doubleml import DoubleMLData +from doubleml.plm.datasets import make_plr_CCDDHNR2018 +from doubleml.plm.plr_scalar import PLR + +from ._utils import _clone + + +@pytest.fixture( + scope="module", params=[RandomForestRegressor(max_depth=2, n_estimators=10), LinearRegression(), Lasso(alpha=0.1)] +) +def learner(request): + return request.param + + +@pytest.mark.ci +def test_scalar_plr_cluster_set_sample_splitting(): + """Check set_sample_splitting consistency for scalar PLR cluster data.""" + np.random.seed(3141) + n_i = 5 + n_j = 6 + n_obs = n_i * n_j + + df = make_plr_CCDDHNR2018(n_obs=n_obs, return_type="DataFrame") + x_cols = [col for col in df.columns if col.startswith("X")] + + df["cluster_i"] = np.repeat(np.arange(n_i), n_j) + df["cluster_j"] = np.tile(np.arange(n_j), n_i) + + dml_data = DoubleMLData(df, y_col="y", d_cols="d", x_cols=x_cols, cluster_cols=["cluster_i", "cluster_j"]) + + ml_l = LinearRegression() + ml_m = LinearRegression() + + dml_obj = PLR(dml_data) + dml_obj.set_learners(ml_l=ml_l, ml_m=ml_m) + dml_obj.draw_sample_splitting(n_folds=2, n_rep=2) + dml_obj.fit() + + dml_obj_ext = PLR(dml_data) + dml_obj_ext.set_learners(ml_l=LinearRegression(), ml_m=LinearRegression()) + dml_obj_ext.set_sample_splitting(all_smpls=dml_obj.smpls, all_smpls_cluster=dml_obj.smpls_cluster) + dml_obj_ext.fit() + + assert np.isclose(dml_obj.coef[0], dml_obj_ext.coef[0], rtol=1e-9, atol=1e-4) + assert np.isclose(dml_obj.se[0], dml_obj_ext.se[0], rtol=1e-9, atol=1e-4) + + +@pytest.fixture(scope="module") +def 
dml_plr_scalar_cluster_with_index(generate_data1, learner): + """Fit scalar PLR with and without clustering for comparison.""" + # in the one-way cluster case with exactly one observation per cluster, we get the same result w & w/o clustering + n_folds = 2 + + data = generate_data1 + x_cols = data.columns[data.columns.str.startswith("X")].tolist() + + ml_l = _clone(learner) + ml_m = _clone(learner) + + obj_dml_data = DoubleMLData(data, "y", ["d"], x_cols) + np.random.seed(3141) + dml_plr_obj = PLR(obj_dml_data) + dml_plr_obj.set_learners(ml_l=ml_l, ml_m=ml_m) + dml_plr_obj.draw_sample_splitting(n_folds=n_folds) + dml_plr_obj.fit() + + df = data.reset_index() + dml_cluster_data = DoubleMLData(df, y_col="y", d_cols="d", x_cols=x_cols, cluster_cols="index") + np.random.seed(3141) + dml_plr_cluster_obj = PLR(dml_cluster_data) + dml_plr_cluster_obj.set_learners(ml_l=_clone(learner), ml_m=_clone(learner)) + dml_plr_cluster_obj.draw_sample_splitting(n_folds=n_folds) + dml_plr_cluster_obj.fit() + + dml_plr_cluster_ext_smpls = PLR(dml_cluster_data) + dml_plr_cluster_ext_smpls.set_learners(ml_l=_clone(learner), ml_m=_clone(learner)) + dml_plr_cluster_ext_smpls.set_sample_splitting( + all_smpls=dml_plr_cluster_obj.smpls, + all_smpls_cluster=dml_plr_cluster_obj.smpls_cluster, + ) + np.random.seed(3141) + dml_plr_cluster_ext_smpls.fit() + + res_dict = { + "coef": dml_plr_obj.coef, + "coef_manual": dml_plr_cluster_obj.coef, + "se": dml_plr_obj.se, + "se_manual": dml_plr_cluster_obj.se, + "coef_ext_smpls": dml_plr_cluster_ext_smpls.coef, + "se_ext_smpls": dml_plr_cluster_ext_smpls.se, + } + + return res_dict + + +@pytest.mark.ci +def test_dml_plr_scalar_cluster_with_index_coef(dml_plr_scalar_cluster_with_index): + """Validate scalar PLR cluster coefficients match across configurations.""" + assert np.isclose( + dml_plr_scalar_cluster_with_index["coef"][0], + dml_plr_scalar_cluster_with_index["coef_manual"][0], + rtol=1e-9, + atol=1e-4, + ) + assert np.isclose( + 
dml_plr_scalar_cluster_with_index["coef"][0], + dml_plr_scalar_cluster_with_index["coef_ext_smpls"][0], + rtol=1e-9, + atol=1e-4, + ) + + +@pytest.mark.ci +def test_dml_plr_scalar_cluster_with_index_se(dml_plr_scalar_cluster_with_index): + """Validate scalar PLR cluster standard errors match across configurations.""" + assert np.isclose( + dml_plr_scalar_cluster_with_index["se"][0], + dml_plr_scalar_cluster_with_index["se_manual"][0], + rtol=1e-9, + atol=1e-4, + ) + assert np.isclose( + dml_plr_scalar_cluster_with_index["se"][0], + dml_plr_scalar_cluster_with_index["se_ext_smpls"][0], + rtol=1e-9, + atol=1e-4, + ) diff --git a/doubleml/tests/test_scalar_ext_predictions.py b/doubleml/tests/test_scalar_ext_predictions.py new file mode 100644 index 00000000..349ad25c --- /dev/null +++ b/doubleml/tests/test_scalar_ext_predictions.py @@ -0,0 +1,45 @@ +"""Test external prediction validation for scalar DoubleML models.""" + +import numpy as np +import pytest +from sklearn.linear_model import Lasso + +from doubleml.plm.datasets import make_plr_CCDDHNR2018 +from doubleml.plm.plr_scalar import PLR + + +@pytest.mark.ci +def test_scalar_external_predictions_unknown_key(): + """Reject external predictions with unknown learner keys.""" + np.random.seed(3141) + dml_data = make_plr_CCDDHNR2018(n_obs=10) + dml_obj = PLR(dml_data) + dml_obj.set_learners(ml_l=Lasso(), ml_m=Lasso()) + dml_obj.draw_sample_splitting(n_folds=2, n_rep=1) + + ext_predictions = { + "ml_l": np.zeros((10, 1)), + "ml_m": np.zeros((10, 1)), + "ml_unknown": np.zeros((10, 1)), + } + msg = "External predictions provided for unknown learner 'ml_unknown'" + with pytest.raises(ValueError, match=msg): + dml_obj.fit_nuisance_models(external_predictions=ext_predictions) + + +@pytest.mark.ci +def test_scalar_external_predictions_shape(): + """Reject external predictions with incorrect shape.""" + np.random.seed(3141) + dml_data = make_plr_CCDDHNR2018(n_obs=10) + dml_obj = PLR(dml_data) + 
dml_obj.set_learners(ml_l=Lasso(), ml_m=Lasso()) + dml_obj.draw_sample_splitting(n_folds=2, n_rep=1) + + ext_predictions = { + "ml_l": np.zeros((10, 2)), + "ml_m": np.zeros((10, 1)), + } + msg = r"External predictions for 'ml_l' must have shape \(10, 1\)" + with pytest.raises(ValueError, match=msg): + dml_obj.fit_nuisance_models(external_predictions=ext_predictions) diff --git a/doubleml/tests/test_scalar_set_sample_splitting.py b/doubleml/tests/test_scalar_set_sample_splitting.py new file mode 100644 index 00000000..bc9abd84 --- /dev/null +++ b/doubleml/tests/test_scalar_set_sample_splitting.py @@ -0,0 +1,62 @@ +"""Test sample splitting setup for scalar DoubleML models.""" + +import numpy as np +import pytest + +from doubleml.plm.datasets import make_plr_CCDDHNR2018 +from doubleml.plm.plr_scalar import PLR + + +def _assert_smpls_equal(smpls0, smpls1): + assert len(smpls0) == len(smpls1) + for i_rep in range(len(smpls0)): + assert len(smpls0[i_rep]) == len(smpls1[i_rep]) + for i_fold in range(len(smpls0[i_rep])): + assert np.array_equal(smpls0[i_rep][i_fold][0], smpls1[i_rep][i_fold][0]) + assert np.array_equal(smpls0[i_rep][i_fold][1], smpls1[i_rep][i_fold][1]) + + +@pytest.mark.ci +def test_scalar_set_sample_splitting_list(): + """Ensure list-of-tuples splits set n_folds/n_rep correctly.""" + np.random.seed(3141) + dml_data = make_plr_CCDDHNR2018(n_obs=10) + dml_obj = PLR(dml_data) + + smpls = [([0, 1, 2, 3, 4], [5, 6, 7, 8, 9]), ([5, 6, 7, 8, 9], [0, 1, 2, 3, 4])] + dml_obj.set_sample_splitting(smpls) + + assert dml_obj.n_folds == 2 + assert dml_obj.n_rep == 1 + _assert_smpls_equal([smpls], dml_obj.smpls) + + +@pytest.mark.ci +def test_scalar_set_sample_splitting_list_of_lists(): + """Ensure list-of-list splits set repeated sample splitting correctly.""" + np.random.seed(3141) + dml_data = make_plr_CCDDHNR2018(n_obs=10) + dml_obj = PLR(dml_data) + + smpls = [ + [([0, 1, 2, 3, 4], [5, 6, 7, 8, 9]), ([5, 6, 7, 8, 9], [0, 1, 2, 3, 4])], + [([0, 2, 4, 6, 8], [1, 3, 
5, 7, 9]), ([1, 3, 5, 7, 9], [0, 2, 4, 6, 8])], + ] + dml_obj.set_sample_splitting(smpls) + + assert dml_obj.n_folds == 2 + assert dml_obj.n_rep == 2 + _assert_smpls_equal(smpls, dml_obj.smpls) + + +@pytest.mark.ci +def test_scalar_set_sample_splitting_tuple_rejected(): + """Reject tuple shorthand for scalar set_sample_splitting.""" + np.random.seed(3141) + dml_data = make_plr_CCDDHNR2018(n_obs=10) + dml_obj = PLR(dml_data) + + smpls = (np.arange(10), np.arange(10)) + msg = "all_smpls must be a list of folds; tuple shorthand is not supported" + with pytest.raises(TypeError, match=msg): + dml_obj.set_sample_splitting(smpls) From 0ca053d70b02082b18e9677f780f563cd31a6182 Mon Sep 17 00:00:00 2001 From: SvenKlaassen Date: Mon, 9 Feb 2026 09:13:32 +0100 Subject: [PATCH 13/38] Refactor IRM and PLR classes to reset fit state after updating learners; enhance tests for return types and reset behavior. --- .claude/rules/dml-scalar-test-structure.md | 78 ++++++++++--------- doubleml/irm/irm_scalar.py | 1 + .../irm/tests/test_irm_scalar_return_types.py | 46 +++++++++++ doubleml/plm/plr_scalar.py | 1 + .../plm/tests/test_plr_scalar_return_types.py | 37 +++++++++ 5 files changed, 125 insertions(+), 38 deletions(-) diff --git a/.claude/rules/dml-scalar-test-structure.md b/.claude/rules/dml-scalar-test-structure.md index 8ee372ae..d6a03327 100644 --- a/.claude/rules/dml-scalar-test-structure.md +++ b/.claude/rules/dml-scalar-test-structure.md @@ -7,13 +7,13 @@ Every scalar model `` in module `/` requires **5 test files** in `doubleml//tests/`: -| File | Purpose | -|------|---------| -| `test__scalar.py` | Core estimation accuracy (3-sigma rule) | -| `test__scalar_return_types.py` | Property types, shapes, API contracts | -| `test__scalar_exceptions.py` | Input validation, error messages | -| `test__scalar_vs_.py` | Exact match with old `DoubleML` | -| `test__scalar_external_predictions.py` | External predictions equivalence | +| File | Purpose | +| 
--------------------------------------------- | --------------------------------------- | +| `test__scalar.py` | Core estimation accuracy (3-sigma rule) | +| `test__scalar_return_types.py` | Property types, shapes, API contracts | +| `test__scalar_exceptions.py` | Input validation, error messages | +| `test__scalar_vs_.py` | Exact match with old `DoubleML` | +| `test__scalar_external_predictions.py` | External predictions equivalence | All test functions must be marked `@pytest.mark.ci`. @@ -34,38 +34,40 @@ All test functions must be marked `@pytest.mark.ci`. **Required tests**: -| Test | Assertion | -|------|-----------| -| `test_coef_type_and_shape` | `isinstance(coef, np.ndarray)`, `shape == (1,)` | -| `test_se_type_and_shape` | `isinstance(se, np.ndarray)`, `shape == (1,)` | -| `test_all_thetas_shape` | `shape == (1, N_REP)` | -| `test_all_ses_shape` | `shape == (1, N_REP)` | -| `test_summary_type` | `isinstance(summary, pd.DataFrame)`, `len == 1` | -| `test_confint_type_and_shape` | `isinstance(ci, pd.DataFrame)`, `shape == (1, 2)` | -| `test_psi_shape` | `shape == (N_OBS, 1, N_REP)` | -| `test_predictions_type` | `isinstance(predictions, dict)`, each value `shape == (N_OBS, N_REP)` | -| `test_smpls_type` | `len(smpls) == N_REP`, each has `N_FOLDS` tuples of `(train, test)` arrays | -| `test_n_properties` | `n_obs == N_OBS`, `n_folds == N_FOLDS`, `n_rep == N_REP`, `score == expected` | -| `test_required_learners` | Returns list of expected learner names | -| `test_str_repr` | `str(model)` and `repr(model)` return `str` | -| `test_get_params` | Returns dict with learner keys | -| `test_set_params` | Modifies and confirms learner parameter change | -| `test_before_fit_raises` | `coef`/`se` before `fit()` raises error | +| Test | Assertion | +| ---------------------------------------- | ----------------------------------------------------------------------------- | +| `test_coef_type_and_shape` | `isinstance(coef, np.ndarray)`, `shape == (1,)` | +| 
`test_se_type_and_shape` | `isinstance(se, np.ndarray)`, `shape == (1,)` | +| `test_all_thetas_shape` | `shape == (1, N_REP)` | +| `test_all_ses_shape` | `shape == (1, N_REP)` | +| `test_summary_type` | `isinstance(summary, pd.DataFrame)`, `len == 1` | +| `test_confint_type_and_shape` | `isinstance(ci, pd.DataFrame)`, `shape == (1, 2)` | +| `test_psi_shape` | `shape == (N_OBS, 1, N_REP)` | +| `test_predictions_type` | `isinstance(predictions, dict)`, each value `shape == (N_OBS, N_REP)` | +| `test_smpls_type` | `len(smpls) == N_REP`, each has `N_FOLDS` tuples of `(train, test)` arrays | +| `test_n_properties` | `n_obs == N_OBS`, `n_folds == N_FOLDS`, `n_rep == N_REP`, `score == expected` | +| `test_required_learners` | Returns list of expected learner names | +| `test_str_repr` | `str(model)` and `repr(model)` return `str` | +| `test_get_params` | Returns dict with learner keys | +| `test_set_params` | Modifies and confirms learner parameter change | +| `test_before_fit_raises` | `coef`/`se` before `fit()` raises error | +| `test_reset_after_set_learners` | Updating learners clears fitted results | +| `test_reset_after_draw_sample_splitting` | Changing splits clears fitted results | ## 3. 
Exceptions (`test__scalar_exceptions.py`) **Common exception tests** (required for all models): -| Test | Input | Expected | -|------|-------|----------| -| `test_exception_data` | Non-DoubleMLData | `TypeError` | -| `test_exception_score` | Invalid score string | `ValueError` | -| `test_exception_n_folds` | `n_folds < 2` | `ValueError` | -| `test_exception_n_rep` | `n_rep < 1` | `ValueError` | -| `test_exception_fit_nuisance_without_smpls` | Fit before `draw_sample_splitting()` | `ValueError` | +| Test | Input | Expected | +| ---------------------------------------------------- | --------------------------------------- | ------------ | +| `test_exception_data` | Non-DoubleMLData | `TypeError` | +| `test_exception_score` | Invalid score string | `ValueError` | +| `test_exception_n_folds` | `n_folds < 2` | `ValueError` | +| `test_exception_n_rep` | `n_rep < 1` | `ValueError` | +| `test_exception_fit_nuisance_without_smpls` | Fit before `draw_sample_splitting()` | `ValueError` | | `test_exception_estimate_causal_without_predictions` | Estimate before `fit_nuisance_models()` | `ValueError` | -| `test_exception_missing_learner` | `fit()` without required learners | `ValueError` | -| `test_exception_invalid_learner` | Class instead of instance | `TypeError` | +| `test_exception_missing_learner` | `fit()` without required learners | `ValueError` | +| `test_exception_invalid_learner` | Class instead of instance | `TypeError` | **Model-specific exceptions** to add per model: - PLR: multiple treatments, `ml_g` warning for partialling out @@ -107,11 +109,11 @@ dml_new._smpls = dml_old.smpls # Old/new consume random state differently ## Assertion Tolerance Summary -| Context | Method | Why | -|---------|--------|-----| -| Core estimation | `abs(coef - true) <= 3.0 * se` | Statistical 3-sigma | -| Backward compatibility | `assert_allclose(rtol=1e-9)` | Must be identical | -| External predictions | `math.isclose(rel_tol=1e-9, abs_tol=1e-4)` | Numerical accumulation | +| Context 
| Method | Why | +| ---------------------- | ------------------------------------------ | ---------------------- | +| Core estimation | `abs(coef - true) <= 3.0 * se` | Statistical 3-sigma | +| Backward compatibility | `assert_allclose(rtol=1e-9)` | Must be identical | +| External predictions | `math.isclose(rel_tol=1e-9, abs_tol=1e-4)` | Numerical accumulation | ## New Model Checklist diff --git a/doubleml/irm/irm_scalar.py b/doubleml/irm/irm_scalar.py index 83a7f47b..71f2f142 100644 --- a/doubleml/irm/irm_scalar.py +++ b/doubleml/irm/irm_scalar.py @@ -219,6 +219,7 @@ def set_learners( if learner is not None: self._register_learner(name, learner) + self._reset_fit_state() return self # ==================== Sample Splitting ==================== diff --git a/doubleml/irm/tests/test_irm_scalar_return_types.py b/doubleml/irm/tests/test_irm_scalar_return_types.py index 15eaae82..a437f49d 100644 --- a/doubleml/irm/tests/test_irm_scalar_return_types.py +++ b/doubleml/irm/tests/test_irm_scalar_return_types.py @@ -1,3 +1,5 @@ +"""Validate IRM scalar return types and reset behavior.""" + import numpy as np import pandas as pd import pytest @@ -152,6 +154,7 @@ def test_get_params_invalid_learner(fitted_dml_obj): @pytest.mark.ci def test_before_fit_raises(): + """Raise errors when accessing results before fitting.""" np.random.seed(3141) dml_obj = IRM(obj_dml_data) with pytest.raises(ValueError, match="framework is not yet initialized"): @@ -168,3 +171,46 @@ def test_irm_properties(fitted_dml_obj): assert "weights" in fitted_dml_obj.weights assert fitted_dml_obj.ps_processor is not None assert fitted_dml_obj.ps_processor_config is not None + + +@pytest.mark.ci +def test_reset_after_set_learners(): + """Reset fitted state after updating learners.""" + np.random.seed(3141) + dml_obj = IRM(obj_dml_data) + dml_obj.set_learners( + ml_g=RandomForestRegressor(n_estimators=10, max_depth=3, random_state=42), + ml_m=RandomForestClassifier(n_estimators=10, max_depth=3, random_state=42), 
+ ) + dml_obj.draw_sample_splitting(n_folds=N_FOLDS, n_rep=N_REP) + dml_obj.fit() + + dml_obj.set_learners( + ml_g=RandomForestRegressor(n_estimators=10, max_depth=3, random_state=42), + ml_m=RandomForestClassifier(n_estimators=10, max_depth=3, random_state=42), + ) + + with pytest.raises(ValueError, match="framework is not yet initialized"): + _ = dml_obj.coef + with pytest.raises(ValueError, match="Predictions not available. Call fit"): + _ = dml_obj.predictions + + +@pytest.mark.ci +def test_reset_after_draw_sample_splitting(): + """Reset fitted state after changing sample splits.""" + np.random.seed(3141) + dml_obj = IRM(obj_dml_data) + dml_obj.set_learners( + ml_g=RandomForestRegressor(n_estimators=10, max_depth=3, random_state=42), + ml_m=RandomForestClassifier(n_estimators=10, max_depth=3, random_state=42), + ) + dml_obj.draw_sample_splitting(n_folds=N_FOLDS, n_rep=N_REP) + dml_obj.fit() + + dml_obj.draw_sample_splitting(n_folds=N_FOLDS, n_rep=N_REP) + + with pytest.raises(ValueError, match="framework is not yet initialized"): + _ = dml_obj.coef + with pytest.raises(ValueError, match="Predictions not available. 
Call fit"): + _ = dml_obj.predictions diff --git a/doubleml/plm/plr_scalar.py b/doubleml/plm/plr_scalar.py index ef18fb68..e4aeaa51 100644 --- a/doubleml/plm/plr_scalar.py +++ b/doubleml/plm/plr_scalar.py @@ -128,6 +128,7 @@ def set_learners( # IV-type: clone ml_l to ml_g if only one provided self._handle_iv_cloning() + self._reset_fit_state() return self def _handle_iv_cloning(self) -> None: diff --git a/doubleml/plm/tests/test_plr_scalar_return_types.py b/doubleml/plm/tests/test_plr_scalar_return_types.py index 09832931..39fe77e6 100644 --- a/doubleml/plm/tests/test_plr_scalar_return_types.py +++ b/doubleml/plm/tests/test_plr_scalar_return_types.py @@ -1,3 +1,5 @@ +"""Validate PLR scalar return types and reset behavior.""" + import numpy as np import pandas as pd import pytest @@ -148,8 +150,43 @@ def test_get_params_invalid_learner(fitted_dml_obj): @pytest.mark.ci def test_before_fit_raises(): + """Raise errors when accessing results before fitting.""" + np.random.seed(3141) + dml_obj = PLR(obj_dml_data) + with pytest.raises(ValueError, match="framework is not yet initialized"): + _ = dml_obj.coef + with pytest.raises(ValueError, match="Predictions not available. Call fit"): + _ = dml_obj.predictions + + +@pytest.mark.ci +def test_reset_after_set_learners(): + """Reset fitted state after updating learners.""" np.random.seed(3141) dml_obj = PLR(obj_dml_data) + dml_obj.set_learners(ml_l=LinearRegression(), ml_m=LinearRegression()) + dml_obj.draw_sample_splitting(n_folds=N_FOLDS, n_rep=N_REP) + dml_obj.fit() + + dml_obj.set_learners(ml_l=LinearRegression(), ml_m=LinearRegression()) + + with pytest.raises(ValueError, match="framework is not yet initialized"): + _ = dml_obj.coef + with pytest.raises(ValueError, match="Predictions not available. 
Call fit"): + _ = dml_obj.predictions + + +@pytest.mark.ci +def test_reset_after_draw_sample_splitting(): + """Reset fitted state after changing sample splits.""" + np.random.seed(3141) + dml_obj = PLR(obj_dml_data) + dml_obj.set_learners(ml_l=LinearRegression(), ml_m=LinearRegression()) + dml_obj.draw_sample_splitting(n_folds=N_FOLDS, n_rep=N_REP) + dml_obj.fit() + + dml_obj.draw_sample_splitting(n_folds=N_FOLDS, n_rep=N_REP) + with pytest.raises(ValueError, match="framework is not yet initialized"): _ = dml_obj.coef with pytest.raises(ValueError, match="Predictions not available. Call fit"): From 33c8b01bdb93a188f18abc4624ed4c63d6a86e10 Mon Sep 17 00:00:00 2001 From: SvenKlaassen Date: Mon, 9 Feb 2026 11:21:37 +0100 Subject: [PATCH 14/38] Add copilot documentation for code style, error handling, performance, testing, and scalar model test structure --- .github/copilot-instructions.md | 46 +++++++++++++++++++++++++++++++ .github/copilot/README.md | 10 +++++++ .github/copilot/code-style.md | 10 +++++++ .github/copilot/error-handling.md | 9 ++++++ .github/copilot/performance.md | 9 ++++++ .github/copilot/scalar-tests.md | 12 ++++++++ .github/copilot/testing.md | 9 ++++++ 7 files changed, 105 insertions(+) create mode 100644 .github/copilot-instructions.md create mode 100644 .github/copilot/README.md create mode 100644 .github/copilot/code-style.md create mode 100644 .github/copilot/error-handling.md create mode 100644 .github/copilot/performance.md create mode 100644 .github/copilot/scalar-tests.md create mode 100644 .github/copilot/testing.md diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md new file mode 100644 index 00000000..eaf517e5 --- /dev/null +++ b/.github/copilot-instructions.md @@ -0,0 +1,46 @@ +# Copilot instructions for DoubleML (Python) + +These instructions guide code and test authoring. Keep changes aligned with the detailed rules in .claude to avoid drift. + +## Scope +- Authoring guidance only (not reviewer-only rules). 
+- Prefer concise, targeted edits; avoid unrelated refactors. + +## Code style and design +- Start each Python file with a one-sentence module docstring. +- Use NumPy-style docstrings for public APIs (summary, Parameters, Returns). +- Require full type hints, including return types. +- Follow DoubleML patterns: `_check_*` validation helpers, `DoubleMLResampling` for splitting, clone learners before fit. +- Keep score outputs named `psi_a` and `psi_b` with shape `(n_obs,)`. + +## Error handling +- Use `ValueError` for invalid values and `TypeError` for wrong types. +- Validate early (constructors and setters) with clear expected/actual messages. +- In tests, always use `pytest.raises(..., match=...)`. + +## Testing +- Mark all tests with `@pytest.mark.ci`. +- Use module-scoped fixtures for expensive fits. +- Seed random generators for reproducibility. +- For new scalar models, follow the required 5-file test structure. + +## Verification (lightweight) +Run relevant checks when changes warrant it: +- `black .` +- `ruff check --fix .` +- `mypy doubleml` +- `pytest -m ci` + +## References (canonical rules) +- Code style: .claude/rules/py-code-conventions.md +- Error handling: .claude/rules/error-handling.md +- Performance: .claude/rules/performance-guidelines.md +- Testing: .claude/rules/testing-conventions.md +- Scalar test structure: .claude/rules/dml-scalar-test-structure.md + +## Optional reference docs +- .github/copilot/code-style.md +- .github/copilot/error-handling.md +- .github/copilot/performance.md +- .github/copilot/testing.md +- .github/copilot/scalar-tests.md diff --git a/.github/copilot/README.md b/.github/copilot/README.md new file mode 100644 index 00000000..26b7d41c --- /dev/null +++ b/.github/copilot/README.md @@ -0,0 +1,10 @@ +# Copilot reference docs + +These short guides summarize the canonical rules in .claude. +Use them as a quick pointer, not a source of truth. 
+ +- code-style.md -> .claude/rules/py-code-conventions.md +- error-handling.md -> .claude/rules/error-handling.md +- performance.md -> .claude/rules/performance-guidelines.md +- testing.md -> .claude/rules/testing-conventions.md +- scalar-tests.md -> .claude/rules/dml-scalar-test-structure.md diff --git a/.github/copilot/code-style.md b/.github/copilot/code-style.md new file mode 100644 index 00000000..66b6cc0c --- /dev/null +++ b/.github/copilot/code-style.md @@ -0,0 +1,10 @@ +# Code style (summary) + +- Module-level docstring required (one sentence). +- NumPy-style docstrings for public APIs. +- Full type hints, including return types. +- Use built-in generics (list[int], dict[str, T]) for Python 3.10+. +- Follow DoubleML patterns: `_check_*` helpers, `DoubleMLResampling`, clone learners. +- Score outputs use `psi_a` and `psi_b` with shape `(n_obs,)`. + +Canonical: .claude/rules/py-code-conventions.md diff --git a/.github/copilot/error-handling.md b/.github/copilot/error-handling.md new file mode 100644 index 00000000..1e76440f --- /dev/null +++ b/.github/copilot/error-handling.md @@ -0,0 +1,9 @@ +# Error handling (summary) + +- Invalid values -> `ValueError`; wrong types -> `TypeError`. +- Validate early in constructors and setters. +- Error messages include expected vs actual values. +- Prefer `_check_*` helpers from doubleml/utils/_checks.py. +- Tests must use `pytest.raises(..., match=...)`. + +Canonical: .claude/rules/error-handling.md diff --git a/.github/copilot/performance.md b/.github/copilot/performance.md new file mode 100644 index 00000000..cba0a4e1 --- /dev/null +++ b/.github/copilot/performance.md @@ -0,0 +1,9 @@ +# Performance (summary) + +- Vectorize array operations; avoid Python loops over observations. +- Pre-allocate `(n_obs, n_rep)` arrays before filling. +- Clone learners before fit (mutable estimators). +- Use `DoubleMLResampling`, not raw `KFold`. +- Prefer `np.linalg.lstsq` over manual inversion. 
+ +Canonical: .claude/rules/performance-guidelines.md diff --git a/.github/copilot/scalar-tests.md b/.github/copilot/scalar-tests.md new file mode 100644 index 00000000..f9e6a983 --- /dev/null +++ b/.github/copilot/scalar-tests.md @@ -0,0 +1,12 @@ +# Scalar model test structure (summary) + +New DoubleMLScalar models require five test files: +- test_<model>_scalar.py +- test_<model>_scalar_return_types.py +- test_<model>_scalar_exceptions.py +- test_<model>_scalar_vs_<model>.py +- test_<model>_scalar_external_predictions.py + +See details and required assertions in the canonical rule. + +Canonical: .claude/rules/dml-scalar-test-structure.md diff --git a/.github/copilot/testing.md b/.github/copilot/testing.md new file mode 100644 index 00000000..7f21f957 --- /dev/null +++ b/.github/copilot/testing.md @@ -0,0 +1,9 @@ +# Testing (summary) + +- Mark all tests with `@pytest.mark.ci`. +- Use module-scoped fixtures for expensive fits. +- Seed RNGs for reproducibility. +- Use `match=` in exception tests. +- Follow naming: `test_<model>_scalar*.py` for scalar models.
+ +Canonical: .claude/rules/testing-conventions.md From 45c5f4879e03c37075617622bcd15cd8e37d4d60 Mon Sep 17 00:00:00 2001 From: SvenKlaassen Date: Mon, 9 Feb 2026 14:15:29 +0100 Subject: [PATCH 15/38] Enhance DoubleMLScalar and IRM classes for stratified sample splitting; update tests for consistency --- doubleml/double_ml_scalar.py | 4 ++- doubleml/irm/irm_scalar.py | 44 ++------------------------- doubleml/tests/test_scalar_cluster.py | 8 +++-- 3 files changed, 12 insertions(+), 44 deletions(-) diff --git a/doubleml/double_ml_scalar.py b/doubleml/double_ml_scalar.py index 0f69a3e1..783b79ce 100644 --- a/doubleml/double_ml_scalar.py +++ b/doubleml/double_ml_scalar.py @@ -90,6 +90,7 @@ def __init__( self._n_rep: int | None = None self._smpls: list | None = None self._smpls_cluster: list | None = None + self._stratify_variable: np.ndarray | None = None # For stratified sample splitting # Initialize storage for predictions and results self._predictions: dict[str, np.ndarray] | None = None @@ -528,11 +529,12 @@ def draw_sample_splitting(self, n_folds: int = 5, n_rep: int = 1) -> Self: self._n_folds_per_cluster = None self._n_rep = n_rep - # Create resampler + # Create resampler (with optional stratification) resampler = DoubleMLResampling( n_folds=n_folds, n_rep=n_rep, n_obs=self._n_obs, + stratify=self._stratify_variable, ) # Generate splits diff --git a/doubleml/irm/irm_scalar.py b/doubleml/irm/irm_scalar.py index 71f2f142..70c03ff4 100644 --- a/doubleml/irm/irm_scalar.py +++ b/doubleml/irm/irm_scalar.py @@ -16,7 +16,6 @@ from ..utils._learner import LearnerSpec, predict_nuisance from ..utils._propensity_score import _propensity_score_adjustment from ..utils.propensity_score_processing import PSProcessor, PSProcessorConfig -from ..utils.resampling import DoubleMLResampling class IRM(LinearScoreMixin): @@ -126,6 +125,9 @@ def __init__( score=score, ) + # Enable stratified sample splitting for binary treatment + self._stratify_variable = self._dml_data.d + # Normalize 
IPW if not isinstance(normalize_ipw, bool): raise TypeError("Normalization indicator has to be boolean. " f"Object of type {str(type(normalize_ipw))} passed.") @@ -222,46 +224,6 @@ def set_learners( self._reset_fit_state() return self - # ==================== Sample Splitting ==================== - - def draw_sample_splitting(self, n_folds: int = 5, n_rep: int = 1) -> Self: - """ - Draw stratified sample splitting for cross-fitting. - - Uses stratified K-fold splitting to ensure each fold contains both - treatment groups (D=0 and D=1). - - Parameters - ---------- - n_folds : int, optional - Number of folds for cross-fitting. Default is 5. - n_rep : int, optional - Number of repetitions for sample splitting. Default is 1. - - Returns - ------- - self : IRM - The estimator with initialized sample splits. - """ - if not isinstance(n_folds, int) or n_folds < 2: - raise ValueError(f"n_folds must be an integer >= 2. Got {n_folds}.") - if not isinstance(n_rep, int) or n_rep < 1: - raise ValueError(f"n_rep must be an integer >= 1. 
Got {n_rep}.") - - self._n_folds = n_folds - self._n_rep = n_rep - - # Create stratified resampler - resampler = DoubleMLResampling( - n_folds=n_folds, - n_rep=n_rep, - n_obs=self._n_obs, - stratify=self._dml_data.d, - ) - - self._smpls = resampler.split_samples() - return self - # ==================== Nuisance Estimation ==================== def _nuisance_est( diff --git a/doubleml/tests/test_scalar_cluster.py b/doubleml/tests/test_scalar_cluster.py index 0ad05f68..4388ec74 100644 --- a/doubleml/tests/test_scalar_cluster.py +++ b/doubleml/tests/test_scalar_cluster.py @@ -3,7 +3,7 @@ import numpy as np import pytest from sklearn.ensemble import RandomForestRegressor -from sklearn.linear_model import Lasso, LinearRegression +from sklearn.linear_model import LinearRegression from doubleml import DoubleMLData from doubleml.plm.datasets import make_plr_CCDDHNR2018 @@ -13,7 +13,11 @@ @pytest.fixture( - scope="module", params=[RandomForestRegressor(max_depth=2, n_estimators=10), LinearRegression(), Lasso(alpha=0.1)] + scope="module", + params=[ + RandomForestRegressor(max_depth=2, n_estimators=10, random_state=42), + LinearRegression(), + ], ) def learner(request): return request.param From 0051c77173f96d5c8b73fa6a121337ad0c0149aa Mon Sep 17 00:00:00 2001 From: SvenKlaassen Date: Fri, 13 Feb 2026 09:11:00 +0100 Subject: [PATCH 16/38] add post_nuisance checks --- doubleml/double_ml_scalar.py | 6 +++ doubleml/irm/irm_scalar.py | 30 +++++++++++++- .../irm/tests/test_irm_scalar_exceptions.py | 32 +++++++++++++++ doubleml/plm/plr_scalar.py | 40 +++++++++++++++++++ .../plm/tests/test_plr_scalar_exceptions.py | 28 ++++++++++++- doubleml/utils/_checks.py | 19 ++++++--- 6 files changed, 147 insertions(+), 8 deletions(-) diff --git a/doubleml/double_ml_scalar.py b/doubleml/double_ml_scalar.py index 783b79ce..958e4b3e 100644 --- a/doubleml/double_ml_scalar.py +++ b/doubleml/double_ml_scalar.py @@ -446,6 +446,9 @@ def fit_nuisance_models( external_predictions=external_predictions, ) 
+ # Post-nuisance prediction checks (model-specific) + self._post_nuisance_checks() + return self def estimate_causal_parameters(self) -> Self: @@ -738,6 +741,9 @@ def _reset_fit_state(self) -> None: # ==================== Abstract Methods (Must be Implemented by Subclasses) ==================== + def _post_nuisance_checks(self) -> None: + """Post-nuisance prediction validation hook. Override in subclasses for model-specific checks.""" + @abstractmethod def _nuisance_est( self, diff --git a/doubleml/irm/irm_scalar.py b/doubleml/irm/irm_scalar.py index 70c03ff4..57f4be91 100644 --- a/doubleml/irm/irm_scalar.py +++ b/doubleml/irm/irm_scalar.py @@ -12,7 +12,7 @@ from ..data.base_data import DoubleMLData from ..double_ml_linear_score import LinearScoreMixin -from ..utils._checks import _check_score, _check_weights +from ..utils._checks import _check_binary_predictions, _check_finite_predictions, _check_score, _check_weights from ..utils._learner import LearnerSpec, predict_nuisance from ..utils._propensity_score import _propensity_score_adjustment from ..utils.propensity_score_processing import PSProcessor, PSProcessorConfig @@ -226,6 +226,34 @@ def set_learners( # ==================== Nuisance Estimation ==================== + def _post_nuisance_checks(self) -> None: + """Check predictions for validity after cross-fitting completes.""" + for i_rep in range(self.n_rep): + # After full K-fold cross-fitting all observations are test observations + # in exactly one fold, so the full prediction array is populated. 
+ + # Skip checks for learners with external predictions (not registered in _learners) + if "ml_g0" in self._learners: + _check_finite_predictions(self._predictions["ml_g0"][:, i_rep], self._learners["ml_g0"].learner, "ml_g0") + if self._dml_data.binary_outcome: + _check_binary_predictions( + self._predictions["ml_g0"][:, i_rep], + self._learners["ml_g0"].learner, + "ml_g0", + self._dml_data.y_col, + ) + if "ml_g1" in self._learners: + _check_finite_predictions(self._predictions["ml_g1"][:, i_rep], self._learners["ml_g1"].learner, "ml_g1") + if self._dml_data.binary_outcome: + _check_binary_predictions( + self._predictions["ml_g1"][:, i_rep], + self._learners["ml_g1"].learner, + "ml_g1", + self._dml_data.y_col, + ) + if "ml_m" in self._learners: + _check_finite_predictions(self._predictions["ml_m"][:, i_rep], self._learners["ml_m"].learner, "ml_m") + def _nuisance_est( self, train_idx: np.ndarray, diff --git a/doubleml/irm/tests/test_irm_scalar_exceptions.py b/doubleml/irm/tests/test_irm_scalar_exceptions.py index df0aab60..59dc91a1 100644 --- a/doubleml/irm/tests/test_irm_scalar_exceptions.py +++ b/doubleml/irm/tests/test_irm_scalar_exceptions.py @@ -4,6 +4,7 @@ from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor from sklearn.linear_model import LinearRegression +import doubleml as dml from doubleml.irm.datasets import make_irm_data from doubleml.irm.irm_scalar import IRM from doubleml.plm.datasets import make_plr_CCDDHNR2018 @@ -11,6 +12,26 @@ np.random.seed(3141) obj_dml_data = make_irm_data(theta=0.5, n_obs=100, dim_x=10, return_type="DoubleMLData") +# Binary-outcome data for binary predictions check tests +np.random.seed(42) +_n = 200 +_X = np.random.normal(size=(_n, 3)) +_d_bin = (np.random.normal(size=_n) > 0).astype(float) +_y_bin = (np.random.normal(size=_n) > 0).astype(float) +_df_binary = pd.DataFrame({"y": _y_bin, "d": _d_bin, "X1": _X[:, 0], "X2": _X[:, 1], "X3": _X[:, 2]}) +obj_dml_data_binary = dml.DoubleMLData(_df_binary, 
y_col="y", d_cols="d", x_cols=["X1", "X2", "X3"]) + + +class _HardLabelClassifier(RandomForestClassifier): + """Classifier that returns hard 0/1 labels instead of probabilities — for testing only.""" + + def predict_proba(self, X): + preds = np.zeros((len(X), 2)) + preds[:, 1] = (np.arange(len(X)) % 2).astype(float) + preds[:, 0] = 1.0 - preds[:, 1] + return preds + + ml_g = RandomForestRegressor(n_estimators=10, max_depth=3, random_state=42) ml_m = RandomForestClassifier(n_estimators=10, max_depth=3, random_state=42) @@ -131,3 +152,14 @@ def test_irm_scalar_exception_normalize_ipw_type(): msg = r"Normalization indicator has to be boolean" with pytest.raises(TypeError, match=msg): IRM(obj_dml_data, normalize_ipw="True") + + +@pytest.mark.ci +def test_irm_scalar_exception_binary_predictions_g(): + """Classifier ml_g returning hard labels (0/1) instead of probabilities raises ValueError.""" + ml_m_test = RandomForestClassifier(n_estimators=5, random_state=42) + dml_obj = IRM(obj_dml_data_binary, ml_g=_HardLabelClassifier(), ml_m=ml_m_test) + dml_obj.draw_sample_splitting(n_folds=3) + msg = r"For the binary variable .+, predictions .+ are also observed to be binary" + with pytest.raises(ValueError, match=msg): + dml_obj.fit_nuisance_models() diff --git a/doubleml/plm/plr_scalar.py b/doubleml/plm/plr_scalar.py index e4aeaa51..1c38af24 100644 --- a/doubleml/plm/plr_scalar.py +++ b/doubleml/plm/plr_scalar.py @@ -12,6 +12,7 @@ from ..data.base_data import DoubleMLData from ..double_ml_linear_score import LinearScoreMixin +from ..utils._checks import _check_binary_predictions, _check_finite_predictions, _check_is_propensity from ..utils._learner import LearnerSpec, predict_nuisance @@ -73,6 +74,8 @@ def __init__( valid_scores = ["partialling out", "IV-type"] if score not in valid_scores: raise ValueError(f"Invalid score '{score}'. 
Valid scores: {valid_scores}.") + if score == "IV-type" and obj_dml_data.binary_outcome: + raise ValueError("For score = 'IV-type', additive probability models (binary outcomes) are not supported.") super().__init__( obj_dml_data=obj_dml_data, @@ -126,6 +129,14 @@ def set_learners( continue self._register_learner(name, learner) + # Warn when a classifier is used for ml_l with a binary outcome + if ml_l is not None and "ml_l" in self._learners: + if self._learners["ml_l"].is_classifier and self._dml_data.binary_outcome: + warnings.warn( + f"The ml_l learner {str(ml_l)} was identified as classifier. " "Fitting an additive probability model.", + UserWarning, + ) + # IV-type: clone ml_l to ml_g if only one provided self._handle_iv_cloning() self._reset_fit_state() @@ -173,6 +184,35 @@ def _check_data(obj_dml_data): "To fit a partially linear IV regression model use DoubleMLPLIV instead of DoubleMLPLR." ) + def _post_nuisance_checks(self) -> None: + """Check predictions for validity after cross-fitting completes.""" + for i_rep in range(self.n_rep): + # After full K-fold cross-fitting, all observations are test observations + # in exactly one fold, so the full prediction array is populated. 
+ + # Skip checks for learners with external predictions (not registered in _learners) + if "ml_l" in self._learners: + _check_finite_predictions(self._predictions["ml_l"][:, i_rep], self._learners["ml_l"].learner, "ml_l") + if "ml_m" in self._learners: + _check_finite_predictions(self._predictions["ml_m"][:, i_rep], self._learners["ml_m"].learner, "ml_m") + + # Propensity score range check when ml_m is a classifier + if self._learners["ml_m"].is_classifier: + _check_is_propensity( + self._predictions["ml_m"][:, i_rep], + self._learners["ml_m"].learner, + "ml_m", + ) + + # Binary predictions check for binary treatment + if self._dml_data.binary_treats.all(): + _check_binary_predictions( + self._predictions["ml_m"][:, i_rep], + self._learners["ml_m"].learner, + "ml_m", + self._dml_data.d_cols[0], + ) + def _nuisance_est( self, train_idx: np.ndarray, diff --git a/doubleml/plm/tests/test_plr_scalar_exceptions.py b/doubleml/plm/tests/test_plr_scalar_exceptions.py index fb1ba7a9..d49d1902 100644 --- a/doubleml/plm/tests/test_plr_scalar_exceptions.py +++ b/doubleml/plm/tests/test_plr_scalar_exceptions.py @@ -1,7 +1,7 @@ import numpy as np import pandas as pd import pytest -from sklearn.linear_model import Lasso +from sklearn.linear_model import Lasso, LogisticRegression import doubleml as dml from doubleml.plm.datasets import make_plr_CCDDHNR2018 @@ -10,6 +10,15 @@ np.random.seed(3141) obj_dml_data = make_plr_CCDDHNR2018(n_obs=100, dim_x=10, alpha=0.5) +# Binary-outcome data for binary-specific tests +np.random.seed(42) +_n = 100 +_X = np.random.normal(size=(_n, 3)) +_d = (np.random.normal(size=_n) > 0).astype(float) +_y_bin = (np.random.normal(size=_n) > 0).astype(float) +_df_binary = pd.DataFrame({"y": _y_bin, "d": _d, "X1": _X[:, 0], "X2": _X[:, 1], "X3": _X[:, 2]}) +obj_dml_data_binary = dml.DoubleMLData(_df_binary, y_col="y", d_cols="d", x_cols=["X1", "X2", "X3"]) + # Create data with instruments for IV check df = obj_dml_data.data.copy() x_cols = [c for c in 
df.columns if c.startswith("X")] @@ -110,3 +119,20 @@ def test_plr_scalar_exception_invalid_learner(): msg = r"Invalid learner provided for ml_l: provide an instance" with pytest.raises(TypeError, match=msg): dml_obj.set_learners(ml_l=Lasso) # class instead of instance + + +@pytest.mark.ci +def test_plr_scalar_exception_iv_type_binary_outcome(): + """IV-type score with binary outcome raises ValueError.""" + msg = r"For score = 'IV-type', additive probability models \(binary outcomes\) are not supported\." + with pytest.raises(ValueError, match=msg): + PLR(obj_dml_data_binary, score="IV-type") + + +@pytest.mark.ci +def test_plr_scalar_warning_binary_outcome_classifier(): + """Classifier ml_l with binary outcome warns about fitting an additive probability model.""" + dml_obj = PLR(obj_dml_data_binary) + msg = r"The ml_l learner .+ was identified as classifier\. Fitting an additive probability model\." + with pytest.warns(UserWarning, match=msg): + dml_obj.set_learners(ml_l=LogisticRegression(), ml_m=Lasso()) diff --git a/doubleml/utils/_checks.py b/doubleml/utils/_checks.py index 7db749dc..ad493e28 100644 --- a/doubleml/utils/_checks.py +++ b/doubleml/utils/_checks.py @@ -109,9 +109,13 @@ def _check_smpl_split_tpl(tpl, n_obs, check_intersect=False): return train_index, test_index -def _check_finite_predictions(preds, learner, learner_name, smpls): - test_indices = np.concatenate([test_index for _, test_index in smpls]) - if not np.all(np.isfinite(preds[test_indices])): +def _check_finite_predictions(preds, learner, learner_name, smpls=None): + if smpls is not None: + indices = np.concatenate([test_index for _, test_index in smpls]) + check_preds = preds[indices] + else: + check_preds = preds + if not np.all(np.isfinite(check_preds)): raise ValueError(f"Predictions from learner {str(learner)} for {learner_name} are not finite.") return @@ -189,9 +193,12 @@ def _check_contains_iv(obj_dml_data): return -def _check_is_propensity(preds, learner, learner_name, smpls, 
eps=1e-12): - test_indices = np.concatenate([test_index for _, test_index in smpls]) - if any((preds[test_indices] < eps) | (preds[test_indices] > 1 - eps)): +def _check_is_propensity(preds, learner, learner_name, smpls=None, eps=1e-12): + if smpls is not None: + check_preds = preds[np.concatenate([test_index for _, test_index in smpls])] + else: + check_preds = preds + if any((check_preds < eps) | (check_preds > 1 - eps)): warnings.warn( f"Propensity predictions from learner {str(learner)} for {learner_name} are close to zero or one (eps={eps})." ) From 35434bb0fc5cec60ddc2369244ed4c6ad4ef20c7 Mon Sep 17 00:00:00 2001 From: SvenKlaassen Date: Sat, 28 Feb 2026 08:32:48 +0100 Subject: [PATCH 17/38] add guideline for using absolute imports from project root --- .claude/rules/py-code-conventions.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.claude/rules/py-code-conventions.md b/.claude/rules/py-code-conventions.md index 4344a3a0..ae8b022d 100644 --- a/.claude/rules/py-code-conventions.md +++ b/.claude/rules/py-code-conventions.md @@ -48,6 +48,8 @@ from doubleml.utils._checks import _check_learner Import order (enforced by ruff/isort): standard library, third-party, local. +Use absolute imports from the project root (`doubleml.`) rather than relative imports (`..utils._checks`). + ### `from __future__ import annotations` Not required in every file. Use it when a class references its own type in annotations (forward reference). Since the project targets Python 3.10+, `list[int]`, `dict[str, T]`, and `X | Y` unions work natively without it. 
From d195fffea2345b94fbb2541de3ba4db34ce36a96 Mon Sep 17 00:00:00 2001 From: SvenKlaassen Date: Sat, 28 Feb 2026 13:11:37 +0100 Subject: [PATCH 18/38] add guidelines for tuning tests and required fixtures for scalar models --- .claude/agents/py-general-reviewer.md | 59 +++++++++++++++++++++++++++ .claude/rules/testing-conventions.md | 52 ++++++++++++++++++++++- 2 files changed, 110 insertions(+), 1 deletion(-) create mode 100644 .claude/agents/py-general-reviewer.md diff --git a/.claude/agents/py-general-reviewer.md b/.claude/agents/py-general-reviewer.md new file mode 100644 index 00000000..3f3b2a70 --- /dev/null +++ b/.claude/agents/py-general-reviewer.md @@ -0,0 +1,59 @@ +--- +name: py-general-reviewer +description: Professional Python code reviewer focusing on logic, performance, and best practices. Uses a debate-driven approach to minimize false positives. +tools: Read, Grep, Glob, Bash +model: inherit +--- + +Review Python code changes for functional correctness and industry-standard best practices. Report issues only — never edit source files. + +## Workflow + +1. **Identify Changes**: Run `git diff --name-only HEAD~1` to identify changed `.py` files. +2. **Read**: Read the content of each modified file. +3. **Internal Debate**: For each file, simulate a dialogue: + - **@Auditor**: Finds potential bugs, edge cases, and "code smells." + - **@Author**: Defends the implementation (e.g., "This is a temporary shim" or "Performance requires this complexity"). + - **@Resolution**: Agree on the final list of actionable improvements. +4. **Output**: Use the "Final Review" format specified below. + +## Review Checklist + +### 🔴 Critical (Bug Risk / Logic) +- **Edge Cases**: Unhandled `None` values, empty lists, or `0` divisors. +- **Resource Leaks**: Files or network sockets opened without `with` blocks. +- **Mutable Defaults**: Using `list` or `dict` as default arguments in functions. +- **Concurrency**: Thread-safety issues or race conditions in shared state. 
+- **Logic Errors**: Off-by-one errors or incorrect boolean logic in complex conditionals. + +### 🟡 Warning (Best Practices / Clean Code) +- **Complexity**: Functions longer than 50 lines or nesting deeper than 3 levels. +- **DRY (Don't Repeat Yourself)**: Significant logic duplication that should be a helper function. +- **Error Handling**: Using "bare" `except:` blocks instead of specific exceptions. +- **Type Hinting**: Public APIs missing type annotations for parameters or return values. +- **Hardcoding**: URLs, credentials, or magic numbers that should be constants/config. + +### 🟢 Suggestion (Style / Optimization) +- **Vectorization**: Using Python-level loops where vectorized NumPy or Pandas operations would be significantly faster. +- **Built-ins**: Re-implementing logic that exists in `itertools`, `collections`, or `pathlib`. +- **Docstrings**: Missing or outdated descriptions of function intent. + +## Output Format + +```markdown +## Final Review: `<filename>` + +### ⚖️ The Debate Summary +[1-2 sentences on what was debated between the Auditor and Author.] + +### 🚫 Resolved Issues (Blocking) +- **line N**: [issue]. **Fix**: `<suggested fix>` + +### ⚠️ Resolved Warnings +- **line N**: [issue]. **Consider**: `<suggestion>` + +### ✅ Dismissed (False Positives) +- **line N**: [Original concern] -> [Reason for dismissal] + +### Summary +[Final assessment: e.g., "3 issues found (1 critical, 2 warnings)"] diff --git a/.claude/rules/testing-conventions.md b/.claude/rules/testing-conventions.md index 0508ae42..e48b5fa8 100644 --- a/.claude/rules/testing-conventions.md +++ b/.claude/rules/testing-conventions.md @@ -88,9 +88,58 @@ with pytest.raises(ValueError, match=msg): - **Small data for speed**: `n_obs=200`, `n_folds=3` for return type / exception tests - **Larger data for accuracy**: `n_obs=500`, `n_folds=5` for estimation tests +## Tuning Tests (`test_<model>_scalar_tune_ml_models.py`) + +Scalar models with `tune_ml_models()` require a dedicated test file. Add it alongside the 5 standard scalar test files. 
+ +### Fixtures and Shared Constants + +```python +# Matches resolve_optuna_cv(cv=5) used internally — required for improvement assertions +_TUNE_CV = KFold(n_splits=5, shuffle=True, random_state=42) + +@pytest.fixture(scope="module") +def <model>_data(): + np.random.seed(3141) + return make_<model>_data(n_obs=500, dim_x=5) + +@pytest.fixture(scope="module", params=["score_a", "score_b"]) +def score(request): + return request.param +``` + +### Required Tests + +| Test | Checks | +|------|--------| +| `test_<model>_scalar_tune_basic` | Return type `dict[str, DMLOptunaResult]`; correct keys; `tuned=True`; params applied to learners; `model.fit()` succeeds. Parametrize over `score` + `_SAMPLER_CASES`. | +| `test_<model>_scalar_tune_improves_score` | `tune_res[name].best_score > cross_val_score(default_tree, ..., cv=_TUNE_CV, scoring="neg_root_mean_squared_error").mean()` | +| `test_<model>_scalar_tune_returns_self` | `return_tune_res=False` returns `self` | +| `test_<model>_scalar_tune_set_as_params_false` | Learner params unchanged; `best_params` still populated | +| `test_<model>_scalar_tune_invalid_key` | Unknown key raises `ValueError` | +| `test_<model>_scalar_tune_partial_space` | Tuning only a subset leaves unspecified learners unchanged | + +For models with `_LEARNER_PARAM_ALIASES` (e.g., IRM `"ml_g"` → `["ml_g0", "ml_g1"]`), add: + +| Test | Checks | +|------|--------| +| `test_<model>_scalar_tune_<alias>_alias` | Alias expands to concrete keys in result dict (not the alias key itself) | +| `test_<model>_scalar_tune_<alias>_alias_explicit_override` | Explicit concrete key overrides alias; verify by constraining the tuned range | + +### Scalar vs. Old API + +`DoubleMLScalar.tune_ml_models()` returns `dict[str, DMLOptunaResult]` **directly** — no repetition index. The old `DoubleML` API wraps results in a list (`tune_res[0]["ml_l"]`) because tuning runs per repetition. Scalar tuning uses the full dataset once, so the list dimension doesn't exist. 
+ +```python +# Scalar (new): tune_res["ml_l"].best_params +# Old DoubleML: tune_res[0]["ml_l"].best_params +``` + +--- + ## Naming -- Files: `test_<model>.py`, `test_<model>_scalar.py`, `test_<model>_scalar_exceptions.py` +- Files: `test_<model>.py`, `test_<model>_scalar.py`, `test_<model>_scalar_exceptions.py`, `test_<model>_scalar_tune_ml_models.py` - Functions: `test_<behavior>` — e.g., `test_coef_within_3_sigma`, `test_exception_invalid_score` - Docstrings: Every test function gets a one-line docstring explaining what it verifies @@ -102,3 +151,4 @@ with pytest.raises(ValueError, match=msg): - [ ] Seeds set for reproducibility - [ ] Test functions have descriptive names and docstrings - [ ] New scalar models have all 5 required test files (see `dml-scalar-test-structure.md`) +- [ ] If model has `tune_ml_models()`, add `test_<model>_scalar_tune_ml_models.py` with all required tuning tests From 15216f0f5fb43b0c72c83a45c9c8b1e6c825df0a Mon Sep 17 00:00:00 2001 From: SvenKlaassen Date: Sat, 28 Feb 2026 13:38:10 +0100 Subject: [PATCH 19/38] Enhance DoubleMLScalar with improved tuning functionality and tests --- doubleml/double_ml_scalar.py | 264 ++++++++++++++++- doubleml/irm/irm_scalar.py | 53 +++- .../tests/test_irm_scalar_tune_ml_models.py | 275 ++++++++++++++++++ doubleml/plm/plr_scalar.py | 83 +++++- .../tests/test_plr_scalar_tune_ml_models.py | 259 +++++++++++++++++ .../test_scalar_tune_optuna_exceptions.py | 217 ++++++++++++++ doubleml/tests/test_scalar_tune_pruning.py | 120 ++++++++ doubleml/utils/_tune_optuna.py | 49 ++-- 8 files changed, 1294 insertions(+), 26 deletions(-) create mode 100644 doubleml/irm/tests/test_irm_scalar_tune_ml_models.py create mode 100644 doubleml/plm/tests/test_plr_scalar_tune_ml_models.py create mode 100644 doubleml/tests/test_scalar_tune_optuna_exceptions.py create mode 100644 doubleml/tests/test_scalar_tune_pruning.py diff --git a/doubleml/double_ml_scalar.py b/doubleml/double_ml_scalar.py index 958e4b3e..bd39dc9e 100644 --- a/doubleml/double_ml_scalar.py +++ b/doubleml/double_ml_scalar.py @@ 
-3,7 +3,10 @@ """ from abc import ABC, abstractmethod -from typing import ClassVar, Self +from typing import TYPE_CHECKING, Any, Callable, ClassVar, Self + +if TYPE_CHECKING: + from .utils._tune_optuna import DMLOptunaResult import numpy as np @@ -13,6 +16,7 @@ from .double_ml_framework import DoubleMLFramework from .utils._checks import _check_sample_splitting from .utils._learner import LearnerInfo, LearnerSpec, validate_learner +from .utils._tune_optuna import OPTUNA_GLOBAL_SETTING_KEYS, _dml_tune_optuna, resolve_optuna_cv from .utils.resampling import DoubleMLClusterResampling, DoubleMLResampling @@ -48,6 +52,11 @@ class DoubleMLScalar(DoubleMLBase, ABC): # Subclasses define all possible learners for the model _LEARNER_SPECS: ClassVar[dict[str, LearnerSpec]] + # Shorthand aliases for tune_ml_models(): maps user-facing key → list of internal learner keys. + # Example: {"ml_g": ["ml_g0", "ml_g1"]} lets users write ml_g once to tune both. + # Subclasses override as needed; default is no aliases. + _LEARNER_PARAM_ALIASES: ClassVar[dict[str, list[str]]] = {} + def __init__( self, obj_dml_data: DoubleMLBaseData, @@ -212,15 +221,20 @@ def smpls_cluster(self) -> list | None: @abstractmethod def required_learners(self) -> list[str]: """ - Names of the required learners for current configuration. + Names of the required learners for the current configuration. Subclasses implement this as a property that returns the learner names needed based on the current score function or model configuration. + The order of this list determines the tuning order in + :meth:`tune_ml_models`. Learners that depend on earlier results (e.g., + PLR ``ml_g`` depends on ``ml_l`` and ``ml_m`` for its 2-stage target) + must appear later in the list. + Returns ------- list of str - List of required learner names. + Ordered list of required learner names. 
""" pass @@ -835,6 +849,250 @@ def _est_causal_pars_and_se(self, psi_elements: dict[str, np.ndarray]) -> None: """ pass + # ==================== Hyperparameter Tuning ==================== + + def tune_ml_models( + self, + ml_param_space: dict[str, Callable | None], + scoring_methods: dict[str, str | Callable | None] | None = None, + cv: int = 5, + optuna_settings: dict | None = None, + set_as_params: bool = True, + return_tune_res: bool = False, + ) -> "Self | dict[str, DMLOptunaResult]": # quoted because DMLOptunaResult is TYPE_CHECKING-only + """ + Tune hyperparameters for all nuisance learners using Optuna. + + Parameters + ---------- + ml_param_space : dict + Parameter space functions keyed by learner name (or alias). + Each value must be a callable taking an Optuna trial and returning a dict. + Alias keys (e.g. ``'ml_g'`` for IRM, expanding to ``'ml_g0'`` and ``'ml_g1'``) + are supported; explicit learner keys always override alias-derived entries. + scoring_methods : dict or None, optional + Scoring functions keyed by concrete learner name. If ``None``, the + estimator's default score method is used. Default is ``None``. + cv : int, optional + Number of cross-validation folds for Optuna tuning. Default is ``5``. + optuna_settings : dict or None, optional + Global or per-learner Optuna settings (e.g., ``n_trials``, ``sampler``). + Default is ``None``. + set_as_params : bool, optional + If ``True``, apply the best found parameters to the registered learner + objects so they are used in subsequent calls to :meth:`fit`. Default is ``True``. + return_tune_res : bool, optional + If ``True``, return a dict of :class:`~doubleml.utils._tune_optuna.DMLOptunaResult` + objects keyed by learner name. Default is ``False``. + + Notes + ----- + Learners are tuned in the order defined by :attr:`required_learners`. 
+ For multi-stage learners (e.g., PLR ``ml_g`` with ``score='IV-type'``), + earlier learner results are passed to :meth:`_get_tuning_data` via + ``partial_results``. If a preceding learner was not included in + ``ml_param_space``, its current (untuned) parameters are used as the + fallback when computing the intermediate target. + + Returns + ------- + self : Self + Returned when ``return_tune_res=False``. + tune_res : dict + Dict of :class:`~doubleml.utils._tune_optuna.DMLOptunaResult` objects keyed by + learner name. Returned when ``return_tune_res=True``. + """ + if not isinstance(set_as_params, bool): + raise TypeError(f"set_as_params must be True or False. Got {str(set_as_params)}.") + if not isinstance(return_tune_res, bool): + raise TypeError(f"return_tune_res must be True or False. Got {str(return_tune_res)}.") + if isinstance(cv, list): + raise TypeError( + "cv as a list of pre-made (train_idx, test_idx) pairs is not supported in tune_ml_models(). " + "Pass an integer (number of folds) or a scikit-learn cross-validation splitter instead." 
+ ) + + # Expand aliases and validate keys (also checks None, callability) + expanded_space = self._expand_tuning_param_space(ml_param_space) + + self._validate_optuna_setting_keys(optuna_settings) + + # Resolve cv once; all learners share the same splitter + cv_splitter = resolve_optuna_cv(cv) + + partial_results: dict[str, Any] = {} + for learner_name in self.required_learners: + # Skip learners not in the expanded param space or set to None + if learner_name not in expanded_space or expanded_space[learner_name] is None: + continue + # Skip learners not yet registered via set_learners() + if learner_name not in self._learners: + continue + + y_tune, x_tune = self._get_tuning_data(learner_name, partial_results, cv_splitter) + + scoring = None if scoring_methods is None else scoring_methods.get(learner_name) + + result = _dml_tune_optuna( + y=y_tune, + x=x_tune, + learner=self._learners[learner_name].learner, + param_grid_func=expanded_space[learner_name], + scoring_method=scoring, + cv=cv_splitter, + optuna_settings=optuna_settings, + learner_name=learner_name, + params_name=learner_name, + ) + partial_results[learner_name] = result + + if set_as_params and result.tuned: + self._learners[learner_name].learner.set_params(**result.best_params) + + if return_tune_res: + return partial_results + return self + + def _expand_tuning_param_space(self, ml_param_space: dict[str, Callable | None]) -> dict[str, Callable | None]: + """ + Expand alias keys in ml_param_space to concrete learner keys. + + Uses a two-pass strategy so explicit keys always override alias-derived + entries, regardless of insertion order: + + - Pass 1: for alias keys, apply with ``setdefault`` (won't override explicit keys) + - Pass 2: for explicit learner keys, apply with direct assignment (always overrides) + + Parameters + ---------- + ml_param_space : dict + Parameter space dict, may contain alias keys (e.g. ``'ml_g'`` for IRM). 
+ + Returns + ------- + dict + Expanded dict with only concrete learner keys. + + Raises + ------ + ValueError + If ``ml_param_space`` is not a non-empty dict, or if a key is neither a valid + alias nor a defined learner name. + TypeError + If a parameter space value is not callable. + """ + if not isinstance(ml_param_space, dict): + raise TypeError(f"ml_param_space must be a dict. Got {type(ml_param_space).__name__}.") + if not ml_param_space: + raise ValueError("ml_param_space must be a non-empty dictionary.") + + valid_keys = set(self._LEARNER_SPECS.keys()) | set(self._LEARNER_PARAM_ALIASES.keys()) + for key in ml_param_space: + if key not in valid_keys: + raise ValueError(f"Invalid key '{key}' in ml_param_space. " f"Valid keys: {sorted(valid_keys)}.") + + # Validate callability of non-None parameter space functions + for key, fn in ml_param_space.items(): + if fn is not None and not callable(fn): + raise TypeError( + f"Parameter space for '{key}' must be a callable function that takes a trial " + f"and returns a dict. Got {type(fn).__name__}. " + f"Example: def ml_params(trial): return {{'max_depth': trial.suggest_int('max_depth', 1, 10)}}" + ) + + expanded: dict[str, Callable | None] = {} + # Pass 1: expand alias keys (setdefault so explicit keys will win in pass 2) + for key, fn in ml_param_space.items(): + if key in self._LEARNER_PARAM_ALIASES: + for alias_target in self._LEARNER_PARAM_ALIASES[key]: + expanded.setdefault(alias_target, fn) + + # Pass 2: explicit learner keys always override alias-derived entries + for key, fn in ml_param_space.items(): + if key not in self._LEARNER_PARAM_ALIASES: + expanded[key] = fn + + return expanded + + def _validate_optuna_setting_keys(self, optuna_settings: dict | None) -> None: + """ + Validate learner-level keys provided in ``optuna_settings``. + + Parameters + ---------- + optuna_settings : dict or None + Optuna settings dict to validate. 
+ + Raises + ------ + TypeError + If ``optuna_settings`` is not a dict or None, or if a learner-specific + value is not a dict. + ValueError + If a key is not a global Optuna setting and not a valid learner name or alias. + """ + if optuna_settings is not None and not isinstance(optuna_settings, dict): + raise TypeError(f"optuna_settings must be a dict or None. Got {str(type(optuna_settings))}.") + + if not optuna_settings: # None or empty dict — no settings to validate + return + + allowed_learner_keys = set(self._LEARNER_SPECS.keys()) | set(self._LEARNER_PARAM_ALIASES.keys()) + invalid_keys = [ + key for key in optuna_settings if key not in OPTUNA_GLOBAL_SETTING_KEYS and key not in allowed_learner_keys + ] + + if invalid_keys: + valid_keys_msg = ", ".join(sorted(allowed_learner_keys)) if allowed_learner_keys else "" + raise ValueError( + f"Invalid optuna_settings keys for {self.__class__.__name__}: " + f"{', '.join(sorted(invalid_keys))}. " + f"Valid learner-specific keys are: {valid_keys_msg}." + ) + + for key in allowed_learner_keys: + if key in optuna_settings and not isinstance(optuna_settings[key], dict): + raise TypeError(f"Optuna settings for '{key}' must be a dict.") + + def _get_tuning_data( + self, + learner_name: str, + partial_results: dict[str, Any], + cv: Any, + ) -> tuple[np.ndarray, np.ndarray]: + """ + Return ``(y_target, x)`` arrays for tuning the given learner. + + Subclasses must override this method to return the appropriate data for each + learner. The ``partial_results`` argument enables multi-stage tuning (e.g., PLR + ``ml_g`` which depends on earlier ``ml_l`` and ``ml_m`` results). + + Parameters + ---------- + learner_name : str + Name of the learner to tune. + partial_results : dict + Already-computed :class:`~doubleml.utils._tune_optuna.DMLOptunaResult` + objects, keyed by learner name. + cv : cross-validator + Cross-validation splitter, already resolved by :meth:`tune_ml_models`. 
+ + Returns + ------- + y_target : np.ndarray + Target array for the learner. + x : np.ndarray + Feature matrix. + + Raises + ------ + NotImplementedError + Always; subclasses must override this method. + """ + raise NotImplementedError( + f"_get_tuning_data not implemented for {self.__class__.__name__}. " "Subclasses must override this method." + ) + def __str__(self) -> str: """ String representation of the DoubleMLScalar object. diff --git a/doubleml/irm/irm_scalar.py b/doubleml/irm/irm_scalar.py index 57f4be91..f6983368 100644 --- a/doubleml/irm/irm_scalar.py +++ b/doubleml/irm/irm_scalar.py @@ -4,7 +4,7 @@ from __future__ import annotations -from typing import ClassVar, Self +from typing import Any, ClassVar, Self import numpy as np from sklearn.base import clone @@ -83,6 +83,12 @@ class IRM(LinearScoreMixin): "ml_m": LearnerSpec("ml_m", allow_regressor=False, allow_classifier=True), } + # ml_g is a shorthand for tuning both ml_g0 and ml_g1 with the same param function. + # Explicit ml_g0 or ml_g1 keys always override the alias. + _LEARNER_PARAM_ALIASES: ClassVar[dict[str, list[str]]] = { + "ml_g": ["ml_g0", "ml_g1"], + } + def __init__( self, obj_dml_data: DoubleMLData, @@ -362,6 +368,51 @@ def _check_data(obj_dml_data: object) -> None: "needs to be specified as treatment variable." ) + def _get_tuning_data( + self, + learner_name: str, + _partial_results: dict[str, Any], + _cv: Any, + ) -> tuple[np.ndarray, np.ndarray]: + """ + Return ``(y_target, x)`` for tuning the given IRM learner. + + Parameters + ---------- + learner_name : str + Learner to tune: ``'ml_g0'``, ``'ml_g1'``, or ``'ml_m'``. + _partial_results : dict + Already-tuned DMLOptunaResult objects (unused for IRM). + _cv : cross-validator + Cross-validation splitter (unused for IRM). + + Returns + ------- + y_target : np.ndarray + Target array for the learner. + x : np.ndarray + Feature matrix. + + Raises + ------ + ValueError + If ``learner_name`` is not a valid IRM learner name. 
+ """ + y = self._dml_data.y + d = self._dml_data.d + x = self._dml_data.x + + if learner_name == "ml_g0": + mask = d == 0 + return y[mask], x[mask] + if learner_name == "ml_g1": + mask = d == 1 + return y[mask], x[mask] + if learner_name == "ml_m": + return d, x + + raise ValueError(f"Unknown learner '{learner_name}' for IRM.") + def _initialize_weights(self, weights: np.ndarray | dict | None) -> None: """Initialize weights storage.""" if weights is None: diff --git a/doubleml/irm/tests/test_irm_scalar_tune_ml_models.py b/doubleml/irm/tests/test_irm_scalar_tune_ml_models.py new file mode 100644 index 00000000..a11629b4 --- /dev/null +++ b/doubleml/irm/tests/test_irm_scalar_tune_ml_models.py @@ -0,0 +1,275 @@ +"""Tests for IRM scalar hyperparameter tuning via tune_ml_models().""" + +import numpy as np +import pytest +from sklearn.base import clone +from sklearn.model_selection import KFold, cross_val_score +from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor + +from doubleml.irm.datasets import make_irm_data +from doubleml.irm.irm_scalar import IRM +from doubleml.tests._utils_tune_optuna import ( + _SAMPLER_CASES, + _assert_tree_params, + _basic_optuna_settings, + _small_tree_params, +) +from doubleml.utils._tune_optuna import DMLOptunaResult + +# CV splitter matching tune_ml_models() default (cv=5) +_TUNE_CV = KFold(n_splits=5, shuffle=True, random_state=42) + + +@pytest.fixture(scope="module") +def irm_data(): + """IRM data fixture shared across all tests in this module.""" + np.random.seed(3142) + return make_irm_data(n_obs=500, dim_x=5) + + +@pytest.fixture(scope="module", params=["ATE", "ATTE"]) +def score(request): + """Score function variants for IRM.""" + return request.param + + +@pytest.mark.ci +@pytest.mark.parametrize("sampler_name,optuna_sampler", _SAMPLER_CASES, ids=[c[0] for c in _SAMPLER_CASES]) +def test_irm_scalar_tune_basic(irm_data, score, sampler_name, optuna_sampler): + """tune_ml_models() returns DMLOptunaResult with valid 
tree params and applies them to learners.""" + ml_g = DecisionTreeRegressor(random_state=321) + ml_m = DecisionTreeClassifier(random_state=654) + + model = IRM(irm_data, score=score) + model.set_learners(ml_g=ml_g, ml_m=ml_m) + + tune_res = model.tune_ml_models( + ml_param_space={"ml_g0": _small_tree_params, "ml_g1": _small_tree_params, "ml_m": _small_tree_params}, + optuna_settings=_basic_optuna_settings({"sampler": optuna_sampler}), + return_tune_res=True, + ) + + # Return type and keys + assert isinstance(tune_res, dict) + assert set(tune_res.keys()) == {"ml_g0", "ml_g1", "ml_m"} + + # Each result is a DMLOptunaResult with valid tree params + for key in ("ml_g0", "ml_g1"): + assert isinstance(tune_res[key], DMLOptunaResult) + assert tune_res[key].tuned is True + _assert_tree_params(tune_res[key].best_params) + + assert isinstance(tune_res["ml_m"], DMLOptunaResult) + assert tune_res["ml_m"].tuned is True + _assert_tree_params(tune_res["ml_m"].best_params) + + # Best params are applied to the registered learner objects + assert model.get_params("ml_g0")["max_depth"] == tune_res["ml_g0"].best_params["max_depth"] + assert model.get_params("ml_g1")["max_depth"] == tune_res["ml_g1"].best_params["max_depth"] + assert model.get_params("ml_m")["max_depth"] == tune_res["ml_m"].best_params["max_depth"] + + # Model fits successfully after tuning + model.fit(n_folds=3) + assert np.isfinite(model.coef).all() + + +@pytest.mark.ci +def test_irm_scalar_tune_improves_score(irm_data, score): + """Tuning default (overfitting) trees improves cross-validated neg_rmse for ml_g0 and ml_g1.""" + x, y, d = irm_data.x, irm_data.y, irm_data.d + + ml_g = DecisionTreeRegressor(random_state=321) + ml_m = DecisionTreeClassifier(random_state=654) + + # Baseline: default trees overfit on training folds → very negative neg_rmse + mask0, mask1 = d == 0, d == 1 + baseline_g0 = cross_val_score(clone(ml_g), x[mask0], y[mask0], cv=_TUNE_CV, scoring="neg_root_mean_squared_error").mean() + baseline_g1 = 
cross_val_score(clone(ml_g), x[mask1], y[mask1], cv=_TUNE_CV, scoring="neg_root_mean_squared_error").mean() + + model = IRM(irm_data, score=score) + model.set_learners(ml_g=ml_g, ml_m=ml_m) + + tune_res = model.tune_ml_models( + ml_param_space={"ml_g0": _small_tree_params, "ml_g1": _small_tree_params, "ml_m": _small_tree_params}, + optuna_settings=_basic_optuna_settings(), + return_tune_res=True, + ) + + # Optuna best_score should exceed baseline (less overfitting) + assert tune_res["ml_g0"].best_score > baseline_g0 + assert tune_res["ml_g1"].best_score > baseline_g1 + + +@pytest.mark.ci +def test_irm_scalar_tune_ml_g_alias(irm_data): + """ml_g alias expands to both ml_g0 and ml_g1; result keys are the concrete learner names.""" + model = IRM(irm_data) + model.set_learners(ml_g=DecisionTreeRegressor(random_state=1), ml_m=DecisionTreeClassifier(random_state=2)) + + tune_res = model.tune_ml_models( + ml_param_space={"ml_g": _small_tree_params, "ml_m": _small_tree_params}, + optuna_settings=_basic_optuna_settings(), + return_tune_res=True, + ) + + # Alias expands: result has ml_g0, ml_g1 (not ml_g) + assert set(tune_res.keys()) == {"ml_g0", "ml_g1", "ml_m"} + _assert_tree_params(tune_res["ml_g0"].best_params) + _assert_tree_params(tune_res["ml_g1"].best_params) + _assert_tree_params(tune_res["ml_m"].best_params) + + # Model fits after tuning + model.fit(n_folds=3) + assert np.isfinite(model.coef).all() + + +@pytest.mark.ci +def test_irm_scalar_tune_ml_g_alias_explicit_override(irm_data): + """Explicit ml_g0 key overrides the ml_g alias; ml_g1 still gets the alias function.""" + + def specific_g0_params(trial): + """Restricts max_depth to 1-3 to distinguish from _small_tree_params (1-20).""" + return { + "max_depth": trial.suggest_int("max_depth", 1, 3), + "min_samples_split": trial.suggest_int("min_samples_split", 2, 20), + "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10), + } + + model = IRM(irm_data) + 
model.set_learners(ml_g=DecisionTreeRegressor(random_state=1), ml_m=DecisionTreeClassifier(random_state=2)) + + tune_res = model.tune_ml_models( + ml_param_space={"ml_g": _small_tree_params, "ml_g0": specific_g0_params, "ml_m": _small_tree_params}, + optuna_settings=_basic_optuna_settings(), + return_tune_res=True, + ) + + assert set(tune_res.keys()) == {"ml_g0", "ml_g1", "ml_m"} + # ml_g0 used specific_g0_params: max_depth constrained to [1, 3] + assert tune_res["ml_g0"].best_params["max_depth"] <= 3 + # ml_g1 used _small_tree_params: all three keys present, max_depth up to 20 + _assert_tree_params(tune_res["ml_g1"].best_params) + + +@pytest.mark.ci +def test_irm_scalar_tune_returns_self(irm_data): + """tune_ml_models() with return_tune_res=False returns self.""" + model = IRM(irm_data) + model.set_learners(ml_g=DecisionTreeRegressor(random_state=1), ml_m=DecisionTreeClassifier(random_state=2)) + + result = model.tune_ml_models( + ml_param_space={"ml_g": _small_tree_params, "ml_m": _small_tree_params}, + optuna_settings=_basic_optuna_settings(), + ) + + assert result is model + + +@pytest.mark.ci +def test_irm_scalar_tune_set_as_params_false(irm_data): + """tune_ml_models(set_as_params=False) finds best params but does not apply them to learners.""" + model = IRM(irm_data) + model.set_learners( + ml_g=DecisionTreeRegressor(max_depth=1, random_state=1), + ml_m=DecisionTreeClassifier(max_depth=1, random_state=2), + ) + + tune_res = model.tune_ml_models( + ml_param_space={"ml_g": _small_tree_params, "ml_m": _small_tree_params}, + optuna_settings=_basic_optuna_settings(), + set_as_params=False, + return_tune_res=True, + ) + + # Learner params are unchanged + assert model.get_params("ml_g0")["max_depth"] == 1 + assert model.get_params("ml_g1")["max_depth"] == 1 + assert model.get_params("ml_m")["max_depth"] == 1 + # But tune_res still has valid best params + _assert_tree_params(tune_res["ml_g0"].best_params) + _assert_tree_params(tune_res["ml_g1"].best_params) + 
_assert_tree_params(tune_res["ml_m"].best_params) + + +@pytest.mark.ci +def test_irm_scalar_tune_invalid_key(irm_data): + """_expand_tuning_param_space() raises ValueError for unknown keys.""" + model = IRM(irm_data) + model.set_learners(ml_g=DecisionTreeRegressor(random_state=1), ml_m=DecisionTreeClassifier(random_state=2)) + + with pytest.raises(ValueError, match="Invalid key 'ml_z' in ml_param_space"): + model.tune_ml_models( + ml_param_space={"ml_z": _small_tree_params}, + optuna_settings=_basic_optuna_settings(), + ) + + +@pytest.mark.ci +def test_irm_scalar_tune_partial_space(irm_data): + """Tuning only a subset of learners leaves unspecified learners unchanged.""" + model = IRM(irm_data) + model.set_learners( + ml_g=DecisionTreeRegressor(max_depth=5, random_state=1), + ml_m=DecisionTreeClassifier(max_depth=5, random_state=2), + ) + + tune_res = model.tune_ml_models( + ml_param_space={"ml_g0": _small_tree_params}, # only ml_g0 + optuna_settings=_basic_optuna_settings(), + return_tune_res=True, + ) + + # Only ml_g0 was tuned + assert set(tune_res.keys()) == {"ml_g0"} + _assert_tree_params(tune_res["ml_g0"].best_params) + # ml_g1 and ml_m max_depth are unchanged + assert model.get_params("ml_g1")["max_depth"] == 5 + assert model.get_params("ml_m")["max_depth"] == 5 + + +@pytest.fixture( + scope="module", + params=["int", "kfold_splitter"], + ids=["cv=int", "cv=KFold"], +) +def cv_variant(request): + """Different cv argument types accepted by tune_ml_models(): int and splitter.""" + if request.param == "int": + return 3 + return KFold(n_splits=3, shuffle=True, random_state=7) + + +@pytest.mark.ci +def test_irm_scalar_tune_cv_types(irm_data, cv_variant): + """tune_ml_models() succeeds for supported cv argument types: int and splitter.""" + model = IRM(irm_data) + model.set_learners(ml_g=DecisionTreeRegressor(random_state=1), ml_m=DecisionTreeClassifier(random_state=2)) + + tune_res = model.tune_ml_models( + ml_param_space={"ml_g": _small_tree_params, "ml_m": 
_small_tree_params}, + cv=cv_variant, + optuna_settings=_basic_optuna_settings(), + return_tune_res=True, + ) + + for name in ("ml_g0", "ml_g1", "ml_m"): + assert name in tune_res + assert tune_res[name].tuned is True + assert isinstance(tune_res[name].best_params, dict) + assert np.isfinite(tune_res[name].best_score) + + +@pytest.mark.ci +def test_irm_scalar_tune_cv_list_raises(irm_data): + """tune_ml_models() raises TypeError when cv is a list of pre-made split pairs.""" + model = IRM(irm_data) + model.set_learners(ml_g=DecisionTreeRegressor(random_state=1), ml_m=DecisionTreeClassifier(random_state=2)) + cv_list = list(KFold(n_splits=3).split(np.arange(irm_data.n_obs))) + + msg = r"cv as a list of pre-made \(train_idx, test_idx\) pairs is not supported" + with pytest.raises(TypeError, match=msg): + model.tune_ml_models( + ml_param_space={"ml_g": _small_tree_params, "ml_m": _small_tree_params}, + cv=cv_list, + optuna_settings=_basic_optuna_settings(), + ) diff --git a/doubleml/plm/plr_scalar.py b/doubleml/plm/plr_scalar.py index 1c38af24..caf290de 100644 --- a/doubleml/plm/plr_scalar.py +++ b/doubleml/plm/plr_scalar.py @@ -5,10 +5,11 @@ from __future__ import annotations import warnings -from typing import Dict, List, Optional, Self +from typing import Any, ClassVar, Dict, List, Optional, Self import numpy as np from sklearn.base import clone +from sklearn.model_selection import cross_val_predict from ..data.base_data import DoubleMLData from ..double_ml_linear_score import LinearScoreMixin @@ -37,7 +38,7 @@ class PLR(LinearScoreMixin): """ # Define learner specifications for PLR - _LEARNER_SPECS: Dict[str, LearnerSpec] = { + _LEARNER_SPECS: ClassVar[Dict[str, LearnerSpec]] = { "ml_l": LearnerSpec("ml_l", allow_regressor=True, allow_classifier=True, binary_data_check="outcome"), "ml_m": LearnerSpec("ml_m", allow_regressor=True, allow_classifier=True, binary_data_check="treatment"), "ml_g": LearnerSpec("ml_g", allow_regressor=True, allow_classifier=False), @@ 
-277,6 +278,84 @@ def _nuisance_est( ml_g.fit(x[train_j], y[train_j] - theta_initial * d[train_j]) self._predictions["ml_g"][test_j, i_rep] = predict_nuisance(ml_g, x[test_j], ml_g_info.is_classifier) + def _get_tuning_data( + self, + learner_name: str, + partial_results: dict[str, Any], + cv: Any, + ) -> tuple[np.ndarray, np.ndarray]: + """ + Return ``(y_target, x)`` for tuning the given PLR learner. + + Parameters + ---------- + learner_name : str + Learner to tune: ``'ml_l'``, ``'ml_m'``, or ``'ml_g'``. + partial_results : dict + Already-tuned DMLOptunaResult objects, keyed by learner name. + Used for 2-stage ``ml_g`` tuning: applies the best params from + ``ml_l`` and ``ml_m`` when computing the initial theta estimate. + If ``ml_l`` or ``ml_m`` were not tuned in this call, their current + (untuned) learner params are used as a fallback. + cv : cross-validator + Cross-validation splitter, already resolved in :meth:`tune_ml_models`. + + Returns + ------- + y_target : np.ndarray + Target array for the learner. + x : np.ndarray + Feature matrix. + + Raises + ------ + ValueError + If ``learner_name`` is not a valid PLR learner name. + """ + y = self._dml_data.y + d = self._dml_data.d + x = self._dml_data.x + + if learner_name == "ml_l": + return y, x + if learner_name == "ml_m": + return d, x + if learner_name == "ml_g": + # 2-stage: compute initial theta via cross-validated ml_l/ml_m predictions. + # Apply tuned params if available, otherwise use the current learner params. + if "ml_l" not in self._learners or "ml_m" not in self._learners: + raise ValueError( + "Tuning 'ml_g' requires 'ml_l' and 'ml_m' to be registered. " + "Call set_learners(ml_l=..., ml_m=...) before tuning 'ml_g'." 
+ ) + l_info = self._learners["ml_l"] + m_info = self._learners["ml_m"] + + l_est = clone(l_info.learner) + if "ml_l" in partial_results: + l_est.set_params(**partial_results["ml_l"].best_params) + + m_est = clone(m_info.learner) + if "ml_m" in partial_results: + m_est.set_params(**partial_results["ml_m"].best_params) + + if l_info.is_classifier: + l_hat = cross_val_predict(l_est, x, y, cv=cv, method="predict_proba")[:, 1] + else: + l_hat = cross_val_predict(l_est, x, y, cv=cv) + + if m_info.is_classifier: + m_hat = cross_val_predict(m_est, x, d, cv=cv, method="predict_proba")[:, 1] + else: + m_hat = cross_val_predict(m_est, x, d, cv=cv) + + psi_a = -((d - m_hat) ** 2) + psi_b = (d - m_hat) * (y - l_hat) + theta_initial = -np.nanmean(psi_b) / np.nanmean(psi_a) + return y - theta_initial * d, x + + raise ValueError(f"Unknown learner '{learner_name}' for PLR.") + def _get_score_elements(self) -> Dict[str, np.ndarray]: y = self._dml_data.y d = self._dml_data.d diff --git a/doubleml/plm/tests/test_plr_scalar_tune_ml_models.py b/doubleml/plm/tests/test_plr_scalar_tune_ml_models.py new file mode 100644 index 00000000..fb231cea --- /dev/null +++ b/doubleml/plm/tests/test_plr_scalar_tune_ml_models.py @@ -0,0 +1,259 @@ +"""Tests for PLR scalar hyperparameter tuning via tune_ml_models().""" + +import numpy as np +import pytest +from sklearn.base import clone +from sklearn.model_selection import KFold, cross_val_predict, cross_val_score +from sklearn.tree import DecisionTreeRegressor + +from doubleml.plm.datasets import make_plr_CCDDHNR2018 +from doubleml.plm.plr_scalar import PLR +from doubleml.tests._utils_tune_optuna import ( + _SAMPLER_CASES, + _assert_tree_params, + _basic_optuna_settings, + _small_tree_params, +) +from doubleml.utils._tune_optuna import DMLOptunaResult + +# CV splitter matching tune_ml_models() default (cv=5) +_TUNE_CV = KFold(n_splits=5, shuffle=True, random_state=42) + + +@pytest.fixture(scope="module") +def plr_data(): + """PLR data fixture shared 
across all tests in this module.""" + np.random.seed(3141) + return make_plr_CCDDHNR2018(n_obs=500, dim_x=5, alpha=0.5) + + +@pytest.fixture(scope="module", params=["partialling out", "IV-type"]) +def score(request): + """Score function variants for PLR.""" + return request.param + + +@pytest.mark.ci +@pytest.mark.parametrize("sampler_name,optuna_sampler", _SAMPLER_CASES, ids=[c[0] for c in _SAMPLER_CASES]) +def test_plr_scalar_tune_basic(plr_data, score, sampler_name, optuna_sampler): + """tune_ml_models() returns DMLOptunaResult with valid tree params and applies them to learners.""" + ml_l = DecisionTreeRegressor(random_state=123) + ml_m = DecisionTreeRegressor(random_state=456) + + model = PLR(plr_data, score=score) + model.set_learners(ml_l=ml_l, ml_m=ml_m) + if score == "IV-type": + model.set_learners(ml_g=DecisionTreeRegressor(random_state=789)) + + param_space = {"ml_l": _small_tree_params, "ml_m": _small_tree_params} + if score == "IV-type": + param_space["ml_g"] = _small_tree_params + + tune_res = model.tune_ml_models( + ml_param_space=param_space, + optuna_settings=_basic_optuna_settings({"sampler": optuna_sampler}), + return_tune_res=True, + ) + + # Return type and keys + assert isinstance(tune_res, dict) + expected_keys = {"ml_l", "ml_m"} + if score == "IV-type": + expected_keys.add("ml_g") + assert set(tune_res.keys()) == expected_keys + + # Each result is a DMLOptunaResult with valid tree params + for key in tune_res: + assert isinstance(tune_res[key], DMLOptunaResult) + assert tune_res[key].tuned is True + _assert_tree_params(tune_res[key].best_params) + + # Best params are applied to the registered learner objects + assert model.get_params("ml_l")["max_depth"] == tune_res["ml_l"].best_params["max_depth"] + assert model.get_params("ml_m")["max_depth"] == tune_res["ml_m"].best_params["max_depth"] + if score == "IV-type": + assert model.get_params("ml_g")["max_depth"] == tune_res["ml_g"].best_params["max_depth"] + + # Model fits successfully after 
tuning + model.fit(n_folds=3) + assert np.isfinite(model.coef).all() + + +@pytest.mark.ci +def test_plr_scalar_tune_improves_score(plr_data, score): + """Tuning a default (overfitting) tree improves cross-validated neg_rmse.""" + x, y, d = plr_data.x, plr_data.y, plr_data.d + + ml_l = DecisionTreeRegressor(random_state=123) + ml_m = DecisionTreeRegressor(random_state=456) + + # Baseline: default trees overfit on training folds → high test RMSE → very negative neg_rmse + baseline_l = cross_val_score(clone(ml_l), x, y, cv=_TUNE_CV, scoring="neg_root_mean_squared_error").mean() + baseline_m = cross_val_score(clone(ml_m), x, d, cv=_TUNE_CV, scoring="neg_root_mean_squared_error").mean() + + model = PLR(plr_data, score=score) + model.set_learners(ml_l=ml_l, ml_m=ml_m) + if score == "IV-type": + model.set_learners(ml_g=DecisionTreeRegressor(random_state=789)) + + param_space = {"ml_l": _small_tree_params, "ml_m": _small_tree_params} + if score == "IV-type": + param_space["ml_g"] = _small_tree_params + + tune_res = model.tune_ml_models( + ml_param_space=param_space, + optuna_settings=_basic_optuna_settings(), + return_tune_res=True, + ) + + # Optuna best_score (neg_root_mean_squared_error) should exceed baseline (less overfitting) + assert tune_res["ml_l"].best_score > baseline_l + assert tune_res["ml_m"].best_score > baseline_m + + if score == "IV-type": + # Replicate _get_tuning_data's 2-stage target for ml_g: y - theta_initial * d. + # Uses _TUNE_CV which matches resolve_optuna_cv(cv=5) used internally. 
+ ml_g = DecisionTreeRegressor(random_state=789) + l_hat = cross_val_predict(clone(ml_l), x, y, cv=_TUNE_CV) + m_hat = cross_val_predict(clone(ml_m), x, d, cv=_TUNE_CV) + psi_a = -((d - m_hat) ** 2) + psi_b = (d - m_hat) * (y - l_hat) + theta_initial = -np.nanmean(psi_b) / np.nanmean(psi_a) + y_g = y - theta_initial * d + baseline_g = cross_val_score(clone(ml_g), x, y_g, cv=_TUNE_CV, scoring="neg_root_mean_squared_error").mean() + assert tune_res["ml_g"].best_score > baseline_g + + +@pytest.mark.ci +def test_plr_scalar_tune_returns_self(plr_data): + """tune_ml_models() with return_tune_res=False returns self.""" + model = PLR(plr_data) + model.set_learners(ml_l=DecisionTreeRegressor(random_state=1), ml_m=DecisionTreeRegressor(random_state=2)) + + result = model.tune_ml_models( + ml_param_space={"ml_l": _small_tree_params, "ml_m": _small_tree_params}, + optuna_settings=_basic_optuna_settings(), + ) + + assert result is model + + +@pytest.mark.ci +def test_plr_scalar_tune_set_as_params_false(plr_data): + """tune_ml_models(set_as_params=False) finds best params but does not apply them to learners.""" + model = PLR(plr_data) + model.set_learners( + ml_l=DecisionTreeRegressor(max_depth=1, random_state=1), + ml_m=DecisionTreeRegressor(max_depth=1, random_state=2), + ) + + tune_res = model.tune_ml_models( + ml_param_space={"ml_l": _small_tree_params, "ml_m": _small_tree_params}, + optuna_settings=_basic_optuna_settings(), + set_as_params=False, + return_tune_res=True, + ) + + # Learner params are unchanged + assert model.get_params("ml_l")["max_depth"] == 1 + assert model.get_params("ml_m")["max_depth"] == 1 + # But tune_res still has valid best params + _assert_tree_params(tune_res["ml_l"].best_params) + _assert_tree_params(tune_res["ml_m"].best_params) + + +@pytest.mark.ci +def test_plr_scalar_tune_invalid_key(plr_data): + """_expand_tuning_param_space() raises ValueError for unknown keys.""" + model = PLR(plr_data) + model.set_learners(ml_l=DecisionTreeRegressor(), 
ml_m=DecisionTreeRegressor()) + + with pytest.raises(ValueError, match="Invalid key 'ml_z' in ml_param_space"): + model.tune_ml_models( + ml_param_space={"ml_z": _small_tree_params}, + optuna_settings=_basic_optuna_settings(), + ) + + +@pytest.fixture( + scope="module", + params=["int", "kfold_splitter"], + ids=["cv=int", "cv=KFold"], +) +def cv_variant(request): + """Different cv argument types accepted by tune_ml_models(): int and splitter.""" + if request.param == "int": + return 3 + return KFold(n_splits=3, shuffle=True, random_state=7) + + +@pytest.mark.ci +def test_plr_scalar_tune_cv_types(plr_data, cv_variant): + """tune_ml_models() succeeds for supported cv argument types: int and splitter.""" + model = PLR(plr_data) + model.set_learners(ml_l=DecisionTreeRegressor(random_state=1), ml_m=DecisionTreeRegressor(random_state=2)) + + tune_res = model.tune_ml_models( + ml_param_space={"ml_l": _small_tree_params, "ml_m": _small_tree_params}, + cv=cv_variant, + optuna_settings=_basic_optuna_settings(), + return_tune_res=True, + ) + + for name in ("ml_l", "ml_m"): + assert name in tune_res + assert tune_res[name].tuned is True + assert isinstance(tune_res[name].best_params, dict) + assert np.isfinite(tune_res[name].best_score) + + +@pytest.mark.ci +def test_plr_scalar_tune_cv_list_raises(plr_data): + """tune_ml_models() raises TypeError when cv is a list of pre-made split pairs.""" + model = PLR(plr_data) + model.set_learners(ml_l=DecisionTreeRegressor(random_state=1), ml_m=DecisionTreeRegressor(random_state=2)) + cv_list = list(KFold(n_splits=3).split(np.arange(plr_data.n_obs))) + + msg = r"cv as a list of pre-made \(train_idx, test_idx\) pairs is not supported" + with pytest.raises(TypeError, match=msg): + model.tune_ml_models( + ml_param_space={"ml_l": _small_tree_params, "ml_m": _small_tree_params}, + cv=cv_list, + optuna_settings=_basic_optuna_settings(), + ) + + +@pytest.mark.ci +def test_plr_scalar_tune_partial_space(plr_data): + """Tuning only a subset of 
learners leaves unspecified learners unchanged.""" + model = PLR(plr_data) + model.set_learners( + ml_l=DecisionTreeRegressor(max_depth=5, random_state=1), + ml_m=DecisionTreeRegressor(max_depth=5, random_state=2), + ) + + tune_res = model.tune_ml_models( + ml_param_space={"ml_l": _small_tree_params}, # only ml_l + optuna_settings=_basic_optuna_settings(), + return_tune_res=True, + ) + + # Only ml_l was tuned + assert set(tune_res.keys()) == {"ml_l"} + _assert_tree_params(tune_res["ml_l"].best_params) + # ml_m max_depth is unchanged + assert model.get_params("ml_m")["max_depth"] == 5 + + +@pytest.mark.ci +def test_plr_scalar_tune_ml_g_missing_ml_l_ml_m(plr_data): + """Tuning ml_g without ml_l and ml_m registered raises ValueError.""" + model = PLR(plr_data, score="IV-type") + model.set_learners(ml_g=DecisionTreeRegressor(random_state=1)) + + msg = r"Tuning 'ml_g' requires 'ml_l' and 'ml_m' to be registered\." + with pytest.raises(ValueError, match=msg): + model.tune_ml_models( + ml_param_space={"ml_g": _small_tree_params}, + optuna_settings=_basic_optuna_settings(), + ) diff --git a/doubleml/tests/test_scalar_tune_optuna_exceptions.py b/doubleml/tests/test_scalar_tune_optuna_exceptions.py new file mode 100644 index 00000000..78e8c7b0 --- /dev/null +++ b/doubleml/tests/test_scalar_tune_optuna_exceptions.py @@ -0,0 +1,217 @@ +"""Tests for DoubleMLScalar.tune_ml_models() input validation and error handling.""" + +import re + +import numpy as np +import pytest +from sklearn.model_selection import KFold +from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor + +from doubleml.irm.datasets import make_irm_data +from doubleml.irm.irm_scalar import IRM +from doubleml.plm.datasets import make_plr_CCDDHNR2018 +from doubleml.plm.plr_scalar import PLR +from doubleml.tests._utils_tune_optuna import _basic_optuna_settings, _small_tree_params + +# ── Shared fixtures ──────────────────────────────────────────────────────────── + +np.random.seed(42) +_plr_data = 
make_plr_CCDDHNR2018(n_obs=100, dim_x=5) +_irm_data = make_irm_data(n_obs=100, dim_x=5) + + +@pytest.fixture(scope="module") +def plr_model(): + """PLR scalar model with learners set (not fitted) for reuse across exception tests.""" + model = PLR(_plr_data) + model.set_learners( + ml_l=DecisionTreeRegressor(random_state=1), + ml_m=DecisionTreeRegressor(random_state=2), + ) + return model + + +@pytest.fixture(scope="module") +def irm_model(): + """IRM scalar model with learners set (not fitted) for reuse across exception tests.""" + model = IRM(_irm_data) + model.set_learners( + ml_g=DecisionTreeRegressor(random_state=1), + ml_m=DecisionTreeClassifier(random_state=2), + ) + return model + + +# ── ml_param_space validation ────────────────────────────────────────────────── + + +@pytest.mark.ci +@pytest.mark.parametrize( + "ml_param_space, exc, msg", + [ + (None, TypeError, "ml_param_space must be a dict. Got NoneType."), + ({}, ValueError, "ml_param_space must be a non-empty dictionary."), + ( + {"ml_l": "not-callable"}, + TypeError, + "Parameter space for 'ml_l' must be a callable function that takes a trial and returns a dict. 
Got str.", + ), + ], +) +def test_scalar_tune_invalid_param_space(plr_model, ml_param_space, exc, msg): + """tune_ml_models() raises on None, empty, or non-callable ml_param_space.""" + with pytest.raises(exc, match=re.escape(msg)): + plr_model.tune_ml_models(ml_param_space, optuna_settings=_basic_optuna_settings()) + + +@pytest.mark.ci +@pytest.mark.parametrize( + "bad_key, model_name", + [ + ("ml_z", "PLR"), + ("ml_g0", "PLR"), + ], +) +def test_scalar_tune_invalid_param_space_key_plr(plr_model, bad_key, model_name): + """_expand_tuning_param_space() raises ValueError for keys not valid for PLR.""" + with pytest.raises(ValueError, match=re.escape(f"Invalid key '{bad_key}' in ml_param_space")): + plr_model.tune_ml_models( + {bad_key: _small_tree_params}, + optuna_settings=_basic_optuna_settings(), + ) + + +@pytest.mark.ci +@pytest.mark.parametrize("bad_key", ["ml_l", "ml_z"]) +def test_scalar_tune_invalid_param_space_key_irm(irm_model, bad_key): + """_expand_tuning_param_space() raises ValueError for keys not valid for IRM.""" + with pytest.raises(ValueError, match=re.escape(f"Invalid key '{bad_key}' in ml_param_space")): + irm_model.tune_ml_models( + {bad_key: _small_tree_params}, + optuna_settings=_basic_optuna_settings(), + ) + + +# ── Boolean flag validation ──────────────────────────────────────────────────── + + +@pytest.mark.ci +@pytest.mark.parametrize("set_as_params", ["invalid", None, 1]) +def test_scalar_tune_invalid_set_as_params(plr_model, set_as_params): + """tune_ml_models() raises TypeError for non-bool set_as_params.""" + msg = re.escape(f"set_as_params must be True or False. 
Got {str(set_as_params)}.") + with pytest.raises(TypeError, match=msg): + plr_model.tune_ml_models( + {"ml_l": _small_tree_params}, + set_as_params=set_as_params, + optuna_settings=_basic_optuna_settings(), + ) + + +@pytest.mark.ci +@pytest.mark.parametrize("return_tune_res", ["invalid", None, 1]) +def test_scalar_tune_invalid_return_tune_res(plr_model, return_tune_res): + """tune_ml_models() raises TypeError for non-bool return_tune_res.""" + msg = re.escape(f"return_tune_res must be True or False. Got {str(return_tune_res)}.") + with pytest.raises(TypeError, match=msg): + plr_model.tune_ml_models( + {"ml_l": _small_tree_params}, + return_tune_res=return_tune_res, + optuna_settings=_basic_optuna_settings(), + ) + + +# ── optuna_settings validation ───────────────────────────────────────────────── + + +@pytest.mark.ci +@pytest.mark.parametrize( + "optuna_settings, exc, msg", + [ + ("invalid", TypeError, "optuna_settings must be a dict or None. Got <class 'str'>."), + ( + {"ml_g0": {"n_trials": 2}}, + ValueError, + "Invalid optuna_settings keys for PLR: ml_g0. 
Valid learner-specific keys are:", + ), + ({"ml_l": "not-a-dict"}, TypeError, "Optuna settings for 'ml_l' must be a dict."), + ], +) +def test_scalar_tune_invalid_optuna_settings_plr(plr_model, optuna_settings, exc, msg): + """tune_ml_models() raises on non-dict, invalid learner key, or non-dict learner settings for PLR.""" + with pytest.raises(exc, match=re.escape(msg)): + plr_model.tune_ml_models({"ml_l": _small_tree_params}, optuna_settings=optuna_settings) + + +@pytest.mark.ci +@pytest.mark.parametrize( + "invalid_key", + ["ml_l", "ml_z"], +) +def test_scalar_tune_invalid_optuna_settings_key_irm(irm_model, invalid_key): + """tune_ml_models() raises ValueError for optuna_settings keys not valid for IRM.""" + with pytest.raises(ValueError, match=f"Invalid optuna_settings keys for IRM: {invalid_key}"): + irm_model.tune_ml_models( + {"ml_g": _small_tree_params, "ml_m": _small_tree_params}, + optuna_settings={invalid_key: {"n_trials": 2}}, + ) + + +# ── cv validation (delegated to resolve_optuna_cv) ──────────────────────────── + + +@pytest.mark.ci +@pytest.mark.parametrize( + "cv, exc, msg", + [ + ("invalid", TypeError, "cv must not be provided as a string."), + (1, ValueError, "The number of folds used for tuning must be at least two. 1 was passed."), + ], +) +def test_scalar_tune_invalid_cv(plr_model, cv, exc, msg): + """tune_ml_models() raises for string cv or cv < 2.""" + with pytest.raises(exc, match=re.escape(msg)): + plr_model.tune_ml_models( + {"ml_l": _small_tree_params}, + cv=cv, + optuna_settings=_basic_optuna_settings(), + ) + + +@pytest.mark.ci +def test_scalar_tune_non_iterable_cv(plr_model): + """tune_ml_models() raises TypeError for a non-iterable cv object.""" + + class NonIterableCV: + pass + + msg = ( + "cv must be an integer >= 2, a scikit-learn cross-validation splitter, " + "or an iterable of (train_indices, test_indices) pairs." 
+ ) + with pytest.raises(TypeError, match=re.escape(msg)): + plr_model.tune_ml_models( + {"ml_l": _small_tree_params}, + cv=NonIterableCV(), + optuna_settings=_basic_optuna_settings(), + ) + + +# ── cv variants (positive behavior) ─────────────────────────────────────────── + + +@pytest.mark.ci +def test_scalar_tune_cv_variants(plr_model): + """tune_ml_models() accepts integer and KFold splitter as cv.""" + param_space = {"ml_l": _small_tree_params, "ml_m": _small_tree_params} + settings = _basic_optuna_settings() + + # integer cv + result = plr_model.tune_ml_models(param_space, cv=3, optuna_settings=settings, return_tune_res=True) + assert "ml_l" in result + + # KFold splitter + result = plr_model.tune_ml_models( + param_space, cv=KFold(n_splits=3, shuffle=True, random_state=0), optuna_settings=settings, return_tune_res=True + ) + assert "ml_l" in result diff --git a/doubleml/tests/test_scalar_tune_pruning.py b/doubleml/tests/test_scalar_tune_pruning.py new file mode 100644 index 00000000..75459139 --- /dev/null +++ b/doubleml/tests/test_scalar_tune_pruning.py @@ -0,0 +1,120 @@ +"""Tests for per-fold pruning support in DoubleMLScalar.tune_ml_models().""" + +import numpy as np +import optuna +import pytest +from sklearn.tree import DecisionTreeRegressor + +from doubleml.plm.datasets import make_plr_CCDDHNR2018 +from doubleml.plm.plr_scalar import PLR +from doubleml.tests._utils_tune_optuna import _small_tree_params + +# ── Shared fixtures ──────────────────────────────────────────────────────────── + +np.random.seed(42) +_plr_data = make_plr_CCDDHNR2018(n_obs=100, dim_x=5) + + +@pytest.fixture(scope="module") +def plr_model(): + """PLR scalar model for reuse across pruning tests.""" + model = PLR(_plr_data) + model.set_learners( + ml_l=DecisionTreeRegressor(random_state=1), + ml_m=DecisionTreeRegressor(random_state=2), + ) + return model + + +# ── Pruning tests ────────────────────────────────────────────────────────────── + + +@pytest.mark.ci +def 
test_scalar_tune_with_median_pruner(plr_model): + """tune_ml_models() completes successfully when MedianPruner is passed via study_kwargs.""" + param_space = {"ml_l": _small_tree_params, "ml_m": _small_tree_params} + settings = { + "n_trials": 8, + "sampler": optuna.samplers.RandomSampler(seed=3141), + "study_kwargs": {"pruner": optuna.pruners.MedianPruner(n_startup_trials=1, n_warmup_steps=0)}, + "verbosity": optuna.logging.WARNING, + } + + result = plr_model.tune_ml_models(param_space, cv=3, optuna_settings=settings, return_tune_res=True) + + for name in ("ml_l", "ml_m"): + assert name in result + assert result[name].tuned is True + assert isinstance(result[name].best_params, dict) + assert np.isfinite(result[name].best_score) + # At least one complete trial must exist (RuntimeError raised otherwise) + complete = [t for t in result[name].study.trials if t.state == optuna.trial.TrialState.COMPLETE] + assert len(complete) >= 1 + + +@pytest.mark.ci +def test_scalar_tune_pruner_produces_pruned_trials(plr_model): + """MedianPruner with n_startup_trials=1 produces at least one pruned trial over enough trials.""" + param_space = {"ml_l": _small_tree_params} + settings = { + "n_trials": 20, + "sampler": optuna.samplers.RandomSampler(seed=99), + "study_kwargs": {"pruner": optuna.pruners.MedianPruner(n_startup_trials=1, n_warmup_steps=0)}, + "verbosity": optuna.logging.WARNING, + } + + result = plr_model.tune_ml_models(param_space, cv=3, optuna_settings=settings, return_tune_res=True) + + study = result["ml_l"].study + pruned = [t for t in study.trials if t.state == optuna.trial.TrialState.PRUNED] + assert len(pruned) >= 1, "Expected at least one pruned trial with MedianPruner(n_startup_trials=1) over 20 trials" + + +@pytest.mark.ci +def test_scalar_tune_all_trials_pruned_raises(plr_model): + """tune_ml_models() raises RuntimeError when a pruner eliminates all trials.""" + + class _AlwaysPruner(optuna.pruners.BasePruner): + """Prune every trial unconditionally (even step 
0).""" + + def prune(self, study: optuna.Study, trial: optuna.trial.FrozenTrial) -> bool: + return True + + param_space = {"ml_l": _small_tree_params} + settings = { + "n_trials": 3, + "study_kwargs": {"pruner": _AlwaysPruner()}, + "verbosity": optuna.logging.WARNING, + } + + with pytest.raises(RuntimeError, match="Optuna optimization failed to produce any complete trials."): + plr_model.tune_ml_models(param_space, cv=3, optuna_settings=settings) + + +@pytest.mark.ci +def test_scalar_tune_pruner_per_learner(plr_model): + """Per-learner study_kwargs pruner applies only to that learner; the other learner is unaffected.""" + param_space = {"ml_l": _small_tree_params, "ml_m": _small_tree_params} + settings = { + "n_trials": 20, + "sampler": optuna.samplers.RandomSampler(seed=3141), + "verbosity": optuna.logging.WARNING, + # ml_l: aggressive pruner → expect pruned trials + "ml_l": { + "study_kwargs": {"pruner": optuna.pruners.MedianPruner(n_startup_trials=1, n_warmup_steps=0)}, + }, + # ml_m: explicitly disable pruning → zero pruned trials + "ml_m": { + "study_kwargs": {"pruner": optuna.pruners.NopPruner()}, + }, + } + + result = plr_model.tune_ml_models(param_space, cv=3, optuna_settings=settings, return_tune_res=True) + + # ml_l: expect at least one pruned trial due to the per-learner MedianPruner + ml_l_pruned = [t for t in result["ml_l"].study.trials if t.state == optuna.trial.TrialState.PRUNED] + assert len(ml_l_pruned) >= 1, "Expected ml_l to have pruned trials with a per-learner MedianPruner" + + # ml_m: NopPruner → all 20 trials should be complete + ml_m_pruned = [t for t in result["ml_m"].study.trials if t.state == optuna.trial.TrialState.PRUNED] + assert len(ml_m_pruned) == 0, "Expected ml_m to have no pruned trials since NoPruner was configured" diff --git a/doubleml/utils/_tune_optuna.py b/doubleml/utils/_tune_optuna.py index 36d8f7e7..f3e2a821 100644 --- a/doubleml/utils/_tune_optuna.py +++ b/doubleml/utils/_tune_optuna.py @@ -27,7 +27,8 @@ import numpy as
np import optuna from sklearn.base import clone, is_classifier, is_regressor -from sklearn.model_selection import BaseCrossValidator, KFold, cross_val_score +from sklearn.metrics import check_scoring +from sklearn.model_selection import BaseCrossValidator, KFold logger = logging.getLogger(__name__) @@ -400,9 +401,9 @@ def _check_tuning_inputs( Returns ------- - cross-validator or iterable - Cross-validation splitter compatible with - :func:`sklearn.model_selection.cross_val_score`. + cross-validator or list + Cross-validation splitter or pre-made list of ``(train, test)`` index + pairs as returned by :func:`resolve_optuna_cv`. """ if y.shape[0] != x.shape[0]: @@ -520,6 +521,10 @@ def _create_objective(param_grid_func, learner, x, y, cv, scoring_method): """ Create an Optuna objective function for hyperparameter optimization. + Uses a manual fold loop with per-fold intermediate reporting so that + Optuna pruners (e.g. ``MedianPruner``, ``HyperbandPruner``) can stop + unpromising trials early after each cross-validation fold. + Parameters ---------- param_grid_func : callable @@ -531,8 +536,10 @@ def _create_objective(param_grid_func, learner, x, y, cv, scoring_method): Features (full dataset). y : np.ndarray Target variable (full dataset). - cv : cross-validation generator - KFold or similar cross-validation splitter. + cv : cross-validation splitter or list of (train, test) pairs + A scikit-learn cross-validation splitter (has a ``.split()`` method) or + a pre-made list of ``(train_indices, test_indices)`` pairs as returned + by :func:`resolve_optuna_cv`. scoring_method : str, callable or None Scoring argument for cross-validation. ``None`` delegates to the estimator's default ``score`` implementation. @@ -542,6 +549,10 @@ def _create_objective(param_grid_func, learner, x, y, cv, scoring_method): callable Objective function for Optuna optimization.
""" + # Build scorer once; scoring_method is already resolved (non-None) by _resolve_optuna_scoring + scorer = check_scoring(clone(learner), scoring=scoring_method) + # Pre-compute splits: cv may be a splitter (has .split) or a list of (train, test) pairs + splits = cv if isinstance(cv, list) else list(cv.split(x, y)) def objective(trial): """Objective function for Optuna optimization.""" @@ -554,21 +565,19 @@ def objective(trial): f"Example: def params(trial): return {{'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1)}}" ) - # Clone learner and set parameters - estimator = clone(learner).set_params(**params) - - # Perform cross-validation on full dataset - scores = cross_val_score( - estimator, - x, - y, - cv=cv, - scoring=scoring_method, - error_score="raise", - ) + # Manual fold loop with per-fold intermediate reporting for pruning support + fold_scores = [] + for step, (train_idx, test_idx) in enumerate(splits): + est = clone(learner).set_params(**params) + est.fit(x[train_idx], y[train_idx]) + fold_scores.append(scorer(est, x[test_idx], y[test_idx])) + + # Report running mean after each fold so pruners can act between folds + trial.report(float(np.nanmean(fold_scores)), step) + if trial.should_prune(): + raise optuna.TrialPruned() - # Return mean test score - return np.nanmean(scores) + return float(np.nanmean(fold_scores)) return objective From b0026dabd59f4880e2e6d931ffda52c05b84b220 Mon Sep 17 00:00:00 2001 From: SvenKlaassen Date: Sun, 1 Mar 2026 08:58:52 +0100 Subject: [PATCH 20/38] add nuisance evaluation --- doubleml/double_ml_scalar.py | 171 +++++++++++ doubleml/irm/irm_scalar.py | 14 + .../test_irm_scalar_evaluate_learners.py | 265 ++++++++++++++++++ doubleml/plm/plr_scalar.py | 39 ++- .../test_plr_scalar_evaluate_learners.py | 251 +++++++++++++++++ 5 files changed, 729 insertions(+), 11 deletions(-) create mode 100644
doubleml/plm/tests/test_plr_scalar_evaluate_learners.py diff --git a/doubleml/double_ml_scalar.py b/doubleml/double_ml_scalar.py index bd39dc9e..f685cce9 100644 --- a/doubleml/double_ml_scalar.py +++ b/doubleml/double_ml_scalar.py @@ -9,6 +9,7 @@ from .utils._tune_optuna import DMLOptunaResult import numpy as np +from sklearn.metrics import log_loss, root_mean_squared_error from .data.base_data import DoubleMLBaseData from .double_ml_base import DoubleMLBase @@ -103,6 +104,8 @@ def __init__( # Initialize storage for predictions and results self._predictions: dict[str, np.ndarray] | None = None + self._nuisance_targets: dict[str, np.ndarray] | None = None + self._nuisance_loss: dict[str, np.ndarray] | None = None self._all_thetas: np.ndarray | None = None self._all_ses: np.ndarray | None = None self._psi: np.ndarray | None = None @@ -184,6 +187,51 @@ def predictions(self) -> dict[str, np.ndarray]: raise ValueError("Predictions not available. Call fit() first.") return self._predictions + @property + def nuisance_targets(self) -> dict[str, np.ndarray]: + """ + Target arrays used for nuisance loss evaluation. + + Returns + ------- + dict[str, np.ndarray] + Dictionary with target arrays of shape ``(n_obs, n_rep)`` per learner. + Entries are all-NaN for learners whose targets cannot be recovered post-fit + (e.g. PLR ``ml_g``). + + Raises + ------ + ValueError + If the model has not been fitted yet. + """ + if self._nuisance_targets is None: + raise ValueError("Nuisance targets not available. Call fit() or fit_nuisance_models() first.") + return self._nuisance_targets + + @property + def nuisance_loss(self) -> dict[str, np.ndarray]: + """ + Out-of-sample loss per learner, shape ``(n_rep,)``. + + Uses RMSE for regressors and logloss for classifiers, determined automatically + from the registered learner type. Entries are NaN for learners whose targets are + unavailable or whose type cannot be determined (external predictions without a + registered learner). 
+ + Returns + ------- + dict[str, np.ndarray] + Dictionary with loss arrays of shape ``(n_rep,)`` per learner. + + Raises + ------ + ValueError + If the model has not been fitted yet. + """ + if self._nuisance_loss is None: + raise ValueError("Nuisance loss not available. Call fit() or fit_nuisance_models() first.") + return self._nuisance_loss + @property def smpls(self) -> list: """ @@ -463,6 +511,16 @@ def fit_nuisance_models( # Post-nuisance prediction checks (model-specific) self._post_nuisance_checks() + # Build nuisance targets: _get_nuisance_targets() may return None for some learners + # (e.g. PLR ml_g whose target y - θ·d varies per rep). Convert None → all-NaN array + # so _nuisance_targets is always dict[str, np.ndarray]. + raw_targets = self._get_nuisance_targets() + self._nuisance_targets = {} + for name in self.required_learners: + t = raw_targets.get(name) + self._nuisance_targets[name] = t if isinstance(t, np.ndarray) else np.full((self._n_obs, self.n_rep), np.nan) + self._nuisance_loss = self.evaluate_learners() + return self def estimate_causal_parameters(self) -> Self: @@ -744,6 +802,8 @@ def _construct_framework(self) -> DoubleMLFramework: def _reset_fit_state(self) -> None: """Clear fit-dependent state after changing the sample splitting.""" self._predictions = None + self._nuisance_targets = None + self._nuisance_loss = None self._framework = None self._all_thetas = None self._all_ses = None @@ -753,11 +813,122 @@ def _reset_fit_state(self) -> None: self._i_rep = None self._i_fold = None + def evaluate_learners( + self, + learners: list[str] | None = None, + metric: Callable | None = None, + ) -> dict[str, np.ndarray]: + """ + Evaluate fitted learners on cross-validated predictions with a custom metric. + + Parameters + ---------- + learners : list of str or None, optional + Names of learners to evaluate. Default is all :attr:`required_learners`. 
+ metric : callable or None, optional + Metric function with signature ``(y_true, y_pred) -> float``. Any sklearn + metric function (e.g. ``sklearn.metrics.root_mean_squared_error``, + ``sklearn.metrics.r2_score``, ``sklearn.metrics.log_loss``) or any custom + callable with the same signature can be passed. + If ``None``, automatically selects ``root_mean_squared_error`` for regressors + and ``log_loss`` for classifiers based on the registered learner type. + + Returns + ------- + dict[str, np.ndarray] + Dictionary with loss arrays of shape ``(n_rep,)`` per learner. + Entries are NaN for repetitions with no valid (non-NaN) targets or for + learners whose type cannot be determined (external predictions without a + registered learner). + + Raises + ------ + ValueError + If the model has not been fitted yet, or if a requested learner name is not + in :attr:`required_learners`. + TypeError + If ``metric`` is not callable. + ValueError + If the metric returns a non-finite value. + + Examples + -------- + >>> from sklearn.metrics import root_mean_squared_error, r2_score, log_loss + >>> model.evaluate_learners() + >>> model.evaluate_learners(metric=r2_score) + >>> model.evaluate_learners(learners=["ml_m"], metric=log_loss) + """ + if self._nuisance_targets is None: + raise ValueError("Nuisance targets not available. Call fit() or fit_nuisance_models() first.") + if metric is not None and not callable(metric): + raise TypeError(f"metric must be callable or None. Got {type(metric).__name__}.") + + if learners is None: + learners = self.required_learners + + invalid = [name for name in learners if name not in self.required_learners] + if invalid: + raise ValueError(f"Invalid learner(s) {invalid}. 
Must be a subset of {self.required_learners}.") + + n_rep = self.n_rep + result: dict[str, np.ndarray] = {} + + for name in learners: + target = self._nuisance_targets[name] # (n_obs, n_rep) + pred = self._predictions[name] # (n_obs, n_rep) + + loss_arr = np.full(n_rep, np.nan) + for i_rep in range(n_rep): + mask = ~np.isnan(target[:, i_rep]) + if not mask.any(): + continue + + t, p = target[mask, i_rep], pred[mask, i_rep] + + if metric is None: + if name not in self._learners: + # No registered learner type (external predictions) — infer from target values + unique_vals = np.unique(t) + is_binary = len(unique_vals) <= 2 and np.all(np.isin(unique_vals, [0, 1])) + fn: Callable = log_loss if is_binary else root_mean_squared_error + else: + fn = log_loss if self._learners[name].is_classifier else root_mean_squared_error + else: + fn = metric + + res = fn(t, p) + if not np.isfinite(res): + raise ValueError( + f"Evaluation of learner '{name}' for repetition {i_rep} returned " f"a non-finite value: {res}." + ) + loss_arr[i_rep] = res + + result[name] = loss_arr + + return result + # ==================== Abstract Methods (Must be Implemented by Subclasses) ==================== def _post_nuisance_checks(self) -> None: """Post-nuisance prediction validation hook. Override in subclasses for model-specific checks.""" + @abstractmethod + def _get_nuisance_targets(self) -> dict[str, np.ndarray | None]: + """ + Return target arrays for nuisance loss evaluation. + + Subclasses must implement this to provide targets for each learner. + Return ``None`` for learners whose targets cannot be recovered post-fit + (e.g. PLR ``ml_g`` whose target ``y - θ·d`` varies per repetition). + + Returns + ------- + dict[str, np.ndarray or None] + Dictionary mapping learner names to target arrays of shape ``(n_obs, n_rep)``, + or ``None`` where targets are not available. 
+ """ + pass + @abstractmethod def _nuisance_est( self, diff --git a/doubleml/irm/irm_scalar.py b/doubleml/irm/irm_scalar.py index f6983368..4305eb27 100644 --- a/doubleml/irm/irm_scalar.py +++ b/doubleml/irm/irm_scalar.py @@ -305,6 +305,20 @@ def _nuisance_est( # ==================== Score Elements ==================== + def _get_nuisance_targets(self) -> dict[str, np.ndarray | None]: + """Return target arrays for nuisance loss evaluation. + + ml_g0 and ml_g1 are fitted only on the d==0 and d==1 subgroups respectively, + so targets for the opposite group are NaN. ml_m target is d (binary treatment). + """ + y = self._dml_data.y + d = self._dml_data.d + return { + "ml_g0": np.tile(np.where(d == 0, y, np.nan)[:, np.newaxis], (1, self.n_rep)), + "ml_g1": np.tile(np.where(d == 1, y, np.nan)[:, np.newaxis], (1, self.n_rep)), + "ml_m": np.tile(d[:, np.newaxis], (1, self.n_rep)), + } + def _get_score_elements(self) -> dict[str, np.ndarray]: y = self._dml_data.y d = self._dml_data.d diff --git a/doubleml/irm/tests/test_irm_scalar_evaluate_learners.py b/doubleml/irm/tests/test_irm_scalar_evaluate_learners.py new file mode 100644 index 00000000..40b7a8bd --- /dev/null +++ b/doubleml/irm/tests/test_irm_scalar_evaluate_learners.py @@ -0,0 +1,265 @@ +"""Tests for evaluate_learners(), nuisance_loss, and nuisance_targets on IRM scalar models.""" + +import numpy as np +import pytest +from sklearn.linear_model import Lasso, LogisticRegression +from sklearn.metrics import log_loss, mean_absolute_error, r2_score, root_mean_squared_error + +from doubleml.irm.datasets import make_irm_data +from doubleml.irm.irm_scalar import IRM + +N_OBS = 500 +N_FOLDS = 5 +N_REP = 2 + + +@pytest.fixture(scope="module") +def irm_data(): + """Shared IRM dataset.""" + np.random.seed(3141) + return make_irm_data(n_obs=N_OBS, dim_x=5) + + +@pytest.fixture(scope="module", params=["ATE", "ATTE"]) +def score(request): + """Parametrize over IRM score functions.""" + return request.param + + 
+@pytest.fixture(scope="module") +def fitted_irm(score, irm_data): + """Fit an IRM model for the given score.""" + model = IRM(irm_data, score=score) + model.set_learners(ml_g=Lasso(), ml_m=LogisticRegression()) + model.fit(n_folds=N_FOLDS, n_rep=N_REP) + return model + + +# ==================== nuisance_loss ==================== + + +@pytest.mark.ci +def test_nuisance_loss_type_and_shape(fitted_irm): + """nuisance_loss is a dict of (n_rep,) arrays; all entries are finite.""" + loss = fitted_irm.nuisance_loss + + assert isinstance(loss, dict) + for name in ["ml_g0", "ml_g1", "ml_m"]: + assert isinstance(loss[name], np.ndarray) + assert loss[name].shape == (N_REP,) + assert np.all(np.isfinite(loss[name])) + + +@pytest.mark.ci +def test_nuisance_loss_ml_m_is_logloss(fitted_irm): + """ml_m loss uses logloss (classifier path) — positive finite values.""" + loss = fitted_irm.nuisance_loss + assert np.all(loss["ml_m"] > 0) + + +@pytest.mark.ci +def test_nuisance_loss_ml_g_is_rmse(fitted_irm): + """ml_g0 and ml_g1 loss uses RMSE (regressor path) — positive finite values.""" + loss = fitted_irm.nuisance_loss + assert np.all(loss["ml_g0"] > 0) + assert np.all(loss["ml_g1"] > 0) + + +# ==================== nuisance_targets ==================== + + +@pytest.mark.ci +def test_nuisance_targets_type_and_shape(fitted_irm): + """nuisance_targets is a dict; all entries are (n_obs, n_rep) arrays.""" + targets = fitted_irm.nuisance_targets + + assert isinstance(targets, dict) + for name in ["ml_g0", "ml_g1", "ml_m"]: + assert isinstance(targets[name], np.ndarray) + assert targets[name].shape == (N_OBS, N_REP) + + +@pytest.mark.ci +def test_nuisance_targets_ml_g0_partial_nan(fitted_irm, irm_data): + """ml_g0 target is y where d==0 and NaN where d==1.""" + targets = fitted_irm.nuisance_targets + d = irm_data.d + + for i_rep in range(N_REP): + col = targets["ml_g0"][:, i_rep] + assert np.all(np.isnan(col[d == 1])) + assert np.all(np.isfinite(col[d == 0])) + + +@pytest.mark.ci +def 
test_nuisance_targets_ml_g1_partial_nan(fitted_irm, irm_data): + """ml_g1 target is y where d==1 and NaN where d==0.""" + targets = fitted_irm.nuisance_targets + d = irm_data.d + + for i_rep in range(N_REP): + col = targets["ml_g1"][:, i_rep] + assert np.all(np.isnan(col[d == 0])) + assert np.all(np.isfinite(col[d == 1])) + + +@pytest.mark.ci +def test_nuisance_targets_ml_m_equals_d(fitted_irm, irm_data): + """ml_m target is d broadcast across repetitions.""" + targets = fitted_irm.nuisance_targets + d = irm_data.d + for i_rep in range(N_REP): + np.testing.assert_array_equal(targets["ml_m"][:, i_rep], d) + + +# ==================== evaluate_learners ==================== + + +@pytest.mark.ci +def test_evaluate_learners_default(fitted_irm): + """Default evaluate_learners() returns finite values with correct shape.""" + result = fitted_irm.evaluate_learners() + + assert isinstance(result, dict) + for name in ["ml_g0", "ml_g1", "ml_m"]: + assert isinstance(result[name], np.ndarray) + assert result[name].shape == (N_REP,) + assert np.all(np.isfinite(result[name])) + + +@pytest.mark.ci +def test_evaluate_learners_logloss_ml_m_matches_nuisance_loss(fitted_irm): + """evaluate_learners with log_loss on ml_m matches nuisance_loss['ml_m'].""" + result = fitted_irm.evaluate_learners(learners=["ml_m"], metric=log_loss) + loss = fitted_irm.nuisance_loss + + np.testing.assert_allclose(result["ml_m"], loss["ml_m"], rtol=1e-9) + + +@pytest.mark.ci +def test_evaluate_learners_rmse_ml_g_matches_nuisance_loss(fitted_irm): + """evaluate_learners with RMSE on ml_g0/g1 matches nuisance_loss.""" + result = fitted_irm.evaluate_learners(learners=["ml_g0", "ml_g1"], metric=root_mean_squared_error) + loss = fitted_irm.nuisance_loss + + np.testing.assert_allclose(result["ml_g0"], loss["ml_g0"], rtol=1e-9) + np.testing.assert_allclose(result["ml_g1"], loss["ml_g1"], rtol=1e-9) + + +@pytest.mark.ci +def test_evaluate_learners_partial_nans_ml_g(fitted_irm): + """RMSE for ml_g0/g1 is finite 
despite NaN targets for the opposite treatment group.""" + result = fitted_irm.evaluate_learners(learners=["ml_g0", "ml_g1"], metric=root_mean_squared_error) + + assert np.all(np.isfinite(result["ml_g0"])) + assert np.all(np.isfinite(result["ml_g1"])) + + +@pytest.mark.ci +def test_evaluate_learners_r2(fitted_irm): + """evaluate_learners with r2_score returns values <= 1 with correct shape.""" + result = fitted_irm.evaluate_learners(learners=["ml_g0", "ml_g1"], metric=r2_score) + + for name in ["ml_g0", "ml_g1"]: + assert result[name].shape == (N_REP,) + assert np.all(result[name] <= 1.0) + + +@pytest.mark.ci +def test_evaluate_learners_mae(fitted_irm): + """evaluate_learners with mean_absolute_error returns positive values.""" + result = fitted_irm.evaluate_learners(learners=["ml_g0", "ml_g1"], metric=mean_absolute_error) + + for name in ["ml_g0", "ml_g1"]: + assert result[name].shape == (N_REP,) + assert np.all(result[name] > 0) + + +@pytest.mark.ci +def test_evaluate_learners_subset(fitted_irm): + """Requesting only ml_m returns only the ml_m key.""" + result = fitted_irm.evaluate_learners(learners=["ml_m"]) + + assert list(result.keys()) == ["ml_m"] + assert result["ml_m"].shape == (N_REP,) + + +@pytest.mark.ci +def test_evaluate_learners_custom_metric(fitted_irm): + """A custom lambda metric produces consistent results.""" + custom_mae = lambda y_true, y_pred: np.mean(np.abs(y_true - y_pred)) # noqa: E731 + result_custom = fitted_irm.evaluate_learners(learners=["ml_g0"], metric=custom_mae) + result_sklearn = fitted_irm.evaluate_learners(learners=["ml_g0"], metric=mean_absolute_error) + + np.testing.assert_allclose(result_custom["ml_g0"], result_sklearn["ml_g0"], rtol=1e-9) + + +# ==================== Before-fit errors ==================== + + +@pytest.mark.ci +def test_evaluate_learners_before_fit_raises(irm_data): + """evaluate_learners() raises ValueError before fit().""" + model = IRM(irm_data) + model.set_learners(ml_g=Lasso(), ml_m=LogisticRegression()) + 
+ msg = r"Call fit\(\) or fit_nuisance_models\(\) first" + with pytest.raises(ValueError, match=msg): + model.evaluate_learners() + + +@pytest.mark.ci +def test_nuisance_loss_before_fit_raises(irm_data): + """nuisance_loss raises ValueError before fit().""" + model = IRM(irm_data) + model.set_learners(ml_g=Lasso(), ml_m=LogisticRegression()) + + msg = r"Call fit\(\) or fit_nuisance_models\(\) first" + with pytest.raises(ValueError, match=msg): + _ = model.nuisance_loss + + +@pytest.mark.ci +def test_nuisance_targets_before_fit_raises(irm_data): + """nuisance_targets raises ValueError before fit().""" + model = IRM(irm_data) + model.set_learners(ml_g=Lasso(), ml_m=LogisticRegression()) + + msg = r"Call fit\(\) or fit_nuisance_models\(\) first" + with pytest.raises(ValueError, match=msg): + _ = model.nuisance_targets + + +# ==================== Input validation ==================== + + +@pytest.mark.ci +def test_evaluate_learners_invalid_learner(fitted_irm): + """Requesting an unknown learner name raises ValueError.""" + with pytest.raises(ValueError, match=r"Invalid learner"): + fitted_irm.evaluate_learners(learners=["ml_g0", "ml_unknown"]) + + +@pytest.mark.ci +def test_evaluate_learners_invalid_metric(fitted_irm): + """Passing a non-callable metric raises TypeError.""" + with pytest.raises(TypeError, match=r"metric must be callable"): + fitted_irm.evaluate_learners(metric="rmse") + + +# ==================== Reset behaviour ==================== + + +@pytest.mark.ci +def test_reset_clears_nuisance(irm_data): + """After draw_sample_splitting(), nuisance_loss raises ValueError.""" + model = IRM(irm_data) + model.set_learners(ml_g=Lasso(), ml_m=LogisticRegression()) + model.fit(n_folds=N_FOLDS, n_rep=N_REP) + assert model.nuisance_loss is not None + + model.draw_sample_splitting(n_folds=N_FOLDS, n_rep=N_REP) + + msg = r"Call fit\(\) or fit_nuisance_models\(\) first" + with pytest.raises(ValueError, match=msg): + _ = model.nuisance_loss diff --git 
a/doubleml/plm/plr_scalar.py b/doubleml/plm/plr_scalar.py index caf290de..0e1c149e 100644 --- a/doubleml/plm/plr_scalar.py +++ b/doubleml/plm/plr_scalar.py @@ -5,7 +5,7 @@ from __future__ import annotations import warnings -from typing import Any, ClassVar, Dict, List, Optional, Self +from typing import Any, ClassVar, Self import numpy as np from sklearn.base import clone @@ -38,7 +38,7 @@ class PLR(LinearScoreMixin): """ # Define learner specifications for PLR - _LEARNER_SPECS: ClassVar[Dict[str, LearnerSpec]] = { + _LEARNER_SPECS: ClassVar[dict[str, LearnerSpec]] = { "ml_l": LearnerSpec("ml_l", allow_regressor=True, allow_classifier=True, binary_data_check="outcome"), "ml_m": LearnerSpec("ml_m", allow_regressor=True, allow_classifier=True, binary_data_check="treatment"), "ml_g": LearnerSpec("ml_g", allow_regressor=True, allow_classifier=False), @@ -48,9 +48,9 @@ def __init__( self, obj_dml_data: DoubleMLData, score: str = "partialling out", - ml_l: Optional[object] = None, - ml_m: Optional[object] = None, - ml_g: Optional[object] = None, + ml_l: object | None = None, + ml_m: object | None = None, + ml_g: object | None = None, ): """ Initialize PLR model. @@ -88,7 +88,7 @@ def __init__( self.set_learners(ml_l=ml_l, ml_m=ml_m, ml_g=ml_g) @property - def required_learners(self) -> List[str]: + def required_learners(self) -> list[str]: """Required learners for current score.""" names = ["ml_l", "ml_m"] if self.score == "IV-type": @@ -97,9 +97,9 @@ def required_learners(self) -> List[str]: def set_learners( self, - ml_l: Optional[object] = None, - ml_m: Optional[object] = None, - ml_g: Optional[object] = None, + ml_l: object | None = None, + ml_m: object | None = None, + ml_g: object | None = None, ) -> Self: """ Set the learners for nuisance estimation. 
@@ -220,7 +220,7 @@ def _nuisance_est( test_idx: np.ndarray, i_rep: int, i_fold: int, - external_predictions: Optional[Dict[str, np.ndarray]] = None, + external_predictions: dict[str, np.ndarray] | None = None, ) -> None: x = self._dml_data.x y = self._dml_data.y @@ -356,7 +356,24 @@ def _get_tuning_data( raise ValueError(f"Unknown learner '{learner_name}' for PLR.") - def _get_score_elements(self) -> Dict[str, np.ndarray]: + def _get_nuisance_targets(self) -> dict[str, np.ndarray | None]: + """Return target arrays for nuisance loss evaluation. + + Returns y for ml_l, d for ml_m. For IV-type score, ml_g target is None because + the adjusted outcome y - θ·d depends on the estimated parameter and varies per + repetition, so it cannot be recovered post-fit. + """ + y = self._dml_data.y + d = self._dml_data.d + targets: dict[str, np.ndarray | None] = { + "ml_l": np.tile(y[:, np.newaxis], (1, self.n_rep)), + "ml_m": np.tile(d[:, np.newaxis], (1, self.n_rep)), + } + if "ml_g" in self.required_learners: + targets["ml_g"] = None + return targets + + def _get_score_elements(self) -> dict[str, np.ndarray]: y = self._dml_data.y d = self._dml_data.d diff --git a/doubleml/plm/tests/test_plr_scalar_evaluate_learners.py b/doubleml/plm/tests/test_plr_scalar_evaluate_learners.py new file mode 100644 index 00000000..bb843274 --- /dev/null +++ b/doubleml/plm/tests/test_plr_scalar_evaluate_learners.py @@ -0,0 +1,251 @@ +"""Tests for evaluate_learners(), nuisance_loss, and nuisance_targets on PLR scalar models.""" + +import numpy as np +import pytest +from sklearn.linear_model import Lasso +from sklearn.metrics import mean_absolute_error, r2_score, root_mean_squared_error + +from doubleml.plm.datasets import make_plr_CCDDHNR2018 +from doubleml.plm.plr_scalar import PLR + +N_OBS = 500 +N_FOLDS = 5 +N_REP = 2 + + +@pytest.fixture(scope="module") +def plr_data(): + """Shared PLR dataset.""" + np.random.seed(3141) + return make_plr_CCDDHNR2018(n_obs=N_OBS, dim_x=5) + + 
+@pytest.fixture(scope="module", params=["partialling out", "IV-type"]) +def score(request): + """Parametrize over PLR score functions.""" + return request.param + + +@pytest.fixture(scope="module") +def fitted_plr(score, plr_data): + """Fit a PLR model for the given score.""" + model = PLR(plr_data, score=score) + model.set_learners(ml_l=Lasso(), ml_m=Lasso()) + model.fit(n_folds=N_FOLDS, n_rep=N_REP) + return model + + +# ==================== nuisance_loss ==================== + + +@pytest.mark.ci +def test_nuisance_loss_type_and_shape(fitted_plr): + """nuisance_loss is a dict of (n_rep,) arrays; ml_l/ml_m are finite; ml_g is NaN.""" + loss = fitted_plr.nuisance_loss + + assert isinstance(loss, dict) + for name in ["ml_l", "ml_m"]: + assert isinstance(loss[name], np.ndarray) + assert loss[name].shape == (N_REP,) + assert np.all(np.isfinite(loss[name])) + + if fitted_plr.score == "IV-type": + assert isinstance(loss["ml_g"], np.ndarray) + assert loss["ml_g"].shape == (N_REP,) + assert np.all(np.isnan(loss["ml_g"])) + + +@pytest.mark.ci +def test_nuisance_loss_positive(fitted_plr): + """RMSE values for ml_l and ml_m are strictly positive.""" + loss = fitted_plr.nuisance_loss + assert np.all(loss["ml_l"] > 0) + assert np.all(loss["ml_m"] > 0) + + +# ==================== nuisance_targets ==================== + + +@pytest.mark.ci +def test_nuisance_targets_type_and_shape(fitted_plr): + """nuisance_targets is a dict; ml_l/ml_m have real values; ml_g is all-NaN (IV-type).""" + targets = fitted_plr.nuisance_targets + + assert isinstance(targets, dict) + for name in ["ml_l", "ml_m"]: + assert isinstance(targets[name], np.ndarray) + assert targets[name].shape == (N_OBS, N_REP) + assert not np.all(np.isnan(targets[name])) + + if fitted_plr.score == "IV-type": + assert isinstance(targets["ml_g"], np.ndarray) + assert targets["ml_g"].shape == (N_OBS, N_REP) + assert np.all(np.isnan(targets["ml_g"])) + + +@pytest.mark.ci +def test_nuisance_targets_ml_l_equals_y(fitted_plr, 
plr_data): + """ml_l target is y broadcast across repetitions.""" + targets = fitted_plr.nuisance_targets + y = plr_data.y + for i_rep in range(N_REP): + np.testing.assert_array_equal(targets["ml_l"][:, i_rep], y) + + +@pytest.mark.ci +def test_nuisance_targets_ml_m_equals_d(fitted_plr, plr_data): + """ml_m target is d broadcast across repetitions.""" + targets = fitted_plr.nuisance_targets + d = plr_data.d + for i_rep in range(N_REP): + np.testing.assert_array_equal(targets["ml_m"][:, i_rep], d) + + +# ==================== evaluate_learners ==================== + + +@pytest.mark.ci +def test_evaluate_learners_default(fitted_plr): + """Default evaluate_learners() returns RMSE for ml_l and ml_m.""" + result = fitted_plr.evaluate_learners() + + assert isinstance(result, dict) + for name in ["ml_l", "ml_m"]: + assert isinstance(result[name], np.ndarray) + assert result[name].shape == (N_REP,) + assert np.all(result[name] > 0) + + +@pytest.mark.ci +def test_evaluate_learners_rmse_matches_nuisance_loss(fitted_plr): + """evaluate_learners with root_mean_squared_error matches nuisance_loss for ml_l and ml_m.""" + result = fitted_plr.evaluate_learners(metric=root_mean_squared_error) + loss = fitted_plr.nuisance_loss + + np.testing.assert_allclose(result["ml_l"], loss["ml_l"], rtol=1e-9) + np.testing.assert_allclose(result["ml_m"], loss["ml_m"], rtol=1e-9) + + +@pytest.mark.ci +def test_evaluate_learners_r2(fitted_plr): + """evaluate_learners with r2_score returns values <= 1 with correct shape.""" + result = fitted_plr.evaluate_learners(learners=["ml_l", "ml_m"], metric=r2_score) + + for name in ["ml_l", "ml_m"]: + assert result[name].shape == (N_REP,) + assert np.all(result[name] <= 1.0) + + +@pytest.mark.ci +def test_evaluate_learners_mae(fitted_plr): + """evaluate_learners with mean_absolute_error returns positive values with correct shape.""" + result = fitted_plr.evaluate_learners(learners=["ml_l", "ml_m"], metric=mean_absolute_error) + + for name in ["ml_l", "ml_m"]: 
+ assert result[name].shape == (N_REP,) + assert np.all(result[name] > 0) + + +@pytest.mark.ci +def test_evaluate_learners_subset(fitted_plr): + """Requesting only ml_l returns only the ml_l key.""" + result = fitted_plr.evaluate_learners(learners=["ml_l"]) + + assert list(result.keys()) == ["ml_l"] + assert result["ml_l"].shape == (N_REP,) + + +@pytest.mark.ci +def test_evaluate_learners_custom_metric(fitted_plr): + """A custom lambda metric produces consistent results.""" + custom_mae = lambda y_true, y_pred: np.mean(np.abs(y_true - y_pred)) # noqa: E731 + result_custom = fitted_plr.evaluate_learners(learners=["ml_l"], metric=custom_mae) + result_sklearn = fitted_plr.evaluate_learners(learners=["ml_l"], metric=mean_absolute_error) + + np.testing.assert_allclose(result_custom["ml_l"], result_sklearn["ml_l"], rtol=1e-9) + + +# ==================== Before-fit errors ==================== + + +@pytest.mark.ci +def test_evaluate_learners_before_fit_raises(plr_data): + """evaluate_learners() raises ValueError before fit_nuisance_models().""" + model = PLR(plr_data) + model.set_learners(ml_l=Lasso(), ml_m=Lasso()) + + msg = r"Call fit\(\) or fit_nuisance_models\(\) first" + with pytest.raises(ValueError, match=msg): + model.evaluate_learners() + + +@pytest.mark.ci +def test_evaluate_learners_after_reset_raises(plr_data): + """evaluate_learners() raises ValueError after draw_sample_splitting() resets fit state.""" + model = PLR(plr_data) + model.set_learners(ml_l=Lasso(), ml_m=Lasso()) + model.draw_sample_splitting(n_folds=N_FOLDS, n_rep=N_REP) + model.fit_nuisance_models() + # Re-drawing splits resets fit state + model.draw_sample_splitting(n_folds=N_FOLDS, n_rep=N_REP) + + msg = r"Call fit\(\) or fit_nuisance_models\(\) first" + with pytest.raises(ValueError, match=msg): + model.evaluate_learners() + + +@pytest.mark.ci +def test_nuisance_loss_before_fit_raises(plr_data): + """nuisance_loss raises ValueError before fit_nuisance_models().""" + model = PLR(plr_data) + 
model.set_learners(ml_l=Lasso(), ml_m=Lasso()) + + msg = r"Call fit\(\) or fit_nuisance_models\(\) first" + with pytest.raises(ValueError, match=msg): + _ = model.nuisance_loss + + +@pytest.mark.ci +def test_nuisance_targets_before_fit_raises(plr_data): + """nuisance_targets raises ValueError before fit_nuisance_models().""" + model = PLR(plr_data) + model.set_learners(ml_l=Lasso(), ml_m=Lasso()) + + msg = r"Call fit\(\) or fit_nuisance_models\(\) first" + with pytest.raises(ValueError, match=msg): + _ = model.nuisance_targets + + +# ==================== Input validation ==================== + + +@pytest.mark.ci +def test_evaluate_learners_invalid_learner(fitted_plr): + """Requesting an unknown learner name raises ValueError.""" + with pytest.raises(ValueError, match=r"Invalid learner"): + fitted_plr.evaluate_learners(learners=["ml_l", "ml_unknown"]) + + +@pytest.mark.ci +def test_evaluate_learners_invalid_metric(fitted_plr): + """Passing a non-callable metric raises TypeError.""" + with pytest.raises(TypeError, match=r"metric must be callable"): + fitted_plr.evaluate_learners(metric="rmse") + + +# ==================== Reset behaviour ==================== + + +@pytest.mark.ci +def test_reset_clears_nuisance(plr_data): + """After draw_sample_splitting(), nuisance_loss raises ValueError.""" + model = PLR(plr_data) + model.set_learners(ml_l=Lasso(), ml_m=Lasso()) + model.fit(n_folds=N_FOLDS, n_rep=N_REP) + assert model.nuisance_loss is not None + + model.draw_sample_splitting(n_folds=N_FOLDS, n_rep=N_REP) + + msg = r"Call fit\(\) or fit_nuisance_models\(\) first" + with pytest.raises(ValueError, match=msg): + _ = model.nuisance_loss From 050fa27035c03bc3eb2816339ec018ba37d9add6 Mon Sep 17 00:00:00 2001 From: SvenKlaassen Date: Sun, 1 Mar 2026 12:00:02 +0100 Subject: [PATCH 21/38] Implement sensitivity analysis for scalar models in DoubleML - Added `_sensitivity_element_est` method to `DoubleMLScalar`, `IRM`, and `PLR` classes to compute sensitivity elements including 
sigma2, nu2, and their influence functions. - Introduced `sensitivity_elements` property to retrieve computed sensitivity elements after model fitting. - Implemented validation checks for sensitivity elements in `DoubleMLScalar`. - Added exception handling for sensitivity analysis methods in `IRM` and `PLR` classes to ensure proper input types and values. - Created unit tests for sensitivity analysis, including checks for element shapes, bounds, and exception handling in both `IRM` and `PLR` models. - Ensured compatibility of sensitivity elements between scalar and legacy models in comparison tests. --- .claude/rules/testing-conventions.md | 61 +++++++++++++- doubleml/double_ml_scalar.py | 81 ++++++++++++++++++ doubleml/irm/irm_scalar.py | 61 ++++++++++++++ .../irm/tests/test_irm_scalar_exceptions.py | 65 ++++++++++++++ .../irm/tests/test_irm_scalar_return_types.py | 78 +++++++++++++++++ .../irm/tests/test_irm_scalar_sensitivity.py | 84 +++++++++++++++++++ doubleml/irm/tests/test_irm_scalar_vs_irm.py | 31 +++++++ doubleml/plm/plr_scalar.py | 56 +++++++++++++ .../plm/tests/test_plr_scalar_exceptions.py | 66 +++++++++++++++ .../plm/tests/test_plr_scalar_return_types.py | 75 +++++++++++++++++ .../plm/tests/test_plr_scalar_sensitivity.py | 81 ++++++++++++++++++ doubleml/plm/tests/test_plr_scalar_vs_plr.py | 31 +++++++ 12 files changed, 769 insertions(+), 1 deletion(-) create mode 100644 doubleml/irm/tests/test_irm_scalar_sensitivity.py create mode 100644 doubleml/plm/tests/test_plr_scalar_sensitivity.py diff --git a/.claude/rules/testing-conventions.md b/.claude/rules/testing-conventions.md index e48b5fa8..46a83b42 100644 --- a/.claude/rules/testing-conventions.md +++ b/.claude/rules/testing-conventions.md @@ -137,9 +137,66 @@ For models with `_LEARNER_PARAM_ALIASES` (e.g., IRM `"ml_g"` → `["ml_g0", "ml_ --- +## Evaluate Learners Tests (`test__scalar_evaluate_learners.py`) + +Scalar models with `evaluate_learners()` require a dedicated test file. 
Constants: `N_OBS=500`, `N_FOLDS=5`, `N_REP=2`. Score-parametrized fixture (same pattern as tuning tests). + +**Required tests:** + +| Test | Checks | +|------|--------| +| `test_nuisance_loss_type_and_shape` | `dict`; each value `shape == (N_REP,)`; finite or NaN as expected | +| `test_nuisance_loss_positive` | RMSE > 0 for learners with real targets | +| `test_nuisance_targets_type_and_shape` | `shape == (N_OBS, N_REP)`; NaN arrays for unknown targets | +| `test_nuisance_targets_correct_values` | ml_l target == y; ml_m target == d (model-specific) | +| `test_evaluate_learners_default` | Default metric returns finite positive values | +| `test_evaluate_learners_rmse_matches_nuisance_loss` | `evaluate_learners(root_mean_squared_error)` equals `nuisance_loss` | +| `test_evaluate_learners_r2` | R² ≤ 1; correct shape | +| `test_evaluate_learners_mae` | MAE > 0; correct shape | +| `test_evaluate_learners_subset` | `learners=["ml_l"]` returns only `"ml_l"` key | +| `test_evaluate_learners_custom_metric` | Lambda metric matches sklearn equivalent | +| `test_evaluate_learners_before_fit_raises` | `ValueError` before `fit_nuisance_models()` | +| `test_evaluate_learners_after_reset_raises` | `ValueError` after `draw_sample_splitting()` | +| `test_nuisance_loss_before_fit_raises` | `ValueError` on `.nuisance_loss` before fit | +| `test_nuisance_targets_before_fit_raises` | `ValueError` on `.nuisance_targets` before fit | +| `test_evaluate_learners_invalid_learner` | Unknown learner name raises `ValueError` | +| `test_evaluate_learners_invalid_metric` | Non-callable metric raises `TypeError` | +| `test_reset_clears_nuisance` | After `draw_sample_splitting()`, `nuisance_loss` raises | + +NaN conventions: PLR `ml_g` → all-NaN; IRM `ml_g0` → NaN for `d==1`; `ml_g1` → NaN for `d==0`. + +--- + +## Sensitivity Tests (`test__scalar_sensitivity.py`) + +Scalar models with `_sensitivity_element_est()` require a dedicated test file. Constants: `N_OBS=500`, `N_FOLDS=5`, `N_REP=2`. 
Score-parametrized `fitted_` fixture. + +**Exception tests** go in `test__scalar_exceptions.py` — not in this file: + +| Test | Input | Expected | +|------|-------|----------| +| `test_exception_sensitivity_before_fit` | Call before `fit()` | `ValueError` matching `"The framework is not yet initialized"` | +| `test_exception_sensitivity_cf_y` | `cf_y=1` (int) / `cf_y=1.0` (boundary) | `TypeError` / `ValueError` | +| `test_exception_sensitivity_cf_d` | `cf_d=1` / `cf_d=1.0` | `TypeError` / `ValueError` | +| `test_exception_sensitivity_rho` | `rho=1` (int) / `rho=1.1` (out of range) | `TypeError` / `ValueError` | +| `test_exception_sensitivity_level` | `level=1` (int) / `level=0.0` (boundary) | `TypeError` / `ValueError` | +| `test_exception_sensitivity_null_hypothesis` | Wrong shape array | `ValueError` | + +**Required tests** (parametrize over all scores): + +| Test | Checks | +|------|--------| +| `test_sensitivity_elements_positive` | `sigma2 >= 0`, `nu2 > 0`, `max_bias >= 0` | +| `test_sensitivity_params_structure` | After `sensitivity_analysis()`: `theta/se/ci` have `lower`/`upper`; `rv`/`rva` in [0, 1] | +| `test_sensitivity_params_bounds_ordered` | `theta["lower"] <= coef <= theta["upper"]` | +| `test_sensitivity_rho0` | `rho=0.0`: `se["lower"] ≈ se["upper"] ≈ model.se` (`rtol=1e-6`) | +| `test_sensitivity_monotonicity_cf_y` | `cf_y=0.15` → wider theta bounds than `cf_y=0.03` | + +--- + ## Naming -- Files: `test_.py`, `test__scalar.py`, `test__scalar_exceptions.py`, `test__scalar_tune_ml_models.py` +- Files: `test_.py`, `test__scalar.py`, `test__scalar_exceptions.py`, `test__scalar_tune_ml_models.py`, `test__scalar_evaluate_learners.py`, `test__scalar_sensitivity.py` - Functions: `test_` — e.g., `test_coef_within_3_sigma`, `test_exception_invalid_score` - Docstrings: Every test function gets a one-line docstring explaining what it verifies @@ -152,3 +209,5 @@ For models with `_LEARNER_PARAM_ALIASES` (e.g., IRM `"ml_g"` → `["ml_g0", "ml_ - [ ] Test functions 
have descriptive names and docstrings - [ ] New scalar models have all 5 required test files (see `dml-scalar-test-structure.md`) - [ ] If model has `tune_ml_models()`, add `test__scalar_tune_ml_models.py` with all required tuning tests +- [ ] If model has `evaluate_learners()` / `nuisance_loss`, add `test__scalar_evaluate_learners.py` +- [ ] If model has `_sensitivity_element_est()`, add sensitivity exception tests to `test__scalar_exceptions.py` and add `test__scalar_sensitivity.py` diff --git a/doubleml/double_ml_scalar.py b/doubleml/double_ml_scalar.py index f685cce9..d10c1c49 100644 --- a/doubleml/double_ml_scalar.py +++ b/doubleml/double_ml_scalar.py @@ -17,6 +17,7 @@ from .double_ml_framework import DoubleMLFramework from .utils._checks import _check_sample_splitting from .utils._learner import LearnerInfo, LearnerSpec, validate_learner +from .utils._sensitivity import _compute_sensitivity_bias from .utils._tune_optuna import OPTUNA_GLOBAL_SETTING_KEYS, _dml_tune_optuna, resolve_optuna_cv from .utils.resampling import DoubleMLClusterResampling, DoubleMLResampling @@ -106,6 +107,7 @@ def __init__( self._predictions: dict[str, np.ndarray] | None = None self._nuisance_targets: dict[str, np.ndarray] | None = None self._nuisance_loss: dict[str, np.ndarray] | None = None + self._sensitivity_elements: dict[str, np.ndarray] | None = None self._all_thetas: np.ndarray | None = None self._all_ses: np.ndarray | None = None self._psi: np.ndarray | None = None @@ -232,6 +234,22 @@ def nuisance_loss(self) -> dict[str, np.ndarray]: raise ValueError("Nuisance loss not available. Call fit() or fit_nuisance_models() first.") return self._nuisance_loss + @property + def sensitivity_elements(self) -> dict[str, np.ndarray] | None: + """ + Raw sensitivity elements computed after :meth:`fit`. + + Returns ``None`` if sensitivity analysis is not implemented for this model + or if the model has not been fitted yet. 
+ + Returns + ------- + dict[str, np.ndarray] or None + Dictionary with keys ``'sigma2'``, ``'nu2'`` (shape ``(1, 1, n_rep)``), + ``'psi_sigma2'``, ``'psi_nu2'``, ``'riesz_rep'`` (shape ``(n_obs, 1, n_rep)``). + """ + return self._sensitivity_elements + @property def smpls(self) -> list: """ @@ -552,6 +570,10 @@ def estimate_causal_parameters(self) -> Self: # Estimate causal parameters - from score mixin self._est_causal_pars_and_se(psi_elements) + # Compute sensitivity elements (optional hook — None by default) + self._sensitivity_elements = self._sensitivity_element_est() + self._validate_sensitivity_elements() + # Construct framework self._framework = self._construct_framework() @@ -783,6 +805,22 @@ def _construct_framework(self) -> DoubleMLFramework: "n_folds_per_cluster": self._n_folds_per_cluster, } + # Compute framework-ready sensitivity elements if available + sensitivity_elements_for_framework: dict[str, np.ndarray] | None = None + if self._sensitivity_elements is not None: + max_bias, psi_max_bias = _compute_sensitivity_bias( + sigma2=self._sensitivity_elements["sigma2"], + nu2=self._sensitivity_elements["nu2"], + psi_sigma2=self._sensitivity_elements["psi_sigma2"], + psi_nu2=self._sensitivity_elements["psi_nu2"], + ) + sensitivity_elements_for_framework = { + "max_bias": max_bias, # (1, 1, n_rep) + "psi_max_bias": psi_max_bias, # (n_obs, 1, n_rep) + "sigma2": self._sensitivity_elements["sigma2"], # (1, 1, n_rep) + "nu2": self._sensitivity_elements["nu2"], # (1, 1, n_rep) + } + # Create data container (no transpose needed - already in framework convention!) 
framework_data = DoubleMLCoreData( all_thetas=self._all_thetas, # (n_thetas, n_rep) @@ -791,6 +829,7 @@ def _construct_framework(self) -> DoubleMLFramework: scaled_psi=scaled_psi, # (n_obs, n_thetas, n_rep) is_cluster_data=self._dml_data.is_cluster_data, cluster_dict=cluster_dict, + sensitivity_elements=sensitivity_elements_for_framework, ) # Create and return framework @@ -804,6 +843,7 @@ def _reset_fit_state(self) -> None: self._predictions = None self._nuisance_targets = None self._nuisance_loss = None + self._sensitivity_elements = None self._framework = None self._all_thetas = None self._all_ses = None @@ -912,6 +952,47 @@ def evaluate_learners( def _post_nuisance_checks(self) -> None: """Post-nuisance prediction validation hook. Override in subclasses for model-specific checks.""" + def _sensitivity_element_est(self) -> dict[str, np.ndarray] | None: + """ + Compute sensitivity analysis elements after causal parameter estimation. + + Optional hook called after :meth:`_est_causal_pars_and_se` in + :meth:`estimate_causal_parameters`. Override in subclasses to enable + sensitivity analysis via :meth:`sensitivity_analysis`. + + Implementations should access ``self._predictions``, ``self._dml_data``, + and ``self._all_thetas`` directly and compute results vectorized over all + ``n_rep`` repetitions at once. + + Returns + ------- + dict[str, np.ndarray] or None + Dictionary with keys ``'sigma2'``, ``'nu2'`` (shape ``(1, 1, n_rep)``), + ``'psi_sigma2'``, ``'psi_nu2'``, ``'riesz_rep'`` (shape ``(n_obs, 1, n_rep)``). + Return ``None`` (default) if sensitivity analysis is not implemented. 
+ """ + return None + + def _validate_sensitivity_elements(self) -> None: + """Re-estimate nu2 from riesz representer if nu2 is non-positive (degenerate PS).""" + import warnings + + if self._sensitivity_elements is None: + return + nu2 = self._sensitivity_elements["nu2"] # (1, 1, n_rep) + rr = self._sensitivity_elements["riesz_rep"] # (n_obs, 1, n_rep) + if np.any(nu2 <= 0): + treatment_name = self._dml_data.d_cols[0] + warnings.warn( + f"The estimated nu2 for treatment '{treatment_name}' is not positive. " + "Re-estimation based on riesz representer (non-orthogonal).", + UserWarning, + ) + psi_nu2_new = rr**2 + nu2_new = np.mean(psi_nu2_new, axis=0, keepdims=True) + self._sensitivity_elements["nu2"] = nu2_new + self._sensitivity_elements["psi_nu2"] = psi_nu2_new - nu2_new + @abstractmethod def _get_nuisance_targets(self) -> dict[str, np.ndarray | None]: """ diff --git a/doubleml/irm/irm_scalar.py b/doubleml/irm/irm_scalar.py index 4305eb27..f6fe85de 100644 --- a/doubleml/irm/irm_scalar.py +++ b/doubleml/irm/irm_scalar.py @@ -475,3 +475,64 @@ def _get_weights(self, m_hat: np.ndarray) -> tuple[np.ndarray, np.ndarray]: weights_bar = np.divide(m_hat * w[:, np.newaxis], subgroup_probability) return weights, weights_bar + + def _sensitivity_element_est(self) -> dict[str, np.ndarray] | None: + """ + Compute IRM sensitivity elements vectorized over all repetitions. + + Reproduces the propensity score processing and weight computation from + :meth:`_get_score_elements` to compute sigma2, nu2, their influence + functions, and the Riesz representer. + + Returns + ------- + dict[str, np.ndarray] or None + Dictionary with keys ``'sigma2'``, ``'nu2'`` (shape ``(1, 1, n_rep)``), + ``'psi_sigma2'``, ``'psi_nu2'``, ``'riesz_rep'`` (shape ``(n_obs, 1, n_rep)``). 
+ """ + y = self._dml_data.y # (n_obs,) + d = self._dml_data.d # (n_obs,) + g_hat0 = self._predictions["ml_g0"] # (n_obs, n_rep) + g_hat1 = self._predictions["ml_g1"] # (n_obs, n_rep) + m_hat_raw = self._predictions["ml_m"] # (n_obs, n_rep) + + # Reproduce PS processing (same per-rep loop as _get_score_elements) + m_hat = np.zeros_like(m_hat_raw) + for i_rep in range(self.n_rep): + m_hat[:, i_rep] = self._ps_processor.adjust_ps(m_hat_raw[:, i_rep], d, cv=self._smpls[i_rep], learner_name="ml_m") + m_hat_adj = np.zeros_like(m_hat) + for i_rep in range(self.n_rep): + m_hat_adj[:, i_rep] = _propensity_score_adjustment( + propensity_score=m_hat[:, i_rep], + treatment_indicator=d, + normalize_ipw=self.normalize_ipw, + ) + + d2d = d[:, np.newaxis] # (n_obs, 1) for broadcasting + + # sigma2: squared residual of the outcome regression + sigma2_score = (y[:, np.newaxis] - d2d * g_hat1 - (1.0 - d2d) * g_hat0) ** 2 # (n_obs, n_rep) + sigma2_mean = np.mean(sigma2_score, axis=0) # (n_rep,) + psi_sigma2 = sigma2_score - sigma2_mean[np.newaxis, :] # (n_obs, n_rep) + sigma2 = sigma2_mean[np.newaxis, np.newaxis, :] # (1, 1, n_rep) + psi_sigma2 = psi_sigma2[:, np.newaxis, :] # (n_obs, 1, n_rep) + + # Riesz representer and nu2 — uses _get_weights which vectorizes over n_rep + weights, weights_bar = self._get_weights(m_hat_adj) # each (n_obs, n_rep) + rr_2d = weights_bar * (np.divide(d2d, m_hat_adj) - np.divide(1.0 - d2d, 1.0 - m_hat_adj)) # (n_obs, n_rep) + m_alpha = weights * weights_bar * (np.divide(1.0, m_hat_adj) + np.divide(1.0, 1.0 - m_hat_adj)) # (n_obs, n_rep) + + nu2_score = 2.0 * m_alpha - rr_2d**2 # (n_obs, n_rep) + nu2_mean = np.mean(nu2_score, axis=0) # (n_rep,) + psi_nu2 = nu2_score - nu2_mean[np.newaxis, :] # (n_obs, n_rep) + nu2 = nu2_mean[np.newaxis, np.newaxis, :] # (1, 1, n_rep) + psi_nu2 = psi_nu2[:, np.newaxis, :] # (n_obs, 1, n_rep) + rr = rr_2d[:, np.newaxis, :] # (n_obs, 1, n_rep) + + return { + "sigma2": sigma2, + "nu2": nu2, + "psi_sigma2": psi_sigma2, + 
"psi_nu2": psi_nu2, + "riesz_rep": rr, + } diff --git a/doubleml/irm/tests/test_irm_scalar_exceptions.py b/doubleml/irm/tests/test_irm_scalar_exceptions.py index 59dc91a1..2fe5fd35 100644 --- a/doubleml/irm/tests/test_irm_scalar_exceptions.py +++ b/doubleml/irm/tests/test_irm_scalar_exceptions.py @@ -163,3 +163,68 @@ def test_irm_scalar_exception_binary_predictions_g(): msg = r"For the binary variable .+, predictions .+ are also observed to be binary" with pytest.raises(ValueError, match=msg): dml_obj.fit_nuisance_models() + + +# ==================== sensitivity_analysis exceptions ==================== + + +@pytest.fixture(scope="module") +def fitted_irm_for_sensitivity(): + """Fitted IRM model for sensitivity exception tests.""" + dml_obj = IRM(obj_dml_data, ml_g=ml_g, ml_m=ml_m) + dml_obj.fit(n_folds=3, n_rep=1) + return dml_obj + + +@pytest.mark.ci +def test_exception_sensitivity_before_fit(): + """sensitivity_analysis() raises ValueError before fit().""" + dml_obj = IRM(obj_dml_data) + msg = r"The framework is not yet initialized" + with pytest.raises(ValueError, match=msg): + dml_obj.sensitivity_analysis() + + +@pytest.mark.ci +def test_exception_sensitivity_cf_y(fitted_irm_for_sensitivity): + """cf_y must be a float in [0,1).""" + with pytest.raises(TypeError, match=r"cf_y must be of float type"): + fitted_irm_for_sensitivity.sensitivity_analysis(cf_y=1) + with pytest.raises(ValueError, match=r"cf_y must be in \[0,1\)"): + fitted_irm_for_sensitivity.sensitivity_analysis(cf_y=1.0) + + +@pytest.mark.ci +def test_exception_sensitivity_cf_d(fitted_irm_for_sensitivity): + """cf_d must be a float in [0,1).""" + with pytest.raises(TypeError, match=r"cf_d must be of float type"): + fitted_irm_for_sensitivity.sensitivity_analysis(cf_d=1) + with pytest.raises(ValueError, match=r"cf_d must be in \[0,1\)"): + fitted_irm_for_sensitivity.sensitivity_analysis(cf_d=1.0) + + +@pytest.mark.ci +def test_exception_sensitivity_rho(fitted_irm_for_sensitivity): + """rho must be a 
float with |rho| <= 1.""" + with pytest.raises(TypeError, match=r"rho must be of float type"): + fitted_irm_for_sensitivity.sensitivity_analysis(rho=1) + with pytest.raises(ValueError, match=r"The absolute value of rho must be in \[0,1\]"): + fitted_irm_for_sensitivity.sensitivity_analysis(rho=1.1) + + +@pytest.mark.ci +def test_exception_sensitivity_level(fitted_irm_for_sensitivity): + """level must be a float in (0,1).""" + with pytest.raises(TypeError, match=r"The confidence level must be of float type"): + fitted_irm_for_sensitivity.sensitivity_analysis(level=1) + with pytest.raises(ValueError, match=r"The confidence level must be in \(0,1\)"): + fitted_irm_for_sensitivity.sensitivity_analysis(level=0.0) + + +@pytest.mark.ci +def test_exception_sensitivity_null_hypothesis(fitted_irm_for_sensitivity): + """null_hypothesis with wrong shape raises ValueError.""" + import numpy as np + + with pytest.raises(ValueError, match=r"null_hypothesis"): + fitted_irm_for_sensitivity.sensitivity_analysis(null_hypothesis=np.array([0.0, 0.0])) diff --git a/doubleml/irm/tests/test_irm_scalar_return_types.py b/doubleml/irm/tests/test_irm_scalar_return_types.py index a437f49d..4b755899 100644 --- a/doubleml/irm/tests/test_irm_scalar_return_types.py +++ b/doubleml/irm/tests/test_irm_scalar_return_types.py @@ -214,3 +214,81 @@ def test_reset_after_draw_sample_splitting(): _ = dml_obj.coef with pytest.raises(ValueError, match="Predictions not available. 
Call fit"): _ = dml_obj.predictions + + +@pytest.mark.ci +def test_sensitivity_elements_type_and_shape(fitted_dml_obj): + """sensitivity_elements has correct keys, types, and shapes after fit.""" + elems = fitted_dml_obj.sensitivity_elements + assert isinstance(elems, dict) + for key in ["sigma2", "nu2"]: + assert key in elems + assert isinstance(elems[key], np.ndarray) + assert elems[key].shape == (1, 1, N_REP) + for key in ["psi_sigma2", "psi_nu2", "riesz_rep"]: + assert key in elems + assert isinstance(elems[key], np.ndarray) + assert elems[key].shape == (N_OBS, 1, N_REP) + + +@pytest.mark.ci +def test_sensitivity_analysis_runs(fitted_dml_obj): + """sensitivity_analysis() completes without error and returns the framework.""" + result = fitted_dml_obj.sensitivity_analysis(cf_y=0.03, cf_d=0.03, rho=1.0) + assert result is fitted_dml_obj.framework + + +@pytest.mark.ci +def test_sensitivity_before_fit_is_none(): + """sensitivity_elements returns None before fit().""" + dml_obj = IRM(obj_dml_data) + assert dml_obj.sensitivity_elements is None + + +@pytest.mark.ci +def test_sensitivity_reset_after_draw_sample_splitting(): + """sensitivity_elements resets to None after draw_sample_splitting().""" + np.random.seed(3141) + dml_obj = IRM(obj_dml_data) + dml_obj.set_learners( + ml_g=RandomForestRegressor(n_estimators=10, max_depth=3, random_state=42), + ml_m=RandomForestClassifier(n_estimators=10, max_depth=3, random_state=42), + ) + dml_obj.draw_sample_splitting(n_folds=N_FOLDS, n_rep=N_REP) + dml_obj.fit() + assert dml_obj.sensitivity_elements is not None + dml_obj.draw_sample_splitting(n_folds=N_FOLDS, n_rep=N_REP) + assert dml_obj.sensitivity_elements is None + + +@pytest.mark.ci +def test_sensitivity_params_structure(fitted_dml_obj): + """sensitivity_params has expected keys and finite rv/rva after sensitivity_analysis().""" + fitted_dml_obj.sensitivity_analysis(cf_y=0.03, cf_d=0.03) + params = fitted_dml_obj.framework.sensitivity_params + for key in ["theta", "se", "ci"]: + 
assert "lower" in params[key] and "upper" in params[key] + for key in ["rv", "rva"]: + assert np.all(np.isfinite(params[key])) + assert np.all(params[key] >= 0) and np.all(params[key] <= 1) + + +@pytest.mark.ci +def test_sensitivity_rho0_se_bounds(fitted_dml_obj): + """With rho=0, se lower and upper bounds equal the unadjusted se.""" + fitted_dml_obj.sensitivity_analysis(cf_y=0.03, cf_d=0.03, rho=0.0) + params = fitted_dml_obj.framework.sensitivity_params + np.testing.assert_allclose(params["se"]["lower"], fitted_dml_obj.se, rtol=1e-6) + np.testing.assert_allclose(params["se"]["upper"], fitted_dml_obj.se, rtol=1e-6) + + +@pytest.mark.ci +def test_sensitivity_monotonicity_cf_y(fitted_dml_obj): + """Increasing cf_y widens the theta sensitivity bounds.""" + fitted_dml_obj.sensitivity_analysis(cf_y=0.03, cf_d=0.03, rho=1.0) + params_low = fitted_dml_obj.framework.sensitivity_params + width_low = params_low["theta"]["upper"] - params_low["theta"]["lower"] + fitted_dml_obj.sensitivity_analysis(cf_y=0.15, cf_d=0.03, rho=1.0) + params_high = fitted_dml_obj.framework.sensitivity_params + width_high = params_high["theta"]["upper"] - params_high["theta"]["lower"] + assert np.all(width_high >= width_low) diff --git a/doubleml/irm/tests/test_irm_scalar_sensitivity.py b/doubleml/irm/tests/test_irm_scalar_sensitivity.py new file mode 100644 index 00000000..d3682e9e --- /dev/null +++ b/doubleml/irm/tests/test_irm_scalar_sensitivity.py @@ -0,0 +1,84 @@ +"""Score-parametrized sensitivity analysis tests for IRM scalar models.""" + +import numpy as np +import pytest +from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor + +from doubleml.irm.datasets import make_irm_data +from doubleml.irm.irm_scalar import IRM + +N_OBS = 500 +N_FOLDS = 5 +N_REP = 2 + + +@pytest.fixture(scope="module") +def irm_data(): + """Shared IRM dataset.""" + np.random.seed(3141) + return make_irm_data(theta=0.5, n_obs=N_OBS, dim_x=5, return_type="DoubleMLData") + + 
+@pytest.fixture(scope="module", params=["ATE", "ATTE"]) +def fitted_irm(request, irm_data): + """Fitted IRM model parametrized over both score variants.""" + dml_obj = IRM(irm_data, score=request.param) + dml_obj.set_learners( + ml_g=RandomForestRegressor(n_estimators=10, max_depth=3, random_state=42), + ml_m=RandomForestClassifier(n_estimators=10, max_depth=3, random_state=42), + ) + dml_obj.fit(n_folds=N_FOLDS, n_rep=N_REP) + return dml_obj + + +@pytest.mark.ci +def test_sensitivity_elements_positive(fitted_irm): + """sigma2 >= 0, nu2 > 0, and max_bias >= 0 for each score variant.""" + elems = fitted_irm.sensitivity_elements + assert np.all(elems["sigma2"] >= 0) + assert np.all(elems["nu2"] > 0) + assert np.all(fitted_irm.framework.sensitivity_elements["max_bias"] >= 0) + + +@pytest.mark.ci +def test_sensitivity_params_structure(fitted_irm): + """After sensitivity_analysis(), theta/se/ci have lower/upper; rv/rva in [0,1].""" + fitted_irm.sensitivity_analysis(cf_y=0.03, cf_d=0.03, rho=1.0) + params = fitted_irm.framework.sensitivity_params + for key in ["theta", "se", "ci"]: + assert "lower" in params[key] and "upper" in params[key] + for key in ["rv", "rva"]: + assert np.all(np.isfinite(params[key])) + assert np.all(params[key] >= 0) and np.all(params[key] <= 1) + + +@pytest.mark.ci +def test_sensitivity_params_bounds_ordered(fitted_irm): + """theta lower bound <= estimated coef <= theta upper bound.""" + fitted_irm.sensitivity_analysis(cf_y=0.03, cf_d=0.03, rho=1.0) + params = fitted_irm.framework.sensitivity_params + assert np.all(params["theta"]["lower"] <= fitted_irm.coef) + assert np.all(fitted_irm.coef <= params["theta"]["upper"]) + + +@pytest.mark.ci +def test_sensitivity_rho0(fitted_irm): + """With rho=0, se lower and upper bounds equal the unadjusted se.""" + fitted_irm.sensitivity_analysis(cf_y=0.03, cf_d=0.03, rho=0.0) + params = fitted_irm.framework.sensitivity_params + np.testing.assert_allclose(params["se"]["lower"], fitted_irm.se, rtol=1e-6) + 
np.testing.assert_allclose(params["se"]["upper"], fitted_irm.se, rtol=1e-6) + + +@pytest.mark.ci +def test_sensitivity_monotonicity_cf_y(fitted_irm): + """Increasing cf_y produces wider theta sensitivity bounds.""" + fitted_irm.sensitivity_analysis(cf_y=0.03, cf_d=0.03, rho=1.0) + width_low = ( + fitted_irm.framework.sensitivity_params["theta"]["upper"] - fitted_irm.framework.sensitivity_params["theta"]["lower"] + ) + fitted_irm.sensitivity_analysis(cf_y=0.15, cf_d=0.03, rho=1.0) + width_high = ( + fitted_irm.framework.sensitivity_params["theta"]["upper"] - fitted_irm.framework.sensitivity_params["theta"]["lower"] + ) + assert np.all(width_high >= width_low) diff --git a/doubleml/irm/tests/test_irm_scalar_vs_irm.py b/doubleml/irm/tests/test_irm_scalar_vs_irm.py index 196385e8..adf578cc 100644 --- a/doubleml/irm/tests/test_irm_scalar_vs_irm.py +++ b/doubleml/irm/tests/test_irm_scalar_vs_irm.py @@ -80,3 +80,34 @@ def test_all_se_equal(comparison_fixture): old = comparison_fixture["old"] new = comparison_fixture["new"] np.testing.assert_allclose(new.all_ses, old.all_se, rtol=1e-9) + + +@pytest.mark.ci +def test_sensitivity_sigma2_equal(comparison_fixture): + """IRM scalar sigma2 matches DoubleMLIRM sensitivity_elements['sigma2'].""" + old = comparison_fixture["old"] + new = comparison_fixture["new"] + # Legacy shape: (1, n_rep, 1); scalar shape: (1, 1, n_rep). Transpose to align. 
+ old_sigma2 = np.transpose(old.sensitivity_elements["sigma2"], (0, 2, 1)) + np.testing.assert_allclose(new.sensitivity_elements["sigma2"], old_sigma2, rtol=1e-9) + + +@pytest.mark.ci +def test_sensitivity_nu2_equal(comparison_fixture): + """IRM scalar nu2 matches DoubleMLIRM sensitivity_elements['nu2'].""" + old = comparison_fixture["old"] + new = comparison_fixture["new"] + old_nu2 = np.transpose(old.sensitivity_elements["nu2"], (0, 2, 1)) + np.testing.assert_allclose(new.sensitivity_elements["nu2"], old_nu2, rtol=1e-9) + + +@pytest.mark.ci +def test_sensitivity_max_bias_equal(comparison_fixture): + """IRM scalar framework max_bias matches DoubleMLIRM framework max_bias.""" + old = comparison_fixture["old"] + new = comparison_fixture["new"] + np.testing.assert_allclose( + new.framework.sensitivity_elements["max_bias"], + old.framework.sensitivity_elements["max_bias"], + rtol=1e-9, + ) diff --git a/doubleml/plm/plr_scalar.py b/doubleml/plm/plr_scalar.py index 0e1c149e..9d2da5eb 100644 --- a/doubleml/plm/plr_scalar.py +++ b/doubleml/plm/plr_scalar.py @@ -392,3 +392,59 @@ def _get_score_elements(self) -> dict[str, np.ndarray]: psi_b = v_hat * (y[:, np.newaxis] - g_hat) return {"psi_a": psi_a, "psi_b": psi_b} + + def _sensitivity_element_est(self) -> dict[str, np.ndarray] | None: + """ + Compute PLR sensitivity elements vectorized over all repetitions. + + Computes sigma2 (outcome residual variance), nu2 (inverse of treatment + residual variance), their influence functions, and the Riesz representer. + Handles both ``'partialling out'`` and ``'IV-type'`` scores. + + Returns + ------- + dict[str, np.ndarray] or None + Dictionary with keys ``'sigma2'``, ``'nu2'`` (shape ``(1, 1, n_rep)``), + ``'psi_sigma2'``, ``'psi_nu2'``, ``'riesz_rep'`` (shape ``(n_obs, 1, n_rep)``). + Returns ``None`` for callable scores (no standard Riesz representer). 
+ """ + if callable(self.score): + return None + + y = self._dml_data.y # (n_obs,) + d = self._dml_data.d # (n_obs,) + m_hat = self._predictions["ml_m"] # (n_obs, n_rep) + theta = self._all_thetas # (1, n_rep) — broadcasts with (n_obs, n_rep) + + treatment_residual = d[:, np.newaxis] - m_hat # (n_obs, n_rep) + + if self.score == "partialling out": + l_hat = self._predictions["ml_l"] # (n_obs, n_rep) + sigma2_score = (y[:, np.newaxis] - l_hat - theta * treatment_residual) ** 2 + else: # "IV-type" + g_hat = self._predictions["ml_g"] # (n_obs, n_rep) + sigma2_score = (y[:, np.newaxis] - g_hat - theta * d[:, np.newaxis]) ** 2 + + # sigma2: mean across observations, reshaped to (1, 1, n_rep) + sigma2_mean = np.mean(sigma2_score, axis=0) # (n_rep,) + psi_sigma2 = sigma2_score - sigma2_mean[np.newaxis, :] # (n_obs, n_rep) + sigma2 = sigma2_mean[np.newaxis, np.newaxis, :] # (1, 1, n_rep) + psi_sigma2 = psi_sigma2[:, np.newaxis, :] # (n_obs, 1, n_rep) + + # nu2 = 1 / E[(d - m_hat)^2], reshaped to (1, 1, n_rep) + tr_sq_mean = np.mean(treatment_residual**2, axis=0) # (n_rep,) + nu2_val = 1.0 / tr_sq_mean # (n_rep,) + psi_nu2 = nu2_val[np.newaxis, :] - treatment_residual**2 * nu2_val[np.newaxis, :] ** 2 # (n_obs, n_rep) + nu2 = nu2_val[np.newaxis, np.newaxis, :] # (1, 1, n_rep) + psi_nu2 = psi_nu2[:, np.newaxis, :] # (n_obs, 1, n_rep) + + # Riesz representer: (d - m_hat) * nu2 + rr = (treatment_residual * nu2_val[np.newaxis, :])[:, np.newaxis, :] # (n_obs, 1, n_rep) + + return { + "sigma2": sigma2, + "nu2": nu2, + "psi_sigma2": psi_sigma2, + "psi_nu2": psi_nu2, + "riesz_rep": rr, + } diff --git a/doubleml/plm/tests/test_plr_scalar_exceptions.py b/doubleml/plm/tests/test_plr_scalar_exceptions.py index d49d1902..7d2a57b9 100644 --- a/doubleml/plm/tests/test_plr_scalar_exceptions.py +++ b/doubleml/plm/tests/test_plr_scalar_exceptions.py @@ -136,3 +136,69 @@ def test_plr_scalar_warning_binary_outcome_classifier(): msg = r"The ml_l learner .+ was identified as classifier\. 
Fitting an additive probability model\." with pytest.warns(UserWarning, match=msg): dml_obj.set_learners(ml_l=LogisticRegression(), ml_m=Lasso()) + + +# ==================== sensitivity_analysis exceptions ==================== + + +@pytest.fixture(scope="module") +def fitted_plr_for_sensitivity(): + """Fitted PLR model for sensitivity exception tests.""" + dml_obj = PLR(obj_dml_data) + dml_obj.set_learners(ml_l=ml_l, ml_m=ml_m) + dml_obj.fit(n_folds=3, n_rep=1) + return dml_obj + + +@pytest.mark.ci +def test_exception_sensitivity_before_fit(): + """sensitivity_analysis() raises ValueError before fit().""" + dml_obj = PLR(obj_dml_data) + msg = r"The framework is not yet initialized" + with pytest.raises(ValueError, match=msg): + dml_obj.sensitivity_analysis() + + +@pytest.mark.ci +def test_exception_sensitivity_cf_y(fitted_plr_for_sensitivity): + """cf_y must be a float in [0,1).""" + with pytest.raises(TypeError, match=r"cf_y must be of float type"): + fitted_plr_for_sensitivity.sensitivity_analysis(cf_y=1) + with pytest.raises(ValueError, match=r"cf_y must be in \[0,1\)"): + fitted_plr_for_sensitivity.sensitivity_analysis(cf_y=1.0) + + +@pytest.mark.ci +def test_exception_sensitivity_cf_d(fitted_plr_for_sensitivity): + """cf_d must be a float in [0,1).""" + with pytest.raises(TypeError, match=r"cf_d must be of float type"): + fitted_plr_for_sensitivity.sensitivity_analysis(cf_d=1) + with pytest.raises(ValueError, match=r"cf_d must be in \[0,1\)"): + fitted_plr_for_sensitivity.sensitivity_analysis(cf_d=1.0) + + +@pytest.mark.ci +def test_exception_sensitivity_rho(fitted_plr_for_sensitivity): + """rho must be a float with |rho| <= 1.""" + with pytest.raises(TypeError, match=r"rho must be of float type"): + fitted_plr_for_sensitivity.sensitivity_analysis(rho=1) + with pytest.raises(ValueError, match=r"The absolute value of rho must be in \[0,1\]"): + fitted_plr_for_sensitivity.sensitivity_analysis(rho=1.1) + + +@pytest.mark.ci +def 
test_exception_sensitivity_level(fitted_plr_for_sensitivity): + """level must be a float in (0,1).""" + with pytest.raises(TypeError, match=r"The confidence level must be of float type"): + fitted_plr_for_sensitivity.sensitivity_analysis(level=1) + with pytest.raises(ValueError, match=r"The confidence level must be in \(0,1\)"): + fitted_plr_for_sensitivity.sensitivity_analysis(level=0.0) + + +@pytest.mark.ci +def test_exception_sensitivity_null_hypothesis(fitted_plr_for_sensitivity): + """null_hypothesis with wrong shape raises ValueError.""" + import numpy as np + + with pytest.raises(ValueError, match=r"null_hypothesis"): + fitted_plr_for_sensitivity.sensitivity_analysis(null_hypothesis=np.array([0.0, 0.0])) diff --git a/doubleml/plm/tests/test_plr_scalar_return_types.py b/doubleml/plm/tests/test_plr_scalar_return_types.py index 39fe77e6..3771eeca 100644 --- a/doubleml/plm/tests/test_plr_scalar_return_types.py +++ b/doubleml/plm/tests/test_plr_scalar_return_types.py @@ -191,3 +191,78 @@ def test_reset_after_draw_sample_splitting(): _ = dml_obj.coef with pytest.raises(ValueError, match="Predictions not available. 
Call fit"): _ = dml_obj.predictions + + +@pytest.mark.ci +def test_sensitivity_elements_type_and_shape(fitted_dml_obj): + """sensitivity_elements has correct keys, types, and shapes after fit.""" + elems = fitted_dml_obj.sensitivity_elements + assert isinstance(elems, dict) + for key in ["sigma2", "nu2"]: + assert key in elems + assert isinstance(elems[key], np.ndarray) + assert elems[key].shape == (1, 1, N_REP) + for key in ["psi_sigma2", "psi_nu2", "riesz_rep"]: + assert key in elems + assert isinstance(elems[key], np.ndarray) + assert elems[key].shape == (N_OBS, 1, N_REP) + + +@pytest.mark.ci +def test_sensitivity_analysis_runs(fitted_dml_obj): + """sensitivity_analysis() completes without error and returns the framework.""" + result = fitted_dml_obj.sensitivity_analysis(cf_y=0.03, cf_d=0.03, rho=1.0) + assert result is fitted_dml_obj.framework + + +@pytest.mark.ci +def test_sensitivity_before_fit_is_none(): + """sensitivity_elements returns None before fit().""" + dml_obj = PLR(obj_dml_data) + assert dml_obj.sensitivity_elements is None + + +@pytest.mark.ci +def test_sensitivity_reset_after_draw_sample_splitting(): + """sensitivity_elements resets to None after draw_sample_splitting().""" + np.random.seed(3141) + dml_obj = PLR(obj_dml_data) + dml_obj.set_learners(ml_l=LinearRegression(), ml_m=LinearRegression()) + dml_obj.draw_sample_splitting(n_folds=N_FOLDS, n_rep=N_REP) + dml_obj.fit() + assert dml_obj.sensitivity_elements is not None + dml_obj.draw_sample_splitting(n_folds=N_FOLDS, n_rep=N_REP) + assert dml_obj.sensitivity_elements is None + + +@pytest.mark.ci +def test_sensitivity_params_structure(fitted_dml_obj): + """sensitivity_params has expected keys and finite rv/rva after sensitivity_analysis().""" + fitted_dml_obj.sensitivity_analysis(cf_y=0.03, cf_d=0.03) + params = fitted_dml_obj.framework.sensitivity_params + for key in ["theta", "se", "ci"]: + assert "lower" in params[key] and "upper" in params[key] + for key in ["rv", "rva"]: + assert 
np.all(np.isfinite(params[key])) + assert np.all(params[key] >= 0) and np.all(params[key] <= 1) + + +@pytest.mark.ci +def test_sensitivity_rho0_se_bounds(fitted_dml_obj): + """With rho=0, se lower and upper bounds equal the unadjusted se.""" + fitted_dml_obj.sensitivity_analysis(cf_y=0.03, cf_d=0.03, rho=0.0) + params = fitted_dml_obj.framework.sensitivity_params + np.testing.assert_allclose(params["se"]["lower"], fitted_dml_obj.se, rtol=1e-6) + np.testing.assert_allclose(params["se"]["upper"], fitted_dml_obj.se, rtol=1e-6) + + +@pytest.mark.ci +def test_sensitivity_monotonicity_cf_y(fitted_dml_obj): + """Increasing cf_y widens the theta sensitivity bounds.""" + fitted_dml_obj.sensitivity_analysis(cf_y=0.03, cf_d=0.03, rho=1.0) + params_low = fitted_dml_obj.framework.sensitivity_params + width_low = params_low["theta"]["upper"] - params_low["theta"]["lower"] + fitted_dml_obj.sensitivity_analysis(cf_y=0.15, cf_d=0.03, rho=1.0) + params_high = fitted_dml_obj.framework.sensitivity_params + width_high = params_high["theta"]["upper"] - params_high["theta"]["lower"] + assert np.all(width_high >= width_low) diff --git a/doubleml/plm/tests/test_plr_scalar_sensitivity.py b/doubleml/plm/tests/test_plr_scalar_sensitivity.py new file mode 100644 index 00000000..2b358757 --- /dev/null +++ b/doubleml/plm/tests/test_plr_scalar_sensitivity.py @@ -0,0 +1,81 @@ +"""Score-parametrized sensitivity analysis tests for PLR scalar models.""" + +import numpy as np +import pytest +from sklearn.linear_model import LinearRegression + +from doubleml.plm.datasets import make_plr_CCDDHNR2018 +from doubleml.plm.plr_scalar import PLR + +N_OBS = 500 +N_FOLDS = 5 +N_REP = 2 + + +@pytest.fixture(scope="module") +def plr_data(): + """Shared PLR dataset.""" + np.random.seed(3141) + return make_plr_CCDDHNR2018(n_obs=N_OBS, dim_x=5) + + +@pytest.fixture(scope="module", params=["partialling out", "IV-type"]) +def fitted_plr(request, plr_data): + """Fitted PLR model parametrized over both score 
variants.""" + dml_obj = PLR(plr_data, score=request.param) + dml_obj.set_learners(ml_l=LinearRegression(), ml_m=LinearRegression()) + dml_obj.fit(n_folds=N_FOLDS, n_rep=N_REP) + return dml_obj + + +@pytest.mark.ci +def test_sensitivity_elements_positive(fitted_plr): + """sigma2 >= 0, nu2 > 0, and max_bias >= 0 for each score variant.""" + elems = fitted_plr.sensitivity_elements + assert np.all(elems["sigma2"] >= 0) + assert np.all(elems["nu2"] > 0) + assert np.all(fitted_plr.framework.sensitivity_elements["max_bias"] >= 0) + + +@pytest.mark.ci +def test_sensitivity_params_structure(fitted_plr): + """After sensitivity_analysis(), theta/se/ci have lower/upper; rv/rva in [0,1].""" + fitted_plr.sensitivity_analysis(cf_y=0.03, cf_d=0.03, rho=1.0) + params = fitted_plr.framework.sensitivity_params + for key in ["theta", "se", "ci"]: + assert "lower" in params[key] and "upper" in params[key] + for key in ["rv", "rva"]: + assert np.all(np.isfinite(params[key])) + assert np.all(params[key] >= 0) and np.all(params[key] <= 1) + + +@pytest.mark.ci +def test_sensitivity_params_bounds_ordered(fitted_plr): + """theta lower bound <= estimated coef <= theta upper bound.""" + fitted_plr.sensitivity_analysis(cf_y=0.03, cf_d=0.03, rho=1.0) + params = fitted_plr.framework.sensitivity_params + assert np.all(params["theta"]["lower"] <= fitted_plr.coef) + assert np.all(fitted_plr.coef <= params["theta"]["upper"]) + + +@pytest.mark.ci +def test_sensitivity_rho0(fitted_plr): + """With rho=0, se lower and upper bounds equal the unadjusted se.""" + fitted_plr.sensitivity_analysis(cf_y=0.03, cf_d=0.03, rho=0.0) + params = fitted_plr.framework.sensitivity_params + np.testing.assert_allclose(params["se"]["lower"], fitted_plr.se, rtol=1e-6) + np.testing.assert_allclose(params["se"]["upper"], fitted_plr.se, rtol=1e-6) + + +@pytest.mark.ci +def test_sensitivity_monotonicity_cf_y(fitted_plr): + """Increasing cf_y produces wider theta sensitivity bounds.""" + 
fitted_plr.sensitivity_analysis(cf_y=0.03, cf_d=0.03, rho=1.0) + width_low = ( + fitted_plr.framework.sensitivity_params["theta"]["upper"] - fitted_plr.framework.sensitivity_params["theta"]["lower"] + ) + fitted_plr.sensitivity_analysis(cf_y=0.15, cf_d=0.03, rho=1.0) + width_high = ( + fitted_plr.framework.sensitivity_params["theta"]["upper"] - fitted_plr.framework.sensitivity_params["theta"]["lower"] + ) + assert np.all(width_high >= width_low) diff --git a/doubleml/plm/tests/test_plr_scalar_vs_plr.py b/doubleml/plm/tests/test_plr_scalar_vs_plr.py index 15453c12..713acb6a 100644 --- a/doubleml/plm/tests/test_plr_scalar_vs_plr.py +++ b/doubleml/plm/tests/test_plr_scalar_vs_plr.py @@ -81,3 +81,34 @@ def test_all_se_equal(comparison_fixture): old = comparison_fixture["old"] new = comparison_fixture["new"] np.testing.assert_allclose(new.all_ses, old.all_se, rtol=1e-9) + + +@pytest.mark.ci +def test_sensitivity_sigma2_equal(comparison_fixture): + """PLR scalar sigma2 matches DoubleMLPLR sensitivity_elements['sigma2'].""" + old = comparison_fixture["old"] + new = comparison_fixture["new"] + # Legacy shape: (1, n_rep, 1); scalar shape: (1, 1, n_rep). Transpose to align. 
+ old_sigma2 = np.transpose(old.sensitivity_elements["sigma2"], (0, 2, 1)) + np.testing.assert_allclose(new.sensitivity_elements["sigma2"], old_sigma2, rtol=1e-9) + + +@pytest.mark.ci +def test_sensitivity_nu2_equal(comparison_fixture): + """PLR scalar nu2 matches DoubleMLPLR sensitivity_elements['nu2'].""" + old = comparison_fixture["old"] + new = comparison_fixture["new"] + old_nu2 = np.transpose(old.sensitivity_elements["nu2"], (0, 2, 1)) + np.testing.assert_allclose(new.sensitivity_elements["nu2"], old_nu2, rtol=1e-9) + + +@pytest.mark.ci +def test_sensitivity_max_bias_equal(comparison_fixture): + """PLR scalar framework max_bias matches DoubleMLPLR framework max_bias.""" + old = comparison_fixture["old"] + new = comparison_fixture["new"] + np.testing.assert_allclose( + new.framework.sensitivity_elements["max_bias"], + old.framework.sensitivity_elements["max_bias"], + rtol=1e-9, + ) From e980ccae37a03296665a913e53754a22581bd044 Mon Sep 17 00:00:00 2001 From: SvenKlaassen Date: Sun, 1 Mar 2026 21:23:31 +0100 Subject: [PATCH 22/38] add first dml vector class --- doubleml/double_ml_vector.py | 751 +++++++++++++++++++++++++++++++++++ 1 file changed, 751 insertions(+) create mode 100644 doubleml/double_ml_vector.py diff --git a/doubleml/double_ml_vector.py b/doubleml/double_ml_vector.py new file mode 100644 index 00000000..f6f1e376 --- /dev/null +++ b/doubleml/double_ml_vector.py @@ -0,0 +1,751 @@ +"""Abstract base class for multi-treatment DoubleML models (parameter vector estimation).""" + +from __future__ import annotations + +import copy +from abc import ABC, abstractmethod +from typing import TYPE_CHECKING, Any, Self + +if TYPE_CHECKING: + from .utils._tune_optuna import DMLOptunaResult + +import numpy as np +import pandas as pd +from joblib import Parallel, delayed + +from .data.base_data import DoubleMLData +from .double_ml_base import DoubleMLBase +from .double_ml_framework import concat +from .double_ml_scalar import DoubleMLScalar +from .utils._checks 
import _check_sample_splitting +from .utils._tune_optuna import TUNE_ML_MODELS_DOC +from .utils.gain_statistics import gain_statistics +from .utils.resampling import DoubleMLResampling + + +class DoubleMLVector(DoubleMLBase, ABC): + """ + Abstract base class for multi-treatment DoubleML models. + + Orchestrates multiple :class:`~doubleml.DoubleMLScalar` instances — one per + treatment column in ``d_cols`` — sharing a single set of sample splits and + concatenating their :class:`~doubleml.DoubleMLFramework` objects into one + unified result. + + This class is intentionally general: by overriding :meth:`_initialize_models` + (and optionally :meth:`_get_data_for_model`), concrete subclasses can cover + any scenario where multiple scalar models must be fitted and combined: + + * **Multiple treatment columns** (e.g., ``DoubleMLPLRVector``): each sub-model + receives a single-column data view created by :meth:`_get_data_for_model`. + * **Multiple treatment levels** (e.g., a future ``DoubleMLAPOSVector``): all + sub-models share the same data; each scalar carries its own ``treatment_level`` + parameter. Override :meth:`_get_data_for_model` to return ``self._dml_data`` + unchanged, or bypass it entirely inside :meth:`_initialize_models`. + + Parameters + ---------- + obj_dml_data : DoubleMLBaseData + The data object for the double machine learning model. + score : str, optional + The score function to use. Default is ``'default'``. + + Attributes + ---------- + n_folds : int + Number of cross-fitting folds. + n_rep : int + Number of sample-splitting repetitions. + score : str + The score function being used. + modellist : list of DoubleMLScalar + The scalar sub-models, one per treatment column (or model key). 
+ """ + + def __init__( + self, + obj_dml_data: DoubleMLData, + score: str = "default", + ) -> None: + super().__init__(obj_dml_data) + self._dml_data: DoubleMLData = obj_dml_data # narrow for attribute access + self._score = score + + # Sample-splitting state + self._n_folds: int | None = None + self._n_folds_per_cluster: int | None = None + self._n_rep: int | None = None + self._smpls: list | None = None + self._smpls_cluster: list | None = None + + # Sub-model list — populated by subclass via _initialize_models() + self._modellist: list[DoubleMLScalar] | None = None + + # ==================== Properties ==================== + + @property + def n_rep(self) -> int: + """ + Number of repetitions for sample splitting. + + Returns + ------- + int + Number of repetitions. + + Raises + ------ + ValueError + If sample splitting has not been drawn yet. + """ + if self._n_rep is None: + raise ValueError("n_rep not set. Call draw_sample_splitting() first.") + return self._n_rep + + @property + def n_folds(self) -> int: + """ + Number of folds for cross-fitting. + + Returns + ------- + int + Number of folds. + + Raises + ------ + ValueError + If sample splitting has not been drawn yet. + """ + if self._n_folds is None: + raise ValueError("n_folds not set. Call draw_sample_splitting() first.") + return self._n_folds + + @property + def score(self) -> str: + """ + The score function being used. + + Returns + ------- + str + Score function name. + """ + return self._score + + @property + def smpls(self) -> list: + """ + Sample-splitting indices used for cross-fitting. + + Returns + ------- + list + List of sample-splitting indices for each repetition. + + Raises + ------ + ValueError + If sample splitting has not been drawn yet. + """ + if self._smpls is None: + raise ValueError("Sample splitting has not been performed. 
Call draw_sample_splitting() first.") + return self._smpls + + @property + def modellist(self) -> list[DoubleMLScalar] | None: + """ + The scalar sub-models in the same order as ``d_cols``. + + Returns + ------- + list of DoubleMLScalar or None + ``None`` before :meth:`_initialize_models` has been called by the subclass. + """ + return self._modellist + + @property + def n_rep_boot(self) -> int | None: + """ + The number of bootstrap replications, or ``None`` if not bootstrapped. + + Returns + ------- + int or None + """ + return None if self._framework is None else self._framework.n_rep_boot + + @property + def boot_method(self) -> str | None: + """ + The bootstrap method used, or ``None`` if not bootstrapped. + + Returns + ------- + str or None + """ + return None if self._framework is None else self._framework.boot_method + + @property + def boot_t_stat(self) -> np.ndarray | None: + """ + Bootstrapped t-statistics, or ``None`` if not bootstrapped. + + Returns + ------- + np.ndarray or None + """ + return None if self._framework is None else self._framework.boot_t_stat + + @property + def sensitivity_elements(self) -> dict[str, np.ndarray] | None: + """ + Raw sensitivity elements after :meth:`fit`, or ``None`` if unavailable. + + Returns + ------- + dict or None + """ + return None if self._framework is None else self._framework.sensitivity_elements + + @property + def sensitivity_params(self) -> dict | None: + """ + Sensitivity analysis parameters after :meth:`sensitivity_analysis`, + or ``None`` if not yet computed. + + Returns + ------- + dict or None + """ + return None if self._framework is None else self._framework.sensitivity_params + + @property + def sensitivity_summary(self) -> str: + """ + Summary for the sensitivity analysis after :meth:`sensitivity_analysis`. + + Returns + ------- + str + + Raises + ------ + ValueError + If :meth:`fit` has not been called yet. 
+ """ + if self._framework is None: + raise ValueError("Apply fit() before accessing sensitivity_summary.") + return self._framework.sensitivity_summary + + # ==================== Abstract Methods ==================== + + @property + @abstractmethod + def required_learners(self) -> list[str]: + """ + Names of the required learners for the current configuration. + + Returns + ------- + list of str + Ordered list of required learner names. + """ + + @abstractmethod + def set_learners(self, **kwargs: object) -> Self: + """ + Set the learners for nuisance estimation on all sub-models. + + Subclasses must implement this method with explicit keyword arguments + matching their model's learners (e.g., ``ml_l``, ``ml_m`` for PLR). + The same learners (cloned per sub-model) are applied to every treatment. + + Parameters + ---------- + **kwargs + Learner keyword arguments specific to the subclass. + + Returns + ------- + self : Self + """ + + @abstractmethod + def _initialize_models(self) -> list[DoubleMLScalar]: + """ + Create and return one scalar sub-model per treatment column. + + Called once during ``__init__`` of concrete subclasses. Use + :meth:`_get_data_for_model` to obtain a single-treatment data view for + each ``d_col``, or bypass it for scenarios where all sub-models share the + same data (e.g., APOS-like treatment-level orchestration). + + Returns + ------- + list of DoubleMLScalar + One configured scalar model per element of ``self._dml_data.d_cols``. + """ + + # ==================== Protected Helpers ==================== + + def _get_data_for_model(self, d_col: str) -> DoubleMLData: + """ + Return a single-treatment :class:`~doubleml.data.DoubleMLData` for ``d_col``. + + Creates a new :class:`~doubleml.data.DoubleMLData` that **shares the + underlying DataFrame** (zero additional memory for array data). Other + treatment columns are appended to ``x_cols`` so that the + :class:`DoubleMLScalar` single-treatment check passes. 
+ + Override in subclasses for non-d_col scenarios. For example, an APOS-like + class would override this to return ``self._dml_data`` unchanged (each APO + scalar stores its treatment level internally). + + Parameters + ---------- + d_col : str + The treatment column to make active. + + Returns + ------- + DoubleMLData + A :class:`~doubleml.data.DoubleMLData` with ``d_cols=[d_col]`` + and all other treatment columns added to ``x_cols``. + """ + other_d_cols = [c for c in self._dml_data.d_cols if c != d_col] + x_cols = list(self._dml_data.x_cols) + other_d_cols + + return DoubleMLData( + data=self._dml_data.data, # Shared DataFrame — zero copy overhead + y_col=self._dml_data.y_col, + d_cols=d_col, + x_cols=x_cols, + z_cols=self._dml_data.z_cols, + cluster_cols=self._dml_data.cluster_cols, + use_other_treat_as_covariate=False, # Already handled above + force_all_x_finite=self._dml_data.force_all_x_finite, + force_all_d_finite=self._dml_data.force_all_d_finite, + ) + + def _reset_fit_state(self) -> None: + """Clear fit-dependent state when sample splitting changes.""" + self._framework = None + if self._modellist is not None: + for model in self._modellist: + model._reset_fit_state() + + def _propagate_splits_to_models(self) -> None: + """Push the vector's sample splits into each sub-model.""" + if self._modellist is None: + raise ValueError("Sub-models are not initialized. 
Call _initialize_models() in the subclass __init__.") + for model in self._modellist: + model._smpls = self._smpls + model._smpls_cluster = self._smpls_cluster + model._n_folds = self._n_folds + model._n_folds_per_cluster = self._n_folds_per_cluster + model._n_rep = self._n_rep + + def _fit_single_model( + self, + i_d: int, + n_jobs_cv: int | None, + ext_preds: dict[str, np.ndarray] | None, + ) -> DoubleMLScalar: + """Fit nuisance models and estimate causal parameters for one sub-model.""" + if self._modellist is None: + raise ValueError("Sub-models are not initialized.") + model = self._modellist[i_d] + model.fit(n_jobs_cv=n_jobs_cv, external_predictions=ext_preds) + return model + + # ==================== Sample Splitting ==================== + + def draw_sample_splitting(self, n_folds: int = 5, n_rep: int = 1) -> Self: + """ + Draw sample splitting for cross-fitting. + + Splits are drawn once for the vector and shared across all sub-models via + :meth:`_propagate_splits_to_models`. + + Parameters + ---------- + n_folds : int, optional + Number of folds. Default is ``5``. + n_rep : int, optional + Number of repetitions. Default is ``1``. + + Returns + ------- + self : Self + + Raises + ------ + ValueError + If ``n_folds < 2`` or ``n_rep < 1``. + """ + if not isinstance(n_folds, int) or n_folds < 2: + raise ValueError(f"n_folds must be an integer >= 2. Got {n_folds}.") + if not isinstance(n_rep, int) or n_rep < 1: + raise ValueError(f"n_rep must be an integer >= 1. Got {n_rep}.") + + resampler = DoubleMLResampling( + n_folds=n_folds, + n_rep=n_rep, + n_obs=self._n_obs, + ) + self._smpls = resampler.split_samples() + self._smpls_cluster = None + self._n_folds = n_folds + self._n_folds_per_cluster = None + self._n_rep = n_rep + + self._reset_fit_state() + return self + + def set_sample_splitting(self, all_smpls: list, all_smpls_cluster: list | None = None) -> Self: + """ + Set pre-computed sample splitting for all sub-models. 
+ + Parameters + ---------- + all_smpls : list + List of ``(train_ind, test_ind)`` tuples per fold, or a list of such + lists for repeated sample splitting. + all_smpls_cluster : list or None, optional + Nested list for cluster sample splitting. Default is ``None``. + + Returns + ------- + self : Self + + Raises + ------ + TypeError + If ``all_smpls`` is not a list. + ValueError + If the partition is invalid. + """ + if isinstance(all_smpls, tuple): + raise TypeError("all_smpls must be a list of folds; tuple shorthand is not supported for DoubleMLVector.") + if not isinstance(all_smpls, list): + raise TypeError(f"all_smpls must be of list type. " f"{str(all_smpls)} of type {str(type(all_smpls))} was passed.") + + smpls, smpls_cluster, n_rep, n_folds = _check_sample_splitting( + all_smpls, + all_smpls_cluster, + self._dml_data, + self._dml_data.is_cluster_data, + n_obs=self._n_obs, + ) + + self._smpls = smpls + self._smpls_cluster = smpls_cluster + self._n_rep = n_rep + self._n_folds = n_folds + self._n_folds_per_cluster = None + + self._reset_fit_state() + return self + + # ==================== Fit ==================== + + def fit( + self, + n_folds: int = 5, + n_rep: int = 1, + n_jobs_models: int | None = None, + n_jobs_cv: int | None = None, + external_predictions: dict[str, dict[str, np.ndarray]] | None = None, + **kwargs: Any, + ) -> Self: + """ + Estimate all sub-models and combine their results. + + Calls :meth:`draw_sample_splitting` (if not yet done), fits each scalar + sub-model (optionally in parallel via joblib), and concatenates their + :class:`~doubleml.DoubleMLFramework` objects into one unified result. + + Parameters + ---------- + n_folds : int, optional + Number of cross-fitting folds. Default is ``5``. + Only used if sample splitting has not been drawn yet. + n_rep : int, optional + Number of repetitions. Default is ``1``. + Only used if sample splitting has not been drawn yet. 
+ n_jobs_models : int or None, optional + Number of jobs for parallel sub-model fitting. ``None`` means + sequential. Default is ``None``. + n_jobs_cv : int or None, optional + Number of jobs for cross-validation inside each sub-model. + Default is ``None``. + external_predictions : dict or None, optional + Nested dictionary keyed by treatment column name. Each value is a dict + of external predictions passed to the corresponding sub-model's + :meth:`~doubleml.DoubleMLScalar.fit_nuisance_models`. + Default is ``None``. + + Returns + ------- + self : Self + """ + if self._smpls is None: + self.draw_sample_splitting(n_folds=n_folds, n_rep=n_rep) + + self._propagate_splits_to_models() + + fitted_models = Parallel(n_jobs=n_jobs_models, verbose=0, pre_dispatch="2*n_jobs")( + delayed(self._fit_single_model)( + i_d, + n_jobs_cv, + external_predictions.get(d_col) if external_predictions is not None else None, + ) + for i_d, d_col in enumerate(self._dml_data.d_cols) + ) + + self._modellist = list(fitted_models) + + # Concatenate scalar frameworks into one unified multi-treatment framework + self._framework = concat([m.framework for m in self._modellist]) + self._framework.treatment_names = list(self._dml_data.d_cols) + + return self + + # ==================== Learner Access ==================== + + def get_params(self, learner_name: str) -> list[dict]: + """ + Get parameters of a learner across all sub-models. + + Parameters + ---------- + learner_name : str + Name of the learner. + + Returns + ------- + list of dict + One parameter dict per sub-model, in ``d_cols`` order. + """ + if self._modellist is None: + raise ValueError("Sub-models are not initialized. Call _initialize_models() in the subclass __init__.") + return [model.get_params(learner_name) for model in self._modellist] + + def set_params(self, learner_name: str, **params: object) -> Self: + """ + Set parameters of a learner on all sub-models. 
+ + Parameters + ---------- + learner_name : str + Name of the learner. + **params + Parameters to set on each sub-model's learner. + + Returns + ------- + self : Self + """ + if self._modellist is None: + raise ValueError("Sub-models are not initialized. Call _initialize_models() in the subclass __init__.") + for model in self._modellist: + model.set_params(learner_name, **params) + return self + + # ==================== Hyperparameter Tuning ==================== + + def tune_ml_models( + self, + ml_param_space: dict, + scoring_methods: dict | None = None, + cv: int = 5, + set_as_params: bool = True, + return_tune_res: bool = False, + optuna_settings: dict | None = None, + ) -> "Self | list[dict[str, DMLOptunaResult]]": + """Hyperparameter-tuning for DoubleML models using Optuna.""" + if self._modellist is None: + raise ValueError("Sub-models are not initialized. Call _initialize_models() in the subclass __init__.") + tuning_kwargs: dict[str, Any] = { + "ml_param_space": ml_param_space, + "scoring_methods": scoring_methods, + "cv": cv, + "set_as_params": set_as_params, + "return_tune_res": return_tune_res, + "optuna_settings": optuna_settings, + } + + tune_res: list = [] + for model in self._modellist: + res = model.tune_ml_models(**tuning_kwargs) + if return_tune_res: + tune_res.append(res) + + return tune_res if return_tune_res else self + + tune_ml_models.__doc__ = TUNE_ML_MODELS_DOC + + # ==================== Sensitivity ==================== + + def sensitivity_plot( + self, + idx_treatment: int = 0, + value: str = "theta", + rho: float = 1.0, + level: float = 0.95, + null_hypothesis: float = 0.0, + include_scenario: bool = True, + benchmarks: dict | None = None, + fill: bool = True, + grid_bounds: tuple[float, float] = (0.15, 0.15), + grid_size: int = 100, + ) -> object: + """ + Contour plot of the sensitivity with respect to latent/confounding variables. + + Parameters + ---------- + idx_treatment : int, optional + Index of the treatment parameter to plot. 
Default is ``0``. + value : str, optional + Contour value: ``'theta'`` for bounds, ``'ci'`` for bounds including + statistical uncertainty. Default is ``'theta'``. + rho : float, optional + Correlation between confounders in the main regression and Riesz + representer. Default is ``1.0``. + level : float, optional + The confidence level. Default is ``0.95``. + null_hypothesis : float, optional + Null hypothesis for the direction of contour lines. Default is ``0.0``. + include_scenario : bool, optional + Whether to highlight the last :meth:`sensitivity_analysis` scenario. + Default is ``True``. + benchmarks : dict or None, optional + Benchmark dictionary with keys ``'cf_y'``, ``'cf_d'``, ``'name'``. + Default is ``None``. + fill : bool, optional + Heatmap style (``True``) vs. contour lines only (``False``). + Default is ``True``. + grid_bounds : tuple of float, optional + Evaluation bounds ``(cf_d_max, cf_y_max)`` in ``[0, 1)``. + Default is ``(0.15, 0.15)``. + grid_size : int, optional + Number of grid evaluation points. Default is ``100``. + + Returns + ------- + fig : plotly figure + Plotly figure of the sensitivity contours. + + Raises + ------ + ValueError + If :meth:`fit` has not been called yet. + """ + if self._framework is None: + raise ValueError("Apply fit() before sensitivity_plot().") + return self._framework.sensitivity_plot( + idx_treatment=idx_treatment, + value=value, + rho=rho, + level=level, + null_hypothesis=null_hypothesis, + include_scenario=include_scenario, + benchmarks=benchmarks, + fill=fill, + grid_bounds=grid_bounds, + grid_size=grid_size, + ) + + def sensitivity_benchmark(self, benchmarking_set: list[str], fit_args: dict | None = None) -> pd.DataFrame: + """ + Compute a benchmark for a given set of features. + + Refits a short-form model excluding ``benchmarking_set`` from ``x_cols`` + and computes gain statistics comparing long and short forms. + + Parameters + ---------- + benchmarking_set : list of str + Feature names to benchmark. 
Must be a non-empty subset of ``x_cols``. + fit_args : dict or None, optional + Additional keyword arguments passed to :meth:`fit` when refitting the + short-form model. Default is ``None``. + + Returns + ------- + pd.DataFrame + Benchmark results indexed by treatment column names with columns + ``'cf_y'``, ``'cf_d'``, ``'rho'``, and ``'delta_theta'``. + + Raises + ------ + NotImplementedError + If sensitivity analysis is not available for this model. + TypeError + If ``benchmarking_set`` or ``fit_args`` have the wrong type. + ValueError + If ``benchmarking_set`` is empty or not a subset of ``x_cols``. + """ + if self._framework is None: + raise ValueError("Apply fit() before sensitivity_benchmark().") + + x_list_long = self._dml_data.x_cols + + if self.sensitivity_elements is None: + raise NotImplementedError(f"Sensitivity analysis not yet implemented for {self.__class__.__name__}.") + if not isinstance(benchmarking_set, list): + raise TypeError( + f"benchmarking_set must be a list. " f"{str(benchmarking_set)} of type {type(benchmarking_set)} was passed." + ) + if len(benchmarking_set) == 0: + raise ValueError("benchmarking_set must not be empty.") + if not set(benchmarking_set) <= set(x_list_long): + raise ValueError( + f"benchmarking_set must be a subset of features {str(self._dml_data.x_cols)}. " + f"{str(benchmarking_set)} was passed." + ) + if fit_args is not None and not isinstance(fit_args, dict): + raise TypeError(f"fit_args must be a dict. {str(fit_args)} of type {type(fit_args)} was passed.") + + x_list_short = [x for x in x_list_long if x not in benchmarking_set] + dml_short = copy.deepcopy(self) + dml_short._dml_data.x_cols = x_list_short + # Sub-models each hold their own DoubleMLData — rebuild them from the updated _dml_data + # so that the short-form model actually uses the reduced feature set. 
+ dml_short._modellist = dml_short._initialize_models() + dml_short._framework = None + + if fit_args is not None: + dml_short.fit(**fit_args) + else: + dml_short.fit() + + benchmark_dict = gain_statistics(dml_long=self, dml_short=dml_short) + df_benchmark = pd.DataFrame(benchmark_dict, index=self._dml_data.d_cols) + return df_benchmark + + # ==================== String Representation ==================== + + def __str__(self) -> str: + """ + String representation of the DoubleMLVector object. + + Returns + ------- + str + A formatted string summary of the model. + """ + class_name = self.__class__.__name__ + header = f"{'=' * 20} {class_name} Object {'=' * 20}" + + info = f"Score function: {self.score}\n" + if self._n_folds is not None: + info += f"Resampling: {self._n_folds}-fold CV, {self._n_rep} repetitions\n" + info += f"Treatments: {list(self._dml_data.d_cols)}\n" + + if self._framework is not None: + return f"{header}\n\n{info}\n{str(self.summary)}" + else: + return f"{header}\n\n{info}\nModel not yet fitted. Call fit() first." 
From 17cf8f30b93907a9af90d5110a3c3730b6f503e5 Mon Sep 17 00:00:00 2001 From: SvenKlaassen Date: Wed, 25 Mar 2026 08:03:24 +0100 Subject: [PATCH 23/38] Add branch status and TODOs documentation for sk-refactoring --- .claude/CLAUDE.md | 2 ++ .claude/STATUS.md | 74 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 76 insertions(+) create mode 100644 .claude/STATUS.md diff --git a/.claude/CLAUDE.md b/.claude/CLAUDE.md index 0dc51dca..69f4d98b 100644 --- a/.claude/CLAUDE.md +++ b/.claude/CLAUDE.md @@ -8,6 +8,8 @@ DoubleML is a Python package implementing Double/Debiased Machine Learning (DML) **Docs**: https://docs.doubleml.org | **Source**: https://github.com/DoubleML/doubleml-for-py +**Branch status & TODOs**: `.claude/STATUS.md` + ## Architecture ### Class Hierarchy diff --git a/.claude/STATUS.md b/.claude/STATUS.md new file mode 100644 index 00000000..ac0d5881 --- /dev/null +++ b/.claude/STATUS.md @@ -0,0 +1,74 @@ +# Branch Status & TODOs + +> Tracked in git so it syncs across machines. Update this file as work progresses. +> Reference: `CLAUDE.md` loads this automatically via the line below. + +--- + +## Branch: `sk-refactoring` + +**Goal**: Introduce a new `DoubleMLScalar` / `DoubleMLVector` hierarchy alongside +the existing `DoubleML` API — cleaner design, better testability, explicit tuning, +nuisance evaluation, and sensitivity analysis. 
+ +### Completed + +- [x] **Claude tooling** — `.claude/` dir, `CLAUDE.md`, `rules/`, `agents/`, `skills/` +- [x] **Architecture docs** — `doc/diagrams/architecture.md`, `doc/diagrams/testing_structure.md` +- [x] **`DoubleMLBase`** — abstract base with shared properties (`coef`, `se`, `summary`) and inference delegation (`doubleml/double_ml_base.py`) +- [x] **`LinearScoreMixin`** — closed-form θ = −E[ψ_b]/E[ψ_a] solver (`doubleml/double_ml_linear_score.py`) +- [x] **`DoubleMLScalar`** — single-parameter orchestrator (`doubleml/double_ml_scalar.py`) with: + - `fit()` → `draw_sample_splitting()` + `fit_nuisance_models()` + `estimate_causal_parameters()` + - `tune_ml_models()` via Optuna (`_LEARNER_PARAM_ALIASES`, `_get_tuning_data()` hook) + - `nuisance_targets`, `nuisance_loss`, `evaluate_learners()` + - `_sensitivity_element_est()` hook + full sensitivity analysis pipeline +- [x] **`DoubleMLPLRScalar`** — PLR scalar (`doubleml/plm/plr_scalar.py`) with all 7 test files: + - `test_plr_scalar.py`, `_return_types`, `_exceptions`, `_vs_plr`, `_external_predictions`, `_tune_ml_models`, `_evaluate_learners`, `_sensitivity` +- [x] **`DoubleMLIRMScalar`** — IRM scalar (`doubleml/irm/irm_scalar.py`) with all 7 test files (same structure) +- [x] **`DoubleMLVector`** — multi-treatment base class first iteration (`doubleml/double_ml_vector.py`) +- [x] **BLP multi-rep support** — `doubleml/utils/blp.py` + +### In Progress + +- [ ] **`DoubleMLVector`** — base class exists; no concrete subclass yet + +### Feature Gaps vs Legacy Classes + +Missing from `PLR` / `IRM` scalar compared to `DoubleMLPLR` / `DoubleMLIRM`: + +| Feature | Legacy location | Applies to | Notes | +|---------|----------------|-----------|-------| +| `cate()` | `plr.py:447`, `irm.py:564` | both | Depends on BLP (multi-rep already done) | +| `gate()` | `plr.py:485`, `irm.py:598` | both | Delegates to `cate()` | +| `_partial_out()` | `plr.py:522` | PLR only | Helper needed by PLR `cate()`/`gate()` | +| 
`policy_tree()` | `irm.py:635` | IRM only | Not planned yet | + +Weighted effects in IRM (`weights` dict form): +- Array weights: ✅ supported +- Dict weights with `weights_bar`: ⚠️ **gap** — `_check_weights()` called at init with `n_rep=1` (`utils/_checks.py:276`) but `n_rep` is only determined at `draw_sample_splitting()`. Dict weights with `weights_bar.shape == (n_obs, n_rep > 1)` fail validation incorrectly. + +Intentionally **not ported**: +- Callable score — design decision +- `trimming_rule` / `trimming_threshold` deprecated props — use `ps_processor_config` + +### Planned + +| Item | Files | Notes | +|------|-------|-------| +| `cate()` + `gate()` for PLR scalar | `doubleml/plm/plr_scalar.py` | Needs `_partial_out()` first | +| `cate()` + `gate()` for IRM scalar | `doubleml/irm/irm_scalar.py` | | +| Fix dict `weights_bar` validation for multi-rep | `doubleml/irm/irm_scalar.py` | Defer n_rep shape check to `fit()` | +| `DoubleMLPLRVector` | `doubleml/plm/plr_vector.py` + tests | First concrete Vector subclass | +| `DoubleMLPLIVScalar` | `doubleml/plm/pliv_scalar.py` + 7 test files | Next scalar model | +| `DoubleMLPLPRScalar` | `doubleml/plm/plpr_scalar.py` + 7 test files | | +| DID scalar variants | `doubleml/did/*_scalar.py` | DID, DIDCSBinary, DIDMulti | +| `DoubleMLVector` tests | `doubleml/tests/test_vector_*.py` | Base class tests | + +--- + +## How to Update This File + +- Mark items `[x]` when complete +- Move items between sections as work progresses +- Add new planned items as they are identified +- Commit this file with the relevant code changes so the status stays in sync From 3818c2b86703dcae861e99497e143f9dc84c4931 Mon Sep 17 00:00:00 2001 From: SvenKlaassen Date: Wed, 25 Mar 2026 10:00:54 +0100 Subject: [PATCH 24/38] Refactor weight handling in IRM and add comprehensive exception tests for weights --- doubleml/irm/irm_scalar.py | 10 +- .../irm/tests/test_irm_scalar_exceptions.py | 86 +++++++++++++ .../tests/test_irm_scalar_weighted_scores.py | 
113 ++++++++++++++++++ doubleml/utils/_checks.py | 17 ++- 4 files changed, 217 insertions(+), 9 deletions(-) create mode 100644 doubleml/irm/tests/test_irm_scalar_weighted_scores.py diff --git a/doubleml/irm/irm_scalar.py b/doubleml/irm/irm_scalar.py index f6fe85de..44e62e96 100644 --- a/doubleml/irm/irm_scalar.py +++ b/doubleml/irm/irm_scalar.py @@ -147,8 +147,8 @@ def __init__( self._ps_processor_config = PSProcessorConfig() self._ps_processor = PSProcessor.from_config(self._ps_processor_config) - # Weights - _check_weights(weights, score, obj_dml_data.n_obs, n_rep=1) + # Weights — n_rep shape deferred to _get_weights() when n_rep is known + _check_weights(weights, score, obj_dml_data.n_obs) self._initialize_weights(weights) # Set learners if provided @@ -460,8 +460,12 @@ def _get_weights(self, m_hat: np.ndarray) -> tuple[np.ndarray, np.ndarray]: w = self._weights["weights"] weights = w[:, np.newaxis] * np.ones((1, self.n_rep)) # (n_obs, n_rep) if "weights_bar" in self._weights: - # weights_bar has shape (n_obs, n_rep) already weights_bar = self._weights["weights_bar"] + if weights_bar.shape != (self.n_obs, self.n_rep): + raise ValueError( + f"weights_bar must have shape ({self.n_obs}, {self.n_rep}). " + f"weights_bar of shape {weights_bar.shape} was passed." + ) else: weights_bar = weights.copy() else: diff --git a/doubleml/irm/tests/test_irm_scalar_exceptions.py b/doubleml/irm/tests/test_irm_scalar_exceptions.py index 2fe5fd35..3f0ac0bb 100644 --- a/doubleml/irm/tests/test_irm_scalar_exceptions.py +++ b/doubleml/irm/tests/test_irm_scalar_exceptions.py @@ -165,6 +165,92 @@ def test_irm_scalar_exception_binary_predictions_g(): dml_obj.fit_nuisance_models() +# ==================== weights exceptions ==================== + +_N_OBS = obj_dml_data.n_obs + + +@pytest.mark.ci +def test_exception_weights_wrong_type(): + """weights of non-array, non-dict type raises TypeError.""" + msg = r"weights must be a numpy array or dictionary\." 
+ with pytest.raises(TypeError, match=msg): + IRM(obj_dml_data, weights="not_an_array") + + +@pytest.mark.ci +def test_exception_weights_wrong_shape(): + """1D weights array with wrong length raises ValueError.""" + msg = r"weights must have shape" + with pytest.raises(ValueError, match=msg): + IRM(obj_dml_data, weights=np.ones(_N_OBS + 1)) + + +@pytest.mark.ci +def test_exception_weights_negative(): + """weights array with a negative value raises ValueError.""" + w = np.ones(_N_OBS) + w[0] = -1.0 + msg = r"All weights values must be greater or equal 0\." + with pytest.raises(ValueError, match=msg): + IRM(obj_dml_data, weights=w) + + +@pytest.mark.ci +def test_exception_weights_atte_not_array(): + """dict weights with score='ATTE' raises TypeError.""" + dict_weights = {"weights": np.ones(_N_OBS), "weights_bar": np.ones((_N_OBS, 1))} + msg = r"weights must be a numpy array for ATTE score\." + with pytest.raises(TypeError, match=msg): + IRM(obj_dml_data, score="ATTE", weights=dict_weights) + + +@pytest.mark.ci +def test_exception_weights_atte_not_binary(): + """Non-binary array weights with score='ATTE' raises ValueError.""" + w = np.full(_N_OBS, 0.5) + msg = r"weights must be binary for ATTE score\." 
+ with pytest.raises(ValueError, match=msg): + IRM(obj_dml_data, score="ATTE", weights=w) + + +@pytest.mark.ci +def test_exception_dict_weights_wrong_keys(): + """Dict weights with unexpected keys raises ValueError.""" + bad_dict = {"weights": np.ones(_N_OBS), "wrong_key": np.ones((_N_OBS, 1))} + msg = r"weights must have keys" + with pytest.raises(ValueError, match=msg): + IRM(obj_dml_data, weights=bad_dict) + + +@pytest.mark.ci +def test_exception_dict_weights_bar_wrong_n_obs(): + """Dict weights_bar with wrong number of rows raises ValueError at init.""" + dict_weights = { + "weights": np.ones(_N_OBS), + "weights_bar": np.ones((_N_OBS + 1, 1)), + } + msg = r"weights_bar must be a 2-dimensional array with" + with pytest.raises(ValueError, match=msg): + IRM(obj_dml_data, weights=dict_weights) + + +@pytest.mark.ci +def test_exception_dict_weights_bar_wrong_n_rep(): + """Dict weights_bar with wrong n_rep column raises ValueError at estimate time.""" + # weights_bar has 2 columns but n_rep=3 is used; mismatch detected in estimate_causal_parameters() + dict_weights = { + "weights": np.ones(_N_OBS), + "weights_bar": np.ones((_N_OBS, 2)), + } + dml_obj = IRM(obj_dml_data, ml_g=ml_g, ml_m=ml_m, weights=dict_weights) + dml_obj.draw_sample_splitting(n_folds=2, n_rep=3) + dml_obj.fit_nuisance_models() + msg = r"weights_bar must have shape" + with pytest.raises(ValueError, match=msg): + dml_obj.estimate_causal_parameters() + + # ==================== sensitivity_analysis exceptions ==================== diff --git a/doubleml/irm/tests/test_irm_scalar_weighted_scores.py b/doubleml/irm/tests/test_irm_scalar_weighted_scores.py new file mode 100644 index 00000000..ee2586ae --- /dev/null +++ b/doubleml/irm/tests/test_irm_scalar_weighted_scores.py @@ -0,0 +1,113 @@ +"""Test weighted score computation for IRM scalar, including dict weights with n_rep > 1. 
+ +With constant dict weights c * ones (weights = c, weights_bar = c): + psi_a = -weights / mean(weights) = -c/c = -1 (same as unweighted) + psi_b = c * psi_b_unweighted + theta = -mean(psi_b) / mean(psi_a) = c * theta_unweighted + se = c * se_unweighted (psi scales by c, psi_a unchanged) +""" + +import numpy as np +import pytest +from sklearn.base import clone +from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor + +from doubleml.irm.datasets import make_irm_data +from doubleml.irm.irm_scalar import IRM + +_N_FOLDS = 5 +_N_OBS = 500 +_DIM_X = 10 +_WEIGHT_CONST = 0.5 + +ml_g = RandomForestRegressor(n_estimators=10, max_depth=3, random_state=42) +ml_m = RandomForestClassifier(n_estimators=10, max_depth=3, random_state=42) + + +@pytest.fixture(scope="module", params=[1, 3]) +def n_rep(request): + """Number of repetitions — covers single- and multi-rep cases.""" + return request.param + + +@pytest.fixture(scope="module") +def irm_data(): + """Shared IRM dataset.""" + np.random.seed(42) + return make_irm_data(theta=0.5, n_obs=_N_OBS, dim_x=_DIM_X, return_type="DoubleMLData") + + +@pytest.fixture(scope="module") +def constant_weights_fixture(irm_data, n_rep): + """Pair of IRM scalar models sharing sample splits: unweighted and constant-0.5-weighted. 
+ + With weights = weights_bar = 0.5 * ones: + theta_weighted = 0.5 * theta_unweighted + se_weighted = 0.5 * se_unweighted + """ + n_obs = irm_data.n_obs + const_weights = { + "weights": np.full(n_obs, _WEIGHT_CONST), + "weights_bar": np.full((n_obs, n_rep), _WEIGHT_CONST), + } + + # Unweighted reference + dml_ref = IRM(irm_data, score="ATE") + dml_ref.set_learners(ml_g=clone(ml_g), ml_m=clone(ml_m)) + dml_ref.draw_sample_splitting(n_folds=_N_FOLDS, n_rep=n_rep) + dml_ref.fit_nuisance_models() + dml_ref.estimate_causal_parameters() + + # Constant-weighted — share exact sample splits for identical nuisance predictions + dml_weighted = IRM(irm_data, score="ATE", weights=const_weights) + dml_weighted.set_learners(ml_g=clone(ml_g), ml_m=clone(ml_m)) + dml_weighted._n_folds = _N_FOLDS + dml_weighted._n_rep = n_rep + dml_weighted._smpls = dml_ref.smpls + dml_weighted.fit_nuisance_models() + dml_weighted.estimate_causal_parameters() + + return {"ref": dml_ref, "weighted": dml_weighted} + + +@pytest.mark.ci +def test_dict_weights_n_rep_gt1_succeeds(): + """IRM scalar with weights_bar shape (n_obs, 3) and n_rep=3 fits without error.""" + np.random.seed(42) + obj_dml_data = make_irm_data(theta=0.5, n_obs=200, dim_x=5, return_type="DoubleMLData") + n_obs = obj_dml_data.n_obs + n_rep = 3 + + dict_weights = { + "weights": np.full(n_obs, _WEIGHT_CONST), + "weights_bar": np.full((n_obs, n_rep), _WEIGHT_CONST), + } + dml_obj = IRM(obj_dml_data, score="ATE", weights=dict_weights) + dml_obj.set_learners(ml_g=clone(ml_g), ml_m=clone(ml_m)) + dml_obj.draw_sample_splitting(n_folds=3, n_rep=n_rep) + dml_obj.fit_nuisance_models() + dml_obj.estimate_causal_parameters() + + +@pytest.mark.ci +def test_constant_weights_coef(constant_weights_fixture): + """theta (coef) with constant weights c equals c * theta_unweighted.""" + np.testing.assert_allclose( + constant_weights_fixture["weighted"].coef, + _WEIGHT_CONST * constant_weights_fixture["ref"].coef, + rtol=1e-9, + ) + + +@pytest.mark.ci 
+def test_constant_weights_se(constant_weights_fixture): + """se with constant weights c equals c * se_unweighted. + + psi_weighted = c * psi_unweighted, psi_a unchanged (-1), so + se_weighted = sqrt(mean(c² * psi²)) / sqrt(n) = c * se_unweighted. + """ + np.testing.assert_allclose( + constant_weights_fixture["weighted"].se, + _WEIGHT_CONST * constant_weights_fixture["ref"].se, + rtol=1e-9, + ) diff --git a/doubleml/utils/_checks.py b/doubleml/utils/_checks.py index ad493e28..f6470d13 100644 --- a/doubleml/utils/_checks.py +++ b/doubleml/utils/_checks.py @@ -240,7 +240,7 @@ def _check_benchmarks(benchmarks): return -def _check_weights(weights, score, n_obs, n_rep): +def _check_weights(weights, score, n_obs, n_rep: int | None = None): if weights is not None: # check general type if (not isinstance(weights, np.ndarray)) and (not isinstance(weights, dict)): @@ -273,14 +273,19 @@ def _check_weights(weights, score, n_obs, n_rep): if not set(weights.keys()) == set(expected_keys): raise ValueError(f"weights must have keys {expected_keys}. keys {str(weights.keys())} were passed.") - expected_shapes = [(n_obs,), (n_obs, n_rep)] - if weights["weights"].shape != expected_shapes[0]: + if weights["weights"].shape != (n_obs,): raise ValueError( - f"weights must have shape {expected_shapes[0]}. weights of shape {weights['weights'].shape} was passed." + f"weights must have shape ({n_obs},). weights of shape {weights['weights'].shape} was passed." ) - if weights["weights_bar"].shape != expected_shapes[1]: + # weights_bar must be 2D with n_obs rows; the n_rep column is validated later when n_rep is known + if weights["weights_bar"].ndim != 2 or weights["weights_bar"].shape[0] != n_obs: raise ValueError( - f"weights_bar must have shape {expected_shapes[1]}. " + f"weights_bar must be a 2-dimensional array with {n_obs} rows. " + f"weights_bar of shape {weights['weights_bar'].shape} was passed." 
+ ) + if n_rep is not None and weights["weights_bar"].shape[1] != n_rep: + raise ValueError( + f"weights_bar must have shape ({n_obs}, {n_rep}). " f"weights_bar of shape {weights['weights_bar'].shape} was passed." ) if (not np.all(weights["weights"] >= 0)) or (not np.all(weights["weights_bar"] >= 0)): From 82d95a5cb3d8c2793458b1e74c2b2e797e4a0297 Mon Sep 17 00:00:00 2001 From: SvenKlaassen Date: Sat, 9 May 2026 08:55:55 +0200 Subject: [PATCH 25/38] refactor: enhance validation for weights_bar in IRM and update fit handling in DoubleMLScalar --- .claude/STATUS.md | 3 +- doubleml/double_ml_scalar.py | 60 +++++++++--- doubleml/irm/irm_scalar.py | 15 ++- .../irm/tests/test_irm_scalar_exceptions.py | 49 +++++++++- doubleml/tests/test_scalar_fit.py | 93 +++++++++++++++++++ 5 files changed, 197 insertions(+), 23 deletions(-) create mode 100644 doubleml/tests/test_scalar_fit.py diff --git a/.claude/STATUS.md b/.claude/STATUS.md index ac0d5881..c3a73ecb 100644 --- a/.claude/STATUS.md +++ b/.claude/STATUS.md @@ -45,7 +45,7 @@ Missing from `PLR` / `IRM` scalar compared to `DoubleMLPLR` / `DoubleMLIRM`: Weighted effects in IRM (`weights` dict form): - Array weights: ✅ supported -- Dict weights with `weights_bar`: ⚠️ **gap** — `_check_weights()` called at init with `n_rep=1` (`utils/_checks.py:276`) but `n_rep` is only determined at `draw_sample_splitting()`. Dict weights with `weights_bar.shape == (n_obs, n_rep > 1)` fail validation incorrectly. +- Dict weights with `weights_bar`: ✅ supported — init defers the `n_rep` column check; `DoubleMLScalar._check_smpls_dependent_inputs()` hook validates `weights_bar.shape == (n_obs, n_rep)` from inside both `draw_sample_splitting()` and `set_sample_splitting()`. `fit(n_folds=..., n_rep=...)` re-draws splits with a `UserWarning` when args conflict with existing splits. 
Intentionally **not ported**: - Callable score — design decision @@ -57,7 +57,6 @@ Intentionally **not ported**: |------|-------|-------| | `cate()` + `gate()` for PLR scalar | `doubleml/plm/plr_scalar.py` | Needs `_partial_out()` first | | `cate()` + `gate()` for IRM scalar | `doubleml/irm/irm_scalar.py` | | -| Fix dict `weights_bar` validation for multi-rep | `doubleml/irm/irm_scalar.py` | Defer n_rep shape check to `fit()` | | `DoubleMLPLRVector` | `doubleml/plm/plr_vector.py` + tests | First concrete Vector subclass | | `DoubleMLPLIVScalar` | `doubleml/plm/pliv_scalar.py` + 7 test files | Next scalar model | | `DoubleMLPLPRScalar` | `doubleml/plm/plpr_scalar.py` + 7 test files | | diff --git a/doubleml/double_ml_scalar.py b/doubleml/double_ml_scalar.py index d10c1c49..d460b824 100644 --- a/doubleml/double_ml_scalar.py +++ b/doubleml/double_ml_scalar.py @@ -2,6 +2,7 @@ Abstract base class for scalar DoubleML models (single parameter estimation). """ +import warnings from abc import ABC, abstractmethod from typing import TYPE_CHECKING, Any, Callable, ClassVar, Self @@ -417,8 +418,8 @@ def set_learners(self, **kwargs: object) -> Self: def fit( self, - n_folds: int = 5, - n_rep: int = 1, + n_folds: int | None = None, + n_rep: int | None = None, n_jobs_cv: int | None = None, external_predictions: dict[str, np.ndarray] | None = None, **kwargs, @@ -431,12 +432,18 @@ def fit( Parameters ---------- - n_folds : int, optional - Number of folds for cross-fitting. Default is 5. - Only used if sample splitting has not been drawn yet. - n_rep : int, optional - Number of repetitions for sample splitting. Default is 1. - Only used if sample splitting has not been drawn yet. + n_folds : int or None, optional + Number of folds for cross-fitting. If sample splitting has not been + drawn yet, defaults to 5. 
If sample splitting already exists and + ``n_folds`` differs from ``self.n_folds``, the splits are re-drawn + (discarding existing splits and fit state) and a :class:`UserWarning` + is emitted. Default is ``None``. + n_rep : int or None, optional + Number of repetitions for sample splitting. If sample splitting has + not been drawn yet, defaults to 1. If sample splitting already exists + and ``n_rep`` differs from ``self.n_rep``, the splits are re-drawn + (discarding existing splits and fit state) and a :class:`UserWarning` + is emitted. Default is ``None``. n_jobs_cv : int, optional Number of jobs for parallel processing during cross-validation. Currently not used (reserved for future parallelization). @@ -454,7 +461,26 @@ def fit( The fitted estimator. """ if self._smpls is None: - self.draw_sample_splitting(n_folds=n_folds, n_rep=n_rep) + self.draw_sample_splitting( + n_folds=5 if n_folds is None else n_folds, + n_rep=1 if n_rep is None else n_rep, + ) + else: + current_n_folds = self.n_folds + current_n_rep = self.n_rep + n_folds_conflict = n_folds is not None and n_folds != current_n_folds + n_rep_conflict = n_rep is not None and n_rep != current_n_rep + if n_folds_conflict or n_rep_conflict: + new_n_folds = n_folds if (n_folds is not None and n_folds_conflict) else current_n_folds + new_n_rep = n_rep if (n_rep is not None and n_rep_conflict) else current_n_rep + warnings.warn( + f"Re-drawing sample splitting (was n_folds={current_n_folds}, n_rep={current_n_rep}; " + f"now n_folds={new_n_folds}, n_rep={new_n_rep}). 
" + "Existing splits and fit state are discarded.", + UserWarning, + stacklevel=2, + ) + self.draw_sample_splitting(n_folds=new_n_folds, n_rep=new_n_rep) self.fit_nuisance_models(n_jobs_cv=n_jobs_cv, external_predictions=external_predictions) self.estimate_causal_parameters() return self @@ -639,6 +665,7 @@ def draw_sample_splitting(self, n_folds: int = 5, n_rep: int = 1) -> Self: self._smpls_cluster = None self._reset_fit_state() + self._check_smpls_dependent_inputs() return self @@ -696,6 +723,7 @@ def set_sample_splitting(self, all_smpls: list, all_smpls_cluster: list | None = self._n_folds_per_cluster = None self._reset_fit_state() + self._check_smpls_dependent_inputs() return self @@ -853,6 +881,18 @@ def _reset_fit_state(self) -> None: self._i_rep = None self._i_fold = None + def _check_smpls_dependent_inputs(self) -> None: + """ + Validate inputs whose shape depends on ``n_rep``. + + Called by :meth:`draw_sample_splitting` and :meth:`set_sample_splitting` + after ``self._n_rep`` and ``self._smpls`` have been set and fit state has + been reset. Subclasses override this hook to validate user-supplied + objects whose shape only becomes meaningful once ``n_rep`` is known + (e.g. ``weights_bar`` in IRM). The default implementation is a no-op. 
+ """ + return + def evaluate_learners( self, learners: list[str] | None = None, @@ -975,8 +1015,6 @@ def _sensitivity_element_est(self) -> dict[str, np.ndarray] | None: def _validate_sensitivity_elements(self) -> None: """Re-estimate nu2 from riesz representer if nu2 is non-positive (degenerate PS).""" - import warnings - if self._sensitivity_elements is None: return nu2 = self._sensitivity_elements["nu2"] # (1, 1, n_rep) diff --git a/doubleml/irm/irm_scalar.py b/doubleml/irm/irm_scalar.py index 44e62e96..58f94c95 100644 --- a/doubleml/irm/irm_scalar.py +++ b/doubleml/irm/irm_scalar.py @@ -461,11 +461,6 @@ def _get_weights(self, m_hat: np.ndarray) -> tuple[np.ndarray, np.ndarray]: weights = w[:, np.newaxis] * np.ones((1, self.n_rep)) # (n_obs, n_rep) if "weights_bar" in self._weights: weights_bar = self._weights["weights_bar"] - if weights_bar.shape != (self.n_obs, self.n_rep): - raise ValueError( - f"weights_bar must have shape ({self.n_obs}, {self.n_rep}). " - f"weights_bar of shape {weights_bar.shape} was passed." - ) else: weights_bar = weights.copy() else: @@ -480,6 +475,16 @@ def _get_weights(self, m_hat: np.ndarray) -> tuple[np.ndarray, np.ndarray]: return weights, weights_bar + def _check_smpls_dependent_inputs(self) -> None: + """Validate ``weights_bar`` shape now that ``n_rep`` is known.""" + if "weights_bar" in self._weights: + weights_bar = self._weights["weights_bar"] + expected = (self.n_obs, self.n_rep) + if weights_bar.shape != expected: + raise ValueError( + f"weights_bar must have shape {expected}. " f"weights_bar of shape {weights_bar.shape} was passed." + ) + def _sensitivity_element_est(self) -> dict[str, np.ndarray] | None: """ Compute IRM sensitivity elements vectorized over all repetitions. 
diff --git a/doubleml/irm/tests/test_irm_scalar_exceptions.py b/doubleml/irm/tests/test_irm_scalar_exceptions.py index 3f0ac0bb..a1db7598 100644 --- a/doubleml/irm/tests/test_irm_scalar_exceptions.py +++ b/doubleml/irm/tests/test_irm_scalar_exceptions.py @@ -237,18 +237,57 @@ def test_exception_dict_weights_bar_wrong_n_obs(): @pytest.mark.ci def test_exception_dict_weights_bar_wrong_n_rep(): - """Dict weights_bar with wrong n_rep column raises ValueError at estimate time.""" - # weights_bar has 2 columns but n_rep=3 is used; mismatch detected in estimate_causal_parameters() + """Dict weights_bar with wrong n_rep column raises ValueError at draw_sample_splitting().""" + # weights_bar has 2 columns but n_rep=3 is requested; the mismatch is detected as + # soon as n_rep becomes known (i.e. inside draw_sample_splitting), before any nuisance fitting. dict_weights = { "weights": np.ones(_N_OBS), "weights_bar": np.ones((_N_OBS, 2)), } dml_obj = IRM(obj_dml_data, ml_g=ml_g, ml_m=ml_m, weights=dict_weights) - dml_obj.draw_sample_splitting(n_folds=2, n_rep=3) - dml_obj.fit_nuisance_models() msg = r"weights_bar must have shape" with pytest.raises(ValueError, match=msg): - dml_obj.estimate_causal_parameters() + dml_obj.draw_sample_splitting(n_folds=2, n_rep=3) + + +@pytest.mark.ci +def test_exception_dict_weights_bar_wrong_n_rep_via_set_sample_splitting(): + """Dict weights_bar mismatch is also caught when splits arrive via set_sample_splitting().""" + dict_weights = { + "weights": np.ones(_N_OBS), + "weights_bar": np.ones((_N_OBS, 1)), + } + dml_obj = IRM(obj_dml_data, ml_g=ml_g, ml_m=ml_m, weights=dict_weights) + + # Build a manually constructed sample splitting list with n_rep=2. 
+ rng = np.random.default_rng(0) + indices = np.arange(_N_OBS) + all_smpls = [] + for _ in range(2): + perm = rng.permutation(indices) + fold_size = _N_OBS // 2 + test1, test2 = perm[:fold_size], perm[fold_size:] + train1 = np.setdiff1d(indices, test1) + train2 = np.setdiff1d(indices, test2) + all_smpls.append([(train1, test1), (train2, test2)]) + + msg = r"weights_bar must have shape" + with pytest.raises(ValueError, match=msg): + dml_obj.set_sample_splitting(all_smpls) + + +@pytest.mark.ci +def test_exception_dict_weights_bar_after_redraw(): + """Re-drawing splits with a different n_rep re-runs the weights_bar check.""" + dict_weights = { + "weights": np.ones(_N_OBS), + "weights_bar": np.ones((_N_OBS, 1)), + } + dml_obj = IRM(obj_dml_data, ml_g=ml_g, ml_m=ml_m, weights=dict_weights) + dml_obj.draw_sample_splitting(n_folds=2, n_rep=1) # OK + msg = r"weights_bar must have shape" + with pytest.raises(ValueError, match=msg): + dml_obj.draw_sample_splitting(n_folds=2, n_rep=2) # ==================== sensitivity_analysis exceptions ==================== diff --git a/doubleml/tests/test_scalar_fit.py b/doubleml/tests/test_scalar_fit.py new file mode 100644 index 00000000..ceb0e76e --- /dev/null +++ b/doubleml/tests/test_scalar_fit.py @@ -0,0 +1,93 @@ +"""Test fit() argument handling on DoubleMLScalar (vehicle: PLR scalar).""" + +import warnings + +import numpy as np +import pytest +from sklearn.linear_model import LinearRegression + +from doubleml.plm.datasets import make_plr_CCDDHNR2018 +from doubleml.plm.plr_scalar import PLR + +N_OBS = 200 +N_FOLDS = 3 + + +def _build_unfitted_plr() -> PLR: + np.random.seed(3141) + dml_data = make_plr_CCDDHNR2018(n_obs=N_OBS, dim_x=10, alpha=0.5) + dml_obj = PLR(dml_data) + dml_obj.set_learners(ml_l=LinearRegression(), ml_m=LinearRegression()) + return dml_obj + + +@pytest.mark.ci +def test_fit_redraws_on_n_rep_mismatch(): + """fit(n_rep=...) 
re-draws splits and warns when n_rep differs from existing splits.""" + dml_obj = _build_unfitted_plr() + dml_obj.draw_sample_splitting(n_folds=N_FOLDS, n_rep=1) + msg = r"Re-drawing sample splitting" + with pytest.warns(UserWarning, match=msg): + dml_obj.fit(n_rep=3) + assert dml_obj.n_rep == 3 + assert dml_obj.n_folds == N_FOLDS # n_folds preserved + assert dml_obj.all_thetas.shape == (1, 3) + + +@pytest.mark.ci +def test_fit_redraws_on_n_folds_mismatch(): + """fit(n_folds=...) re-draws splits and warns when n_folds differs from existing splits.""" + dml_obj = _build_unfitted_plr() + dml_obj.draw_sample_splitting(n_folds=N_FOLDS, n_rep=2) + msg = r"Re-drawing sample splitting" + with pytest.warns(UserWarning, match=msg): + dml_obj.fit(n_folds=N_FOLDS + 2) + assert dml_obj.n_folds == N_FOLDS + 2 + assert dml_obj.n_rep == 2 # n_rep preserved + + +@pytest.mark.ci +def test_fit_no_warning_when_consistent(): + """fit(n_rep, n_folds) matching existing splits emits no UserWarning and keeps splits.""" + dml_obj = _build_unfitted_plr() + dml_obj.draw_sample_splitting(n_folds=N_FOLDS, n_rep=2) + original_smpls = dml_obj.smpls + with warnings.catch_warnings(): + warnings.simplefilter("error", UserWarning) + dml_obj.fit(n_folds=N_FOLDS, n_rep=2) + # smpls were not redrawn + assert dml_obj.smpls is original_smpls + + +@pytest.mark.ci +def test_fit_no_warning_when_args_omitted(): + """fit() with no args emits no UserWarning even when splits differ from defaults.""" + dml_obj = _build_unfitted_plr() + dml_obj.draw_sample_splitting(n_folds=N_FOLDS, n_rep=2) + original_smpls = dml_obj.smpls + with warnings.catch_warnings(): + warnings.simplefilter("error", UserWarning) + dml_obj.fit() + assert dml_obj.n_rep == 2 + assert dml_obj.n_folds == N_FOLDS + assert dml_obj.smpls is original_smpls + + +@pytest.mark.ci +def test_fit_draws_default_splits_when_none_set(): + """fit() without prior draw_sample_splitting() falls back to default n_folds=5, n_rep=1.""" + dml_obj = 
_build_unfitted_plr() + dml_obj.fit() + assert dml_obj.n_folds == 5 + assert dml_obj.n_rep == 1 + + +@pytest.mark.ci +def test_fit_draws_explicit_splits_when_none_set(): + """fit(n_folds, n_rep) without prior draw_sample_splitting() honors the args without warning.""" + dml_obj = _build_unfitted_plr() + with warnings.catch_warnings(): + warnings.simplefilter("error", UserWarning) + dml_obj.fit(n_folds=4, n_rep=2) + assert dml_obj.n_folds == 4 + assert dml_obj.n_rep == 2 From 10305aa7b68b8dee2a33845093c94e976d7fd8ae Mon Sep 17 00:00:00 2001 From: SvenKlaassen Date: Sat, 9 May 2026 10:25:19 +0200 Subject: [PATCH 26/38] feat: Add CATE and GATE methods to IRM and PLR scalar models - Implemented `cate()` and `gate()` methods in `IRM` and `PLR` classes for estimating conditional average treatment effects. - Enhanced `DoubleMLBLP` to support per-rep basis for multi-rep scenarios. - Updated tests for `IRM` and `PLR` to validate new functionality, including checks for correct handling of multi-rep bases and group effects. - Improved validation of basis inputs in `DoubleMLBLP` to accept both single DataFrame and list of DataFrames. - Added new test cases to ensure robustness of the new features and backward compatibility with legacy models. 
--- .claude/STATUS.md | 11 +- doubleml/irm/irm_scalar.py | 73 +++++++ .../irm/tests/test_irm_scalar_cate_gate.py | 205 ++++++++++++++++++ doubleml/plm/plr.py | 4 +- doubleml/plm/plr_scalar.py | 106 +++++++++ doubleml/plm/tests/test_plr.py | 28 +++ .../plm/tests/test_plr_scalar_cate_gate.py | 205 ++++++++++++++++++ doubleml/utils/blp.py | 63 ++++-- doubleml/utils/tests/test_blp.py | 76 ++++++- 9 files changed, 749 insertions(+), 22 deletions(-) create mode 100644 doubleml/irm/tests/test_irm_scalar_cate_gate.py create mode 100644 doubleml/plm/tests/test_plr_scalar_cate_gate.py diff --git a/.claude/STATUS.md b/.claude/STATUS.md index c3a73ecb..c358e91f 100644 --- a/.claude/STATUS.md +++ b/.claude/STATUS.md @@ -25,6 +25,9 @@ nuisance evaluation, and sensitivity analysis. - [x] **`DoubleMLPLRScalar`** — PLR scalar (`doubleml/plm/plr_scalar.py`) with all 7 test files: - `test_plr_scalar.py`, `_return_types`, `_exceptions`, `_vs_plr`, `_external_predictions`, `_tune_ml_models`, `_evaluate_learners`, `_sensitivity` - [x] **`DoubleMLIRMScalar`** — IRM scalar (`doubleml/irm/irm_scalar.py`) with all 7 test files (same structure) +- [x] **`cate()` + `gate()` for IRM scalar** — `doubleml/irm/irm_scalar.py` + `test_irm_scalar_cate_gate.py` +- [x] **`cate()` + `gate()` + `_partial_out()` for PLR scalar** — `doubleml/plm/plr_scalar.py` + `test_plr_scalar_cate_gate.py`. Multi-rep × multi-column basis fully supported. +- [x] **`DoubleMLBLP` per-rep basis API** — `basis` may be a single `pd.DataFrame` (shared) or a `list[pd.DataFrame]` of length `n_rep`. Also fixes the legacy `DoubleMLPLR.cate()` multi-rep bug (`basis * D_tilde` mis-broadcast for `n_rep>1` and `d_basis>1`). 
- [x] **`DoubleMLVector`** — multi-treatment base class first iteration (`doubleml/double_ml_vector.py`) - [x] **BLP multi-rep support** — `doubleml/utils/blp.py` @@ -38,9 +41,9 @@ Missing from `PLR` / `IRM` scalar compared to `DoubleMLPLR` / `DoubleMLIRM`: | Feature | Legacy location | Applies to | Notes | |---------|----------------|-----------|-------| -| `cate()` | `plr.py:447`, `irm.py:564` | both | Depends on BLP (multi-rep already done) | -| `gate()` | `plr.py:485`, `irm.py:598` | both | Delegates to `cate()` | -| `_partial_out()` | `plr.py:522` | PLR only | Helper needed by PLR `cate()`/`gate()` | +| `cate()` | `plr.py:447`, `irm.py:564` | — | ✅ ported for both IRM and PLR | +| `gate()` | `plr.py:485`, `irm.py:598` | — | ✅ ported for both IRM and PLR | +| `_partial_out()` | `plr.py:522` | — | ✅ ported for PLR scalar | | `policy_tree()` | `irm.py:635` | IRM only | Not planned yet | Weighted effects in IRM (`weights` dict form): @@ -55,8 +58,6 @@ Intentionally **not ported**: | Item | Files | Notes | |------|-------|-------| -| `cate()` + `gate()` for PLR scalar | `doubleml/plm/plr_scalar.py` | Needs `_partial_out()` first | -| `cate()` + `gate()` for IRM scalar | `doubleml/irm/irm_scalar.py` | | | `DoubleMLPLRVector` | `doubleml/plm/plr_vector.py` + tests | First concrete Vector subclass | | `DoubleMLPLIVScalar` | `doubleml/plm/pliv_scalar.py` + 7 test files | Next scalar model | | `DoubleMLPLPRScalar` | `doubleml/plm/plpr_scalar.py` + 7 test files | | diff --git a/doubleml/irm/irm_scalar.py b/doubleml/irm/irm_scalar.py index 58f94c95..ac44a346 100644 --- a/doubleml/irm/irm_scalar.py +++ b/doubleml/irm/irm_scalar.py @@ -4,9 +4,11 @@ from __future__ import annotations +import warnings from typing import Any, ClassVar, Self import numpy as np +import pandas as pd from sklearn.base import clone from sklearn.utils.multiclass import type_of_target @@ -15,6 +17,7 @@ from ..utils._checks import _check_binary_predictions, _check_finite_predictions, _check_score, 
_check_weights from ..utils._learner import LearnerSpec, predict_nuisance from ..utils._propensity_score import _propensity_score_adjustment +from ..utils.blp import DoubleMLBLP from ..utils.propensity_score_processing import PSProcessor, PSProcessorConfig @@ -357,6 +360,76 @@ def _get_score_elements(self) -> dict[str, np.ndarray]: return {"psi_a": psi_a, "psi_b": psi_b} + # ==================== Heterogeneous Effects ==================== + + def cate(self, basis: pd.DataFrame, is_gate: bool = False, **kwargs: Any) -> DoubleMLBLP: + """ + Calculate conditional average treatment effects (CATE) for a given basis. + + Parameters + ---------- + basis : :class:`pandas.DataFrame` + The basis for estimating the best linear predictor. Has to have the shape ``(n_obs, d)``, + where ``n_obs`` is the number of observations and ``d`` is the number of predictors. + is_gate : bool + Indicates whether the basis is constructed for GATEs (dummy-basis). + Default is ``False``. + **kwargs : dict + Additional keyword arguments passed to :meth:`statsmodels.regression.linear_model.OLS.fit`, + e.g. ``cov_type``. + + Returns + ------- + model : :class:`doubleml.DoubleMLBLP` + Best linear predictor model. + """ + if self.score != "ATE": + raise ValueError(f"Invalid score '{self.score}'. CATE is only implemented for score='ATE'.") + if self._predictions is None: + raise ValueError("CATE requires a fitted model. Call fit() first.") + + orth_signal = self._get_score_elements()["psi_b"] + + model = DoubleMLBLP(orth_signal, basis=basis, is_gate=is_gate) + model.fit(**kwargs) + return model + + def gate(self, groups: pd.DataFrame, **kwargs: Any) -> DoubleMLBLP: + """ + Calculate group average treatment effects (GATE) for mutually exclusive groups. + + Parameters + ---------- + groups : :class:`pandas.DataFrame` + The group indicator for estimating the best linear predictor. Groups should be mutually exclusive. 
+ Has to be dummy coded with shape ``(n_obs, d)``, where ``n_obs`` is the number of observations + and ``d`` is the number of groups, or ``(n_obs, 1)`` containing the corresponding groups (as str). + **kwargs : dict + Additional keyword arguments passed to :meth:`statsmodels.regression.linear_model.OLS.fit`, + e.g. ``cov_type``. + + Returns + ------- + model : :class:`doubleml.DoubleMLBLP` + Best linear predictor model for group effects. + """ + if not isinstance(groups, pd.DataFrame): + raise TypeError(f"Groups must be of DataFrame type. Groups of type {str(type(groups))} was passed.") + + if not all(groups.dtypes == bool) or all(groups.dtypes == int): + if groups.shape[1] == 1: + groups = pd.get_dummies(groups, prefix="Group", prefix_sep="_") + else: + raise TypeError( + "Columns of groups must be of bool type or int type (dummy coded). " + "Alternatively, groups should only contain one column." + ) + + if any(groups.sum(0) <= 5): + warnings.warn("At least one group effect is estimated with less than 6 observations.") + + return self.cate(groups, is_gate=True, **kwargs) + # ==================== Private Helpers ==================== @staticmethod diff --git a/doubleml/irm/tests/test_irm_scalar_cate_gate.py b/doubleml/irm/tests/test_irm_scalar_cate_gate.py new file mode 100644 index 00000000..dfa73924 --- /dev/null +++ b/doubleml/irm/tests/test_irm_scalar_cate_gate.py @@ -0,0 +1,205 @@ +"""Test cate() and gate() for the IRM scalar model.""" + +import numpy as np +import pandas as pd +import pytest +from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor + +import doubleml as dml +from doubleml.irm.datasets import make_irm_data +from doubleml.irm.irm_scalar import IRM +from doubleml.utils.blp import DoubleMLBLP + +N_OBS = 120 +N_FOLDS = 3 +BASIS_DIM = 5 + + +def _build_irm(n_rep: int, score: str = "ATE", random_state: int = 42) -> tuple[IRM, pd.DataFrame]: + """Build and fit an IRM scalar model with a random basis.""" + np.random.seed(random_state) 
+ data = make_irm_data(n_obs=N_OBS, dim_x=2, return_type="DoubleMLData") + + ml_g = RandomForestRegressor(n_estimators=10, random_state=random_state) + ml_m = RandomForestClassifier(n_estimators=10, random_state=random_state) + + model = IRM(data, score=score) + model.set_learners(ml_g=ml_g, ml_m=ml_m) + model.draw_sample_splitting(n_folds=N_FOLDS, n_rep=n_rep) + model.fit() + + basis = pd.DataFrame( + np.random.normal(0, 1, size=(N_OBS, BASIS_DIM)), + columns=[f"b{i}" for i in range(BASIS_DIM)], + ) + return model, basis + + +@pytest.fixture(scope="module") +def fitted_irm_single_rep() -> tuple[IRM, pd.DataFrame]: + return _build_irm(n_rep=1) + + +@pytest.fixture(scope="module") +def fitted_irm_multi_rep() -> tuple[IRM, pd.DataFrame]: + return _build_irm(n_rep=2) + + +@pytest.mark.ci +def test_cate_returns_blp(fitted_irm_single_rep): + """cate() returns a fitted DoubleMLBLP instance.""" + model, basis = fitted_irm_single_rep + cate = model.cate(basis) + assert isinstance(cate, DoubleMLBLP) + + +@pytest.mark.ci +def test_cate_confint_shape(fitted_irm_single_rep): + """cate().confint() returns a DataFrame with one row per basis column.""" + model, basis = fitted_irm_single_rep + cate = model.cate(basis) + ci = cate.confint() + assert isinstance(ci, pd.DataFrame) + assert ci.shape[0] == BASIS_DIM + + +@pytest.mark.ci +@pytest.mark.parametrize("cov_type", ["nonrobust", "HC1", "HC3"]) +def test_cate_cov_type_passthrough(fitted_irm_single_rep, cov_type): + """The cov_type kwarg propagates through to the underlying OLS fit.""" + model, basis = fitted_irm_single_rep + cate = model.cate(basis, cov_type=cov_type) + assert cate.blp_model[0].cov_type == cov_type + + +@pytest.mark.ci +def test_cate_multi_rep_n_rep(fitted_irm_multi_rep): + """cate.n_rep matches the model's n_rep.""" + model, basis = fitted_irm_multi_rep + cate = model.cate(basis) + assert cate.n_rep == 2 + assert isinstance(cate.blp_model, list) + assert len(cate.blp_model) == 2 + + +@pytest.mark.ci +def 
test_cate_multi_rep_shapes(fitted_irm_multi_rep): + """all_coef and all_se have shape (BASIS_DIM, n_rep) under multi-rep.""" + model, basis = fitted_irm_multi_rep + cate = model.cate(basis) + assert cate.all_coef.shape == (BASIS_DIM, 2) + assert cate.all_se.shape == (BASIS_DIM, 2) + assert isinstance(cate.confint(), pd.DataFrame) + assert isinstance(cate.summary, pd.DataFrame) + + +@pytest.mark.ci +def test_gate_dummy_coded(fitted_irm_single_rep): + """gate() accepts a pre-dummy-coded boolean DataFrame.""" + model, _ = fitted_irm_single_rep + x1 = model._dml_data.data["X1"] + groups = pd.DataFrame({"Group 1": x1 <= x1.median(), "Group 2": x1 > x1.median()}) + gate = model.gate(groups) + assert isinstance(gate, DoubleMLBLP) + assert all(gate.confint().index == groups.columns.to_list()) + + +@pytest.mark.ci +def test_gate_single_column_string(fitted_irm_single_rep): + """A single-column string DataFrame is auto-converted to dummies.""" + model, _ = fitted_irm_single_rep + np.random.seed(0) + groups = pd.DataFrame(np.random.choice(["A", "B"], N_OBS)) + gate = model.gate(groups) + assert isinstance(gate, DoubleMLBLP) + assert all(gate.confint().index == ["Group_A", "Group_B"]) + + +@pytest.mark.ci +def test_gate_warns_small_group(fitted_irm_single_rep): + """A group with <= 5 observations triggers a UserWarning.""" + model, _ = fitted_irm_single_rep + groups = pd.DataFrame( + { + "small": np.array([True] * 3 + [False] * (N_OBS - 3)), + "large": np.array([False] * 3 + [True] * (N_OBS - 3)), + } + ) + with pytest.warns(UserWarning, match=r"At least one group effect is estimated with less than 6 observations"): + model.gate(groups) + + +@pytest.mark.ci +def test_cate_exception_atte(): + """CATE on an ATTE model raises ValueError.""" + model, basis = _build_irm(n_rep=1, score="ATTE") + with pytest.raises(ValueError, match=r"only implemented for score='ATE'"): + model.cate(basis) + + +@pytest.mark.ci +def test_cate_exception_before_fit(): + """Calling cate() before fit() 
raises ValueError.""" + np.random.seed(42) + data = make_irm_data(n_obs=N_OBS, dim_x=2, return_type="DoubleMLData") + model = IRM(data, score="ATE") + model.set_learners(ml_g=RandomForestRegressor(n_estimators=10), ml_m=RandomForestClassifier(n_estimators=10)) + basis = pd.DataFrame(np.random.normal(0, 1, size=(N_OBS, BASIS_DIM))) + with pytest.raises(ValueError, match=r"requires a fitted model"): + model.cate(basis) + + +@pytest.mark.ci +def test_gate_exception_not_dataframe(fitted_irm_single_rep): + """gate() with a non-DataFrame raises TypeError.""" + model, _ = fitted_irm_single_rep + with pytest.raises(TypeError, match=r"DataFrame type"): + model.gate(np.zeros((N_OBS, 2))) + + +@pytest.mark.ci +def test_gate_exception_bad_dtype(fitted_irm_single_rep): + """gate() with multi-column non-bool/int data raises TypeError.""" + model, _ = fitted_irm_single_rep + groups = pd.DataFrame( + { + "g1": np.random.normal(0, 1, N_OBS), + "g2": np.random.normal(0, 1, N_OBS), + } + ) + with pytest.raises(TypeError, match=r"bool type or int type"): + model.gate(groups) + + +@pytest.mark.ci +def test_cate_vs_legacy(): + """CATE coefficients from the new IRM match the legacy DoubleMLIRM.""" + n_obs = 200 + np.random.seed(42) + data = make_irm_data(n_obs=n_obs, dim_x=5, return_type="DoubleMLData") + + ml_g = RandomForestRegressor(n_estimators=10, max_depth=3, random_state=42) + ml_m = RandomForestClassifier(n_estimators=10, max_depth=3, random_state=42) + + np.random.seed(3141) + dml_old = dml.DoubleMLIRM(data, ml_g, ml_m, n_folds=N_FOLDS, n_rep=1, score="ATE") + dml_old.fit() + + dml_new = IRM(data, score="ATE") + dml_new.set_learners(ml_g=ml_g, ml_m=ml_m) + dml_new._n_folds = N_FOLDS + dml_new._n_rep = 1 + dml_new._smpls = dml_old.smpls + dml_new.fit() + + np.random.seed(0) + basis = pd.DataFrame( + np.random.normal(0, 1, size=(n_obs, BASIS_DIM)), + columns=[f"b{i}" for i in range(BASIS_DIM)], + ) + + cate_old = dml_old.cate(basis) + cate_new = dml_new.cate(basis) + + 
np.testing.assert_allclose(cate_new.coef, cate_old.coef, rtol=1e-9) + np.testing.assert_allclose(cate_new.se, cate_old.se, rtol=1e-9) diff --git a/doubleml/plm/plr.py b/doubleml/plm/plr.py index ed96bf84..825ec845 100644 --- a/doubleml/plm/plr.py +++ b/doubleml/plm/plr.py @@ -473,10 +473,10 @@ def cate(self, basis, is_gate=False, **kwargs): Y_tilde, D_tilde = self._partial_out() - D_basis = basis * D_tilde + basis_per_rep = [basis.multiply(D_tilde[:, i_rep], axis=0) for i_rep in range(self.n_rep)] model = DoubleMLBLP( orth_signal=Y_tilde, - basis=D_basis, + basis=basis_per_rep, is_gate=is_gate, ) model.fit(**kwargs) diff --git a/doubleml/plm/plr_scalar.py b/doubleml/plm/plr_scalar.py index 9d2da5eb..4451e267 100644 --- a/doubleml/plm/plr_scalar.py +++ b/doubleml/plm/plr_scalar.py @@ -8,6 +8,7 @@ from typing import Any, ClassVar, Self import numpy as np +import pandas as pd from sklearn.base import clone from sklearn.model_selection import cross_val_predict @@ -15,6 +16,7 @@ from ..double_ml_linear_score import LinearScoreMixin from ..utils._checks import _check_binary_predictions, _check_finite_predictions, _check_is_propensity from ..utils._learner import LearnerSpec, predict_nuisance +from ..utils.blp import DoubleMLBLP class PLR(LinearScoreMixin): @@ -393,6 +395,110 @@ def _get_score_elements(self) -> dict[str, np.ndarray]: return {"psi_a": psi_a, "psi_b": psi_b} + # ==================== Heterogeneous Effects ==================== + + def _partial_out(self) -> tuple[np.ndarray, np.ndarray]: + """ + Return partialled-out residuals (Y_tilde, D_tilde), each of shape (n_obs, n_rep). + + For score ``'partialling out'``: ``Y_tilde = y - ml_l``, ``D_tilde = d - ml_m``. + For score ``'IV-type'``: ``Y_tilde = y - theta * ml_m - ml_g`` and + ``D_tilde = d - ml_m`` where ``theta = self.coef[0]`` (aggregated across reps, + matching the legacy DoubleMLPLR behavior). 
+ + Returns + ------- + Y_tilde, D_tilde : tuple[np.ndarray, np.ndarray] + Outcome and treatment residuals, each of shape ``(n_obs, n_rep)``. + """ + if self._predictions is None: + raise ValueError("predictions are None. Call fit() first.") + + y = self._dml_data.y[:, np.newaxis] + d = self._dml_data.d[:, np.newaxis] + m_hat = self._predictions["ml_m"] + + if self.score == "partialling out": + Y_tilde = y - self._predictions["ml_l"] + D_tilde = d - m_hat + else: # "IV-type" + Y_tilde = y - self.coef[0] * m_hat - self._predictions["ml_g"] + D_tilde = d - m_hat + return Y_tilde, D_tilde + + def cate(self, basis: pd.DataFrame, is_gate: bool = False, **kwargs: Any) -> DoubleMLBLP: + """ + Calculate conditional average treatment effects (CATE) for a given basis. + + Builds one ``basis * D_tilde[:, i_rep]`` DataFrame per repetition, fits per-rep + OLS via :class:`DoubleMLBLP`, and aggregates coefficients across repetitions. + + Parameters + ---------- + basis : :class:`pandas.DataFrame` + The basis for estimating the best linear predictor. Has to have shape + ``(n_obs, d)``. + is_gate : bool + Indicates whether the basis is constructed for GATEs (dummy basis). + Default is ``False``. + **kwargs : dict + Additional keyword arguments passed to + :meth:`statsmodels.regression.linear_model.OLS.fit`, e.g. ``cov_type``. + + Returns + ------- + model : :class:`doubleml.DoubleMLBLP` + Best linear predictor model. + """ + if self._dml_data.n_treat > 1: + raise NotImplementedError( + f"Only implemented for single treatment. Number of treatments is {self._dml_data.n_treat}." + ) + if self._predictions is None: + raise ValueError("CATE requires a fitted model. 
Call fit() first.") + + Y_tilde, D_tilde = self._partial_out() + basis_per_rep = [basis.multiply(D_tilde[:, i_rep], axis=0) for i_rep in range(self.n_rep)] + + model = DoubleMLBLP(orth_signal=Y_tilde, basis=basis_per_rep, is_gate=is_gate) + model.fit(**kwargs) + return model + + def gate(self, groups: pd.DataFrame, **kwargs: Any) -> DoubleMLBLP: + """ + Calculate group average treatment effects (GATE) for mutually exclusive groups. + + Parameters + ---------- + groups : :class:`pandas.DataFrame` + The group indicator. Either dummy-coded with shape ``(n_obs, d)`` (one column + per group) or ``(n_obs, 1)`` containing the group labels (as str). + **kwargs : dict + Additional keyword arguments passed to + :meth:`statsmodels.regression.linear_model.OLS.fit`, e.g. ``cov_type``. + + Returns + ------- + model : :class:`doubleml.DoubleMLBLP` + Best linear predictor model for group effects. + """ + if not isinstance(groups, pd.DataFrame): + raise TypeError(f"Groups must be of DataFrame type. Groups of type {str(type(groups))} was passed.") + + if not all(groups.dtypes == bool) or all(groups.dtypes == int): + if groups.shape[1] == 1: + groups = pd.get_dummies(groups, prefix="Group", prefix_sep="_") + else: + raise TypeError( + "Columns of groups must be of bool type or int type (dummy coded). " + "Alternatively, groups should only contain one column." + ) + + if any(groups.sum(0) <= 5): + warnings.warn("At least one group effect is estimated with less than 6 observations.") + + return self.cate(groups, is_gate=True, **kwargs) + def _sensitivity_element_est(self) -> dict[str, np.ndarray] | None: """ Compute PLR sensitivity elements vectorized over all repetitions. 
diff --git a/doubleml/plm/tests/test_plr.py b/doubleml/plm/tests/test_plr.py index 67e396c5..62461657 100644 --- a/doubleml/plm/tests/test_plr.py +++ b/doubleml/plm/tests/test_plr.py @@ -379,3 +379,31 @@ def test_dml_plr_cate_gate_multiple_rep(score, cov_type): assert gate.all_se.shape == (groups.shape[1], 2) assert isinstance(gate.confint(), pd.DataFrame) assert all(gate.confint().index == groups.columns.tolist()) + + +@pytest.mark.ci +def test_dml_plr_cate_multi_rep_per_rep_correctness(): + """For n_rep>1 with a multi-column basis, the per-rep BLP fit must use that rep's + own D_tilde residuals (not the global broadcasting that the previous expression + produced). Verify by comparing against a manual sm.OLS fit on rep 0.""" + import statsmodels.api as sm + + n = 150 + np.random.seed(42) + obj_dml_data = dml.plm.datasets.make_plr_CCDDHNR2018(n_obs=n) + ml_l = LinearRegression() + ml_m = LinearRegression() + dml_plr_obj = dml.DoubleMLPLR(obj_dml_data, ml_l=ml_l, ml_m=ml_m, n_folds=3, n_rep=3, score="partialling out") + dml_plr_obj.fit() + + np.random.seed(7) + basis = pd.DataFrame(np.random.normal(0, 1, size=(n, 4)), columns=[f"b{i}" for i in range(4)]) + cate = dml_plr_obj.cate(basis) + + # Manually replicate the per-rep BLP for rep 0 + Y_tilde, D_tilde = dml_plr_obj._partial_out() + manual_basis_0 = basis.multiply(D_tilde[:, 0], axis=0) + manual_blp_0 = sm.OLS(Y_tilde[:, 0], manual_basis_0).fit(cov_type="HC0") + + np.testing.assert_allclose(cate.all_coef[:, 0], manual_blp_0.params, rtol=1e-12) + np.testing.assert_allclose(cate.all_se[:, 0], manual_blp_0.bse, rtol=1e-12) diff --git a/doubleml/plm/tests/test_plr_scalar_cate_gate.py b/doubleml/plm/tests/test_plr_scalar_cate_gate.py new file mode 100644 index 00000000..7be0f84b --- /dev/null +++ b/doubleml/plm/tests/test_plr_scalar_cate_gate.py @@ -0,0 +1,205 @@ +"""Test cate() and gate() for the PLR scalar model.""" + +import numpy as np +import pandas as pd +import pytest +from sklearn.linear_model import Lasso + 
+import doubleml as dml +from doubleml.plm.datasets import make_plr_CCDDHNR2018 +from doubleml.plm.plr_scalar import PLR +from doubleml.utils.blp import DoubleMLBLP + +N_OBS = 200 +N_FOLDS = 3 +BASIS_DIM = 4 + + +def _build_plr(n_rep: int, score: str = "partialling out", random_state: int = 42) -> tuple[PLR, pd.DataFrame]: + """Build and fit a PLR scalar model with a random basis.""" + np.random.seed(random_state) + data = make_plr_CCDDHNR2018(n_obs=N_OBS, dim_x=5, alpha=0.5, return_type="DoubleMLData") + + ml_l = Lasso(alpha=0.1) + ml_m = Lasso(alpha=0.1) + ml_g = Lasso(alpha=0.1) + + model = PLR(data, score=score) + model.set_learners(ml_l=ml_l, ml_m=ml_m, ml_g=ml_g) + model.draw_sample_splitting(n_folds=N_FOLDS, n_rep=n_rep) + model.fit() + + basis = pd.DataFrame( + np.random.normal(0, 1, size=(N_OBS, BASIS_DIM)), + columns=[f"b{i}" for i in range(BASIS_DIM)], + ) + return model, basis + + +@pytest.fixture(scope="module", params=["partialling out", "IV-type"]) +def score(request): + return request.param + + +@pytest.fixture(scope="module") +def fitted_plr_single_rep(score) -> tuple[PLR, pd.DataFrame]: + return _build_plr(n_rep=1, score=score) + + +@pytest.fixture(scope="module") +def fitted_plr_multi_rep(score) -> tuple[PLR, pd.DataFrame]: + return _build_plr(n_rep=2, score=score) + + +@pytest.mark.ci +def test_cate_returns_blp(fitted_plr_single_rep): + """cate() returns a fitted DoubleMLBLP instance.""" + model, basis = fitted_plr_single_rep + cate = model.cate(basis) + assert isinstance(cate, DoubleMLBLP) + + +@pytest.mark.ci +def test_cate_confint_shape(fitted_plr_single_rep): + """cate().confint() returns a DataFrame with one row per basis column.""" + model, basis = fitted_plr_single_rep + cate = model.cate(basis) + ci = cate.confint() + assert isinstance(ci, pd.DataFrame) + assert ci.shape[0] == BASIS_DIM + + +@pytest.mark.ci +@pytest.mark.parametrize("cov_type", ["nonrobust", "HC1", "HC3"]) +def test_cate_cov_type_passthrough(fitted_plr_single_rep, 
cov_type): + """The cov_type kwarg propagates through to the underlying OLS fit.""" + model, basis = fitted_plr_single_rep + cate = model.cate(basis, cov_type=cov_type) + assert cate.blp_model[0].cov_type == cov_type + + +@pytest.mark.ci +def test_cate_multi_rep_n_rep(fitted_plr_multi_rep): + """cate.n_rep matches the model's n_rep under multi-rep.""" + model, basis = fitted_plr_multi_rep + cate = model.cate(basis) + assert cate.n_rep == 2 + assert isinstance(cate.blp_model, list) + assert len(cate.blp_model) == 2 + + +@pytest.mark.ci +def test_cate_multi_rep_shapes(fitted_plr_multi_rep): + """all_coef and all_se have shape (BASIS_DIM, n_rep) under multi-rep.""" + model, basis = fitted_plr_multi_rep + cate = model.cate(basis) + assert cate.all_coef.shape == (BASIS_DIM, 2) + assert cate.all_se.shape == (BASIS_DIM, 2) + assert isinstance(cate.confint(), pd.DataFrame) + assert isinstance(cate.summary, pd.DataFrame) + + +@pytest.mark.ci +def test_gate_dummy_coded(fitted_plr_single_rep): + """gate() accepts a pre-dummy-coded boolean DataFrame.""" + model, _ = fitted_plr_single_rep + x1 = model._dml_data.x[:, 0] + groups = pd.DataFrame({"low": x1 <= np.median(x1), "high": x1 > np.median(x1)}) + gate = model.gate(groups) + assert isinstance(gate, DoubleMLBLP) + assert all(gate.confint().index == groups.columns.to_list()) + + +@pytest.mark.ci +def test_gate_single_column_string(fitted_plr_single_rep): + """A single-column string DataFrame is auto-converted to dummies.""" + model, _ = fitted_plr_single_rep + np.random.seed(0) + groups = pd.DataFrame(np.random.choice(["A", "B"], N_OBS)) + gate = model.gate(groups) + assert isinstance(gate, DoubleMLBLP) + assert all(gate.confint().index == ["Group_A", "Group_B"]) + + +@pytest.mark.ci +def test_gate_warns_small_group(fitted_plr_single_rep): + """A group with <= 5 observations triggers a UserWarning.""" + model, _ = fitted_plr_single_rep + groups = pd.DataFrame( + { + "small": np.array([True] * 3 + [False] * (N_OBS - 3)), + 
"large": np.array([False] * 3 + [True] * (N_OBS - 3)), + } + ) + with pytest.warns(UserWarning, match=r"At least one group effect is estimated with less than 6 observations"): + model.gate(groups) + + +@pytest.mark.ci +def test_cate_exception_before_fit(): + """Calling cate() before fit() raises ValueError.""" + np.random.seed(42) + data = make_plr_CCDDHNR2018(n_obs=N_OBS, dim_x=5, alpha=0.5, return_type="DoubleMLData") + model = PLR(data, score="partialling out") + model.set_learners(ml_l=Lasso(alpha=0.1), ml_m=Lasso(alpha=0.1)) + basis = pd.DataFrame(np.random.normal(0, 1, size=(N_OBS, BASIS_DIM))) + with pytest.raises(ValueError, match=r"requires a fitted model"): + model.cate(basis) + + +@pytest.mark.ci +def test_gate_exception_not_dataframe(fitted_plr_single_rep): + """gate() with a non-DataFrame raises TypeError.""" + model, _ = fitted_plr_single_rep + with pytest.raises(TypeError, match=r"DataFrame type"): + model.gate(np.zeros((N_OBS, 2))) + + +@pytest.mark.ci +def test_gate_exception_bad_dtype(fitted_plr_single_rep): + """gate() with multi-column non-bool/int data raises TypeError.""" + model, _ = fitted_plr_single_rep + groups = pd.DataFrame( + { + "g1": np.random.normal(0, 1, N_OBS), + "g2": np.random.normal(0, 1, N_OBS), + } + ) + with pytest.raises(TypeError, match=r"bool type or int type"): + model.gate(groups) + + +@pytest.mark.ci +@pytest.mark.parametrize("score", ["partialling out", "IV-type"]) +@pytest.mark.parametrize("n_rep", [1, 2]) +def test_cate_vs_legacy(score, n_rep): + """CATE coefficients from the new PLR match the legacy DoubleMLPLR.""" + np.random.seed(42) + data = make_plr_CCDDHNR2018(n_obs=N_OBS, dim_x=5, alpha=0.5, return_type="DoubleMLData") + + ml_l = Lasso(alpha=0.1) + ml_m = Lasso(alpha=0.1) + ml_g = Lasso(alpha=0.1) + + np.random.seed(3141) + dml_old = dml.DoubleMLPLR(data, ml_l, ml_m, ml_g, n_folds=N_FOLDS, n_rep=n_rep, score=score) + dml_old.fit() + + dml_new = PLR(data, score=score) + dml_new.set_learners(ml_l=ml_l, 
ml_m=ml_m, ml_g=ml_g) + dml_new._n_folds = N_FOLDS + dml_new._n_rep = n_rep + dml_new._smpls = dml_old.smpls + dml_new.fit() + + np.random.seed(0) + basis = pd.DataFrame( + np.random.normal(0, 1, size=(N_OBS, BASIS_DIM)), + columns=[f"b{i}" for i in range(BASIS_DIM)], + ) + + cate_old = dml_old.cate(basis) + cate_new = dml_new.cate(basis) + + np.testing.assert_allclose(cate_new.coef, cate_old.coef, rtol=1e-9) + np.testing.assert_allclose(cate_new.se, cate_old.se, rtol=1e-9) diff --git a/doubleml/utils/blp.py b/doubleml/utils/blp.py index c0b11e18..c5e59d7e 100644 --- a/doubleml/utils/blp.py +++ b/doubleml/utils/blp.py @@ -19,9 +19,12 @@ class DoubleMLBLP: The orthogonal signal to be predicted. Has to be of shape ``(n_obs,)`` or ``(n_obs, n_rep)``, where ``n_obs`` is the number of observations and ``n_rep`` is the number of repetitions. - basis : :class:`pandas.DataFrame` - The basis for estimating the best linear predictor. Has to have the shape ``(n_obs, d)``, - where ``n_obs`` is the number of observations and ``d`` is the number of predictors. + basis : :class:`pandas.DataFrame` or list of :class:`pandas.DataFrame` + The basis for estimating the best linear predictor. Either a single DataFrame of shape + ``(n_obs, d)`` (shared across all repetitions) or a list of DataFrames of length ``n_rep`` + (one basis per repetition, e.g. for PLR CATE where the basis is multiplied by per-rep + residuals). When a list is passed, every entry must have the same column names so per-rep + coefficients can be aggregated. is_gate : bool Indicates whether the basis is constructed for GATEs (dummy-basis). @@ -44,16 +47,8 @@ def __init__(self, orth_signal, basis, is_gate=False): self._n_rep = self._orth_signal.shape[1] self._is_gate = is_gate - if not isinstance(basis, pd.DataFrame): - raise TypeError(f"The basis must be of DataFrame type. 
Basis of type {str(type(basis))} was passed.") - if not basis.columns.is_unique: - raise ValueError("Invalid pd.DataFrame: Contains duplicate column names.") - if self._orth_signal.shape[0] != basis.shape[0]: - raise ValueError( - "The number of observations in signal and basis does not match. " - f"Got {str(self._orth_signal.shape[0])} and {str(basis.shape[0])}." - ) - self._basis = basis + self._basis_list = self._validate_basis(basis, self._orth_signal.shape[0], self._n_rep) + self._basis = self._basis_list[0] # initialize the score and the covariance self._blp_model = None @@ -63,6 +58,46 @@ def __init__(self, orth_signal, basis, is_gate=False): self._coef = None self._se = None + @staticmethod + def _validate_basis(basis, n_obs, n_rep): + """Validate ``basis`` and return a list of length ``n_rep``. + + ``basis`` may be a single ``pd.DataFrame`` (shared across reps) or a list of + ``pd.DataFrame`` of length ``n_rep``. Per-rep DataFrames must share column names + so coefficients are comparable for aggregation. + """ + if isinstance(basis, pd.DataFrame): + basis_list = [basis] * n_rep + elif isinstance(basis, list): + if len(basis) != n_rep: + raise ValueError(f"When basis is a list it must have length n_rep={n_rep}. Got length {len(basis)}.") + if not all(isinstance(b, pd.DataFrame) for b in basis): + raise TypeError("All entries of basis list must be of DataFrame type.") + ref_cols = list(basis[0].columns) + for i, b in enumerate(basis[1:], start=1): + if list(b.columns) != ref_cols: + raise ValueError( + f"All per-rep bases must have the same column names. " + f"Entry 0 columns: {ref_cols}, entry {i} columns: {list(b.columns)}." + ) + basis_list = basis + else: + raise TypeError( + f"The basis must be of DataFrame type or a list of DataFrames. " + f"Basis of type {str(type(basis))} was passed." 
+ ) + + if not basis_list[0].columns.is_unique: + raise ValueError("Invalid pd.DataFrame: Contains duplicate column names.") + + for i, b in enumerate(basis_list): + if b.shape[0] != n_obs: + raise ValueError( + "The number of observations in signal and basis does not match. " + f"Got {n_obs} and {b.shape[0]}" + (f" (basis entry {i})." if len(basis_list) > 1 else ".") + ) + return basis_list + def __str__(self): class_name = self.__class__.__name__ header = f"================== {class_name} Object ==================\n" @@ -188,7 +223,7 @@ def fit(self, cov_type="HC0", **kwargs): self._blp_model = [] for i_rep in range(self.n_rep): - blp_model = sm.OLS(self._orth_signal[:, i_rep], self._basis).fit(cov_type=cov_type, **kwargs) + blp_model = sm.OLS(self._orth_signal[:, i_rep], self._basis_list[i_rep]).fit(cov_type=cov_type, **kwargs) self._blp_model.append(blp_model) self._all_coef[:, i_rep] = np.asarray(blp_model.params) self._all_se[:, i_rep] = np.asarray(blp_model.bse) diff --git a/doubleml/utils/tests/test_blp.py b/doubleml/utils/tests/test_blp.py index e05f850e..a89e2b0c 100644 --- a/doubleml/utils/tests/test_blp.py +++ b/doubleml/utils/tests/test_blp.py @@ -160,7 +160,7 @@ def test_doubleml_exception_blp(): msg = "The signal must be one- or two-dimensional. Signal of dimensions 3 was passed." with pytest.raises(ValueError, match=msg): dml.DoubleMLBLP(orth_signal=np.array([[[1]], [[2]]]), basis=random_basis) - msg = "The basis must be of DataFrame type. Basis of type <class 'int'> was passed." + msg = r"The basis must be of DataFrame type or a list of DataFrames. Basis of type <class 'int'> was passed." with pytest.raises(TypeError, match=msg): dml.DoubleMLBLP(orth_signal=signal, basis=1) msg = "The number of observations in signal and basis does not match. Got 3 and 2." @@ -200,3 +200,77 @@ def test_doubleml_exception_blp(): msg = "Invalid basis: DataFrame has to have the exact same number and ordering of columns."
with pytest.raises(ValueError, match=msg): dml_blp_confint.confint(basis=pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6]]), columns=["x_1", "x_2", "x_3"])) + + +@pytest.mark.ci +def test_blp_per_rep_basis_fits(): + """A list-of-DataFrames basis fits and exposes per-rep coefficient shapes.""" + n, d, n_rep = 50, 3, 3 + np.random.seed(0) + signal = np.random.normal(0, 1, size=(n, n_rep)) + cols = [f"b{i}" for i in range(d)] + basis_list = [pd.DataFrame(np.random.normal(0, 1, size=(n, d)), columns=cols) for _ in range(n_rep)] + + blp = dml.DoubleMLBLP(signal, basis_list).fit() + assert blp.all_coef.shape == (d, n_rep) + assert blp.all_se.shape == (d, n_rep) + assert blp.coef.shape == (d,) + assert blp.se.shape == (d,) + + +@pytest.mark.ci +def test_blp_per_rep_basis_matches_shared(): + """Per-rep list of identical bases yields the same fit as the shared-basis call.""" + n, d, n_rep = 50, 3, 3 + np.random.seed(1) + signal = np.random.normal(0, 1, size=(n, n_rep)) + basis = pd.DataFrame(np.random.normal(0, 1, size=(n, d)), columns=[f"b{i}" for i in range(d)]) + + blp_shared = dml.DoubleMLBLP(signal, basis).fit() + blp_list = dml.DoubleMLBLP(signal, [basis] * n_rep).fit() + + np.testing.assert_allclose(blp_list.all_coef, blp_shared.all_coef, rtol=1e-12) + np.testing.assert_allclose(blp_list.all_se, blp_shared.all_se, rtol=1e-12) + np.testing.assert_allclose(blp_list.coef, blp_shared.coef, rtol=1e-12) + + +@pytest.mark.ci +def test_blp_per_rep_basis_wrong_length(): + """Wrong list length raises ValueError.""" + n, n_rep = 30, 3 + signal = np.zeros((n, n_rep)) + basis = pd.DataFrame(np.zeros((n, 2)), columns=["a", "b"]) + with pytest.raises(ValueError, match=r"length n_rep=3"): + dml.DoubleMLBLP(signal, [basis, basis]) + + +@pytest.mark.ci +def test_blp_per_rep_basis_mismatched_columns(): + """Per-rep bases with different column names raise ValueError.""" + n, n_rep = 30, 2 + signal = np.zeros((n, n_rep)) + basis_a = pd.DataFrame(np.zeros((n, 2)), columns=["a", "b"]) + basis_b 
= pd.DataFrame(np.zeros((n, 2)), columns=["a", "c"]) + with pytest.raises(ValueError, match=r"same column names"): + dml.DoubleMLBLP(signal, [basis_a, basis_b]) + + +@pytest.mark.ci +def test_blp_per_rep_basis_mismatched_n_obs(): + """Per-rep basis with wrong row count raises ValueError.""" + n, n_rep = 30, 2 + signal = np.zeros((n, n_rep)) + basis_ok = pd.DataFrame(np.zeros((n, 2)), columns=["a", "b"]) + basis_bad = pd.DataFrame(np.zeros((n - 1, 2)), columns=["a", "b"]) + with pytest.raises(ValueError, match=r"basis entry 1"): + dml.DoubleMLBLP(signal, [basis_ok, basis_bad]) + + +@pytest.mark.ci +def test_blp_per_rep_basis_non_dataframe_entry(): + """A non-DataFrame entry in the list raises TypeError.""" + n, n_rep = 30, 2 + signal = np.zeros((n, n_rep)) + basis = pd.DataFrame(np.zeros((n, 2)), columns=["a", "b"]) + with pytest.raises(TypeError, match=r"All entries of basis list must be of DataFrame type"): + dml.DoubleMLBLP(signal, [basis, np.zeros((n, 2))]) From 71ef4838d1dab85266bfc33ac869f588dce8bdef Mon Sep 17 00:00:00 2001 From: SvenKlaassen Date: Sat, 9 May 2026 14:46:58 +0200 Subject: [PATCH 27/38] feat: Implement PLRVector for multi-treatment partially linear regression and add comprehensive tests --- .claude/STATUS.md | 5 +- doubleml/plm/plr_vector.py | 139 +++++++++++ doubleml/plm/tests/test_plr_vector.py | 52 ++++ .../plm/tests/test_plr_vector_exceptions.py | 160 +++++++++++++ .../test_plr_vector_external_predictions.py | 83 +++++++ .../plm/tests/test_plr_vector_return_types.py | 222 ++++++++++++++++++ doubleml/plm/tests/test_plr_vector_vs_plr.py | 122 ++++++++++ 7 files changed, 781 insertions(+), 2 deletions(-) create mode 100644 doubleml/plm/plr_vector.py create mode 100644 doubleml/plm/tests/test_plr_vector.py create mode 100644 doubleml/plm/tests/test_plr_vector_exceptions.py create mode 100644 doubleml/plm/tests/test_plr_vector_external_predictions.py create mode 100644 doubleml/plm/tests/test_plr_vector_return_types.py create mode 100644 
doubleml/plm/tests/test_plr_vector_vs_plr.py diff --git a/.claude/STATUS.md b/.claude/STATUS.md index c358e91f..953b2543 100644 --- a/.claude/STATUS.md +++ b/.claude/STATUS.md @@ -30,10 +30,11 @@ nuisance evaluation, and sensitivity analysis. - [x] **`DoubleMLBLP` per-rep basis API** — `basis` may be a single `pd.DataFrame` (shared) or a `list[pd.DataFrame]` of length `n_rep`. Also fixes the legacy `DoubleMLPLR.cate()` multi-rep bug (`basis * D_tilde` mis-broadcast for `n_rep>1` and `d_basis>1`). - [x] **`DoubleMLVector`** — multi-treatment base class first iteration (`doubleml/double_ml_vector.py`) - [x] **BLP multi-rep support** — `doubleml/utils/blp.py` +- [x] **`PLRVector`** — first concrete `DoubleMLVector` subclass (`doubleml/plm/plr_vector.py`) with 5 test files: `test_plr_vector.py`, `_return_types`, `_exceptions`, `_vs_plr`, `_external_predictions`. Validates exact equivalence with legacy `DoubleMLPLR` for multi-treatment. ### In Progress -- [ ] **`DoubleMLVector`** — base class exists; no concrete subclass yet +_(none)_ ### Feature Gaps vs Legacy Classes @@ -58,7 +59,7 @@ Intentionally **not ported**: | Item | Files | Notes | |------|-------|-------| -| `DoubleMLPLRVector` | `doubleml/plm/plr_vector.py` + tests | First concrete Vector subclass | +| `DoubleMLIRMVector` | `doubleml/irm/irm_vector.py` + tests | Next concrete Vector subclass | | `DoubleMLPLIVScalar` | `doubleml/plm/pliv_scalar.py` + 7 test files | Next scalar model | | `DoubleMLPLPRScalar` | `doubleml/plm/plpr_scalar.py` + 7 test files | | | DID scalar variants | `doubleml/did/*_scalar.py` | DID, DIDCSBinary, DIDMulti | diff --git a/doubleml/plm/plr_vector.py b/doubleml/plm/plr_vector.py new file mode 100644 index 00000000..6a3621bb --- /dev/null +++ b/doubleml/plm/plr_vector.py @@ -0,0 +1,139 @@ +"""Partially Linear Regression (PLR) multi-treatment model based on the DoubleMLVector hierarchy.""" + +from __future__ import annotations + +from typing import Any, Self + +from ..data.base_data 
import DoubleMLData +from ..double_ml_scalar import DoubleMLScalar +from ..double_ml_vector import DoubleMLVector +from .plr_scalar import PLR + + +class PLRVector(DoubleMLVector): + """Multi-treatment double machine learning for partially linear regression models. + + Orchestrates one :class:`~doubleml.plm.plr_scalar.PLR` instance per treatment column + in ``d_cols``. Sample splits are drawn once and shared across all sub-models; + learners are propagated (and cloned per sub-model) via :meth:`set_learners`. + The scalar :class:`~doubleml.DoubleMLFramework` objects are concatenated into a + single multi-treatment framework after fit. + + Parameters + ---------- + obj_dml_data : DoubleMLData + The data object providing the data and specifying the variables for the causal + model. May contain multiple treatment columns in ``d_cols``. + score : str + The score function (``'partialling out'`` or ``'IV-type'``). + Default is ``'partialling out'``. + ml_l : estimator, optional + Learner for E[Y|X]. Can be regressor or classifier. + ml_m : estimator, optional + Learner for E[D|X]. Can be regressor or classifier. + ml_g : estimator, optional + Learner for E[Y - D*theta|X]. Only for IV-type. Must be regressor. + """ + + def __init__( + self, + obj_dml_data: DoubleMLData, + score: str = "partialling out", + ml_l: object | None = None, + ml_m: object | None = None, + ml_g: object | None = None, + ) -> None: + # Validate at the vector level so the error fires before sub-model construction. + self._check_data(obj_dml_data) + valid_scores = ["partialling out", "IV-type"] + if score not in valid_scores: + raise ValueError(f"Invalid score '{score}'. 
Valid scores: {valid_scores}.") + if score == "IV-type" and obj_dml_data.binary_outcome: + raise ValueError("For score = 'IV-type', additive probability models (binary outcomes) are not supported.") + + super().__init__(obj_dml_data=obj_dml_data, score=score) + self._modellist = self._initialize_models() + + if any(learner is not None for learner in (ml_l, ml_m, ml_g)): + self.set_learners(ml_l=ml_l, ml_m=ml_m, ml_g=ml_g) + + @staticmethod + def _check_data(obj_dml_data: Any) -> None: + """Validate the data object for PLR vector estimation. + + Parameters + ---------- + obj_dml_data : Any + Data candidate. Must be a :class:`~doubleml.data.DoubleMLData` without + instrumental variables. + + Raises + ------ + TypeError + If ``obj_dml_data`` is not a :class:`~doubleml.data.DoubleMLData`. + ValueError + If ``obj_dml_data`` defines instrumental variables (``z_cols``). + """ + if not isinstance(obj_dml_data, DoubleMLData): + raise TypeError( + f"The data must be of DoubleMLData type. {str(obj_dml_data)} of type {str(type(obj_dml_data))} was passed." + ) + if obj_dml_data.z_cols is not None: + raise ValueError( + "Incompatible data. " + " and ".join(obj_dml_data.z_cols) + " have been set as instrumental variable(s). " + "PLRVector does not support instrumental variables." + ) + + @property + def required_learners(self) -> list[str]: + """Required learners for the current score.""" + names = ["ml_l", "ml_m"] + if self.score == "IV-type": + names.append("ml_g") + return names + + def set_learners( + self, + ml_l: object | None = None, + ml_m: object | None = None, + ml_g: object | None = None, + ) -> Self: + """Set the learners for nuisance estimation on every sub-model. + + Parameters + ---------- + ml_l : estimator or None, optional + Learner for :math:`\\ell_0(X) = E[Y|X]`. + ml_m : estimator or None, optional + Learner for :math:`m_0(X) = E[D|X]`. + ml_g : estimator or None, optional + Learner for :math:`g_0(X) = E[Y - D\\theta_0|X]`. Required for ``score='IV-type'``. 
+ + Returns + ------- + self : PLRVector + """ + if self._modellist is None: + raise RuntimeError("Sub-models are not initialized. _initialize_models() must run in __init__.") + for model in self._modellist: + model.set_learners(ml_l=ml_l, ml_m=ml_m, ml_g=ml_g) + self._reset_fit_state() + return self + + def _initialize_models(self) -> list[DoubleMLScalar]: + """Create one PLR sub-model per treatment column.""" + return [PLR(obj_dml_data=self._get_data_for_model(d_col), score=self.score) for d_col in self._dml_data.d_cols] + + def cate(self, *args: Any, **kwargs: Any) -> Any: + """Not implemented for multi-treatment PLR.""" + raise NotImplementedError( + "cate() is not defined for multi-treatment PLR. " + "Use the single-treatment PLR (doubleml.plm.plr_scalar.PLR) instead." + ) + + def gate(self, *args: Any, **kwargs: Any) -> Any: + """Not implemented for multi-treatment PLR.""" + raise NotImplementedError( + "gate() is not defined for multi-treatment PLR. " + "Use the single-treatment PLR (doubleml.plm.plr_scalar.PLR) instead." 
+ ) diff --git a/doubleml/plm/tests/test_plr_vector.py b/doubleml/plm/tests/test_plr_vector.py new file mode 100644 index 00000000..8798b391 --- /dev/null +++ b/doubleml/plm/tests/test_plr_vector.py @@ -0,0 +1,52 @@ +"""Core multi-treatment estimation accuracy for PLRVector.""" + +import numpy as np +import pytest +from sklearn.base import clone +from sklearn.linear_model import Lasso + +import doubleml as dml +from doubleml.plm.plr_vector import PLRVector + + +@pytest.fixture(scope="module", params=["partialling out", "IV-type"]) +def score(request): + return request.param + + +@pytest.fixture(scope="module") +def fitted_plr_vector_bivariate(generate_data_bivariate, score): + """PLRVector fitted on bivariate data with theta = [0.5, 0.9].""" + data = generate_data_bivariate + x_cols = data.columns[data.columns.str.startswith("X")].tolist() + d_cols = data.columns[data.columns.str.startswith("d")].tolist() + obj_dml_data = dml.DoubleMLData(data, y_col="y", d_cols=d_cols, x_cols=x_cols) + + learner = Lasso(alpha=0.1) + np.random.seed(3141) + dml_obj = PLRVector(obj_dml_data, score=score) + dml_obj.set_learners(ml_l=clone(learner), ml_m=clone(learner), ml_g=clone(learner) if score == "IV-type" else None) + dml_obj.draw_sample_splitting(n_folds=5, n_rep=1) + dml_obj.fit() + return dml_obj, np.array([0.5, 0.9]) + + +@pytest.mark.ci +def test_coef_within_3_sigma(fitted_plr_vector_bivariate): + """All treatment coefficients fall within 3 SE of the true thetas.""" + dml_obj, true_theta = fitted_plr_vector_bivariate + assert np.all(np.abs(dml_obj.coef - true_theta) <= 3.0 * dml_obj.se) + + +@pytest.mark.ci +def test_se_positive(fitted_plr_vector_bivariate): + """Standard errors are strictly positive for every treatment.""" + dml_obj, _ = fitted_plr_vector_bivariate + assert np.all(dml_obj.se > 0) + + +@pytest.mark.ci +def test_coef_shape_matches_d_cols(fitted_plr_vector_bivariate): + """Coefficient vector has one entry per treatment column.""" + dml_obj, _ = 
fitted_plr_vector_bivariate + assert dml_obj.coef.shape == (len(dml_obj._dml_data.d_cols),) diff --git a/doubleml/plm/tests/test_plr_vector_exceptions.py b/doubleml/plm/tests/test_plr_vector_exceptions.py new file mode 100644 index 00000000..0f018405 --- /dev/null +++ b/doubleml/plm/tests/test_plr_vector_exceptions.py @@ -0,0 +1,160 @@ +"""Validate PLRVector input validation and error handling.""" + +import numpy as np +import pandas as pd +import pytest +from sklearn.linear_model import Lasso + +import doubleml as dml +from doubleml.plm.plr_vector import PLRVector + + +def _make_bivariate_data(n_obs: int = 200, dim_x: int = 5) -> dml.DoubleMLData: + np.random.seed(42) + x = np.random.normal(size=(n_obs, dim_x)) + d0 = np.random.normal(size=n_obs) + d1 = np.random.normal(size=n_obs) + y = 0.5 * d0 + 0.9 * d1 + x[:, 0] + np.random.normal(size=n_obs) + df = pd.DataFrame( + np.column_stack([x, y, d0, d1]), + columns=[f"X{i + 1}" for i in range(dim_x)] + ["y", "d1", "d2"], + ) + return dml.DoubleMLData(df, y_col="y", d_cols=["d1", "d2"], x_cols=[f"X{i + 1}" for i in range(dim_x)]) + + +def _make_binary_outcome_bivariate_data(n_obs: int = 100) -> dml.DoubleMLData: + np.random.seed(11) + x = np.random.normal(size=(n_obs, 3)) + d0 = (np.random.normal(size=n_obs) > 0).astype(float) + d1 = (np.random.normal(size=n_obs) > 0).astype(float) + y = (np.random.normal(size=n_obs) > 0).astype(float) + df = pd.DataFrame({"y": y, "d1": d0, "d2": d1, "X1": x[:, 0], "X2": x[:, 1], "X3": x[:, 2]}) + return dml.DoubleMLData(df, y_col="y", d_cols=["d1", "d2"], x_cols=["X1", "X2", "X3"]) + + +def _make_iv_data(n_obs: int = 200, dim_x: int = 5) -> dml.DoubleMLData: + np.random.seed(42) + x = np.random.normal(size=(n_obs, dim_x)) + d0 = np.random.normal(size=n_obs) + d1 = np.random.normal(size=n_obs) + z = np.random.normal(size=n_obs) + y = 0.5 * d0 + 0.9 * d1 + x[:, 0] + np.random.normal(size=n_obs) + df = pd.DataFrame( + np.column_stack([x, y, d0, d1, z]), + columns=[f"X{i + 1}" for i in 
range(dim_x)] + ["y", "d1", "d2", "Z1"], + ) + return dml.DoubleMLData( + df, + y_col="y", + d_cols=["d1", "d2"], + x_cols=[f"X{i + 1}" for i in range(dim_x)], + z_cols="Z1", + ) + + +@pytest.mark.ci +def test_exception_data_type(): + """Non-DoubleMLData input is rejected with a TypeError.""" + msg = r"The data must be of DoubleMLData type\." + with pytest.raises(TypeError, match=msg): + PLRVector(pd.DataFrame()) + + +@pytest.mark.ci +def test_exception_instrument(): + """Data carrying instrumental variables (z_cols) is rejected.""" + msg = r"Incompatible data\. .* have been set as instrumental variable\(s\)\." + with pytest.raises(ValueError, match=msg): + PLRVector(_make_iv_data()) + + +@pytest.mark.ci +def test_exception_invalid_score(): + """Unknown score string is rejected at construction.""" + msg = r"Invalid score 'invalid'\." + with pytest.raises(ValueError, match=msg): + PLRVector(_make_bivariate_data(), score="invalid") + + +@pytest.mark.ci +def test_exception_iv_type_binary_outcome(): + """IV-type score with binary outcome is rejected.""" + msg = r"For score = 'IV-type', additive probability models \(binary outcomes\) are not supported\." + with pytest.raises(ValueError, match=msg): + PLRVector(_make_binary_outcome_bivariate_data(), score="IV-type") + + +@pytest.mark.ci +def test_exception_n_folds(): + """draw_sample_splitting rejects n_folds < 2.""" + dml_obj = PLRVector(_make_bivariate_data()) + msg = r"n_folds must be an integer >= 2\." + with pytest.raises(ValueError, match=msg): + dml_obj.draw_sample_splitting(n_folds=1) + + +@pytest.mark.ci +def test_exception_n_rep(): + """draw_sample_splitting rejects n_rep < 1.""" + dml_obj = PLRVector(_make_bivariate_data()) + msg = r"n_rep must be an integer >= 1\." 
+ with pytest.raises(ValueError, match=msg): + dml_obj.draw_sample_splitting(n_rep=0) + + +@pytest.mark.ci +def test_exception_missing_learner(): + """fit() fails when no learners are registered.""" + dml_obj = PLRVector(_make_bivariate_data()) + dml_obj.draw_sample_splitting() + msg = r"Learner 'ml_l' is required but not set" + with pytest.raises(ValueError, match=msg): + dml_obj.fit() + + +@pytest.mark.ci +def test_exception_missing_partial_learner(): + """fit() fails when ml_m is missing.""" + dml_obj = PLRVector(_make_bivariate_data()) + dml_obj.set_learners(ml_l=Lasso(alpha=0.1)) + dml_obj.draw_sample_splitting() + msg = r"Learner 'ml_m' is required but not set" + with pytest.raises(ValueError, match=msg): + dml_obj.fit() + + +@pytest.mark.ci +def test_exception_invalid_learner_class(): + """Passing a class instead of an instance raises TypeError.""" + dml_obj = PLRVector(_make_bivariate_data()) + msg = r"Invalid learner provided for ml_l: provide an instance" + with pytest.raises(TypeError, match=msg): + dml_obj.set_learners(ml_l=Lasso) + + +@pytest.mark.ci +def test_warning_ml_g_partialling_out(): + """Passing ml_g with score='partialling out' triggers a UserWarning.""" + dml_obj = PLRVector(_make_bivariate_data(), score="partialling out") + with pytest.warns(UserWarning, match=r"not required for score.*ignored"): + dml_obj.set_learners(ml_l=Lasso(alpha=0.1), ml_m=Lasso(alpha=0.1), ml_g=Lasso(alpha=0.1)) + + +@pytest.mark.ci +def test_cate_not_implemented(): + """cate() raises NotImplementedError on multi-treatment PLR.""" + dml_obj = PLRVector(_make_bivariate_data()) + dml_obj.set_learners(ml_l=Lasso(alpha=0.1), ml_m=Lasso(alpha=0.1)) + dml_obj.fit(n_folds=3) + with pytest.raises(NotImplementedError, match=r"cate\(\) is not defined for multi-treatment PLR"): + dml_obj.cate(pd.DataFrame({"const": np.ones(200)})) + + +@pytest.mark.ci +def test_gate_not_implemented(): + """gate() raises NotImplementedError on multi-treatment PLR.""" + dml_obj = 
PLRVector(_make_bivariate_data()) + dml_obj.set_learners(ml_l=Lasso(alpha=0.1), ml_m=Lasso(alpha=0.1)) + dml_obj.fit(n_folds=3) + with pytest.raises(NotImplementedError, match=r"gate\(\) is not defined for multi-treatment PLR"): + dml_obj.gate(pd.DataFrame({"g": np.ones(200, dtype=bool)})) diff --git a/doubleml/plm/tests/test_plr_vector_external_predictions.py b/doubleml/plm/tests/test_plr_vector_external_predictions.py new file mode 100644 index 00000000..f4316c6f --- /dev/null +++ b/doubleml/plm/tests/test_plr_vector_external_predictions.py @@ -0,0 +1,83 @@ +"""External predictions equivalence for PLRVector across multiple treatments.""" + +import math + +import numpy as np +import pandas as pd +import pytest +from sklearn.linear_model import LinearRegression + +import doubleml as dml +from doubleml.plm.plr_vector import PLRVector + + +def _make_bivariate_data(n_obs: int = 300, dim_x: int = 5) -> dml.DoubleMLData: + np.random.seed(42) + x = np.random.normal(size=(n_obs, dim_x)) + d0 = np.random.normal(size=n_obs) + d1 = np.random.normal(size=n_obs) + y = 0.5 * d0 + 0.9 * d1 + x[:, 0] + np.random.normal(size=n_obs) + df = pd.DataFrame( + np.column_stack([x, y, d0, d1]), + columns=[f"X{i + 1}" for i in range(dim_x)] + ["y", "d1", "d2"], + ) + return dml.DoubleMLData(df, y_col="y", d_cols=["d1", "d2"], x_cols=[f"X{i + 1}" for i in range(dim_x)]) + + +@pytest.fixture(scope="module", params=["partialling out", "IV-type"]) +def score(request): + return request.param + + +@pytest.fixture(scope="module", params=[1, 3]) +def n_rep(request): + return request.param + + +@pytest.fixture(scope="module") +def external_predictions_fixture(score, n_rep): + """Fit a reference PLRVector and a second one consuming its predictions externally.""" + n_folds = 3 + obj_dml_data = _make_bivariate_data() + learner_kwargs: dict[str, object] = {"ml_l": LinearRegression(), "ml_m": LinearRegression()} + if score == "IV-type": + learner_kwargs["ml_g"] = LinearRegression() + + 
np.random.seed(3141) + dml_ref = PLRVector(obj_dml_data, score=score) + dml_ref.set_learners(**learner_kwargs) + dml_ref.draw_sample_splitting(n_folds=n_folds, n_rep=n_rep) + dml_ref.fit() + + # Build external predictions per treatment, replicating every required learner. + learner_names = ["ml_l", "ml_m"] + (["ml_g"] if score == "IV-type" else []) + external_predictions = { + d_col: {name: dml_ref.modellist[i]._predictions[name] for name in learner_names} + for i, d_col in enumerate(obj_dml_data.d_cols) + } + + # Fit a fresh PLRVector consuming the external predictions on identical splits. + dml_ext = PLRVector(obj_dml_data, score=score) + dml_ext.set_learners(**learner_kwargs) + dml_ext.set_sample_splitting(dml_ref.smpls) + dml_ext.fit(external_predictions=external_predictions) + + return {"ref": dml_ref, "ext": dml_ext} + + +@pytest.mark.ci +def test_coef_matches_external(external_predictions_fixture): + """Per-treatment coefficients match the reference fit when fed via external_predictions.""" + ref = external_predictions_fixture["ref"] + ext = external_predictions_fixture["ext"] + for i in range(ref.coef.shape[0]): + assert math.isclose(ref.coef[i], ext.coef[i], rel_tol=1e-9, abs_tol=1e-4) + + +@pytest.mark.ci +def test_se_matches_external(external_predictions_fixture): + """Per-treatment standard errors match the reference fit when fed via external_predictions.""" + ref = external_predictions_fixture["ref"] + ext = external_predictions_fixture["ext"] + for i in range(ref.se.shape[0]): + assert math.isclose(ref.se[i], ext.se[i], rel_tol=1e-9, abs_tol=1e-4) diff --git a/doubleml/plm/tests/test_plr_vector_return_types.py b/doubleml/plm/tests/test_plr_vector_return_types.py new file mode 100644 index 00000000..65346e17 --- /dev/null +++ b/doubleml/plm/tests/test_plr_vector_return_types.py @@ -0,0 +1,222 @@ +"""Validate PLRVector return types and reset behavior.""" + +import numpy as np +import pandas as pd +import pytest +from sklearn.linear_model import 
LinearRegression + +import doubleml as dml +from doubleml.plm.plr_vector import PLRVector + +N_OBS = 200 +N_FOLDS = 3 +N_REP = 2 +N_REP_BOOT = 251 + + +def _make_data(n_obs: int = N_OBS, dim_x: int = 5) -> dml.DoubleMLData: + """Build a small bivariate-treatment DoubleMLData for return-type tests.""" + np.random.seed(7) + x = np.random.normal(size=(n_obs, dim_x)) + d0 = np.random.normal(size=n_obs) + d1 = np.random.normal(size=n_obs) + y = 0.5 * d0 + 0.9 * d1 + x[:, 0] + np.random.normal(size=n_obs) + df = pd.DataFrame( + np.column_stack([x, y, d0, d1]), + columns=[f"X{i + 1}" for i in range(dim_x)] + ["y", "d1", "d2"], + ) + return dml.DoubleMLData(df, y_col="y", d_cols=["d1", "d2"], x_cols=[f"X{i + 1}" for i in range(dim_x)]) + + +N_TREAT = 2 # tied to _make_data + + +@pytest.fixture(scope="module") +def fitted_plr_vector(): + """Fit a PLRVector once and share across tests.""" + np.random.seed(3141) + obj_dml_data = _make_data() + dml_obj = PLRVector(obj_dml_data) + dml_obj.set_learners(ml_l=LinearRegression(), ml_m=LinearRegression()) + dml_obj.draw_sample_splitting(n_folds=N_FOLDS, n_rep=N_REP) + dml_obj.fit() + dml_obj.bootstrap(n_rep_boot=N_REP_BOOT) + return dml_obj + + +@pytest.mark.ci +def test_coef_type_and_shape(fitted_plr_vector): + """coef is a 1D array with one entry per treatment.""" + assert isinstance(fitted_plr_vector.coef, np.ndarray) + assert fitted_plr_vector.coef.shape == (N_TREAT,) + + +@pytest.mark.ci +def test_se_type_and_shape(fitted_plr_vector): + """se is a 1D array with one entry per treatment.""" + assert isinstance(fitted_plr_vector.se, np.ndarray) + assert fitted_plr_vector.se.shape == (N_TREAT,) + + +@pytest.mark.ci +def test_all_thetas_shape(fitted_plr_vector): + """all_thetas is (n_treat, n_rep).""" + assert fitted_plr_vector.all_thetas.shape == (N_TREAT, N_REP) + + +@pytest.mark.ci +def test_all_ses_shape(fitted_plr_vector): + """all_ses is (n_treat, n_rep).""" + assert fitted_plr_vector.all_ses.shape == (N_TREAT, N_REP) + + 
+@pytest.mark.ci +def test_summary_index_matches_d_cols(fitted_plr_vector): + """summary is a DataFrame indexed by d_cols in declaration order.""" + summary = fitted_plr_vector.summary + assert isinstance(summary, pd.DataFrame) + assert summary.shape[0] == N_TREAT + assert summary.index.tolist() == ["d1", "d2"] + + +@pytest.mark.ci +def test_confint_shape(fitted_plr_vector): + """confint returns (n_treat, 2) DataFrame.""" + ci = fitted_plr_vector.confint() + assert isinstance(ci, pd.DataFrame) + assert ci.shape == (N_TREAT, 2) + + +@pytest.mark.ci +def test_confint_joint_shape(fitted_plr_vector): + """confint(joint=True) returns (n_treat, 2) DataFrame after bootstrap.""" + ci = fitted_plr_vector.confint(joint=True) + assert isinstance(ci, pd.DataFrame) + assert ci.shape == (N_TREAT, 2) + + +@pytest.mark.ci +def test_psi_shape(fitted_plr_vector): + """psi has shape (n_obs, n_treat, n_rep).""" + assert fitted_plr_vector.psi.shape == (N_OBS, N_TREAT, N_REP) + + +@pytest.mark.ci +def test_modellist_length_and_type(fitted_plr_vector): + """modellist exposes one PLR scalar sub-model per treatment.""" + from doubleml.plm.plr_scalar import PLR + + models = fitted_plr_vector.modellist + assert isinstance(models, list) + assert len(models) == N_TREAT + assert all(isinstance(m, PLR) for m in models) + + +@pytest.mark.ci +def test_smpls_shared_across_submodels(fitted_plr_vector): + """Sample splits are propagated identically into each sub-model.""" + parent_smpls = fitted_plr_vector.smpls + for model in fitted_plr_vector.modellist: + for i_rep in range(N_REP): + for j_fold in range(N_FOLDS): + np.testing.assert_array_equal(model.smpls[i_rep][j_fold][0], parent_smpls[i_rep][j_fold][0]) + np.testing.assert_array_equal(model.smpls[i_rep][j_fold][1], parent_smpls[i_rep][j_fold][1]) + + +@pytest.mark.ci +def test_n_properties(fitted_plr_vector): + """n_obs, n_folds, n_rep, score reflect configuration.""" + assert fitted_plr_vector.n_obs == N_OBS + assert fitted_plr_vector.n_folds 
== N_FOLDS + assert fitted_plr_vector.n_rep == N_REP + assert fitted_plr_vector.score == "partialling out" + + +@pytest.mark.ci +def test_required_learners(fitted_plr_vector): + """required_learners is score-dependent and matches scalar PLR.""" + assert fitted_plr_vector.required_learners == ["ml_l", "ml_m"] + + +@pytest.mark.ci +def test_get_params_returns_per_submodel_list(fitted_plr_vector): + """get_params returns one parameter dict per sub-model, in d_cols order.""" + params = fitted_plr_vector.get_params("ml_l") + assert isinstance(params, list) + assert len(params) == N_TREAT + for p in params: + assert isinstance(p, dict) + assert "fit_intercept" in p + + +@pytest.mark.ci +def test_set_params_updates_all_submodels(fitted_plr_vector): + """set_params propagates to every sub-model and returns self.""" + result = fitted_plr_vector.set_params("ml_l", fit_intercept=False) + assert result is fitted_plr_vector + params = fitted_plr_vector.get_params("ml_l") + assert all(p["fit_intercept"] is False for p in params) + fitted_plr_vector.set_params("ml_l", fit_intercept=True) + + +@pytest.mark.ci +def test_sensitivity_elements_shape(fitted_plr_vector): + """sensitivity_elements exposes framework-level keys with multi-treatment shapes.""" + elems = fitted_plr_vector.sensitivity_elements + assert isinstance(elems, dict) + for key in ["sigma2", "nu2", "max_bias"]: + assert elems[key].shape == (1, N_TREAT, N_REP) + assert elems["psi_max_bias"].shape == (N_OBS, N_TREAT, N_REP) + + +@pytest.mark.ci +def test_treatment_names_set_on_framework(fitted_plr_vector): + """treatment_names on the framework match d_cols.""" + assert fitted_plr_vector.framework.treatment_names == ["d1", "d2"] + + +@pytest.mark.ci +def test_before_fit_raises(): + """Properties relying on framework raise before fit().""" + np.random.seed(3141) + dml_obj = PLRVector(_make_data()) + with pytest.raises(ValueError, match="framework is not yet initialized"): + _ = dml_obj.coef + + +@pytest.mark.ci +def 
test_reset_after_draw_sample_splitting(): + """draw_sample_splitting clears framework and fitted properties on vector and sub-models.""" + np.random.seed(3141) + dml_obj = PLRVector(_make_data()) + dml_obj.set_learners(ml_l=LinearRegression(), ml_m=LinearRegression()) + dml_obj.draw_sample_splitting(n_folds=N_FOLDS, n_rep=N_REP) + dml_obj.fit() + _ = dml_obj.framework + _ = dml_obj.coef + + dml_obj.draw_sample_splitting(n_folds=N_FOLDS, n_rep=N_REP) + with pytest.raises(ValueError, match="framework is not yet initialized"): + _ = dml_obj.framework + with pytest.raises(ValueError, match="framework is not yet initialized"): + _ = dml_obj.coef + for model in dml_obj.modellist: + with pytest.raises(ValueError, match="framework is not yet initialized"): + _ = model.framework + + +@pytest.mark.ci +def test_reset_after_set_learners(): + """set_learners after fit clears the vector framework so stale results aren't returned.""" + np.random.seed(3141) + dml_obj = PLRVector(_make_data()) + dml_obj.set_learners(ml_l=LinearRegression(), ml_m=LinearRegression()) + dml_obj.fit(n_folds=N_FOLDS, n_rep=N_REP) + _ = dml_obj.framework + + dml_obj.set_learners(ml_l=LinearRegression(), ml_m=LinearRegression()) + with pytest.raises(ValueError, match="framework is not yet initialized"): + _ = dml_obj.framework + for model in dml_obj.modellist: + with pytest.raises(ValueError, match="framework is not yet initialized"): + _ = model.framework diff --git a/doubleml/plm/tests/test_plr_vector_vs_plr.py b/doubleml/plm/tests/test_plr_vector_vs_plr.py new file mode 100644 index 00000000..58e28d4a --- /dev/null +++ b/doubleml/plm/tests/test_plr_vector_vs_plr.py @@ -0,0 +1,122 @@ +"""Compare PLRVector against the legacy DoubleMLPLR implementation for multi-treatment data.""" + +import numpy as np +import pytest +from sklearn.base import clone +from sklearn.linear_model import Lasso, LinearRegression + +import doubleml as dml +from doubleml.plm.plr_vector import PLRVector + + 
+@pytest.fixture(scope="module", params=[LinearRegression(), Lasso(alpha=0.1)]) +def learner(request): + return request.param + + +@pytest.fixture(scope="module", params=["partialling out", "IV-type"]) +def score(request): + return request.param + + +@pytest.fixture(scope="module", params=[1, 3]) +def n_rep(request): + return request.param + + +@pytest.fixture(scope="module") +def comparison_fixture(generate_data_bivariate, learner, score, n_rep): + n_folds = 3 + seed = 3141 + data = generate_data_bivariate + x_cols = data.columns[data.columns.str.startswith("X")].tolist() + d_cols = data.columns[data.columns.str.startswith("d")].tolist() + + obj_dml_data = dml.DoubleMLData(data, y_col="y", d_cols=d_cols, x_cols=x_cols) + + ml_g_arg = clone(learner) if score == "IV-type" else None + + # Legacy DoubleMLPLR draws splits in __init__ + np.random.seed(seed) + dml_old = dml.DoubleMLPLR( + obj_dml_data, + clone(learner), + clone(learner), + ml_g_arg, + n_folds=n_folds, + n_rep=n_rep, + score=score, + ) + dml_old.fit() + + # New PLRVector draws splits explicitly via draw_sample_splitting + np.random.seed(seed) + dml_new = PLRVector(obj_dml_data, score=score) + dml_new.set_learners(ml_l=clone(learner), ml_m=clone(learner), ml_g=ml_g_arg) + dml_new.draw_sample_splitting(n_folds=n_folds, n_rep=n_rep) + dml_new.fit() + + return {"old": dml_old, "new": dml_new} + + +@pytest.mark.ci +def test_coef_equal(comparison_fixture): + """PLRVector.coef matches legacy DoubleMLPLR.coef per treatment.""" + old = comparison_fixture["old"] + new = comparison_fixture["new"] + np.testing.assert_allclose(new.coef, old.coef, rtol=1e-9) + + +@pytest.mark.ci +def test_se_equal(comparison_fixture): + """PLRVector.se matches legacy DoubleMLPLR.se per treatment.""" + old = comparison_fixture["old"] + new = comparison_fixture["new"] + np.testing.assert_allclose(new.se, old.se, rtol=1e-9) + + +@pytest.mark.ci +def test_all_coef_equal(comparison_fixture): + """PLRVector.all_thetas matches legacy 
DoubleMLPLR.all_coef.""" + old = comparison_fixture["old"] + new = comparison_fixture["new"] + np.testing.assert_allclose(new.all_thetas, old.all_coef, rtol=1e-9) + + +@pytest.mark.ci +def test_all_se_equal(comparison_fixture): + """PLRVector.all_ses matches legacy DoubleMLPLR.all_se.""" + old = comparison_fixture["old"] + new = comparison_fixture["new"] + np.testing.assert_allclose(new.all_ses, old.all_se, rtol=1e-9) + + +@pytest.mark.ci +def test_sensitivity_sigma2_equal(comparison_fixture): + """PLRVector sigma2 matches legacy DoubleMLPLR sensitivity_elements['sigma2'] after axis swap.""" + old = comparison_fixture["old"] + new = comparison_fixture["new"] + # Legacy shape: (1, n_rep, n_treat); vector shape: (1, n_treat, n_rep). Transpose to align. + old_sigma2 = np.transpose(old.sensitivity_elements["sigma2"], (0, 2, 1)) + np.testing.assert_allclose(new.sensitivity_elements["sigma2"], old_sigma2, rtol=1e-9) + + +@pytest.mark.ci +def test_sensitivity_nu2_equal(comparison_fixture): + """PLRVector nu2 matches legacy DoubleMLPLR sensitivity_elements['nu2'] after axis swap.""" + old = comparison_fixture["old"] + new = comparison_fixture["new"] + old_nu2 = np.transpose(old.sensitivity_elements["nu2"], (0, 2, 1)) + np.testing.assert_allclose(new.sensitivity_elements["nu2"], old_nu2, rtol=1e-9) + + +@pytest.mark.ci +def test_sensitivity_max_bias_equal(comparison_fixture): + """PLRVector framework max_bias matches legacy DoubleMLPLR framework max_bias.""" + old = comparison_fixture["old"] + new = comparison_fixture["new"] + np.testing.assert_allclose( + new.framework.sensitivity_elements["max_bias"], + old.framework.sensitivity_elements["max_bias"], + rtol=1e-9, + ) From 1ae721c9307b24082e99faf489305f86aff72c06 Mon Sep 17 00:00:00 2001 From: SvenKlaassen Date: Sat, 9 May 2026 16:44:17 +0200 Subject: [PATCH 28/38] refactor: move Self type hint import to typing_extensions for 3.10 --- doubleml/double_ml_base.py | 3 ++- doubleml/double_ml_scalar.py | 4 +++- 
doubleml/double_ml_vector.py | 4 +++- doubleml/irm/irm_scalar.py | 3 ++- doubleml/plm/plr_scalar.py | 3 ++- doubleml/plm/plr_vector.py | 4 +++- 6 files changed, 15 insertions(+), 6 deletions(-) diff --git a/doubleml/double_ml_base.py b/doubleml/double_ml_base.py index 645e3ed6..df0affa8 100644 --- a/doubleml/double_ml_base.py +++ b/doubleml/double_ml_base.py @@ -3,10 +3,11 @@ """ from abc import ABC, abstractmethod -from typing import Dict, Optional, Self +from typing import Dict, Optional import numpy as np import pandas as pd +from typing_extensions import Self from .data.base_data import DoubleMLBaseData from .double_ml_framework import DoubleMLFramework diff --git a/doubleml/double_ml_scalar.py b/doubleml/double_ml_scalar.py index d460b824..39bd495e 100644 --- a/doubleml/double_ml_scalar.py +++ b/doubleml/double_ml_scalar.py @@ -4,7 +4,9 @@ import warnings from abc import ABC, abstractmethod -from typing import TYPE_CHECKING, Any, Callable, ClassVar, Self +from typing import TYPE_CHECKING, Any, Callable, ClassVar + +from typing_extensions import Self if TYPE_CHECKING: from .utils._tune_optuna import DMLOptunaResult diff --git a/doubleml/double_ml_vector.py b/doubleml/double_ml_vector.py index f6f1e376..e2a7c7d7 100644 --- a/doubleml/double_ml_vector.py +++ b/doubleml/double_ml_vector.py @@ -4,7 +4,9 @@ import copy from abc import ABC, abstractmethod -from typing import TYPE_CHECKING, Any, Self +from typing import TYPE_CHECKING, Any + +from typing_extensions import Self if TYPE_CHECKING: from .utils._tune_optuna import DMLOptunaResult diff --git a/doubleml/irm/irm_scalar.py b/doubleml/irm/irm_scalar.py index ac44a346..1ceac4af 100644 --- a/doubleml/irm/irm_scalar.py +++ b/doubleml/irm/irm_scalar.py @@ -5,12 +5,13 @@ from __future__ import annotations import warnings -from typing import Any, ClassVar, Self +from typing import Any, ClassVar import numpy as np import pandas as pd from sklearn.base import clone from sklearn.utils.multiclass import type_of_target 
+from typing_extensions import Self from ..data.base_data import DoubleMLData from ..double_ml_linear_score import LinearScoreMixin diff --git a/doubleml/plm/plr_scalar.py b/doubleml/plm/plr_scalar.py index 4451e267..2831453e 100644 --- a/doubleml/plm/plr_scalar.py +++ b/doubleml/plm/plr_scalar.py @@ -5,12 +5,13 @@ from __future__ import annotations import warnings -from typing import Any, ClassVar, Self +from typing import Any, ClassVar import numpy as np import pandas as pd from sklearn.base import clone from sklearn.model_selection import cross_val_predict +from typing_extensions import Self from ..data.base_data import DoubleMLData from ..double_ml_linear_score import LinearScoreMixin diff --git a/doubleml/plm/plr_vector.py b/doubleml/plm/plr_vector.py index 6a3621bb..87b55764 100644 --- a/doubleml/plm/plr_vector.py +++ b/doubleml/plm/plr_vector.py @@ -2,7 +2,9 @@ from __future__ import annotations -from typing import Any, Self +from typing import Any + +from typing_extensions import Self from ..data.base_data import DoubleMLData from ..double_ml_scalar import DoubleMLScalar From f1c0bcdaa74baede9c80dd2459031a906ffa7f6d Mon Sep 17 00:00:00 2001 From: SvenKlaassen Date: Sat, 9 May 2026 16:54:37 +0200 Subject: [PATCH 29/38] Fix high priority codacy issues: update set_learners method signature and enhance error handling in PLR and LearnerSpec validation --- doubleml/double_ml_scalar.py | 7 +------ doubleml/plm/plr_scalar.py | 5 +++-- doubleml/utils/_checks.py | 4 +++- doubleml/utils/_learner.py | 5 ++++- doubleml/utils/tests/test_learner.py | 13 +++++++++++++ 5 files changed, 24 insertions(+), 10 deletions(-) create mode 100644 doubleml/utils/tests/test_learner.py diff --git a/doubleml/double_ml_scalar.py b/doubleml/double_ml_scalar.py index 39bd495e..10f74365 100644 --- a/doubleml/double_ml_scalar.py +++ b/doubleml/double_ml_scalar.py @@ -397,18 +397,13 @@ def _register_learner(self, name: str, learner: object) -> None: self._learners[name] = info @abstractmethod 
- def set_learners(self, **kwargs: object) -> Self: + def set_learners(self) -> Self: """ Set the learners for nuisance estimation. Subclasses must implement this method with explicit keyword arguments for each learner (e.g., ``ml_l``, ``ml_m``, ``ml_g`` for PLR). - Parameters - ---------- - **kwargs - Learner keyword arguments specific to the subclass. - Returns ------- self : Self diff --git a/doubleml/plm/plr_scalar.py b/doubleml/plm/plr_scalar.py index 2831453e..77980881 100644 --- a/doubleml/plm/plr_scalar.py +++ b/doubleml/plm/plr_scalar.py @@ -388,11 +388,12 @@ def _get_score_elements(self) -> dict[str, np.ndarray]: u_hat = y[:, np.newaxis] - l_hat psi_a = -v_hat * v_hat psi_b = v_hat * u_hat - else: - assert self.score == "IV-type" + elif self.score == "IV-type": g_hat = self._predictions["ml_g"] psi_a = -v_hat * d[:, np.newaxis] psi_b = v_hat * (y[:, np.newaxis] - g_hat) + else: + raise ValueError(f"Invalid score '{self.score}'.") return {"psi_a": psi_a, "psi_b": psi_b} diff --git a/doubleml/utils/_checks.py b/doubleml/utils/_checks.py index 8adfafc1..3b065031 100644 --- a/doubleml/utils/_checks.py +++ b/doubleml/utils/_checks.py @@ -556,6 +556,9 @@ def _check_learner(learner, learner_name, regressor=True, classifier=True): err_msg_prefix = f"Invalid learner provided for {learner_name}: " warn_msg_prefix = f"Learner provided for {learner_name} is probably invalid: " + if not (regressor or classifier): + raise ValueError("At least one of regressor or classifier must be True.") + if isinstance(learner, type): raise TypeError(err_msg_prefix + "provide an instance of a learner instead of a class.") @@ -583,7 +586,6 @@ def _check_learner(learner, learner_name, regressor=True, classifier=True): warnings.warn(warn_msg_prefix + f"{str(learner)} is (probably) no classifier.") learner_is_classifier = True else: - assert regressor # classifier, regressor or both must be True if not is_regressor(learner): warnings.warn(warn_msg_prefix + f"{str(learner)} is (probably) 
no regressor.") learner_is_classifier = False diff --git a/doubleml/utils/_learner.py b/doubleml/utils/_learner.py index 04659c98..5c31a16a 100644 --- a/doubleml/utils/_learner.py +++ b/doubleml/utils/_learner.py @@ -36,6 +36,10 @@ class LearnerSpec: allow_classifier: bool = True binary_data_check: Optional[Literal["outcome", "treatment"]] = None + def __post_init__(self) -> None: + if not (self.allow_regressor or self.allow_classifier): + raise ValueError(f"LearnerSpec '{self.name}': at least one of allow_regressor or allow_classifier must be True.") + @dataclass class LearnerInfo: @@ -127,7 +131,6 @@ def validate_learner( warnings.warn(warn_msg_prefix + f"{str(learner)} is (probably) no classifier.") learner_is_classifier = True else: - assert spec.allow_regressor # At least one must be True if not is_regressor(learner): warnings.warn(warn_msg_prefix + f"{str(learner)} is (probably) no regressor.") learner_is_classifier = False diff --git a/doubleml/utils/tests/test_learner.py b/doubleml/utils/tests/test_learner.py new file mode 100644 index 00000000..07d67c74 --- /dev/null +++ b/doubleml/utils/tests/test_learner.py @@ -0,0 +1,13 @@ +"""Tests for LearnerSpec validation in doubleml.utils._learner.""" + +import pytest + +from doubleml.utils._learner import LearnerSpec + + +@pytest.mark.ci +def test_learner_spec_requires_regressor_or_classifier(): + """LearnerSpec must have at least one of allow_regressor / allow_classifier set to True.""" + msg = r"LearnerSpec 'ml_x': at least one of allow_regressor or allow_classifier must be True\." 
+ with pytest.raises(ValueError, match=msg): + LearnerSpec("ml_x", allow_regressor=False, allow_classifier=False) From d74e9f935d21f4fa822072832a79919925dbf89c Mon Sep 17 00:00:00 2001 From: SvenKlaassen Date: Sat, 9 May 2026 17:23:51 +0200 Subject: [PATCH 30/38] fix medium codacy issues: streamline learner validation by extracting checks into dedicated functions --- doubleml/utils/_checks.py | 104 +++++++++++++++------------- doubleml/utils/_learner.py | 136 ++++++++++++++++++------------------- 2 files changed, 123 insertions(+), 117 deletions(-) diff --git a/doubleml/utils/_checks.py b/doubleml/utils/_checks.py index 3b065031..2857823a 100644 --- a/doubleml/utils/_checks.py +++ b/doubleml/utils/_checks.py @@ -241,59 +241,65 @@ def _check_benchmarks(benchmarks): return +def _check_weights_array(weights, n_obs): + if (weights.ndim != 1) or weights.shape[0] != n_obs: + raise ValueError(f"weights must have shape ({n_obs},). weights of shape {weights.shape} was passed.") + if not np.all(0 <= weights): + raise ValueError("All weights values must be greater or equal 0.") + if weights.sum() == 0: + raise ValueError("At least one weight must be non-zero.") + + +def _check_weights_atte(weights): + if not isinstance(weights, np.ndarray): + raise TypeError(f"weights must be a numpy array for ATTE score. weights of type {str(type(weights))} was passed.") + + is_binary = np.all((np.power(weights, 2) - weights) == 0) + if not is_binary: + raise ValueError("weights must be binary for ATTE score.") + + +def _check_weights_dict(weights, score, n_obs, n_rep): + if score != "ATE": + raise ValueError(f"weights as a dictionary is only supported for ATE score, got '{score}'.") + expected_keys = ["weights", "weights_bar"] + if not set(weights.keys()) == set(expected_keys): + raise ValueError(f"weights must have keys {expected_keys}. keys {str(weights.keys())} were passed.") + + if weights["weights"].shape != (n_obs,): + raise ValueError(f"weights must have shape ({n_obs},). 
weights of shape {weights['weights'].shape} was passed.") + # weights_bar must be 2D with n_obs rows; the n_rep column is validated later when n_rep is known + if weights["weights_bar"].ndim != 2 or weights["weights_bar"].shape[0] != n_obs: + raise ValueError( + f"weights_bar must be a 2-dimensional array with {n_obs} rows. " + f"weights_bar of shape {weights['weights_bar'].shape} was passed." + ) + if n_rep is not None and weights["weights_bar"].shape[1] != n_rep: + raise ValueError( + f"weights_bar must have shape ({n_obs}, {n_rep}). " + f"weights_bar of shape {weights['weights_bar'].shape} was passed." + ) + if (not np.all(weights["weights"] >= 0)) or (not np.all(weights["weights_bar"] >= 0)): + raise ValueError("All weights values must be greater or equal 0.") + if (weights["weights"].sum() == 0) or (weights["weights_bar"].sum() == 0): + raise ValueError("At least one weight must be non-zero.") + + def _check_weights(weights, score, n_obs, n_rep: int | None = None): - if weights is not None: - # check general type - if (not isinstance(weights, np.ndarray)) and (not isinstance(weights, dict)): - raise TypeError(f"weights must be a numpy array or dictionary. weights of type {str(type(weights))} was passed.") - - # check shape - if isinstance(weights, np.ndarray): - if (weights.ndim != 1) or weights.shape[0] != n_obs: - raise ValueError(f"weights must have shape ({n_obs},). weights of shape {weights.shape} was passed.") - if not np.all(0 <= weights): - raise ValueError("All weights values must be greater or equal 0.") - if weights.sum() == 0: - raise ValueError("At least one weight must be non-zero.") - - # check special form for ATTE score - if score == "ATTE": - if not isinstance(weights, np.ndarray): - raise TypeError( - f"weights must be a numpy array for ATTE score. weights of type {str(type(weights))} was passed." 
- ) + if weights is None: + return - is_binary = np.all((np.power(weights, 2) - weights) == 0) - if not is_binary: - raise ValueError("weights must be binary for ATTE score.") + if not isinstance(weights, (np.ndarray, dict)): + raise TypeError(f"weights must be a numpy array or dictionary. weights of type {str(type(weights))} was passed.") - # check general form for ATE score - if isinstance(weights, dict): - assert score == "ATE" - expected_keys = ["weights", "weights_bar"] - if not set(weights.keys()) == set(expected_keys): - raise ValueError(f"weights must have keys {expected_keys}. keys {str(weights.keys())} were passed.") + if isinstance(weights, np.ndarray): + _check_weights_array(weights, n_obs) - if weights["weights"].shape != (n_obs,): - raise ValueError( - f"weights must have shape ({n_obs},). weights of shape {weights['weights'].shape} was passed." - ) - # weights_bar must be 2D with n_obs rows; the n_rep column is validated later when n_rep is known - if weights["weights_bar"].ndim != 2 or weights["weights_bar"].shape[0] != n_obs: - raise ValueError( - f"weights_bar must be a 2-dimensional array with {n_obs} rows. " - f"weights_bar of shape {weights['weights_bar'].shape} was passed." - ) - if n_rep is not None and weights["weights_bar"].shape[1] != n_rep: - raise ValueError( - f"weights_bar must have shape ({n_obs}, {n_rep}). " - f"weights_bar of shape {weights['weights_bar'].shape} was passed." 
- ) - if (not np.all(weights["weights"] >= 0)) or (not np.all(weights["weights_bar"] >= 0)): - raise ValueError("All weights values must be greater or equal 0.") - if (weights["weights"].sum() == 0) or (weights["weights_bar"].sum() == 0): - raise ValueError("At least one weight must be non-zero.") - return + if score == "ATTE": + _check_weights_atte(weights) + + if isinstance(weights, dict): + _check_weights_dict(weights, score, n_obs, n_rep) def _check_external_predictions(external_predictions, valid_treatments, valid_learners, n_obs, n_rep): diff --git a/doubleml/utils/_learner.py b/doubleml/utils/_learner.py index 5c31a16a..83c1537f 100644 --- a/doubleml/utils/_learner.py +++ b/doubleml/utils/_learner.py @@ -63,6 +63,65 @@ def predict_method(self) -> str: return "predict_proba" if self.is_classifier else "predict" +def _check_learner_interface(learner: Any, err_prefix: str) -> None: + """Raise TypeError if learner is a class or lacks fit/set_params/get_params.""" + if isinstance(learner, type): + raise TypeError(err_prefix + "provide an instance of a learner instead of a class.") + for method in ("fit", "set_params", "get_params"): + if not hasattr(learner, method): + raise TypeError(err_prefix + f"{str(learner)} has no method .{method}().") + + +def _determine_learner_type(learner: Any, spec: LearnerSpec, warn_prefix: str) -> bool: + """Return True if learner should be treated as classifier; warn if type is ambiguous.""" + if spec.allow_regressor and spec.allow_classifier: + if is_classifier(learner): + return True + if is_regressor(learner): + return False + warnings.warn( + warn_prefix + + f"{str(learner)} is (probably) neither a regressor nor a classifier. " + + "Method predict is used for prediction." 
+ ) + return False + if spec.allow_classifier: + if not is_classifier(learner): + warnings.warn(warn_prefix + f"{str(learner)} is (probably) no classifier.") + return True + if not is_regressor(learner): + warnings.warn(warn_prefix + f"{str(learner)} is (probably) no regressor.") + return False + + +def _check_binary_data_compatibility( + learner: Any, + spec: LearnerSpec, + learner_is_classifier: bool, + binary_outcome: bool, + binary_treatment: bool, +) -> None: + """Raise on classifier with non-binary data; warn on regressor with binary data.""" + if not spec.binary_data_check: + return + + is_outcome_check = spec.binary_data_check == "outcome" + data_is_binary = binary_outcome if is_outcome_check else binary_treatment + var_label = "outcome" if is_outcome_check else "treatment" + + if learner_is_classifier and not data_is_binary: + raise ValueError( + f"The {spec.name} learner {str(learner)} was identified as classifier " + f"but the {var_label} variable is not binary with values 0 and 1." + ) + + if not learner_is_classifier and data_is_binary: + action = "fit an additive probability model" if is_outcome_check else "estimate propensity scores" + warnings.warn( + f"Binary {var_label} detected. Consider using a classifier for {spec.name} " f"with predict_proba() to {action}." 
+ ) + + def validate_learner( learner: Any, spec: LearnerSpec, @@ -100,80 +159,21 @@ def validate_learner( err_msg_prefix = f"Invalid learner provided for {spec.name}: " warn_msg_prefix = f"Learner provided for {spec.name} is probably invalid: " - # Check it's an instance, not a class - if isinstance(learner, type): - raise TypeError(err_msg_prefix + "provide an instance of a learner instead of a class.") - - # Check required methods - if not hasattr(learner, "fit"): - raise TypeError(err_msg_prefix + f"{str(learner)} has no method .fit().") - if not hasattr(learner, "set_params"): - raise TypeError(err_msg_prefix + f"{str(learner)} has no method .set_params().") - if not hasattr(learner, "get_params"): - raise TypeError(err_msg_prefix + f"{str(learner)} has no method .get_params().") - - # Determine learner type - learner_is_classifier: bool - if spec.allow_regressor and spec.allow_classifier: - if is_classifier(learner): - learner_is_classifier = True - elif is_regressor(learner): - learner_is_classifier = False - else: - warnings.warn( - warn_msg_prefix - + f"{str(learner)} is (probably) neither a regressor nor a classifier. " - + "Method predict is used for prediction." - ) - learner_is_classifier = False - elif spec.allow_classifier: - if not is_classifier(learner): - warnings.warn(warn_msg_prefix + f"{str(learner)} is (probably) no classifier.") - learner_is_classifier = True - else: - if not is_regressor(learner): - warnings.warn(warn_msg_prefix + f"{str(learner)} is (probably) no regressor.") - learner_is_classifier = False - - # Check type is allowed + _check_learner_interface(learner, err_msg_prefix) + learner_is_classifier = _determine_learner_type(learner, spec, warn_msg_prefix) + + # Check type is allowed by spec if learner_is_classifier and not spec.allow_classifier: raise ValueError(f"Classifier not allowed for {spec.name}. 
Use a regressor instead.") if not learner_is_classifier and not spec.allow_regressor: raise ValueError(f"Regressor not allowed for {spec.name}. Use a classifier instead.") # Check prediction method exists - if learner_is_classifier: - if not hasattr(learner, "predict_proba"): - raise TypeError(err_msg_prefix + f"{str(learner)} has no method .predict_proba().") - else: - if not hasattr(learner, "predict"): - raise TypeError(err_msg_prefix + f"{str(learner)} has no method .predict().") - - # Check binary data compatibility for classifiers - if learner_is_classifier and spec.binary_data_check: - if spec.binary_data_check == "outcome" and not binary_outcome: - raise ValueError( - f"The {spec.name} learner {str(learner)} was identified as classifier " - "but the outcome variable is not binary with values 0 and 1." - ) - if spec.binary_data_check == "treatment" and not binary_treatment: - raise ValueError( - f"The {spec.name} learner {str(learner)} was identified as classifier " - "but the treatment variable is not binary with values 0 and 1." - ) - - # Warn if regressor used with binary data - if not learner_is_classifier and spec.binary_data_check: - if spec.binary_data_check == "outcome" and binary_outcome: - warnings.warn( - f"Binary outcome detected. Consider using a classifier for {spec.name} " - "with predict_proba() to fit an additive probability model." - ) - elif spec.binary_data_check == "treatment" and binary_treatment: - warnings.warn( - f"Binary treatment detected. Consider using a classifier for {spec.name} " - "with predict_proba() to estimate propensity scores." 
- ) + predict_method = "predict_proba" if learner_is_classifier else "predict" + if not hasattr(learner, predict_method): + raise TypeError(err_msg_prefix + f"{str(learner)} has no method .{predict_method}().") + + _check_binary_data_compatibility(learner, spec, learner_is_classifier, binary_outcome, binary_treatment) return LearnerInfo( learner=clone(learner), From 4c0fe2ac274b5a8e1d29d6b587d397c9288dae0a Mon Sep 17 00:00:00 2001 From: SvenKlaassen Date: Sat, 9 May 2026 17:35:24 +0200 Subject: [PATCH 31/38] docs: fix docstring lint on new scalar/vector implementations Apply ruff D200/D213/D413 auto-fixes and add __init__ docstrings to DoubleMLVector and PLRVector. --- doubleml/double_ml_base.py | 24 ++++++++++++++--- doubleml/double_ml_framework.py | 7 +++++ doubleml/double_ml_linear_score.py | 8 +++--- doubleml/double_ml_scalar.py | 41 +++++++++++++++++++++++++++--- doubleml/double_ml_vector.py | 25 ++++++++++++++++++ doubleml/irm/irm_scalar.py | 18 +++++++++---- doubleml/plm/plr.py | 7 ++++- doubleml/plm/plr_scalar.py | 18 +++++++++---- doubleml/plm/plr_vector.py | 13 +++++++--- doubleml/utils/_checks.py | 1 + doubleml/utils/_learner.py | 8 +++--- doubleml/utils/_tune_optuna.py | 14 ++++++++-- doubleml/utils/blp.py | 9 +++++-- 13 files changed, 163 insertions(+), 30 deletions(-) diff --git a/doubleml/double_ml_base.py b/doubleml/double_ml_base.py index df0affa8..05e80061 100644 --- a/doubleml/double_ml_base.py +++ b/doubleml/double_ml_base.py @@ -1,6 +1,4 @@ -""" -Abstract base class for Double Machine Learning estimators. -""" +"""Abstract base class for Double Machine Learning estimators.""" from abc import ABC, abstractmethod from typing import Dict, Optional @@ -46,6 +44,7 @@ class DoubleMLBase(ABC): Influence function values (shape: (n_obs, n_thetas, n_rep)). n_rep : int Number of repetitions for sample splitting. 
+ """ def __init__( @@ -59,6 +58,7 @@ def __init__( ---------- obj_dml_data : DoubleMLBaseData The data object for the double machine learning model. + """ # Validate and store data if not isinstance(obj_dml_data, DoubleMLBaseData): @@ -89,6 +89,7 @@ def framework(self) -> DoubleMLFramework: ------ ValueError If framework is not yet initialized (fit() has not been called). + """ if self._framework is None: raise ValueError("The framework is not yet initialized. " "Call fit() before accessing estimation results.") @@ -103,6 +104,7 @@ def thetas(self) -> np.ndarray: ------- np.ndarray Parameter estimates (shape: (n_thetas,)). + """ return self.framework.thetas @@ -115,6 +117,7 @@ def coef(self) -> np.ndarray: ------- np.ndarray Parameter estimates (shape: (n_thetas,)). + """ return self.thetas @@ -127,6 +130,7 @@ def all_thetas(self) -> np.ndarray: ------- np.ndarray Parameter estimates for all repetitions (shape: (n_thetas, n_rep)). + """ return self.framework.all_thetas @@ -139,6 +143,7 @@ def all_coef(self) -> np.ndarray: ------- np.ndarray Parameter estimates for all repetitions (shape: (n_thetas, n_rep)). + """ return self.all_thetas @@ -151,6 +156,7 @@ def se(self) -> np.ndarray: ------- np.ndarray Standard errors (shape: (n_thetas,)). + """ return self.framework.ses @@ -163,6 +169,7 @@ def all_ses(self) -> np.ndarray: ------- np.ndarray Standard errors for all repetitions (shape: (n_thetas, n_rep)). + """ return self.framework.all_ses @@ -175,6 +182,7 @@ def summary(self) -> pd.DataFrame: ------- pd.DataFrame Summary statistics for all parameters. + """ return self.framework.summary @@ -187,6 +195,7 @@ def psi(self) -> np.ndarray: ------- np.ndarray Influence function values (shape: (n_obs, n_thetas, n_rep)). + """ return self.framework.scaled_psi @@ -200,6 +209,7 @@ def n_rep(self) -> int: ------- int Number of repetitions. + """ pass @@ -212,6 +222,7 @@ def n_obs(self) -> int: ------- int Number of observations in the dataset. 
+ """ return self._n_obs @@ -234,6 +245,7 @@ def confint(self, joint: bool = False, level: float = 0.95) -> pd.DataFrame: ------- pd.DataFrame A DataFrame with confidence intervals. + """ return self.framework.confint(joint=joint, level=level) @@ -254,6 +266,7 @@ def bootstrap(self, method: str = "normal", n_rep_boot: int = 500) -> Self: ------- self : DoubleMLBase The DoubleML estimator with bootstrap results. + """ self.framework.bootstrap(method=method, n_rep_boot=n_rep_boot) return self @@ -271,6 +284,7 @@ def p_adjust(self, method: str = "romano-wolf") -> pd.DataFrame: ------- pd.DataFrame A DataFrame with adjusted p-values. + """ return self.framework.p_adjust(method=method) @@ -307,6 +321,7 @@ def sensitivity_analysis( ------- dict A dictionary with sensitivity analysis results. + """ return self.framework.sensitivity_analysis( cf_y=cf_y, @@ -334,6 +349,7 @@ def fit(self, **kwargs) -> Self: ------- self : DoubleMLBase The fitted DoubleML estimator. + """ pass @@ -345,6 +361,7 @@ def __str__(self) -> str: ------- str A formatted string summary of the model. + """ class_name = self.__class__.__name__ header = f"{'=' * 20} {class_name} Object {'=' * 20}" @@ -363,5 +380,6 @@ def __repr__(self) -> str: ------- str A string representation of the object. + """ return self.__str__() diff --git a/doubleml/double_ml_framework.py b/doubleml/double_ml_framework.py index c82ad206..86927470 100644 --- a/doubleml/double_ml_framework.py +++ b/doubleml/double_ml_framework.py @@ -167,6 +167,7 @@ class DoubleMLFramework: ---------- dml_core : DoubleMLCore A DoubleMLCore object providing the estimated parameters and scores. + """ def __init__( @@ -383,6 +384,7 @@ def sensitivity_summary(self): ------- res : str Summary for the sensitivity analysis. 
+ """ header = "================== Sensitivity Analysis ==================\n" if self.sensitivity_params is None: @@ -713,6 +715,7 @@ def sensitivity_analysis(self, cf_y=0.03, cf_d=0.03, rho=1.0, level=0.95, null_h Returns ------- self : object + """ # check null_hypothesis if isinstance(null_hypothesis, float): @@ -772,6 +775,7 @@ def confint(self, joint=False, level=0.95): ------- df_ci : pd.DataFrame A data frame with the confidence interval(s). + """ if not isinstance(joint, bool): @@ -822,6 +826,7 @@ def bootstrap(self, method="normal", n_rep_boot=500): Returns ------- self : object + """ _check_bootstrap(method, n_rep_boot) @@ -858,6 +863,7 @@ def p_adjust(self, method="romano-wolf"): A data frame with adjusted p-values. all_p_vals_corrected : np.ndarray A numpy array with all corrected p-values for each repetition. + """ if not isinstance(method, str): raise TypeError(f"The p_adjust method must be of str type. {str(method)} of type {str(type(method))} was passed.") @@ -970,6 +976,7 @@ def sensitivity_plot( ------- fig : object Plotly figure of the sensitivity contours. + """ _check_integer(idx_treatment, "idx_treatment", lower_bound=0, upper_bound=self.n_thetas - 1) if not isinstance(value, str): diff --git a/doubleml/double_ml_linear_score.py b/doubleml/double_ml_linear_score.py index 4bada4d8..df35f8ec 100644 --- a/doubleml/double_ml_linear_score.py +++ b/doubleml/double_ml_linear_score.py @@ -1,6 +1,4 @@ -""" -Mixin for DoubleML models with linear score functions. 
-""" +"""Mixin for DoubleML models with linear score functions.""" from typing import Dict @@ -34,6 +32,7 @@ class LinearScoreMixin(DoubleMLScalar): Subclasses must implement: - _nuisance_est(): Estimate nuisance parameters for one fold - _get_score_elements(): Return dict with 'psi_a' and 'psi_b' arrays of shape (n_obs, n_rep) + """ def _est_causal_pars_and_se(self, psi_elements: Dict[str, np.ndarray]) -> None: @@ -60,6 +59,7 @@ def _est_causal_pars_and_se(self, psi_elements: Dict[str, np.ndarray]) -> None: - self._psi: Influence function values (n_obs, n_thetas=1, n_rep) - self._psi_deriv: Score derivative w.r.t. θ (n_obs, n_thetas=1, n_rep) - self._var_scaling_factors: Variance scaling factors (n_thetas=1,) + """ # Extract score elements if "psi_a" not in psi_elements or "psi_b" not in psi_elements: @@ -137,6 +137,7 @@ def _compute_score(self, psi_elements: Dict[str, np.ndarray], coef: float) -> np ------- np.ndarray Score function values, shape (n_obs, n_rep). + """ psi_a = psi_elements["psi_a"] psi_b = psi_elements["psi_b"] @@ -151,5 +152,6 @@ def _score_element_names(self) -> list: ------- list List of score element names: ['psi_a', 'psi_b'] + """ return ["psi_a", "psi_b"] diff --git a/doubleml/double_ml_scalar.py b/doubleml/double_ml_scalar.py index 10f74365..a00d415e 100644 --- a/doubleml/double_ml_scalar.py +++ b/doubleml/double_ml_scalar.py @@ -1,6 +1,4 @@ -""" -Abstract base class for scalar DoubleML models (single parameter estimation). -""" +"""Abstract base class for scalar DoubleML models (single parameter estimation).""" import warnings from abc import ABC, abstractmethod @@ -52,6 +50,7 @@ class DoubleMLScalar(DoubleMLBase, ABC): Number of repetitions for sample splitting (set via draw_sample_splitting). score : str The score function being used. + """ # Subclasses define all possible learners for the model @@ -81,6 +80,7 @@ def __init__( ------ ValueError If obj_dml_data contains more than one treatment column. 
+ """ # Validate single treatment column if len(obj_dml_data.d_cols) != 1: @@ -137,6 +137,7 @@ def n_folds(self) -> int: ------ ValueError If sample splitting has not been performed yet. + """ if self._n_folds is None: raise ValueError("n_folds not set. Call draw_sample_splitting() first.") @@ -156,6 +157,7 @@ def n_rep(self) -> int: ------ ValueError If sample splitting has not been performed yet. + """ if self._n_rep is None: raise ValueError("n_rep not set. Call draw_sample_splitting() first.") @@ -170,6 +172,7 @@ def score(self) -> str: ------- str Score function name. + """ return self._score @@ -187,6 +190,7 @@ def predictions(self) -> dict[str, np.ndarray]: ------ ValueError If the model has not been fitted yet. + """ if self._predictions is None: raise ValueError("Predictions not available. Call fit() first.") @@ -208,6 +212,7 @@ def nuisance_targets(self) -> dict[str, np.ndarray]: ------ ValueError If the model has not been fitted yet. + """ if self._nuisance_targets is None: raise ValueError("Nuisance targets not available. Call fit() or fit_nuisance_models() first.") @@ -232,6 +237,7 @@ def nuisance_loss(self) -> dict[str, np.ndarray]: ------ ValueError If the model has not been fitted yet. + """ if self._nuisance_loss is None: raise ValueError("Nuisance loss not available. Call fit() or fit_nuisance_models() first.") @@ -250,6 +256,7 @@ def sensitivity_elements(self) -> dict[str, np.ndarray] | None: dict[str, np.ndarray] or None Dictionary with keys ``'sigma2'``, ``'nu2'`` (shape ``(1, 1, n_rep)``), ``'psi_sigma2'``, ``'psi_nu2'``, ``'riesz_rep'`` (shape ``(n_obs, 1, n_rep)``). + """ return self._sensitivity_elements @@ -262,6 +269,7 @@ def smpls(self) -> list: ------- list List of sample splitting indices for each repetition. + """ if self._smpls is None: raise ValueError("Sample splitting has not been performed. 
Call draw_sample_splitting() first.") @@ -281,6 +289,7 @@ def smpls_cluster(self) -> list | None: ------ ValueError If cluster data is used but cluster splitting is not available. + """ if self._dml_data.is_cluster_data and self._smpls_cluster is None: raise ValueError("Cluster sample splitting has not been provided. Call set_sample_splitting() first.") @@ -304,6 +313,7 @@ def required_learners(self) -> list[str]: ------- list of str Ordered list of required learner names. + """ pass @@ -316,6 +326,7 @@ def learners(self) -> dict[str, object]: ------- dict Dictionary mapping learner names to estimator instances. + """ return {name: info.learner for name, info in self._learners.items()} @@ -337,6 +348,7 @@ def get_params(self, learner_name: str) -> dict: ------ ValueError If the learner is not registered. + """ if learner_name not in self._learners: raise ValueError(f"Learner '{learner_name}' not registered.") @@ -362,6 +374,7 @@ def set_params(self, learner_name: str, **params: object) -> Self: ------ ValueError If the learner is not registered. + """ if learner_name not in self._learners: raise ValueError(f"Learner '{learner_name}' not registered.") @@ -383,6 +396,7 @@ def _register_learner(self, name: str, learner: object) -> None: ------ ValueError If the learner name is not defined in _LEARNER_SPECS. + """ if name not in self._LEARNER_SPECS: raise ValueError(f"Learner '{name}' not defined for this model.") @@ -408,6 +422,7 @@ def set_learners(self) -> Self: ------- self : Self The estimator with learners set. + """ pass @@ -456,6 +471,7 @@ def fit( ------- self : Self The fitted estimator. + """ if self._smpls is None: self.draw_sample_splitting( @@ -511,6 +527,7 @@ def fit_nuisance_models( ------ ValueError If sample splitting has not been initialized. + """ if self._smpls is None: raise ValueError("Sample splitting has not been initialized. 
Call draw_sample_splitting() first.") @@ -580,6 +597,7 @@ def estimate_causal_parameters(self) -> Self: ------ ValueError If nuisance models have not been fitted yet. + """ if self._predictions is None: raise ValueError("Predictions not available. Call fit_nuisance_models() first.") @@ -625,6 +643,7 @@ def draw_sample_splitting(self, n_folds: int = 5, n_rep: int = 1) -> Self: ------ ValueError If n_folds or n_rep have invalid values. + """ if not isinstance(n_folds, int) or n_folds < 2: raise ValueError(f"n_folds must be an integer >= 2. Got {n_folds}.") @@ -689,6 +708,7 @@ def set_sample_splitting(self, all_smpls: list, all_smpls_cluster: list | None = If ``all_smpls`` is not a list or if tuple shorthand is used. ValueError If the partition is invalid or cluster splitting is missing. + """ if isinstance(all_smpls, tuple): raise TypeError("all_smpls must be a list of folds; tuple shorthand is not supported for DoubleMLScalar.") @@ -750,6 +770,7 @@ def _initialize_predictions_dict(self) -> dict[str, np.ndarray]: ------- dict Dictionary mapping learner names to NaN-filled arrays. + """ n_obs = self._n_obs n_rep = self.n_rep @@ -770,6 +791,7 @@ def _check_external_predictions(self, external_predictions: dict[str, np.ndarray If a value is not a numpy array. ValueError If a value does not match shape (n_obs, n_rep). + """ n_obs = self._n_obs n_rep = self.n_rep @@ -798,6 +820,7 @@ def _check_learners_available(self, external_predictions: dict[str, np.ndarray] ------ ValueError If a required learner is missing and not covered by external predictions. + """ ext_keys = set(external_predictions.keys()) if external_predictions is not None else set() @@ -816,6 +839,7 @@ def _construct_framework(self) -> DoubleMLFramework: ------- DoubleMLFramework The framework object with estimation results. 
+ """ # Standardize the score function: psi / E[psi_deriv] # Both already in framework shape: (n_obs, n_thetas, n_rep) @@ -934,6 +958,7 @@ def evaluate_learners( >>> model.evaluate_learners() >>> model.evaluate_learners(metric=r2_score) >>> model.evaluate_learners(learners=["ml_m"], metric=log_loss) + """ if self._nuisance_targets is None: raise ValueError("Nuisance targets not available. Call fit() or fit_nuisance_models() first.") @@ -1007,6 +1032,7 @@ def _sensitivity_element_est(self) -> dict[str, np.ndarray] | None: Dictionary with keys ``'sigma2'``, ``'nu2'`` (shape ``(1, 1, n_rep)``), ``'psi_sigma2'``, ``'psi_nu2'``, ``'riesz_rep'`` (shape ``(n_obs, 1, n_rep)``). Return ``None`` (default) if sensitivity analysis is not implemented. + """ return None @@ -1042,6 +1068,7 @@ def _get_nuisance_targets(self) -> dict[str, np.ndarray | None]: dict[str, np.ndarray or None] Dictionary mapping learner names to target arrays of shape ``(n_obs, n_rep)``, or ``None`` where targets are not available. + """ pass @@ -1078,6 +1105,7 @@ def _nuisance_est( If provided, a dictionary of external predictions. Learners whose names appear as keys should not be fitted; their predictions are already pre-filled in self._predictions. + """ pass @@ -1104,6 +1132,7 @@ def _get_score_elements(self) -> dict[str, np.ndarray]: psi_a = (D - m_hat) ** 2 # shape: (n_obs, n_rep) psi_b = (D - m_hat) * (Y - l_hat) # shape: (n_obs, n_rep) return {'psi_a': psi_a, 'psi_b': psi_b} + """ pass @@ -1133,6 +1162,7 @@ def _est_causal_pars_and_se(self, psi_elements: dict[str, np.ndarray]) -> None: - self._psi should have shape (n_obs, n_thetas, n_rep) - self._psi_deriv should have shape (n_obs, n_thetas, n_rep) - self._var_scaling_factors should have shape (n_thetas,) + """ pass @@ -1188,6 +1218,7 @@ def tune_ml_models( tune_res : dict Dict of :class:`~doubleml.utils._tune_optuna.DMLOptunaResult` objects keyed by learner name. Returned when ``return_tune_res=True``. 
+ """ if not isinstance(set_as_params, bool): raise TypeError(f"set_as_params must be True or False. Got {str(set_as_params)}.") @@ -1267,6 +1298,7 @@ def _expand_tuning_param_space(self, ml_param_space: dict[str, Callable | None]) alias nor a defined learner name. TypeError If a parameter space value is not callable. + """ if not isinstance(ml_param_space, dict): raise TypeError(f"ml_param_space must be a dict. Got {type(ml_param_space).__name__}.") @@ -1317,6 +1349,7 @@ def _validate_optuna_setting_keys(self, optuna_settings: dict | None) -> None: value is not a dict. ValueError If a key is not a global Optuna setting and not a valid learner name or alias. + """ if optuna_settings is not None and not isinstance(optuna_settings, dict): raise TypeError(f"optuna_settings must be a dict or None. Got {str(type(optuna_settings))}.") @@ -1375,6 +1408,7 @@ def _get_tuning_data( ------ NotImplementedError Always; subclasses must override this method. + """ raise NotImplementedError( f"_get_tuning_data not implemented for {self.__class__.__name__}. " "Subclasses must override this method." @@ -1388,6 +1422,7 @@ def __str__(self) -> str: ------- str A formatted string summary of the model. + """ class_name = self.__class__.__name__ header = f"{'=' * 20} {class_name} Object {'=' * 20}" diff --git a/doubleml/double_ml_vector.py b/doubleml/double_ml_vector.py index e2a7c7d7..d4cbe531 100644 --- a/doubleml/double_ml_vector.py +++ b/doubleml/double_ml_vector.py @@ -62,6 +62,7 @@ class DoubleMLVector(DoubleMLBase, ABC): The score function being used. modellist : list of DoubleMLScalar The scalar sub-models, one per treatment column (or model key). + """ def __init__( @@ -69,6 +70,7 @@ def __init__( obj_dml_data: DoubleMLData, score: str = "default", ) -> None: + """Initialize DoubleMLVector. 
See class docstring for parameter details.""" super().__init__(obj_dml_data) self._dml_data: DoubleMLData = obj_dml_data # narrow for attribute access self._score = score @@ -99,6 +101,7 @@ def n_rep(self) -> int: ------ ValueError If sample splitting has not been drawn yet. + """ if self._n_rep is None: raise ValueError("n_rep not set. Call draw_sample_splitting() first.") @@ -118,6 +121,7 @@ def n_folds(self) -> int: ------ ValueError If sample splitting has not been drawn yet. + """ if self._n_folds is None: raise ValueError("n_folds not set. Call draw_sample_splitting() first.") @@ -132,6 +136,7 @@ def score(self) -> str: ------- str Score function name. + """ return self._score @@ -149,6 +154,7 @@ def smpls(self) -> list: ------ ValueError If sample splitting has not been drawn yet. + """ if self._smpls is None: raise ValueError("Sample splitting has not been performed. Call draw_sample_splitting() first.") @@ -163,6 +169,7 @@ def modellist(self) -> list[DoubleMLScalar] | None: ------- list of DoubleMLScalar or None ``None`` before :meth:`_initialize_models` has been called by the subclass. 
+ """ return self._modellist @@ -174,6 +181,7 @@ def n_rep_boot(self) -> int | None: Returns ------- int or None + """ return None if self._framework is None else self._framework.n_rep_boot @@ -185,6 +193,7 @@ def boot_method(self) -> str | None: Returns ------- str or None + """ return None if self._framework is None else self._framework.boot_method @@ -196,6 +205,7 @@ def boot_t_stat(self) -> np.ndarray | None: Returns ------- np.ndarray or None + """ return None if self._framework is None else self._framework.boot_t_stat @@ -207,6 +217,7 @@ def sensitivity_elements(self) -> dict[str, np.ndarray] | None: Returns ------- dict or None + """ return None if self._framework is None else self._framework.sensitivity_elements @@ -219,6 +230,7 @@ def sensitivity_params(self) -> dict | None: Returns ------- dict or None + """ return None if self._framework is None else self._framework.sensitivity_params @@ -235,6 +247,7 @@ def sensitivity_summary(self) -> str: ------ ValueError If :meth:`fit` has not been called yet. + """ if self._framework is None: raise ValueError("Apply fit() before accessing sensitivity_summary.") @@ -252,6 +265,7 @@ def required_learners(self) -> list[str]: ------- list of str Ordered list of required learner names. + """ @abstractmethod @@ -271,6 +285,7 @@ def set_learners(self, **kwargs: object) -> Self: Returns ------- self : Self + """ @abstractmethod @@ -287,6 +302,7 @@ def _initialize_models(self) -> list[DoubleMLScalar]: ------- list of DoubleMLScalar One configured scalar model per element of ``self._dml_data.d_cols``. + """ # ==================== Protected Helpers ==================== @@ -314,6 +330,7 @@ class would override this to return ``self._dml_data`` unchanged (each APO DoubleMLData A :class:`~doubleml.data.DoubleMLData` with ``d_cols=[d_col]`` and all other treatment columns added to ``x_cols``. 
+ """ other_d_cols = [c for c in self._dml_data.d_cols if c != d_col] x_cols = list(self._dml_data.x_cols) + other_d_cols @@ -385,6 +402,7 @@ def draw_sample_splitting(self, n_folds: int = 5, n_rep: int = 1) -> Self: ------ ValueError If ``n_folds < 2`` or ``n_rep < 1``. + """ if not isinstance(n_folds, int) or n_folds < 2: raise ValueError(f"n_folds must be an integer >= 2. Got {n_folds}.") @@ -427,6 +445,7 @@ def set_sample_splitting(self, all_smpls: list, all_smpls_cluster: list | None = If ``all_smpls`` is not a list. ValueError If the partition is invalid. + """ if isinstance(all_smpls, tuple): raise TypeError("all_smpls must be a list of folds; tuple shorthand is not supported for DoubleMLVector.") @@ -491,6 +510,7 @@ def fit( Returns ------- self : Self + """ if self._smpls is None: self.draw_sample_splitting(n_folds=n_folds, n_rep=n_rep) @@ -529,6 +549,7 @@ def get_params(self, learner_name: str) -> list[dict]: ------- list of dict One parameter dict per sub-model, in ``d_cols`` order. + """ if self._modellist is None: raise ValueError("Sub-models are not initialized. Call _initialize_models() in the subclass __init__.") @@ -548,6 +569,7 @@ def set_params(self, learner_name: str, **params: object) -> Self: Returns ------- self : Self + """ if self._modellist is None: raise ValueError("Sub-models are not initialized. Call _initialize_models() in the subclass __init__.") @@ -644,6 +666,7 @@ def sensitivity_plot( ------ ValueError If :meth:`fit` has not been called yet. + """ if self._framework is None: raise ValueError("Apply fit() before sensitivity_plot().") @@ -689,6 +712,7 @@ def sensitivity_benchmark(self, benchmarking_set: list[str], fit_args: dict | No If ``benchmarking_set`` or ``fit_args`` have the wrong type. ValueError If ``benchmarking_set`` is empty or not a subset of ``x_cols``. 
+ """ if self._framework is None: raise ValueError("Apply fit() before sensitivity_benchmark().") @@ -738,6 +762,7 @@ def __str__(self) -> str: ------- str A formatted string summary of the model. + """ class_name = self.__class__.__name__ header = f"{'=' * 20} {class_name} Object {'=' * 20}" diff --git a/doubleml/irm/irm_scalar.py b/doubleml/irm/irm_scalar.py index 1ceac4af..3cac41b7 100644 --- a/doubleml/irm/irm_scalar.py +++ b/doubleml/irm/irm_scalar.py @@ -1,6 +1,4 @@ -""" -Interactive Regression Model (IRM) based on the new DoubleMLScalar hierarchy. -""" +"""Interactive Regression Model (IRM) based on the new DoubleMLScalar hierarchy.""" from __future__ import annotations @@ -23,7 +21,8 @@ class IRM(LinearScoreMixin): - """Double machine learning for interactive regression models. + """ + Double machine learning for interactive regression models. Based on the DoubleMLScalar + LinearScoreMixin hierarchy. @@ -78,6 +77,7 @@ class IRM(LinearScoreMixin): .. math:: \\theta_0 = \\mathbb{E}[g_0(1, X) - g_0(0, X) | D=1]. + """ # Define learner specifications for IRM @@ -122,6 +122,7 @@ def __init__( Weights for weighted ATE. ps_processor_config : PSProcessorConfig, optional Configuration for propensity score processing. + """ # Validate data self._check_data(obj_dml_data) @@ -215,6 +216,7 @@ def set_learners( ------- self : IRM The estimator with learners set. + """ # ml_g convenience: clone to ml_g0/ml_g1 if not explicitly set if ml_g is not None: @@ -310,7 +312,8 @@ def _nuisance_est( # ==================== Score Elements ==================== def _get_nuisance_targets(self) -> dict[str, np.ndarray | None]: - """Return target arrays for nuisance loss evaluation. + """ + Return target arrays for nuisance loss evaluation. ml_g0 and ml_g1 are fitted only on the d==0 and d==1 subgroups respectively, so targets for the opposite group are NaN. ml_m target is d (binary treatment). 
@@ -383,6 +386,7 @@ def cate(self, basis: pd.DataFrame, is_gate: bool = False, **kwargs: Any) -> Dou ------- model : :class:`doubleml.DoubleMLBLP` Best linear predictor model. + """ if self.score != "ATE": raise ValueError(f"Invalid score '{self.score}'. CATE is only implemented for score='ATE'.") @@ -413,6 +417,7 @@ def gate(self, groups: pd.DataFrame, **kwargs: Any) -> DoubleMLBLP: ------- model : :class:`doubleml.DoubleMLBLP` Best linear predictor model for group effects. + """ if not isinstance(groups, pd.DataFrame): raise TypeError(f"Groups must be of DataFrame type. Groups of type {str(type(groups))} was passed.") @@ -485,6 +490,7 @@ def _get_tuning_data( ------ ValueError If ``learner_name`` is not a valid IRM learner name. + """ y = self._dml_data.y d = self._dml_data.d @@ -527,6 +533,7 @@ def _get_weights(self, m_hat: np.ndarray) -> tuple[np.ndarray, np.ndarray]: Shape (n_obs, n_rep) or broadcastable. weights_bar : np.ndarray Shape (n_obs, n_rep) or broadcastable. + """ d = self._dml_data.d @@ -572,6 +579,7 @@ def _sensitivity_element_est(self) -> dict[str, np.ndarray] | None: dict[str, np.ndarray] or None Dictionary with keys ``'sigma2'``, ``'nu2'`` (shape ``(1, 1, n_rep)``), ``'psi_sigma2'``, ``'psi_nu2'``, ``'riesz_rep'`` (shape ``(n_obs, 1, n_rep)``). + """ y = self._dml_data.y # (n_obs,) d = self._dml_data.d # (n_obs,) diff --git a/doubleml/plm/plr.py b/doubleml/plm/plr.py index 825ec845..13182745 100644 --- a/doubleml/plm/plr.py +++ b/doubleml/plm/plr.py @@ -16,7 +16,8 @@ class DoubleMLPLR(LinearScoreMixin, DoubleML): - """Double machine learning for partially linear regression models + """ + Double machine learning for partially linear regression models Parameters ---------- @@ -87,6 +88,7 @@ class DoubleMLPLR(LinearScoreMixin, DoubleML): where :math:`Y` is the outcome variable and :math:`D` is the policy variable of interest. 
The high-dimensional vector :math:`X = (X_1, \\ldots, X_p)` consists of other confounding covariates, and :math:`\\zeta` and :math:`V` are stochastic errors. + """ def __init__( @@ -465,6 +467,7 @@ def cate(self, basis, is_gate=False, **kwargs): ------- model : :class:`doubleML.DoubleMLBLP` Best linear Predictor model. + """ if self._dml_data.n_treat > 1: raise NotImplementedError( @@ -500,6 +503,7 @@ def gate(self, groups, **kwargs): ------- model : :class:`doubleML.DoubleMLBLP` Best linear Predictor model for Group Effects. + """ if not isinstance(groups, pd.DataFrame): @@ -530,6 +534,7 @@ def _partial_out(self): The residual of the regression of Y on X. D_tilde : :class:`numpy.ndarray` The residual of the regression of D on X. + """ if self.predictions is None: raise ValueError("predictions are None. Call .fit(store_predictions=True) to store the predictions.") diff --git a/doubleml/plm/plr_scalar.py b/doubleml/plm/plr_scalar.py index 77980881..7fbce66e 100644 --- a/doubleml/plm/plr_scalar.py +++ b/doubleml/plm/plr_scalar.py @@ -1,6 +1,4 @@ -""" -Partially Linear Regression (PLR) model based on the new DoubleMLScalar hierarchy. -""" +"""Partially Linear Regression (PLR) model based on the new DoubleMLScalar hierarchy.""" from __future__ import annotations @@ -21,7 +19,8 @@ class PLR(LinearScoreMixin): - """Double machine learning for partially linear regression models. + """ + Double machine learning for partially linear regression models. Based on the DoubleMLScalar + LinearScoreMixin hierarchy. @@ -38,6 +37,7 @@ class PLR(LinearScoreMixin): Learner for E[D|X]. Can be regressor or classifier. ml_g : estimator, optional Learner for E[Y - D*theta|X]. Only for IV-type. Must be regressor. + """ # Define learner specifications for PLR @@ -70,6 +70,7 @@ def __init__( Learner for E[D|X]. Can be regressor or classifier. ml_g : estimator, optional Learner for E[Y - D*theta|X]. Only for IV-type. Must be regressor. 
+ """ # Validate data self._check_data(obj_dml_data) @@ -124,6 +125,7 @@ def set_learners( ------- self : PLR The estimator with learners set. + """ for name, learner in [("ml_l", ml_l), ("ml_m", ml_m), ("ml_g", ml_g)]: if learner is None: @@ -314,6 +316,7 @@ def _get_tuning_data( ------ ValueError If ``learner_name`` is not a valid PLR learner name. + """ y = self._dml_data.y d = self._dml_data.d @@ -360,7 +363,8 @@ def _get_tuning_data( raise ValueError(f"Unknown learner '{learner_name}' for PLR.") def _get_nuisance_targets(self) -> dict[str, np.ndarray | None]: - """Return target arrays for nuisance loss evaluation. + """ + Return target arrays for nuisance loss evaluation. Returns y for ml_l, d for ml_m. For IV-type score, ml_g target is None because the adjusted outcome y - θ·d depends on the estimated parameter and varies per @@ -412,6 +416,7 @@ def _partial_out(self) -> tuple[np.ndarray, np.ndarray]: ------- Y_tilde, D_tilde : tuple[np.ndarray, np.ndarray] Outcome and treatment residuals, each of shape ``(n_obs, n_rep)``. + """ if self._predictions is None: raise ValueError("predictions are None. Call fit() first.") @@ -451,6 +456,7 @@ def cate(self, basis: pd.DataFrame, is_gate: bool = False, **kwargs: Any) -> Dou ------- model : :class:`doubleml.DoubleMLBLP` Best linear predictor model. + """ if self._dml_data.n_treat > 1: raise NotImplementedError( @@ -483,6 +489,7 @@ def gate(self, groups: pd.DataFrame, **kwargs: Any) -> DoubleMLBLP: ------- model : :class:`doubleml.DoubleMLBLP` Best linear predictor model for group effects. + """ if not isinstance(groups, pd.DataFrame): raise TypeError(f"Groups must be of DataFrame type. Groups of type {str(type(groups))} was passed.") @@ -515,6 +522,7 @@ def _sensitivity_element_est(self) -> dict[str, np.ndarray] | None: Dictionary with keys ``'sigma2'``, ``'nu2'`` (shape ``(1, 1, n_rep)``), ``'psi_sigma2'``, ``'psi_nu2'``, ``'riesz_rep'`` (shape ``(n_obs, 1, n_rep)``). 
Returns ``None`` for callable scores (no standard Riesz representer). + """ if callable(self.score): return None diff --git a/doubleml/plm/plr_vector.py b/doubleml/plm/plr_vector.py index 87b55764..c32ee4f0 100644 --- a/doubleml/plm/plr_vector.py +++ b/doubleml/plm/plr_vector.py @@ -13,7 +13,8 @@ class PLRVector(DoubleMLVector): - """Multi-treatment double machine learning for partially linear regression models. + """ + Multi-treatment double machine learning for partially linear regression models. Orchestrates one :class:`~doubleml.plm.plr_scalar.PLR` instance per treatment column in ``d_cols``. Sample splits are drawn once and shared across all sub-models; @@ -35,6 +36,7 @@ class PLRVector(DoubleMLVector): Learner for E[D|X]. Can be regressor or classifier. ml_g : estimator, optional Learner for E[Y - D*theta|X]. Only for IV-type. Must be regressor. + """ def __init__( @@ -45,6 +47,7 @@ def __init__( ml_m: object | None = None, ml_g: object | None = None, ) -> None: + """Initialize PLRVector. See class docstring for parameter details.""" # Validate at the vector level so the error fires before sub-model construction. self._check_data(obj_dml_data) valid_scores = ["partialling out", "IV-type"] @@ -61,7 +64,8 @@ def __init__( @staticmethod def _check_data(obj_dml_data: Any) -> None: - """Validate the data object for PLR vector estimation. + """ + Validate the data object for PLR vector estimation. Parameters ---------- @@ -75,6 +79,7 @@ def _check_data(obj_dml_data: Any) -> None: If ``obj_dml_data`` is not a :class:`~doubleml.data.DoubleMLData`. ValueError If ``obj_dml_data`` defines instrumental variables (``z_cols``). + """ if not isinstance(obj_dml_data, DoubleMLData): raise TypeError( @@ -100,7 +105,8 @@ def set_learners( ml_m: object | None = None, ml_g: object | None = None, ) -> Self: - """Set the learners for nuisance estimation on every sub-model. + """ + Set the learners for nuisance estimation on every sub-model. 
Parameters ---------- @@ -114,6 +120,7 @@ def set_learners( Returns ------- self : PLRVector + """ if self._modellist is None: raise RuntimeError("Sub-models are not initialized. _initialize_models() must run in __init__.") diff --git a/doubleml/utils/_checks.py b/doubleml/utils/_checks.py index 2857823a..eaddbc45 100644 --- a/doubleml/utils/_checks.py +++ b/doubleml/utils/_checks.py @@ -558,6 +558,7 @@ def _check_learner(learner, learner_name, regressor=True, classifier=True): TypeError If the learner is a class instead of an instance, or lacks required methods (fit, set_params, get_params, predict/predict_proba). + """ err_msg_prefix = f"Invalid learner provided for {learner_name}: " warn_msg_prefix = f"Learner provided for {learner_name} is probably invalid: " diff --git a/doubleml/utils/_learner.py b/doubleml/utils/_learner.py index 83c1537f..642e02f1 100644 --- a/doubleml/utils/_learner.py +++ b/doubleml/utils/_learner.py @@ -1,6 +1,4 @@ -""" -Learner specification and validation utilities for DoubleML. -""" +"""Learner specification and validation utilities for DoubleML.""" from __future__ import annotations @@ -29,6 +27,7 @@ class LearnerSpec: If specified, warns when using regressor with binary data. "outcome" checks binary_outcome, "treatment" checks binary_treatment. Default is ``None``. + """ name: str @@ -52,6 +51,7 @@ class LearnerInfo: The learner object (already cloned). is_classifier : bool Whether the learner is a classifier. + """ learner: Any @@ -155,6 +155,7 @@ def validate_learner( ValueError If the learner type is not allowed by the specification. If a classifier is used with non-binary data when required. + """ err_msg_prefix = f"Invalid learner provided for {spec.name}: " warn_msg_prefix = f"Learner provided for {spec.name} is probably invalid: " @@ -198,6 +199,7 @@ def predict_nuisance(learner: Any, X: np.ndarray, is_classifier: bool) -> np.nda ------- np.ndarray Predictions. For classifiers, returns probability of class 1. 
+ """ if is_classifier: return learner.predict_proba(X)[:, 1] diff --git a/doubleml/utils/_tune_optuna.py b/doubleml/utils/_tune_optuna.py index f3e2a821..25eff3ba 100644 --- a/doubleml/utils/_tune_optuna.py +++ b/doubleml/utils/_tune_optuna.py @@ -15,6 +15,7 @@ >>> import logging >>> logging.basicConfig(level=logging.INFO) >>> # Now you'll see tuning progress and information + """ import logging @@ -85,6 +86,7 @@ class DMLOptunaResult: tuned : bool Indicates whether tuning was performed (True) or skipped (False). + """ learner_name: str @@ -296,7 +298,8 @@ def _default_optuna_settings(): def _resolve_optuna_scoring(scoring_method, learner, params_name): - """Resolve the scoring argument for an Optuna-tuned learner. + """ + Resolve the scoring argument for an Optuna-tuned learner. Parameters ---------- @@ -315,6 +318,7 @@ def _resolve_optuna_scoring(scoring_method, learner, params_name): :func:`sklearn.model_selection.cross_val_score` (``None`` means use the estimator's default ``score``) and a human-readable message describing the decision for logging purposes. + """ if scoring_method is not None: @@ -380,7 +384,8 @@ def _check_tuning_inputs( cv, params_name, ): - """Validate Optuna tuning inputs and normalize the cross-validation splitter. + """ + Validate Optuna tuning inputs and normalize the cross-validation splitter. Parameters ---------- @@ -404,6 +409,7 @@ def _check_tuning_inputs( cross-validator or list Cross-validation splitter or pre-made list of ``(train, test)`` index pairs as returned by :func:`resolve_optuna_cv`. + """ if y.shape[0] != x.shape[0]: @@ -444,6 +450,7 @@ def _get_optuna_settings(optuna_settings, params_name): ------- dict Resolved settings dictionary. + """ default_settings = _default_optuna_settings() @@ -497,6 +504,7 @@ def _create_study(settings, learner_name): ------- optuna.study.Study The Optuna study object ready for optimization. 
+ """ # Check if a study instance is provided directly @@ -548,6 +556,7 @@ def _create_objective(param_grid_func, learner, x, y, cv, scoring_method): ------- callable Objective function for Optuna optimization. + """ # Build scorer once; scoring_method is already resolved (non-None) by _resolve_optuna_scoring scorer = check_scoring(clone(learner), scoring=scoring_method) @@ -627,6 +636,7 @@ def _dml_tune_optuna( ------- DMLOptunaResult A tuning result containing the optuna.Study object and further information. + """ scoring_method, scoring_message = _resolve_optuna_scoring(scoring_method, learner, params_name) diff --git a/doubleml/utils/blp.py b/doubleml/utils/blp.py index c5e59d7e..406e2a7f 100644 --- a/doubleml/utils/blp.py +++ b/doubleml/utils/blp.py @@ -10,7 +10,8 @@ class DoubleMLBLP: - """Best linear predictor (BLP) for DoubleML with orthogonal signals. + """ + Best linear predictor (BLP) for DoubleML with orthogonal signals. Manily used for CATE and GATE estimation for IRM models. Parameters @@ -29,6 +30,7 @@ class DoubleMLBLP: is_gate : bool Indicates whether the basis is constructed for GATEs (dummy-basis). Default is ``False``. + """ def __init__(self, orth_signal, basis, is_gate=False): @@ -60,7 +62,8 @@ def __init__(self, orth_signal, basis, is_gate=False): @staticmethod def _validate_basis(basis, n_obs, n_rep): - """Validate ``basis`` and return a list of length ``n_rep``. + """ + Validate ``basis`` and return a list of length ``n_rep``. ``basis`` may be a single ``pd.DataFrame`` (shared across reps) or a list of ``pd.DataFrame`` of length ``n_rep``. Per-rep DataFrames must share column names @@ -213,6 +216,7 @@ def fit(self, cov_type="HC0", **kwargs): Returns ------- self : object + """ # fit the best-linear-predictor of the orthogonal signal with respect to the grid @@ -261,6 +265,7 @@ def confint(self, basis=None, joint=False, level=0.95, n_rep_boot=500): ------- df_ci : pd.DataFrame A data frame with the confidence interval(s). 
+ """ if not isinstance(joint, bool): raise TypeError(f"joint must be True or False. Got {str(joint)}.") From b996235f8dc495909a14edc720cc74af7d11d6ba Mon Sep 17 00:00:00 2001 From: SvenKlaassen Date: Sat, 9 May 2026 17:39:02 +0200 Subject: [PATCH 32/38] refactor: simplify set_learners method signature by removing kwargs --- doubleml/double_ml_vector.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/doubleml/double_ml_vector.py b/doubleml/double_ml_vector.py index d4cbe531..b6535d7a 100644 --- a/doubleml/double_ml_vector.py +++ b/doubleml/double_ml_vector.py @@ -269,7 +269,7 @@ def required_learners(self) -> list[str]: """ @abstractmethod - def set_learners(self, **kwargs: object) -> Self: + def set_learners(self) -> Self: """ Set the learners for nuisance estimation on all sub-models. @@ -277,11 +277,6 @@ def set_learners(self, **kwargs: object) -> Self: matching their model's learners (e.g., ``ml_l``, ``ml_m`` for PLR). The same learners (cloned per sub-model) are applied to every treatment. - Parameters - ---------- - **kwargs - Learner keyword arguments specific to the subclass. - Returns ------- self : Self From 39a010131a29d4a665b9c559315380d732ba9ba5 Mon Sep 17 00:00:00 2001 From: SvenKlaassen Date: Sat, 9 May 2026 17:45:52 +0200 Subject: [PATCH 33/38] refactor: remove redundant pass statements in abstract methods and streamline sample comparison logic in tests --- doubleml/double_ml_base.py | 2 -- doubleml/double_ml_scalar.py | 1 - doubleml/irm/tests/test_irm_scalar_exceptions.py | 4 ---- doubleml/plm/tests/test_plr_scalar_exceptions.py | 2 -- doubleml/tests/test_scalar_set_sample_splitting.py | 10 +++++----- 5 files changed, 5 insertions(+), 14 deletions(-) diff --git a/doubleml/double_ml_base.py b/doubleml/double_ml_base.py index 05e80061..7892c8d4 100644 --- a/doubleml/double_ml_base.py +++ b/doubleml/double_ml_base.py @@ -211,7 +211,6 @@ def n_rep(self) -> int: Number of repetitions. 
""" - pass @property def n_obs(self) -> int: @@ -351,7 +350,6 @@ def fit(self, **kwargs) -> Self: The fitted DoubleML estimator. """ - pass def __str__(self) -> str: """ diff --git a/doubleml/double_ml_scalar.py b/doubleml/double_ml_scalar.py index a00d415e..22bcb694 100644 --- a/doubleml/double_ml_scalar.py +++ b/doubleml/double_ml_scalar.py @@ -315,7 +315,6 @@ def required_learners(self) -> list[str]: Ordered list of required learner names. """ - pass @property def learners(self) -> dict[str, object]: diff --git a/doubleml/irm/tests/test_irm_scalar_exceptions.py b/doubleml/irm/tests/test_irm_scalar_exceptions.py index a1db7598..4cca6d2d 100644 --- a/doubleml/irm/tests/test_irm_scalar_exceptions.py +++ b/doubleml/irm/tests/test_irm_scalar_exceptions.py @@ -51,8 +51,6 @@ def test_irm_scalar_exception_instrument(): df = plr_data.data.copy() x_cols = [c for c in df.columns if c.startswith("X")] - import doubleml as dml - dml_data_iv = dml.DoubleMLData(df, y_col="y", d_cols="d", x_cols=x_cols[:-1], z_cols=x_cols[-1]) msg = r"Incompatible data\. .* have been set as instrumental variable\(s\)\." 
@@ -349,7 +347,5 @@ def test_exception_sensitivity_level(fitted_irm_for_sensitivity): @pytest.mark.ci def test_exception_sensitivity_null_hypothesis(fitted_irm_for_sensitivity): """null_hypothesis with wrong shape raises ValueError.""" - import numpy as np - with pytest.raises(ValueError, match=r"null_hypothesis"): fitted_irm_for_sensitivity.sensitivity_analysis(null_hypothesis=np.array([0.0, 0.0])) diff --git a/doubleml/plm/tests/test_plr_scalar_exceptions.py b/doubleml/plm/tests/test_plr_scalar_exceptions.py index 7d2a57b9..4bc205f1 100644 --- a/doubleml/plm/tests/test_plr_scalar_exceptions.py +++ b/doubleml/plm/tests/test_plr_scalar_exceptions.py @@ -198,7 +198,5 @@ def test_exception_sensitivity_level(fitted_plr_for_sensitivity): @pytest.mark.ci def test_exception_sensitivity_null_hypothesis(fitted_plr_for_sensitivity): """null_hypothesis with wrong shape raises ValueError.""" - import numpy as np - with pytest.raises(ValueError, match=r"null_hypothesis"): fitted_plr_for_sensitivity.sensitivity_analysis(null_hypothesis=np.array([0.0, 0.0])) diff --git a/doubleml/tests/test_scalar_set_sample_splitting.py b/doubleml/tests/test_scalar_set_sample_splitting.py index bc9abd84..a049a984 100644 --- a/doubleml/tests/test_scalar_set_sample_splitting.py +++ b/doubleml/tests/test_scalar_set_sample_splitting.py @@ -9,11 +9,11 @@ def _assert_smpls_equal(smpls0, smpls1): assert len(smpls0) == len(smpls1) - for i_rep in range(len(smpls0)): - assert len(smpls0[i_rep]) == len(smpls1[i_rep]) - for i_fold in range(len(smpls0[i_rep])): - assert np.array_equal(smpls0[i_rep][i_fold][0], smpls1[i_rep][i_fold][0]) - assert np.array_equal(smpls0[i_rep][i_fold][1], smpls1[i_rep][i_fold][1]) + for rep0, rep1 in zip(smpls0, smpls1): + assert len(rep0) == len(rep1) + for fold0, fold1 in zip(rep0, rep1): + assert np.array_equal(fold0[0], fold1[0]) + assert np.array_equal(fold0[1], fold1[1]) @pytest.mark.ci From 9913dbf917c7504b0607e2fa9763b4f087dac2ea Mon Sep 17 00:00:00 2001 From: 
SvenKlaassen Date: Sat, 9 May 2026 20:09:04 +0200 Subject: [PATCH 34/38] refactor: remove redundant pass statement in DoubleMLScalar class --- doubleml/double_ml_scalar.py | 1 - 1 file changed, 1 deletion(-) diff --git a/doubleml/double_ml_scalar.py b/doubleml/double_ml_scalar.py index 22bcb694..5ab55bdd 100644 --- a/doubleml/double_ml_scalar.py +++ b/doubleml/double_ml_scalar.py @@ -423,7 +423,6 @@ def set_learners(self) -> Self: The estimator with learners set. """ - pass # ==================== Concrete fit() Method (Template) ==================== From bd75efb181018282dba4edc6996ee6b0e20b8058 Mon Sep 17 00:00:00 2001 From: SvenKlaassen Date: Sat, 9 May 2026 20:21:40 +0200 Subject: [PATCH 35/38] refactor: remove redundant pass statements in abstract methods of DoubleMLScalar class --- doubleml/double_ml_scalar.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/doubleml/double_ml_scalar.py b/doubleml/double_ml_scalar.py index 5ab55bdd..d0b5e323 100644 --- a/doubleml/double_ml_scalar.py +++ b/doubleml/double_ml_scalar.py @@ -1068,7 +1068,6 @@ def _get_nuisance_targets(self) -> dict[str, np.ndarray | None]: or ``None`` where targets are not available. """ - pass @abstractmethod def _nuisance_est( @@ -1105,7 +1104,6 @@ def _nuisance_est( pre-filled in self._predictions. 
""" - pass @abstractmethod def _get_score_elements(self) -> dict[str, np.ndarray]: @@ -1132,7 +1130,6 @@ def _get_score_elements(self) -> dict[str, np.ndarray]: return {'psi_a': psi_a, 'psi_b': psi_b} """ - pass @abstractmethod def _est_causal_pars_and_se(self, psi_elements: dict[str, np.ndarray]) -> None: @@ -1162,7 +1159,6 @@ def _est_causal_pars_and_se(self, psi_elements: dict[str, np.ndarray]) -> None: - self._var_scaling_factors should have shape (n_thetas,) """ - pass # ==================== Hyperparameter Tuning ==================== From 93247fda1ceceacfbc6150da4d54aa796c916bdb Mon Sep 17 00:00:00 2001 From: SvenKlaassen Date: Sat, 9 May 2026 20:57:49 +0200 Subject: [PATCH 36/38] refactor: simplify docstring for set_learners method in PLRVector class --- doubleml/plm/plr_vector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doubleml/plm/plr_vector.py b/doubleml/plm/plr_vector.py index c32ee4f0..dcc534bb 100644 --- a/doubleml/plm/plr_vector.py +++ b/doubleml/plm/plr_vector.py @@ -106,7 +106,7 @@ def set_learners( ml_g: object | None = None, ) -> Self: """ - Set the learners for nuisance estimation on every sub-model. + Set learners for nuisance estimation on every sub-model. 
Parameters ---------- From d56d105bb16b4a79247bc9d0a89438b056cd4ece Mon Sep 17 00:00:00 2001 From: SvenKlaassen Date: Sat, 9 May 2026 21:00:35 +0200 Subject: [PATCH 37/38] refactor: add doctest skip directive to evaluate_learners examples in DoubleMLScalar class --- doubleml/double_ml_scalar.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doubleml/double_ml_scalar.py b/doubleml/double_ml_scalar.py index d0b5e323..265006b9 100644 --- a/doubleml/double_ml_scalar.py +++ b/doubleml/double_ml_scalar.py @@ -953,9 +953,9 @@ def evaluate_learners( Examples -------- >>> from sklearn.metrics import root_mean_squared_error, r2_score, log_loss - >>> model.evaluate_learners() - >>> model.evaluate_learners(metric=r2_score) - >>> model.evaluate_learners(learners=["ml_m"], metric=log_loss) + >>> model.evaluate_learners() # doctest: +SKIP + >>> model.evaluate_learners(metric=r2_score) # doctest: +SKIP + >>> model.evaluate_learners(learners=["ml_m"], metric=log_loss) # doctest: +SKIP """ if self._nuisance_targets is None: From 3ffa823754671b3375af0098e75a7a8d5b857d1a Mon Sep 17 00:00:00 2001 From: SvenKlaassen Date: Sat, 9 May 2026 21:58:08 +0200 Subject: [PATCH 38/38] refactor: enhance basis validation in DoubleMLPLR and PLR classes --- doubleml/plm/plr.py | 16 +++++++++++++++- doubleml/plm/plr_scalar.py | 16 +++++++++++++++- doubleml/tests/test_exceptions.py | 4 ++-- 3 files changed, 32 insertions(+), 4 deletions(-) diff --git a/doubleml/plm/plr.py b/doubleml/plm/plr.py index 13182745..cba17b7d 100644 --- a/doubleml/plm/plr.py +++ b/doubleml/plm/plr.py @@ -476,7 +476,21 @@ def cate(self, basis, is_gate=False, **kwargs): Y_tilde, D_tilde = self._partial_out() - basis_per_rep = [basis.multiply(D_tilde[:, i_rep], axis=0) for i_rep in range(self.n_rep)] + if isinstance(basis, pd.DataFrame): + basis_list = [basis] * self.n_rep + elif isinstance(basis, list): + if len(basis) != self.n_rep: + raise ValueError(f"When basis is a list it must have length 
n_rep={self.n_rep}. Got length {len(basis)}.") + if not all(isinstance(b, pd.DataFrame) for b in basis): + raise TypeError("All entries of basis list must be of DataFrame type.") + basis_list = basis + else: + raise TypeError( + f"The basis must be of DataFrame type or a list of DataFrames. " + f"Basis of type {str(type(basis))} was passed." + ) + + basis_per_rep = [basis_list[i_rep].multiply(D_tilde[:, i_rep], axis=0) for i_rep in range(self.n_rep)] model = DoubleMLBLP( orth_signal=Y_tilde, basis=basis_per_rep, diff --git a/doubleml/plm/plr_scalar.py b/doubleml/plm/plr_scalar.py index 7fbce66e..b11993fd 100644 --- a/doubleml/plm/plr_scalar.py +++ b/doubleml/plm/plr_scalar.py @@ -465,8 +465,22 @@ def cate(self, basis: pd.DataFrame, is_gate: bool = False, **kwargs: Any) -> Dou if self._predictions is None: raise ValueError("CATE requires a fitted model. Call fit() first.") + if isinstance(basis, pd.DataFrame): + basis_list = [basis] * self.n_rep + elif isinstance(basis, list): + if len(basis) != self.n_rep: + raise ValueError(f"When basis is a list it must have length n_rep={self.n_rep}. Got length {len(basis)}.") + if not all(isinstance(b, pd.DataFrame) for b in basis): + raise TypeError("All entries of basis list must be of DataFrame type.") + basis_list = basis + else: + raise TypeError( + f"The basis must be of DataFrame type or a list of DataFrames. " + f"Basis of type {str(type(basis))} was passed." 
+ ) + Y_tilde, D_tilde = self._partial_out() - basis_per_rep = [basis.multiply(D_tilde[:, i_rep], axis=0) for i_rep in range(self.n_rep)] + basis_per_rep = [basis_list[i_rep].multiply(D_tilde[:, i_rep], axis=0) for i_rep in range(self.n_rep)] model = DoubleMLBLP(orth_signal=Y_tilde, basis=basis_per_rep, is_gate=is_gate) model.fit(**kwargs) diff --git a/doubleml/tests/test_exceptions.py b/doubleml/tests/test_exceptions.py index 7d164054..4fa99240 100644 --- a/doubleml/tests/test_exceptions.py +++ b/doubleml/tests/test_exceptions.py @@ -1418,7 +1418,7 @@ def test_doubleml_exception_cate(): n_rep=2, ) dml_irm_obj.fit() - msg = "The basis must be of DataFrame type. Basis of type was passed." + msg = r"The basis must be of DataFrame type or a list of DataFrames\. Basis of type was passed\." with pytest.raises(TypeError, match=msg): dml_irm_obj.cate(basis=2) @@ -1427,7 +1427,7 @@ def test_doubleml_exception_cate(): def test_doubleml_exception_plr_cate(): dml_plr_obj = DoubleMLPLR(dml_data, ml_l=Lasso(), ml_m=Lasso(), n_folds=2, n_rep=2) dml_plr_obj.fit() - msg = "The basis must be of DataFrame type. Basis of type was passed." + msg = r"The basis must be of DataFrame type or a list of DataFrames\. Basis of type was passed\." with pytest.raises(TypeError, match=msg): dml_plr_obj.cate(basis=2)