From a4b880c89882856f6861a99533ea9535cb90d09f Mon Sep 17 00:00:00 2001
From: SvenKlaassen <sven.klaassen@uni-hamburg.de>
Date: Sat, 31 Jan 2026 09:58:03 +0100
Subject: [PATCH 01/38] first iteration of scalar implementation

---
 doubleml/__init__.py               |   6 +
 doubleml/double_ml_base.py         | 389 +++++++++++++++++++++++++
 doubleml/double_ml_framework.py    |   2 +-
 doubleml/double_ml_linear_score.py | 157 ++++++++++
 doubleml/double_ml_scalar.py       | 452 +++++++++++++++++++++++++++++
 5 files changed, 1005 insertions(+), 1 deletion(-)
 create mode 100644 doubleml/double_ml_base.py
 create mode 100644 doubleml/double_ml_linear_score.py
 create mode 100644 doubleml/double_ml_scalar.py

diff --git a/doubleml/__init__.py b/doubleml/__init__.py
index d4cbb943..0d046c4f 100644
--- a/doubleml/__init__.py
+++ b/doubleml/__init__.py
@@ -1,7 +1,10 @@
 from .data import DoubleMLClusterData, DoubleMLData, DoubleMLDIDData, DoubleMLPanelData, DoubleMLRDDData, DoubleMLSSMData
 from .did.did import DoubleMLDID
 from .did.did_cs import DoubleMLDIDCS
+from .double_ml_base import DoubleMLBase
 from .double_ml_framework import DoubleMLCore, DoubleMLFramework, concat
+from .double_ml_linear_score import LinearScoreMixin
+from .double_ml_scalar import DoubleMLScalar
 from .irm.apo import DoubleMLAPO
 from .irm.apos import DoubleMLAPOS
 from .irm.cvar import DoubleMLCVAR
@@ -20,7 +23,10 @@
 
 __all__ = [
     "concat",
+    "DoubleMLBase",
     "DoubleMLCore",
+    "DoubleMLScalar",
+    "LinearScoreMixin",
     "DoubleMLFramework",
     "DoubleMLPLR",
     "DoubleMLPLIV",
diff --git a/doubleml/double_ml_base.py b/doubleml/double_ml_base.py
new file mode 100644
index 00000000..19eac58a
--- /dev/null
+++ b/doubleml/double_ml_base.py
@@ -0,0 +1,389 @@
+"""
+Abstract base class for Double Machine Learning estimators.
+"""
+
+from abc import ABC, abstractmethod
+from typing import Dict, List, Optional
+
+import numpy as np
+import pandas as pd
+
+from .data.base_data import DoubleMLBaseData
+from .double_ml_framework import DoubleMLFramework
+
+
+class DoubleMLBase(ABC):
+    """
+    Abstract base class for Double Machine Learning.
+
+    Provides basic properties and abstract methods, e.g. the fit() method. Mainly handles
+    properties and methods which rely on an initialized DoubleMLFramework object.
+
+    This class serves as the foundation for both DoubleMLScalar (single parameter estimation)
+    and DoubleMLVector (parameter vector estimation).
+
+    Parameters
+    ----------
+    obj_dml_data : DoubleMLBaseData
+        The data object for the double machine learning model.
+
+    Attributes
+    ----------
+    framework : DoubleMLFramework
+        The DoubleMLFramework object containing estimation results and providing inference methods.
+    thetas : np.ndarray
+        Estimated parameter values (aggregated across repetitions, shape: (n_thetas,)).
+    all_thetas : np.ndarray
+        Estimated parameter values for each repetition (shape: (n_thetas, n_rep)).
+    se : np.ndarray
+        Standard errors of parameter estimates (aggregated across repetitions, shape: (n_thetas,)).
+    all_ses : np.ndarray
+        Standard errors for each repetition (shape: (n_thetas, n_rep)).
+    summary : pd.DataFrame
+        Summary table with estimates, standard errors, confidence intervals, and p-values.
+    psi : np.ndarray
+        Scaled influence function values taken from ``framework.scaled_psi``.
+    smpls : list
+        Sample splitting indices used for cross-fitting.
+    n_obs : int
+        Number of observations reported by the data object.
+    coef, all_coef : np.ndarray
+        Aliases for ``thetas`` and ``all_thetas``.
+    """
+
+    def __init__(
+        self,
+        obj_dml_data: DoubleMLBaseData,
+    ):
+        """
+        Store the data object and prepare empty result slots.
+
+        Parameters
+        ----------
+        obj_dml_data : DoubleMLBaseData
+            The data object for the model; any other type raises TypeError.
+        """
+        # Validate the data object before reading anything from it
+        if not isinstance(obj_dml_data, DoubleMLBaseData):
+            raise TypeError(f"obj_dml_data must be a DoubleMLBaseData instance. " f"Got {type(obj_dml_data)}.")
+
+        self._dml_data = obj_dml_data
+        self._n_obs = obj_dml_data.n_obs
+
+        # Stays None until a subclass fit() assigns it; the framework property guards on this
+        self._framework: Optional[DoubleMLFramework] = None
+
+        # Stays None until sample splits are stored; the smpls property guards on this
+        self._smpls: Optional[List] = None
+
+    # ==================== Properties (Delegating to Framework) ====================
+
+    @property
+    def framework(self) -> DoubleMLFramework:
+        """
+        The DoubleMLFramework object containing estimation results.
+
+        The result properties and inference methods of this class read through this
+        accessor, so they raise the same ValueError until fit() has stored a framework.
+
+        Returns
+        -------
+        DoubleMLFramework
+            The framework object with estimation results.
+
+        Raises
+        ------
+        ValueError
+            If framework is not yet initialized (fit() has not been called).
+        """
+        if self._framework is None:
+            raise ValueError("The framework is not yet initialized. " "Call fit() before accessing estimation results.")
+        return self._framework
+
+    @property
+    def thetas(self) -> np.ndarray:
+        """
+        Estimated parameter values (aggregated across repetitions).
+
+        Returns
+        -------
+        np.ndarray
+            ``framework.thetas``; documented shape is (n_thetas,). Unavailable before fit().
+        """
+        return self.framework.thetas
+
+    @property
+    def coef(self) -> np.ndarray:
+        """
+        Alias for thetas. Estimated parameter values (aggregated across repetitions).
+
+        Returns
+        -------
+        np.ndarray
+            Exactly the value of the ``thetas`` property.
+        """
+        return self.thetas
+
+    @property
+    def all_thetas(self) -> np.ndarray:
+        """
+        Estimated parameter values for each repetition.
+
+        Returns
+        -------
+        np.ndarray
+            ``framework.all_thetas``; documented shape is (n_thetas, n_rep). Unavailable before fit().
+        """
+        return self.framework.all_thetas
+
+    @property
+    def all_coef(self) -> np.ndarray:
+        """
+        Alias for all_thetas. Estimated parameter values for each repetition.
+
+        Returns
+        -------
+        np.ndarray
+            Exactly the value of the ``all_thetas`` property.
+        """
+        return self.all_thetas
+
+    @property
+    def se(self) -> np.ndarray:
+        """
+        Standard errors of parameter estimates (aggregated across repetitions).
+
+        Returns
+        -------
+        np.ndarray
+            ``framework.ses`` (note the plural on the framework side); documented shape is (n_thetas,).
+        """
+        return self.framework.ses
+
+    @property
+    def all_ses(self) -> np.ndarray:
+        """
+        Standard errors for each repetition.
+
+        Returns
+        -------
+        np.ndarray
+            ``framework.all_ses``; documented shape is (n_thetas, n_rep). Unavailable before fit().
+        """
+        return self.framework.all_ses
+
+    @property
+    def summary(self) -> pd.DataFrame:
+        """
+        Summary table with estimates, standard errors, confidence intervals, and p-values.
+
+        Returns
+        -------
+        pd.DataFrame
+            ``framework.summary``, passed through unchanged. Unavailable before fit().
+        """
+        return self.framework.summary
+
+    @property
+    def psi(self) -> np.ndarray:
+        """
+        Normalized influence function values (scaled score function).
+
+        Returns
+        -------
+        np.ndarray
+            ``framework.scaled_psi``; documented shape is (n_obs, n_thetas, n_rep).
+        """
+        return self.framework.scaled_psi
+
+    @property
+    def smpls(self) -> List:
+        """
+        Sample splitting indices used for cross-fitting.
+
+        Returns
+        -------
+        list
+            The stored splits; a ValueError is raised while none have been drawn yet.
+        """
+        if self._smpls is None:
+            raise ValueError("Sample splitting has not been performed. " "Call draw_sample_splitting() first.")
+        return self._smpls
+
+    @property
+    def n_obs(self) -> int:
+        """
+        Number of observations.
+
+        Returns
+        -------
+        int
+            The ``n_obs`` value read from the data object in the constructor.
+        """
+        return self._n_obs
+
+    # ==================== Concrete Methods (Delegating to Framework) ====================
+
+    def confint(self, joint: bool = False, level: float = 0.95) -> pd.DataFrame:
+        """
+        Confidence intervals, computed by ``framework.confint``.
+
+        Parameters
+        ----------
+        joint : bool, optional
+            Indicates whether joint confidence intervals are computed.
+            Default is False.
+        level : float, optional
+            The confidence level for the confidence interval.
+            Default is 0.95.
+
+        Returns
+        -------
+        pd.DataFrame
+            The framework's result, passed through unchanged; a ValueError is raised before fit().
+        """
+        return self.framework.confint(joint=joint, level=level)
+
+    def bootstrap(self, method: str = "normal", n_rep_boot: int = 500) -> "DoubleMLBase":
+        """
+        Run the multiplier bootstrap on the fitted framework.
+
+        Parameters
+        ----------
+        method : str, optional
+            The bootstrap method ('normal', 'Bayes', or 'wild').
+            Default is 'normal'.
+        n_rep_boot : int, optional
+            The number of bootstrap replications.
+            Default is 500.
+
+        Returns
+        -------
+        self : DoubleMLBase
+            This estimator, to allow chaining; the framework call's own return value is discarded.
+        """
+        self.framework.bootstrap(method=method, n_rep_boot=n_rep_boot)
+        return self
+
+    def p_adjust(self, method: str = "romano-wolf") -> pd.DataFrame:
+        """
+        Multiple testing adjustment of p-values, computed by ``framework.p_adjust``.
+
+        Parameters
+        ----------
+        method : str, optional
+            The p-value adjustment method. Default is 'romano-wolf'.
+
+        Returns
+        -------
+        pd.DataFrame
+            The framework's result, passed through unchanged; a ValueError is raised before fit().
+        """
+        return self.framework.p_adjust(method=method)
+
+    def sensitivity_analysis(
+        self,
+        cf_y: float = 0.03,
+        cf_d: float = 0.03,
+        rho: float = 1.0,
+        level: float = 0.95,
+        null_hypothesis: float = 0.0,
+    ) -> Dict:
+        """
+        Sensitivity analysis, forwarded to ``framework.sensitivity_analysis``.
+
+        Parameters
+        ----------
+        cf_y : float, optional
+            Percentage of residual variation in outcome explained by unobserved confounders.
+            Default is 0.03.
+        cf_d : float, optional
+            Percentage of residual variation in treatment explained by unobserved confounders.
+            Default is 0.03.
+        rho : float, optional
+            Correlation between unobserved confounders affecting outcome and treatment.
+            Default is 1.0.
+        level : float, optional
+            The confidence level for robustness analysis.
+            Default is 0.95.
+        null_hypothesis : float, optional
+            The null hypothesis value for the parameter.
+            Default is 0.0.
+
+        Returns
+        -------
+        dict
+            Whatever the framework call returns. NOTE(review): confirm that this really is a dict.
+        """
+        return self.framework.sensitivity_analysis(
+            cf_y=cf_y,
+            cf_d=cf_d,
+            rho=rho,
+            level=level,
+            null_hypothesis=null_hypothesis,
+        )
+
+    # ==================== Abstract Methods ====================
+
+    @abstractmethod
+    def fit(self, **kwargs) -> "DoubleMLBase":
+        """
+        Estimate the DoubleML model.
+
+        Abstract: subclasses implement it and are expected to assign ``self._framework``.
+
+        Parameters
+        ----------
+        **kwargs : dict
+            Additional keyword arguments for fitting.
+
+        Returns
+        -------
+        self : DoubleMLBase
+            The fitted DoubleML estimator.
+        """
+        pass
+
+    @abstractmethod
+    def draw_sample_splitting(self) -> "DoubleMLBase":
+        """
+        Draw sample splitting for cross-fitting.
+
+        Abstract: subclasses implement it and are expected to assign ``self._smpls``,
+        which the ``smpls`` property exposes.
+
+        Returns
+        -------
+        self : DoubleMLBase
+            The DoubleML estimator with initialized sample splits.
+        """
+        pass
+
+    def __str__(self) -> str:
+        """
+        Render a banner with the class name followed by the model status.
+
+        Returns
+        -------
+        str
+            The banner plus the summary table once fitted, otherwise a hint to call fit().
+        """
+        bar = "=" * 20
+        banner = f"{bar} {type(self).__name__} Object {bar}"
+
+        # Guard clause: there is nothing to summarise before fit() has stored a framework
+        if self._framework is None:
+            return f"{banner}\n\nModel not yet fitted. Call fit() first."
+
+        return f"{banner}\n\n{str(self.summary)}"
+
+    def __repr__(self) -> str:
+        """
+        Representation of the DoubleMLBase object; identical to ``str(self)``.
+
+        Returns
+        -------
+        str
+            The same banner-plus-status text that ``__str__`` produces.
+        """
+        return self.__str__()
diff --git a/doubleml/double_ml_framework.py b/doubleml/double_ml_framework.py
index 99941c07..c82ad206 100644
--- a/doubleml/double_ml_framework.py
+++ b/doubleml/double_ml_framework.py
@@ -32,7 +32,7 @@ class DoubleMLCore:
     cluster_dict: Optional[Dict] = None
     sensitivity_elements: Optional[Dict[str, np.ndarray]] = None
     """
-    Core container for DoubleML results .
+    Internal container for DoubleML raw estimation results.
 
     This class stores the main results and diagnostics from a DoubleML estimation, including parameter estimates,
     standard errors, normalized scores, and (optionally) sensitivity and clustering information. It performs
diff --git a/doubleml/double_ml_linear_score.py b/doubleml/double_ml_linear_score.py
new file mode 100644
index 00000000..640e031d
--- /dev/null
+++ b/doubleml/double_ml_linear_score.py
@@ -0,0 +1,157 @@
+"""
+Mixin for DoubleML models with linear score functions.
+"""
+
+from typing import Dict
+
+import numpy as np
+
+from .double_ml_scalar import DoubleMLScalar
+
+
+class LinearScoreMixin(DoubleMLScalar):
+    """
+    Base class (named a mixin) for score functions linear in the target parameter.
+
+    Despite its name this class subclasses DoubleMLScalar directly, so a model inherits from it
+    alone; it supplies _est_causal_pars_and_se() for scores that are linear in θ.
+
+    Score form:
+        ψ(W; θ, η) = θ · ψ_a(W; η) + ψ_b(W; η)
+
+    The solution has a closed form:
+        θ̂ = -E[ψ_b] / E[ψ_a]
+
+    This applies to many common DoubleML models including:
+    - Partially Linear Regression (PLR)
+    - Partially Linear IV Regression (PLIV)
+    - Interactive Regression Model (IRM)
+    - Difference-in-Differences (DID)
+    - and others
+
+    Notes
+    -----
+    Subclasses must implement:
+    - _nuisance_est(): Estimate nuisance parameters for one fold
+    - _get_score_elements(): Return dict with 'psi_a' and 'psi_b' arrays of shape (n_obs, n_rep)
+    """
+
+    def _est_causal_pars_and_se(self, psi_elements: Dict[str, np.ndarray]) -> None:
+        """
+        Estimate causal parameters and standard errors for linear score.
+
+        Solves θ̂ = -E[ψ_b] / E[ψ_a] per repetition and derives the standard error from the
+        influence function ψ / E[ψ_a]. Raises ValueError on missing keys, shape mismatch or E[ψ_a] ≈ 0.
+
+        All computations use framework convention: (n_obs, n_thetas, n_rep).
+
+        Parameters
+        ----------
+        psi_elements : dict
+            Dictionary with score elements. Must contain:
+            - 'psi_a': np.ndarray of shape (n_obs, n_rep)
+            - 'psi_b': np.ndarray of shape (n_obs, n_rep)
+
+        Notes
+        -----
+        Updates the following attributes (all in framework convention):
+        - self._all_thetas: Parameter estimates for each repetition (n_thetas=1, n_rep)
+        - self._all_ses: Standard errors for each repetition (n_thetas=1, n_rep)
+        - self._psi: Influence function values (n_obs, n_thetas=1, n_rep)
+        - self._psi_deriv: Score derivative w.r.t. θ (n_obs, n_thetas=1, n_rep)
+        - self._var_scaling_factors: Variance scaling factors (n_thetas=1,)
+        """
+        # Extract score elements
+        if "psi_a" not in psi_elements or "psi_b" not in psi_elements:
+            raise ValueError(
+                "LinearScoreMixin requires 'psi_a' and 'psi_b' in psi_elements. " f"Got keys: {list(psi_elements.keys())}"
+            )
+
+        psi_a = psi_elements["psi_a"]  # Shape: (n_obs, n_rep)
+        psi_b = psi_elements["psi_b"]  # Shape: (n_obs, n_rep)
+
+        # Validate shapes
+        if psi_a.shape != psi_b.shape:
+            raise ValueError(f"psi_a and psi_b must have the same shape. " f"Got psi_a: {psi_a.shape}, psi_b: {psi_b.shape}")
+
+        n_obs, n_rep = psi_a.shape
+
+        if n_rep != self.n_rep:
+            raise ValueError(f"Score elements have {n_rep} repetitions, but model expects {self.n_rep}.")
+
+        # Compute parameter estimates using closed-form solution
+        # θ̂ = -E[ψ_b] / E[ψ_a]
+        mean_psi_a = np.mean(psi_a, axis=0)  # (n_rep,)
+        mean_psi_b = np.mean(psi_b, axis=0)  # (n_rep,)
+
+        # Check for zero denominator
+        if np.any(np.abs(mean_psi_a) < 1e-12):
+            raise ValueError(
+                "Division by near-zero detected in linear score estimation. "
+                "E[psi_a] is very close to zero. This may indicate issues with "
+                "the nuisance models or data."
+            )
+
+        thetas = -mean_psi_b / mean_psi_a  # (n_rep,)
+
+        # Store parameter estimates in framework shape: (n_thetas=1, n_rep)
+        self._all_thetas = thetas[np.newaxis, :]  # (1, n_rep)
+
+        # Compute influence function (score evaluated at θ̂)
+        # ψ(W; θ̂, η) = θ̂ · ψ_a + ψ_b
+        # Shape: (n_obs, n_rep)
+        psi = thetas[np.newaxis, :] * psi_a + psi_b  # Broadcasting: (1, n_rep) * (n_obs, n_rep)
+
+        # Store influence function in framework shape: (n_obs, n_thetas=1, n_rep)
+        self._psi = psi[:, np.newaxis, :]  # (n_obs, 1, n_rep)
+
+        # Compute score derivative w.r.t. θ
+        # ∂ψ/∂θ = ψ_a
+        # Store in framework shape: (n_obs, n_thetas=1, n_rep)
+        self._psi_deriv = psi_a[:, np.newaxis, :]  # (n_obs, 1, n_rep)
+
+        # Compute standard errors of θ̂ from the influence function ψ / E[ψ_a]
+        # SE = std(ψ) / (|E[ψ_a]| · sqrt(n)); omitting the Jacobian E[ψ_a] mis-scales the SE
+        se = np.std(psi, axis=0) / (np.abs(mean_psi_a) * np.sqrt(n_obs))  # (n_rep,)
+        self._all_ses = se[np.newaxis, :]  # (1, n_rep)
+
+        # Compute variance scaling factors as 1 / E[∂ψ/∂θ]^2 = 1 / E[ψ_a]^2
+        # NOTE(review): confirm DoubleMLCore expects this quantity here rather than e.g. n_obs
+        var_scaling_factors = 1.0 / (mean_psi_a**2)  # (n_rep,)
+
+        # Take mean across repetitions and store in framework shape: (n_thetas=1,)
+        self._var_scaling_factors = np.array([np.mean(var_scaling_factors)])  # (1,)
+
+    def _compute_score(self, psi_elements: Dict[str, np.ndarray], coef: float) -> np.ndarray:
+        """
+        Evaluate the linear score at an arbitrary coefficient value.
+
+        Intended for verification and diagnostics rather than for estimation itself.
+
+        Parameters
+        ----------
+        psi_elements : dict
+            Dictionary with 'psi_a' and 'psi_b' of shape (n_obs, n_rep).
+        coef : float
+            The coefficient value at which to evaluate the score.
+
+        Returns
+        -------
+        np.ndarray
+            ``coef * psi_a + psi_b``, shape (n_obs, n_rep).
+        """
+        # ψ(W; coef, η) = coef · ψ_a + ψ_b, evaluated elementwise
+        slope, intercept = psi_elements["psi_a"], psi_elements["psi_b"]
+        linear_score = coef * slope + intercept
+        return linear_score
+
+    def _score_element_names(self) -> list:
+        """
+        Get the names of score elements for this model.
+
+        Returns
+        -------
+        list
+            A new list ['psi_a', 'psi_b']: the keys _est_causal_pars_and_se() requires.
+        """
+        return ["psi_a", "psi_b"]
diff --git a/doubleml/double_ml_scalar.py b/doubleml/double_ml_scalar.py
new file mode 100644
index 00000000..5cd4a381
--- /dev/null
+++ b/doubleml/double_ml_scalar.py
@@ -0,0 +1,452 @@
+"""
+Abstract base class for scalar DoubleML models (single parameter estimation).
+"""
+
+from abc import ABC, abstractmethod
+from typing import Dict, Optional
+
+import numpy as np
+
+from .data.base_data import DoubleMLBaseData
+from .double_ml_base import DoubleMLBase
+from .double_ml_framework import DoubleMLCore as DoubleMLCoreData
+from .double_ml_framework import DoubleMLFramework
+from .utils.resampling import DoubleMLResampling
+
+
+class DoubleMLScalar(DoubleMLBase, ABC):
+    """
+    Abstract base class for scalar DoubleML models.
+
+    Defines the fit() method for a single parameter based on abstract private methods
+    such as _nuisance_est(). The score itself is solved by _est_causal_pars_and_se().
+    Requires a single treatment column in the data object.
+
+    This class implements the template method pattern: the fit() method orchestrates
+    the estimation process by calling abstract methods that subclasses must implement.
+
+    Parameters
+    ----------
+    obj_dml_data : DoubleMLBaseData
+        The data object for the double machine learning model.
+        Must contain exactly one treatment variable.
+    n_folds : int, optional
+        Number of folds for cross-fitting. Default is 5.
+    n_rep : int, optional
+        Number of repetitions for sample splitting. Default is 1.
+    score : str, optional
+        The score function to use. Default is 'default'; stored without validation here.
+    draw_sample_splitting : bool, optional
+        Whether to draw sample splits on initialization. Default is True.
+
+    Attributes
+    ----------
+    n_folds : int
+        Number of folds for cross-fitting.
+    n_rep : int
+        Number of repetitions for sample splitting.
+    score : str
+        The score function being used.
+    """
+
+    def __init__(
+        self,
+        obj_dml_data: DoubleMLBaseData,
+        n_folds: int = 5,
+        n_rep: int = 1,
+        score: str = "default",
+        draw_sample_splitting: bool = True,
+    ):
+        """
+        Initialize DoubleMLScalar.
+
+        Parameters
+        ----------
+        obj_dml_data : DoubleMLBaseData
+            The data object. Must have exactly one treatment column.
+        n_folds : int, optional
+            Number of folds for cross-fitting. Default is 5.
+        n_rep : int, optional
+            Number of repetitions for sample splitting. Default is 1.
+        score : str, optional
+            The score function to use. Default is 'default'.
+        draw_sample_splitting : bool, optional
+            Whether to draw sample splits on initialization. Default is True.
+
+        Raises
+        ------
+        ValueError
+            If obj_dml_data does not have exactly one treatment column, or n_folds / n_rep are invalid.
+        TypeError
+            If obj_dml_data is not a DoubleMLBaseData or draw_sample_splitting is not a bool.
+        """
+        # Call parent constructor first: it type-checks obj_dml_data before .d_cols is read below
+        super().__init__(obj_dml_data)
+
+        # Validate single treatment column
+        if len(obj_dml_data.d_cols) != 1:
+            raise ValueError(
+                f"DoubleMLScalar requires exactly one treatment column. "
+                f"Got {len(obj_dml_data.d_cols)}: {obj_dml_data.d_cols}. "
+                f"For multiple treatments, use DoubleMLVector."
+            )
+
+        # Validate and store resampling parameters
+        if not isinstance(n_folds, int) or n_folds < 2:
+            raise ValueError(f"n_folds must be an integer >= 2. Got {n_folds}.")
+        if not isinstance(n_rep, int) or n_rep < 1:
+            raise ValueError(f"n_rep must be an integer >= 1. Got {n_rep}.")
+        if not isinstance(draw_sample_splitting, bool):
+            raise TypeError(f"draw_sample_splitting must be bool. Got {type(draw_sample_splitting)}.")
+
+        self._n_folds = n_folds
+        self._n_rep = n_rep
+        self._score = score
+
+        # Initialize storage for predictions and results
+        self._predictions: Optional[Dict[str, np.ndarray]] = None
+        self._all_thetas: Optional[np.ndarray] = None
+        self._all_ses: Optional[np.ndarray] = None
+        self._psi: Optional[np.ndarray] = None
+        self._psi_deriv: Optional[np.ndarray] = None
+        self._var_scaling_factors: Optional[np.ndarray] = None
+
+        # For iteration (used during fit)
+        self._i_rep: Optional[int] = None
+        self._i_fold: Optional[int] = None
+
+        # Draw sample splitting if requested
+        if draw_sample_splitting:
+            self.draw_sample_splitting()
+
+    # ==================== Properties ====================
+
+    @property
+    def n_folds(self) -> int:
+        """
+        Number of folds for cross-fitting.
+
+        Returns
+        -------
+        int
+            The value validated in the constructor (an integer >= 2).
+        """
+        return self._n_folds
+
+    @property
+    def n_rep(self) -> int:
+        """
+        Number of repetitions for sample splitting.
+
+        Returns
+        -------
+        int
+            The value validated in the constructor (an integer >= 1).
+        """
+        return self._n_rep
+
+    @property
+    def score(self) -> str:
+        """
+        The score function being used.
+
+        Returns
+        -------
+        str
+            The ``score`` constructor argument, stored by this class without validation.
+        """
+        return self._score
+
+    @property
+    def predictions(self) -> Dict[str, np.ndarray]:
+        """
+        Predictions from nuisance models (if stored during fit).
+
+        Returns
+        -------
+        dict
+            The internal prediction dictionary itself (not a copy).
+
+        Raises
+        ------
+        ValueError
+            If no prediction storage exists, i.e. ``self._predictions`` is still None.
+        """
+        if self._predictions is None:
+            raise ValueError("Predictions not available. Call fit() with store_predictions=True.")
+        return self._predictions
+
+    # ==================== Concrete fit() Method (Template) ====================
+
+    def fit(self, n_jobs_cv: Optional[int] = None, store_predictions: bool = True, **kwargs) -> "DoubleMLScalar":
+        """
+        Estimate the DoubleML model.
+
+        This is the concrete implementation of the fit() method using the template method pattern.
+        It orchestrates the estimation by:
+        1. Ensuring sample splitting is initialized
+        2. Initializing storage arrays (prediction storage is always allocated for the run)
+        3. Looping over repetitions and folds
+        4. Calling abstract _nuisance_est() for each fold (implemented by subclasses)
+        5. Computing score elements via _get_score_elements() (implemented by subclasses)
+        6. Estimating parameters via _est_causal_pars_and_se() (from score mixin)
+        7. Constructing the DoubleMLFramework, then dropping predictions if not requested
+
+        Parameters
+        ----------
+        n_jobs_cv : int, optional
+            Number of jobs for parallel processing during cross-validation.
+            Currently not used (reserved for future parallelization).
+        store_predictions : bool, optional
+            Whether to keep the nuisance predictions after fitting. Default is True.
+        **kwargs : dict
+            Additional keyword arguments; currently accepted and ignored.
+
+        Returns
+        -------
+        self : DoubleMLScalar
+            The fitted estimator.
+        """
+        # Step 1: Ensure sample splitting is initialized
+        if self._smpls is None:
+            self.draw_sample_splitting()
+
+        # Step 2: Initialize storage arrays. Prediction storage is always allocated because
+        # _nuisance_est() writes into self._predictions and the score elements are built from it.
+        self._initialize_arrays(store_predictions=True)
+
+        # Step 3: Cross-fitting loop over repetitions and folds
+        for i_rep in range(self.n_rep):
+            self._i_rep = i_rep
+
+            for i_fold in range(self.n_folds):
+                self._i_fold = i_fold
+
+                # Get train/test indices for this fold
+                train_idx, test_idx = self._smpls[i_rep][i_fold]
+
+                # Step 4: Call abstract method - subclass implements nuisance estimation
+                self._nuisance_est(train_idx=train_idx, test_idx=test_idx, i_rep=i_rep, i_fold=i_fold)
+
+        # Step 5: Get score elements - subclass implements
+        psi_elements = self._get_score_elements()
+
+        # Step 6: Estimate causal parameters - from score mixin
+        self._est_causal_pars_and_se(psi_elements)
+
+        # Step 7: Construct framework
+        self._framework = self._construct_framework()
+
+        # Honour store_predictions=False only now that the predictions are no longer needed
+        if not store_predictions:
+            self._predictions = None
+
+        return self
+
+    def draw_sample_splitting(self) -> "DoubleMLScalar":
+        """
+        Draw sample splitting for cross-fitting.
+
+        Delegates to DoubleMLResampling.split_samples() with this model's n_folds, n_rep
+        and n_obs, and stores the result in self._smpls, replacing any earlier splits.
+
+        Returns
+        -------
+        self : DoubleMLScalar
+            The estimator with initialized sample splits.
+        """
+        # Resampling configuration comes straight from the validated constructor arguments
+        split_config = {
+            "n_folds": self.n_folds,
+            "n_rep": self.n_rep,
+            "n_obs": self._n_obs,
+        }
+
+        # Build the resampler and store its splits in one step
+        self._smpls = DoubleMLResampling(**split_config).split_samples()
+
+        return self
+
+    # ==================== Private Helper Methods ====================
+
+    def _initialize_arrays(self, store_predictions: bool = True) -> None:
+        """
+        Reset the result arrays and, on request, the prediction storage.
+
+        Parameters
+        ----------
+        store_predictions : bool
+            Whether to (re)create the prediction dictionary; if False it is left as it is.
+        """
+        # A scalar model has exactly one target parameter, hence n_thetas = 1
+        param_shape = (1, self.n_rep)  # (n_thetas, n_rep)
+        score_shape = (self._n_obs, 1, self.n_rep)  # (n_obs, n_thetas, n_rep)
+
+        # Prediction storage is only (re)created on request
+        if store_predictions:
+            self._predictions = self._initialize_predictions_dict()
+
+        # Zero-filled placeholders in framework layout:
+        # (n_thetas, n_rep) for params, (n_obs, n_thetas, n_rep) for scores.
+        # _est_causal_pars_and_se() is expected to replace them with the estimates.
+        self._all_thetas = np.zeros(param_shape)
+        self._all_ses = np.zeros(param_shape)
+        self._psi = np.zeros(score_shape)
+        self._psi_deriv = np.zeros(score_shape)
+
+    def _initialize_predictions_dict(self) -> Dict[str, np.ndarray]:
+        """
+        Initialize dictionary for storing predictions.
+
+        Hook called by _initialize_arrays(); subclasses can override it to pre-allocate storage.
+
+        Returns
+        -------
+        dict
+            A new empty dictionary in this base implementation.
+        """
+        # Default: no keys at all, so indexing a component fails until a subclass overrides this
+        # Subclasses should override to create arrays for their specific nuisance components
+        return {}
+
+    def _construct_framework(self) -> DoubleMLFramework:
+        """
+        Package the stored estimation results into a DoubleMLFramework.
+
+        Returns
+        -------
+        DoubleMLFramework
+            Framework built from the estimates, standard errors and the scaled score.
+        """
+        # Mean derivative over observations, kept 3-D so that it broadcasts against psi
+        mean_deriv = self._psi_deriv.mean(axis=0, keepdims=True)
+        # Standardize the score function: psi / E[psi_deriv]
+        scaled_psi = self._psi / mean_deriv
+
+        # Bundle the raw results; the arrays are passed through without reshaping
+        core = DoubleMLCoreData(
+            all_thetas=self._all_thetas,  # (n_thetas, n_rep)
+            all_ses=self._all_ses,  # (n_thetas, n_rep)
+            var_scaling_factors=self._var_scaling_factors,  # (n_thetas,)
+            scaled_psi=scaled_psi,  # (n_obs, n_thetas, n_rep)
+            is_cluster_data=False,  # TODO: Add cluster data support
+        )
+
+        # Wrap the container together with the treatment column names
+        treatment_names = self._dml_data.d_cols
+        framework = DoubleMLFramework(dml_core=core, treatment_names=treatment_names)
+        return framework
+
+    # ==================== Abstract Methods (Must be Implemented by Subclasses) ====================
+
+    @abstractmethod
+    def _nuisance_est(
+        self,
+        train_idx: np.ndarray,
+        test_idx: np.ndarray,
+        i_rep: int,
+        i_fold: int,
+    ) -> None:
+        """
+        Estimate nuisance parameters for one fold.
+
+        This is the main method subclasses must implement. It should:
+        1. Extract training and test data using train_idx and test_idx
+        2. Fit nuisance models (e.g., outcome model, treatment model) on training data
+        3. Predict on test data
+        4. Store predictions in self._predictions
+
+        Parameters
+        ----------
+        train_idx : np.ndarray
+            Indices of training observations for this fold.
+        test_idx : np.ndarray
+            Indices of test observations for this fold.
+        i_rep : int
+            Repetition index (0 to n_rep-1).
+        i_fold : int
+            Fold index (0 to n_folds-1).
+
+        Notes
+        -----
+        Subclasses should store predictions in self._predictions, for example:
+            self._predictions['ml_l'][test_idx, i_rep] = l_hat
+            self._predictions['ml_m'][test_idx, i_rep] = m_hat
+        """
+        pass
+
+    @abstractmethod
+    def _get_score_elements(self) -> Dict[str, np.ndarray]:
+        """
+        Compute score function elements from nuisance predictions.
+
+        This method should use the predictions stored in self._predictions
+        to compute the components of the score function.
+
+        Returns
+        -------
+        dict
+            Dictionary with score elements.
+            For LinearScoreMixin: {'psi_a': array, 'psi_b': array}
+            For NonLinearScoreMixin: model-specific elements
+
+        Notes
+        -----
+        The score elements should have shape (n_obs, n_rep) for scalar models.
+
+        Example for PLR (linear score):
+            psi_a = (D - m_hat) ** 2  # shape: (n_obs, n_rep)
+            psi_b = (D - m_hat) * (Y - l_hat)  # shape: (n_obs, n_rep)
+            return {'psi_a': psi_a, 'psi_b': psi_b}
+        """
+        pass
+
+    @abstractmethod
+    def _est_causal_pars_and_se(self, psi_elements: Dict[str, np.ndarray]) -> None:
+        """
+        Estimate causal parameters and standard errors from score elements.
+
+        This method is implemented by score mixins (LinearScoreMixin or NonLinearScoreMixin).
+        It should:
+        1. Compute parameter estimates (self._all_thetas)
+        2. Compute standard errors (self._all_ses)
+        3. Compute influence function (self._psi)
+        4. Compute score derivative (self._psi_deriv)
+        5. Compute variance scaling factors (self._var_scaling_factors)
+
+        Parameters
+        ----------
+        psi_elements : dict
+            Dictionary with score function elements from _get_score_elements().
+
+        Notes
+        -----
+        After this method, all arrays must follow framework convention:
+        - self._all_thetas should have shape (n_thetas, n_rep)
+        - self._all_ses should have shape (n_thetas, n_rep)
+        - self._psi should have shape (n_obs, n_thetas, n_rep)
+        - self._psi_deriv should have shape (n_obs, n_thetas, n_rep)
+        - self._var_scaling_factors should have shape (n_thetas,)
+        """
+        pass
+
+    def __str__(self) -> str:
+        """
+        String representation of the DoubleMLScalar object.
+
+        Returns
+        -------
+        str
+            A formatted string summary of the model.
+        """
+        class_name = self.__class__.__name__
+        header = f"{'=' * 20} {class_name} Object {'=' * 20}"
+
+        info = f"Score function: {self.score}\n"
+        info += f"Resampling: {self.n_folds}-fold CV, {self.n_rep} repetitions\n"
+
+        if self._framework is not None:
+            summary_str = str(self.summary)
+            return f"{header}\n\n{info}\n{summary_str}"
+        else:
+            return f"{header}\n\n{info}\nModel not yet fitted. Call fit() first."

From 4f4c25574b49a13b7c3e049c5a9800f8523dc866 Mon Sep 17 00:00:00 2001
From: SvenKlaassen <sven.klaassen@uni-hamburg.de>
Date: Sun, 1 Feb 2026 10:26:37 +0100
Subject: [PATCH 02/38] refactor DoubleMLScalar to split fit() into separate
 parts

---
 doubleml/double_ml_base.py         |  41 +----
 doubleml/double_ml_linear_score.py |  16 +-
 doubleml/double_ml_scalar.py       | 270 +++++++++++++++++++----------
 3 files changed, 193 insertions(+), 134 deletions(-)

diff --git a/doubleml/double_ml_base.py b/doubleml/double_ml_base.py
index 19eac58a..645e3ed6 100644
--- a/doubleml/double_ml_base.py
+++ b/doubleml/double_ml_base.py
@@ -3,7 +3,7 @@
 """
 
 from abc import ABC, abstractmethod
-from typing import Dict, List, Optional
+from typing import Dict, Optional, Self
 
 import numpy as np
 import pandas as pd
@@ -43,10 +43,6 @@ class DoubleMLBase(ABC):
         Summary table with estimates, standard errors, confidence intervals, and p-values.
     psi : np.ndarray
         Influence function values (shape: (n_obs, n_thetas, n_rep)).
-    smpls : list
-        Sample splitting indices used for cross-fitting.
-    n_folds : int
-        Number of folds used for cross-fitting.
     n_rep : int
         Number of repetitions for sample splitting.
     """
@@ -73,9 +69,6 @@ def __init__(
         # Framework is initialized after fit()
         self._framework: Optional[DoubleMLFramework] = None
 
-        # Sample splits are initialized via draw_sample_splitting()
-        self._smpls: Optional[List] = None
-
     # ==================== Properties (Delegating to Framework) ====================
 
     @property
@@ -197,18 +190,17 @@ def psi(self) -> np.ndarray:
         return self.framework.scaled_psi
 
     @property
-    def smpls(self) -> List:
+    @abstractmethod
+    def n_rep(self) -> int:
         """
-        Sample splitting indices used for cross-fitting.
+        Number of repetitions for sample splitting.
 
         Returns
         -------
-        list
-            List of sample splitting indices for each repetition.
+        int
+            Number of repetitions.
         """
-        if self._smpls is None:
-            raise ValueError("Sample splitting has not been performed. " "Call draw_sample_splitting() first.")
-        return self._smpls
+        pass
 
     @property
     def n_obs(self) -> int:
@@ -244,7 +236,7 @@ def confint(self, joint: bool = False, level: float = 0.95) -> pd.DataFrame:
         """
         return self.framework.confint(joint=joint, level=level)
 
-    def bootstrap(self, method: str = "normal", n_rep_boot: int = 500) -> "DoubleMLBase":
+    def bootstrap(self, method: str = "normal", n_rep_boot: int = 500) -> Self:
         """
         Multiplier bootstrap for DoubleML models.
 
@@ -326,7 +318,7 @@ def sensitivity_analysis(
     # ==================== Abstract Methods ====================
 
     @abstractmethod
-    def fit(self, **kwargs) -> "DoubleMLBase":
+    def fit(self, **kwargs) -> Self:
         """
         Estimate the DoubleML model.
 
@@ -344,21 +336,6 @@ def fit(self, **kwargs) -> "DoubleMLBase":
         """
         pass
 
-    @abstractmethod
-    def draw_sample_splitting(self) -> "DoubleMLBase":
-        """
-        Draw sample splitting for cross-fitting.
-
-        This method must be implemented by subclasses to generate sample splits
-        using an appropriate resampling strategy.
-
-        Returns
-        -------
-        self : DoubleMLBase
-            The DoubleML estimator with initialized sample splits.
-        """
-        pass
-
     def __str__(self) -> str:
         """
         String representation of the DoubleMLBase object.
diff --git a/doubleml/double_ml_linear_score.py b/doubleml/double_ml_linear_score.py
index 640e031d..4bada4d8 100644
--- a/doubleml/double_ml_linear_score.py
+++ b/doubleml/double_ml_linear_score.py
@@ -110,17 +110,15 @@ def _est_causal_pars_and_se(self, psi_elements: Dict[str, np.ndarray]) -> None:
         # Store in framework shape: (n_obs, n_thetas=1, n_rep)
         self._psi_deriv = psi_a[:, np.newaxis, :]  # (n_obs, 1, n_rep)
 
-        # Compute standard errors
-        # SE = std(ψ) / sqrt(n)
-        se = np.std(psi, axis=0) / np.sqrt(n_obs)  # (n_rep,)
+        # Compute standard errors using sandwich variance estimator
+        # Var(θ̂) = E[ψ²] / (n · J²), where J = E[ψ_a]
+        # SE = sqrt(E[ψ²]) / (|J| · sqrt(n))
+        gamma_hat = np.mean(psi**2, axis=0)  # (n_rep,)
+        se = np.sqrt(gamma_hat) / (np.abs(mean_psi_a) * np.sqrt(n_obs))  # (n_rep,)
         self._all_ses = se[np.newaxis, :]  # (1, n_rep)
 
-        # Compute variance scaling factors
-        # This is 1 / E[∂ψ/∂θ]^2 = 1 / E[ψ_a]^2
-        var_scaling_factors = 1.0 / (mean_psi_a**2)  # (n_rep,)
-
-        # Take mean across repetitions and store in framework shape: (n_thetas=1,)
-        self._var_scaling_factors = np.array([np.mean(var_scaling_factors)])  # (1,)
+        # Variance scaling factor: n_obs (used by framework for aggregation)
+        self._var_scaling_factors = np.array([n_obs])  # (1,)
 
     def _compute_score(self, psi_elements: Dict[str, np.ndarray], coef: float) -> np.ndarray:
         """
diff --git a/doubleml/double_ml_scalar.py b/doubleml/double_ml_scalar.py
index 5cd4a381..d4b3cf36 100644
--- a/doubleml/double_ml_scalar.py
+++ b/doubleml/double_ml_scalar.py
@@ -3,7 +3,7 @@
 """
 
 from abc import ABC, abstractmethod
-from typing import Dict, Optional
+from typing import Dict, List, Optional, Self
 
 import numpy as np
 
@@ -30,21 +30,15 @@ class DoubleMLScalar(DoubleMLBase, ABC):
     obj_dml_data : DoubleMLBaseData
         The data object for the double machine learning model.
         Must contain exactly one treatment variable.
-    n_folds : int, optional
-        Number of folds for cross-fitting. Default is 5.
-    n_rep : int, optional
-        Number of repetitions for sample splitting. Default is 1.
     score : str, optional
         The score function to use. Default is model-specific.
-    draw_sample_splitting : bool, optional
-        Whether to draw sample splits on initialization. Default is True.
 
     Attributes
     ----------
     n_folds : int
-        Number of folds for cross-fitting.
+        Number of folds for cross-fitting (set via draw_sample_splitting).
     n_rep : int
-        Number of repetitions for sample splitting.
+        Number of repetitions for sample splitting (set via draw_sample_splitting).
     score : str
         The score function being used.
     """
@@ -52,10 +46,7 @@ class DoubleMLScalar(DoubleMLBase, ABC):
     def __init__(
         self,
         obj_dml_data: DoubleMLBaseData,
-        n_folds: int = 5,
-        n_rep: int = 1,
         score: str = "default",
-        draw_sample_splitting: bool = True,
     ):
         """
         Initialize DoubleMLScalar.
@@ -64,21 +55,13 @@ def __init__(
         ----------
         obj_dml_data : DoubleMLBaseData
             The data object. Must have exactly one treatment column.
-        n_folds : int, optional
-            Number of folds for cross-fitting. Default is 5.
-        n_rep : int, optional
-            Number of repetitions for sample splitting. Default is 1.
         score : str, optional
             The score function to use. Default is 'default'.
-        draw_sample_splitting : bool, optional
-            Whether to draw sample splits on initialization. Default is True.
 
         Raises
         ------
         ValueError
             If obj_dml_data contains more than one treatment column.
-        TypeError
-            If parameters have incorrect types.
         """
         # Validate single treatment column
         if len(obj_dml_data.d_cols) != 1:
@@ -91,18 +74,13 @@ def __init__(
         # Call parent constructor
         super().__init__(obj_dml_data)
 
-        # Validate and store resampling parameters
-        if not isinstance(n_folds, int) or n_folds < 2:
-            raise ValueError(f"n_folds must be an integer >= 2. Got {n_folds}.")
-        if not isinstance(n_rep, int) or n_rep < 1:
-            raise ValueError(f"n_rep must be an integer >= 1. Got {n_rep}.")
-        if not isinstance(draw_sample_splitting, bool):
-            raise TypeError(f"draw_sample_splitting must be bool. Got {type(draw_sample_splitting)}.")
-
-        self._n_folds = n_folds
-        self._n_rep = n_rep
         self._score = score
 
+        # Resampling parameters (set via draw_sample_splitting)
+        self._n_folds: Optional[int] = None
+        self._n_rep: Optional[int] = None
+        self._smpls: Optional[List] = None
+
         # Initialize storage for predictions and results
         self._predictions: Optional[Dict[str, np.ndarray]] = None
         self._all_thetas: Optional[np.ndarray] = None
@@ -115,10 +93,6 @@ def __init__(
         self._i_rep: Optional[int] = None
         self._i_fold: Optional[int] = None
 
-        # Draw sample splitting if requested
-        if draw_sample_splitting:
-            self.draw_sample_splitting()
-
     # ==================== Properties ====================
 
     @property
@@ -130,7 +104,14 @@ def n_folds(self) -> int:
         -------
         int
             Number of folds.
+
+        Raises
+        ------
+        ValueError
+            If sample splitting has not been performed yet.
         """
+        if self._n_folds is None:
+            raise ValueError("n_folds not set. Call draw_sample_splitting() first.")
         return self._n_folds
 
     @property
@@ -142,7 +123,14 @@ def n_rep(self) -> int:
         -------
         int
             Number of repetitions.
+
+        Raises
+        ------
+        ValueError
+            If sample splitting has not been performed yet.
         """
+        if self._n_rep is None:
+            raise ValueError("n_rep not set. Call draw_sample_splitting() first.")
         return self._n_rep
 
     @property
@@ -160,7 +148,7 @@ def score(self) -> str:
     @property
     def predictions(self) -> Dict[str, np.ndarray]:
         """
-        Predictions from nuisance models (if stored during fit).
+        Predictions from nuisance models.
 
         Returns
         -------
@@ -170,51 +158,115 @@ def predictions(self) -> Dict[str, np.ndarray]:
         Raises
         ------
         ValueError
-            If predictions were not stored during fit.
+            If the model has not been fitted yet.
         """
         if self._predictions is None:
-            raise ValueError("Predictions not available. Call fit() with store_predictions=True.")
+            raise ValueError("Predictions not available. Call fit() first.")
         return self._predictions
 
+    @property
+    def smpls(self) -> List:
+        """
+        Sample splitting indices used for cross-fitting.
+
+        Returns
+        -------
+        list
+            List of sample splitting indices for each repetition.
+        """
+        if self._smpls is None:
+            raise ValueError("Sample splitting has not been performed. Call draw_sample_splitting() first.")
+        return self._smpls
+
     # ==================== Concrete fit() Method (Template) ====================
 
-    def fit(self, n_jobs_cv: Optional[int] = None, store_predictions: bool = True, **kwargs) -> "DoubleMLScalar":
+    def fit(
+        self,
+        n_folds: int = 5,
+        n_rep: int = 1,
+        n_jobs_cv: Optional[int] = None,
+        external_predictions: Optional[Dict[str, np.ndarray]] = None,
+        **kwargs,
+    ) -> Self:
         """
         Estimate the DoubleML model.
 
-        This is the concrete implementation of the fit() method using the template method pattern.
-        It orchestrates the estimation by:
-        1. Ensuring sample splitting is initialized
-        2. Initializing storage arrays
-        3. Looping over repetitions and folds
-        4. Calling abstract _nuisance_est() for each fold (implemented by subclasses)
-        5. Computing score elements via _get_score_elements() (implemented by subclasses)
-        6. Estimating parameters via _est_causal_pars_and_se() (from score mixin)
-        7. Constructing the DoubleMLFramework
+        Calls :meth:`draw_sample_splitting` (if not yet done),
+        :meth:`fit_nuisance_models`, and :meth:`estimate_causal_parameters`.
 
         Parameters
         ----------
+        n_folds : int, optional
+            Number of folds for cross-fitting. Default is 5.
+            Only used if sample splitting has not been drawn yet.
+        n_rep : int, optional
+            Number of repetitions for sample splitting. Default is 1.
+            Only used if sample splitting has not been drawn yet.
         n_jobs_cv : int, optional
             Number of jobs for parallel processing during cross-validation.
             Currently not used (reserved for future parallelization).
-        store_predictions : bool, optional
-            Whether to store predictions from nuisance models. Default is True.
+        external_predictions : dict or None, optional
+            Dictionary of pre-computed nuisance predictions to use instead of fitting
+            learners. Keys are learner names (e.g., ``'ml_l'``, ``'ml_m'``), values are
+            arrays of shape ``(n_obs, n_rep)``. Learners not in the dict are fitted normally.
+            Default is ``None``.
         **kwargs : dict
             Additional keyword arguments (for future extensibility).
 
         Returns
         -------
-        self : DoubleMLScalar
+        self : Self
             The fitted estimator.
         """
-        # Step 1: Ensure sample splitting is initialized
         if self._smpls is None:
-            self.draw_sample_splitting()
+            self.draw_sample_splitting(n_folds=n_folds, n_rep=n_rep)
+        self.fit_nuisance_models(n_jobs_cv=n_jobs_cv, external_predictions=external_predictions)
+        self.estimate_causal_parameters()
+        return self
+
+    def fit_nuisance_models(
+        self,
+        n_jobs_cv: Optional[int] = None,
+        external_predictions: Optional[Dict[str, np.ndarray]] = None,
+    ) -> Self:
+        """
+        Fit nuisance models via cross-fitting.
+
+        Requires sample splitting to be initialized via :meth:`draw_sample_splitting`
+        before calling this method.
+
+        Parameters
+        ----------
+        n_jobs_cv : int, optional
+            Number of jobs for parallel processing during cross-validation.
+            Currently not used (reserved for future parallelization).
+        external_predictions : dict or None, optional
+            Dictionary of pre-computed nuisance predictions. Keys are learner names,
+            values are arrays of shape ``(n_obs, n_rep)``. Default is ``None``.
+
+        Returns
+        -------
+        self : Self
+            The estimator with fitted nuisance models and stored predictions.
+
+        Raises
+        ------
+        ValueError
+            If sample splitting has not been initialized.
+        """
+        if self._smpls is None:
+            raise ValueError("Sample splitting has not been initialized. Call draw_sample_splitting() first.")
+
+        # Initialize prediction arrays
+        self._predictions = self._initialize_predictions_dict()
 
-        # Step 2: Initialize storage arrays
-        self._initialize_arrays(store_predictions=store_predictions)
+        # Pre-fill external predictions
+        if external_predictions is not None:
+            for key, values in external_predictions.items():
+                if key in self._predictions:
+                    self._predictions[key][:] = values
 
-        # Step 3: Cross-fitting loop over repetitions and folds
+        # Cross-fitting loop over repetitions and folds
         for i_rep in range(self.n_rep):
             self._i_rep = i_rep
 
@@ -224,41 +276,87 @@ def fit(self, n_jobs_cv: Optional[int] = None, store_predictions: bool = True, *
                 # Get train/test indices for this fold
                 train_idx, test_idx = self._smpls[i_rep][i_fold]
 
-                # Step 4: Call abstract method - subclass implements nuisance estimation
+                # Call abstract method - subclass implements nuisance estimation
                 self._nuisance_est(
                     train_idx=train_idx,
                     test_idx=test_idx,
                     i_rep=i_rep,
                     i_fold=i_fold,
+                    external_predictions=external_predictions,
                 )
 
-        # Step 5: Get score elements - subclass implements
+        return self
+
+    def estimate_causal_parameters(self) -> Self:
+        """
+        Estimate causal parameters from nuisance predictions.
+
+        Computes score elements, estimates parameters and standard errors, and
+        constructs the DoubleMLFramework. Must be called after :meth:`fit_nuisance_models`.
+
+        Returns
+        -------
+        self : Self
+            The estimator with estimated causal parameters.
+
+        Raises
+        ------
+        ValueError
+            If nuisance models have not been fitted yet.
+        """
+        if self._predictions is None:
+            raise ValueError("Predictions not available. Call fit_nuisance_models() first.")
+
+        # Initialize result arrays
+        self._initialize_result_arrays()
+
+        # Get score elements - subclass implements
         psi_elements = self._get_score_elements()
 
-        # Step 6: Estimate causal parameters - from score mixin
+        # Estimate causal parameters - from score mixin
         self._est_causal_pars_and_se(psi_elements)
 
-        # Step 7: Construct framework
+        # Construct framework
         self._framework = self._construct_framework()
 
         return self
 
-    def draw_sample_splitting(self) -> "DoubleMLScalar":
+    def draw_sample_splitting(self, n_folds: int = 5, n_rep: int = 1) -> Self:
         """
         Draw sample splitting for cross-fitting.
 
         Uses DoubleMLResampling to generate K-fold cross-validation splits
         with multiple repetitions.
 
+        Parameters
+        ----------
+        n_folds : int, optional
+            Number of folds for cross-fitting. Default is 5.
+        n_rep : int, optional
+            Number of repetitions for sample splitting. Default is 1.
+
         Returns
         -------
-        self : DoubleMLScalar
+        self : Self
             The estimator with initialized sample splits.
+
+        Raises
+        ------
+        ValueError
+            If n_folds or n_rep have invalid values.
         """
+        if not isinstance(n_folds, int) or n_folds < 2:
+            raise ValueError(f"n_folds must be an integer >= 2. Got {n_folds}.")
+        if not isinstance(n_rep, int) or n_rep < 1:
+            raise ValueError(f"n_rep must be an integer >= 1. Got {n_rep}.")
+
+        self._n_folds = n_folds
+        self._n_rep = n_rep
+
         # Create resampler
         resampler = DoubleMLResampling(
-            n_folds=self.n_folds,
-            n_rep=self.n_rep,
+            n_folds=n_folds,
+            n_rep=n_rep,
             n_obs=self._n_obs,
         )
 
@@ -269,29 +367,16 @@ def draw_sample_splitting(self) -> "DoubleMLScalar":
 
     # ==================== Private Helper Methods ====================
 
-    def _initialize_arrays(self, store_predictions: bool = True) -> None:
-        """
-        Initialize storage arrays for predictions and results.
-
-        Parameters
-        ----------
-        store_predictions : bool
-            Whether to allocate arrays for storing predictions.
-        """
+    def _initialize_result_arrays(self) -> None:
+        """Initialize storage arrays for causal parameter estimation results."""
         n_obs = self._n_obs
         n_rep = self.n_rep
         n_thetas = 1  # Scalar model estimates single parameter
 
-        # Initialize predictions storage if requested
-        if store_predictions:
-            self._predictions = self._initialize_predictions_dict()
-
-        # Initialize result arrays using framework convention
-        # These will be filled by _est_causal_pars_and_se()
         # Shapes follow framework: (n_thetas, n_rep) for params, (n_obs, n_thetas, n_rep) for scores
-        self._all_thetas = np.zeros((n_thetas, n_rep))  # (n_thetas=1, n_rep)
+        self._all_thetas = np.zeros((n_thetas, n_rep))
         self._all_ses = np.zeros((n_thetas, n_rep))
-        self._psi = np.zeros((n_obs, n_thetas, n_rep))  # (n_obs, n_thetas=1, n_rep)
+        self._psi = np.zeros((n_obs, n_thetas, n_rep))
         self._psi_deriv = np.zeros((n_obs, n_thetas, n_rep))
 
     def _initialize_predictions_dict(self) -> Dict[str, np.ndarray]:
@@ -305,8 +390,6 @@ def _initialize_predictions_dict(self) -> Dict[str, np.ndarray]:
         dict
             Empty dictionary (subclasses should override).
         """
-        # Default: return empty dict
-        # Subclasses should override to create arrays for their specific nuisance components
         return {}
 
     def _construct_framework(self) -> DoubleMLFramework:
@@ -346,15 +429,17 @@ def _nuisance_est(
         test_idx: np.ndarray,
         i_rep: int,
         i_fold: int,
+        external_predictions: Optional[Dict[str, np.ndarray]] = None,
     ) -> None:
         """
         Estimate nuisance parameters for one fold.
 
         This is the main method subclasses must implement. It should:
-        1. Extract training and test data using train_idx and test_idx
-        2. Fit nuisance models (e.g., outcome model, treatment model) on training data
-        3. Predict on test data
-        4. Store predictions in self._predictions
+        1. Check external_predictions for pre-computed values (skip fitting if present)
+        2. Extract training and test data using train_idx and test_idx
+        3. Fit nuisance models on training data
+        4. Predict on test data
+        5. Store predictions in self._predictions
 
         Parameters
         ----------
@@ -366,12 +451,10 @@ def _nuisance_est(
             Repetition index (0 to n_rep-1).
         i_fold : int
             Fold index (0 to n_folds-1).
-
-        Notes
-        -----
-        Subclasses should store predictions in self._predictions, for example:
-            self._predictions['ml_l'][test_idx, i_rep] = l_hat
-            self._predictions['ml_m'][test_idx, i_rep] = m_hat
+        external_predictions : dict or None, optional
+            If provided, a dictionary of external predictions. Learners whose names
+            appear as keys should not be fitted; their predictions are already
+            pre-filled in self._predictions.
         """
         pass
 
@@ -443,7 +526,8 @@ def __str__(self) -> str:
         header = f"{'=' * 20} {class_name} Object {'=' * 20}"
 
         info = f"Score function: {self.score}\n"
-        info += f"Resampling: {self.n_folds}-fold CV, {self.n_rep} repetitions\n"
+        if self._n_folds is not None:
+            info += f"Resampling: {self._n_folds}-fold CV, {self._n_rep} repetitions\n"
 
         if self._framework is not None:
             summary_str = str(self.summary)

From ae2e5be2a1a36b35403aa2a08bfba50160fc86bb Mon Sep 17 00:00:00 2001
From: SvenKlaassen <sven.klaassen@uni-hamburg.de>
Date: Sun, 1 Feb 2026 10:26:48 +0100
Subject: [PATCH 03/38] add plr_scalar implementation

---
 doubleml/plm/plr_scalar.py                    | 151 ++++++++++++++++++
 doubleml/plm/tests/test_plr_scalar.py         |  86 ++++++++++
 .../plm/tests/test_plr_scalar_exceptions.py   |  88 ++++++++++
 .../plm/tests/test_plr_scalar_return_types.py | 123 ++++++++++++++
 doubleml/plm/tests/test_plr_scalar_vs_plr.py  |  82 ++++++++++
 5 files changed, 530 insertions(+)
 create mode 100644 doubleml/plm/plr_scalar.py
 create mode 100644 doubleml/plm/tests/test_plr_scalar.py
 create mode 100644 doubleml/plm/tests/test_plr_scalar_exceptions.py
 create mode 100644 doubleml/plm/tests/test_plr_scalar_return_types.py
 create mode 100644 doubleml/plm/tests/test_plr_scalar_vs_plr.py

diff --git a/doubleml/plm/plr_scalar.py b/doubleml/plm/plr_scalar.py
new file mode 100644
index 00000000..2a5259f6
--- /dev/null
+++ b/doubleml/plm/plr_scalar.py
@@ -0,0 +1,151 @@
+"""
+Partially Linear Regression (PLR) model based on the new DoubleMLScalar hierarchy.
+"""
+
+import warnings
+
+import numpy as np
+from sklearn.base import clone
+
+from ..data.base_data import DoubleMLData
+from ..double_ml_linear_score import LinearScoreMixin
+
+
+class PLR(LinearScoreMixin):
+    """Double machine learning for partially linear regression models.
+
+    Based on the DoubleMLScalar + LinearScoreMixin hierarchy.
+
+    Parameters
+    ----------
+    obj_dml_data : DoubleMLData
+        The data object providing the data and specifying the variables for the causal model.
+    ml_l : estimator
+        A machine learner implementing ``fit()`` and ``predict()`` for the nuisance
+        function :math:`\\ell_0(X) = E[Y|X]`.
+    ml_m : estimator
+        A machine learner implementing ``fit()`` and ``predict()`` for the nuisance
+        function :math:`m_0(X) = E[D|X]`.
+    ml_g : estimator, optional
+        A machine learner implementing ``fit()`` and ``predict()`` for the nuisance
+        function :math:`g_0(X) = E[Y - D\\theta_0|X]`.
+        Only required for ``score='IV-type'``.
+    score : str, optional
+        The score function (``'partialling out'`` or ``'IV-type'``).
+        Default is ``'partialling out'``.
+    """
+
+    def __init__(
+        self,
+        obj_dml_data,
+        ml_l,
+        ml_m,
+        ml_g=None,
+        score="partialling out",
+    ):
+        # Validate data
+        self._check_data(obj_dml_data)
+
+        # Validate score
+        valid_scores = ["partialling out", "IV-type"]
+        if score not in valid_scores:
+            raise ValueError(f"Invalid score '{score}'. Valid scores: {valid_scores}.")
+
+        # Store learners
+        self._learner = {"ml_l": clone(ml_l), "ml_m": clone(ml_m)}
+
+        if ml_g is not None:
+            if score == "IV-type":
+                self._learner["ml_g"] = clone(ml_g)
+            else:
+                warnings.warn(
+                    "A learner ml_g has been provided for score = 'partialling out' but will be ignored. "
+                    "A learner ml_g is not required for estimation."
+                )
+        elif score == "IV-type":
+            warnings.warn("For score = 'IV-type', learners ml_l and ml_g should be specified. Set ml_g = clone(ml_l).")
+            self._learner["ml_g"] = clone(ml_l)
+
+        super().__init__(
+            obj_dml_data=obj_dml_data,
+            score=score,
+        )
+
+    @staticmethod
+    def _check_data(obj_dml_data):
+        if not isinstance(obj_dml_data, DoubleMLData):
+            raise TypeError(
+                f"The data must be of DoubleMLData type. " f"{str(obj_dml_data)} of type {str(type(obj_dml_data))} was passed."
+            )
+        if obj_dml_data.z_cols is not None:
+            raise ValueError(
+                "Incompatible data. " + " and ".join(obj_dml_data.z_cols) + " have been set as instrumental variable(s). "
+                "To fit a partially linear IV regression model use DoubleMLPLIV instead of DoubleMLPLR."
+            )
+
+    def _initialize_predictions_dict(self):
+        n_obs = self._n_obs
+        n_rep = self.n_rep
+        preds = {
+            "ml_l": np.full((n_obs, n_rep), np.nan),
+            "ml_m": np.full((n_obs, n_rep), np.nan),
+        }
+        if "ml_g" in self._learner:
+            preds["ml_g"] = np.full((n_obs, n_rep), np.nan)
+        return preds
+
+    def _nuisance_est(self, train_idx, test_idx, i_rep, i_fold, external_predictions=None):
+        """Cross-fit ml_l/ml_m for one fold (ml_g after the last fold); skip learners in external_predictions."""
+        external = external_predictions or {}
+        x = self._dml_data.x
+        y = self._dml_data.y
+        d = self._dml_data.d
+        x_train, x_test = x[train_idx], x[test_idx]
+
+        # Fit and predict ml_l: E[Y|X] (skipped when pre-filled from external predictions)
+        if "ml_l" not in external:
+            ml_l = clone(self._learner["ml_l"])
+            ml_l.fit(x_train, y[train_idx])
+            self._predictions["ml_l"][test_idx, i_rep] = ml_l.predict(x_test)
+
+        # Fit and predict ml_m: E[D|X] (skipped when pre-filled from external predictions)
+        if "ml_m" not in external:
+            ml_m = clone(self._learner["ml_m"])
+            ml_m.fit(x_train, d[train_idx])
+            self._predictions["ml_m"][test_idx, i_rep] = ml_m.predict(x_test)
+
+        # For IV-type: fit ml_g after last fold when all ml_l/ml_m predictions are available
+        is_last_fold = i_fold == self.n_folds - 1
+        if is_last_fold and "ml_g" in self._learner and "ml_g" in self._predictions and "ml_g" not in external:
+            # Compute initial theta from full cross-fitted predictions
+            l_hat = self._predictions["ml_l"][:, i_rep]
+            m_hat = self._predictions["ml_m"][:, i_rep]
+            psi_a = -(d - m_hat) * (d - m_hat)
+            psi_b = (d - m_hat) * (y - l_hat)
+            theta_initial = -np.nanmean(psi_b) / np.nanmean(psi_a)
+            # Second pass: fit ml_g with cross-fitting across all folds
+            for j_fold in range(self.n_folds):
+                train_j, test_j = self._smpls[i_rep][j_fold]
+                ml_g = clone(self._learner["ml_g"])
+                ml_g.fit(x[train_j], y[train_j] - theta_initial * d[train_j])
+                self._predictions["ml_g"][test_j, i_rep] = ml_g.predict(x[test_j])
+
+    def _get_score_elements(self):
+        y = self._dml_data.y
+        d = self._dml_data.d
+
+        m_hat = self._predictions["ml_m"]  # (n_obs, n_rep)
+        v_hat = d[:, np.newaxis] - m_hat  # (n_obs, n_rep)
+
+        if self.score == "partialling out":
+            l_hat = self._predictions["ml_l"]
+            u_hat = y[:, np.newaxis] - l_hat
+            psi_a = -v_hat * v_hat
+            psi_b = v_hat * u_hat
+        else:
+            assert self.score == "IV-type"
+            g_hat = self._predictions["ml_g"]
+            psi_a = -v_hat * d[:, np.newaxis]
+            psi_b = v_hat * (y[:, np.newaxis] - g_hat)
+
+        return {"psi_a": psi_a, "psi_b": psi_b}
diff --git a/doubleml/plm/tests/test_plr_scalar.py b/doubleml/plm/tests/test_plr_scalar.py
new file mode 100644
index 00000000..581121b2
--- /dev/null
+++ b/doubleml/plm/tests/test_plr_scalar.py
@@ -0,0 +1,86 @@
+import numpy as np
+import pytest
+from sklearn.base import clone
+from sklearn.linear_model import Lasso, LinearRegression
+
+from doubleml.plm.datasets import make_plr_CCDDHNR2018
+from doubleml.plm.plr_scalar import PLR
+
+
+@pytest.fixture(scope="module", params=[LinearRegression(), Lasso(alpha=0.1)])
+def learner(request):
+    return request.param
+
+
+@pytest.fixture(scope="module", params=["IV-type", "partialling out"])
+def score(request):
+    return request.param
+
+
+@pytest.fixture(scope="module")
+def dml_plr_scalar_fixture(learner, score):
+    n_folds = 5
+    theta = 0.5
+
+    ml_l = clone(learner)
+    ml_m = clone(learner)
+    ml_g = clone(learner)
+
+    np.random.seed(3141)
+    obj_dml_data = make_plr_CCDDHNR2018(n_obs=500, dim_x=20, alpha=theta)
+
+    dml_obj = PLR(obj_dml_data, ml_l, ml_m, ml_g, score=score)
+    dml_obj.draw_sample_splitting(n_folds=n_folds)
+    dml_obj.fit()
+
+    res_dict = {
+        "coef": dml_obj.coef[0],
+        "se": dml_obj.se[0],
+        "true_coef": theta,
+    }
+
+    return res_dict
+
+
+@pytest.mark.ci
+def test_dml_plr_scalar_coef(dml_plr_scalar_fixture):
+    coef = dml_plr_scalar_fixture["coef"]
+    se = dml_plr_scalar_fixture["se"]
+    true_coef = dml_plr_scalar_fixture["true_coef"]
+    assert abs(coef - true_coef) <= 3.0 * se
+
+
+@pytest.fixture(scope="module")
+def dml_plr_scalar_rep_fixture():
+    """Test with multiple repetitions."""
+    n_folds = 3
+    n_rep = 3
+    theta = 0.5
+
+    np.random.seed(3141)
+    obj_dml_data = make_plr_CCDDHNR2018(n_obs=500, dim_x=20, alpha=theta)
+
+    dml_obj = PLR(obj_dml_data, LinearRegression(), LinearRegression())
+    dml_obj.draw_sample_splitting(n_folds=n_folds, n_rep=n_rep)
+    dml_obj.fit()
+
+    return {
+        "dml_obj": dml_obj,
+        "true_coef": theta,
+        "n_rep": n_rep,
+    }
+
+
+@pytest.mark.ci
+def test_dml_plr_scalar_rep_coef(dml_plr_scalar_rep_fixture):
+    dml_obj = dml_plr_scalar_rep_fixture["dml_obj"]
+    true_coef = dml_plr_scalar_rep_fixture["true_coef"]
+    assert abs(dml_obj.coef[0] - true_coef) <= 3.0 * dml_obj.se[0]
+
+
+@pytest.mark.ci
+def test_dml_plr_scalar_rep_shapes(dml_plr_scalar_rep_fixture):
+    dml_obj = dml_plr_scalar_rep_fixture["dml_obj"]
+    n_rep = dml_plr_scalar_rep_fixture["n_rep"]
+    assert dml_obj.all_thetas.shape == (1, n_rep)
+    assert dml_obj.all_ses.shape == (1, n_rep)
diff --git a/doubleml/plm/tests/test_plr_scalar_exceptions.py b/doubleml/plm/tests/test_plr_scalar_exceptions.py
new file mode 100644
index 00000000..5797191a
--- /dev/null
+++ b/doubleml/plm/tests/test_plr_scalar_exceptions.py
@@ -0,0 +1,88 @@
+import numpy as np
+import pandas as pd
+import pytest
+from sklearn.linear_model import Lasso
+
+import doubleml as dml
+from doubleml.plm.datasets import make_plr_CCDDHNR2018
+from doubleml.plm.plr_scalar import PLR
+
+np.random.seed(3141)
+obj_dml_data = make_plr_CCDDHNR2018(n_obs=100, dim_x=10, alpha=0.5)
+
+# Create data with instruments for IV check
+df = obj_dml_data.data.copy()
+x_cols = [c for c in df.columns if c.startswith("X")]
+dml_data_iv = dml.DoubleMLData(df, y_col="y", d_cols="d", x_cols=x_cols[:-1], z_cols=x_cols[-1])
+
+ml_l = Lasso(alpha=0.1)
+ml_m = Lasso(alpha=0.1)
+ml_g = Lasso(alpha=0.1)
+
+
+@pytest.mark.ci
+def test_plr_scalar_exception_data():
+    msg = r"The data must be of DoubleMLData type\."
+    with pytest.raises(TypeError, match=msg):
+        PLR(pd.DataFrame(), ml_l, ml_m)
+
+
+@pytest.mark.ci
+def test_plr_scalar_exception_instrument():
+    msg = r"Incompatible data\. .* have been set as instrumental variable\(s\)\."
+    with pytest.raises(ValueError, match=msg):
+        PLR(dml_data_iv, ml_l, ml_m)
+
+
+@pytest.mark.ci
+def test_plr_scalar_exception_score():
+    msg = r"Invalid score 'invalid'\."
+    with pytest.raises(ValueError, match=msg):
+        PLR(obj_dml_data, ml_l, ml_m, score="invalid")
+
+
+@pytest.mark.ci
+def test_plr_scalar_exception_n_folds():
+    dml_obj = PLR(obj_dml_data, ml_l, ml_m)
+    msg = r"n_folds must be an integer >= 2\."
+    with pytest.raises(ValueError, match=msg):
+        dml_obj.draw_sample_splitting(n_folds=1)
+    with pytest.raises(ValueError, match=msg):
+        dml_obj.draw_sample_splitting(n_folds=0)
+
+
+@pytest.mark.ci
+def test_plr_scalar_exception_n_rep():
+    dml_obj = PLR(obj_dml_data, ml_l, ml_m)
+    msg = r"n_rep must be an integer >= 1\."
+    with pytest.raises(ValueError, match=msg):
+        dml_obj.draw_sample_splitting(n_rep=0)
+
+
+@pytest.mark.ci
+def test_plr_scalar_exception_fit_nuisance_without_smpls():
+    dml_obj = PLR(obj_dml_data, ml_l, ml_m)
+    msg = r"Sample splitting has not been initialized\."
+    with pytest.raises(ValueError, match=msg):
+        dml_obj.fit_nuisance_models()
+
+
+@pytest.mark.ci
+def test_plr_scalar_exception_estimate_causal_without_predictions():
+    dml_obj = PLR(obj_dml_data, ml_l, ml_m)
+    dml_obj.draw_sample_splitting()
+    msg = r"Predictions not available\."
+    with pytest.raises(ValueError, match=msg):
+        dml_obj.estimate_causal_parameters()
+
+
+@pytest.mark.ci
+def test_plr_scalar_warning_ml_g_partialling_out():
+    with pytest.warns(UserWarning, match="will be ignored"):
+        PLR(obj_dml_data, ml_l, ml_m, ml_g, score="partialling out")
+
+
+@pytest.mark.ci
+def test_plr_scalar_warning_ml_g_iv_type_missing():
+    with pytest.warns(UserWarning, match="ml_l and ml_g should be specified"):
+        PLR(obj_dml_data, ml_l, ml_m, score="IV-type")
diff --git a/doubleml/plm/tests/test_plr_scalar_return_types.py b/doubleml/plm/tests/test_plr_scalar_return_types.py
new file mode 100644
index 00000000..b6f25a71
--- /dev/null
+++ b/doubleml/plm/tests/test_plr_scalar_return_types.py
@@ -0,0 +1,123 @@
+import numpy as np
+import pandas as pd
+import pytest
+from sklearn.linear_model import LinearRegression
+
+from doubleml.plm.datasets import make_plr_CCDDHNR2018
+from doubleml.plm.plr_scalar import PLR
+
+N_OBS = 200
+N_FOLDS = 3
+N_REP = 2
+N_REP_BOOT = 314
+
+np.random.seed(3141)
+obj_dml_data = make_plr_CCDDHNR2018(n_obs=N_OBS, dim_x=10, alpha=0.5)
+
+
+@pytest.fixture(scope="module")
+def fitted_dml_obj():
+    np.random.seed(3141)
+    dml_obj = PLR(obj_dml_data, LinearRegression(), LinearRegression())
+    dml_obj.draw_sample_splitting(n_folds=N_FOLDS, n_rep=N_REP)
+    dml_obj.fit()
+    dml_obj.bootstrap(n_rep_boot=N_REP_BOOT)
+    return dml_obj
+
+
+@pytest.mark.ci
+def test_coef_type_and_shape(fitted_dml_obj):
+    assert isinstance(fitted_dml_obj.coef, np.ndarray)
+    assert fitted_dml_obj.coef.shape == (1,)
+
+
+@pytest.mark.ci
+def test_se_type_and_shape(fitted_dml_obj):
+    assert isinstance(fitted_dml_obj.se, np.ndarray)
+    assert fitted_dml_obj.se.shape == (1,)
+
+
+@pytest.mark.ci
+def test_all_thetas_shape(fitted_dml_obj):
+    assert isinstance(fitted_dml_obj.all_thetas, np.ndarray)
+    assert fitted_dml_obj.all_thetas.shape == (1, N_REP)
+
+
+@pytest.mark.ci
+def test_all_coef_shape(fitted_dml_obj):
+    assert isinstance(fitted_dml_obj.all_coef, np.ndarray)
+    assert fitted_dml_obj.all_coef.shape == (1, N_REP)
+
+
+@pytest.mark.ci
+def test_all_ses_shape(fitted_dml_obj):
+    assert isinstance(fitted_dml_obj.all_ses, np.ndarray)
+    assert fitted_dml_obj.all_ses.shape == (1, N_REP)
+
+
+@pytest.mark.ci
+def test_summary_type(fitted_dml_obj):
+    assert isinstance(fitted_dml_obj.summary, pd.DataFrame)
+    assert fitted_dml_obj.summary.shape[0] == 1
+
+
+@pytest.mark.ci
+def test_confint_type_and_shape(fitted_dml_obj):
+    ci = fitted_dml_obj.confint()
+    assert isinstance(ci, pd.DataFrame)
+    assert ci.shape == (1, 2)
+
+
+@pytest.mark.ci
+def test_confint_joint(fitted_dml_obj):
+    ci_joint = fitted_dml_obj.confint(joint=True)
+    assert isinstance(ci_joint, pd.DataFrame)
+    assert ci_joint.shape == (1, 2)
+
+
+@pytest.mark.ci
+def test_psi_shape(fitted_dml_obj):
+    assert isinstance(fitted_dml_obj.psi, np.ndarray)
+    assert fitted_dml_obj.psi.shape == (N_OBS, 1, N_REP)
+
+
+@pytest.mark.ci
+def test_predictions_type(fitted_dml_obj):
+    preds = fitted_dml_obj.predictions
+    assert isinstance(preds, dict)
+    assert "ml_l" in preds
+    assert "ml_m" in preds
+    assert preds["ml_l"].shape == (N_OBS, N_REP)
+    assert preds["ml_m"].shape == (N_OBS, N_REP)
+
+
+@pytest.mark.ci
+def test_smpls_type(fitted_dml_obj):
+    smpls = fitted_dml_obj.smpls
+    assert isinstance(smpls, list)
+    assert len(smpls) == N_REP
+    assert len(smpls[0]) == N_FOLDS
+
+
+@pytest.mark.ci
+def test_n_properties(fitted_dml_obj):
+    assert fitted_dml_obj.n_obs == N_OBS
+    assert fitted_dml_obj.n_folds == N_FOLDS
+    assert fitted_dml_obj.n_rep == N_REP
+    assert fitted_dml_obj.score == "partialling out"
+
+
+@pytest.mark.ci
+def test_str_repr(fitted_dml_obj):
+    assert isinstance(str(fitted_dml_obj), str)
+    assert isinstance(repr(fitted_dml_obj), str)
+
+
+@pytest.mark.ci
+def test_before_fit_raises():
+    np.random.seed(3141)
+    dml_obj = PLR(obj_dml_data, LinearRegression(), LinearRegression())
+    with pytest.raises(ValueError, match="framework is not yet initialized"):
+        _ = dml_obj.coef
+    with pytest.raises(ValueError, match="Predictions not available. Call fit"):
+        _ = dml_obj.predictions
diff --git a/doubleml/plm/tests/test_plr_scalar_vs_plr.py b/doubleml/plm/tests/test_plr_scalar_vs_plr.py
new file mode 100644
index 00000000..f87a1af5
--- /dev/null
+++ b/doubleml/plm/tests/test_plr_scalar_vs_plr.py
@@ -0,0 +1,82 @@
+"""Compare PLR against the existing DoubleMLPLR implementation."""
+
+import numpy as np
+import pytest
+from sklearn.linear_model import Lasso, LinearRegression
+
+import doubleml as dml
+from doubleml.plm.datasets import make_plr_CCDDHNR2018
+from doubleml.plm.plr_scalar import PLR
+
+
+@pytest.fixture(scope="module", params=[LinearRegression(), Lasso(alpha=0.1)])
+def learner(request):
+    return request.param
+
+
+@pytest.fixture(scope="module", params=["partialling out", "IV-type"])
+def score(request):
+    return request.param
+
+
+@pytest.fixture(scope="module", params=[1, 3])
+def n_rep(request):
+    return request.param
+
+
+@pytest.fixture(scope="module")
+def comparison_fixture(learner, score, n_rep):
+    n_folds = 5
+    seed = 3141
+
+    np.random.seed(42)
+    obj_dml_data = make_plr_CCDDHNR2018(n_obs=500, dim_x=20, alpha=0.5)
+
+    # Old PLR
+    np.random.seed(seed)
+    dml_old = dml.DoubleMLPLR(
+        obj_dml_data,
+        learner,
+        learner,
+        learner,
+        n_folds=n_folds,
+        n_rep=n_rep,
+        score=score,
+    )
+    dml_old.fit()
+
+    # New PLR
+    np.random.seed(seed)
+    dml_new = PLR(obj_dml_data, learner, learner, learner, score=score)
+    dml_new.draw_sample_splitting(n_folds=n_folds, n_rep=n_rep)
+    dml_new.fit()
+
+    return {"old": dml_old, "new": dml_new}
+
+
+@pytest.mark.ci
+def test_coef_equal(comparison_fixture):
+    old = comparison_fixture["old"]
+    new = comparison_fixture["new"]
+    np.testing.assert_allclose(new.coef, old.coef, rtol=1e-9)
+
+
+@pytest.mark.ci
+def test_se_equal(comparison_fixture):
+    old = comparison_fixture["old"]
+    new = comparison_fixture["new"]
+    np.testing.assert_allclose(new.se, old.se, rtol=1e-9)
+
+
+@pytest.mark.ci
+def test_all_coef_equal(comparison_fixture):
+    old = comparison_fixture["old"]
+    new = comparison_fixture["new"]
+    np.testing.assert_allclose(new.all_thetas, old.all_coef, rtol=1e-9)
+
+
+@pytest.mark.ci
+def test_all_se_equal(comparison_fixture):
+    old = comparison_fixture["old"]
+    new = comparison_fixture["new"]
+    np.testing.assert_allclose(new.all_ses, old.all_se, rtol=1e-9)

From dad5e4c8a6851c316e6f92818b22ef6a4e8cda90 Mon Sep 17 00:00:00 2001
From: SvenKlaassen <sven.klaassen@uni-hamburg.de>
Date: Sun, 1 Feb 2026 15:42:19 +0100
Subject: [PATCH 04/38] fix external predictions for doublemlscalar

---
 doubleml/plm/plr_scalar.py                    |  23 ++--
 .../test_plr_scalar_external_predictions.py   | 103 ++++++++++++++++++
 2 files changed, 118 insertions(+), 8 deletions(-)
 create mode 100644 doubleml/plm/tests/test_plr_scalar_external_predictions.py

diff --git a/doubleml/plm/plr_scalar.py b/doubleml/plm/plr_scalar.py
index 2a5259f6..9a3b181c 100644
--- a/doubleml/plm/plr_scalar.py
+++ b/doubleml/plm/plr_scalar.py
@@ -94,7 +94,7 @@ def _initialize_predictions_dict(self):
             preds["ml_g"] = np.full((n_obs, n_rep), np.nan)
         return preds
 
-    def _nuisance_est(self, train_idx, test_idx, i_rep, i_fold):
+    def _nuisance_est(self, train_idx, test_idx, i_rep, i_fold, external_predictions=None):
         x = self._dml_data.x
         y = self._dml_data.y
         d = self._dml_data.d
@@ -103,19 +103,26 @@ def _nuisance_est(self, train_idx, test_idx, i_rep, i_fold):
         y_train = y[train_idx]
         d_train = d[train_idx]
 
+        # Check which learners have external predictions
+        l_external = external_predictions is not None and "ml_l" in external_predictions
+        m_external = external_predictions is not None and "ml_m" in external_predictions
+        g_external = external_predictions is not None and "ml_g" in external_predictions
+
         # Fit and predict ml_l: E[Y|X]
-        ml_l = clone(self._learner["ml_l"])
-        ml_l.fit(x_train, y_train)
-        self._predictions["ml_l"][test_idx, i_rep] = ml_l.predict(x_test)
+        if not l_external:
+            ml_l = clone(self._learner["ml_l"])
+            ml_l.fit(x_train, y_train)
+            self._predictions["ml_l"][test_idx, i_rep] = ml_l.predict(x_test)
 
         # Fit and predict ml_m: E[D|X]
-        ml_m = clone(self._learner["ml_m"])
-        ml_m.fit(x_train, d_train)
-        self._predictions["ml_m"][test_idx, i_rep] = ml_m.predict(x_test)
+        if not m_external:
+            ml_m = clone(self._learner["ml_m"])
+            ml_m.fit(x_train, d_train)
+            self._predictions["ml_m"][test_idx, i_rep] = ml_m.predict(x_test)
 
         # For IV-type: fit ml_g after last fold when all ml_l/ml_m predictions are available
         is_last_fold = i_fold == self.n_folds - 1
-        if is_last_fold and "ml_g" in self._learner and "ml_g" in self._predictions:
+        if is_last_fold and "ml_g" in self._learner and "ml_g" in self._predictions and not g_external:
             # Compute initial theta from full cross-fitted predictions
             l_hat = self._predictions["ml_l"][:, i_rep]
             m_hat = self._predictions["ml_m"][:, i_rep]
diff --git a/doubleml/plm/tests/test_plr_scalar_external_predictions.py b/doubleml/plm/tests/test_plr_scalar_external_predictions.py
new file mode 100644
index 00000000..da6ac9ce
--- /dev/null
+++ b/doubleml/plm/tests/test_plr_scalar_external_predictions.py
@@ -0,0 +1,103 @@
+import math
+
+import numpy as np
+import pytest
+from sklearn.linear_model import LinearRegression
+
+from doubleml import DoubleMLData
+from doubleml.plm.datasets import make_plr_CCDDHNR2018
+from doubleml.plm.plr_scalar import PLR
+
+
+@pytest.fixture(scope="module", params=["IV-type", "partialling out"])
+def plr_score(request):
+    return request.param
+
+
+@pytest.fixture(scope="module", params=[1, 3])
+def n_rep(request):
+    return request.param
+
+
+@pytest.fixture(scope="module", params=[True, False])
+def set_ml_m_ext(request):
+    return request.param
+
+
+@pytest.fixture(scope="module", params=[True, False])
+def set_ml_l_ext(request):
+    return request.param
+
+
+@pytest.fixture(scope="module", params=[True, False])
+def set_ml_g_ext(request):
+    return request.param
+
+
+@pytest.fixture(scope="module")
+def doubleml_plr_scalar_fixture(plr_score, n_rep, set_ml_m_ext, set_ml_l_ext, set_ml_g_ext):
+    n_folds = 3
+    ext_predictions = {}
+
+    x, y, d = make_plr_CCDDHNR2018(n_obs=500, dim_x=20, alpha=0.5, return_type="np.array")
+
+    np.random.seed(3141)
+    dml_data = DoubleMLData.from_arrays(x=x, y=y, d=d)
+
+    kwargs = {"obj_dml_data": dml_data, "score": plr_score}
+    if plr_score == "IV-type":
+        kwargs["ml_g"] = LinearRegression()
+
+    # Fit reference model
+    dml_plr = PLR(ml_m=LinearRegression(), ml_l=LinearRegression(), **kwargs)
+    np.random.seed(3141)
+    dml_plr.draw_sample_splitting(n_folds=n_folds, n_rep=n_rep)
+    dml_plr.fit()
+
+    # Build external predictions dict
+    if set_ml_m_ext:
+        ext_predictions["ml_m"] = dml_plr.predictions["ml_m"]
+
+    if set_ml_l_ext:
+        ext_predictions["ml_l"] = dml_plr.predictions["ml_l"]
+
+    if plr_score == "IV-type" and set_ml_g_ext:
+        ext_predictions["ml_g"] = dml_plr.predictions["ml_g"]
+        kwargs["ml_g"] = LinearRegression()
+    elif plr_score == "IV-type":
+        kwargs["ml_g"] = LinearRegression()
+
+    # Fit model with external predictions
+    dml_plr_ext = PLR(ml_m=LinearRegression(), ml_l=LinearRegression(), **kwargs)
+    np.random.seed(3141)
+    dml_plr_ext.draw_sample_splitting(n_folds=n_folds, n_rep=n_rep)
+    dml_plr_ext.fit(external_predictions=ext_predictions if ext_predictions else None)
+
+    res_dict = {
+        "coef_normal": dml_plr.coef[0],
+        "coef_ext": dml_plr_ext.coef[0],
+        "se_normal": dml_plr.se[0],
+        "se_ext": dml_plr_ext.se[0],
+    }
+
+    return res_dict
+
+
+@pytest.mark.ci
+def test_doubleml_plr_scalar_coef(doubleml_plr_scalar_fixture):
+    assert math.isclose(
+        doubleml_plr_scalar_fixture["coef_normal"],
+        doubleml_plr_scalar_fixture["coef_ext"],
+        rel_tol=1e-9,
+        abs_tol=1e-4,
+    )
+
+
+@pytest.mark.ci
+def test_doubleml_plr_scalar_se(doubleml_plr_scalar_fixture):
+    assert math.isclose(
+        doubleml_plr_scalar_fixture["se_normal"],
+        doubleml_plr_scalar_fixture["se_ext"],
+        rel_tol=1e-9,
+        abs_tol=1e-4,
+    )

From 5f0a1378599dc256713e71f88890d145931cf36c Mon Sep 17 00:00:00 2001
From: SvenKlaassen <sven.klaassen@uni-hamburg.de>
Date: Sun, 1 Feb 2026 19:07:06 +0100
Subject: [PATCH 05/38] Enhance PLR and DoubleMLScalar with learner management
 and validation

- Introduced learner management in DoubleMLScalar with properties for learner names and instances.
- Added abstract method `set_learners` to enforce learner setting in subclasses.
- Updated PLR to utilize the new learner management system, including validation checks for learner instances.
- Refactored tests to align with the new learner management approach, ensuring proper exception handling and validation.
---
 doubleml/double_ml_scalar.py                  | 84 ++++++++++++++++-
 doubleml/plm/plr_scalar.py                    | 92 +++++++++++--------
 doubleml/plm/tests/test_plr_scalar.py         |  9 +-
 .../plm/tests/test_plr_scalar_exceptions.py   | 46 +++++++---
 .../test_plr_scalar_external_predictions.py   | 27 ++++--
 .../plm/tests/test_plr_scalar_return_types.py | 12 ++-
 doubleml/plm/tests/test_plr_scalar_vs_plr.py  |  3 +-
 doubleml/utils/_checks.py                     | 73 +++++++++++++++
 8 files changed, 278 insertions(+), 68 deletions(-)

diff --git a/doubleml/double_ml_scalar.py b/doubleml/double_ml_scalar.py
index d4b3cf36..c969ec84 100644
--- a/doubleml/double_ml_scalar.py
+++ b/doubleml/double_ml_scalar.py
@@ -76,6 +76,10 @@ def __init__(
 
         self._score = score
 
+        # Learner names (set by subclass) and learner storage (set via set_learners)
+        self._learner_names: List[str] = []
+        self._learners: Dict[str, object] = {}
+
         # Resampling parameters (set via draw_sample_splitting)
         self._n_folds: Optional[int] = None
         self._n_rep: Optional[int] = None
@@ -178,6 +182,50 @@ def smpls(self) -> List:
             raise ValueError("Sample splitting has not been performed. Call draw_sample_splitting() first.")
         return self._smpls
 
+    @property
+    def learner_names(self) -> List[str]:
+        """
+        Names of the required learners for this model.
+
+        Returns
+        -------
+        list of str
+            List of required learner names.
+        """
+        return self._learner_names
+
+    @property
+    def learners(self) -> Dict[str, object]:
+        """
+        The learners used for nuisance estimation.
+
+        Returns
+        -------
+        dict
+            Dictionary mapping learner names to estimator instances.
+        """
+        return self._learners
+
+    @abstractmethod
+    def set_learners(self, **kwargs) -> Self:
+        """
+        Set the learners for nuisance estimation.
+
+        Subclasses must implement this method with explicit keyword arguments
+        for each learner (e.g., ``ml_l``, ``ml_m``, ``ml_g`` for PLR).
+
+        Parameters
+        ----------
+        **kwargs
+            Learner keyword arguments specific to the subclass.
+
+        Returns
+        -------
+        self : Self
+            The estimator with learners set.
+        """
+        pass
+
     # ==================== Concrete fit() Method (Template) ====================
 
     def fit(
@@ -257,6 +305,9 @@ def fit_nuisance_models(
         if self._smpls is None:
             raise ValueError("Sample splitting has not been initialized. Call draw_sample_splitting() first.")
 
+        # Validate that all required learners are available
+        self._check_learners_available(external_predictions)
+
         # Initialize prediction arrays
         self._predictions = self._initialize_predictions_dict()
 
@@ -383,14 +434,41 @@ def _initialize_predictions_dict(self) -> Dict[str, np.ndarray]:
         """
         Initialize dictionary for storing predictions.
 
-        Subclasses can override this to define their specific prediction storage structure.
+        Creates a prediction array of shape ``(n_obs, n_rep)`` for each learner
+        in :attr:`learner_names`, filled with ``NaN``. Subclasses can override
+        this for custom prediction storage.
 
         Returns
         -------
         dict
-            Empty dictionary (subclasses should override).
+            Dictionary mapping learner names to NaN-filled arrays.
+        """
+        n_obs = self._n_obs
+        n_rep = self.n_rep
+        return {name: np.full((n_obs, n_rep), np.nan) for name in self._learner_names}
+
+    def _check_learners_available(self, external_predictions=None) -> None:
         """
-        return {}
+        Validate that all required learners are set or covered by external predictions.
+
+        Parameters
+        ----------
+        external_predictions : dict or None
+            External predictions that may cover some learners.
+
+        Raises
+        ------
+        ValueError
+            If a required learner is missing and not covered by external predictions.
+        """
+        ext_keys = set(external_predictions.keys()) if external_predictions is not None else set()
+
+        for name in self._learner_names:
+            if name not in self._learners and name not in ext_keys:
+                raise ValueError(
+                    f"Learner '{name}' is required but not set and no external predictions provided for it. "
+                    f"Call set_learners({name}=...) or provide external_predictions."
+                )
 
     def _construct_framework(self) -> DoubleMLFramework:
         """
diff --git a/doubleml/plm/plr_scalar.py b/doubleml/plm/plr_scalar.py
index 9a3b181c..b915bd76 100644
--- a/doubleml/plm/plr_scalar.py
+++ b/doubleml/plm/plr_scalar.py
@@ -9,6 +9,7 @@
 
 from ..data.base_data import DoubleMLData
 from ..double_ml_linear_score import LinearScoreMixin
+from ..utils._checks import _check_learner
 
 
 class PLR(LinearScoreMixin):
@@ -20,16 +21,6 @@ class PLR(LinearScoreMixin):
     ----------
     obj_dml_data : DoubleMLData
         The data object providing the data and specifying the variables for the causal model.
-    ml_l : estimator
-        A machine learner implementing ``fit()`` and ``predict()`` for the nuisance
-        function :math:`\\ell_0(X) = E[Y|X]`.
-    ml_m : estimator
-        A machine learner implementing ``fit()`` and ``predict()`` for the nuisance
-        function :math:`m_0(X) = E[D|X]`.
-    ml_g : estimator, optional
-        A machine learner implementing ``fit()`` and ``predict()`` for the nuisance
-        function :math:`g_0(X) = E[Y - D\\theta_0|X]`.
-        Only required for ``score='IV-type'``.
     score : str, optional
         The score function (``'partialling out'`` or ``'IV-type'``).
         Default is ``'partialling out'``.
@@ -38,9 +29,6 @@ class PLR(LinearScoreMixin):
     def __init__(
         self,
         obj_dml_data,
-        ml_l,
-        ml_m,
-        ml_g=None,
         score="partialling out",
     ):
         # Validate data
@@ -51,25 +39,57 @@ def __init__(
         if score not in valid_scores:
             raise ValueError(f"Invalid score '{score}'. Valid scores: {valid_scores}.")
 
-        # Store learners
-        self._learner = {"ml_l": clone(ml_l), "ml_m": clone(ml_m)}
+        super().__init__(
+            obj_dml_data=obj_dml_data,
+            score=score,
+        )
+
+        # Set required learner names based on score
+        self._learner_names = ["ml_l", "ml_m"]
+        if score == "IV-type":
+            self._learner_names.append("ml_g")
+
+    def set_learners(self, ml_l=None, ml_m=None, ml_g=None):
+        """
+        Set the learners for nuisance estimation.
+
+        Parameters
+        ----------
+        ml_l : estimator or None, optional
+            A machine learner implementing ``fit()`` and ``predict()`` for the nuisance
+            function :math:`\\ell_0(X) = E[Y|X]`.
+        ml_m : estimator or None, optional
+            A machine learner implementing ``fit()`` and ``predict()`` for the nuisance
+            function :math:`m_0(X) = E[D|X]`.
+        ml_g : estimator or None, optional
+            A machine learner implementing ``fit()`` and ``predict()`` for the nuisance
+            function :math:`g_0(X) = E[Y - D\\theta_0|X]`.
+            Only required for ``score='IV-type'``.
+
+        Returns
+        -------
+        self : PLR
+            The estimator with learners set.
+        """
+        if ml_l is not None:
+            _check_learner(ml_l, "ml_l", regressor=True, classifier=True)
+            self._learners["ml_l"] = clone(ml_l)
+
+        if ml_m is not None:
+            _check_learner(ml_m, "ml_m", regressor=True, classifier=True)
+            self._learners["ml_m"] = clone(ml_m)
 
         if ml_g is not None:
-            if score == "IV-type":
-                self._learner["ml_g"] = clone(ml_g)
+            if self.score == "IV-type":
+                _check_learner(ml_g, "ml_g", regressor=True, classifier=False)
+                self._learners["ml_g"] = clone(ml_g)
             else:
                 warnings.warn(
                     "A learner ml_g has been provided for score = 'partialling out' but will be ignored. "
                     "A learner ml_g is not required for estimation."
                 )
-        elif score == "IV-type":
-            warnings.warn("For score = 'IV-type', learners ml_l and ml_g should be specified. Set ml_g = clone(ml_l).")
-            self._learner["ml_g"] = clone(ml_l)
 
-        super().__init__(
-            obj_dml_data=obj_dml_data,
-            score=score,
-        )
+        return self
 
     @staticmethod
     def _check_data(obj_dml_data):
@@ -83,17 +103,6 @@ def _check_data(obj_dml_data):
                 "To fit a partially linear IV regression model use DoubleMLPLIV instead of DoubleMLPLR."
             )
 
-    def _initialize_predictions_dict(self):
-        n_obs = self._n_obs
-        n_rep = self.n_rep
-        preds = {
-            "ml_l": np.full((n_obs, n_rep), np.nan),
-            "ml_m": np.full((n_obs, n_rep), np.nan),
-        }
-        if "ml_g" in self._learner:
-            preds["ml_g"] = np.full((n_obs, n_rep), np.nan)
-        return preds
-
     def _nuisance_est(self, train_idx, test_idx, i_rep, i_fold, external_predictions=None):
         x = self._dml_data.x
         y = self._dml_data.y
@@ -110,19 +119,24 @@ def _nuisance_est(self, train_idx, test_idx, i_rep, i_fold, external_predictions
 
         # Fit and predict ml_l: E[Y|X]
         if not l_external:
-            ml_l = clone(self._learner["ml_l"])
+            ml_l = clone(self._learners["ml_l"])
             ml_l.fit(x_train, y_train)
             self._predictions["ml_l"][test_idx, i_rep] = ml_l.predict(x_test)
 
         # Fit and predict ml_m: E[D|X]
         if not m_external:
-            ml_m = clone(self._learner["ml_m"])
+            ml_m = clone(self._learners["ml_m"])
             ml_m.fit(x_train, d_train)
             self._predictions["ml_m"][test_idx, i_rep] = ml_m.predict(x_test)
 
         # For IV-type: fit ml_g after last fold when all ml_l/ml_m predictions are available
         is_last_fold = i_fold == self.n_folds - 1
-        if is_last_fold and "ml_g" in self._learner and "ml_g" in self._predictions and not g_external:
+        if is_last_fold and self.score == "IV-type" and not g_external:
+            # If ml_g not explicitly set, default to clone of ml_l
+            if "ml_g" not in self._learners:
+                warnings.warn("For score = 'IV-type', learners ml_l and ml_g should be specified. Set ml_g = clone(ml_l).")
+                self._learners["ml_g"] = clone(self._learners["ml_l"])
+
             # Compute initial theta from full cross-fitted predictions
             l_hat = self._predictions["ml_l"][:, i_rep]
             m_hat = self._predictions["ml_m"][:, i_rep]
@@ -133,7 +147,7 @@ def _nuisance_est(self, train_idx, test_idx, i_rep, i_fold, external_predictions
             # Second pass: fit ml_g with cross-fitting across all folds
             for j_fold in range(self.n_folds):
                 train_j, test_j = self._smpls[i_rep][j_fold]
-                ml_g = clone(self._learner["ml_g"])
+                ml_g = clone(self._learners["ml_g"])
                 ml_g.fit(x[train_j], y[train_j] - theta_initial * d[train_j])
                 self._predictions["ml_g"][test_j, i_rep] = ml_g.predict(x[test_j])
 
diff --git a/doubleml/plm/tests/test_plr_scalar.py b/doubleml/plm/tests/test_plr_scalar.py
index 581121b2..db9eed6e 100644
--- a/doubleml/plm/tests/test_plr_scalar.py
+++ b/doubleml/plm/tests/test_plr_scalar.py
@@ -29,7 +29,11 @@ def dml_plr_scalar_fixture(learner, score):
     np.random.seed(3141)
     obj_dml_data = make_plr_CCDDHNR2018(n_obs=500, dim_x=20, alpha=theta)
 
-    dml_obj = PLR(obj_dml_data, ml_l, ml_m, ml_g, score=score)
+    dml_obj = PLR(obj_dml_data, score=score)
+    if score == "IV-type":
+        dml_obj.set_learners(ml_l=ml_l, ml_m=ml_m, ml_g=ml_g)
+    else:
+        dml_obj.set_learners(ml_l=ml_l, ml_m=ml_m)
     dml_obj.draw_sample_splitting(n_folds=n_folds)
     dml_obj.fit()
 
@@ -60,7 +64,8 @@ def dml_plr_scalar_rep_fixture():
     np.random.seed(3141)
     obj_dml_data = make_plr_CCDDHNR2018(n_obs=500, dim_x=20, alpha=theta)
 
-    dml_obj = PLR(obj_dml_data, LinearRegression(), LinearRegression())
+    dml_obj = PLR(obj_dml_data)
+    dml_obj.set_learners(ml_l=LinearRegression(), ml_m=LinearRegression())
     dml_obj.draw_sample_splitting(n_folds=n_folds, n_rep=n_rep)
     dml_obj.fit()
 
diff --git a/doubleml/plm/tests/test_plr_scalar_exceptions.py b/doubleml/plm/tests/test_plr_scalar_exceptions.py
index 5797191a..7cc74aac 100644
--- a/doubleml/plm/tests/test_plr_scalar_exceptions.py
+++ b/doubleml/plm/tests/test_plr_scalar_exceptions.py
@@ -24,26 +24,26 @@
 def test_plr_scalar_exception_data():
     msg = r"The data must be of DoubleMLData type\."
     with pytest.raises(TypeError, match=msg):
-        PLR(pd.DataFrame(), ml_l, ml_m)
+        PLR(pd.DataFrame())
 
 
 @pytest.mark.ci
 def test_plr_scalar_exception_instrument():
     msg = r"Incompatible data\. .* have been set as instrumental variable\(s\)\."
     with pytest.raises(ValueError, match=msg):
-        PLR(dml_data_iv, ml_l, ml_m)
+        PLR(dml_data_iv)
 
 
 @pytest.mark.ci
 def test_plr_scalar_exception_score():
     msg = r"Invalid score 'invalid'\."
     with pytest.raises(ValueError, match=msg):
-        PLR(obj_dml_data, ml_l, ml_m, score="invalid")
+        PLR(obj_dml_data, score="invalid")
 
 
 @pytest.mark.ci
 def test_plr_scalar_exception_n_folds():
-    dml_obj = PLR(obj_dml_data, ml_l, ml_m)
+    dml_obj = PLR(obj_dml_data)
     msg = r"n_folds must be an integer >= 2\."
     with pytest.raises(ValueError, match=msg):
         dml_obj.draw_sample_splitting(n_folds=1)
@@ -53,7 +53,7 @@ def test_plr_scalar_exception_n_folds():
 
 @pytest.mark.ci
 def test_plr_scalar_exception_n_rep():
-    dml_obj = PLR(obj_dml_data, ml_l, ml_m)
+    dml_obj = PLR(obj_dml_data)
     msg = r"n_rep must be an integer >= 1\."
     with pytest.raises(ValueError, match=msg):
         dml_obj.draw_sample_splitting(n_rep=0)
@@ -61,7 +61,8 @@ def test_plr_scalar_exception_n_rep():
 
 @pytest.mark.ci
 def test_plr_scalar_exception_fit_nuisance_without_smpls():
-    dml_obj = PLR(obj_dml_data, ml_l, ml_m)
+    dml_obj = PLR(obj_dml_data)
+    dml_obj.set_learners(ml_l=ml_l, ml_m=ml_m)
     msg = r"Sample splitting has not been initialized\."
     with pytest.raises(ValueError, match=msg):
         dml_obj.fit_nuisance_models()
@@ -69,7 +70,8 @@ def test_plr_scalar_exception_fit_nuisance_without_smpls():
 
 @pytest.mark.ci
 def test_plr_scalar_exception_estimate_causal_without_predictions():
-    dml_obj = PLR(obj_dml_data, ml_l, ml_m)
+    dml_obj = PLR(obj_dml_data)
+    dml_obj.set_learners(ml_l=ml_l, ml_m=ml_m)
     dml_obj.draw_sample_splitting()
     msg = r"Predictions not available\."
     with pytest.raises(ValueError, match=msg):
@@ -78,11 +80,33 @@ def test_plr_scalar_exception_estimate_causal_without_predictions():
 
 @pytest.mark.ci
 def test_plr_scalar_warning_ml_g_partialling_out():
+    dml_obj = PLR(obj_dml_data, score="partialling out")
     with pytest.warns(UserWarning, match="will be ignored"):
-        PLR(obj_dml_data, ml_l, ml_m, ml_g, score="partialling out")
+        dml_obj.set_learners(ml_l=ml_l, ml_m=ml_m, ml_g=ml_g)
 
 
 @pytest.mark.ci
-def test_plr_scalar_warning_ml_g_iv_type_missing():
-    with pytest.warns(UserWarning, match="ml_l and ml_g should be specified"):
-        PLR(obj_dml_data, ml_l, ml_m, score="IV-type")
+def test_plr_scalar_exception_missing_learner():
+    dml_obj = PLR(obj_dml_data)
+    dml_obj.draw_sample_splitting()
+    msg = r"Learner 'ml_l' is required but not set"
+    with pytest.raises(ValueError, match=msg):
+        dml_obj.fit()
+
+
+@pytest.mark.ci
+def test_plr_scalar_exception_missing_learner_partial():
+    dml_obj = PLR(obj_dml_data)
+    dml_obj.set_learners(ml_l=ml_l)
+    dml_obj.draw_sample_splitting()
+    msg = r"Learner 'ml_m' is required but not set"
+    with pytest.raises(ValueError, match=msg):
+        dml_obj.fit()
+
+
+@pytest.mark.ci
+def test_plr_scalar_exception_invalid_learner():
+    dml_obj = PLR(obj_dml_data)
+    msg = r"Invalid learner provided for ml_l: provide an instance"
+    with pytest.raises(TypeError, match=msg):
+        dml_obj.set_learners(ml_l=Lasso)  # class instead of instance
diff --git a/doubleml/plm/tests/test_plr_scalar_external_predictions.py b/doubleml/plm/tests/test_plr_scalar_external_predictions.py
index da6ac9ce..693d3b73 100644
--- a/doubleml/plm/tests/test_plr_scalar_external_predictions.py
+++ b/doubleml/plm/tests/test_plr_scalar_external_predictions.py
@@ -44,12 +44,12 @@ def doubleml_plr_scalar_fixture(plr_score, n_rep, set_ml_m_ext, set_ml_l_ext, se
     np.random.seed(3141)
     dml_data = DoubleMLData.from_arrays(x=x, y=y, d=d)
 
-    kwargs = {"obj_dml_data": dml_data, "score": plr_score}
-    if plr_score == "IV-type":
-        kwargs["ml_g"] = LinearRegression()
-
     # Fit reference model
-    dml_plr = PLR(ml_m=LinearRegression(), ml_l=LinearRegression(), **kwargs)
+    dml_plr = PLR(dml_data, score=plr_score)
+    if plr_score == "IV-type":
+        dml_plr.set_learners(ml_l=LinearRegression(), ml_m=LinearRegression(), ml_g=LinearRegression())
+    else:
+        dml_plr.set_learners(ml_l=LinearRegression(), ml_m=LinearRegression())
     np.random.seed(3141)
     dml_plr.draw_sample_splitting(n_folds=n_folds, n_rep=n_rep)
     dml_plr.fit()
@@ -63,12 +63,19 @@ def doubleml_plr_scalar_fixture(plr_score, n_rep, set_ml_m_ext, set_ml_l_ext, se
 
     if plr_score == "IV-type" and set_ml_g_ext:
         ext_predictions["ml_g"] = dml_plr.predictions["ml_g"]
-        kwargs["ml_g"] = LinearRegression()
-    elif plr_score == "IV-type":
-        kwargs["ml_g"] = LinearRegression()
 
-    # Fit model with external predictions
-    dml_plr_ext = PLR(ml_m=LinearRegression(), ml_l=LinearRegression(), **kwargs)
+    # Fit model with external predictions — only set learners that are needed
+    dml_plr_ext = PLR(dml_data, score=plr_score)
+    learner_kwargs = {}
+    if not set_ml_l_ext:
+        learner_kwargs["ml_l"] = LinearRegression()
+    if not set_ml_m_ext:
+        learner_kwargs["ml_m"] = LinearRegression()
+    if plr_score == "IV-type" and not set_ml_g_ext:
+        learner_kwargs["ml_g"] = LinearRegression()
+    if learner_kwargs:
+        dml_plr_ext.set_learners(**learner_kwargs)
+
     np.random.seed(3141)
     dml_plr_ext.draw_sample_splitting(n_folds=n_folds, n_rep=n_rep)
     dml_plr_ext.fit(external_predictions=ext_predictions if ext_predictions else None)
diff --git a/doubleml/plm/tests/test_plr_scalar_return_types.py b/doubleml/plm/tests/test_plr_scalar_return_types.py
index b6f25a71..63e06cdd 100644
--- a/doubleml/plm/tests/test_plr_scalar_return_types.py
+++ b/doubleml/plm/tests/test_plr_scalar_return_types.py
@@ -18,7 +18,8 @@
 @pytest.fixture(scope="module")
 def fitted_dml_obj():
     np.random.seed(3141)
-    dml_obj = PLR(obj_dml_data, LinearRegression(), LinearRegression())
+    dml_obj = PLR(obj_dml_data)
+    dml_obj.set_learners(ml_l=LinearRegression(), ml_m=LinearRegression())
     dml_obj.draw_sample_splitting(n_folds=N_FOLDS, n_rep=N_REP)
     dml_obj.fit()
     dml_obj.bootstrap(n_rep_boot=N_REP_BOOT)
@@ -107,6 +108,13 @@ def test_n_properties(fitted_dml_obj):
     assert fitted_dml_obj.score == "partialling out"
 
 
+@pytest.mark.ci
+def test_learner_names(fitted_dml_obj):
+    assert fitted_dml_obj.learner_names == ["ml_l", "ml_m"]
+    assert "ml_l" in fitted_dml_obj.learners
+    assert "ml_m" in fitted_dml_obj.learners
+
+
 @pytest.mark.ci
 def test_str_repr(fitted_dml_obj):
     assert isinstance(str(fitted_dml_obj), str)
@@ -116,7 +124,7 @@ def test_str_repr(fitted_dml_obj):
 @pytest.mark.ci
 def test_before_fit_raises():
     np.random.seed(3141)
-    dml_obj = PLR(obj_dml_data, LinearRegression(), LinearRegression())
+    dml_obj = PLR(obj_dml_data)
     with pytest.raises(ValueError, match="framework is not yet initialized"):
         _ = dml_obj.coef
     with pytest.raises(ValueError, match="Predictions not available. Call fit"):
diff --git a/doubleml/plm/tests/test_plr_scalar_vs_plr.py b/doubleml/plm/tests/test_plr_scalar_vs_plr.py
index f87a1af5..15453c12 100644
--- a/doubleml/plm/tests/test_plr_scalar_vs_plr.py
+++ b/doubleml/plm/tests/test_plr_scalar_vs_plr.py
@@ -47,7 +47,8 @@ def comparison_fixture(learner, score, n_rep):
 
     # New PLR
     np.random.seed(seed)
-    dml_new = PLR(obj_dml_data, learner, learner, learner, score=score)
+    dml_new = PLR(obj_dml_data, score=score)
+    dml_new.set_learners(ml_l=learner, ml_m=learner, ml_g=learner)
     dml_new.draw_sample_splitting(n_folds=n_folds, n_rep=n_rep)
     dml_new.fit()
 
diff --git a/doubleml/utils/_checks.py b/doubleml/utils/_checks.py
index edc828fb..7db749dc 100644
--- a/doubleml/utils/_checks.py
+++ b/doubleml/utils/_checks.py
@@ -1,6 +1,7 @@
 import warnings
 
 import numpy as np
+from sklearn.base import is_classifier, is_regressor
 from sklearn.utils.multiclass import type_of_target
 from sklearn.utils.validation import has_fit_parameter
 
@@ -513,6 +514,78 @@ def _check_sample_splitting(all_smpls, all_smpls_cluster, dml_data, is_cluster_d
     return smpls, smpls_cluster, n_rep, n_folds
 
 
+def _check_learner(learner, learner_name, regressor=True, classifier=True):
+    """
+    Validate that a learner has the required interface for DoubleML estimation.
+
+    Parameters
+    ----------
+    learner : object
+        The learner to validate.
+    learner_name : str
+        Name of the learner (for error messages).
+    regressor : bool
+        Whether regressors are accepted. Default is ``True``.
+    classifier : bool
+        Whether classifiers are accepted. Default is ``True``.
+
+    Returns
+    -------
+    bool
+        ``True`` if the learner is treated as a classifier (``predict_proba`` is required), ``False`` otherwise.
+
+    Raises
+    ------
+    TypeError
+        If the learner is a class instead of an instance, or lacks
+        required methods (fit, set_params, get_params, predict/predict_proba).
+    """
+    err_msg_prefix = f"Invalid learner provided for {learner_name}: "
+    warn_msg_prefix = f"Learner provided for {learner_name} is probably invalid: "
+
+    if isinstance(learner, type):
+        raise TypeError(err_msg_prefix + "provide an instance of a learner instead of a class.")
+
+    if not hasattr(learner, "fit"):
+        raise TypeError(err_msg_prefix + f"{str(learner)} has no method .fit().")
+    if not hasattr(learner, "set_params"):
+        raise TypeError(err_msg_prefix + f"{str(learner)} has no method .set_params().")
+    if not hasattr(learner, "get_params"):
+        raise TypeError(err_msg_prefix + f"{str(learner)} has no method .get_params().")
+
+    if regressor & classifier:
+        if is_classifier(learner):
+            learner_is_classifier = True
+        elif is_regressor(learner):
+            learner_is_classifier = False
+        else:
+            warnings.warn(
+                warn_msg_prefix
+                + f"{str(learner)} is (probably) neither a regressor nor a classifier. "
+                + "Method predict is used for prediction."
+            )
+            learner_is_classifier = False
+    elif classifier:
+        if not is_classifier(learner):
+            warnings.warn(warn_msg_prefix + f"{str(learner)} is (probably) no classifier.")
+        learner_is_classifier = True
+    else:
+        assert regressor  # classifier, regressor or both must be True
+        if not is_regressor(learner):
+            warnings.warn(warn_msg_prefix + f"{str(learner)} is (probably) no regressor.")
+        learner_is_classifier = False
+
+    # check existence of the prediction method
+    if learner_is_classifier:
+        if not hasattr(learner, "predict_proba"):
+            raise TypeError(err_msg_prefix + f"{str(learner)} has no method .predict_proba().")
+    else:
+        if not hasattr(learner, "predict"):
+            raise TypeError(err_msg_prefix + f"{str(learner)} has no method .predict().")
+
+    return learner_is_classifier
+
+
 def _check_supports_sample_weights(learner, learner_name):
     if not has_fit_parameter(learner, "sample_weight"):
         raise ValueError(

From 384beba220bc3a6d4bb7395d79ea98f62ecb529a Mon Sep 17 00:00:00 2001
From: SvenKlaassen <sven.klaassen@uni-hamburg.de>
Date: Sun, 1 Feb 2026 19:36:51 +0100
Subject: [PATCH 06/38] Add architecture documentation for DoubleMLScalar and
 class hierarchy

---
 doc/diagrams/architecture.md | 177 +++++++++++++++++++++++++++++++++++
 1 file changed, 177 insertions(+)
 create mode 100644 doc/diagrams/architecture.md

diff --git a/doc/diagrams/architecture.md b/doc/diagrams/architecture.md
new file mode 100644
index 00000000..4e531e9d
--- /dev/null
+++ b/doc/diagrams/architecture.md
@@ -0,0 +1,177 @@
+# DoubleML Scalar Architecture
+
+## Class Hierarchy
+
+```
+DoubleMLBase (ABC)
+│   Data storage, framework delegation (coef, se, summary, confint, bootstrap, ...)
+│
+└── DoubleMLScalar (ABC)
+    │   Single-parameter estimation: fit(), draw_sample_splitting(),
+    │   fit_nuisance_models(), estimate_causal_parameters()
+    │   Learner management: set_learners(), _check_learners_available()
+    │   Prediction storage: _initialize_predictions_dict()
+    │
+    ├── LinearScoreMixin
+    │   │   Implements _est_causal_pars_and_se() for linear scores
+    │   │   θ̂ = -E[ψ_b] / E[ψ_a]
+    │   │
+    │   ├── PLR          (partialling out, IV-type)
+    │   ├── PLIV         (planned)
+    │   ├── IRM          (planned)
+    │   └── DID          (planned)
+    │
+    └── NonLinearScoreMixin (planned)
+        │   Implements _est_causal_pars_and_se() via numerical root-finding
+        │
+        └── ...
+```
+
+## UML Class Diagram
+
+```
+┌─────────────────────────────────────────┐
+│          DoubleMLBase (ABC)             │
+├─────────────────────────────────────────┤
+│ - _dml_data: DoubleMLBaseData           │
+│ - _n_obs: int                           │
+│ - _framework: DoubleMLFramework | None  │
+├─────────────────────────────────────────┤
+│ + framework: DoubleMLFramework          │
+│ + thetas / coef: np.ndarray             │
+│ + all_thetas / all_coef: np.ndarray     │
+│ + se: np.ndarray                        │
+│ + all_ses: np.ndarray                   │
+│ + summary: pd.DataFrame                 │
+│ + psi: np.ndarray                       │
+│ + n_obs: int                            │
+│ + confint()                             │
+│ + bootstrap()                           │
+│ + p_adjust()                            │
+│ + sensitivity_analysis()                │
+│ «abstract» + fit()                      │
+│ «abstract» + n_rep: int                 │
+└─────────────────┬───────────────────────┘
+                  │ inherits
+┌─────────────────▼───────────────────────┐
+│         DoubleMLScalar (ABC)            │
+├─────────────────────────────────────────┤
+│ - _score: str                           │
+│ - _learner_names: List[str]             │
+│ - _learners: Dict[str, object]          │
+│ - _n_folds: int | None                  │
+│ - _n_rep: int | None                    │
+│ - _smpls: List | None                   │
+│ - _predictions: Dict | None             │
+│ - _all_thetas: np.ndarray | None        │
+│ - _all_ses: np.ndarray | None           │
+│ - _psi: np.ndarray | None               │
+│ - _psi_deriv: np.ndarray | None         │
+│ - _var_scaling_factors: np.ndarray|None │
+├─────────────────────────────────────────┤
+│ + score: str                            │
+│ + n_folds: int                          │
+│ + n_rep: int                            │
+│ + predictions: Dict                     │
+│ + smpls: List                           │
+│ + learner_names: List[str]              │
+│ + learners: Dict[str, object]           │
+│ + fit(n_folds, n_rep, external_preds)   │
+│ + fit_nuisance_models(external_preds)   │
+│ + estimate_causal_parameters()          │
+│ + draw_sample_splitting(n_folds, n_rep) │
+│ + _initialize_predictions_dict()        │
+│ + _check_learners_available()           │
+│ + _initialize_result_arrays()           │
+│ + _construct_framework()                │
+│ «abstract» + set_learners()             │
+│ «abstract» + _nuisance_est()            │
+│ «abstract» + _get_score_elements()      │
+│ «abstract» + _est_causal_pars_and_se()  │
+└──────────┬──────────────────────────────┘
+           │ inherits
+┌──────────▼──────────────────────────────┐
+│       LinearScoreMixin                  │
+├─────────────────────────────────────────┤
+│ (no additional state)                   │
+├─────────────────────────────────────────┤
+│ + _est_causal_pars_and_se(psi_elements) │
+│   → closed-form: θ̂ = -E[ψ_b]/E[ψ_a]  │
+│ + _compute_score(psi_elements, coef)    │
+│ + _score_element_names() → [psi_a,b]   │
+└──────────┬──────────────────────────────┘
+           │ inherits
+┌──────────▼──────────────────────────────┐
+│              PLR                        │
+├─────────────────────────────────────────┤
+│ _learner_names = [ml_l, ml_m(, ml_g)]  │
+├─────────────────────────────────────────┤
+│ + __init__(obj_dml_data, score)         │
+│ + set_learners(ml_l, ml_m, ml_g)       │
+│ + _check_data()                         │
+│ + _nuisance_est(train, test, i_rep, ..) │
+│ + _get_score_elements() → {psi_a,psi_b}│
+└─────────────────────────────────────────┘
+```
+
+## Method Resolution & Workflow
+
+The `fit()` call follows the template method pattern:
+
+```
+PLR.fit()
+  │
+  ├─ DoubleMLScalar.draw_sample_splitting()    ← if not already done
+  │    └─ DoubleMLResampling.split_samples()
+  │
+  ├─ DoubleMLScalar.fit_nuisance_models()
+  │    ├─ DoubleMLScalar._check_learners_available()
+  │    ├─ DoubleMLScalar._initialize_predictions_dict()  ← uses _learner_names
+  │    └─ loop(n_rep × n_folds):
+  │         └─ PLR._nuisance_est()                       ← subclass implements
+  │
+  └─ DoubleMLScalar.estimate_causal_parameters()
+       ├─ DoubleMLScalar._initialize_result_arrays()
+       ├─ PLR._get_score_elements()                      ← subclass implements
+       ├─ LinearScoreMixin._est_causal_pars_and_se()     ← mixin implements
+       └─ DoubleMLScalar._construct_framework()
+            └─ DoubleMLFramework(...)
+```
+
+## Typical User Workflow
+
+```python
+# 1. Define model (data + score)
+plr = PLR(obj_dml_data, score="partialling out")
+
+# 2. Set learners
+plr.set_learners(ml_l=RandomForestRegressor(), ml_m=RandomForestRegressor())
+
+# 3. Draw sample splitting
+plr.draw_sample_splitting(n_folds=5, n_rep=1)
+
+# 4. Fit
+plr.fit()
+
+# 5. Results (delegated to DoubleMLFramework via DoubleMLBase)
+print(plr.summary)
+plr.confint()
+plr.bootstrap()
+```
+
+## What Each Layer Provides
+
+| Layer | Responsibilities |
+|---|---|
+| **DoubleMLBase** | Data storage, framework delegation (coef, se, summary, confint, bootstrap, p_adjust, sensitivity_analysis) |
+| **DoubleMLScalar** | Single-parameter fit orchestration, sample splitting, learner management (`_learner_names`, `_learners`, `set_learners`, `_check_learners_available`), prediction storage, result array initialization, framework construction |
+| **LinearScoreMixin** | Closed-form parameter estimation for linear scores: `θ̂ = -E[ψ_b]/E[ψ_a]`, SE computation, influence function |
+| **PLR** | PLR-specific: data validation, learner names (`ml_l`, `ml_m`, `ml_g`), nuisance estimation logic, score element computation |
+
+## Key Design Decisions
+
+- **Learners separated from constructor**: `__init__` takes only data + score; learners are set via `set_learners()` with explicit kwargs per subclass
+- **`_learner_names` as single source of truth**: Drives `_initialize_predictions_dict()` and `_check_learners_available()` — subclasses just set the list
+- **Resampling separated from constructor**: `draw_sample_splitting()` is a separate step, can be called independently
+- **External predictions**: Passed to `fit()` / `fit_nuisance_models()`, validated against `_learner_names`, pre-filled before cross-fitting loop
+- **Template method pattern**: `fit()` orchestrates; subclasses implement `_nuisance_est()` and `_get_score_elements()`; mixin implements `_est_causal_pars_and_se()`

From 838d0ca742f7122fc326d17db2da648eb2027279 Mon Sep 17 00:00:00 2001
From: SvenKlaassen <sven.klaassen@uni-hamburg.de>
Date: Tue, 3 Feb 2026 18:47:55 +0100
Subject: [PATCH 07/38] Add code simplifier and technical debt finder
 documentation

---
 .claude/CLAUDE.md                       | 222 ++++++++++++++++++++++++
 .claude/skills/code-simplifier/SKILL.md | 178 +++++++++++++++++++
 .claude/skills/techdebt/SKILL.md        | 125 +++++++++++++
 3 files changed, 525 insertions(+)
 create mode 100644 .claude/CLAUDE.md
 create mode 100644 .claude/skills/code-simplifier/SKILL.md
 create mode 100644 .claude/skills/techdebt/SKILL.md

diff --git a/.claude/CLAUDE.md b/.claude/CLAUDE.md
new file mode 100644
index 00000000..6c302c11
--- /dev/null
+++ b/.claude/CLAUDE.md
@@ -0,0 +1,222 @@
+# DoubleML for Python - Claude Code Memory
+
+## Project Purpose
+
+DoubleML is a Python package implementing Double/Debiased Machine Learning (DML) methods for causal inference. The package provides:
+- Partially Linear Models (PLR, PLIV, PLPR, LPLR)
+- Interactive Regression Models (IRM, IIVM, APO, QTE, CVAR, SSM)
+- Difference-in-Differences estimators (DID, DIDCSBinary, DIDMulti)
+- Regression Discontinuity Design (RDD)
+
+**Documentation**: https://docs.doubleml.org
+
+## Coding Standards
+
+### Python
+- **Version**: Python 3.11+ (supports 3.11, 3.12, 3.13)
+- **Formatter**: black with line-length 127
+- **Linter**: ruff (rules: E, F, W, I)
+- **Type Checker**: mypy with `disallow_untyped_defs = true`
+- **Type hints**: Required for all functions
+- **Docstrings**: NumPy-style (see example below)
+- **Max line length**: 127 characters
+
+### NumPy Docstring Style
+```python
+def example_function(param1: int, param2: str) -> bool:
+    """
+    Short description of the function.
+
+    Parameters
+    ----------
+    param1 : int
+        Description of param1.
+    param2 : str
+        Description of param2.
+
+    Returns
+    -------
+    bool
+        Description of return value.
+
+    Raises
+    ------
+    ValueError
+        If param1 is negative.
+    """
+```
+
+### Code Quality Commands
+```bash
+# Format code
+black .
+
+# Lint code
+ruff check .
+
+# Fix linting issues
+ruff check --fix .
+
+# Type check
+mypy doubleml
+```
+
+### Pre-commit Hooks
+Pre-commit is configured with:
+- File format checks (yaml, toml)
+- Debug statement detection
+- Large file checks
+- Trailing whitespace and line ending fixes
+- black formatting
+- ruff linting with auto-fix
+
+Run pre-commit manually: `pre-commit run --all-files`
+
+## Architecture Overview
+
+### Class Hierarchy
+```
+DoubleMLBase (ABC)
+└─> DoubleMLScalar (ABC) - single-parameter models
+    ├─> LinearScoreMixin - closed-form solver
+    │   ├─> DoubleMLPLR
+    │   ├─> DoubleMLIRM
+    │   ├─> DoubleMLPLIV
+    │   ├─> DoubleMLIIVM
+    │   └─> DoubleML DID variants
+    └─> NonLinearScoreMixin - numerical solver (planned)
+
+DoubleML - multi-parameter estimation (extends DoubleMLScalar)
+```
+
+### Key Design Patterns
+- **Template Method**: `fit()` orchestrates; subclasses implement abstract methods
+- **Mixin Pattern**: LinearScoreMixin provides closed-form θ = -E[ψ_b]/E[ψ_a]
+- **Delegation**: DoubleMLBase delegates inference to DoubleMLFramework
+
+### Core Files
+| File | Purpose |
+|------|---------|
+| `doubleml/double_ml_base.py` | Abstract base with properties (coef, se, summary) and inference methods |
+| `doubleml/double_ml_scalar.py` | Single-parameter estimation orchestrator |
+| `doubleml/double_ml.py` | Multi-parameter estimation with sample splitting |
+| `doubleml/double_ml_framework.py` | Statistical inference (confint, bootstrap, sensitivity) |
+| `doubleml/double_ml_linear_score.py` | Linear score mixin |
+
+### Package Structure
+```
+doubleml/
+├── data/          # Data containers (DoubleMLData, DoubleMLDIDData, etc.)
+├── plm/           # Partially Linear Models (PLR, PLIV, PLPR, LPLR)
+├── irm/           # Interactive Regression Models (IRM, IIVM, APO, QTE, etc.)
+├── did/           # Difference-in-Differences estimators
+├── rdd/           # Regression Discontinuity Design
+├── utils/         # Helpers (_checks, _estimation, resampling, tuning)
+└── tests/         # Main test directory
+```
+
+## Testing
+
+### Run Tests
+```bash
+# Run all tests
+pytest
+
+# Run with coverage
+pytest --cov
+
+# Run specific marker (CI tests)
+pytest -m ci
+
+# Run specific test file
+pytest doubleml/tests/test_framework.py
+
+# Run tests for a specific module
+pytest doubleml/plm/tests/
+```
+
+### Test Markers
+- `ci`: Continuous integration tests for GitHub Actions
+- `ci_rdd`: RDD-specific CI tests
+
+### Test Organization
+- Each module (plm, irm, did) has its own `tests/` subdirectory
+- Test utilities in `doubleml/tests/_utils*.py`
+- Manual computation helpers verify results independently
+
+## Git Workflow
+
+### Branches
+- `main`: Main development branch
+- Feature branches for new work
+
+### Commit Format
+Use Conventional Commits:
+- `feat:` new feature
+- `fix:` bug fix
+- `docs:` documentation
+- `refactor:` code refactoring
+- `test:` adding tests
+- `chore:` maintenance
+
+## Key Dependencies
+
+### Core
+- numpy>=2.0.0, pandas>=2.0.0, scipy>=1.7.0
+- scikit-learn>=1.6.0, statsmodels>=0.14.0
+
+### ML/Tuning
+- optuna>=4.6.0 (hyperparameter tuning)
+- joblib>=1.2.0 (parallelization)
+
+### Visualization
+- matplotlib>=3.9.0, seaborn>=0.13, plotly>=5.0.0
+
+### Development
+- pytest>=8.3.0, pytest-cov>=6.0.0
+- black>=25.1.0, ruff>=0.11.1, mypy>=1.18.0
+- xgboost>=2.1.0, lightgbm>=4.6.0 (for testing)
+
+## Known Pitfalls
+
+### Type Annotations
+- MyPy is strict: `disallow_untyped_defs = true`
+- All functions need full type hints including return types
+- Use `from __future__ import annotations` for forward references
+
+### Learner Validation
+- Learners must be scikit-learn compatible (fit/predict interface)
+- Use `_check_learner()` from `doubleml/utils/_checks.py` for validation
+- Classifiers need `predict_proba()` for propensity scores
+
+### Sample Splitting
+- Cross-fitting uses `DoubleMLResampling` from `doubleml/utils/resampling.py`
+- Default is 5-fold cross-fitting with 1 repetition
+- Cluster-robust resampling available for clustered data
+
+### Score Functions
+- Linear scores have the form ψ = θ·ψ_a + ψ_b and are solved in closed form: θ = -E[ψ_b]/E[ψ_a]
+- Custom scores can be passed as callables
+- Score elements: `psi_a` (derivative of the score w.r.t. θ), `psi_b` (moment term that does not depend on θ)
+
+### External Predictions
+- Models accept external predictions via the `external_predictions` argument of `fit()` / `fit_nuisance_models()`
+- Predictions must match sample splitting structure
+
+## Verification
+
+Before completing any task:
+1. Run `ruff check .` to check for linting issues
+2. Run `mypy doubleml` for type checking
+3. Run relevant tests: `pytest doubleml/path/to/tests/`
+4. Format code: `black .`
+
+## Useful Links
+
+- **Documentation**: https://docs.doubleml.org
+- **Source**: https://github.com/DoubleML/doubleml-for-py
+- **Bug Tracker**: https://github.com/DoubleML/doubleml-for-py/issues
+- **Architecture Docs**: [doc/diagrams/architecture.md](../doc/diagrams/architecture.md)
+
+---
+*Update this file when Claude makes mistakes to prevent future issues.*
diff --git a/.claude/skills/code-simplifier/SKILL.md b/.claude/skills/code-simplifier/SKILL.md
new file mode 100644
index 00000000..583b70ae
--- /dev/null
+++ b/.claude/skills/code-simplifier/SKILL.md
@@ -0,0 +1,178 @@
+---
+name: code-simplifier
+description: Simplify and clean up DoubleML code after changes. Reduces complexity, improves readability, ensures NumPy-style docstrings and type hints.
+---
+
+# Code Simplifier for DoubleML
+
+Clean up and simplify code after making changes.
+
+## When to Use
+
+Run after completing a feature or fix to ensure code is clean, readable, and follows DoubleML patterns.
+
+## Simplification Goals
+
+### Reduce Complexity
+- Break long functions into smaller, focused ones (target: <50 lines)
+- Reduce nesting depth (max 3 levels)
+- Simplify complex conditionals
+- Extract magic numbers to named constants (e.g., `DEFAULT_N_FOLDS = 5`)
+
+### Improve Readability
+- Use descriptive variable and function names
+- Add clarifying comments for non-obvious logic
+- Ensure consistent formatting (127 char line limit)
+- Remove unnecessary comments
+
+### Apply Pythonic Patterns
+- Use list/dict/set comprehensions where appropriate
+- Use `with` statements for resource management
+- Use `enumerate()` instead of manual indexing
+- Use `zip()` for parallel iteration
+- Use f-strings for formatting
+- Use `pathlib` for file paths
+- Use `is None` / `is not None` instead of `== None`
+
+### DoubleML-Specific Patterns
+- Use `clone()` for sklearn learners instead of direct copy
+- Use `_check_learner()` for learner validation
+- Use `_check_score()` for score function validation
+- Consistent `psi_a`/`psi_b` naming for score elements
+- Use `DoubleMLResampling` for sample splitting logic
+- Prefer numpy operations over Python loops for arrays
+
+### Type Hints (Python 3.11+)
+- Use built-in generics: `list[int]` not `typing.List[int]`
+- Use `X | None` instead of `Optional[X]`
+- Use `X | Y` instead of `Union[X, Y]`
+- Add `from __future__ import annotations` for forward references
+- Ensure all public functions have complete type hints
+
+### NumPy-Style Docstrings
+- Ensure `Parameters` section lists all arguments
+- Ensure `Returns` section describes return value
+- Add `Raises` section for exceptions
+- Use `:class:` references for DoubleML types
+
+### Clean Up
+- Remove unused imports
+- Remove unused variables
+- Remove commented-out code
+- Remove redundant code paths
+- Consolidate duplicate logic
+
+## Workflow
+
+1. **Identify Changed Files**
+   ```bash
+   git diff --name-only HEAD~1  # Recent changes
+   git status --short           # Uncommitted changes
+   ```
+
+2. **Analyze Each File**
+   - Check for simplification opportunities
+   - Prioritize high-impact improvements
+
+3. **Apply Simplifications**
+   - Make incremental changes
+   - Preserve original behavior
+   - Run tests after each change
+
+4. **Format and Lint**
+   ```bash
+   black .
+   ruff check --fix .
+   ```
+
+5. **Type Check**
+   ```bash
+   mypy doubleml
+   ```
+
+6. **Verify**
+   ```bash
+   pytest -m ci
+   ```
+
+## Arguments
+
+Optionally specify files or directories to simplify.
+
+Usage:
+- `/code-simplifier` - Simplify recently changed files
+- `/code-simplifier doubleml/plm/plr.py` - Simplify specific file
+- `/code-simplifier doubleml/utils/` - Simplify entire directory
+
+## Example Transformations
+
+### Loop to Comprehension
+```python
+# Before
+result = []
+for i in range(len(items)):
+    if items[i].is_valid == True:
+        result.append(items[i].value)
+
+# After
+result = [item.value for item in items if item.is_valid]
+```
+
+### Flatten Nesting
+```python
+# Before
+if x != None:
+    if y != None:
+        if z != None:
+            process(x, y, z)
+
+# After
+if all(v is not None for v in (x, y, z)):
+    process(x, y, z)
+```
+
+### Modern Type Hints
+```python
+# Before
+from typing import Any, Dict, List, Optional, Union
+
+def process(items: List[int], config: Optional[Dict[str, Any]] = None) -> Union[int, None]:
+    ...
+
+# After (only Any still needs to be imported from typing)
+def process(items: list[int], config: dict[str, Any] | None = None) -> int | None:
+    ...
+```
+
+### NumPy Operations
+```python
+# Before
+result = []
+for i in range(len(predictions)):
+    result.append(predictions[i] - true_values[i])
+result = np.array(result)
+
+# After
+result = predictions - true_values
+```
+
+### DoubleML Learner Pattern
+```python
+# Before
+ml_l_copy = copy.deepcopy(ml_l)
+
+# After
+from sklearn.base import clone
+ml_l_copy = clone(ml_l)
+```
+
+### Score Element Naming
+```python
+# Before
+def _get_score_elements(self, ...):
+    return {"a": psi_derivative, "b": psi_moment}
+
+# After
+def _get_score_elements(self, ...):
+    return {"psi_a": psi_derivative, "psi_b": psi_moment}
+```
diff --git a/.claude/skills/techdebt/SKILL.md b/.claude/skills/techdebt/SKILL.md
new file mode 100644
index 00000000..af0e4b6a
--- /dev/null
+++ b/.claude/skills/techdebt/SKILL.md
@@ -0,0 +1,125 @@
+---
+name: techdebt
+description: Find and fix technical debt in the DoubleML codebase. Checks for code smells, type issues, style violations, and outdated patterns.
+---
+
+# Technical Debt Finder for DoubleML
+
+Identify and fix technical debt aligned with project standards.
+
+## Project-Specific Checks
+
+### Type Annotations (MyPy Strict Mode)
+- Missing type hints on functions (`disallow_untyped_defs = true`)
+- Missing return type annotations
+- Use of `Any` where specific types are possible
+- Old-style typing (`typing.List` → `list`, `typing.Dict` → `dict`)
+- Missing `from __future__ import annotations` for forward references
+
+### Docstrings (NumPy Style)
+- Missing docstrings on public functions/classes
+- Incorrect docstring format (must be NumPy-style)
+- Missing `Parameters`, `Returns`, or `Raises` sections
+- Outdated parameter documentation
+
+### Code Style (Black + Ruff)
+- Lines exceeding 127 characters
+- Import ordering issues (ruff rule I)
+- Unused imports (ruff rule F401)
+- Undefined names (ruff rule F821)
+- Old-style string formatting (use f-strings)
+
+### Scikit-learn Compatibility
+- Learners missing `fit()`/`predict()` interface
+- Classifiers missing `predict_proba()` for propensity scores
+- Missing `clone()` compatibility
+
+### DoubleML Patterns
+- Inconsistent use of `_check_learner()` for validation
+- Missing score function validation with `_check_score()`
+- Incorrect sample splitting structure
+- Missing `psi_a`/`psi_b` score elements
+
+## Workflow
+
+1. **Run Automated Checks**
+   ```bash
+   # Type checking
+   mypy doubleml
+
+   # Linting
+   ruff check .
+
+   # Format check (dry-run)
+   black --check .
+   ```
+
+2. **Scan for Code Smells**
+   - Functions longer than 50 lines
+   - More than 5 parameters
+   - Deep nesting (> 3 levels)
+   - Duplicate code blocks
+   - Magic numbers without constants
+
+3. **Check for Dead Code**
+   - Unused imports
+   - Unused functions/classes
+   - Commented-out code blocks
+   - Unreachable code paths
+
+4. **Report Findings**
+   Format: `file_path:line_number - [severity] description`
+
+5. **Fix Issues**
+   - Auto-fix with `ruff check --fix .`
+   - Auto-format with `black .`
+   - Manual fixes for type hints and docstrings
+
+6. **Verify**
+   ```bash
+   ruff check .
+   mypy doubleml
+   pytest -m ci  # Run CI tests
+   ```
+
+## Severity Levels
+
+| Severity | Description | Examples |
+|----------|-------------|----------|
+| **high** | Breaks CI or type safety | Missing type hints, mypy errors |
+| **medium** | Style violations | Line length, import order |
+| **low** | Code smells | Long functions, magic numbers |
+
+## Arguments
+
+Specify scope to focus the scan:
+
+- `/techdebt` - Scan entire `doubleml/` package
+- `/techdebt doubleml/plm/` - Scan PLM module
+- `/techdebt doubleml/utils/_checks.py` - Scan specific file
+
+## Output Format
+
+```markdown
+## Technical Debt Report
+
+### High Severity
+- `doubleml/plm/plr.py:45` - Missing return type annotation
+- `doubleml/utils/_checks.py:123` - Type hint uses `typing.List`
+
+### Medium Severity
+- `doubleml/did/did.py:89` - Line exceeds 127 characters
+- `doubleml/irm/irm.py:12` - Unused import `warnings`
+
+### Low Severity
+- `doubleml/double_ml.py:234` - Function has 67 lines (>50)
+- `doubleml/utils/resampling.py:45` - Magic number `5` should be constant
+
+### Fixed
+- ✓ Auto-fixed 3 import ordering issues
+- ✓ Auto-formatted 2 files with black
+
+### Remaining
+- 2 high severity items need manual fixes
+- Consider refactoring `_nuisance_est()` in next session
+```

From 54e9eb469c452bb7c515ba4784e17d0e0f184d1d Mon Sep 17 00:00:00 2001
From: SvenKlaassen <sven.klaassen@uni-hamburg.de>
Date: Fri, 6 Feb 2026 11:29:36 +0100
Subject: [PATCH 08/38] Enhance DoubleMLScalar and PLR with learner management,
 validation, and utility functions

---
 doubleml/double_ml_scalar.py                  | 112 ++++++++--
 doubleml/plm/plr_scalar.py                    | 157 ++++++++++----
 .../plm/tests/test_plr_scalar_exceptions.py   |   2 +-
 .../plm/tests/test_plr_scalar_return_types.py |  29 ++-
 doubleml/utils/_learner.py                    | 201 ++++++++++++++++++
 5 files changed, 448 insertions(+), 53 deletions(-)
 create mode 100644 doubleml/utils/_learner.py

diff --git a/doubleml/double_ml_scalar.py b/doubleml/double_ml_scalar.py
index c969ec84..a0010005 100644
--- a/doubleml/double_ml_scalar.py
+++ b/doubleml/double_ml_scalar.py
@@ -3,7 +3,7 @@
 """
 
 from abc import ABC, abstractmethod
-from typing import Dict, List, Optional, Self
+from typing import ClassVar, Dict, List, Optional, Self
 
 import numpy as np
 
@@ -11,6 +11,7 @@
 from .double_ml_base import DoubleMLBase
 from .double_ml_framework import DoubleMLCore as DoubleMLCoreData
 from .double_ml_framework import DoubleMLFramework
+from .utils._learner import LearnerInfo, LearnerSpec, validate_learner
 from .utils.resampling import DoubleMLResampling
 
 
@@ -43,6 +44,9 @@ class DoubleMLScalar(DoubleMLBase, ABC):
         The score function being used.
     """
 
+    # Subclasses define all possible learners for the model
+    _LEARNER_SPECS: ClassVar[Dict[str, LearnerSpec]]
+
     def __init__(
         self,
         obj_dml_data: DoubleMLBaseData,
@@ -76,9 +80,8 @@ def __init__(
 
         self._score = score
 
-        # Learner names (set by subclass) and learner storage (set via set_learners)
-        self._learner_names: List[str] = []
-        self._learners: Dict[str, object] = {}
+        # Learner storage: single dict for all learner state
+        self._learners: Dict[str, LearnerInfo] = {}
 
         # Resampling parameters (set via draw_sample_splitting)
         self._n_folds: Optional[int] = None
@@ -183,31 +186,112 @@ def smpls(self) -> List:
         return self._smpls
 
     @property
-    def learner_names(self) -> List[str]:
+    @abstractmethod
+    def required_learners(self) -> List[str]:
         """
-        Names of the required learners for this model.
+        Names of the learners required for the current configuration.
+
+        Subclasses implement this as a property that returns the learner names
+        needed based on the current score function or model configuration.
 
         Returns
         -------
         list of str
             List of required learner names.
         """
-        return self._learner_names
+        pass
 
     @property
     def learners(self) -> Dict[str, object]:
         """
-        The learners used for nuisance estimation.
+        Access registered learner objects by name.
 
         Returns
         -------
         dict
             Dictionary mapping learner names to estimator instances.
         """
-        return self._learners
+        return {name: info.learner for name, info in self._learners.items()}
+
+    def get_params(self, learner_name: str) -> Dict:
+        """
+        Get parameters of a registered learner.
+
+        Parameters
+        ----------
+        learner_name : str
+            Name of the learner.
+
+        Returns
+        -------
+        dict
+            Dictionary of learner parameters.
+
+        Raises
+        ------
+        ValueError
+            If the learner is not registered.
+        """
+        if learner_name not in self._learners:
+            raise ValueError(f"Learner '{learner_name}' not registered.")
+        return self._learners[learner_name].learner.get_params()
+
+    def set_params(self, learner_name: str, **params: object) -> Self:
+        """
+        Set parameters of a registered learner.
+
+        Parameters
+        ----------
+        learner_name : str
+            Name of the learner.
+        **params
+            Parameters to set on the learner.
+
+        Returns
+        -------
+        self : Self
+            The estimator with updated learner parameters.
+
+        Raises
+        ------
+        ValueError
+            If the learner is not registered.
+        """
+        if learner_name not in self._learners:
+            raise ValueError(f"Learner '{learner_name}' not registered.")
+        self._learners[learner_name].learner.set_params(**params)
+        return self
+
+    def _register_learner(self, name: str, learner: object) -> None:
+        """
+        Validate and register a single learner.
+
+        Parameters
+        ----------
+        name : str
+            Name of the learner (must be in _LEARNER_SPECS).
+        learner : object
+            The learner instance to register.
+
+        Raises
+        ------
+        ValueError
+            If the learner name is not defined in _LEARNER_SPECS.
+        """
+        if name not in self._LEARNER_SPECS:
+            raise ValueError(f"Learner '{name}' not defined for this model.")
+
+        spec = self._LEARNER_SPECS[name]
+        info = validate_learner(
+            learner,
+            spec,
+            binary_outcome=self._dml_data.binary_outcome,
+            binary_treatment=self._dml_data.binary_treats.all(),
+        )
+        self._learners[name] = info
 
     @abstractmethod
-    def set_learners(self, **kwargs) -> Self:
+    def set_learners(self, **kwargs: object) -> Self:
         """
         Set the learners for nuisance estimation.
 
@@ -435,7 +519,7 @@ def _initialize_predictions_dict(self) -> Dict[str, np.ndarray]:
         Initialize dictionary for storing predictions.
 
         Creates a prediction array of shape ``(n_obs, n_rep)`` for each learner
-        in :attr:`learner_names`, filled with ``NaN``. Subclasses can override
+        in :attr:`required_learners`, filled with ``NaN``. Subclasses can override
         this for custom prediction storage.
 
         Returns
@@ -445,9 +529,9 @@ def _initialize_predictions_dict(self) -> Dict[str, np.ndarray]:
         """
         n_obs = self._n_obs
         n_rep = self.n_rep
-        return {name: np.full((n_obs, n_rep), np.nan) for name in self._learner_names}
+        return {name: np.full((n_obs, n_rep), np.nan) for name in self.required_learners}
 
-    def _check_learners_available(self, external_predictions=None) -> None:
+    def _check_learners_available(self, external_predictions: Optional[Dict[str, np.ndarray]] = None) -> None:
         """
         Validate that all required learners are set or covered by external predictions.
 
@@ -463,7 +547,7 @@ def _check_learners_available(self, external_predictions=None) -> None:
         """
         ext_keys = set(external_predictions.keys()) if external_predictions is not None else set()
 
-        for name in self._learner_names:
+        for name in self.required_learners:
             if name not in self._learners and name not in ext_keys:
                 raise ValueError(
                     f"Learner '{name}' is required but not set and no external predictions provided for it. "
diff --git a/doubleml/plm/plr_scalar.py b/doubleml/plm/plr_scalar.py
index b915bd76..ef18fb68 100644
--- a/doubleml/plm/plr_scalar.py
+++ b/doubleml/plm/plr_scalar.py
@@ -2,14 +2,17 @@
 Partially Linear Regression (PLR) model based on the new DoubleMLScalar hierarchy.
 """
 
+from __future__ import annotations
+
 import warnings
+from typing import Dict, List, Optional, Self
 
 import numpy as np
 from sklearn.base import clone
 
 from ..data.base_data import DoubleMLData
 from ..double_ml_linear_score import LinearScoreMixin
-from ..utils._checks import _check_learner
+from ..utils._learner import LearnerSpec, predict_nuisance
 
 
 class PLR(LinearScoreMixin):
@@ -21,16 +24,48 @@ class PLR(LinearScoreMixin):
     ----------
     obj_dml_data : DoubleMLData
         The data object providing the data and specifying the variables for the causal model.
-    score : str, optional
+    score : str
         The score function (``'partialling out'`` or ``'IV-type'``).
         Default is ``'partialling out'``.
+    ml_l : estimator, optional
+        Learner for E[Y|X]. Can be regressor or classifier.
+    ml_m : estimator, optional
+        Learner for E[D|X]. Can be regressor or classifier.
+    ml_g : estimator, optional
+        Learner for E[Y - D*theta|X]. Only for IV-type. Must be regressor.
     """
 
+    # Define learner specifications for PLR
+    _LEARNER_SPECS: Dict[str, LearnerSpec] = {
+        "ml_l": LearnerSpec("ml_l", allow_regressor=True, allow_classifier=True, binary_data_check="outcome"),
+        "ml_m": LearnerSpec("ml_m", allow_regressor=True, allow_classifier=True, binary_data_check="treatment"),
+        "ml_g": LearnerSpec("ml_g", allow_regressor=True, allow_classifier=False),
+    }
+
     def __init__(
         self,
-        obj_dml_data,
-        score="partialling out",
+        obj_dml_data: DoubleMLData,
+        score: str = "partialling out",
+        ml_l: Optional[object] = None,
+        ml_m: Optional[object] = None,
+        ml_g: Optional[object] = None,
     ):
+        """
+        Initialize PLR model.
+
+        Parameters
+        ----------
+        obj_dml_data : DoubleMLData
+            The data object.
+        score : str
+            Score function ('partialling out' or 'IV-type').
+        ml_l : estimator, optional
+            Learner for E[Y|X]. Can be regressor or classifier.
+        ml_m : estimator, optional
+            Learner for E[D|X]. Can be regressor or classifier.
+        ml_g : estimator, optional
+            Learner for E[Y - D*theta|X]. Only for IV-type. Must be regressor.
+        """
         # Validate data
         self._check_data(obj_dml_data)
 
@@ -44,12 +79,24 @@ def __init__(
             score=score,
         )
 
-        # Set required learner names based on score
-        self._learner_names = ["ml_l", "ml_m"]
-        if score == "IV-type":
-            self._learner_names.append("ml_g")
+        # Set learners if provided
+        if any(learner is not None for learner in [ml_l, ml_m, ml_g]):
+            self.set_learners(ml_l=ml_l, ml_m=ml_m, ml_g=ml_g)
+
+    @property
+    def required_learners(self) -> List[str]:
+        """Required learners for current score."""
+        names = ["ml_l", "ml_m"]
+        if self.score == "IV-type":
+            names.append("ml_g")
+        return names
 
-    def set_learners(self, ml_l=None, ml_m=None, ml_g=None):
+    def set_learners(
+        self,
+        ml_l: Optional[object] = None,
+        ml_m: Optional[object] = None,
+        ml_g: Optional[object] = None,
+    ) -> Self:
         """
         Set the learners for nuisance estimation.
 
@@ -71,26 +118,48 @@ def set_learners(self, ml_l=None, ml_m=None, ml_g=None):
         self : PLR
             The estimator with learners set.
         """
-        if ml_l is not None:
-            _check_learner(ml_l, "ml_l", regressor=True, classifier=True)
-            self._learners["ml_l"] = clone(ml_l)
-
-        if ml_m is not None:
-            _check_learner(ml_m, "ml_m", regressor=True, classifier=True)
-            self._learners["ml_m"] = clone(ml_m)
-
-        if ml_g is not None:
-            if self.score == "IV-type":
-                _check_learner(ml_g, "ml_g", regressor=True, classifier=False)
-                self._learners["ml_g"] = clone(ml_g)
-            else:
-                warnings.warn(
-                    "A learner ml_g has been provided for score = 'partialling out' but will be ignored. "
-                    "A learner ml_g is not required for estimation."
-                )
+        for name, learner in [("ml_l", ml_l), ("ml_m", ml_m), ("ml_g", ml_g)]:
+            if learner is None:
+                continue
+            if name not in self.required_learners:
+                warnings.warn(f"Learner '{name}' not required for score='{self.score}', ignored.")
+                continue
+            self._register_learner(name, learner)
 
+        # IV-type: if only one of ml_l / ml_g is registered, clone it to fill in the other
+        self._handle_iv_cloning()
         return self
 
+    def _handle_iv_cloning(self) -> None:
+        """For IV-type score: clone ml_l to ml_g or vice versa if one is missing."""
+        if self.score != "IV-type":
+            return
+        if "ml_g" not in self.required_learners:
+            return
+
+        has_l = "ml_l" in self._learners
+        has_g = "ml_g" in self._learners
+
+        if has_l and not has_g:
+            warnings.warn("For score='IV-type', ml_g not set. Cloning ml_l to ml_g.")
+            # Clone the learner and register with same info
+            from ..utils._learner import LearnerInfo
+
+            ml_l_info = self._learners["ml_l"]
+            self._learners["ml_g"] = LearnerInfo(
+                learner=clone(ml_l_info.learner),
+                is_classifier=ml_l_info.is_classifier,
+            )
+        elif has_g and not has_l:
+            warnings.warn("For score='IV-type', ml_l not set. Cloning ml_g to ml_l.")
+            from ..utils._learner import LearnerInfo
+
+            ml_g_info = self._learners["ml_g"]
+            self._learners["ml_l"] = LearnerInfo(
+                learner=clone(ml_g_info.learner),
+                is_classifier=ml_g_info.is_classifier,
+            )
+
     @staticmethod
     def _check_data(obj_dml_data):
         if not isinstance(obj_dml_data, DoubleMLData):
@@ -103,7 +172,14 @@ def _check_data(obj_dml_data):
                 "To fit a partially linear IV regression model use DoubleMLPLIV instead of DoubleMLPLR."
             )
 
-    def _nuisance_est(self, train_idx, test_idx, i_rep, i_fold, external_predictions=None):
+    def _nuisance_est(
+        self,
+        train_idx: np.ndarray,
+        test_idx: np.ndarray,
+        i_rep: int,
+        i_fold: int,
+        external_predictions: Optional[Dict[str, np.ndarray]] = None,
+    ) -> None:
         x = self._dml_data.x
         y = self._dml_data.y
         d = self._dml_data.d
@@ -119,23 +195,31 @@ def _nuisance_est(self, train_idx, test_idx, i_rep, i_fold, external_predictions
 
         # Fit and predict ml_l: E[Y|X]
         if not l_external:
-            ml_l = clone(self._learners["ml_l"])
+            ml_l_info = self._learners["ml_l"]
+            ml_l = clone(ml_l_info.learner)
             ml_l.fit(x_train, y_train)
-            self._predictions["ml_l"][test_idx, i_rep] = ml_l.predict(x_test)
+            self._predictions["ml_l"][test_idx, i_rep] = predict_nuisance(ml_l, x_test, ml_l_info.is_classifier)
 
         # Fit and predict ml_m: E[D|X]
         if not m_external:
-            ml_m = clone(self._learners["ml_m"])
+            ml_m_info = self._learners["ml_m"]
+            ml_m = clone(ml_m_info.learner)
             ml_m.fit(x_train, d_train)
-            self._predictions["ml_m"][test_idx, i_rep] = ml_m.predict(x_test)
+            self._predictions["ml_m"][test_idx, i_rep] = predict_nuisance(ml_m, x_test, ml_m_info.is_classifier)
 
         # For IV-type: fit ml_g after last fold when all ml_l/ml_m predictions are available
         is_last_fold = i_fold == self.n_folds - 1
         if is_last_fold and self.score == "IV-type" and not g_external:
-            # If ml_g not explicitly set, default to clone of ml_l
+            # If ml_g not explicitly set, clone ml_l (already handled in _handle_iv_cloning)
             if "ml_g" not in self._learners:
                 warnings.warn("For score = 'IV-type', learners ml_l and ml_g should be specified. Set ml_g = clone(ml_l).")
-                self._learners["ml_g"] = clone(self._learners["ml_l"])
+                from ..utils._learner import LearnerInfo
+
+                ml_l_info = self._learners["ml_l"]
+                self._learners["ml_g"] = LearnerInfo(
+                    learner=clone(ml_l_info.learner),
+                    is_classifier=ml_l_info.is_classifier,
+                )
 
             # Compute initial theta from full cross-fitted predictions
             l_hat = self._predictions["ml_l"][:, i_rep]
@@ -145,13 +229,14 @@ def _nuisance_est(self, train_idx, test_idx, i_rep, i_fold, external_predictions
             theta_initial = -np.nanmean(psi_b) / np.nanmean(psi_a)
 
             # Second pass: fit ml_g with cross-fitting across all folds
+            ml_g_info = self._learners["ml_g"]
             for j_fold in range(self.n_folds):
                 train_j, test_j = self._smpls[i_rep][j_fold]
-                ml_g = clone(self._learners["ml_g"])
+                ml_g = clone(ml_g_info.learner)
                 ml_g.fit(x[train_j], y[train_j] - theta_initial * d[train_j])
-                self._predictions["ml_g"][test_j, i_rep] = ml_g.predict(x[test_j])
+                self._predictions["ml_g"][test_j, i_rep] = predict_nuisance(ml_g, x[test_j], ml_g_info.is_classifier)
 
-    def _get_score_elements(self):
+    def _get_score_elements(self) -> Dict[str, np.ndarray]:
         y = self._dml_data.y
         d = self._dml_data.d
 
diff --git a/doubleml/plm/tests/test_plr_scalar_exceptions.py b/doubleml/plm/tests/test_plr_scalar_exceptions.py
index 7cc74aac..fb1ba7a9 100644
--- a/doubleml/plm/tests/test_plr_scalar_exceptions.py
+++ b/doubleml/plm/tests/test_plr_scalar_exceptions.py
@@ -81,7 +81,7 @@ def test_plr_scalar_exception_estimate_causal_without_predictions():
 @pytest.mark.ci
 def test_plr_scalar_warning_ml_g_partialling_out():
     dml_obj = PLR(obj_dml_data, score="partialling out")
-    with pytest.warns(UserWarning, match="will be ignored"):
+    with pytest.warns(UserWarning, match="not required for score.*ignored"):
         dml_obj.set_learners(ml_l=ml_l, ml_m=ml_m, ml_g=ml_g)
 
 
diff --git a/doubleml/plm/tests/test_plr_scalar_return_types.py b/doubleml/plm/tests/test_plr_scalar_return_types.py
index 63e06cdd..09832931 100644
--- a/doubleml/plm/tests/test_plr_scalar_return_types.py
+++ b/doubleml/plm/tests/test_plr_scalar_return_types.py
@@ -109,8 +109,8 @@ def test_n_properties(fitted_dml_obj):
 
 
 @pytest.mark.ci
-def test_learner_names(fitted_dml_obj):
-    assert fitted_dml_obj.learner_names == ["ml_l", "ml_m"]
+def test_required_learners(fitted_dml_obj):
+    assert fitted_dml_obj.required_learners == ["ml_l", "ml_m"]
     assert "ml_l" in fitted_dml_obj.learners
     assert "ml_m" in fitted_dml_obj.learners
 
@@ -121,6 +121,31 @@ def test_str_repr(fitted_dml_obj):
     assert isinstance(repr(fitted_dml_obj), str)
 
 
+@pytest.mark.ci
+def test_get_params(fitted_dml_obj):
+    params = fitted_dml_obj.get_params("ml_l")
+    assert isinstance(params, dict)
+    # LinearRegression has 'fit_intercept' param
+    assert "fit_intercept" in params
+
+
+@pytest.mark.ci
+def test_set_params(fitted_dml_obj):
+    # Note: This modifies the fitted object, but we're just testing the method works
+    result = fitted_dml_obj.set_params("ml_l", fit_intercept=False)
+    assert result is fitted_dml_obj  # Returns self
+    params = fitted_dml_obj.get_params("ml_l")
+    assert params["fit_intercept"] is False
+    # Reset for other tests
+    fitted_dml_obj.set_params("ml_l", fit_intercept=True)
+
+
+@pytest.mark.ci
+def test_get_params_invalid_learner(fitted_dml_obj):
+    with pytest.raises(ValueError, match="not registered"):
+        fitted_dml_obj.get_params("ml_invalid")
+
+
 @pytest.mark.ci
 def test_before_fit_raises():
     np.random.seed(3141)
diff --git a/doubleml/utils/_learner.py b/doubleml/utils/_learner.py
new file mode 100644
index 00000000..04659c98
--- /dev/null
+++ b/doubleml/utils/_learner.py
@@ -0,0 +1,201 @@
+"""
+Learner specification and validation utilities for DoubleML.
+"""
+
+from __future__ import annotations
+
+import warnings
+from dataclasses import dataclass
+from typing import Any, Literal, Optional
+
+import numpy as np
+from sklearn.base import clone, is_classifier, is_regressor
+
+
+@dataclass(frozen=True)
+class LearnerSpec:
+    """
+    Immutable specification for a learner requirement.
+
+    Parameters
+    ----------
+    name : str
+        Name of the learner (e.g., "ml_l", "ml_m").
+    allow_regressor : bool
+        Whether regressors are allowed. Default is ``True``.
+    allow_classifier : bool
+        Whether classifiers are allowed. Default is ``True``.
+    binary_data_check : {"outcome", "treatment"} or None
+        If specified, validation warns for a regressor on binary data and raises for a classifier on non-binary data.
+        "outcome" checks binary_outcome, "treatment" checks binary_treatment.
+        Default is ``None``.
+    """
+
+    name: str
+    allow_regressor: bool = True
+    allow_classifier: bool = True
+    binary_data_check: Optional[Literal["outcome", "treatment"]] = None
+
+
+@dataclass
+class LearnerInfo:
+    """
+    Mutable info about a registered learner.
+
+    Parameters
+    ----------
+    learner : object
+        The learner object (already cloned).
+    is_classifier : bool
+        Whether the learner is a classifier.
+    """
+
+    learner: Any
+    is_classifier: bool
+
+    @property
+    def predict_method(self) -> str:
+        """Return the appropriate prediction method name."""
+        return "predict_proba" if self.is_classifier else "predict"
+
+
+def validate_learner(
+    learner: Any,
+    spec: LearnerSpec,
+    binary_outcome: bool = False,
+    binary_treatment: bool = False,
+) -> LearnerInfo:
+    """
+    Validate learner against specification and data properties.
+
+    Parameters
+    ----------
+    learner : object
+        The learner to validate.
+    spec : LearnerSpec
+        Specification for this learner.
+    binary_outcome : bool
+        Whether the outcome variable is binary.
+    binary_treatment : bool
+        Whether the treatment variable is binary.
+
+    Returns
+    -------
+    LearnerInfo
+        Information about the validated learner.
+
+    Raises
+    ------
+    TypeError
+        If the learner is a class instead of an instance, or lacks
+        required methods (fit, set_params, get_params, predict/predict_proba).
+    ValueError
+        If the learner type is not allowed by the specification.
+        If a classifier is used with non-binary data when required.
+    """
+    err_msg_prefix = f"Invalid learner provided for {spec.name}: "
+    warn_msg_prefix = f"Learner provided for {spec.name} is probably invalid: "
+
+    # Check it's an instance, not a class
+    if isinstance(learner, type):
+        raise TypeError(err_msg_prefix + "provide an instance of a learner instead of a class.")
+
+    # Check required methods
+    if not hasattr(learner, "fit"):
+        raise TypeError(err_msg_prefix + f"{str(learner)} has no method .fit().")
+    if not hasattr(learner, "set_params"):
+        raise TypeError(err_msg_prefix + f"{str(learner)} has no method .set_params().")
+    if not hasattr(learner, "get_params"):
+        raise TypeError(err_msg_prefix + f"{str(learner)} has no method .get_params().")
+
+    # Determine learner type
+    learner_is_classifier: bool
+    if spec.allow_regressor and spec.allow_classifier:
+        if is_classifier(learner):
+            learner_is_classifier = True
+        elif is_regressor(learner):
+            learner_is_classifier = False
+        else:
+            warnings.warn(
+                warn_msg_prefix
+                + f"{str(learner)} is (probably) neither a regressor nor a classifier. "
+                + "Method predict is used for prediction."
+            )
+            learner_is_classifier = False
+    elif spec.allow_classifier:
+        if not is_classifier(learner):
+            warnings.warn(warn_msg_prefix + f"{str(learner)} is (probably) no classifier.")
+        learner_is_classifier = True
+    else:
+        assert spec.allow_regressor  # At least one must be True
+        if not is_regressor(learner):
+            warnings.warn(warn_msg_prefix + f"{str(learner)} is (probably) no regressor.")
+        learner_is_classifier = False
+
+    # Check type is allowed
+    if learner_is_classifier and not spec.allow_classifier:
+        raise ValueError(f"Classifier not allowed for {spec.name}. Use a regressor instead.")
+    if not learner_is_classifier and not spec.allow_regressor:
+        raise ValueError(f"Regressor not allowed for {spec.name}. Use a classifier instead.")
+
+    # Check prediction method exists
+    if learner_is_classifier:
+        if not hasattr(learner, "predict_proba"):
+            raise TypeError(err_msg_prefix + f"{str(learner)} has no method .predict_proba().")
+    else:
+        if not hasattr(learner, "predict"):
+            raise TypeError(err_msg_prefix + f"{str(learner)} has no method .predict().")
+
+    # Check binary data compatibility for classifiers
+    if learner_is_classifier and spec.binary_data_check:
+        if spec.binary_data_check == "outcome" and not binary_outcome:
+            raise ValueError(
+                f"The {spec.name} learner {str(learner)} was identified as classifier "
+                "but the outcome variable is not binary with values 0 and 1."
+            )
+        if spec.binary_data_check == "treatment" and not binary_treatment:
+            raise ValueError(
+                f"The {spec.name} learner {str(learner)} was identified as classifier "
+                "but the treatment variable is not binary with values 0 and 1."
+            )
+
+    # Warn if regressor used with binary data
+    if not learner_is_classifier and spec.binary_data_check:
+        if spec.binary_data_check == "outcome" and binary_outcome:
+            warnings.warn(
+                f"Binary outcome detected. Consider using a classifier for {spec.name} "
+                "with predict_proba() to fit an additive probability model."
+            )
+        elif spec.binary_data_check == "treatment" and binary_treatment:
+            warnings.warn(
+                f"Binary treatment detected. Consider using a classifier for {spec.name} "
+                "with predict_proba() to estimate propensity scores."
+            )
+
+    return LearnerInfo(
+        learner=clone(learner),
+        is_classifier=learner_is_classifier,
+    )
+
+
+def predict_nuisance(learner: Any, X: np.ndarray, is_classifier: bool) -> np.ndarray:
+    """
+    Predict using the appropriate method based on learner type.
+
+    Parameters
+    ----------
+    learner : object
+        Fitted learner with predict() or predict_proba() method.
+    X : np.ndarray
+        Features to predict on.
+    is_classifier : bool
+        Whether the learner is a classifier.
+
+    Returns
+    -------
+    np.ndarray
+        Predictions. For classifiers, returns probability of class 1.
+    """
+    if is_classifier:
+        return learner.predict_proba(X)[:, 1]
+    return learner.predict(X)

From aa2cffadbe8a461c853e6edb10386c1bb6bd6113 Mon Sep 17 00:00:00 2001
From: SvenKlaassen <sven.klaassen@uni-hamburg.de>
Date: Fri, 6 Feb 2026 18:18:33 +0100
Subject: [PATCH 09/38] Add Interactive Regression Model (IRM) implementation
 and tests

- Implemented the IRM class for double machine learning with interactive regression models in irm_scalar.py.
- Added core estimation tests for IRM scalar in test_irm_scalar.py.
- Created exception handling tests for IRM scalar in test_irm_scalar_exceptions.py.
- Developed tests for handling external predictions in test_irm_scalar_external_predictions.py.
- Added return type validation tests for IRM scalar in test_irm_scalar_return_types.py.
- Compared the new IRM scalar implementation against the existing DoubleMLIRM in test_irm_scalar_vs_irm.py.
---
 doc/diagrams/architecture.md                  |   2 +-
 doc/diagrams/testing_structure.md             | 321 +++++++++++++
 doubleml/irm/irm_scalar.py                    | 421 ++++++++++++++++++
 doubleml/irm/tests/test_irm_scalar.py         |  66 +++
 .../irm/tests/test_irm_scalar_exceptions.py   | 133 ++++++
 .../test_irm_scalar_external_predictions.py   | 105 +++++
 .../irm/tests/test_irm_scalar_return_types.py | 170 +++++++
 doubleml/irm/tests/test_irm_scalar_vs_irm.py  |  82 ++++
 8 files changed, 1299 insertions(+), 1 deletion(-)
 create mode 100644 doc/diagrams/testing_structure.md
 create mode 100644 doubleml/irm/irm_scalar.py
 create mode 100644 doubleml/irm/tests/test_irm_scalar.py
 create mode 100644 doubleml/irm/tests/test_irm_scalar_exceptions.py
 create mode 100644 doubleml/irm/tests/test_irm_scalar_external_predictions.py
 create mode 100644 doubleml/irm/tests/test_irm_scalar_return_types.py
 create mode 100644 doubleml/irm/tests/test_irm_scalar_vs_irm.py

diff --git a/doc/diagrams/architecture.md b/doc/diagrams/architecture.md
index 4e531e9d..5081f641 100644
--- a/doc/diagrams/architecture.md
+++ b/doc/diagrams/architecture.md
@@ -17,8 +17,8 @@ DoubleMLBase (ABC)
     │   │   θ̂ = -E[ψ_b] / E[ψ_a]
     │   │
     │   ├── PLR          (partialling out, IV-type)
+    │   ├── IRM          (ATE, ATTE)
     │   ├── PLIV         (planned)
-    │   ├── IRM          (planned)
     │   └── DID          (planned)
     │
     └── NonLinearScoreMixin (planned)
diff --git a/doc/diagrams/testing_structure.md b/doc/diagrams/testing_structure.md
new file mode 100644
index 00000000..e6383c6d
--- /dev/null
+++ b/doc/diagrams/testing_structure.md
@@ -0,0 +1,321 @@
+# Testing Structure for DoubleML Scalar Models
+
+This document defines the testing standard for all new models built on the `DoubleMLScalar` hierarchy. Each model should have a consistent set of test files covering estimation accuracy, return types, input validation, backward compatibility, and external predictions.
+
+## Test File Convention
+
+For a model `<model>` in module `<module>/` (e.g., `plr` in `plm/`, `irm` in `irm/`):
+
+| File | Purpose |
+|------|---------|
+| `test_<model>_scalar.py` | Core estimation accuracy |
+| `test_<model>_scalar_return_types.py` | Property types and shapes after fitting |
+| `test_<model>_scalar_exceptions.py` | Input validation and error handling |
+| `test_<model>_scalar_vs_<model>.py` | Comparison with old `DoubleML` implementation |
+| `test_<model>_scalar_external_predictions.py` | External predictions workflow |
+
+All test files live in `doubleml/<module>/tests/`.
+
+All test functions should be marked with `@pytest.mark.ci`.
+
+---
+
+## 1. Core Estimation Tests (`test_<model>_scalar.py`)
+
+Verify that the model produces statistically reasonable estimates.
+
+### Fixture Pattern
+
+```python
+@pytest.fixture(scope="module", params=[...])  # score variants
+def score(request):
+    return request.param
+
+@pytest.fixture(scope="module", params=[True, False])  # model-specific options
+def option(request):
+    return request.param
+
+@pytest.fixture(scope="module")
+def fitted_fixture(score, option):
+    np.random.seed(3141)
+    data = make_<model>_data(theta=true_theta, n_obs=500, ...)
+    dml_obj = <Model>(data, score=score, option=option)
+    dml_obj.set_learners(...)
+    dml_obj.draw_sample_splitting(n_folds=5, n_rep=1)
+    dml_obj.fit()
+    return {"coef": dml_obj.coef[0], "se": dml_obj.se[0], "true_theta": true_theta, "score": score}
+```
+
+### Required Tests
+
+- **`test_coef`**: For scores where the DGP theta equals the target parameter, check the 3-sigma rule: `abs(coef - true_theta) <= 3.0 * se`. For scores where the true parameter differs from the DGP theta (e.g., ATTE), check `np.isfinite(coef)` and `abs(coef) < 10.0`.
+- **`test_se`**: `se > 0`
+
+### Assertion Pattern
+
+```python
+# When true parameter matches DGP theta
+assert abs(coef - true_theta) <= 3.0 * se
+
+# When true parameter is unknown (e.g., ATTE with heterogeneous effects)
+assert np.isfinite(coef)
+assert abs(coef) < 10.0
+```
+
+---
+
+## 2. Return Types Tests (`test_<model>_scalar_return_types.py`)
+
+Verify that all properties have the correct types and shapes after fitting.
+
+### Constants
+
+```python
+N_OBS = 200  # small for speed
+N_FOLDS = 3
+N_REP = 2
+```
+
+### Fixture Pattern
+
+```python
+@pytest.fixture(scope="module")
+def fitted_model():
+    np.random.seed(42)
+    data = make_<model>_data(n_obs=N_OBS, ...)
+    dml_obj = <Model>(data, score=<default_score>)
+    dml_obj.set_learners(...)
+    dml_obj.draw_sample_splitting(n_folds=N_FOLDS, n_rep=N_REP)
+    dml_obj.fit()
+    return dml_obj
+```
+
+### Required Tests
+
+| Test | Assertion |
+|------|-----------|
+| `test_coef_type_and_shape` | `isinstance(coef, np.ndarray)`, `shape == (1,)` |
+| `test_se_type_and_shape` | `isinstance(se, np.ndarray)`, `shape == (1,)` |
+| `test_all_thetas_shape` | `shape == (1, N_REP)` |
+| `test_all_ses_shape` | `shape == (1, N_REP)` |
+| `test_summary_type` | `isinstance(summary, pd.DataFrame)`, `len == 1` |
+| `test_confint_type_and_shape` | `isinstance(ci, pd.DataFrame)`, `shape == (1, 2)` |
+| `test_psi_shape` | `shape == (N_OBS, 1, N_REP)` |
+| `test_predictions_type` | `isinstance(predictions, dict)`, correct keys, each `shape == (N_OBS, N_REP)` |
+| `test_smpls_type` | `len(smpls) == N_REP`, each has `N_FOLDS` tuples of `(train, test)` arrays |
+| `test_n_properties` | `n_obs == N_OBS`, `n_folds == N_FOLDS`, `n_rep == N_REP`, `score == <expected>` |
+| `test_required_learners` | Returns expected list of learner names |
+| `test_str_repr` | `str(model)` and `repr(model)` return `str` |
+| `test_get_params` | Returns dict with expected learner keys |
+| `test_set_params` | Modifies and confirms learner parameter change |
+| `test_before_fit_raises` | Accessing `coef` / `se` before `fit()` raises appropriate error |
+
+---
+
+## 3. Exception Tests (`test_<model>_scalar_exceptions.py`)
+
+Verify that invalid inputs produce clear error messages.
+
+### Required Tests (Common to All Models)
+
+| Test | Input | Expected |
+|------|-------|----------|
+| `test_exception_data` | Non-DoubleMLData | `TypeError` |
+| `test_exception_score` | Invalid score string | `ValueError` |
+| `test_exception_n_folds` | `n_folds < 2` | `ValueError` |
+| `test_exception_n_rep` | `n_rep < 1` | `ValueError` |
+| `test_exception_fit_nuisance_without_smpls` | Call `fit_nuisance_models()` before `draw_sample_splitting()` | `ValueError` |
+| `test_exception_estimate_causal_without_predictions` | Call `estimate_causal_parameters()` before `fit_nuisance_models()` | `ValueError` |
+| `test_exception_missing_learner` | Call `fit()` without setting required learners | `ValueError` |
+| `test_exception_invalid_learner` | Pass a class instead of an instance | `TypeError` |
+
+### Model-Specific Exception Tests
+
+Add tests for model-specific constraints:
+- **PLR**: Instrumental variables check (`z_cols`), `ml_g` warning for partialling out
+- **IRM**: Binary treatment check, instruments check, `normalize_ipw` type check, `ml_m` must be classifier
+
+### Assertion Pattern
+
+```python
+@pytest.mark.ci
+def test_exception_data():
+    msg = r"The data must be of DoubleMLData type\."
+    with pytest.raises(TypeError, match=msg):
+        <Model>(pd.DataFrame())
+```
+
+Always use `match=` with regex patterns to verify error messages.
+
+---
+
+## 4. Comparison Tests (`test_<model>_scalar_vs_<model>.py`)
+
+Verify numerical equivalence (within a relative tolerance of `1e-9`) with the old `DoubleML` implementation.
+
+### Fixture Pattern
+
+```python
+@pytest.fixture(scope="module", params=[...])
+def score(request):
+    return request.param
+
+@pytest.fixture(scope="module", params=[1, 3])
+def n_rep(request):
+    return request.param
+
+@pytest.fixture(scope="module")
+def comparison_fixture(score, n_rep):
+    n_folds = 5
+    seed = 3141
+
+    np.random.seed(42)
+    data = make_<model>_data(...)
+
+    # Old model
+    np.random.seed(seed)
+    dml_old = dml.DoubleML<Model>(data, learner1, learner2, n_folds=n_folds, n_rep=n_rep, score=score)
+    dml_old.fit()
+
+    # New model — share sample splits from old model
+    dml_new = <Model>(data, score=score)
+    dml_new.set_learners(...)
+    dml_new._n_folds = n_folds
+    dml_new._n_rep = n_rep
+    dml_new._smpls = dml_old.smpls
+    dml_new.fit()
+
+    return {"old": dml_old, "new": dml_new}
+```
+
+**Key**: Share sample splits from the old model directly (`dml_new._smpls = dml_old.smpls`) because the old and new implementations consume random state differently during `__init__`.
+
+### Required Tests
+
+```python
+def test_coef_equal(comparison_fixture):
+    np.testing.assert_allclose(new.coef, old.coef, rtol=1e-9)
+
+def test_se_equal(comparison_fixture):
+    np.testing.assert_allclose(new.se, old.se, rtol=1e-9)
+
+def test_all_coef_equal(comparison_fixture):
+    np.testing.assert_allclose(new.all_thetas, old.all_coef, rtol=1e-9)
+
+def test_all_se_equal(comparison_fixture):
+    np.testing.assert_allclose(new.all_ses, old.all_se, rtol=1e-9)
+```
+
+Note the property name differences: new uses `all_thetas`/`all_ses`, old uses `all_coef`/`all_se`.
+
+---
+
+## 5. External Predictions Tests (`test_<model>_scalar_external_predictions.py`)
+
+Verify that providing pre-computed predictions produces equivalent results.
+
+### Fixture Pattern
+
+```python
+@pytest.fixture(scope="module", params=[...])
+def score(request):
+    return request.param
+
+@pytest.fixture(scope="module", params=[1, 3])
+def n_rep(request):
+    return request.param
+
+@pytest.fixture(scope="module", params=[True, False])
+def set_ml_x_ext(request):   # one fixture per learner
+    return request.param
+
+@pytest.fixture(scope="module")
+def ext_pred_fixture(score, n_rep, set_ml_x_ext, ...):
+    # 1. Fit reference model
+    dml_ref = <Model>(data, score=score)
+    dml_ref.set_learners(...)
+    dml_ref.draw_sample_splitting(n_folds=n_folds, n_rep=n_rep)
+    dml_ref.fit()
+
+    # 2. Build external_predictions dict from reference model
+    external_predictions = {}
+    if set_ml_x_ext:
+        external_predictions["ml_x"] = dml_ref.predictions["ml_x"]
+
+    # 3. Fit new model with shared splits and external predictions
+    dml_ext = <Model>(data, score=score)
+    dml_ext.set_learners(...)   # set non-external learners
+    dml_ext._n_folds = n_folds
+    dml_ext._n_rep = n_rep
+    dml_ext._smpls = dml_ref.smpls
+    dml_ext.fit(external_predictions=external_predictions)
+
+    return {"ref": dml_ref, "ext": dml_ext}
+```
+
+### Required Tests
+
+```python
+import math
+
+def test_coef(ext_pred_fixture):
+    assert math.isclose(ref.coef[0], ext.coef[0], rel_tol=1e-9, abs_tol=1e-4)
+
+def test_se(ext_pred_fixture):
+    assert math.isclose(ref.se[0], ext.se[0], rel_tol=1e-9, abs_tol=1e-4)
+```
+
+Use `math.isclose` with `abs_tol=1e-4` instead of `np.testing.assert_allclose` because small numerical differences can accumulate when mixing external and fitted predictions.
+
+---
+
+## Assertion Patterns Summary
+
+| Context | Assertion | Tolerance |
+|---------|-----------|-----------|
+| Comparison with old model | `np.testing.assert_allclose(new, old, rtol=1e-9)` | Near-exact (relative tolerance `1e-9`) |
+| External predictions | `math.isclose(a, b, rel_tol=1e-9, abs_tol=1e-4)` | Small tolerance |
+| Statistical accuracy | `abs(coef - true) <= 3.0 * se` | 3-sigma rule |
+| Exception handling | `pytest.raises(Error, match=r"regex pattern")` | Regex search (`re.search`) against the message |
+
+---
+
+## Fixture Scope Guidelines
+
+| Scope | Use Case |
+|-------|----------|
+| `module` | Parametrized fixtures that fit models (expensive). Each parameter combination creates one instance shared across tests in the module. |
+| `session` | Data generation that should be shared across all test modules (not typically needed for scalar model tests). |
+| `function` | Only when test modifies state (rare for read-only assertion tests). |
+
+---
+
+## Checklist for New Scalar Models
+
+When adding a new scalar model `<Model>` to the `DoubleMLScalar` hierarchy:
+
+- [ ] **Implementation**: `doubleml/<module>/<model>_scalar.py`
+  - [ ] Class inherits from `LinearScoreMixin` (or `NonLinearScoreMixin`)
+  - [ ] `_LEARNER_SPECS` class variable defined
+  - [ ] `required_learners` property returns score-dependent list
+  - [ ] `set_learners()` with model-specific kwargs
+  - [ ] `_check_data()` static method
+  - [ ] `draw_sample_splitting()` (override if stratification needed)
+  - [ ] `_nuisance_est()` per-fold estimation
+  - [ ] `_get_score_elements()` returns `{psi_a, psi_b}`
+
+- [ ] **Tests**: `doubleml/<module>/tests/`
+  - [ ] `test_<model>_scalar.py` — core estimation
+  - [ ] `test_<model>_scalar_return_types.py` — property shapes/types
+  - [ ] `test_<model>_scalar_exceptions.py` — input validation
+  - [ ] `test_<model>_scalar_vs_<model>.py` — comparison with old implementation
+  - [ ] `test_<model>_scalar_external_predictions.py` — external predictions
+
+- [ ] **Verification**
+  - [ ] All new tests pass: `pytest doubleml/<module>/tests/test_<model>_scalar*.py -v -m ci`
+  - [ ] Lint: `ruff check doubleml/<module>/<model>_scalar.py`
+  - [ ] Format: `black doubleml/<module>/<model>_scalar.py`
+  - [ ] Type check: `mypy doubleml/<module>/<model>_scalar.py`
+  - [ ] Old tests still pass: `pytest doubleml/<module>/tests/ -v`
+
+- [ ] **Documentation**: Update `doc/diagrams/architecture.md` class hierarchy
diff --git a/doubleml/irm/irm_scalar.py b/doubleml/irm/irm_scalar.py
new file mode 100644
index 00000000..f8c0ee53
--- /dev/null
+++ b/doubleml/irm/irm_scalar.py
@@ -0,0 +1,421 @@
+"""
+Interactive Regression Model (IRM) based on the new DoubleMLScalar hierarchy.
+"""
+
+from __future__ import annotations
+
+from typing import ClassVar, Dict, List, Optional, Self, Union
+
+import numpy as np
+from sklearn.base import clone
+from sklearn.utils.multiclass import type_of_target
+
+from ..data.base_data import DoubleMLData
+from ..double_ml_linear_score import LinearScoreMixin
+from ..utils._checks import _check_score, _check_weights
+from ..utils._learner import LearnerSpec, predict_nuisance
+from ..utils._propensity_score import _propensity_score_adjustment
+from ..utils.propensity_score_processing import PSProcessor, PSProcessorConfig
+from ..utils.resampling import DoubleMLResampling
+
+
+class IRM(LinearScoreMixin):
+    """Double machine learning for interactive regression models.
+
+    Based on the DoubleMLScalar + LinearScoreMixin hierarchy.
+
+    Parameters
+    ----------
+    obj_dml_data : DoubleMLData
+        The data object providing the data and specifying the variables for the causal model.
+        Must contain exactly one binary treatment variable with values 0 and 1.
+    score : str
+        The score function (``'ATE'`` or ``'ATTE'``).
+        Default is ``'ATE'``.
+    ml_g : estimator, optional
+        A machine learner implementing ``fit()`` and ``predict()`` for the nuisance
+        function :math:`g_0(D, X) = E[Y|X, D]`. Cloned to ``ml_g0`` and ``ml_g1``
+        internally. For a binary outcome, a classifier implementing ``fit()`` and
+        ``predict_proba()`` can also be specified.
+    ml_m : classifier, optional
+        A machine learner implementing ``fit()`` and ``predict_proba()`` for the
+        nuisance function :math:`m_0(X) = E[D|X]`. Must be a classifier.
+    normalize_ipw : bool
+        Indicates whether the inverse probability weights are normalized.
+        Default is ``False``.
+    weights : array, dict or None
+        Weights for each individual observation. If ``None``, uniform weights are used
+        (corresponds to standard ATE). Can only be used with ``score='ATE'``.
+        An array must have shape ``(n,)``. A dictionary must contain keys ``'weights'``
+        and ``'weights_bar'``.
+        Default is ``None``.
+    ps_processor_config : PSProcessorConfig, optional
+        Configuration for propensity score processing (clipping, calibration, etc.).
+        Default is ``None`` (uses default clipping threshold of 0.01).
+
+    Notes
+    -----
+    **Interactive regression (IRM)** models take the form
+
+    .. math::
+
+        Y = g_0(D, X) + U, & &\\mathbb{E}(U | X, D) = 0,
+
+        D = m_0(X) + V, & &\\mathbb{E}(V | X) = 0,
+
+    where the treatment variable is binary, :math:`D \\in \\lbrace 0,1 \\rbrace`.
+    Target parameters of interest are the average treatment effect (ATE),
+
+    .. math::
+
+        \\theta_0 = \\mathbb{E}[g_0(1, X) - g_0(0, X)]
+
+    and the average treatment effect of the treated (ATTE),
+
+    .. math::
+
+        \\theta_0 = \\mathbb{E}[g_0(1, X) - g_0(0, X) | D=1].
+    """
+
+    # Define learner specifications for IRM
+    _LEARNER_SPECS: ClassVar[Dict[str, LearnerSpec]] = {
+        "ml_g0": LearnerSpec("ml_g0", allow_regressor=True, allow_classifier=True, binary_data_check="outcome"),
+        "ml_g1": LearnerSpec("ml_g1", allow_regressor=True, allow_classifier=True, binary_data_check="outcome"),
+        "ml_m": LearnerSpec("ml_m", allow_regressor=False, allow_classifier=True),
+    }
+
+    def __init__(
+        self,
+        obj_dml_data: DoubleMLData,
+        score: str = "ATE",
+        ml_g: Optional[object] = None,
+        ml_m: Optional[object] = None,
+        normalize_ipw: bool = False,
+        weights: Optional[Union[np.ndarray, Dict]] = None,
+        ps_processor_config: Optional[PSProcessorConfig] = None,
+    ):
+        """
+        Initialize IRM model.
+
+        Parameters
+        ----------
+        obj_dml_data : DoubleMLData
+            The data object. Must have exactly one binary treatment variable.
+        score : str
+            Score function (``'ATE'`` or ``'ATTE'``).
+        ml_g : estimator, optional
+            Learner for E[Y|X, D]. Cloned to ml_g0 and ml_g1.
+        ml_m : classifier, optional
+            Learner for E[D|X]. Must be a classifier.
+        normalize_ipw : bool
+            Whether to normalize inverse probability weights.
+        weights : array, dict or None, optional
+            Weights for weighted ATE.
+        ps_processor_config : PSProcessorConfig, optional
+            Configuration for propensity score processing.
+        """
+        # Validate data
+        self._check_data(obj_dml_data)
+
+        # Validate score
+        valid_scores = ["ATE", "ATTE"]
+        _check_score(score, valid_scores, allow_callable=False)
+
+        super().__init__(
+            obj_dml_data=obj_dml_data,
+            score=score,
+        )
+
+        # Normalize IPW
+        if not isinstance(normalize_ipw, bool):
+            raise TypeError("Normalization indicator has to be boolean. " f"Object of type {str(type(normalize_ipw))} passed.")
+        self._normalize_ipw = normalize_ipw
+
+        # Propensity score processing
+        if ps_processor_config is not None:
+            self._ps_processor_config = ps_processor_config
+            self._ps_processor = PSProcessor.from_config(ps_processor_config)
+        else:
+            self._ps_processor_config = PSProcessorConfig()
+            self._ps_processor = PSProcessor.from_config(self._ps_processor_config)
+
+        # Weights
+        _check_weights(weights, score, obj_dml_data.n_obs, n_rep=1)
+        self._initialize_weights(weights)
+
+        # Set learners if provided
+        if any(learner is not None for learner in [ml_g, ml_m]):
+            self.set_learners(ml_g=ml_g, ml_m=ml_m)
+
+    # ==================== Properties ====================
+
+    @property
+    def normalize_ipw(self) -> bool:
+        """Indicates whether the inverse probability weights are normalized."""
+        return self._normalize_ipw
+
+    @property
+    def ps_processor_config(self) -> PSProcessorConfig:
+        """Configuration for propensity score processing."""
+        return self._ps_processor_config
+
+    @property
+    def ps_processor(self) -> PSProcessor:
+        """Propensity score processor."""
+        return self._ps_processor
+
+    @property
+    def weights(self) -> Dict:
+        """Weights used in the score, stored as a dict with key ``'weights'`` (and ``'weights_bar'`` if a dict was passed)."""
+        return self._weights
+
+    @property
+    def required_learners(self) -> List[str]:
+        """Required learners for IRM: ml_g0, ml_g1, and ml_m."""
+        return ["ml_g0", "ml_g1", "ml_m"]
+
+    # ==================== Learner Management ====================
+
+    def set_learners(
+        self,
+        ml_g: Optional[object] = None,
+        ml_g0: Optional[object] = None,
+        ml_g1: Optional[object] = None,
+        ml_m: Optional[object] = None,
+    ) -> Self:
+        """
+        Set the learners for nuisance estimation.
+
+        Parameters
+        ----------
+        ml_g : estimator or None, optional
+            A machine learner for the outcome regression :math:`g_0(D, X) = E[Y|X, D]`.
+            Cloned to ``ml_g0`` and ``ml_g1`` if they are not explicitly set.
+        ml_g0 : estimator or None, optional
+            A machine learner for :math:`E[Y|X, D=0]`. Takes precedence over ``ml_g``.
+        ml_g1 : estimator or None, optional
+            A machine learner for :math:`E[Y|X, D=1]`. Takes precedence over ``ml_g``.
+        ml_m : classifier or None, optional
+            A machine learner for the propensity score :math:`m_0(X) = E[D|X]`.
+            Must be a classifier with ``predict_proba()`` method.
+
+        Returns
+        -------
+        self : IRM
+            The estimator with learners set.
+        """
+        # ml_g convenience: clone to ml_g0/ml_g1 if not explicitly set
+        if ml_g is not None:
+            # Validate ml_g is an instance (not a class) before cloning
+            if isinstance(ml_g, type):
+                raise TypeError("Invalid learner provided for ml_g: provide an instance of a learner instead of a class.")
+            if ml_g0 is None:
+                ml_g0 = clone(ml_g)
+            if ml_g1 is None:
+                ml_g1 = clone(ml_g)
+
+        # Register each learner
+        for name, learner in [("ml_g0", ml_g0), ("ml_g1", ml_g1), ("ml_m", ml_m)]:
+            if learner is not None:
+                self._register_learner(name, learner)
+
+        return self
+
+    # ==================== Sample Splitting ====================
+
+    def draw_sample_splitting(self, n_folds: int = 5, n_rep: int = 1) -> Self:
+        """
+        Draw stratified sample splitting for cross-fitting.
+
+        Uses stratified K-fold splitting to ensure each fold contains both
+        treatment groups (D=0 and D=1).
+
+        Parameters
+        ----------
+        n_folds : int, optional
+            Number of folds for cross-fitting. Default is 5.
+        n_rep : int, optional
+            Number of repetitions for sample splitting. Default is 1.
+
+        Returns
+        -------
+        self : IRM
+            The estimator with initialized sample splits.
+        """
+        if not isinstance(n_folds, int) or n_folds < 2:
+            raise ValueError(f"n_folds must be an integer >= 2. Got {n_folds}.")
+        if not isinstance(n_rep, int) or n_rep < 1:
+            raise ValueError(f"n_rep must be an integer >= 1. Got {n_rep}.")
+
+        self._n_folds = n_folds
+        self._n_rep = n_rep
+
+        # Create stratified resampler
+        resampler = DoubleMLResampling(
+            n_folds=n_folds,
+            n_rep=n_rep,
+            n_obs=self._n_obs,
+            stratify=self._dml_data.d,
+        )
+
+        self._smpls = resampler.split_samples()
+        return self
+
+    # ==================== Nuisance Estimation ====================
+
+    def _nuisance_est(
+        self,
+        train_idx: np.ndarray,
+        test_idx: np.ndarray,
+        i_rep: int,
+        i_fold: int,
+        external_predictions: Optional[Dict[str, np.ndarray]] = None,
+    ) -> None:
+        x = self._dml_data.x
+        y = self._dml_data.y
+        d = self._dml_data.d
+
+        x_train, x_test = x[train_idx], x[test_idx]
+        d_train = d[train_idx]
+
+        # Check which learners have external predictions
+        g0_external = external_predictions is not None and "ml_g0" in external_predictions
+        g1_external = external_predictions is not None and "ml_g1" in external_predictions
+        m_external = external_predictions is not None and "ml_m" in external_predictions
+
+        # ml_g0: fit on d==0 subset of training data, predict on ALL test observations
+        if not g0_external:
+            train_d0 = train_idx[d[train_idx] == 0]
+            ml_g0_info = self._learners["ml_g0"]
+            ml_g0 = clone(ml_g0_info.learner)
+            ml_g0.fit(x[train_d0], y[train_d0])
+            self._predictions["ml_g0"][test_idx, i_rep] = predict_nuisance(ml_g0, x_test, ml_g0_info.is_classifier)
+
+        # ml_g1: fit on d==1 subset of training data, predict on ALL test observations
+        if not g1_external:
+            train_d1 = train_idx[d[train_idx] == 1]
+            ml_g1_info = self._learners["ml_g1"]
+            ml_g1 = clone(ml_g1_info.learner)
+            ml_g1.fit(x[train_d1], y[train_d1])
+            self._predictions["ml_g1"][test_idx, i_rep] = predict_nuisance(ml_g1, x_test, ml_g1_info.is_classifier)
+
+        # ml_m: fit on ALL training data, predict on test
+        if not m_external:
+            ml_m_info = self._learners["ml_m"]
+            ml_m = clone(ml_m_info.learner)
+            ml_m.fit(x_train, d_train)
+            self._predictions["ml_m"][test_idx, i_rep] = predict_nuisance(ml_m, x_test, ml_m_info.is_classifier)
+
+    # ==================== Score Elements ====================
+
+    def _get_score_elements(self) -> Dict[str, np.ndarray]:
+        y = self._dml_data.y
+        d = self._dml_data.d
+
+        g_hat0 = self._predictions["ml_g0"]  # (n_obs, n_rep)
+        g_hat1 = self._predictions["ml_g1"]  # (n_obs, n_rep)
+        m_hat_raw = self._predictions["ml_m"]  # (n_obs, n_rep)
+
+        # Apply PS processing per repetition
+        m_hat = np.zeros_like(m_hat_raw)
+        for i_rep in range(self.n_rep):
+            m_hat[:, i_rep] = self._ps_processor.adjust_ps(m_hat_raw[:, i_rep], d, cv=self._smpls[i_rep], learner_name="ml_m")
+
+        # Apply IPW normalization per repetition
+        m_hat_adj = np.zeros_like(m_hat)
+        for i_rep in range(self.n_rep):
+            m_hat_adj[:, i_rep] = _propensity_score_adjustment(
+                propensity_score=m_hat[:, i_rep],
+                treatment_indicator=d,
+                normalize_ipw=self.normalize_ipw,
+            )
+
+        # Residuals: (n_obs, n_rep)
+        u_hat0 = y[:, np.newaxis] - g_hat0
+        u_hat1 = y[:, np.newaxis] - g_hat1
+
+        d_col = d[:, np.newaxis]  # (n_obs, 1) for broadcasting
+
+        if self.score == "ATE" or self.score == "ATTE":
+            weights, weights_bar = self._get_weights(m_hat_adj)
+
+            psi_b = weights * (g_hat1 - g_hat0) + weights_bar * (
+                np.divide(d_col * u_hat1, m_hat_adj) - np.divide((1.0 - d_col) * u_hat0, 1.0 - m_hat_adj)
+            )
+            psi_a = -1.0 * np.divide(weights, np.mean(weights, axis=0, keepdims=True))
+
+        return {"psi_a": psi_a, "psi_b": psi_b}
+
+    # ==================== Private Helpers ====================
+
+    @staticmethod
+    def _check_data(obj_dml_data: object) -> None:
+        """Validate that the data is compatible with IRM."""
+        if not isinstance(obj_dml_data, DoubleMLData):
+            raise TypeError(
+                f"The data must be of DoubleMLData type. " f"{str(obj_dml_data)} of type {str(type(obj_dml_data))} was passed."
+            )
+        if obj_dml_data.z_cols is not None:
+            raise ValueError(
+                "Incompatible data. " + " and ".join(obj_dml_data.z_cols) + " have been set as instrumental variable(s). "
+                "To fit an interactive IV regression model use DoubleMLIIVM instead of IRM."
+            )
+        one_treat = obj_dml_data.n_treat == 1
+        binary_treat = type_of_target(obj_dml_data.d) == "binary"
+        zero_one_treat = np.all((np.power(obj_dml_data.d, 2) - obj_dml_data.d) == 0)
+        if not (one_treat & binary_treat & zero_one_treat):
+            raise ValueError(
+                "Incompatible data. "
+                "To fit an IRM model with DML "
+                "exactly one binary variable with values 0 and 1 "
+                "needs to be specified as treatment variable."
+            )
+
+    def _initialize_weights(self, weights: Optional[Union[np.ndarray, Dict]]) -> None:
+        """Store ``weights`` as a dict; ``None`` becomes an all-ones array of length ``n_obs``."""
+        if weights is None:
+            weights = np.ones(self._dml_data.n_obs)
+        if isinstance(weights, np.ndarray):
+            weights = {"weights": weights}  # bare arrays are wrapped under the "weights" key
+        elif not isinstance(weights, dict):  # explicit raise: an ``assert`` is stripped under ``python -O``
+            raise TypeError(f"weights must be a numpy array, a dictionary or None. {type(weights)} was passed.")
+        self._weights = weights
+
+    def _get_weights(self, m_hat: np.ndarray) -> tuple:
+        """
+        Compute the score weights ``weights`` and ``weights_bar`` for the configured score.
+
+        Parameters
+        ----------
+        m_hat : np.ndarray
+            Adjusted propensity scores, shape (n_obs, n_rep). Only read for ``score='ATTE'``.
+
+        Returns
+        -------
+        weights : np.ndarray
+            Stored weights (ATE) or ``w * d / mean(w * d)`` (ATTE), tiled to shape (n_obs, n_rep).
+        weights_bar : np.ndarray
+            ATE: user-supplied ``weights_bar`` or a copy of ``weights``; ATTE: ``m_hat * w / mean(w * d)``.
+        """
+        d = self._dml_data.d
+
+        if self.score == "ATE":
+            w = self._weights["weights"]
+            weights = w[:, np.newaxis] * np.ones((1, self.n_rep))  # (n_obs, n_rep)
+            if "weights_bar" in self._weights:
+                # NOTE(review): passed through as supplied; __init__ validates it with n_rep=1 -- confirm shape.
+                weights_bar = self._weights["weights_bar"]
+            else:
+                weights_bar = weights.copy()
+        else:
+            # ATTE
+            assert self.score == "ATTE"
+            w = self._weights["weights"]
+            subgroup = w * d
+            subgroup_probability = np.mean(subgroup)
+            weights = np.divide(subgroup, subgroup_probability)[:, np.newaxis] * np.ones((1, self.n_rep))
+
+            # weights_bar depends on m_hat per repetition
+            weights_bar = np.divide(m_hat * w[:, np.newaxis], subgroup_probability)
+
+        return weights, weights_bar
diff --git a/doubleml/irm/tests/test_irm_scalar.py b/doubleml/irm/tests/test_irm_scalar.py
new file mode 100644
index 00000000..91a8e3c9
--- /dev/null
+++ b/doubleml/irm/tests/test_irm_scalar.py
@@ -0,0 +1,66 @@
+"""Core estimation tests for IRM scalar."""
+
+import numpy as np
+import pytest
+from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
+
+from doubleml.irm.datasets import make_irm_data
+from doubleml.irm.irm_scalar import IRM
+
+
+@pytest.fixture(scope="module", params=["ATE", "ATTE"])
+def score(request):
+    return request.param
+
+
+@pytest.fixture(scope="module", params=[True, False])
+def normalize_ipw(request):
+    return request.param
+
+
+@pytest.fixture(scope="module")
+def dml_irm_scalar_fixture(score, normalize_ipw):
+    n_folds = 5
+    true_theta = 0.5
+
+    np.random.seed(3141)
+    data = make_irm_data(theta=true_theta, n_obs=500, dim_x=20, return_type="DoubleMLData")
+
+    ml_g = RandomForestRegressor(n_estimators=100, max_features=10, max_depth=5, min_samples_leaf=2, random_state=42)
+    ml_m = RandomForestClassifier(n_estimators=100, max_features=10, max_depth=5, min_samples_leaf=2, random_state=42)
+
+    np.random.seed(3141)
+    dml_obj = IRM(data, score=score, normalize_ipw=normalize_ipw)
+    dml_obj.set_learners(ml_g=ml_g, ml_m=ml_m)
+    dml_obj.draw_sample_splitting(n_folds=n_folds, n_rep=1)
+    dml_obj.fit()
+
+    return {
+        "coef": dml_obj.coef[0],
+        "se": dml_obj.se[0],
+        "true_theta": true_theta,
+        "score": score,
+    }
+
+
+@pytest.mark.ci
+def test_dml_irm_scalar_coef(dml_irm_scalar_fixture):
+    coef = dml_irm_scalar_fixture["coef"]
+    se = dml_irm_scalar_fixture["se"]
+    true_theta = dml_irm_scalar_fixture["true_theta"]
+    score = dml_irm_scalar_fixture["score"]
+
+    # For ATE, the DGP theta is the true ATE parameter
+    # For ATTE, the true ATTE differs from theta due to heterogeneous effects in the DGP
+    if score == "ATE":
+        assert abs(coef - true_theta) <= 3.0 * se
+    else:
+        # ATTE: just check estimate is finite and reasonable
+        assert np.isfinite(coef)
+        assert abs(coef) < 10.0
+
+
+@pytest.mark.ci
+def test_dml_irm_scalar_se(dml_irm_scalar_fixture):
+    se = dml_irm_scalar_fixture["se"]
+    assert se > 0
diff --git a/doubleml/irm/tests/test_irm_scalar_exceptions.py b/doubleml/irm/tests/test_irm_scalar_exceptions.py
new file mode 100644
index 00000000..df0aab60
--- /dev/null
+++ b/doubleml/irm/tests/test_irm_scalar_exceptions.py
@@ -0,0 +1,133 @@
+import numpy as np
+import pandas as pd
+import pytest
+from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
+from sklearn.linear_model import LinearRegression
+
+from doubleml.irm.datasets import make_irm_data
+from doubleml.irm.irm_scalar import IRM
+from doubleml.plm.datasets import make_plr_CCDDHNR2018
+
+np.random.seed(3141)
+obj_dml_data = make_irm_data(theta=0.5, n_obs=100, dim_x=10, return_type="DoubleMLData")
+
+ml_g = RandomForestRegressor(n_estimators=10, max_depth=3, random_state=42)
+ml_m = RandomForestClassifier(n_estimators=10, max_depth=3, random_state=42)
+
+
+@pytest.mark.ci
+def test_irm_scalar_exception_data():
+    msg = r"The data must be of DoubleMLData type\."
+    with pytest.raises(TypeError, match=msg):
+        IRM(pd.DataFrame())
+
+
+@pytest.mark.ci
+def test_irm_scalar_exception_instrument():
+    # Create data with instruments
+    np.random.seed(3141)
+    plr_data = make_plr_CCDDHNR2018(n_obs=100, dim_x=10, alpha=0.5)
+    df = plr_data.data.copy()
+    x_cols = [c for c in df.columns if c.startswith("X")]
+
+    import doubleml as dml
+
+    dml_data_iv = dml.DoubleMLData(df, y_col="y", d_cols="d", x_cols=x_cols[:-1], z_cols=x_cols[-1])
+
+    msg = r"Incompatible data\. .* have been set as instrumental variable\(s\)\."
+    with pytest.raises(ValueError, match=msg):
+        IRM(dml_data_iv)
+
+
+@pytest.mark.ci
+def test_irm_scalar_exception_non_binary_treatment():
+    # Create data with continuous treatment
+    np.random.seed(3141)
+    plr_data = make_plr_CCDDHNR2018(n_obs=100, dim_x=10, alpha=0.5)
+    msg = r"Incompatible data.*exactly one binary variable"
+    with pytest.raises(ValueError, match=msg):
+        IRM(plr_data)
+
+
+@pytest.mark.ci
+def test_irm_scalar_exception_score():
+    msg = r"Invalid score"
+    with pytest.raises(ValueError, match=msg):
+        IRM(obj_dml_data, score="invalid")
+
+
+@pytest.mark.ci
+def test_irm_scalar_exception_n_folds():
+    dml_obj = IRM(obj_dml_data)
+    msg = r"n_folds must be an integer >= 2\."
+    with pytest.raises(ValueError, match=msg):
+        dml_obj.draw_sample_splitting(n_folds=1)
+    with pytest.raises(ValueError, match=msg):
+        dml_obj.draw_sample_splitting(n_folds=0)
+
+
+@pytest.mark.ci
+def test_irm_scalar_exception_n_rep():
+    dml_obj = IRM(obj_dml_data)
+    msg = r"n_rep must be an integer >= 1\."
+    with pytest.raises(ValueError, match=msg):
+        dml_obj.draw_sample_splitting(n_rep=0)
+
+
+@pytest.mark.ci
+def test_irm_scalar_exception_fit_nuisance_without_smpls():
+    dml_obj = IRM(obj_dml_data, ml_g=ml_g, ml_m=ml_m)
+    msg = r"Sample splitting has not been initialized\."
+    with pytest.raises(ValueError, match=msg):
+        dml_obj.fit_nuisance_models()
+
+
+@pytest.mark.ci
+def test_irm_scalar_exception_estimate_causal_without_predictions():
+    dml_obj = IRM(obj_dml_data, ml_g=ml_g, ml_m=ml_m)
+    dml_obj.draw_sample_splitting()
+    msg = r"Predictions not available\."
+    with pytest.raises(ValueError, match=msg):
+        dml_obj.estimate_causal_parameters()
+
+
+@pytest.mark.ci
+def test_irm_scalar_exception_missing_learner():
+    dml_obj = IRM(obj_dml_data)
+    dml_obj.draw_sample_splitting()
+    msg = r"Learner 'ml_g0' is required but not set"
+    with pytest.raises(ValueError, match=msg):
+        dml_obj.fit()
+
+
+@pytest.mark.ci
+def test_irm_scalar_exception_missing_learner_partial():
+    dml_obj = IRM(obj_dml_data)
+    dml_obj.set_learners(ml_g=ml_g)
+    dml_obj.draw_sample_splitting()
+    msg = r"Learner 'ml_m' is required but not set"
+    with pytest.raises(ValueError, match=msg):
+        dml_obj.fit()
+
+
+@pytest.mark.ci
+def test_irm_scalar_exception_invalid_learner():
+    dml_obj = IRM(obj_dml_data)
+    msg = r"Invalid learner provided for ml_g: provide an instance"
+    with pytest.raises(TypeError, match=msg):
+        dml_obj.set_learners(ml_g=RandomForestRegressor)  # class instead of instance
+
+
+@pytest.mark.ci
+def test_irm_scalar_exception_ml_m_regressor():
+    dml_obj = IRM(obj_dml_data)
+    # LinearRegression is a regressor, not allowed for ml_m; warns then raises TypeError (no predict_proba)
+    with pytest.raises(TypeError, match=r"has no method .predict_proba"):
+        dml_obj.set_learners(ml_m=LinearRegression())
+
+
+@pytest.mark.ci
+def test_irm_scalar_exception_normalize_ipw_type():
+    msg = r"Normalization indicator has to be boolean"
+    with pytest.raises(TypeError, match=msg):
+        IRM(obj_dml_data, normalize_ipw="True")
diff --git a/doubleml/irm/tests/test_irm_scalar_external_predictions.py b/doubleml/irm/tests/test_irm_scalar_external_predictions.py
new file mode 100644
index 00000000..a7ea60c5
--- /dev/null
+++ b/doubleml/irm/tests/test_irm_scalar_external_predictions.py
@@ -0,0 +1,105 @@
+import math
+
+import numpy as np
+import pytest
+from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
+
+from doubleml.irm.datasets import make_irm_data
+from doubleml.irm.irm_scalar import IRM
+
+
+@pytest.fixture(scope="module", params=["ATE", "ATTE"])
+def irm_score(request):
+    return request.param
+
+
+@pytest.fixture(scope="module", params=[1, 3])
+def n_rep(request):
+    return request.param
+
+
+@pytest.fixture(scope="module", params=[True, False])
+def set_ml_g0_ext(request):
+    return request.param
+
+
+@pytest.fixture(scope="module", params=[True, False])
+def set_ml_g1_ext(request):
+    return request.param
+
+
+@pytest.fixture(scope="module", params=[True, False])
+def set_ml_m_ext(request):
+    return request.param
+
+
+@pytest.fixture(scope="module")
+def doubleml_irm_scalar_fixture(irm_score, n_rep, set_ml_g0_ext, set_ml_g1_ext, set_ml_m_ext):
+    n_folds = 3
+    ext_predictions = {}
+
+    np.random.seed(42)
+    data = make_irm_data(theta=0.5, n_obs=500, dim_x=20, return_type="DoubleMLData")
+
+    ml_g = RandomForestRegressor(n_estimators=10, max_depth=3, random_state=42)
+    ml_m = RandomForestClassifier(n_estimators=10, max_depth=3, random_state=42)
+
+    # Fit reference model
+    dml_irm = IRM(data, score=irm_score)
+    dml_irm.set_learners(ml_g=ml_g, ml_m=ml_m)
+    np.random.seed(3141)
+    dml_irm.draw_sample_splitting(n_folds=n_folds, n_rep=n_rep)
+    dml_irm.fit()
+
+    # Build external predictions dict
+    if set_ml_g0_ext:
+        ext_predictions["ml_g0"] = dml_irm.predictions["ml_g0"]
+
+    if set_ml_g1_ext:
+        ext_predictions["ml_g1"] = dml_irm.predictions["ml_g1"]
+
+    if set_ml_m_ext:
+        ext_predictions["ml_m"] = dml_irm.predictions["ml_m"]
+
+    # Fit model with external predictions — only set learners that are needed
+    dml_irm_ext = IRM(data, score=irm_score)
+    learner_kwargs = {}
+    if not (set_ml_g0_ext and set_ml_g1_ext):
+        learner_kwargs["ml_g"] = RandomForestRegressor(n_estimators=10, max_depth=3, random_state=42)
+    if not set_ml_m_ext:
+        learner_kwargs["ml_m"] = RandomForestClassifier(n_estimators=10, max_depth=3, random_state=42)
+    if learner_kwargs:
+        dml_irm_ext.set_learners(**learner_kwargs)
+
+    np.random.seed(3141)
+    dml_irm_ext.draw_sample_splitting(n_folds=n_folds, n_rep=n_rep)
+    dml_irm_ext.fit(external_predictions=ext_predictions if ext_predictions else None)
+
+    res_dict = {
+        "coef_normal": dml_irm.coef[0],
+        "coef_ext": dml_irm_ext.coef[0],
+        "se_normal": dml_irm.se[0],
+        "se_ext": dml_irm_ext.se[0],
+    }
+
+    return res_dict
+
+
+@pytest.mark.ci
+def test_doubleml_irm_scalar_coef(doubleml_irm_scalar_fixture):
+    assert math.isclose(
+        doubleml_irm_scalar_fixture["coef_normal"],
+        doubleml_irm_scalar_fixture["coef_ext"],
+        rel_tol=1e-9,
+        abs_tol=1e-4,
+    )
+
+
+@pytest.mark.ci
+def test_doubleml_irm_scalar_se(doubleml_irm_scalar_fixture):
+    assert math.isclose(
+        doubleml_irm_scalar_fixture["se_normal"],
+        doubleml_irm_scalar_fixture["se_ext"],
+        rel_tol=1e-9,
+        abs_tol=1e-4,
+    )
diff --git a/doubleml/irm/tests/test_irm_scalar_return_types.py b/doubleml/irm/tests/test_irm_scalar_return_types.py
new file mode 100644
index 00000000..15eaae82
--- /dev/null
+++ b/doubleml/irm/tests/test_irm_scalar_return_types.py
@@ -0,0 +1,170 @@
+import numpy as np
+import pandas as pd
+import pytest
+from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
+
+from doubleml.irm.datasets import make_irm_data
+from doubleml.irm.irm_scalar import IRM
+
+N_OBS = 200
+N_FOLDS = 3
+N_REP = 2
+N_REP_BOOT = 314
+
+np.random.seed(3141)
+obj_dml_data = make_irm_data(theta=0.5, n_obs=N_OBS, dim_x=10, return_type="DoubleMLData")
+
+
+@pytest.fixture(scope="module")
+def fitted_dml_obj():
+    np.random.seed(3141)
+    dml_obj = IRM(obj_dml_data)
+    dml_obj.set_learners(
+        ml_g=RandomForestRegressor(n_estimators=10, max_depth=3, random_state=42),
+        ml_m=RandomForestClassifier(n_estimators=10, max_depth=3, random_state=42),
+    )
+    dml_obj.draw_sample_splitting(n_folds=N_FOLDS, n_rep=N_REP)
+    dml_obj.fit()
+    dml_obj.bootstrap(n_rep_boot=N_REP_BOOT)
+    return dml_obj
+
+
+@pytest.mark.ci
+def test_coef_type_and_shape(fitted_dml_obj):
+    assert isinstance(fitted_dml_obj.coef, np.ndarray)
+    assert fitted_dml_obj.coef.shape == (1,)
+
+
+@pytest.mark.ci
+def test_se_type_and_shape(fitted_dml_obj):
+    assert isinstance(fitted_dml_obj.se, np.ndarray)
+    assert fitted_dml_obj.se.shape == (1,)
+
+
+@pytest.mark.ci
+def test_all_thetas_shape(fitted_dml_obj):
+    assert isinstance(fitted_dml_obj.all_thetas, np.ndarray)
+    assert fitted_dml_obj.all_thetas.shape == (1, N_REP)
+
+
+@pytest.mark.ci
+def test_all_coef_shape(fitted_dml_obj):
+    assert isinstance(fitted_dml_obj.all_coef, np.ndarray)
+    assert fitted_dml_obj.all_coef.shape == (1, N_REP)
+
+
+@pytest.mark.ci
+def test_all_ses_shape(fitted_dml_obj):
+    assert isinstance(fitted_dml_obj.all_ses, np.ndarray)
+    assert fitted_dml_obj.all_ses.shape == (1, N_REP)
+
+
+@pytest.mark.ci
+def test_summary_type(fitted_dml_obj):
+    assert isinstance(fitted_dml_obj.summary, pd.DataFrame)
+    assert fitted_dml_obj.summary.shape[0] == 1
+
+
+@pytest.mark.ci
+def test_confint_type_and_shape(fitted_dml_obj):
+    ci = fitted_dml_obj.confint()
+    assert isinstance(ci, pd.DataFrame)
+    assert ci.shape == (1, 2)
+
+
+@pytest.mark.ci
+def test_confint_joint(fitted_dml_obj):
+    ci_joint = fitted_dml_obj.confint(joint=True)
+    assert isinstance(ci_joint, pd.DataFrame)
+    assert ci_joint.shape == (1, 2)
+
+
+@pytest.mark.ci
+def test_psi_shape(fitted_dml_obj):
+    assert isinstance(fitted_dml_obj.psi, np.ndarray)
+    assert fitted_dml_obj.psi.shape == (N_OBS, 1, N_REP)
+
+
+@pytest.mark.ci
+def test_predictions_type(fitted_dml_obj):
+    preds = fitted_dml_obj.predictions
+    assert isinstance(preds, dict)
+    assert "ml_g0" in preds
+    assert "ml_g1" in preds
+    assert "ml_m" in preds
+    assert preds["ml_g0"].shape == (N_OBS, N_REP)
+    assert preds["ml_g1"].shape == (N_OBS, N_REP)
+    assert preds["ml_m"].shape == (N_OBS, N_REP)
+
+
+@pytest.mark.ci
+def test_smpls_type(fitted_dml_obj):
+    smpls = fitted_dml_obj.smpls
+    assert isinstance(smpls, list)
+    assert len(smpls) == N_REP
+    assert len(smpls[0]) == N_FOLDS
+
+
+@pytest.mark.ci
+def test_n_properties(fitted_dml_obj):
+    assert fitted_dml_obj.n_obs == N_OBS
+    assert fitted_dml_obj.n_folds == N_FOLDS
+    assert fitted_dml_obj.n_rep == N_REP
+    assert fitted_dml_obj.score == "ATE"
+
+
+@pytest.mark.ci
+def test_required_learners(fitted_dml_obj):
+    assert fitted_dml_obj.required_learners == ["ml_g0", "ml_g1", "ml_m"]
+    assert "ml_g0" in fitted_dml_obj.learners
+    assert "ml_g1" in fitted_dml_obj.learners
+    assert "ml_m" in fitted_dml_obj.learners
+
+
+@pytest.mark.ci
+def test_str_repr(fitted_dml_obj):
+    assert isinstance(str(fitted_dml_obj), str)
+    assert isinstance(repr(fitted_dml_obj), str)
+
+
+@pytest.mark.ci
+def test_get_params(fitted_dml_obj):
+    params = fitted_dml_obj.get_params("ml_g0")
+    assert isinstance(params, dict)
+    assert "n_estimators" in params
+
+
+@pytest.mark.ci
+def test_set_params(fitted_dml_obj):
+    result = fitted_dml_obj.set_params("ml_g0", n_estimators=5)
+    assert result is fitted_dml_obj
+    params = fitted_dml_obj.get_params("ml_g0")
+    assert params["n_estimators"] == 5
+    # Reset
+    fitted_dml_obj.set_params("ml_g0", n_estimators=10)
+
+
+@pytest.mark.ci
+def test_get_params_invalid_learner(fitted_dml_obj):
+    with pytest.raises(ValueError, match="not registered"):
+        fitted_dml_obj.get_params("ml_invalid")
+
+
+@pytest.mark.ci
+def test_before_fit_raises():
+    np.random.seed(3141)
+    dml_obj = IRM(obj_dml_data)  # no learners set, no sample splitting drawn, no fit() call
+    with pytest.raises(ValueError, match="framework is not yet initialized"):
+        _ = dml_obj.coef
+    with pytest.raises(ValueError, match=r"Predictions not available\. Call fit"):  # regex: literal dot
+        _ = dml_obj.predictions
+
+
+@pytest.mark.ci
+def test_irm_properties(fitted_dml_obj):
+    assert isinstance(fitted_dml_obj.normalize_ipw, bool)
+    assert fitted_dml_obj.normalize_ipw is False
+    assert isinstance(fitted_dml_obj.weights, dict)
+    assert "weights" in fitted_dml_obj.weights
+    assert fitted_dml_obj.ps_processor is not None
+    assert fitted_dml_obj.ps_processor_config is not None
diff --git a/doubleml/irm/tests/test_irm_scalar_vs_irm.py b/doubleml/irm/tests/test_irm_scalar_vs_irm.py
new file mode 100644
index 00000000..196385e8
--- /dev/null
+++ b/doubleml/irm/tests/test_irm_scalar_vs_irm.py
@@ -0,0 +1,82 @@
+"""Compare IRM scalar against the existing DoubleMLIRM implementation."""
+
+import numpy as np
+import pytest
+from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
+
+import doubleml as dml
+from doubleml.irm.datasets import make_irm_data
+from doubleml.irm.irm_scalar import IRM
+
+
+@pytest.fixture(scope="module", params=["ATE", "ATTE"])
+def score(request):
+    return request.param
+
+
+@pytest.fixture(scope="module", params=[1, 3])
+def n_rep(request):
+    return request.param
+
+
+@pytest.fixture(scope="module")
+def comparison_fixture(score, n_rep):
+    n_folds = 5
+    seed = 3141
+
+    np.random.seed(42)
+    obj_dml_data = make_irm_data(theta=0.5, n_obs=500, dim_x=20, return_type="DoubleMLData")
+
+    ml_g = RandomForestRegressor(n_estimators=10, max_depth=3, random_state=42)
+    ml_m = RandomForestClassifier(n_estimators=10, max_depth=3, random_state=42)
+
+    # Old IRM
+    np.random.seed(seed)
+    dml_old = dml.DoubleMLIRM(
+        obj_dml_data,
+        ml_g,
+        ml_m,
+        n_folds=n_folds,
+        n_rep=n_rep,
+        score=score,
+    )
+    dml_old.fit()
+
+    # New IRM scalar — share sample splits from old model for exact comparison
+    dml_new = IRM(obj_dml_data, score=score)
+    dml_new.set_learners(ml_g=ml_g, ml_m=ml_m)
+    # Copy sample splits directly to ensure identical cross-fitting structure
+    dml_new._n_folds = n_folds
+    dml_new._n_rep = n_rep
+    dml_new._smpls = dml_old.smpls
+    dml_new.fit()
+
+    return {"old": dml_old, "new": dml_new}
+
+
+@pytest.mark.ci
+def test_coef_equal(comparison_fixture):
+    old = comparison_fixture["old"]
+    new = comparison_fixture["new"]
+    np.testing.assert_allclose(new.coef, old.coef, rtol=1e-9)
+
+
+@pytest.mark.ci
+def test_se_equal(comparison_fixture):
+    old = comparison_fixture["old"]
+    new = comparison_fixture["new"]
+    np.testing.assert_allclose(new.se, old.se, rtol=1e-9)
+
+
+@pytest.mark.ci
+def test_all_coef_equal(comparison_fixture):
+    old = comparison_fixture["old"]
+    new = comparison_fixture["new"]
+    np.testing.assert_allclose(new.all_thetas, old.all_coef, rtol=1e-9)
+
+
+@pytest.mark.ci
+def test_all_se_equal(comparison_fixture):
+    old = comparison_fixture["old"]
+    new = comparison_fixture["new"]
+    np.testing.assert_allclose(new.all_ses, old.all_se, rtol=1e-9)

From 0947c9d19e640266360b9d088b5508f833f6c80d Mon Sep 17 00:00:00 2001
From: SvenKlaassen <sven.klaassen@uni-hamburg.de>
Date: Sat, 7 Feb 2026 10:25:42 +0100
Subject: [PATCH 10/38] Refactor documentation and guidelines for DoubleML,
 including coding standards, error handling, performance guidelines, and
 testing conventions.

---
 .claude/CLAUDE.md                          | 199 ++++-----------------
 .claude/agents/py-reviewer.md              |  66 +++++++
 .claude/rules/dml-scalar-test-structure.md | 135 ++++++++++++++
 .claude/rules/error-handling.md            |  91 ++++++++++
 .claude/rules/performance-guidelines.md    |  67 +++++++
 .claude/rules/py-code-conventions.md       | 196 ++++++++++++++++++++
 .claude/rules/testing-conventions.md       | 104 +++++++++++
 7 files changed, 689 insertions(+), 169 deletions(-)
 create mode 100644 .claude/agents/py-reviewer.md
 create mode 100644 .claude/rules/dml-scalar-test-structure.md
 create mode 100644 .claude/rules/error-handling.md
 create mode 100644 .claude/rules/performance-guidelines.md
 create mode 100644 .claude/rules/py-code-conventions.md
 create mode 100644 .claude/rules/testing-conventions.md

diff --git a/.claude/CLAUDE.md b/.claude/CLAUDE.md
index 6c302c11..0dc51dca 100644
--- a/.claude/CLAUDE.md
+++ b/.claude/CLAUDE.md
@@ -1,84 +1,20 @@
-# DoubleML for Python - Claude Code Memory
+# DoubleML for Python
 
-## Project Purpose
-
-DoubleML is a Python package implementing Double/Debiased Machine Learning (DML) methods for causal inference. The package provides:
+DoubleML is a Python package implementing Double/Debiased Machine Learning (DML) methods for causal inference:
 - Partially Linear Models (PLR, PLIV, PLPR, LPLR)
 - Interactive Regression Models (IRM, IIVM, APO, QTE, CVAR, SSM)
 - Difference-in-Differences estimators (DID, DIDCSBinary, DIDMulti)
 - Regression Discontinuity Design (RDD)
 
-**Documentation**: https://docs.doubleml.org
-
-## Coding Standards
-
-### Python
-- **Version**: Python 3.11+ (supports 3.11, 3.12, 3.13)
-- **Formatter**: black with line-length 127
-- **Linter**: ruff (rules: E, F, W, I)
-- **Type Checker**: mypy with `disallow_untyped_defs = true`
-- **Type hints**: Required for all functions
-- **Docstrings**: NumPy-style (see example below)
-- **Max line length**: 127 characters
-
-### NumPy Docstring Style
-```python
-def example_function(param1: int, param2: str) -> bool:
-    """
-    Short description of the function.
-
-    Parameters
-    ----------
-    param1 : int
-        Description of param1.
-    param2 : str
-        Description of param2.
-
-    Returns
-    -------
-    bool
-        Description of return value.
-
-    Raises
-    ------
-    ValueError
-        If param1 is negative.
-    """
-```
-
-### Code Quality Commands
-```bash
-# Format code
-black .
-
-# Lint code
-ruff check .
-
-# Fix linting issues
-ruff check --fix .
-
-# Type check
-mypy doubleml
-```
-
-### Pre-commit Hooks
-Pre-commit is configured with:
-- File format checks (yaml, toml)
-- Debug statement detection
-- Large file checks
-- Trailing whitespace and line ending fixes
-- black formatting
-- ruff linting with auto-fix
-
-Run pre-commit manually: `pre-commit run --all-files`
+**Docs**: https://docs.doubleml.org | **Source**: https://github.com/DoubleML/doubleml-for-py
 
-## Architecture Overview
+## Architecture
 
 ### Class Hierarchy
 ```
 DoubleMLBase (ABC)
 └─> DoubleMLScalar (ABC) - single-parameter models
-    ├─> LinearScoreMixin - closed-form solver
+    ├─> LinearScoreMixin - closed-form solver (θ = -E[ψ_b]/E[ψ_a])
     │   ├─> DoubleMLPLR
     │   ├─> DoubleMLIRM
     │   ├─> DoubleMLPLIV
@@ -89,15 +25,15 @@ DoubleMLBase (ABC)
 DoubleML - multi-parameter estimation (extends DoubleMLScalar)
 ```
 
-### Key Design Patterns
-- **Template Method**: `fit()` orchestrates; subclasses implement abstract methods
-- **Mixin Pattern**: LinearScoreMixin provides closed-form θ = -E[ψ_b]/E[ψ_a]
-- **Delegation**: DoubleMLBase delegates inference to DoubleMLFramework
+### Design Patterns
+- **Template Method**: `fit()` orchestrates; subclasses implement `_nuisance_est()`, `_get_score_elements()`
+- **Mixin Pattern**: `LinearScoreMixin` provides closed-form coefficient estimation
+- **Delegation**: `DoubleMLBase` delegates inference to `DoubleMLFramework`
 
 ### Core Files
 | File | Purpose |
 |------|---------|
-| `doubleml/double_ml_base.py` | Abstract base with properties (coef, se, summary) and inference methods |
+| `doubleml/double_ml_base.py` | Abstract base with properties (coef, se, summary) and inference |
 | `doubleml/double_ml_scalar.py` | Single-parameter estimation orchestrator |
 | `doubleml/double_ml.py` | Multi-parameter estimation with sample splitting |
 | `doubleml/double_ml_framework.py` | Statistical inference (confint, bootstrap, sensitivity) |
@@ -115,108 +51,33 @@ doubleml/
 └── tests/         # Main test directory
 ```
 
-## Testing
-
-### Run Tests
-```bash
-# Run all tests
-pytest
-
-# Run with coverage
-pytest --cov
-
-# Run specific marker (CI tests)
-pytest -m ci
-
-# Run specific test file
-pytest doubleml/tests/test_framework.py
-
-# Run tests for a specific module
-pytest doubleml/plm/tests/
-```
-
-### Test Markers
-- `ci`: Continuous integration tests for GitHub Actions
-- `ci_rdd`: RDD-specific CI tests
-
-### Test Organization
-- Each module (plm, irm, did) has its own `tests/` subdirectory
-- Test utilities in `doubleml/tests/_utils*.py`
-- Manual computation helpers verify results independently
-
-## Git Workflow
-
-### Branches
-- `main`: Main development branch
-- Feature branches for new work
-
-### Commit Format
-Use Conventional Commits:
-- `feat:` new feature
-- `fix:` bug fix
-- `docs:` documentation
-- `refactor:` code refactoring
-- `test:` adding tests
-- `chore:` maintenance
-
 ## Key Dependencies
 
-### Core
-- numpy>=2.0.0, pandas>=2.0.0, scipy>=1.7.0
-- scikit-learn>=1.6.0, statsmodels>=0.14.0
-
-### ML/Tuning
-- optuna>=4.6.0 (hyperparameter tuning)
-- joblib>=1.2.0 (parallelization)
-
-### Visualization
-- matplotlib>=3.9.0, seaborn>=0.13, plotly>=5.0.0
-
-### Development
-- pytest>=8.3.0, pytest-cov>=6.0.0
-- black>=25.1.0, ruff>=0.11.1, mypy>=1.18.0
-- xgboost>=2.1.0, lightgbm>=4.6.0 (for testing)
+**Core**: numpy>=2.0.0, pandas>=2.0.0, scipy>=1.7.0, scikit-learn>=1.6.0, statsmodels>=0.14.0
+**ML/Tuning**: optuna>=4.6.0, joblib>=1.2.0
+**Visualization**: matplotlib>=3.9.0, seaborn>=0.13, plotly>=5.0.0
+**Dev**: pytest>=8.3.0, black>=25.1.0, ruff>=0.11.1, mypy>=1.18.0, xgboost>=2.1.0, lightgbm>=4.6.0
 
-## Known Pitfalls
-
-### Type Annotations
-- MyPy is strict: `disallow_untyped_defs = true`
-- All functions need full type hints including return types
-- Use `from __future__ import annotations` for forward references
-
-### Learner Validation
-- Learners must be scikit-learn compatible (fit/predict interface)
-- Use `_check_learner()` from `doubleml/utils/_checks.py` for validation
-- Classifiers need `predict_proba()` for propensity scores
-
-### Sample Splitting
-- Cross-fitting uses `DoubleMLResampling` from `doubleml/utils/resampling.py`
-- Default is 5-fold cross-fitting with 1 repetition
-- Cluster-robust resampling available for clustered data
-
-### Score Functions
-- Linear scores use closed-form: θ = -E[ψ_b]/E[ψ_a]
-- Custom scores can be passed as callables
-- Score elements: `psi_a` (derivative), `psi_b` (moment)
+## Git Workflow
 
-### External Predictions
-- Models support external predictions via `set_external_predictions()`
-- Predictions must match sample splitting structure
+- **Main branch**: `main`
+- **Commits**: Conventional Commits — `feat:`, `fix:`, `docs:`, `refactor:`, `test:`, `chore:`
 
 ## Verification
 
 Before completing any task:
-1. Run `ruff check .` to check for linting issues
-2. Run `mypy doubleml` for type checking
-3. Run relevant tests: `pytest doubleml/path/to/tests/`
-4. Format code: `black .`
-
-## Useful Links
+```bash
+black .                    # Format
+ruff check --fix .         # Lint
+mypy doubleml              # Type check
+pytest -m ci               # Tests
+```
 
-- **Documentation**: https://docs.doubleml.org
-- **Source**: https://github.com/DoubleML/doubleml-for-py
-- **Bug Tracker**: https://github.com/DoubleML/doubleml-for-py/issues
-- **Architecture Docs**: [doc/diagrams/architecture.md](doc/diagrams/architecture.md)
+## Coding Standards
 
----
-*Update this file when Claude makes mistakes to prevent future issues.*
+Detailed conventions are in `.claude/rules/`:
+- **py-code-conventions.md** — Formatting, type hints, docstrings, naming, DML-specific patterns
+- **error-handling.md** — Exception types, validation patterns, warnings vs. errors
+- **performance-guidelines.md** — Vectorization, pre-allocation, DML computation patterns
+- **testing-conventions.md** — Markers, fixtures, assertion patterns
+- **dml-scalar-test-structure.md** — Mandatory 5-file test structure for scalar models
diff --git a/.claude/agents/py-reviewer.md b/.claude/agents/py-reviewer.md
new file mode 100644
index 00000000..dece1193
--- /dev/null
+++ b/.claude/agents/py-reviewer.md
@@ -0,0 +1,66 @@
+---
+name: py-reviewer
+description: Python code reviewer for DoubleML. Checks type safety, learner handling, score contracts, and test coverage. Use after writing or modifying Python files.
+tools: Read, Grep, Glob, Bash
+model: inherit
+---
+
+Review Python code changes against DoubleML project conventions. Report issues only — never edit source files.
+
+## Workflow
+
+1. Run `git diff --name-only HEAD~1` to identify changed files (use Bash)
+2. Read each changed `.py` file
+3. Review against the checklist below
+4. Output findings in the format specified
+
+## Review Checklist
+
+### Critical (must fix — blocks merge)
+- **Type hints**: All functions have parameter types and return types. Missing `-> None` counts.
+- **`from __future__ import annotations`**: Present when class methods reference their own type (forward refs)
+- **Learner validation**: `_check_learner()` called for every user-provided learner
+- **Learner cloning**: `clone(learner)` before `.fit()` — learners are mutable
+- **Score contract**: `_get_score_elements()` returns `{'psi_a': ..., 'psi_b': ...}` with shape `(n_obs,)`
+- **Sample splitting**: Uses `DoubleMLResampling`, never raw `KFold`
+- **Test markers**: Every test function has `@pytest.mark.ci`
+- **Exception messages**: Include expected vs. actual values (`got {value}`)
+
+### Warnings (should fix)
+- **Module docstring**: File starts with `"""..."""` describing the module
+- **NumPy-style docstrings**: Public functions/classes have Parameters + Returns sections
+- **Naming**: Classes use `DoubleML` prefix, score elements use `psi_a`/`psi_b`, stats use `theta`/`se`/`n_obs`
+- **Magic numbers**: Unexplained numeric literals (should be named constants)
+- **Vectorization**: Python loops over `n_obs`-sized arrays (should be NumPy ops)
+- **Error handling**: `_check_*` helpers from `doubleml/utils/_checks.py` used where applicable
+
+### Suggestions (nice to have)
+- **Property vs. method**: Cheap computed attributes should be `@property`, side effects should be methods
+- **Decorator usage**: `@staticmethod` for `_check_data()`, `@abstractmethod` for template hooks
+- **Class vs. instance variables**: `_LEARNER_SPECS`/`_VALID_SCORES` should be class-level
+
+### Intentionally Acceptable (do NOT flag)
+- `Any` type for scikit-learn estimators and learner objects
+- `E721` type comparisons (`type(x) == Y`) — intentionally allowed by ruff config
+- Test files without type annotations — excluded from mypy
+- `# type: ignore` when suppressing third-party library issues (not own code)
+
+## Output Format
+
+```markdown
+## Code Review: `<filename>`
+
+### Critical
+- **line N**: [issue description]. Fix: `<concrete code fix>`
+
+### Warnings
+- **line N**: [issue description]. Consider: `<suggestion>`
+
+### Suggestions
+- **line N**: [issue description]
+
+### Summary
+[1-2 sentences: overall assessment, number of issues by severity]
+```
+
+Review each changed file separately. If no issues found, state "No issues found" for that file.
diff --git a/.claude/rules/dml-scalar-test-structure.md b/.claude/rules/dml-scalar-test-structure.md
new file mode 100644
index 00000000..8ee372ae
--- /dev/null
+++ b/.claude/rules/dml-scalar-test-structure.md
@@ -0,0 +1,135 @@
+# DoubleMLScalar Test Structure
+
+> **Apply when**: Implementing a new model in the `DoubleMLScalar` hierarchy.
+> **Source**: Derived from `doc/diagrams/testing_structure.md`.
+
+## Required Test Files
+
+Every scalar model `<model>` in module `<module>/` requires **5 test files** in `doubleml/<module>/tests/`:
+
+| File | Purpose |
+|------|---------|
+| `test_<model>_scalar.py` | Core estimation accuracy (3-sigma rule) |
+| `test_<model>_scalar_return_types.py` | Property types, shapes, API contracts |
+| `test_<model>_scalar_exceptions.py` | Input validation, error messages |
+| `test_<model>_scalar_vs_<model>.py` | Exact match with old `DoubleML<Model>` |
+| `test_<model>_scalar_external_predictions.py` | External predictions equivalence |
+
+All test functions must be marked `@pytest.mark.ci`.
+
+---
+
+## 1. Core Estimation (`test_<model>_scalar.py`)
+
+**Fixture**: Parametrize over `score` variants and model-specific options. Use `scope="module"`, `np.random.seed(3141)`, `n_obs=500`, `n_folds=5`, `n_rep=1`.
+
+**Required tests**:
+- `test_coef`: `abs(coef - true_theta) <= 3.0 * se` (when true theta matches DGP)
+  - For unknown true params (e.g., ATTE): `np.isfinite(coef)` and `abs(coef) < 10.0`
+- `test_se`: `se > 0`
+
+## 2. Return Types (`test_<model>_scalar_return_types.py`)
+
+**Constants**: `N_OBS=200`, `N_FOLDS=3`, `N_REP=2`. Single fixture fitting one model.
+
+**Required tests**:
+
+| Test | Assertion |
+|------|-----------|
+| `test_coef_type_and_shape` | `isinstance(coef, np.ndarray)`, `shape == (1,)` |
+| `test_se_type_and_shape` | `isinstance(se, np.ndarray)`, `shape == (1,)` |
+| `test_all_thetas_shape` | `shape == (1, N_REP)` |
+| `test_all_ses_shape` | `shape == (1, N_REP)` |
+| `test_summary_type` | `isinstance(summary, pd.DataFrame)`, `len == 1` |
+| `test_confint_type_and_shape` | `isinstance(ci, pd.DataFrame)`, `shape == (1, 2)` |
+| `test_psi_shape` | `shape == (N_OBS, 1, N_REP)` |
+| `test_predictions_type` | `isinstance(predictions, dict)`, each value `shape == (N_OBS, N_REP)` |
+| `test_smpls_type` | `len(smpls) == N_REP`, each has `N_FOLDS` tuples of `(train, test)` arrays |
+| `test_n_properties` | `n_obs == N_OBS`, `n_folds == N_FOLDS`, `n_rep == N_REP`, `score == expected` |
+| `test_required_learners` | Returns list of expected learner names |
+| `test_str_repr` | `str(model)` and `repr(model)` return `str` |
+| `test_get_params` | Returns dict of the learner's hyperparameters (e.g., contains `n_estimators`) |
+| `test_set_params` | Modifies and confirms learner parameter change |
+| `test_before_fit_raises` | Accessing `coef`/`se` before `fit()` raises `ValueError` |
+
+## 3. Exceptions (`test_<model>_scalar_exceptions.py`)
+
+**Common exception tests** (required for all models):
+
+| Test | Input | Expected |
+|------|-------|----------|
+| `test_exception_data` | Non-DoubleMLData | `TypeError` |
+| `test_exception_score` | Invalid score string | `ValueError` |
+| `test_exception_n_folds` | `n_folds < 2` | `ValueError` |
+| `test_exception_n_rep` | `n_rep < 1` | `ValueError` |
+| `test_exception_fit_nuisance_without_smpls` | Fit before `draw_sample_splitting()` | `ValueError` |
+| `test_exception_estimate_causal_without_predictions` | Estimate before `fit_nuisance_models()` | `ValueError` |
+| `test_exception_missing_learner` | `fit()` without required learners | `ValueError` |
+| `test_exception_invalid_learner` | Class instead of instance | `TypeError` |
+
+**Model-specific exceptions** to add per model:
+- PLR: multiple treatments, `ml_g` warning for partialling out
+- IRM: non-binary treatment, `ml_m` must be classifier, `normalize_ipw` type
+
+Always use `pytest.raises(Error, match=r"regex pattern")`.
+
+## 4. Comparison (`test_<model>_scalar_vs_<model>.py`)
+
+**Fixture**: Parametrize `score` and `n_rep` (use `[1, 3]`).
+
+**Critical pattern**: Share sample splits from old model:
+```python
+dml_new._smpls = dml_old.smpls  # Old/new consume random state differently
+```
+
+**Required tests** — all use `np.testing.assert_allclose(..., rtol=1e-9)`:
+- `test_coef_equal`: `new.coef` vs `old.coef`
+- `test_se_equal`: `new.se` vs `old.se`
+- `test_all_coef_equal`: `new.all_thetas` vs `old.all_coef` (note: property name differs!)
+- `test_all_se_equal`: `new.all_ses` vs `old.all_se`
+
+## 5. External Predictions (`test_<model>_scalar_external_predictions.py`)
+
+**Fixture**: Parametrize `score`, `n_rep` (`[1, 3]`), and one `set_ml_x_ext` bool fixture per learner.
+
+**Pattern**:
+1. Fit reference model normally
+2. Extract `dml_ref.predictions['ml_x']` for external learners
+3. Fit test model with `dml_ext._smpls = dml_ref.smpls` and `fit(external_predictions=...)`
+
+**Required tests** — use `math.isclose(a, b, rel_tol=1e-9, abs_tol=1e-4)` (not `assert_allclose`):
+- `test_coef`: Reference vs. external
+- `test_se`: Reference vs. external
+
+`math.isclose` with `abs_tol=1e-4` is used because small numerical differences accumulate when mixing external and fitted predictions.
+
+---
+
+## Assertion Tolerance Summary
+
+| Context | Method | Why |
+|---------|--------|-----|
+| Core estimation | `abs(coef - true) <= 3.0 * se` | Statistical 3-sigma |
+| Backward compatibility | `assert_allclose(rtol=1e-9)` | Must be identical |
+| External predictions | `math.isclose(rel_tol=1e-9, abs_tol=1e-4)` | Numerical accumulation |
+
+## New Model Checklist
+
+### Implementation
+- [ ] Inherits from `LinearScoreMixin` (or `NonLinearScoreMixin`)
+- [ ] `_LEARNER_SPECS` class variable defined
+- [ ] `required_learners` property (score-dependent list)
+- [ ] `set_learners()` with model-specific kwargs
+- [ ] `_check_data()` static method
+- [ ] `draw_sample_splitting()` (override if stratification needed)
+- [ ] `_nuisance_est()` per-fold estimation
+- [ ] `_get_score_elements()` returns `{'psi_a': ..., 'psi_b': ...}`
+
+### Tests
+- [ ] All 5 test files created and pass: `pytest doubleml/<module>/tests/test_<model>_scalar*.py -v -m ci`
+- [ ] Old tests still pass: `pytest doubleml/<module>/tests/ -v`
+
+### Quality
+- [ ] `black doubleml/<module>/`
+- [ ] `ruff check doubleml/<module>/`
+- [ ] `mypy doubleml/<module>/`
diff --git a/.claude/rules/error-handling.md b/.claude/rules/error-handling.md
new file mode 100644
index 00000000..285be7c0
--- /dev/null
+++ b/.claude/rules/error-handling.md
@@ -0,0 +1,91 @@
+# Error Handling — DoubleML
+
+> **Apply when**: Adding input validation, raising exceptions, or writing `pytest.raises` tests.
+
+## Exception Type Mapping
+
+| Situation | Exception | Example |
+|-----------|-----------|---------|
+| Invalid parameter value | `ValueError` | `n_folds < 2`, unknown `score` |
+| Wrong argument type | `TypeError` | Non-`DoubleMLData` passed, class instead of instance |
+| Property accessed before `fit()` | `ValueError` | `model.coef` before fitting |
+| Wrong method call order | `ValueError` | `fit_nuisance_models()` before `draw_sample_splitting()` |
+
+## Validation Patterns
+
+### Use Project Helpers
+
+Always use validation functions from `doubleml/utils/_checks.py`:
+
+```python
+from doubleml.utils._checks import _check_learner, _check_score, _check_finite_predictions
+
+# Learner validation (checks sklearn compatibility, instance vs class)
+self._learner_ml_l = _check_learner(ml_l, 'ml_l', regressor=True, classifier=False)
+
+# Score validation
+_check_score(score, valid_scores=['IV-type', 'partialling out'], allow_callable=True)
+```
+
+### Fail Fast — Validate in Constructor and Setters
+
+```python
+def __init__(self, data: DoubleMLData, score: str = "ATE") -> None:
+    self._check_data(data)  # Validate immediately
+
+    if score not in self._VALID_SCORES:
+        raise ValueError(f"score must be one of {self._VALID_SCORES}, got '{score}'")
+```
+
+### Error Messages Must Include Expected vs. Actual
+
+```python
+# Good: specific and actionable
+raise ValueError(f"n_folds must be at least 2, got {n_folds}")
+raise TypeError(
+    "ml_m must be a classifier with predict_proba(). "
+    f"Got {type(ml_m).__name__}. Did you pass a class instead of an instance?"
+)
+
+# Bad: vague
+raise ValueError("Invalid input")
+```
+
+### Method Call Order Validation
+
+```python
+def fit_nuisance_models(self) -> None:
+    if self._smpls is None:
+        raise ValueError("Sample splitting has not been drawn. Call draw_sample_splitting() first.")
+
+def estimate_causal_parameters(self) -> None:
+    if self._predictions is None:
+        raise ValueError("Nuisance models not fitted. Call fit_nuisance_models() first, or use fit().")
+```
+
+## Warnings vs. Exceptions
+
+- **Exception**: Input is invalid, execution cannot continue
+- **`warnings.warn()`**: Input is valid but may cause poor results
+
+```python
+# Warn on extreme propensity scores (valid but risky)
+if np.any((propensity < 1e-12) | (propensity > 1 - 1e-12)):
+    warnings.warn(
+        "Propensity scores close to 0 or 1 (eps=1e-12). "
+        f"Trimming at {self._trimming_threshold}.",
+        UserWarning
+    )
+```
+
+## Testing Exceptions
+
+Always use `match=` with regex to verify the error message:
+
+```python
+@pytest.mark.ci
+def test_exception_invalid_score():
+    msg = r"score must be one of .*, got 'invalid'"
+    with pytest.raises(ValueError, match=msg):
+        DoubleMLPLR(data, score='invalid')
+```
diff --git a/.claude/rules/performance-guidelines.md b/.claude/rules/performance-guidelines.md
new file mode 100644
index 00000000..6854eaba
--- /dev/null
+++ b/.claude/rules/performance-guidelines.md
@@ -0,0 +1,67 @@
+# Performance Guidelines — DoubleML
+
+> **Apply when**: Writing nuisance estimation, score computation, or any code operating on `(n_obs,)` or `(n_obs, n_rep)` arrays.
+
+## Core Rules
+
+1. **Vectorize** — Use NumPy array operations, never Python loops over observations
+2. **Pre-allocate** — Create output arrays at full size before filling per-fold
+3. **Clone before fit** — `clone(learner).fit(X, y)` — learners are mutable
+4. **Profile first** — Don't optimize without measuring
+
+## DoubleML-Specific Patterns
+
+### Nuisance Estimation (Per-Fold)
+
+```python
+# Pre-allocate prediction arrays
+predictions = {
+    'ml_l': np.zeros((n_obs, n_rep)),
+    'ml_m': np.zeros((n_obs, n_rep)),
+}
+
+for i_rep, smpl in enumerate(smpls):
+    for train_idx, test_idx in smpl:
+        # Clone learner (mutable!), fit, predict in one chain
+        predictions['ml_l'][test_idx, i_rep] = (
+            clone(self._learner_ml_l).fit(X[train_idx], y[train_idx]).predict(X[test_idx])
+        )
+```
+
+### Score Computation
+
+```python
+# Vectorized — operates on full arrays
+psi_a = -d_res * d_res          # (n_obs,)
+psi_b = d_res * (y - ml_g_hat)  # (n_obs,)
+theta = -np.mean(psi_b) / np.mean(psi_a)
+```
+
+### Propensity Scores
+
+```python
+# predict_proba returns (n_obs, 2) — take column 1
+propensity = clone(self._learner_ml_m).fit(X_train, d_train).predict_proba(X_test)[:, 1]
+
+# Clip in one vectorized operation
+propensity = np.clip(propensity, self._trimming_threshold, 1 - self._trimming_threshold)
+```
+
+### Matrix Operations
+
+```python
+# Use lstsq, not manual inversion
+beta = np.linalg.lstsq(X, y, rcond=None)[0]
+
+# Not: beta = np.linalg.inv(X.T @ X) @ X.T @ y  (numerically unstable)
+```
+
+## Anti-Patterns
+
+| Don't | Do Instead |
+|-------|-----------|
+| `for i in range(n_obs): result[i] = ...` | `result = vectorized_op(array)` |
+| `np.append(result, value)` in a loop | Pre-allocate `np.zeros(n)`, fill by index |
+| `df.apply(lambda x: ...)` | `df['col'] ** 2` or `np.log(df['col'])` |
+| `KFold(n_splits=5)` | `DoubleMLResampling(n_folds=5, ...)` |
+| `np.linalg.inv(X.T @ X) @ X.T @ y` | `np.linalg.lstsq(X, y, rcond=None)[0]` |
diff --git a/.claude/rules/py-code-conventions.md b/.claude/rules/py-code-conventions.md
new file mode 100644
index 00000000..4344a3a0
--- /dev/null
+++ b/.claude/rules/py-code-conventions.md
@@ -0,0 +1,196 @@
+# Python Code Conventions — DoubleML
+
+> **Apply when**: Writing or modifying any Python file in `doubleml/`.
+
+## Tooling (from `pyproject.toml`)
+
+| Tool | Config | Command |
+|------|--------|---------|
+| **black** | line-length=127, preview=true, target py310-313 | `black .` |
+| **ruff** | rules E,F,W,I; ignores E721; target py312 | `ruff check .` / `ruff check --fix .` |
+| **mypy** | `disallow_untyped_defs=true`, `no_implicit_optional=true`, excludes tests | `mypy doubleml` |
+| **pre-commit** | black + ruff + trailing whitespace + debug statements | `pre-commit run --all-files` |
+
+## File Structure
+
+Every new or modified Python file must start with a **module-level docstring**. Do not add copyright headers, author/date stamps, or file paths — git tracks all of that.
+
+### Module Docstring Patterns
+
+Match the existing codebase style depending on file type:
+
+```python
+# Implementation files — one sentence: what the module contains
+"""Partially Linear Regression (PLR) model based on the DoubleMLScalar hierarchy."""
+
+# __init__.py files — Sphinx :mod: reference
+"""The :mod:`doubleml.plm` module implements double machine learning estimates based on partially linear models."""
+
+# Test files — one sentence: what is being tested
+"""Compare PLR scalar against the existing DoubleMLPLR implementation."""
+```
+
+### Full File Header (implementation files)
+
+```python
+"""Partially Linear Regression (PLR) model based on the DoubleMLScalar hierarchy."""
+from __future__ import annotations  # needed when class methods return Self/own type
+
+from typing import Any, Optional
+
+import numpy as np
+import pandas as pd
+from sklearn.base import clone
+
+from doubleml.double_ml_scalar import DoubleMLScalar
+from doubleml.utils._checks import _check_learner
+```
+
+Import order (enforced by ruff/isort): standard library, third-party, local.
+
+### `from __future__ import annotations`
+
+Not required in every file. Use it when a class references its own type in annotations (forward reference). Since the project targets Python 3.10+, `list[int]`, `dict[str, T]`, and `X | Y` unions work natively without it.
+
+## Type Hints
+
+All functions require complete type annotations including return types.
+
+```python
+def _nuisance_est(self, smpls: list[tuple[np.ndarray, np.ndarray]], n_rep: int = 1) -> dict[str, np.ndarray]:
+```
+
+- Use `-> None` for functions without return value
+- Use `X | None` (native on Python 3.10+, no `__future__` import needed) or `Optional[X]` for nullable params
+- `Any` is acceptable for scikit-learn estimators and dynamic learner objects
+- Never suppress valid errors with `# type: ignore` — fix the type instead
+
+## Docstrings (NumPy Style)
+
+Required sections: **summary**, **Parameters**, **Returns**. Optional: **Raises**, **Examples**, **Notes**.
+
+```python
+def _get_score_elements(self, psi_predictions: dict[str, np.ndarray]) -> dict[str, np.ndarray]:
+    """
+    Compute score elements from nuisance predictions.
+
+    Parameters
+    ----------
+    psi_predictions : dict[str, np.ndarray]
+        Dictionary with keys ``'ml_l'``, ``'ml_m'`` containing predictions of shape ``(n_obs,)``.
+
+    Returns
+    -------
+    dict[str, np.ndarray]
+        Dictionary with keys ``'psi_a'`` (derivative) and ``'psi_b'`` (moment condition).
+    """
+```
+
+Use `` :class:`~doubleml.DoubleMLData` `` for Sphinx cross-references. Use `.. math::` blocks for formulas.
+
+## Naming Conventions
+
+| Element | Convention | Example |
+|---------|-----------|---------|
+| Modules | `snake_case` | `double_ml_plr.py` |
+| Classes | `PascalCase` with `DoubleML` prefix | `DoubleMLPLR` |
+| Methods/functions | `snake_case` | `fit_nuisance_models()` |
+| Private methods | `_leading_underscore` | `_nuisance_est()` |
+| Class variables | `_UPPER_SNAKE` | `_LEARNER_SPECS` |
+| Constants | `UPPER_SNAKE` | `DEFAULT_N_FOLDS` |
+| Statistical notation | Conventional names | `theta`, `se`, `psi_a`, `psi_b`, `n_obs`, `n_folds` |
+
+## Class Design Patterns
+
+### Property vs. Method
+
+- **`@property`**: Computed attributes that are cheap and feel like data — `coef`, `se`, `summary`, `predictions`, `n_obs`, `n_folds`, `n_rep`, `score`
+- **Methods**: Actions with side effects or expensive computation — `fit()`, `confint()`, `bootstrap()`, `draw_sample_splitting()`
+- **`fit()` returns `self`** to enable chaining
+
+### Class Variables vs. Instance Variables
+
+- **Class variable**: Shared metadata — `_LEARNER_SPECS`, `_VALID_SCORES`
+- **Instance variable**: Per-object state — `_dml_data`, `_smpls`, `_predictions`
+
+### Decorators
+
+- `@staticmethod` for stateless validation: `_check_data()`
+- `@property` for computed attributes: `coef`, `se`
+- `@abstractmethod` for template method hooks: `_nuisance_est()`, `_get_score_elements()`
+
+### Score Function Contract
+
+`_get_score_elements()` must return `dict[str, np.ndarray]` with:
+- `'psi_a'`: Score derivative, shape `(n_obs,)`
+- `'psi_b'`: Moment condition, shape `(n_obs,)`
+
+Linear scores use closed-form: `theta = -mean(psi_b) / mean(psi_a)`.
+
+## DoubleML-Specific Patterns
+
+### Learner Handling
+
+```python
+# Always validate learners with _check_learner
+self._learner_ml_l = _check_learner(ml_l, 'ml_l', regressor=True, classifier=False)
+
+# Always clone before fitting (learners are mutable)
+fitted_learner = clone(self._learner_ml_l).fit(X_train, y_train)
+
+# Classifiers need predict_proba for propensity scores
+propensity = fitted_learner.predict_proba(X_test)[:, 1]
+```
+
+### Sample Splitting
+
+Always use `DoubleMLResampling`, never raw `KFold`:
+
+```python
+from doubleml.utils.resampling import DoubleMLResampling
+resampling = DoubleMLResampling(n_folds=5, n_rep=1, n_obs=n_obs)
+```
+
+### Vectorized Score Computation
+
+```python
+# Correct: vectorized NumPy operations
+psi_a = -d_res * d_res          # shape: (n_obs,)
+psi_b = d_res * (y - ml_g_hat)  # shape: (n_obs,)
+
+# Wrong: Python loops over observations
+```
+
+Pre-allocate prediction arrays: `np.zeros((n_obs, n_rep))`.
+
+### Error Messages
+
+Include expected vs. actual values. Use `_check_*` helpers from `doubleml/utils/_checks.py`.
+
+```python
+if score not in self._VALID_SCORES:
+    raise ValueError(f"score must be one of {self._VALID_SCORES}, got '{score}'")
+```
+
+Use `warnings.warn()` for non-fatal issues (e.g., extreme propensity scores), exceptions for invalid input.
+
+## Verification Checklist
+
+Before completing any task, run:
+
+```bash
+black .                    # Format
+ruff check --fix .         # Lint + auto-fix
+mypy doubleml              # Type check
+pytest -m ci               # Tests
+```
+
+Check:
+- [ ] All functions have type hints and return types
+- [ ] File starts with a module-level docstring (one sentence, matching file type pattern)
+- [ ] Public functions/classes have NumPy-style docstrings
+- [ ] Learners validated with `_check_learner()`, cloned with `clone()` before fitting
+- [ ] Score elements named `psi_a`/`psi_b`, shapes are `(n_obs,)`
+- [ ] No `print()`, `breakpoint()`, or debug statements
+- [ ] No magic numbers — use named constants
+- [ ] Sample splitting uses `DoubleMLResampling`, not raw `KFold`
diff --git a/.claude/rules/testing-conventions.md b/.claude/rules/testing-conventions.md
new file mode 100644
index 00000000..0508ae42
--- /dev/null
+++ b/.claude/rules/testing-conventions.md
@@ -0,0 +1,104 @@
+# Testing Conventions — DoubleML
+
+> **Apply when**: Writing or modifying test files in `doubleml/**/tests/`.
+
+## Test Organization
+
+```
+doubleml/<module>/tests/
+├── __init__.py
+├── conftest.py              # Shared fixtures
+├── test_<model>.py          # Legacy model tests
+├── test_<model>_scalar.py   # Scalar model tests (see dml-scalar-test-structure.md)
+└── ...
+```
+
+Package-level tests and utilities live in `doubleml/tests/` (with `_utils*.py` helpers).
+
+## Markers
+
+**All test functions must be marked `@pytest.mark.ci`** — this is the CI gate.
+
+```python
+@pytest.mark.ci
+def test_coef_accuracy(fitted_model):
+    ...
+
+@pytest.mark.ci
+@pytest.mark.parametrize("score", ["IV-type", "partialling out"])
+def test_score_variants(score):
+    ...
+```
+
+Other markers: `@pytest.mark.ci_rdd` for RDD-specific tests.
+
+Run: `pytest -m ci` (CI), `pytest doubleml/plm/tests/` (module), `pytest -k "plr and scalar"` (pattern).
+
+## Fixtures
+
+### Use `scope="module"` for Expensive Operations
+
+Model fitting is expensive. Fit once, share across tests:
+
+```python
+@pytest.fixture(scope="module")
+def fitted_model():
+    np.random.seed(42)
+    data = make_plr_data(n_obs=200)
+    dml_obj = DoubleMLPLRScalar(data, score="IV-type")
+    dml_obj.set_learners(ml_l=Lasso(), ml_m=Lasso())
+    dml_obj.draw_sample_splitting(n_folds=3, n_rep=2)
+    dml_obj.fit()
+    return dml_obj
+```
+
+### Parametrize for Multiple Scenarios
+
+```python
+@pytest.fixture(scope="module", params=["IV-type", "partialling out"])
+def score(request):
+    return request.param
+```
+
+Each combination creates one fixture instance shared across all tests in the module.
+
+## Assertion Patterns
+
+| Context | Pattern | Tolerance |
+|---------|---------|-----------|
+| Statistical accuracy | `abs(coef - true_theta) <= 3.0 * se` | 3-sigma rule |
+| Backward compatibility | `np.testing.assert_allclose(new, old, rtol=1e-9)` | Exact match |
+| External predictions | `math.isclose(a, b, rel_tol=1e-9, abs_tol=1e-4)` | Small tolerance |
+| Exception messages | `pytest.raises(ValueError, match=r"regex")` | Regex search on message |
+| Types and shapes | `isinstance(x, np.ndarray)`, `x.shape == (n,)` | Exact |
+
+### Key: Always Use `match=` for Exception Tests
+
+```python
+msg = r"score must be one of .*, got 'invalid'"
+with pytest.raises(ValueError, match=msg):
+    DoubleMLPLR(data, score='invalid')
+```
+
+## Reproducibility
+
+- **Always seed**: `np.random.seed(42)` at the start of data generation
+- **Share sample splits** in comparison tests: `dml_new._smpls = dml_old.smpls`
+  (Old and new implementations consume random state differently during `__init__`)
+- **Small data for speed**: `n_obs=200`, `n_folds=3` for return type / exception tests
+- **Larger data for accuracy**: `n_obs=500`, `n_folds=5` for estimation tests
+
+## Naming
+
+- Files: `test_<model>.py`, `test_<model>_scalar.py`, `test_<model>_scalar_exceptions.py`
+- Functions: `test_<what>` — e.g., `test_coef_within_3_sigma`, `test_exception_invalid_score`
+- Docstrings: Every test function gets a one-line docstring explaining what it verifies
+
+## Checklist
+
+- [ ] All tests marked `@pytest.mark.ci`
+- [ ] Fixtures use `scope="module"` for model fitting
+- [ ] Exception tests use `match=` with regex
+- [ ] Seeds set for reproducibility
+- [ ] Test functions have descriptive names and docstrings
+- [ ] New scalar models have all 5 required test files (see `dml-scalar-test-structure.md`)

From 48ae3a9d2004dfc33fed40a43e6ae41b313a91a9 Mon Sep 17 00:00:00 2001
From: SvenKlaassen <sven.klaassen@uni-hamburg.de>
Date: Sat, 7 Feb 2026 13:16:57 +0100
Subject: [PATCH 11/38] Refactor IRM class type hints to use built-in types and
 improve code clarity

---
 doubleml/irm/irm_scalar.py | 38 +++++++++++++++++++-------------------
 1 file changed, 19 insertions(+), 19 deletions(-)

diff --git a/doubleml/irm/irm_scalar.py b/doubleml/irm/irm_scalar.py
index f8c0ee53..83a7f47b 100644
--- a/doubleml/irm/irm_scalar.py
+++ b/doubleml/irm/irm_scalar.py
@@ -4,7 +4,7 @@
 
 from __future__ import annotations
 
-from typing import ClassVar, Dict, List, Optional, Self, Union
+from typing import ClassVar, Self
 
 import numpy as np
 from sklearn.base import clone
@@ -78,7 +78,7 @@ class IRM(LinearScoreMixin):
     """
 
     # Define learner specifications for IRM
-    _LEARNER_SPECS: ClassVar[Dict[str, LearnerSpec]] = {
+    _LEARNER_SPECS: ClassVar[dict[str, LearnerSpec]] = {
         "ml_g0": LearnerSpec("ml_g0", allow_regressor=True, allow_classifier=True, binary_data_check="outcome"),
         "ml_g1": LearnerSpec("ml_g1", allow_regressor=True, allow_classifier=True, binary_data_check="outcome"),
         "ml_m": LearnerSpec("ml_m", allow_regressor=False, allow_classifier=True),
@@ -88,11 +88,11 @@ def __init__(
         self,
         obj_dml_data: DoubleMLData,
         score: str = "ATE",
-        ml_g: Optional[object] = None,
-        ml_m: Optional[object] = None,
+        ml_g: object | None = None,
+        ml_m: object | None = None,
         normalize_ipw: bool = False,
-        weights: Optional[Union[np.ndarray, Dict]] = None,
-        ps_processor_config: Optional[PSProcessorConfig] = None,
+        weights: np.ndarray | dict | None = None,
+        ps_processor_config: PSProcessorConfig | None = None,
     ):
         """
         Initialize IRM model.
@@ -165,12 +165,12 @@ def ps_processor(self) -> PSProcessor:
         return self._ps_processor
 
     @property
-    def weights(self) -> Dict:
+    def weights(self) -> dict:
         """Weights for weighted ATE/ATTE."""
         return self._weights
 
     @property
-    def required_learners(self) -> List[str]:
+    def required_learners(self) -> list[str]:
         """Required learners for IRM: ml_g0, ml_g1, and ml_m."""
         return ["ml_g0", "ml_g1", "ml_m"]
 
@@ -178,10 +178,10 @@ def required_learners(self) -> List[str]:
 
     def set_learners(
         self,
-        ml_g: Optional[object] = None,
-        ml_g0: Optional[object] = None,
-        ml_g1: Optional[object] = None,
-        ml_m: Optional[object] = None,
+        ml_g: object | None = None,
+        ml_g0: object | None = None,
+        ml_g1: object | None = None,
+        ml_m: object | None = None,
     ) -> Self:
         """
         Set the learners for nuisance estimation.
@@ -269,7 +269,7 @@ def _nuisance_est(
         test_idx: np.ndarray,
         i_rep: int,
         i_fold: int,
-        external_predictions: Optional[Dict[str, np.ndarray]] = None,
+        external_predictions: dict[str, np.ndarray] | None = None,
     ) -> None:
         x = self._dml_data.x
         y = self._dml_data.y
@@ -308,7 +308,7 @@ def _nuisance_est(
 
     # ==================== Score Elements ====================
 
-    def _get_score_elements(self) -> Dict[str, np.ndarray]:
+    def _get_score_elements(self) -> dict[str, np.ndarray]:
         y = self._dml_data.y
         d = self._dml_data.d
 
@@ -371,17 +371,18 @@ def _check_data(obj_dml_data: object) -> None:
                 "needs to be specified as treatment variable."
             )
 
-    def _initialize_weights(self, weights: Optional[Union[np.ndarray, Dict]]) -> None:
+    def _initialize_weights(self, weights: np.ndarray | dict | None) -> None:
         """Initialize weights storage."""
         if weights is None:
             weights = np.ones(self._dml_data.n_obs)
         if isinstance(weights, np.ndarray):
             self._weights = {"weights": weights}
         else:
-            assert isinstance(weights, dict)
+            if not isinstance(weights, dict):
+                raise TypeError(f"weights must be np.ndarray or dict, got {type(weights).__name__}")
             self._weights = weights
 
-    def _get_weights(self, m_hat: np.ndarray) -> tuple:
+    def _get_weights(self, m_hat: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
         """
         Compute weights and weights_bar for score computation.
 
@@ -408,8 +409,7 @@ def _get_weights(self, m_hat: np.ndarray) -> tuple:
             else:
                 weights_bar = weights.copy()
         else:
-            # ATTE
-            assert self.score == "ATTE"
+            # ATTE (score validated in __init__)
             w = self._weights["weights"]
             subgroup = w * d
             subgroup_probability = np.mean(subgroup)

From 1886a7db00a21c3caecdf76b1f890b253de1cfea Mon Sep 17 00:00:00 2001
From: SvenKlaassen <sven.klaassen@uni-hamburg.de>
Date: Mon, 9 Feb 2026 09:13:23 +0100
Subject: [PATCH 12/38] Refactor DoubleMLScalar to enhance sample splitting
 functionality; add tests for cluster-based sample splitting and external
 prediction validation.

---
 doubleml/double_ml_scalar.py                  | 228 +++++++++++++++---
 doubleml/tests/test_scalar_cluster.py         | 134 ++++++++++
 doubleml/tests/test_scalar_ext_predictions.py |  45 ++++
 .../tests/test_scalar_set_sample_splitting.py |  62 +++++
 4 files changed, 431 insertions(+), 38 deletions(-)
 create mode 100644 doubleml/tests/test_scalar_cluster.py
 create mode 100644 doubleml/tests/test_scalar_ext_predictions.py
 create mode 100644 doubleml/tests/test_scalar_set_sample_splitting.py

diff --git a/doubleml/double_ml_scalar.py b/doubleml/double_ml_scalar.py
index a0010005..0f69a3e1 100644
--- a/doubleml/double_ml_scalar.py
+++ b/doubleml/double_ml_scalar.py
@@ -3,7 +3,7 @@
 """
 
 from abc import ABC, abstractmethod
-from typing import ClassVar, Dict, List, Optional, Self
+from typing import ClassVar, Self
 
 import numpy as np
 
@@ -11,8 +11,9 @@
 from .double_ml_base import DoubleMLBase
 from .double_ml_framework import DoubleMLCore as DoubleMLCoreData
 from .double_ml_framework import DoubleMLFramework
+from .utils._checks import _check_sample_splitting
 from .utils._learner import LearnerInfo, LearnerSpec, validate_learner
-from .utils.resampling import DoubleMLResampling
+from .utils.resampling import DoubleMLClusterResampling, DoubleMLResampling
 
 
 class DoubleMLScalar(DoubleMLBase, ABC):
@@ -45,7 +46,7 @@ class DoubleMLScalar(DoubleMLBase, ABC):
     """
 
     # Subclasses define all possible learners for the model
-    _LEARNER_SPECS: ClassVar[Dict[str, LearnerSpec]]
+    _LEARNER_SPECS: ClassVar[dict[str, LearnerSpec]]
 
     def __init__(
         self,
@@ -81,24 +82,26 @@ def __init__(
         self._score = score
 
         # Learner storage: single dict for all learner state
-        self._learners: Dict[str, LearnerInfo] = {}
+        self._learners: dict[str, LearnerInfo] = {}
 
         # Resampling parameters (set via draw_sample_splitting)
-        self._n_folds: Optional[int] = None
-        self._n_rep: Optional[int] = None
-        self._smpls: Optional[List] = None
+        self._n_folds: int | None = None
+        self._n_folds_per_cluster: int | None = None
+        self._n_rep: int | None = None
+        self._smpls: list | None = None
+        self._smpls_cluster: list | None = None
 
         # Initialize storage for predictions and results
-        self._predictions: Optional[Dict[str, np.ndarray]] = None
-        self._all_thetas: Optional[np.ndarray] = None
-        self._all_ses: Optional[np.ndarray] = None
-        self._psi: Optional[np.ndarray] = None
-        self._psi_deriv: Optional[np.ndarray] = None
-        self._var_scaling_factors: Optional[np.ndarray] = None
+        self._predictions: dict[str, np.ndarray] | None = None
+        self._all_thetas: np.ndarray | None = None
+        self._all_ses: np.ndarray | None = None
+        self._psi: np.ndarray | None = None
+        self._psi_deriv: np.ndarray | None = None
+        self._var_scaling_factors: np.ndarray | None = None
 
         # For iteration (used during fit)
-        self._i_rep: Optional[int] = None
-        self._i_fold: Optional[int] = None
+        self._i_rep: int | None = None
+        self._i_fold: int | None = None
 
     # ==================== Properties ====================
 
@@ -153,7 +156,7 @@ def score(self) -> str:
         return self._score
 
     @property
-    def predictions(self) -> Dict[str, np.ndarray]:
+    def predictions(self) -> dict[str, np.ndarray]:
         """
         Predictions from nuisance models.
 
@@ -172,7 +175,7 @@ def predictions(self) -> Dict[str, np.ndarray]:
         return self._predictions
 
     @property
-    def smpls(self) -> List:
+    def smpls(self) -> list:
         """
         Sample splitting indices used for cross-fitting.
 
@@ -185,9 +188,28 @@ def smpls(self) -> List:
             raise ValueError("Sample splitting has not been performed. Call draw_sample_splitting() first.")
         return self._smpls
 
+    @property
+    def smpls_cluster(self) -> list | None:
+        """
+        Cluster-based sample splitting indices used for cross-fitting.
+
+        Returns
+        -------
+        list or None
+            List of cluster sample splitting indices for each repetition, or None.
+
+        Raises
+        ------
+        ValueError
+            If cluster data is used but cluster splitting is not available.
+        """
+        if self._dml_data.is_cluster_data and self._smpls_cluster is None:
+            raise ValueError("Cluster sample splitting has not been provided. Call set_sample_splitting() first.")
+        return self._smpls_cluster
+
     @property
     @abstractmethod
-    def required_learners(self) -> List[str]:
+    def required_learners(self) -> list[str]:
         """
         Names of the required learners for current configuration.
 
@@ -202,7 +224,7 @@ def required_learners(self) -> List[str]:
         pass
 
     @property
-    def learners(self) -> Dict[str, object]:
+    def learners(self) -> dict[str, object]:
         """
         Access registered learner objects by name.
 
@@ -213,7 +235,7 @@ def learners(self) -> Dict[str, object]:
         """
         return {name: info.learner for name, info in self._learners.items()}
 
-    def get_params(self, learner_name: str) -> Dict:
+    def get_params(self, learner_name: str) -> dict:
         """
         Get parameters of a registered learner.
 
@@ -316,8 +338,8 @@ def fit(
         self,
         n_folds: int = 5,
         n_rep: int = 1,
-        n_jobs_cv: Optional[int] = None,
-        external_predictions: Optional[Dict[str, np.ndarray]] = None,
+        n_jobs_cv: int | None = None,
+        external_predictions: dict[str, np.ndarray] | None = None,
         **kwargs,
     ) -> Self:
         """
@@ -358,8 +380,8 @@ def fit(
 
     def fit_nuisance_models(
         self,
-        n_jobs_cv: Optional[int] = None,
-        external_predictions: Optional[Dict[str, np.ndarray]] = None,
+        n_jobs_cv: int | None = None,
+        external_predictions: dict[str, np.ndarray] | None = None,
     ) -> Self:
         """
         Fit nuisance models via cross-fitting.
@@ -389,6 +411,9 @@ def fit_nuisance_models(
         if self._smpls is None:
             raise ValueError("Sample splitting has not been initialized. Call draw_sample_splitting() first.")
 
+        if external_predictions is not None:
+            self._check_external_predictions(external_predictions)
+
         # Validate that all required learners are available
         self._check_learners_available(external_predictions)
 
@@ -485,18 +510,93 @@ def draw_sample_splitting(self, n_folds: int = 5, n_rep: int = 1) -> Self:
         if not isinstance(n_rep, int) or n_rep < 1:
             raise ValueError(f"n_rep must be an integer >= 1. Got {n_rep}.")
 
-        self._n_folds = n_folds
-        self._n_rep = n_rep
+        if self._dml_data.is_cluster_data:
+            self._n_folds_per_cluster = n_folds
+            self._n_rep = n_rep
+            self._n_folds = n_folds**self._dml_data.n_cluster_vars
+
+            resampler = DoubleMLClusterResampling(
+                n_folds=n_folds,
+                n_rep=n_rep,
+                n_obs=self._n_obs,
+                n_cluster_vars=self._dml_data.n_cluster_vars,
+                cluster_vars=self._dml_data.cluster_vars,
+            )
+            self._smpls, self._smpls_cluster = resampler.split_samples()
+        else:
+            self._n_folds = n_folds
+            self._n_folds_per_cluster = None
+            self._n_rep = n_rep
+
+            # Create resampler
+            resampler = DoubleMLResampling(
+                n_folds=n_folds,
+                n_rep=n_rep,
+                n_obs=self._n_obs,
+            )
+
+            # Generate splits
+            self._smpls = resampler.split_samples()
+            self._smpls_cluster = None
+
+        self._reset_fit_state()
+
+        return self
+
+    def set_sample_splitting(self, all_smpls: list, all_smpls_cluster: list | None = None) -> Self:
+        """
+        Set the sample splitting for DoubleMLScalar models.
+
+        Parameters
+        ----------
+        all_smpls : list
+            List of tuples (train_ind, test_ind) per fold, or list of lists of tuples
+            for repeated sample splitting.
+        all_smpls_cluster : list or None
+            Nested list for cluster sample splitting. Required for cluster data.
+            Default is ``None``.
+
+        Returns
+        -------
+        self : Self
 
-        # Create resampler
-        resampler = DoubleMLResampling(
-            n_folds=n_folds,
-            n_rep=n_rep,
+        Raises
+        ------
+        TypeError
+            If ``all_smpls`` is not a list or if tuple shorthand is used.
+        ValueError
+            If the partition is invalid or cluster splitting is missing.
+        """
+        if isinstance(all_smpls, tuple):
+            raise TypeError("all_smpls must be a list of folds; tuple shorthand is not supported for DoubleMLScalar.")
+        if not isinstance(all_smpls, list):
+            raise TypeError(f"all_smpls must be of list type. {str(all_smpls)} of type {str(type(all_smpls))} was passed.")
+
+        smpls, smpls_cluster, n_rep, n_folds = _check_sample_splitting(
+            all_smpls,
+            all_smpls_cluster,
+            self._dml_data,
+            self._dml_data.is_cluster_data,
             n_obs=self._n_obs,
         )
 
-        # Generate splits
-        self._smpls = resampler.split_samples()
+        self._smpls = smpls
+        self._smpls_cluster = smpls_cluster
+        self._n_rep = n_rep
+        self._n_folds = n_folds
+        if self._dml_data.is_cluster_data:
+            n_cluster_vars = self._dml_data.n_cluster_vars
+            n_folds_per_cluster = int(round(n_folds ** (1.0 / n_cluster_vars)))
+            if n_folds_per_cluster**n_cluster_vars != n_folds:
+                raise ValueError(
+                    "Invalid cluster sample splitting. n_folds must be a power of n_folds_per_cluster "
+                    "for the number of cluster variables."
+                )
+            self._n_folds_per_cluster = n_folds_per_cluster
+        else:
+            self._n_folds_per_cluster = None
+
+        self._reset_fit_state()
 
         return self
 
@@ -514,7 +614,7 @@ def _initialize_result_arrays(self) -> None:
         self._psi = np.zeros((n_obs, n_thetas, n_rep))
         self._psi_deriv = np.zeros((n_obs, n_thetas, n_rep))
 
-    def _initialize_predictions_dict(self) -> Dict[str, np.ndarray]:
+    def _initialize_predictions_dict(self) -> dict[str, np.ndarray]:
         """
         Initialize dictionary for storing predictions.
 
@@ -531,7 +631,37 @@ def _initialize_predictions_dict(self) -> Dict[str, np.ndarray]:
         n_rep = self.n_rep
         return {name: np.full((n_obs, n_rep), np.nan) for name in self.required_learners}
 
-    def _check_learners_available(self, external_predictions: Optional[Dict[str, np.ndarray]] = None) -> None:
+    def _check_external_predictions(self, external_predictions: dict[str, np.ndarray]) -> None:
+        """
+        Validate external prediction arrays.
+
+        Parameters
+        ----------
+        external_predictions : dict
+            Dictionary of external predictions keyed by learner name.
+
+        Raises
+        ------
+        TypeError
+            If a value is not a numpy array.
+        ValueError
+            If a key is not a required learner or a value does not match shape (n_obs, n_rep).
+        """
+        n_obs = self._n_obs
+        n_rep = self.n_rep
+        required = set(self.required_learners)
+
+        for key, values in external_predictions.items():
+            if key not in required:
+                raise ValueError(
+                    f"External predictions provided for unknown learner '{key}'. " f"Allowed learners: {sorted(required)}."
+                )
+            if not isinstance(values, np.ndarray):
+                raise TypeError(f"External predictions for '{key}' must be a numpy array. Got {type(values).__name__}.")
+            if values.shape != (n_obs, n_rep):
+                raise ValueError(f"External predictions for '{key}' must have shape ({n_obs}, {n_rep}). Got {values.shape}.")
+
+    def _check_learners_available(self, external_predictions: dict[str, np.ndarray] | None = None) -> None:
         """
         Validate that all required learners are set or covered by external predictions.
 
@@ -567,13 +697,23 @@ def _construct_framework(self) -> DoubleMLFramework:
         # Both already in framework shape: (n_obs, n_thetas, n_rep)
         scaled_psi = np.divide(self._psi, np.mean(self._psi_deriv, axis=0, keepdims=True))
 
+        cluster_dict = None
+        if self._dml_data.is_cluster_data:
+            cluster_dict = {
+                "smpls": self.smpls,
+                "smpls_cluster": self.smpls_cluster,
+                "cluster_vars": self._dml_data.cluster_vars,
+                "n_folds_per_cluster": self._n_folds_per_cluster,
+            }
+
         # Create data container (no transpose needed - already in framework convention!)
         framework_data = DoubleMLCoreData(
             all_thetas=self._all_thetas,  # (n_thetas, n_rep)
             all_ses=self._all_ses,  # (n_thetas, n_rep)
             var_scaling_factors=self._var_scaling_factors,  # (n_thetas,)
             scaled_psi=scaled_psi,  # (n_obs, n_thetas, n_rep)
-            is_cluster_data=False,  # TODO: Add cluster data support
+            is_cluster_data=self._dml_data.is_cluster_data,
+            cluster_dict=cluster_dict,
         )
 
         # Create and return framework
@@ -582,6 +722,18 @@ def _construct_framework(self) -> DoubleMLFramework:
             treatment_names=self._dml_data.d_cols,
         )
 
+    def _reset_fit_state(self) -> None:
+        """Clear fit-dependent state after changing the sample splitting."""
+        self._predictions = None
+        self._framework = None
+        self._all_thetas = None
+        self._all_ses = None
+        self._psi = None
+        self._psi_deriv = None
+        self._var_scaling_factors = None
+        self._i_rep = None
+        self._i_fold = None
+
     # ==================== Abstract Methods (Must be Implemented by Subclasses) ====================
 
     @abstractmethod
@@ -591,7 +743,7 @@ def _nuisance_est(
         test_idx: np.ndarray,
         i_rep: int,
         i_fold: int,
-        external_predictions: Optional[Dict[str, np.ndarray]] = None,
+        external_predictions: dict[str, np.ndarray] | None = None,
     ) -> None:
         """
         Estimate nuisance parameters for one fold.
@@ -621,7 +773,7 @@ def _nuisance_est(
         pass
 
     @abstractmethod
-    def _get_score_elements(self) -> Dict[str, np.ndarray]:
+    def _get_score_elements(self) -> dict[str, np.ndarray]:
         """
         Compute score function elements from nuisance predictions.
 
@@ -647,7 +799,7 @@ def _get_score_elements(self) -> Dict[str, np.ndarray]:
         pass
 
     @abstractmethod
-    def _est_causal_pars_and_se(self, psi_elements: Dict[str, np.ndarray]) -> None:
+    def _est_causal_pars_and_se(self, psi_elements: dict[str, np.ndarray]) -> None:
         """
         Estimate causal parameters and standard errors from score elements.
 
diff --git a/doubleml/tests/test_scalar_cluster.py b/doubleml/tests/test_scalar_cluster.py
new file mode 100644
index 00000000..0ad05f68
--- /dev/null
+++ b/doubleml/tests/test_scalar_cluster.py
@@ -0,0 +1,134 @@
+"""Test cluster-based sample splitting for scalar PLR models."""
+
+import numpy as np
+import pytest
+from sklearn.ensemble import RandomForestRegressor
+from sklearn.linear_model import Lasso, LinearRegression
+
+from doubleml import DoubleMLData
+from doubleml.plm.datasets import make_plr_CCDDHNR2018
+from doubleml.plm.plr_scalar import PLR
+
+from ._utils import _clone
+
+
+@pytest.fixture(
+    scope="module", params=[RandomForestRegressor(max_depth=2, n_estimators=10), LinearRegression(), Lasso(alpha=0.1)]
+)
+def learner(request):
+    return request.param
+
+
+@pytest.mark.ci
+def test_scalar_plr_cluster_set_sample_splitting():
+    """Check set_sample_splitting consistency for scalar PLR cluster data."""
+    np.random.seed(3141)
+    n_i = 5
+    n_j = 6
+    n_obs = n_i * n_j
+
+    df = make_plr_CCDDHNR2018(n_obs=n_obs, return_type="DataFrame")
+    x_cols = [col for col in df.columns if col.startswith("X")]
+
+    df["cluster_i"] = np.repeat(np.arange(n_i), n_j)
+    df["cluster_j"] = np.tile(np.arange(n_j), n_i)
+
+    dml_data = DoubleMLData(df, y_col="y", d_cols="d", x_cols=x_cols, cluster_cols=["cluster_i", "cluster_j"])
+
+    ml_l = LinearRegression()
+    ml_m = LinearRegression()
+
+    dml_obj = PLR(dml_data)
+    dml_obj.set_learners(ml_l=ml_l, ml_m=ml_m)
+    dml_obj.draw_sample_splitting(n_folds=2, n_rep=2)
+    dml_obj.fit()
+
+    dml_obj_ext = PLR(dml_data)
+    dml_obj_ext.set_learners(ml_l=LinearRegression(), ml_m=LinearRegression())
+    dml_obj_ext.set_sample_splitting(all_smpls=dml_obj.smpls, all_smpls_cluster=dml_obj.smpls_cluster)
+    dml_obj_ext.fit()
+
+    assert np.isclose(dml_obj.coef[0], dml_obj_ext.coef[0], rtol=1e-9, atol=1e-4)
+    assert np.isclose(dml_obj.se[0], dml_obj_ext.se[0], rtol=1e-9, atol=1e-4)
+
+
+@pytest.fixture(scope="module")
+def dml_plr_scalar_cluster_with_index(generate_data1, learner):
+    """Fit scalar PLR with and without clustering for comparison."""
+    # In the one-way cluster case with exactly one observation per cluster, results match with and without clustering.
+    n_folds = 2
+
+    data = generate_data1
+    x_cols = data.columns[data.columns.str.startswith("X")].tolist()
+
+    ml_l = _clone(learner)
+    ml_m = _clone(learner)
+
+    obj_dml_data = DoubleMLData(data, "y", ["d"], x_cols)
+    np.random.seed(3141)
+    dml_plr_obj = PLR(obj_dml_data)
+    dml_plr_obj.set_learners(ml_l=ml_l, ml_m=ml_m)
+    dml_plr_obj.draw_sample_splitting(n_folds=n_folds)
+    dml_plr_obj.fit()
+
+    df = data.reset_index()
+    dml_cluster_data = DoubleMLData(df, y_col="y", d_cols="d", x_cols=x_cols, cluster_cols="index")
+    np.random.seed(3141)
+    dml_plr_cluster_obj = PLR(dml_cluster_data)
+    dml_plr_cluster_obj.set_learners(ml_l=_clone(learner), ml_m=_clone(learner))
+    dml_plr_cluster_obj.draw_sample_splitting(n_folds=n_folds)
+    dml_plr_cluster_obj.fit()
+
+    dml_plr_cluster_ext_smpls = PLR(dml_cluster_data)
+    dml_plr_cluster_ext_smpls.set_learners(ml_l=_clone(learner), ml_m=_clone(learner))
+    dml_plr_cluster_ext_smpls.set_sample_splitting(
+        all_smpls=dml_plr_cluster_obj.smpls,
+        all_smpls_cluster=dml_plr_cluster_obj.smpls_cluster,
+    )
+    np.random.seed(3141)
+    dml_plr_cluster_ext_smpls.fit()
+
+    res_dict = {
+        "coef": dml_plr_obj.coef,
+        "coef_manual": dml_plr_cluster_obj.coef,
+        "se": dml_plr_obj.se,
+        "se_manual": dml_plr_cluster_obj.se,
+        "coef_ext_smpls": dml_plr_cluster_ext_smpls.coef,
+        "se_ext_smpls": dml_plr_cluster_ext_smpls.se,
+    }
+
+    return res_dict
+
+
+@pytest.mark.ci
+def test_dml_plr_scalar_cluster_with_index_coef(dml_plr_scalar_cluster_with_index):
+    """Validate scalar PLR cluster coefficients match across configurations."""
+    assert np.isclose(
+        dml_plr_scalar_cluster_with_index["coef"][0],
+        dml_plr_scalar_cluster_with_index["coef_manual"][0],
+        rtol=1e-9,
+        atol=1e-4,
+    )
+    assert np.isclose(
+        dml_plr_scalar_cluster_with_index["coef"][0],
+        dml_plr_scalar_cluster_with_index["coef_ext_smpls"][0],
+        rtol=1e-9,
+        atol=1e-4,
+    )
+
+
+@pytest.mark.ci
+def test_dml_plr_scalar_cluster_with_index_se(dml_plr_scalar_cluster_with_index):
+    """Validate scalar PLR cluster standard errors match across configurations."""
+    assert np.isclose(
+        dml_plr_scalar_cluster_with_index["se"][0],
+        dml_plr_scalar_cluster_with_index["se_manual"][0],
+        rtol=1e-9,
+        atol=1e-4,
+    )
+    assert np.isclose(
+        dml_plr_scalar_cluster_with_index["se"][0],
+        dml_plr_scalar_cluster_with_index["se_ext_smpls"][0],
+        rtol=1e-9,
+        atol=1e-4,
+    )
diff --git a/doubleml/tests/test_scalar_ext_predictions.py b/doubleml/tests/test_scalar_ext_predictions.py
new file mode 100644
index 00000000..349ad25c
--- /dev/null
+++ b/doubleml/tests/test_scalar_ext_predictions.py
@@ -0,0 +1,45 @@
+"""Test external prediction validation for scalar DoubleML models."""
+
+import numpy as np
+import pytest
+from sklearn.linear_model import Lasso
+
+from doubleml.plm.datasets import make_plr_CCDDHNR2018
+from doubleml.plm.plr_scalar import PLR
+
+
+@pytest.mark.ci
+def test_scalar_external_predictions_unknown_key():
+    """Reject external predictions with unknown learner keys."""
+    np.random.seed(3141)
+    dml_data = make_plr_CCDDHNR2018(n_obs=10)
+    dml_obj = PLR(dml_data)
+    dml_obj.set_learners(ml_l=Lasso(), ml_m=Lasso())
+    dml_obj.draw_sample_splitting(n_folds=2, n_rep=1)
+
+    ext_predictions = {
+        "ml_l": np.zeros((10, 1)),
+        "ml_m": np.zeros((10, 1)),
+        "ml_unknown": np.zeros((10, 1)),
+    }
+    msg = "External predictions provided for unknown learner 'ml_unknown'"
+    with pytest.raises(ValueError, match=msg):
+        dml_obj.fit_nuisance_models(external_predictions=ext_predictions)
+
+
+@pytest.mark.ci
+def test_scalar_external_predictions_shape():
+    """Reject external predictions with incorrect shape."""
+    np.random.seed(3141)
+    dml_data = make_plr_CCDDHNR2018(n_obs=10)
+    dml_obj = PLR(dml_data)
+    dml_obj.set_learners(ml_l=Lasso(), ml_m=Lasso())
+    dml_obj.draw_sample_splitting(n_folds=2, n_rep=1)
+
+    ext_predictions = {
+        "ml_l": np.zeros((10, 2)),
+        "ml_m": np.zeros((10, 1)),
+    }
+    msg = r"External predictions for 'ml_l' must have shape \(10, 1\)"
+    with pytest.raises(ValueError, match=msg):
+        dml_obj.fit_nuisance_models(external_predictions=ext_predictions)
diff --git a/doubleml/tests/test_scalar_set_sample_splitting.py b/doubleml/tests/test_scalar_set_sample_splitting.py
new file mode 100644
index 00000000..bc9abd84
--- /dev/null
+++ b/doubleml/tests/test_scalar_set_sample_splitting.py
@@ -0,0 +1,62 @@
+"""Test sample splitting setup for scalar DoubleML models."""
+
+import numpy as np
+import pytest
+
+from doubleml.plm.datasets import make_plr_CCDDHNR2018
+from doubleml.plm.plr_scalar import PLR
+
+
+def _assert_smpls_equal(smpls0, smpls1):
+    assert len(smpls0) == len(smpls1)
+    for i_rep in range(len(smpls0)):
+        assert len(smpls0[i_rep]) == len(smpls1[i_rep])
+        for i_fold in range(len(smpls0[i_rep])):
+            assert np.array_equal(smpls0[i_rep][i_fold][0], smpls1[i_rep][i_fold][0])
+            assert np.array_equal(smpls0[i_rep][i_fold][1], smpls1[i_rep][i_fold][1])
+
+
+@pytest.mark.ci
+def test_scalar_set_sample_splitting_list():
+    """Ensure list-of-tuples splits set n_folds/n_rep correctly."""
+    np.random.seed(3141)
+    dml_data = make_plr_CCDDHNR2018(n_obs=10)
+    dml_obj = PLR(dml_data)
+
+    smpls = [([0, 1, 2, 3, 4], [5, 6, 7, 8, 9]), ([5, 6, 7, 8, 9], [0, 1, 2, 3, 4])]
+    dml_obj.set_sample_splitting(smpls)
+
+    assert dml_obj.n_folds == 2
+    assert dml_obj.n_rep == 1
+    _assert_smpls_equal([smpls], dml_obj.smpls)
+
+
+@pytest.mark.ci
+def test_scalar_set_sample_splitting_list_of_lists():
+    """Ensure list-of-list splits set repeated sample splitting correctly."""
+    np.random.seed(3141)
+    dml_data = make_plr_CCDDHNR2018(n_obs=10)
+    dml_obj = PLR(dml_data)
+
+    smpls = [
+        [([0, 1, 2, 3, 4], [5, 6, 7, 8, 9]), ([5, 6, 7, 8, 9], [0, 1, 2, 3, 4])],
+        [([0, 2, 4, 6, 8], [1, 3, 5, 7, 9]), ([1, 3, 5, 7, 9], [0, 2, 4, 6, 8])],
+    ]
+    dml_obj.set_sample_splitting(smpls)
+
+    assert dml_obj.n_folds == 2
+    assert dml_obj.n_rep == 2
+    _assert_smpls_equal(smpls, dml_obj.smpls)
+
+
+@pytest.mark.ci
+def test_scalar_set_sample_splitting_tuple_rejected():
+    """Reject tuple shorthand for scalar set_sample_splitting."""
+    np.random.seed(3141)
+    dml_data = make_plr_CCDDHNR2018(n_obs=10)
+    dml_obj = PLR(dml_data)
+
+    smpls = (np.arange(10), np.arange(10))
+    msg = "all_smpls must be a list of folds; tuple shorthand is not supported"
+    with pytest.raises(TypeError, match=msg):
+        dml_obj.set_sample_splitting(smpls)

From 0ca053d70b02082b18e9677f780f563cd31a6182 Mon Sep 17 00:00:00 2001
From: SvenKlaassen <sven.klaassen@uni-hamburg.de>
Date: Mon, 9 Feb 2026 09:13:32 +0100
Subject: [PATCH 13/38] Refactor IRM and PLR classes to reset fit state after
 updating learners; enhance tests for return types and reset behavior.

---
 .claude/rules/dml-scalar-test-structure.md    | 78 ++++++++++---------
 doubleml/irm/irm_scalar.py                    |  1 +
 .../irm/tests/test_irm_scalar_return_types.py | 46 +++++++++++
 doubleml/plm/plr_scalar.py                    |  1 +
 .../plm/tests/test_plr_scalar_return_types.py | 37 +++++++++
 5 files changed, 125 insertions(+), 38 deletions(-)

diff --git a/.claude/rules/dml-scalar-test-structure.md b/.claude/rules/dml-scalar-test-structure.md
index 8ee372ae..d6a03327 100644
--- a/.claude/rules/dml-scalar-test-structure.md
+++ b/.claude/rules/dml-scalar-test-structure.md
@@ -7,13 +7,13 @@
 
 Every scalar model `<model>` in module `<module>/` requires **5 test files** in `doubleml/<module>/tests/`:
 
-| File | Purpose |
-|------|---------|
-| `test_<model>_scalar.py` | Core estimation accuracy (3-sigma rule) |
-| `test_<model>_scalar_return_types.py` | Property types, shapes, API contracts |
-| `test_<model>_scalar_exceptions.py` | Input validation, error messages |
-| `test_<model>_scalar_vs_<model>.py` | Exact match with old `DoubleML<Model>` |
-| `test_<model>_scalar_external_predictions.py` | External predictions equivalence |
+| File                                          | Purpose                                 |
+| --------------------------------------------- | --------------------------------------- |
+| `test_<model>_scalar.py`                      | Core estimation accuracy (3-sigma rule) |
+| `test_<model>_scalar_return_types.py`         | Property types, shapes, API contracts   |
+| `test_<model>_scalar_exceptions.py`           | Input validation, error messages        |
+| `test_<model>_scalar_vs_<model>.py`           | Exact match with old `DoubleML<Model>`  |
+| `test_<model>_scalar_external_predictions.py` | External predictions equivalence        |
 
 All test functions must be marked `@pytest.mark.ci`.
 
@@ -34,38 +34,40 @@ All test functions must be marked `@pytest.mark.ci`.
 
 **Required tests**:
 
-| Test | Assertion |
-|------|-----------|
-| `test_coef_type_and_shape` | `isinstance(coef, np.ndarray)`, `shape == (1,)` |
-| `test_se_type_and_shape` | `isinstance(se, np.ndarray)`, `shape == (1,)` |
-| `test_all_thetas_shape` | `shape == (1, N_REP)` |
-| `test_all_ses_shape` | `shape == (1, N_REP)` |
-| `test_summary_type` | `isinstance(summary, pd.DataFrame)`, `len == 1` |
-| `test_confint_type_and_shape` | `isinstance(ci, pd.DataFrame)`, `shape == (1, 2)` |
-| `test_psi_shape` | `shape == (N_OBS, 1, N_REP)` |
-| `test_predictions_type` | `isinstance(predictions, dict)`, each value `shape == (N_OBS, N_REP)` |
-| `test_smpls_type` | `len(smpls) == N_REP`, each has `N_FOLDS` tuples of `(train, test)` arrays |
-| `test_n_properties` | `n_obs == N_OBS`, `n_folds == N_FOLDS`, `n_rep == N_REP`, `score == expected` |
-| `test_required_learners` | Returns list of expected learner names |
-| `test_str_repr` | `str(model)` and `repr(model)` return `str` |
-| `test_get_params` | Returns dict with learner keys |
-| `test_set_params` | Modifies and confirms learner parameter change |
-| `test_before_fit_raises` | `coef`/`se` before `fit()` raises error |
+| Test                                     | Assertion                                                                     |
+| ---------------------------------------- | ----------------------------------------------------------------------------- |
+| `test_coef_type_and_shape`               | `isinstance(coef, np.ndarray)`, `shape == (1,)`                               |
+| `test_se_type_and_shape`                 | `isinstance(se, np.ndarray)`, `shape == (1,)`                                 |
+| `test_all_thetas_shape`                  | `shape == (1, N_REP)`                                                         |
+| `test_all_ses_shape`                     | `shape == (1, N_REP)`                                                         |
+| `test_summary_type`                      | `isinstance(summary, pd.DataFrame)`, `len == 1`                               |
+| `test_confint_type_and_shape`            | `isinstance(ci, pd.DataFrame)`, `shape == (1, 2)`                             |
+| `test_psi_shape`                         | `shape == (N_OBS, 1, N_REP)`                                                  |
+| `test_predictions_type`                  | `isinstance(predictions, dict)`, each value `shape == (N_OBS, N_REP)`         |
+| `test_smpls_type`                        | `len(smpls) == N_REP`, each has `N_FOLDS` tuples of `(train, test)` arrays    |
+| `test_n_properties`                      | `n_obs == N_OBS`, `n_folds == N_FOLDS`, `n_rep == N_REP`, `score == expected` |
+| `test_required_learners`                 | Returns list of expected learner names                                        |
+| `test_str_repr`                          | `str(model)` and `repr(model)` return `str`                                   |
+| `test_get_params`                        | Returns dict with learner keys                                                |
+| `test_set_params`                        | Modifies and confirms learner parameter change                                |
+| `test_before_fit_raises`                 | `coef`/`se` before `fit()` raises error                                       |
+| `test_reset_after_set_learners`          | Updating learners clears fitted results                                       |
+| `test_reset_after_draw_sample_splitting` | Changing splits clears fitted results                                         |
 
 ## 3. Exceptions (`test_<model>_scalar_exceptions.py`)
 
 **Common exception tests** (required for all models):
 
-| Test | Input | Expected |
-|------|-------|----------|
-| `test_exception_data` | Non-DoubleMLData | `TypeError` |
-| `test_exception_score` | Invalid score string | `ValueError` |
-| `test_exception_n_folds` | `n_folds < 2` | `ValueError` |
-| `test_exception_n_rep` | `n_rep < 1` | `ValueError` |
-| `test_exception_fit_nuisance_without_smpls` | Fit before `draw_sample_splitting()` | `ValueError` |
+| Test                                                 | Input                                   | Expected     |
+| ---------------------------------------------------- | --------------------------------------- | ------------ |
+| `test_exception_data`                                | Non-DoubleMLData                        | `TypeError`  |
+| `test_exception_score`                               | Invalid score string                    | `ValueError` |
+| `test_exception_n_folds`                             | `n_folds < 2`                           | `ValueError` |
+| `test_exception_n_rep`                               | `n_rep < 1`                             | `ValueError` |
+| `test_exception_fit_nuisance_without_smpls`          | Fit before `draw_sample_splitting()`    | `ValueError` |
 | `test_exception_estimate_causal_without_predictions` | Estimate before `fit_nuisance_models()` | `ValueError` |
-| `test_exception_missing_learner` | `fit()` without required learners | `ValueError` |
-| `test_exception_invalid_learner` | Class instead of instance | `TypeError` |
+| `test_exception_missing_learner`                     | `fit()` without required learners       | `ValueError` |
+| `test_exception_invalid_learner`                     | Class instead of instance               | `TypeError`  |
 
 **Model-specific exceptions** to add per model:
 - PLR: multiple treatments, `ml_g` warning for partialling out
@@ -107,11 +109,11 @@ dml_new._smpls = dml_old.smpls  # Old/new consume random state differently
 
 ## Assertion Tolerance Summary
 
-| Context | Method | Why |
-|---------|--------|-----|
-| Core estimation | `abs(coef - true) <= 3.0 * se` | Statistical 3-sigma |
-| Backward compatibility | `assert_allclose(rtol=1e-9)` | Must be identical |
-| External predictions | `math.isclose(rel_tol=1e-9, abs_tol=1e-4)` | Numerical accumulation |
+| Context                | Method                                     | Why                    |
+| ---------------------- | ------------------------------------------ | ---------------------- |
+| Core estimation        | `abs(coef - true) <= 3.0 * se`             | Statistical 3-sigma    |
+| Backward compatibility | `assert_allclose(rtol=1e-9)`               | Must be identical      |
+| External predictions   | `math.isclose(rel_tol=1e-9, abs_tol=1e-4)` | Numerical accumulation |
 
 ## New Model Checklist
 
diff --git a/doubleml/irm/irm_scalar.py b/doubleml/irm/irm_scalar.py
index 83a7f47b..71f2f142 100644
--- a/doubleml/irm/irm_scalar.py
+++ b/doubleml/irm/irm_scalar.py
@@ -219,6 +219,7 @@ def set_learners(
             if learner is not None:
                 self._register_learner(name, learner)
 
+        self._reset_fit_state()
         return self
 
     # ==================== Sample Splitting ====================
diff --git a/doubleml/irm/tests/test_irm_scalar_return_types.py b/doubleml/irm/tests/test_irm_scalar_return_types.py
index 15eaae82..a437f49d 100644
--- a/doubleml/irm/tests/test_irm_scalar_return_types.py
+++ b/doubleml/irm/tests/test_irm_scalar_return_types.py
@@ -1,3 +1,5 @@
+"""Validate IRM scalar return types and reset behavior."""
+
 import numpy as np
 import pandas as pd
 import pytest
@@ -152,6 +154,7 @@ def test_get_params_invalid_learner(fitted_dml_obj):
 
 @pytest.mark.ci
 def test_before_fit_raises():
+    """Raise errors when accessing results before fitting."""
     np.random.seed(3141)
     dml_obj = IRM(obj_dml_data)
     with pytest.raises(ValueError, match="framework is not yet initialized"):
@@ -168,3 +171,46 @@ def test_irm_properties(fitted_dml_obj):
     assert "weights" in fitted_dml_obj.weights
     assert fitted_dml_obj.ps_processor is not None
     assert fitted_dml_obj.ps_processor_config is not None
+
+
+@pytest.mark.ci
+def test_reset_after_set_learners():
+    """Reset fitted state after updating learners."""
+    np.random.seed(3141)
+    dml_obj = IRM(obj_dml_data)
+    dml_obj.set_learners(
+        ml_g=RandomForestRegressor(n_estimators=10, max_depth=3, random_state=42),
+        ml_m=RandomForestClassifier(n_estimators=10, max_depth=3, random_state=42),
+    )
+    dml_obj.draw_sample_splitting(n_folds=N_FOLDS, n_rep=N_REP)
+    dml_obj.fit()
+
+    dml_obj.set_learners(
+        ml_g=RandomForestRegressor(n_estimators=10, max_depth=3, random_state=42),
+        ml_m=RandomForestClassifier(n_estimators=10, max_depth=3, random_state=42),
+    )
+
+    with pytest.raises(ValueError, match="framework is not yet initialized"):
+        _ = dml_obj.coef
+    with pytest.raises(ValueError, match="Predictions not available. Call fit"):
+        _ = dml_obj.predictions
+
+
+@pytest.mark.ci
+def test_reset_after_draw_sample_splitting():
+    """Reset fitted state after changing sample splits."""
+    np.random.seed(3141)
+    dml_obj = IRM(obj_dml_data)
+    dml_obj.set_learners(
+        ml_g=RandomForestRegressor(n_estimators=10, max_depth=3, random_state=42),
+        ml_m=RandomForestClassifier(n_estimators=10, max_depth=3, random_state=42),
+    )
+    dml_obj.draw_sample_splitting(n_folds=N_FOLDS, n_rep=N_REP)
+    dml_obj.fit()
+
+    dml_obj.draw_sample_splitting(n_folds=N_FOLDS, n_rep=N_REP)
+
+    with pytest.raises(ValueError, match="framework is not yet initialized"):
+        _ = dml_obj.coef
+    with pytest.raises(ValueError, match="Predictions not available. Call fit"):
+        _ = dml_obj.predictions
diff --git a/doubleml/plm/plr_scalar.py b/doubleml/plm/plr_scalar.py
index ef18fb68..e4aeaa51 100644
--- a/doubleml/plm/plr_scalar.py
+++ b/doubleml/plm/plr_scalar.py
@@ -128,6 +128,7 @@ def set_learners(
 
         # IV-type: clone ml_l to ml_g if only one provided
         self._handle_iv_cloning()
+        self._reset_fit_state()
         return self
 
     def _handle_iv_cloning(self) -> None:
diff --git a/doubleml/plm/tests/test_plr_scalar_return_types.py b/doubleml/plm/tests/test_plr_scalar_return_types.py
index 09832931..39fe77e6 100644
--- a/doubleml/plm/tests/test_plr_scalar_return_types.py
+++ b/doubleml/plm/tests/test_plr_scalar_return_types.py
@@ -1,3 +1,5 @@
+"""Validate PLR scalar return types and reset behavior."""
+
 import numpy as np
 import pandas as pd
 import pytest
@@ -148,8 +150,43 @@ def test_get_params_invalid_learner(fitted_dml_obj):
 
 @pytest.mark.ci
 def test_before_fit_raises():
+    """Raise errors when accessing results before fitting."""
+    np.random.seed(3141)
+    dml_obj = PLR(obj_dml_data)
+    with pytest.raises(ValueError, match="framework is not yet initialized"):
+        _ = dml_obj.coef
+    with pytest.raises(ValueError, match="Predictions not available. Call fit"):
+        _ = dml_obj.predictions
+
+
+@pytest.mark.ci
+def test_reset_after_set_learners():
+    """Reset fitted state after updating learners."""
     np.random.seed(3141)
     dml_obj = PLR(obj_dml_data)
+    dml_obj.set_learners(ml_l=LinearRegression(), ml_m=LinearRegression())
+    dml_obj.draw_sample_splitting(n_folds=N_FOLDS, n_rep=N_REP)
+    dml_obj.fit()
+
+    dml_obj.set_learners(ml_l=LinearRegression(), ml_m=LinearRegression())
+
+    with pytest.raises(ValueError, match="framework is not yet initialized"):
+        _ = dml_obj.coef
+    with pytest.raises(ValueError, match="Predictions not available. Call fit"):
+        _ = dml_obj.predictions
+
+
+@pytest.mark.ci
+def test_reset_after_draw_sample_splitting():
+    """Reset fitted state after changing sample splits."""
+    np.random.seed(3141)
+    dml_obj = PLR(obj_dml_data)
+    dml_obj.set_learners(ml_l=LinearRegression(), ml_m=LinearRegression())
+    dml_obj.draw_sample_splitting(n_folds=N_FOLDS, n_rep=N_REP)
+    dml_obj.fit()
+
+    dml_obj.draw_sample_splitting(n_folds=N_FOLDS, n_rep=N_REP)
+
     with pytest.raises(ValueError, match="framework is not yet initialized"):
         _ = dml_obj.coef
     with pytest.raises(ValueError, match="Predictions not available. Call fit"):

From 33c8b01bdb93a188f18abc4624ed4c63d6a86e10 Mon Sep 17 00:00:00 2001
From: SvenKlaassen <sven.klaassen@uni-hamburg.de>
Date: Mon, 9 Feb 2026 11:21:37 +0100
Subject: [PATCH 14/38] Add copilot documentation for code style, error
 handling, performance, testing, and scalar model test structure

---
 .github/copilot-instructions.md   | 46 +++++++++++++++++++++++++++++++
 .github/copilot/README.md         | 10 +++++++
 .github/copilot/code-style.md     | 10 +++++++
 .github/copilot/error-handling.md |  9 ++++++
 .github/copilot/performance.md    |  9 ++++++
 .github/copilot/scalar-tests.md   | 12 ++++++++
 .github/copilot/testing.md        |  9 ++++++
 7 files changed, 105 insertions(+)
 create mode 100644 .github/copilot-instructions.md
 create mode 100644 .github/copilot/README.md
 create mode 100644 .github/copilot/code-style.md
 create mode 100644 .github/copilot/error-handling.md
 create mode 100644 .github/copilot/performance.md
 create mode 100644 .github/copilot/scalar-tests.md
 create mode 100644 .github/copilot/testing.md

diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md
new file mode 100644
index 00000000..eaf517e5
--- /dev/null
+++ b/.github/copilot-instructions.md
@@ -0,0 +1,46 @@
+# Copilot instructions for DoubleML (Python)
+
+These instructions guide code and test authoring. Keep changes aligned with the detailed rules in `.claude/rules/` to avoid drift.
+
+## Scope
+- Authoring guidance only (not reviewer-only rules).
+- Prefer concise, targeted edits; avoid unrelated refactors.
+
+## Code style and design
+- Start each Python file with a one-sentence module docstring.
+- Use NumPy-style docstrings for public APIs (summary, Parameters, Returns).
+- Require full type hints, including return types.
+- Follow DoubleML patterns: `_check_*` validation helpers, `DoubleMLResampling` for splitting, clone learners before fit.
+- Keep score outputs named `psi_a` and `psi_b` with shape `(n_obs,)`.
+
+## Error handling
+- Use `ValueError` for invalid values and `TypeError` for wrong types.
+- Validate early (constructors and setters) with clear expected/actual messages.
+- In tests, always use `pytest.raises(..., match=...)`.
+
+## Testing
+- Mark all tests with `@pytest.mark.ci`.
+- Use module-scoped fixtures for expensive fits.
+- Seed random generators for reproducibility.
+- For new scalar models, follow the required 5-file test structure.
+
+## Verification (lightweight)
+Run relevant checks when changes warrant it:
+- `black .`
+- `ruff check --fix .`
+- `mypy doubleml`
+- `pytest -m ci`
+
+## References (canonical rules)
+- Code style: .claude/rules/py-code-conventions.md
+- Error handling: .claude/rules/error-handling.md
+- Performance: .claude/rules/performance-guidelines.md
+- Testing: .claude/rules/testing-conventions.md
+- Scalar test structure: .claude/rules/dml-scalar-test-structure.md
+
+## Optional reference docs
+- .github/copilot/code-style.md
+- .github/copilot/error-handling.md
+- .github/copilot/performance.md
+- .github/copilot/testing.md
+- .github/copilot/scalar-tests.md
diff --git a/.github/copilot/README.md b/.github/copilot/README.md
new file mode 100644
index 00000000..26b7d41c
--- /dev/null
+++ b/.github/copilot/README.md
@@ -0,0 +1,10 @@
+# Copilot reference docs
+
+These short guides summarize the canonical rules in .claude.
+Use them as a quick pointer, not a source of truth.
+
+- code-style.md -> .claude/rules/py-code-conventions.md
+- error-handling.md -> .claude/rules/error-handling.md
+- performance.md -> .claude/rules/performance-guidelines.md
+- testing.md -> .claude/rules/testing-conventions.md
+- scalar-tests.md -> .claude/rules/dml-scalar-test-structure.md
diff --git a/.github/copilot/code-style.md b/.github/copilot/code-style.md
new file mode 100644
index 00000000..66b6cc0c
--- /dev/null
+++ b/.github/copilot/code-style.md
@@ -0,0 +1,10 @@
+# Code style (summary)
+
+- Module-level docstring required (one sentence).
+- NumPy-style docstrings for public APIs.
+- Full type hints, including return types.
+- Use built-in generics (list[int], dict[str, T]) for Python 3.10+.
+- Follow DoubleML patterns: `_check_*` helpers, `DoubleMLResampling`, clone learners.
+- Score outputs use `psi_a` and `psi_b` with shape `(n_obs,)`.
+
+Canonical: .claude/rules/py-code-conventions.md
diff --git a/.github/copilot/error-handling.md b/.github/copilot/error-handling.md
new file mode 100644
index 00000000..1e76440f
--- /dev/null
+++ b/.github/copilot/error-handling.md
@@ -0,0 +1,9 @@
+# Error handling (summary)
+
+- Invalid values -> `ValueError`; wrong types -> `TypeError`.
+- Validate early in constructors and setters.
+- Error messages include expected vs actual values.
+- Prefer `_check_*` helpers from doubleml/utils/_checks.py.
+- Tests must use `pytest.raises(..., match=...)`.
+
+Canonical: .claude/rules/error-handling.md
diff --git a/.github/copilot/performance.md b/.github/copilot/performance.md
new file mode 100644
index 00000000..cba0a4e1
--- /dev/null
+++ b/.github/copilot/performance.md
@@ -0,0 +1,9 @@
+# Performance (summary)
+
+- Vectorize array operations; avoid Python loops over observations.
+- Pre-allocate `(n_obs, n_rep)` arrays before filling.
+- Clone learners before fit (mutable estimators).
+- Use `DoubleMLResampling`, not raw `KFold`.
+- Prefer `np.linalg.lstsq` over manual inversion.
+
+Canonical: .claude/rules/performance-guidelines.md
diff --git a/.github/copilot/scalar-tests.md b/.github/copilot/scalar-tests.md
new file mode 100644
index 00000000..f9e6a983
--- /dev/null
+++ b/.github/copilot/scalar-tests.md
@@ -0,0 +1,12 @@
+# Scalar model test structure (summary)
+
+New DoubleMLScalar models require five test files:
+- test_<model>_scalar.py
+- test_<model>_scalar_return_types.py
+- test_<model>_scalar_exceptions.py
+- test_<model>_scalar_vs_<model>.py
+- test_<model>_scalar_external_predictions.py
+
+See details and required assertions in the canonical rule.
+
+Canonical: .claude/rules/dml-scalar-test-structure.md
diff --git a/.github/copilot/testing.md b/.github/copilot/testing.md
new file mode 100644
index 00000000..7f21f957
--- /dev/null
+++ b/.github/copilot/testing.md
@@ -0,0 +1,9 @@
+# Testing (summary)
+
+- Mark all tests with `@pytest.mark.ci`.
+- Use module-scoped fixtures for expensive fits.
+- Seed RNGs for reproducibility.
+- Use `match=` in exception tests.
+- Follow naming: `test_<model>_scalar*.py` for scalar models.
+
+Canonical: .claude/rules/testing-conventions.md

From 45c5f4879e03c37075617622bcd15cd8e37d4d60 Mon Sep 17 00:00:00 2001
From: SvenKlaassen <sven.klaassen@uni-hamburg.de>
Date: Mon, 9 Feb 2026 14:15:29 +0100
Subject: [PATCH 15/38] Enhance DoubleMLScalar and IRM classes for stratified
 sample splitting; update tests for consistency

---
 doubleml/double_ml_scalar.py          |  4 ++-
 doubleml/irm/irm_scalar.py            | 44 ++-------------------------
 doubleml/tests/test_scalar_cluster.py |  8 +++--
 3 files changed, 12 insertions(+), 44 deletions(-)

diff --git a/doubleml/double_ml_scalar.py b/doubleml/double_ml_scalar.py
index 0f69a3e1..783b79ce 100644
--- a/doubleml/double_ml_scalar.py
+++ b/doubleml/double_ml_scalar.py
@@ -90,6 +90,7 @@ def __init__(
         self._n_rep: int | None = None
         self._smpls: list | None = None
         self._smpls_cluster: list | None = None
+        self._stratify_variable: np.ndarray | None = None  # For stratified sample splitting
 
         # Initialize storage for predictions and results
         self._predictions: dict[str, np.ndarray] | None = None
@@ -528,11 +529,12 @@ def draw_sample_splitting(self, n_folds: int = 5, n_rep: int = 1) -> Self:
             self._n_folds_per_cluster = None
             self._n_rep = n_rep
 
-            # Create resampler
+            # Create resampler (with optional stratification)
             resampler = DoubleMLResampling(
                 n_folds=n_folds,
                 n_rep=n_rep,
                 n_obs=self._n_obs,
+                stratify=self._stratify_variable,
             )
 
             # Generate splits
diff --git a/doubleml/irm/irm_scalar.py b/doubleml/irm/irm_scalar.py
index 71f2f142..70c03ff4 100644
--- a/doubleml/irm/irm_scalar.py
+++ b/doubleml/irm/irm_scalar.py
@@ -16,7 +16,6 @@
 from ..utils._learner import LearnerSpec, predict_nuisance
 from ..utils._propensity_score import _propensity_score_adjustment
 from ..utils.propensity_score_processing import PSProcessor, PSProcessorConfig
-from ..utils.resampling import DoubleMLResampling
 
 
 class IRM(LinearScoreMixin):
@@ -126,6 +125,9 @@ def __init__(
             score=score,
         )
 
+        # Enable stratified sample splitting for binary treatment
+        self._stratify_variable = self._dml_data.d
+
         # Normalize IPW
         if not isinstance(normalize_ipw, bool):
             raise TypeError("Normalization indicator has to be boolean. " f"Object of type {str(type(normalize_ipw))} passed.")
@@ -222,46 +224,6 @@ def set_learners(
         self._reset_fit_state()
         return self
 
-    # ==================== Sample Splitting ====================
-
-    def draw_sample_splitting(self, n_folds: int = 5, n_rep: int = 1) -> Self:
-        """
-        Draw stratified sample splitting for cross-fitting.
-
-        Uses stratified K-fold splitting to ensure each fold contains both
-        treatment groups (D=0 and D=1).
-
-        Parameters
-        ----------
-        n_folds : int, optional
-            Number of folds for cross-fitting. Default is 5.
-        n_rep : int, optional
-            Number of repetitions for sample splitting. Default is 1.
-
-        Returns
-        -------
-        self : IRM
-            The estimator with initialized sample splits.
-        """
-        if not isinstance(n_folds, int) or n_folds < 2:
-            raise ValueError(f"n_folds must be an integer >= 2. Got {n_folds}.")
-        if not isinstance(n_rep, int) or n_rep < 1:
-            raise ValueError(f"n_rep must be an integer >= 1. Got {n_rep}.")
-
-        self._n_folds = n_folds
-        self._n_rep = n_rep
-
-        # Create stratified resampler
-        resampler = DoubleMLResampling(
-            n_folds=n_folds,
-            n_rep=n_rep,
-            n_obs=self._n_obs,
-            stratify=self._dml_data.d,
-        )
-
-        self._smpls = resampler.split_samples()
-        return self
-
     # ==================== Nuisance Estimation ====================
 
     def _nuisance_est(
diff --git a/doubleml/tests/test_scalar_cluster.py b/doubleml/tests/test_scalar_cluster.py
index 0ad05f68..4388ec74 100644
--- a/doubleml/tests/test_scalar_cluster.py
+++ b/doubleml/tests/test_scalar_cluster.py
@@ -3,7 +3,7 @@
 import numpy as np
 import pytest
 from sklearn.ensemble import RandomForestRegressor
-from sklearn.linear_model import Lasso, LinearRegression
+from sklearn.linear_model import LinearRegression
 
 from doubleml import DoubleMLData
 from doubleml.plm.datasets import make_plr_CCDDHNR2018
@@ -13,7 +13,11 @@
 
 
 @pytest.fixture(
-    scope="module", params=[RandomForestRegressor(max_depth=2, n_estimators=10), LinearRegression(), Lasso(alpha=0.1)]
+    scope="module",
+    params=[
+        RandomForestRegressor(max_depth=2, n_estimators=10, random_state=42),
+        LinearRegression(),
+    ],
 )
 def learner(request):
     return request.param

From 0051c77173f96d5c8b73fa6a121337ad0c0149aa Mon Sep 17 00:00:00 2001
From: SvenKlaassen <sven.klaassen@uni-hamburg.de>
Date: Fri, 13 Feb 2026 09:11:00 +0100
Subject: [PATCH 16/38] add post_nuisance checks

---
 doubleml/double_ml_scalar.py                  |  6 +++
 doubleml/irm/irm_scalar.py                    | 30 +++++++++++++-
 .../irm/tests/test_irm_scalar_exceptions.py   | 32 +++++++++++++++
 doubleml/plm/plr_scalar.py                    | 40 +++++++++++++++++++
 .../plm/tests/test_plr_scalar_exceptions.py   | 28 ++++++++++++-
 doubleml/utils/_checks.py                     | 19 ++++++---
 6 files changed, 147 insertions(+), 8 deletions(-)

diff --git a/doubleml/double_ml_scalar.py b/doubleml/double_ml_scalar.py
index 783b79ce..958e4b3e 100644
--- a/doubleml/double_ml_scalar.py
+++ b/doubleml/double_ml_scalar.py
@@ -446,6 +446,9 @@ def fit_nuisance_models(
                     external_predictions=external_predictions,
                 )
 
+        # Post-nuisance prediction checks (model-specific)
+        self._post_nuisance_checks()
+
         return self
 
     def estimate_causal_parameters(self) -> Self:
@@ -738,6 +741,9 @@ def _reset_fit_state(self) -> None:
 
     # ==================== Abstract Methods (Must be Implemented by Subclasses) ====================
 
+    def _post_nuisance_checks(self) -> None:
+        """Post-nuisance prediction validation hook. Override in subclasses for model-specific checks."""
+
     @abstractmethod
     def _nuisance_est(
         self,
diff --git a/doubleml/irm/irm_scalar.py b/doubleml/irm/irm_scalar.py
index 70c03ff4..57f4be91 100644
--- a/doubleml/irm/irm_scalar.py
+++ b/doubleml/irm/irm_scalar.py
@@ -12,7 +12,7 @@
 
 from ..data.base_data import DoubleMLData
 from ..double_ml_linear_score import LinearScoreMixin
-from ..utils._checks import _check_score, _check_weights
+from ..utils._checks import _check_binary_predictions, _check_finite_predictions, _check_score, _check_weights
 from ..utils._learner import LearnerSpec, predict_nuisance
 from ..utils._propensity_score import _propensity_score_adjustment
 from ..utils.propensity_score_processing import PSProcessor, PSProcessorConfig
@@ -226,6 +226,34 @@ def set_learners(
 
     # ==================== Nuisance Estimation ====================
 
+    def _post_nuisance_checks(self) -> None:
+        """Check predictions for validity after cross-fitting completes."""
+        for i_rep in range(self.n_rep):
+            # After full K-fold cross-fitting all observations are test observations
+            # in exactly one fold, so the full prediction array is populated.
+
+            # Skip checks for learners with external predictions (not registered in _learners)
+            if "ml_g0" in self._learners:
+                _check_finite_predictions(self._predictions["ml_g0"][:, i_rep], self._learners["ml_g0"].learner, "ml_g0")
+                if self._dml_data.binary_outcome:
+                    _check_binary_predictions(
+                        self._predictions["ml_g0"][:, i_rep],
+                        self._learners["ml_g0"].learner,
+                        "ml_g0",
+                        self._dml_data.y_col,
+                    )
+            if "ml_g1" in self._learners:
+                _check_finite_predictions(self._predictions["ml_g1"][:, i_rep], self._learners["ml_g1"].learner, "ml_g1")
+                if self._dml_data.binary_outcome:
+                    _check_binary_predictions(
+                        self._predictions["ml_g1"][:, i_rep],
+                        self._learners["ml_g1"].learner,
+                        "ml_g1",
+                        self._dml_data.y_col,
+                    )
+            if "ml_m" in self._learners:
+                _check_finite_predictions(self._predictions["ml_m"][:, i_rep], self._learners["ml_m"].learner, "ml_m")
+
     def _nuisance_est(
         self,
         train_idx: np.ndarray,
diff --git a/doubleml/irm/tests/test_irm_scalar_exceptions.py b/doubleml/irm/tests/test_irm_scalar_exceptions.py
index df0aab60..59dc91a1 100644
--- a/doubleml/irm/tests/test_irm_scalar_exceptions.py
+++ b/doubleml/irm/tests/test_irm_scalar_exceptions.py
@@ -4,6 +4,7 @@
 from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
 from sklearn.linear_model import LinearRegression
 
+import doubleml as dml
 from doubleml.irm.datasets import make_irm_data
 from doubleml.irm.irm_scalar import IRM
 from doubleml.plm.datasets import make_plr_CCDDHNR2018
@@ -11,6 +12,26 @@
 np.random.seed(3141)
 obj_dml_data = make_irm_data(theta=0.5, n_obs=100, dim_x=10, return_type="DoubleMLData")
 
+# Binary-outcome data for binary predictions check tests
+np.random.seed(42)
+_n = 200
+_X = np.random.normal(size=(_n, 3))
+_d_bin = (np.random.normal(size=_n) > 0).astype(float)
+_y_bin = (np.random.normal(size=_n) > 0).astype(float)
+_df_binary = pd.DataFrame({"y": _y_bin, "d": _d_bin, "X1": _X[:, 0], "X2": _X[:, 1], "X3": _X[:, 2]})
+obj_dml_data_binary = dml.DoubleMLData(_df_binary, y_col="y", d_cols="d", x_cols=["X1", "X2", "X3"])
+
+
+class _HardLabelClassifier(RandomForestClassifier):
+    """Classifier that returns hard 0/1 labels instead of probabilities — for testing only."""
+
+    def predict_proba(self, X):
+        preds = np.zeros((len(X), 2))
+        preds[:, 1] = (np.arange(len(X)) % 2).astype(float)
+        preds[:, 0] = 1.0 - preds[:, 1]
+        return preds
+
+
 ml_g = RandomForestRegressor(n_estimators=10, max_depth=3, random_state=42)
 ml_m = RandomForestClassifier(n_estimators=10, max_depth=3, random_state=42)
 
@@ -131,3 +152,14 @@ def test_irm_scalar_exception_normalize_ipw_type():
     msg = r"Normalization indicator has to be boolean"
     with pytest.raises(TypeError, match=msg):
         IRM(obj_dml_data, normalize_ipw="True")
+
+
+@pytest.mark.ci
+def test_irm_scalar_exception_binary_predictions_g():
+    """Classifier ml_g returning hard labels (0/1) instead of probabilities raises ValueError."""
+    ml_m_test = RandomForestClassifier(n_estimators=5, random_state=42)
+    dml_obj = IRM(obj_dml_data_binary, ml_g=_HardLabelClassifier(), ml_m=ml_m_test)
+    dml_obj.draw_sample_splitting(n_folds=3)
+    msg = r"For the binary variable .+, predictions .+ are also observed to be binary"
+    with pytest.raises(ValueError, match=msg):
+        dml_obj.fit_nuisance_models()
diff --git a/doubleml/plm/plr_scalar.py b/doubleml/plm/plr_scalar.py
index e4aeaa51..1c38af24 100644
--- a/doubleml/plm/plr_scalar.py
+++ b/doubleml/plm/plr_scalar.py
@@ -12,6 +12,7 @@
 
 from ..data.base_data import DoubleMLData
 from ..double_ml_linear_score import LinearScoreMixin
+from ..utils._checks import _check_binary_predictions, _check_finite_predictions, _check_is_propensity
 from ..utils._learner import LearnerSpec, predict_nuisance
 
 
@@ -73,6 +74,8 @@ def __init__(
         valid_scores = ["partialling out", "IV-type"]
         if score not in valid_scores:
             raise ValueError(f"Invalid score '{score}'. Valid scores: {valid_scores}.")
+        if score == "IV-type" and obj_dml_data.binary_outcome:
+            raise ValueError("For score = 'IV-type', additive probability models (binary outcomes) are not supported.")
 
         super().__init__(
             obj_dml_data=obj_dml_data,
@@ -126,6 +129,14 @@ def set_learners(
                 continue
             self._register_learner(name, learner)
 
+        # Warn when a classifier is used for ml_l with a binary outcome
+        if ml_l is not None and "ml_l" in self._learners:
+            if self._learners["ml_l"].is_classifier and self._dml_data.binary_outcome:
+                warnings.warn(
+                    f"The ml_l learner {str(ml_l)} was identified as classifier. " "Fitting an additive probability model.",
+                    UserWarning,
+                )
+
         # IV-type: clone ml_l to ml_g if only one provided
         self._handle_iv_cloning()
         self._reset_fit_state()
@@ -173,6 +184,35 @@ def _check_data(obj_dml_data):
                 "To fit a partially linear IV regression model use DoubleMLPLIV instead of DoubleMLPLR."
             )
 
+    def _post_nuisance_checks(self) -> None:
+        """Check predictions for validity after cross-fitting completes."""
+        for i_rep in range(self.n_rep):
+            # After full K-fold cross-fitting, all observations are test observations
+            # in exactly one fold, so the full prediction array is populated.
+
+            # Skip checks for learners with external predictions (not registered in _learners)
+            if "ml_l" in self._learners:
+                _check_finite_predictions(self._predictions["ml_l"][:, i_rep], self._learners["ml_l"].learner, "ml_l")
+            if "ml_m" in self._learners:
+                _check_finite_predictions(self._predictions["ml_m"][:, i_rep], self._learners["ml_m"].learner, "ml_m")
+
+                # Propensity score range check when ml_m is a classifier
+                if self._learners["ml_m"].is_classifier:
+                    _check_is_propensity(
+                        self._predictions["ml_m"][:, i_rep],
+                        self._learners["ml_m"].learner,
+                        "ml_m",
+                    )
+
+                # Binary predictions check for binary treatment
+                if self._dml_data.binary_treats.all():
+                    _check_binary_predictions(
+                        self._predictions["ml_m"][:, i_rep],
+                        self._learners["ml_m"].learner,
+                        "ml_m",
+                        self._dml_data.d_cols[0],
+                    )
+
     def _nuisance_est(
         self,
         train_idx: np.ndarray,
diff --git a/doubleml/plm/tests/test_plr_scalar_exceptions.py b/doubleml/plm/tests/test_plr_scalar_exceptions.py
index fb1ba7a9..d49d1902 100644
--- a/doubleml/plm/tests/test_plr_scalar_exceptions.py
+++ b/doubleml/plm/tests/test_plr_scalar_exceptions.py
@@ -1,7 +1,7 @@
 import numpy as np
 import pandas as pd
 import pytest
-from sklearn.linear_model import Lasso
+from sklearn.linear_model import Lasso, LogisticRegression
 
 import doubleml as dml
 from doubleml.plm.datasets import make_plr_CCDDHNR2018
@@ -10,6 +10,15 @@
 np.random.seed(3141)
 obj_dml_data = make_plr_CCDDHNR2018(n_obs=100, dim_x=10, alpha=0.5)
 
+# Binary-outcome data for binary-specific tests
+np.random.seed(42)
+_n = 100
+_X = np.random.normal(size=(_n, 3))
+_d = (np.random.normal(size=_n) > 0).astype(float)
+_y_bin = (np.random.normal(size=_n) > 0).astype(float)
+_df_binary = pd.DataFrame({"y": _y_bin, "d": _d, "X1": _X[:, 0], "X2": _X[:, 1], "X3": _X[:, 2]})
+obj_dml_data_binary = dml.DoubleMLData(_df_binary, y_col="y", d_cols="d", x_cols=["X1", "X2", "X3"])
+
 # Create data with instruments for IV check
 df = obj_dml_data.data.copy()
 x_cols = [c for c in df.columns if c.startswith("X")]
@@ -110,3 +119,20 @@ def test_plr_scalar_exception_invalid_learner():
     msg = r"Invalid learner provided for ml_l: provide an instance"
     with pytest.raises(TypeError, match=msg):
         dml_obj.set_learners(ml_l=Lasso)  # class instead of instance
+
+
+@pytest.mark.ci
+def test_plr_scalar_exception_iv_type_binary_outcome():
+    """IV-type score with binary outcome raises ValueError."""
+    msg = r"For score = 'IV-type', additive probability models \(binary outcomes\) are not supported\."
+    with pytest.raises(ValueError, match=msg):
+        PLR(obj_dml_data_binary, score="IV-type")
+
+
+@pytest.mark.ci
+def test_plr_scalar_warning_binary_outcome_classifier():
+    """Classifier ml_l with binary outcome warns about fitting an additive probability model."""
+    dml_obj = PLR(obj_dml_data_binary)
+    msg = r"The ml_l learner .+ was identified as classifier\. Fitting an additive probability model\."
+    with pytest.warns(UserWarning, match=msg):
+        dml_obj.set_learners(ml_l=LogisticRegression(), ml_m=Lasso())
diff --git a/doubleml/utils/_checks.py b/doubleml/utils/_checks.py
index 7db749dc..ad493e28 100644
--- a/doubleml/utils/_checks.py
+++ b/doubleml/utils/_checks.py
@@ -109,9 +109,13 @@ def _check_smpl_split_tpl(tpl, n_obs, check_intersect=False):
     return train_index, test_index
 
 
-def _check_finite_predictions(preds, learner, learner_name, smpls):
-    test_indices = np.concatenate([test_index for _, test_index in smpls])
-    if not np.all(np.isfinite(preds[test_indices])):
+def _check_finite_predictions(preds, learner, learner_name, smpls=None):
+    if smpls is not None:
+        indices = np.concatenate([test_index for _, test_index in smpls])
+        check_preds = preds[indices]
+    else:
+        check_preds = preds
+    if not np.all(np.isfinite(check_preds)):
         raise ValueError(f"Predictions from learner {str(learner)} for {learner_name} are not finite.")
     return
 
@@ -189,9 +193,12 @@ def _check_contains_iv(obj_dml_data):
     return
 
 
-def _check_is_propensity(preds, learner, learner_name, smpls, eps=1e-12):
-    test_indices = np.concatenate([test_index for _, test_index in smpls])
-    if any((preds[test_indices] < eps) | (preds[test_indices] > 1 - eps)):
+def _check_is_propensity(preds, learner, learner_name, smpls=None, eps=1e-12):
+    if smpls is not None:
+        check_preds = preds[np.concatenate([test_index for _, test_index in smpls])]
+    else:
+        check_preds = preds
+    if any((check_preds < eps) | (check_preds > 1 - eps)):
         warnings.warn(
             f"Propensity predictions from learner {str(learner)} for {learner_name} are close to zero or one (eps={eps})."
         )

From 35434bb0fc5cec60ddc2369244ed4c6ad4ef20c7 Mon Sep 17 00:00:00 2001
From: SvenKlaassen <sven.klaassen@uni-hamburg.de>
Date: Sat, 28 Feb 2026 08:32:48 +0100
Subject: [PATCH 17/38] add guideline for using absolute imports from project
 root

---
 .claude/rules/py-code-conventions.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.claude/rules/py-code-conventions.md b/.claude/rules/py-code-conventions.md
index 4344a3a0..ae8b022d 100644
--- a/.claude/rules/py-code-conventions.md
+++ b/.claude/rules/py-code-conventions.md
@@ -48,6 +48,8 @@ from doubleml.utils._checks import _check_learner
 
 Import order (enforced by ruff/isort): standard library, third-party, local.
 
+Use absolute imports from the project root (`doubleml.`) rather than relative imports (`..utils._checks`).
+
 ### `from __future__ import annotations`
 
 Not required in every file. Use it when a class references its own type in annotations (forward reference). Since the project targets Python 3.10+, `list[int]`, `dict[str, T]`, and `X | Y` unions work natively without it.

From d195fffea2345b94fbb2541de3ba4db34ce36a96 Mon Sep 17 00:00:00 2001
From: SvenKlaassen <sven.klaassen@uni-hamburg.de>
Date: Sat, 28 Feb 2026 13:11:37 +0100
Subject: [PATCH 18/38] add guidelines for tuning tests and required fixtures
 for scalar models

---
 .claude/agents/py-general-reviewer.md | 59 +++++++++++++++++++++++++++
 .claude/rules/testing-conventions.md  | 52 ++++++++++++++++++++++-
 2 files changed, 110 insertions(+), 1 deletion(-)
 create mode 100644 .claude/agents/py-general-reviewer.md

diff --git a/.claude/agents/py-general-reviewer.md b/.claude/agents/py-general-reviewer.md
new file mode 100644
index 00000000..3f3b2a70
--- /dev/null
+++ b/.claude/agents/py-general-reviewer.md
@@ -0,0 +1,59 @@
+---
+name: py-general-reviewer
+description: Professional Python code reviewer focusing on logic, performance, and best practices. Uses a debate-driven approach to minimize false positives.
+tools: Read, Grep, Glob, Bash
+model: inherit
+---
+
+Review Python code changes for functional correctness and industry-standard best practices. Report issues only — never edit source files.
+
+## Workflow
+
+1. **Identify Changes**: Run `git diff --name-only HEAD~1` to identify changed `.py` files.
+2. **Read**: Read the content of each modified file.
+3. **Internal Debate**: For each file, simulate a dialogue:
+   - **@Auditor**: Finds potential bugs, edge cases, and "code smells."
+   - **@Author**: Defends the implementation (e.g., "This is a temporary shim" or "Performance requires this complexity").
+   - **@Resolution**: Agree on the final list of actionable improvements.
+4. **Output**: Use the "Final Review" format specified below.
+
+## Review Checklist
+
+### 🔴 Critical (Bug Risk / Logic)
+- **Edge Cases**: Unhandled `None` values, empty lists, or `0` divisors.
+- **Resource Leaks**: Files or network sockets opened without `with` blocks.
+- **Mutable Defaults**: Using `list` or `dict` as default arguments in functions.
+- **Concurrency**: Thread-safety issues or race conditions in shared state.
+- **Logic Errors**: Off-by-one errors or incorrect boolean logic in complex conditionals.
+
+### 🟡 Warning (Best Practices / Clean Code)
+- **Complexity**: Functions longer than 50 lines or nesting deeper than 3 levels.
+- **DRY (Don't Repeat Yourself)**: Significant logic duplication that should be a helper function.
+- **Error Handling**: Using "bare" `except:` blocks instead of specific exceptions.
+- **Type Hinting**: Public APIs missing type annotations for parameters or return values.
+- **Hardcoding**: URLs, credentials, or magic numbers that should be constants/config.
+
+### 🟢 Suggestion (Style / Optimization)
+- **Vectorization**: Using Python-level loops over elements where vectorized NumPy or Pandas operations would be significantly faster.
+- **Built-ins**: Re-implementing logic that exists in `itertools`, `collections`, or `pathlib`.
+- **Docstrings**: Missing or outdated descriptions of function intent.
+
+## Output Format
+
+```markdown
+## Final Review: `<filename>`
+
+### ⚖️ The Debate Summary
+[1-2 sentences on what was debated between the Auditor and Author.]
+
+### 🚫 Resolved Issues (Blocking)
+- **line N**: [issue]. **Fix**: `<concrete_code_fix>`
+
+### ⚠️ Resolved Warnings
+- **line N**: [issue]. **Consider**: `<suggestion>`
+
+### ✅ Dismissed (False Positives)
+- **line N**: [Original concern] -> [Reason for dismissal]
+
+### Summary
+[Final assessment: e.g., "3 issues found (1 critical, 2 warnings)"]
diff --git a/.claude/rules/testing-conventions.md b/.claude/rules/testing-conventions.md
index 0508ae42..e48b5fa8 100644
--- a/.claude/rules/testing-conventions.md
+++ b/.claude/rules/testing-conventions.md
@@ -88,9 +88,58 @@ with pytest.raises(ValueError, match=msg):
 - **Small data for speed**: `n_obs=200`, `n_folds=3` for return type / exception tests
 - **Larger data for accuracy**: `n_obs=500`, `n_folds=5` for estimation tests
 
+## Tuning Tests (`test_<model>_scalar_tune_ml_models.py`)
+
+Scalar models with `tune_ml_models()` require a dedicated test file. Add it alongside the 5 standard scalar test files.
+
+### Fixtures and Shared Constants
+
+```python
+# Matches resolve_optuna_cv(cv=5) used internally — required for improvement assertions
+_TUNE_CV = KFold(n_splits=5, shuffle=True, random_state=42)
+
+@pytest.fixture(scope="module")
+def <model>_data():
+    np.random.seed(3141)
+    return make_<model>_data(n_obs=500, dim_x=5)
+
+@pytest.fixture(scope="module", params=["score_a", "score_b"])
+def score(request):
+    return request.param
+```
+
+### Required Tests
+
+| Test | Checks |
+|------|--------|
+| `test_<model>_scalar_tune_basic` | Return type `dict[str, DMLOptunaResult]`; correct keys; `tuned=True`; params applied to learners; `model.fit()` succeeds. Parametrize over `score` + `_SAMPLER_CASES`. |
+| `test_<model>_scalar_tune_improves_score` | `tune_res[name].best_score > cross_val_score(default_tree, ..., cv=_TUNE_CV, scoring="neg_root_mean_squared_error").mean()` |
+| `test_<model>_scalar_tune_returns_self` | `return_tune_res=False` returns `self` |
+| `test_<model>_scalar_tune_set_as_params_false` | Learner params unchanged; `best_params` still populated |
+| `test_<model>_scalar_tune_invalid_key` | Unknown key raises `ValueError` |
+| `test_<model>_scalar_tune_partial_space` | Tuning only a subset leaves unspecified learners unchanged |
+
+For models with `_LEARNER_PARAM_ALIASES` (e.g., IRM `"ml_g"` → `["ml_g0", "ml_g1"]`), add:
+
+| Test | Checks |
+|------|--------|
+| `test_<model>_scalar_tune_<alias>_alias` | Alias expands to concrete keys in result dict (not the alias key itself) |
+| `test_<model>_scalar_tune_<alias>_alias_explicit_override` | Explicit concrete key overrides alias; verify by constraining the tuned range |
+
+### Scalar vs. Old API
+
+With `return_tune_res=True`, `DoubleMLScalar.tune_ml_models()` returns `dict[str, DMLOptunaResult]` **directly** — no repetition index (with the default `return_tune_res=False` it returns `self`). The old `DoubleML` API wraps results in a list (`tune_res[0]["ml_l"]`) because tuning runs per repetition. Scalar tuning uses the full dataset once, so the list dimension doesn't exist.
+
+```python
+# Scalar (new):   tune_res["ml_l"].best_params
+# Old DoubleML:   tune_res[0]["ml_l"].best_params
+```
+
+---
+
 ## Naming
 
-- Files: `test_<model>.py`, `test_<model>_scalar.py`, `test_<model>_scalar_exceptions.py`
+- Files: `test_<model>.py`, `test_<model>_scalar.py`, `test_<model>_scalar_exceptions.py`, `test_<model>_scalar_tune_ml_models.py`
 - Functions: `test_<what>` — e.g., `test_coef_within_3_sigma`, `test_exception_invalid_score`
 - Docstrings: Every test function gets a one-line docstring explaining what it verifies
 
@@ -102,3 +151,4 @@ with pytest.raises(ValueError, match=msg):
 - [ ] Seeds set for reproducibility
 - [ ] Test functions have descriptive names and docstrings
 - [ ] New scalar models have all 5 required test files (see `dml-scalar-test-structure.md`)
+- [ ] If model has `tune_ml_models()`, add `test_<model>_scalar_tune_ml_models.py` with all required tuning tests

From 15216f0f5fb43b0c72c83a45c9c8b1e6c825df0a Mon Sep 17 00:00:00 2001
From: SvenKlaassen <sven.klaassen@uni-hamburg.de>
Date: Sat, 28 Feb 2026 13:38:10 +0100
Subject: [PATCH 19/38] Enhance DoubleMLScalar with improved tuning
 functionality and tests

---
 doubleml/double_ml_scalar.py                  | 264 ++++++++++++++++-
 doubleml/irm/irm_scalar.py                    |  53 +++-
 .../tests/test_irm_scalar_tune_ml_models.py   | 275 ++++++++++++++++++
 doubleml/plm/plr_scalar.py                    |  83 +++++-
 .../tests/test_plr_scalar_tune_ml_models.py   | 259 +++++++++++++++++
 .../test_scalar_tune_optuna_exceptions.py     | 217 ++++++++++++++
 doubleml/tests/test_scalar_tune_pruning.py    | 120 ++++++++
 doubleml/utils/_tune_optuna.py                |  49 ++--
 8 files changed, 1294 insertions(+), 26 deletions(-)
 create mode 100644 doubleml/irm/tests/test_irm_scalar_tune_ml_models.py
 create mode 100644 doubleml/plm/tests/test_plr_scalar_tune_ml_models.py
 create mode 100644 doubleml/tests/test_scalar_tune_optuna_exceptions.py
 create mode 100644 doubleml/tests/test_scalar_tune_pruning.py

diff --git a/doubleml/double_ml_scalar.py b/doubleml/double_ml_scalar.py
index 958e4b3e..bd39dc9e 100644
--- a/doubleml/double_ml_scalar.py
+++ b/doubleml/double_ml_scalar.py
@@ -3,7 +3,10 @@
 """
 
 from abc import ABC, abstractmethod
-from typing import ClassVar, Self
+from typing import TYPE_CHECKING, Any, Callable, ClassVar, Self
+
+if TYPE_CHECKING:
+    from .utils._tune_optuna import DMLOptunaResult
 
 import numpy as np
 
@@ -13,6 +16,7 @@
 from .double_ml_framework import DoubleMLFramework
 from .utils._checks import _check_sample_splitting
 from .utils._learner import LearnerInfo, LearnerSpec, validate_learner
+from .utils._tune_optuna import OPTUNA_GLOBAL_SETTING_KEYS, _dml_tune_optuna, resolve_optuna_cv
 from .utils.resampling import DoubleMLClusterResampling, DoubleMLResampling
 
 
@@ -48,6 +52,11 @@ class DoubleMLScalar(DoubleMLBase, ABC):
     # Subclasses define all possible learners for the model
     _LEARNER_SPECS: ClassVar[dict[str, LearnerSpec]]
 
+    # Shorthand aliases for tune_ml_models(): maps user-facing key → list of internal learner keys.
+    # Example: {"ml_g": ["ml_g0", "ml_g1"]} lets users write ml_g once to tune both.
+    # Subclasses override as needed; default is no aliases.
+    _LEARNER_PARAM_ALIASES: ClassVar[dict[str, list[str]]] = {}
+
     def __init__(
         self,
         obj_dml_data: DoubleMLBaseData,
@@ -212,15 +221,20 @@ def smpls_cluster(self) -> list | None:
     @abstractmethod
     def required_learners(self) -> list[str]:
         """
-        Names of the required learners for current configuration.
+        Names of the required learners for the current configuration.
 
         Subclasses implement this as a property that returns the learner names
         needed based on the current score function or model configuration.
 
+        The order of this list determines the tuning order in
+        :meth:`tune_ml_models`. Learners that depend on earlier results (e.g.,
+        PLR ``ml_g`` depends on ``ml_l`` and ``ml_m`` for its 2-stage target)
+        must appear later in the list.
+
         Returns
         -------
         list of str
-            List of required learner names.
+            Ordered list of required learner names.
         """
         pass
 
@@ -835,6 +849,250 @@ def _est_causal_pars_and_se(self, psi_elements: dict[str, np.ndarray]) -> None:
         """
         pass
 
+    # ==================== Hyperparameter Tuning ====================
+
+    def tune_ml_models(
+        self,
+        ml_param_space: dict[str, Callable | None],
+        scoring_methods: dict[str, str | Callable | None] | None = None,
+        cv: int = 5,
+        optuna_settings: dict | None = None,
+        set_as_params: bool = True,
+        return_tune_res: bool = False,
+    ) -> "Self | dict[str, DMLOptunaResult]":  # quoted because DMLOptunaResult is TYPE_CHECKING-only
+        """
+        Tune hyperparameters for all nuisance learners using Optuna.
+
+        Parameters
+        ----------
+        ml_param_space : dict
+            Parameter space functions keyed by learner name (or alias).
+            Each value must be a callable taking an Optuna trial and returning a dict, or ``None`` to skip that learner.
+            Alias keys (e.g. ``'ml_g'`` for IRM, expanding to ``'ml_g0'`` and ``'ml_g1'``)
+            are supported; explicit learner keys always override alias-derived entries.
+        scoring_methods : dict or None, optional
+            Scoring functions keyed by concrete learner name. If ``None``, the
+            estimator's default score method is used. Default is ``None``.
+        cv : int or cross-validation splitter, optional
+            Number of folds, or a scikit-learn cross-validation splitter, for Optuna tuning. Default is ``5``.
+        optuna_settings : dict or None, optional
+            Global or per-learner Optuna settings (e.g., ``n_trials``, ``sampler``).
+            Default is ``None``.
+        set_as_params : bool, optional
+            If ``True``, apply the best found parameters to the registered learner
+            objects so they are used in subsequent calls to :meth:`fit`. Default is ``True``.
+        return_tune_res : bool, optional
+            If ``True``, return a dict of :class:`~doubleml.utils._tune_optuna.DMLOptunaResult`
+            objects keyed by learner name. Default is ``False``.
+
+        Notes
+        -----
+        Learners are tuned in the order defined by :attr:`required_learners`.
+        For multi-stage learners (e.g., PLR ``ml_g`` with ``score='IV-type'``),
+        earlier learner results are passed to :meth:`_get_tuning_data` via
+        ``partial_results``. If a preceding learner was not included in
+        ``ml_param_space``, its current (untuned) parameters are used as the
+        fallback when computing the intermediate target.
+
+        Returns
+        -------
+        self : Self
+            Returned when ``return_tune_res=False``.
+        tune_res : dict
+            Dict of :class:`~doubleml.utils._tune_optuna.DMLOptunaResult` objects keyed by
+            learner name. Returned when ``return_tune_res=True``.
+        """
+        if not isinstance(set_as_params, bool):
+            raise TypeError(f"set_as_params must be True or False. Got {str(set_as_params)}.")
+        if not isinstance(return_tune_res, bool):
+            raise TypeError(f"return_tune_res must be True or False. Got {str(return_tune_res)}.")
+        if isinstance(cv, list):
+            raise TypeError(
+                "cv as a list of pre-made (train_idx, test_idx) pairs is not supported in tune_ml_models(). "
+                "Pass an integer (number of folds) or a scikit-learn cross-validation splitter instead."
+            )
+
+        # Expand aliases and validate keys (also checks None, callability)
+        expanded_space = self._expand_tuning_param_space(ml_param_space)
+
+        self._validate_optuna_setting_keys(optuna_settings)
+
+        # Resolve cv once; all learners share the same splitter
+        cv_splitter = resolve_optuna_cv(cv)
+
+        partial_results: dict[str, Any] = {}
+        for learner_name in self.required_learners:
+            # Skip learners not in the expanded param space or set to None
+            if learner_name not in expanded_space or expanded_space[learner_name] is None:
+                continue
+            # Skip learners not yet registered via set_learners()
+            if learner_name not in self._learners:
+                continue
+
+            y_tune, x_tune = self._get_tuning_data(learner_name, partial_results, cv_splitter)
+
+            scoring = None if scoring_methods is None else scoring_methods.get(learner_name)
+
+            result = _dml_tune_optuna(
+                y=y_tune,
+                x=x_tune,
+                learner=self._learners[learner_name].learner,
+                param_grid_func=expanded_space[learner_name],
+                scoring_method=scoring,
+                cv=cv_splitter,
+                optuna_settings=optuna_settings,
+                learner_name=learner_name,
+                params_name=learner_name,
+            )
+            partial_results[learner_name] = result
+
+            if set_as_params and result.tuned:
+                self._learners[learner_name].learner.set_params(**result.best_params)
+
+        if return_tune_res:
+            return partial_results
+        return self
+
+    def _expand_tuning_param_space(self, ml_param_space: dict[str, Callable | None]) -> dict[str, Callable | None]:
+        """
+        Expand alias keys in ml_param_space to concrete learner keys.
+
+        Uses a two-pass strategy so explicit keys always override alias-derived
+        entries, regardless of insertion order:
+
+        - Pass 1: expand alias keys with ``setdefault`` (if aliases share a target, the first one listed wins)
+        - Pass 2: assign explicit learner keys directly (always overrides alias-derived entries)
+
+        Parameters
+        ----------
+        ml_param_space : dict
+            Parameter space dict, may contain alias keys (e.g. ``'ml_g'`` for IRM).
+
+        Returns
+        -------
+        dict
+            Expanded dict with only concrete learner keys.
+
+        Raises
+        ------
+        ValueError
+            If ``ml_param_space`` is empty, or if a key is neither a valid
+            alias nor a defined learner name.
+        TypeError
+            If ``ml_param_space`` is not a dict, or if a non-None value is not callable.
+        """
+        if not isinstance(ml_param_space, dict):
+            raise TypeError(f"ml_param_space must be a dict. Got {type(ml_param_space).__name__}.")
+        if not ml_param_space:
+            raise ValueError("ml_param_space must be a non-empty dictionary.")
+
+        valid_keys = set(self._LEARNER_SPECS.keys()) | set(self._LEARNER_PARAM_ALIASES.keys())
+        for key in ml_param_space:
+            if key not in valid_keys:
+                raise ValueError(f"Invalid key '{key}' in ml_param_space. " f"Valid keys: {sorted(valid_keys)}.")
+
+        # Validate callability of non-None parameter space functions
+        for key, fn in ml_param_space.items():
+            if fn is not None and not callable(fn):
+                raise TypeError(
+                    f"Parameter space for '{key}' must be a callable function that takes a trial "
+                    f"and returns a dict. Got {type(fn).__name__}. "
+                    f"Example: def ml_params(trial): return {{'max_depth': trial.suggest_int('max_depth', 1, 10)}}"
+                )
+
+        expanded: dict[str, Callable | None] = {}
+        # Pass 1: expand alias keys (setdefault: first alias listed wins if targets overlap)
+        for key, fn in ml_param_space.items():
+            if key in self._LEARNER_PARAM_ALIASES:
+                for alias_target in self._LEARNER_PARAM_ALIASES[key]:
+                    expanded.setdefault(alias_target, fn)
+
+        # Pass 2: explicit learner keys always override alias-derived entries
+        for key, fn in ml_param_space.items():
+            if key not in self._LEARNER_PARAM_ALIASES:
+                expanded[key] = fn
+
+        return expanded
+
+    def _validate_optuna_setting_keys(self, optuna_settings: dict | None) -> None:
+        """
+        Validate learner-level keys provided in ``optuna_settings``.
+
+        Parameters
+        ----------
+        optuna_settings : dict or None
+            Optuna settings dict to validate.
+
+        Raises
+        ------
+        TypeError
+            If ``optuna_settings`` is not a dict or None, or if a learner-specific
+            value is not a dict.
+        ValueError
+            If a key is not a global Optuna setting and not a valid learner name or alias.
+        """
+        if optuna_settings is not None and not isinstance(optuna_settings, dict):
+            raise TypeError(f"optuna_settings must be a dict or None. Got {str(type(optuna_settings))}.")
+
+        if not optuna_settings:  # None or empty dict — no settings to validate
+            return
+
+        allowed_learner_keys = set(self._LEARNER_SPECS.keys()) | set(self._LEARNER_PARAM_ALIASES.keys())
+        invalid_keys = [
+            key for key in optuna_settings if key not in OPTUNA_GLOBAL_SETTING_KEYS and key not in allowed_learner_keys
+        ]
+
+        if invalid_keys:
+            valid_keys_msg = ", ".join(sorted(allowed_learner_keys)) if allowed_learner_keys else "<none>"
+            raise ValueError(
+                f"Invalid optuna_settings keys for {self.__class__.__name__}: "
+                f"{', '.join(sorted(invalid_keys))}. "
+                f"Valid learner-specific keys are: {valid_keys_msg}."
+            )
+
+        for key in allowed_learner_keys:
+            if key in optuna_settings and not isinstance(optuna_settings[key], dict):
+                raise TypeError(f"Optuna settings for '{key}' must be a dict.")
+
+    def _get_tuning_data(
+        self,
+        learner_name: str,
+        partial_results: dict[str, Any],
+        cv: Any,
+    ) -> tuple[np.ndarray, np.ndarray]:
+        """
+        Return ``(y_target, x)`` arrays for tuning the given learner.
+
+        Subclasses must override this method to return the appropriate data for each
+        learner. The ``partial_results`` argument enables multi-stage tuning (e.g., PLR
+        ``ml_g`` which depends on earlier ``ml_l`` and ``ml_m`` results).
+
+        Parameters
+        ----------
+        learner_name : str
+            Name of the learner to tune.
+        partial_results : dict
+            Already-computed :class:`~doubleml.utils._tune_optuna.DMLOptunaResult`
+            objects, keyed by learner name.
+        cv : cross-validator
+            Cross-validation splitter, already resolved by :meth:`tune_ml_models`.
+
+        Returns
+        -------
+        y_target : np.ndarray
+            Target array for the learner.
+        x : np.ndarray
+            Feature matrix.
+
+        Raises
+        ------
+        NotImplementedError
+            Always; subclasses must override this method.
+        """
+        raise NotImplementedError(
+            f"_get_tuning_data not implemented for {self.__class__.__name__}. " "Subclasses must override this method."
+        )
+
     def __str__(self) -> str:
         """
         String representation of the DoubleMLScalar object.
diff --git a/doubleml/irm/irm_scalar.py b/doubleml/irm/irm_scalar.py
index 57f4be91..f6983368 100644
--- a/doubleml/irm/irm_scalar.py
+++ b/doubleml/irm/irm_scalar.py
@@ -4,7 +4,7 @@
 
 from __future__ import annotations
 
-from typing import ClassVar, Self
+from typing import Any, ClassVar, Self
 
 import numpy as np
 from sklearn.base import clone
@@ -83,6 +83,12 @@ class IRM(LinearScoreMixin):
         "ml_m": LearnerSpec("ml_m", allow_regressor=False, allow_classifier=True),
     }
 
+    # ml_g is a shorthand for tuning both ml_g0 and ml_g1 with the same param function.
+    # Explicit ml_g0 or ml_g1 keys always override the alias.
+    _LEARNER_PARAM_ALIASES: ClassVar[dict[str, list[str]]] = {
+        "ml_g": ["ml_g0", "ml_g1"],
+    }
+
     def __init__(
         self,
         obj_dml_data: DoubleMLData,
@@ -362,6 +368,51 @@ def _check_data(obj_dml_data: object) -> None:
                 "needs to be specified as treatment variable."
             )
 
+    def _get_tuning_data(
+        self,
+        learner_name: str,
+        _partial_results: dict[str, Any],
+        _cv: Any,
+    ) -> tuple[np.ndarray, np.ndarray]:
+        """
+        Return ``(y_target, x)`` for tuning the given IRM learner.
+
+        Parameters
+        ----------
+        learner_name : str
+            Learner to tune: ``'ml_g0'``, ``'ml_g1'``, or ``'ml_m'``.
+        _partial_results : dict
+            Already-tuned DMLOptunaResult objects (unused for IRM).
+        _cv : cross-validator
+            Cross-validation splitter (unused for IRM).
+
+        Returns
+        -------
+        y_target : np.ndarray
+            Target array for the learner.
+        x : np.ndarray
+            Feature matrix.
+
+        Raises
+        ------
+        ValueError
+            If ``learner_name`` is not a valid IRM learner name.
+        """
+        y = self._dml_data.y
+        d = self._dml_data.d
+        x = self._dml_data.x
+
+        if learner_name == "ml_g0":
+            mask = d == 0
+            return y[mask], x[mask]
+        if learner_name == "ml_g1":
+            mask = d == 1
+            return y[mask], x[mask]
+        if learner_name == "ml_m":
+            return d, x
+
+        raise ValueError(f"Unknown learner '{learner_name}' for IRM.")
+
     def _initialize_weights(self, weights: np.ndarray | dict | None) -> None:
         """Initialize weights storage."""
         if weights is None:
diff --git a/doubleml/irm/tests/test_irm_scalar_tune_ml_models.py b/doubleml/irm/tests/test_irm_scalar_tune_ml_models.py
new file mode 100644
index 00000000..a11629b4
--- /dev/null
+++ b/doubleml/irm/tests/test_irm_scalar_tune_ml_models.py
@@ -0,0 +1,275 @@
+"""Tests for IRM scalar hyperparameter tuning via tune_ml_models()."""
+
+import numpy as np
+import pytest
+from sklearn.base import clone
+from sklearn.model_selection import KFold, cross_val_score
+from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
+
+from doubleml.irm.datasets import make_irm_data
+from doubleml.irm.irm_scalar import IRM
+from doubleml.tests._utils_tune_optuna import (
+    _SAMPLER_CASES,
+    _assert_tree_params,
+    _basic_optuna_settings,
+    _small_tree_params,
+)
+from doubleml.utils._tune_optuna import DMLOptunaResult
+
+# CV splitter matching tune_ml_models() default (cv=5)
+_TUNE_CV = KFold(n_splits=5, shuffle=True, random_state=42)
+
+
+@pytest.fixture(scope="module")
+def irm_data():
+    """IRM data fixture shared across all tests in this module."""
+    np.random.seed(3142)
+    return make_irm_data(n_obs=500, dim_x=5)
+
+
+@pytest.fixture(scope="module", params=["ATE", "ATTE"])
+def score(request):
+    """Score function variants for IRM."""
+    return request.param
+
+
+@pytest.mark.ci
+@pytest.mark.parametrize("sampler_name,optuna_sampler", _SAMPLER_CASES, ids=[c[0] for c in _SAMPLER_CASES])
+def test_irm_scalar_tune_basic(irm_data, score, sampler_name, optuna_sampler):
+    """tune_ml_models() returns DMLOptunaResult with valid tree params and applies them to learners."""
+    ml_g = DecisionTreeRegressor(random_state=321)
+    ml_m = DecisionTreeClassifier(random_state=654)
+
+    model = IRM(irm_data, score=score)
+    model.set_learners(ml_g=ml_g, ml_m=ml_m)
+
+    tune_res = model.tune_ml_models(
+        ml_param_space={"ml_g0": _small_tree_params, "ml_g1": _small_tree_params, "ml_m": _small_tree_params},
+        optuna_settings=_basic_optuna_settings({"sampler": optuna_sampler}),
+        return_tune_res=True,
+    )
+
+    # Return type and keys
+    assert isinstance(tune_res, dict)
+    assert set(tune_res.keys()) == {"ml_g0", "ml_g1", "ml_m"}
+
+    # Each result is a DMLOptunaResult with valid tree params
+    for key in ("ml_g0", "ml_g1"):
+        assert isinstance(tune_res[key], DMLOptunaResult)
+        assert tune_res[key].tuned is True
+        _assert_tree_params(tune_res[key].best_params)
+
+    assert isinstance(tune_res["ml_m"], DMLOptunaResult)
+    assert tune_res["ml_m"].tuned is True
+    _assert_tree_params(tune_res["ml_m"].best_params)
+
+    # Best params are applied to the registered learner objects
+    assert model.get_params("ml_g0")["max_depth"] == tune_res["ml_g0"].best_params["max_depth"]
+    assert model.get_params("ml_g1")["max_depth"] == tune_res["ml_g1"].best_params["max_depth"]
+    assert model.get_params("ml_m")["max_depth"] == tune_res["ml_m"].best_params["max_depth"]
+
+    # Model fits successfully after tuning
+    model.fit(n_folds=3)
+    assert np.isfinite(model.coef).all()
+
+
+@pytest.mark.ci
+def test_irm_scalar_tune_improves_score(irm_data, score):
+    """Tuning default (overfitting) trees improves cross-validated neg_rmse for ml_g0 and ml_g1."""
+    x, y, d = irm_data.x, irm_data.y, irm_data.d
+
+    ml_g = DecisionTreeRegressor(random_state=321)
+    ml_m = DecisionTreeClassifier(random_state=654)
+
+    # Baseline: default trees overfit on training folds → very negative neg_rmse
+    mask0, mask1 = d == 0, d == 1
+    baseline_g0 = cross_val_score(clone(ml_g), x[mask0], y[mask0], cv=_TUNE_CV, scoring="neg_root_mean_squared_error").mean()
+    baseline_g1 = cross_val_score(clone(ml_g), x[mask1], y[mask1], cv=_TUNE_CV, scoring="neg_root_mean_squared_error").mean()
+
+    model = IRM(irm_data, score=score)
+    model.set_learners(ml_g=ml_g, ml_m=ml_m)
+
+    tune_res = model.tune_ml_models(
+        ml_param_space={"ml_g0": _small_tree_params, "ml_g1": _small_tree_params, "ml_m": _small_tree_params},
+        optuna_settings=_basic_optuna_settings(),
+        return_tune_res=True,
+    )
+
+    # Optuna best_score should exceed baseline (less overfitting)
+    assert tune_res["ml_g0"].best_score > baseline_g0
+    assert tune_res["ml_g1"].best_score > baseline_g1
+
+
+@pytest.mark.ci
+def test_irm_scalar_tune_ml_g_alias(irm_data):
+    """ml_g alias expands to both ml_g0 and ml_g1; result keys are the concrete learner names."""
+    model = IRM(irm_data)
+    model.set_learners(ml_g=DecisionTreeRegressor(random_state=1), ml_m=DecisionTreeClassifier(random_state=2))
+
+    tune_res = model.tune_ml_models(
+        ml_param_space={"ml_g": _small_tree_params, "ml_m": _small_tree_params},
+        optuna_settings=_basic_optuna_settings(),
+        return_tune_res=True,
+    )
+
+    # Alias expands: result has ml_g0, ml_g1 (not ml_g)
+    assert set(tune_res.keys()) == {"ml_g0", "ml_g1", "ml_m"}
+    _assert_tree_params(tune_res["ml_g0"].best_params)
+    _assert_tree_params(tune_res["ml_g1"].best_params)
+    _assert_tree_params(tune_res["ml_m"].best_params)
+
+    # Model fits after tuning
+    model.fit(n_folds=3)
+    assert np.isfinite(model.coef).all()
+
+
+@pytest.mark.ci
+def test_irm_scalar_tune_ml_g_alias_explicit_override(irm_data):
+    """Explicit ml_g0 key overrides the ml_g alias; ml_g1 still gets the alias function."""
+
+    def specific_g0_params(trial):
+        """Restricts max_depth to 1-3 to distinguish from _small_tree_params (1-20)."""
+        return {
+            "max_depth": trial.suggest_int("max_depth", 1, 3),
+            "min_samples_split": trial.suggest_int("min_samples_split", 2, 20),
+            "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
+        }
+
+    model = IRM(irm_data)
+    model.set_learners(ml_g=DecisionTreeRegressor(random_state=1), ml_m=DecisionTreeClassifier(random_state=2))
+
+    tune_res = model.tune_ml_models(
+        ml_param_space={"ml_g": _small_tree_params, "ml_g0": specific_g0_params, "ml_m": _small_tree_params},
+        optuna_settings=_basic_optuna_settings(),
+        return_tune_res=True,
+    )
+
+    assert set(tune_res.keys()) == {"ml_g0", "ml_g1", "ml_m"}
+    # ml_g0 used specific_g0_params: max_depth constrained to [1, 3]
+    assert tune_res["ml_g0"].best_params["max_depth"] <= 3
+    # ml_g1 used _small_tree_params: all three keys present, max_depth up to 20
+    _assert_tree_params(tune_res["ml_g1"].best_params)
+
+
+@pytest.mark.ci
+def test_irm_scalar_tune_returns_self(irm_data):
+    """tune_ml_models() with return_tune_res=False returns self."""
+    model = IRM(irm_data)
+    model.set_learners(ml_g=DecisionTreeRegressor(random_state=1), ml_m=DecisionTreeClassifier(random_state=2))
+
+    result = model.tune_ml_models(
+        ml_param_space={"ml_g": _small_tree_params, "ml_m": _small_tree_params},
+        optuna_settings=_basic_optuna_settings(),
+    )
+
+    assert result is model
+
+
+@pytest.mark.ci
+def test_irm_scalar_tune_set_as_params_false(irm_data):
+    """tune_ml_models(set_as_params=False) finds best params but does not apply them to learners."""
+    model = IRM(irm_data)
+    model.set_learners(
+        ml_g=DecisionTreeRegressor(max_depth=1, random_state=1),
+        ml_m=DecisionTreeClassifier(max_depth=1, random_state=2),
+    )
+
+    tune_res = model.tune_ml_models(
+        ml_param_space={"ml_g": _small_tree_params, "ml_m": _small_tree_params},
+        optuna_settings=_basic_optuna_settings(),
+        set_as_params=False,
+        return_tune_res=True,
+    )
+
+    # Learner params are unchanged
+    assert model.get_params("ml_g0")["max_depth"] == 1
+    assert model.get_params("ml_g1")["max_depth"] == 1
+    assert model.get_params("ml_m")["max_depth"] == 1
+    # But tune_res still has valid best params
+    _assert_tree_params(tune_res["ml_g0"].best_params)
+    _assert_tree_params(tune_res["ml_g1"].best_params)
+    _assert_tree_params(tune_res["ml_m"].best_params)
+
+
+@pytest.mark.ci
+def test_irm_scalar_tune_invalid_key(irm_data):
+    """_expand_tuning_param_space() raises ValueError for unknown keys."""
+    model = IRM(irm_data)
+    model.set_learners(ml_g=DecisionTreeRegressor(random_state=1), ml_m=DecisionTreeClassifier(random_state=2))
+
+    with pytest.raises(ValueError, match="Invalid key 'ml_z' in ml_param_space"):
+        model.tune_ml_models(
+            ml_param_space={"ml_z": _small_tree_params},
+            optuna_settings=_basic_optuna_settings(),
+        )
+
+
+@pytest.mark.ci
+def test_irm_scalar_tune_partial_space(irm_data):
+    """Tuning only a subset of learners leaves unspecified learners unchanged."""
+    model = IRM(irm_data)
+    model.set_learners(
+        ml_g=DecisionTreeRegressor(max_depth=5, random_state=1),
+        ml_m=DecisionTreeClassifier(max_depth=5, random_state=2),
+    )
+
+    tune_res = model.tune_ml_models(
+        ml_param_space={"ml_g0": _small_tree_params},  # only ml_g0
+        optuna_settings=_basic_optuna_settings(),
+        return_tune_res=True,
+    )
+
+    # Only ml_g0 was tuned
+    assert set(tune_res.keys()) == {"ml_g0"}
+    _assert_tree_params(tune_res["ml_g0"].best_params)
+    # ml_g1 and ml_m max_depth are unchanged
+    assert model.get_params("ml_g1")["max_depth"] == 5
+    assert model.get_params("ml_m")["max_depth"] == 5
+
+
+@pytest.fixture(
+    scope="module",
+    params=["int", "kfold_splitter"],
+    ids=["cv=int", "cv=KFold"],
+)
+def cv_variant(request):
+    """Different cv argument types accepted by tune_ml_models(): int and splitter."""
+    if request.param == "int":
+        return 3
+    return KFold(n_splits=3, shuffle=True, random_state=7)
+
+
+@pytest.mark.ci
+def test_irm_scalar_tune_cv_types(irm_data, cv_variant):
+    """tune_ml_models() succeeds for supported cv argument types: int and splitter."""
+    model = IRM(irm_data)
+    model.set_learners(ml_g=DecisionTreeRegressor(random_state=1), ml_m=DecisionTreeClassifier(random_state=2))
+
+    tune_res = model.tune_ml_models(
+        ml_param_space={"ml_g": _small_tree_params, "ml_m": _small_tree_params},
+        cv=cv_variant,
+        optuna_settings=_basic_optuna_settings(),
+        return_tune_res=True,
+    )
+
+    for name in ("ml_g0", "ml_g1", "ml_m"):
+        assert name in tune_res
+        assert tune_res[name].tuned is True
+        assert isinstance(tune_res[name].best_params, dict)
+        assert np.isfinite(tune_res[name].best_score)
+
+
+@pytest.mark.ci
+def test_irm_scalar_tune_cv_list_raises(irm_data):
+    """tune_ml_models() raises TypeError when cv is a list of pre-made split pairs."""
+    model = IRM(irm_data)
+    model.set_learners(ml_g=DecisionTreeRegressor(random_state=1), ml_m=DecisionTreeClassifier(random_state=2))
+    cv_list = list(KFold(n_splits=3).split(np.arange(irm_data.n_obs)))
+
+    msg = r"cv as a list of pre-made \(train_idx, test_idx\) pairs is not supported"
+    with pytest.raises(TypeError, match=msg):
+        model.tune_ml_models(
+            ml_param_space={"ml_g": _small_tree_params, "ml_m": _small_tree_params},
+            cv=cv_list,
+            optuna_settings=_basic_optuna_settings(),
+        )
diff --git a/doubleml/plm/plr_scalar.py b/doubleml/plm/plr_scalar.py
index 1c38af24..caf290de 100644
--- a/doubleml/plm/plr_scalar.py
+++ b/doubleml/plm/plr_scalar.py
@@ -5,10 +5,11 @@
 from __future__ import annotations
 
 import warnings
-from typing import Dict, List, Optional, Self
+from typing import Any, ClassVar, Dict, List, Optional, Self
 
 import numpy as np
 from sklearn.base import clone
+from sklearn.model_selection import cross_val_predict
 
 from ..data.base_data import DoubleMLData
 from ..double_ml_linear_score import LinearScoreMixin
@@ -37,7 +38,7 @@ class PLR(LinearScoreMixin):
     """
 
     # Define learner specifications for PLR
-    _LEARNER_SPECS: Dict[str, LearnerSpec] = {
+    _LEARNER_SPECS: ClassVar[Dict[str, LearnerSpec]] = {
         "ml_l": LearnerSpec("ml_l", allow_regressor=True, allow_classifier=True, binary_data_check="outcome"),
         "ml_m": LearnerSpec("ml_m", allow_regressor=True, allow_classifier=True, binary_data_check="treatment"),
         "ml_g": LearnerSpec("ml_g", allow_regressor=True, allow_classifier=False),
@@ -277,6 +278,84 @@ def _nuisance_est(
                 ml_g.fit(x[train_j], y[train_j] - theta_initial * d[train_j])
                 self._predictions["ml_g"][test_j, i_rep] = predict_nuisance(ml_g, x[test_j], ml_g_info.is_classifier)
 
+    def _get_tuning_data(
+        self,
+        learner_name: str,
+        partial_results: dict[str, Any],
+        cv: Any,
+    ) -> tuple[np.ndarray, np.ndarray]:
+        """
+        Return ``(y_target, x)`` for tuning the given PLR learner.
+
+        Parameters
+        ----------
+        learner_name : str
+            Learner to tune: ``'ml_l'``, ``'ml_m'``, or ``'ml_g'``.
+        partial_results : dict
+            Already-tuned DMLOptunaResult objects, keyed by learner name.
+            Used for 2-stage ``ml_g`` tuning: applies the best params from
+            ``ml_l`` and ``ml_m`` when computing the initial theta estimate.
+            If ``ml_l`` or ``ml_m`` were not tuned in this call, their current
+            (untuned) learner params are used as a fallback.
+        cv : cross-validator
+            Cross-validation splitter, already resolved in :meth:`tune_ml_models`.
+
+        Returns
+        -------
+        y_target : np.ndarray
+            Target array for the learner.
+        x : np.ndarray
+            Feature matrix.
+
+        Raises
+        ------
+        ValueError
+            If ``learner_name`` is not a valid PLR learner name.
+        """
+        y = self._dml_data.y
+        d = self._dml_data.d
+        x = self._dml_data.x
+
+        if learner_name == "ml_l":
+            return y, x
+        if learner_name == "ml_m":
+            return d, x
+        if learner_name == "ml_g":
+            # 2-stage: compute initial theta via cross-validated ml_l/ml_m predictions.
+            # Apply tuned params if available, otherwise use the current learner params.
+            if "ml_l" not in self._learners or "ml_m" not in self._learners:
+                raise ValueError(
+                    "Tuning 'ml_g' requires 'ml_l' and 'ml_m' to be registered. "
+                    "Call set_learners(ml_l=..., ml_m=...) before tuning 'ml_g'."
+                )
+            l_info = self._learners["ml_l"]
+            m_info = self._learners["ml_m"]
+
+            l_est = clone(l_info.learner)
+            if "ml_l" in partial_results:
+                l_est.set_params(**partial_results["ml_l"].best_params)
+
+            m_est = clone(m_info.learner)
+            if "ml_m" in partial_results:
+                m_est.set_params(**partial_results["ml_m"].best_params)
+
+            if l_info.is_classifier:
+                l_hat = cross_val_predict(l_est, x, y, cv=cv, method="predict_proba")[:, 1]
+            else:
+                l_hat = cross_val_predict(l_est, x, y, cv=cv)
+
+            if m_info.is_classifier:
+                m_hat = cross_val_predict(m_est, x, d, cv=cv, method="predict_proba")[:, 1]
+            else:
+                m_hat = cross_val_predict(m_est, x, d, cv=cv)
+
+            psi_a = -((d - m_hat) ** 2)
+            psi_b = (d - m_hat) * (y - l_hat)
+            theta_initial = -np.nanmean(psi_b) / np.nanmean(psi_a)
+            return y - theta_initial * d, x
+
+        raise ValueError(f"Unknown learner '{learner_name}' for PLR.")
+
     def _get_score_elements(self) -> Dict[str, np.ndarray]:
         y = self._dml_data.y
         d = self._dml_data.d
diff --git a/doubleml/plm/tests/test_plr_scalar_tune_ml_models.py b/doubleml/plm/tests/test_plr_scalar_tune_ml_models.py
new file mode 100644
index 00000000..fb231cea
--- /dev/null
+++ b/doubleml/plm/tests/test_plr_scalar_tune_ml_models.py
@@ -0,0 +1,259 @@
+"""Tests for PLR scalar hyperparameter tuning via tune_ml_models()."""
+
+import numpy as np
+import pytest
+from sklearn.base import clone
+from sklearn.model_selection import KFold, cross_val_predict, cross_val_score
+from sklearn.tree import DecisionTreeRegressor
+
+from doubleml.plm.datasets import make_plr_CCDDHNR2018
+from doubleml.plm.plr_scalar import PLR
+from doubleml.tests._utils_tune_optuna import (
+    _SAMPLER_CASES,
+    _assert_tree_params,
+    _basic_optuna_settings,
+    _small_tree_params,
+)
+from doubleml.utils._tune_optuna import DMLOptunaResult
+
+# CV splitter matching tune_ml_models() default (cv=5)
+_TUNE_CV = KFold(n_splits=5, shuffle=True, random_state=42)
+
+
+@pytest.fixture(scope="module")
+def plr_data():
+    """PLR data fixture shared across all tests in this module."""
+    np.random.seed(3141)
+    return make_plr_CCDDHNR2018(n_obs=500, dim_x=5, alpha=0.5)
+
+
+@pytest.fixture(scope="module", params=["partialling out", "IV-type"])
+def score(request):
+    """Score function variants for PLR."""
+    return request.param
+
+
+@pytest.mark.ci
+@pytest.mark.parametrize("sampler_name,optuna_sampler", _SAMPLER_CASES, ids=[c[0] for c in _SAMPLER_CASES])
+def test_plr_scalar_tune_basic(plr_data, score, sampler_name, optuna_sampler):
+    """tune_ml_models() returns DMLOptunaResult with valid tree params and applies them to learners."""
+    ml_l = DecisionTreeRegressor(random_state=123)
+    ml_m = DecisionTreeRegressor(random_state=456)
+
+    model = PLR(plr_data, score=score)
+    model.set_learners(ml_l=ml_l, ml_m=ml_m)
+    if score == "IV-type":
+        model.set_learners(ml_g=DecisionTreeRegressor(random_state=789))
+
+    param_space = {"ml_l": _small_tree_params, "ml_m": _small_tree_params}
+    if score == "IV-type":
+        param_space["ml_g"] = _small_tree_params
+
+    tune_res = model.tune_ml_models(
+        ml_param_space=param_space,
+        optuna_settings=_basic_optuna_settings({"sampler": optuna_sampler}),
+        return_tune_res=True,
+    )
+
+    # Return type and keys
+    assert isinstance(tune_res, dict)
+    expected_keys = {"ml_l", "ml_m"}
+    if score == "IV-type":
+        expected_keys.add("ml_g")
+    assert set(tune_res.keys()) == expected_keys
+
+    # Each result is a DMLOptunaResult with valid tree params
+    for key in tune_res:
+        assert isinstance(tune_res[key], DMLOptunaResult)
+        assert tune_res[key].tuned is True
+        _assert_tree_params(tune_res[key].best_params)
+
+    # Best params are applied to the registered learner objects
+    assert model.get_params("ml_l")["max_depth"] == tune_res["ml_l"].best_params["max_depth"]
+    assert model.get_params("ml_m")["max_depth"] == tune_res["ml_m"].best_params["max_depth"]
+    if score == "IV-type":
+        assert model.get_params("ml_g")["max_depth"] == tune_res["ml_g"].best_params["max_depth"]
+
+    # Model fits successfully after tuning
+    model.fit(n_folds=3)
+    assert np.isfinite(model.coef).all()
+
+
+@pytest.mark.ci
+def test_plr_scalar_tune_improves_score(plr_data, score):
+    """Tuning a default (overfitting) tree improves cross-validated neg_rmse."""
+    x, y, d = plr_data.x, plr_data.y, plr_data.d
+
+    ml_l = DecisionTreeRegressor(random_state=123)
+    ml_m = DecisionTreeRegressor(random_state=456)
+
+    # Baseline: default trees overfit on training folds → high test RMSE → very negative neg_rmse
+    baseline_l = cross_val_score(clone(ml_l), x, y, cv=_TUNE_CV, scoring="neg_root_mean_squared_error").mean()
+    baseline_m = cross_val_score(clone(ml_m), x, d, cv=_TUNE_CV, scoring="neg_root_mean_squared_error").mean()
+
+    model = PLR(plr_data, score=score)
+    model.set_learners(ml_l=ml_l, ml_m=ml_m)
+    if score == "IV-type":
+        model.set_learners(ml_g=DecisionTreeRegressor(random_state=789))
+
+    param_space = {"ml_l": _small_tree_params, "ml_m": _small_tree_params}
+    if score == "IV-type":
+        param_space["ml_g"] = _small_tree_params
+
+    tune_res = model.tune_ml_models(
+        ml_param_space=param_space,
+        optuna_settings=_basic_optuna_settings(),
+        return_tune_res=True,
+    )
+
+    # Optuna best_score (neg_root_mean_squared_error) should exceed baseline (less overfitting)
+    assert tune_res["ml_l"].best_score > baseline_l
+    assert tune_res["ml_m"].best_score > baseline_m
+
+    if score == "IV-type":
+        # Approximate _get_tuning_data's 2-stage ml_g target (y - theta_initial * d) using _TUNE_CV folds.
+        # NOTE(review): _get_tuning_data sets tuned ml_l/ml_m best_params when available; clones here may differ.
+        ml_g = DecisionTreeRegressor(random_state=789)
+        l_hat = cross_val_predict(clone(ml_l), x, y, cv=_TUNE_CV)
+        m_hat = cross_val_predict(clone(ml_m), x, d, cv=_TUNE_CV)
+        psi_a = -((d - m_hat) ** 2)
+        psi_b = (d - m_hat) * (y - l_hat)
+        theta_initial = -np.nanmean(psi_b) / np.nanmean(psi_a)
+        y_g = y - theta_initial * d
+        baseline_g = cross_val_score(clone(ml_g), x, y_g, cv=_TUNE_CV, scoring="neg_root_mean_squared_error").mean()
+        assert tune_res["ml_g"].best_score > baseline_g
+
+
+@pytest.mark.ci
+def test_plr_scalar_tune_returns_self(plr_data):
+    """tune_ml_models() with return_tune_res=False returns self."""
+    model = PLR(plr_data)
+    model.set_learners(ml_l=DecisionTreeRegressor(random_state=1), ml_m=DecisionTreeRegressor(random_state=2))
+
+    result = model.tune_ml_models(
+        ml_param_space={"ml_l": _small_tree_params, "ml_m": _small_tree_params},
+        optuna_settings=_basic_optuna_settings(),
+    )
+
+    assert result is model
+
+
+@pytest.mark.ci
+def test_plr_scalar_tune_set_as_params_false(plr_data):
+    """tune_ml_models(set_as_params=False) finds best params but does not apply them to learners."""
+    model = PLR(plr_data)
+    model.set_learners(
+        ml_l=DecisionTreeRegressor(max_depth=1, random_state=1),
+        ml_m=DecisionTreeRegressor(max_depth=1, random_state=2),
+    )
+
+    tune_res = model.tune_ml_models(
+        ml_param_space={"ml_l": _small_tree_params, "ml_m": _small_tree_params},
+        optuna_settings=_basic_optuna_settings(),
+        set_as_params=False,
+        return_tune_res=True,
+    )
+
+    # Learner params are unchanged
+    assert model.get_params("ml_l")["max_depth"] == 1
+    assert model.get_params("ml_m")["max_depth"] == 1
+    # But tune_res still has valid best params
+    _assert_tree_params(tune_res["ml_l"].best_params)
+    _assert_tree_params(tune_res["ml_m"].best_params)
+
+
+@pytest.mark.ci
+def test_plr_scalar_tune_invalid_key(plr_data):
+    """_expand_tuning_param_space() raises ValueError for unknown keys."""
+    model = PLR(plr_data)
+    model.set_learners(ml_l=DecisionTreeRegressor(), ml_m=DecisionTreeRegressor())
+
+    with pytest.raises(ValueError, match="Invalid key 'ml_z' in ml_param_space"):
+        model.tune_ml_models(
+            ml_param_space={"ml_z": _small_tree_params},
+            optuna_settings=_basic_optuna_settings(),
+        )
+
+
+@pytest.fixture(
+    scope="module",
+    params=["int", "kfold_splitter"],
+    ids=["cv=int", "cv=KFold"],
+)
+def cv_variant(request):
+    """Different cv argument types accepted by tune_ml_models(): int and splitter."""
+    if request.param == "int":
+        return 3
+    return KFold(n_splits=3, shuffle=True, random_state=7)
+
+
+@pytest.mark.ci
+def test_plr_scalar_tune_cv_types(plr_data, cv_variant):
+    """tune_ml_models() succeeds for supported cv argument types: int and splitter."""
+    model = PLR(plr_data)
+    model.set_learners(ml_l=DecisionTreeRegressor(random_state=1), ml_m=DecisionTreeRegressor(random_state=2))
+
+    tune_res = model.tune_ml_models(
+        ml_param_space={"ml_l": _small_tree_params, "ml_m": _small_tree_params},
+        cv=cv_variant,
+        optuna_settings=_basic_optuna_settings(),
+        return_tune_res=True,
+    )
+
+    for name in ("ml_l", "ml_m"):
+        assert name in tune_res
+        assert tune_res[name].tuned is True
+        assert isinstance(tune_res[name].best_params, dict)
+        assert np.isfinite(tune_res[name].best_score)
+
+
+@pytest.mark.ci
+def test_plr_scalar_tune_cv_list_raises(plr_data):
+    """tune_ml_models() raises TypeError when cv is a list of pre-made split pairs."""
+    model = PLR(plr_data)
+    model.set_learners(ml_l=DecisionTreeRegressor(random_state=1), ml_m=DecisionTreeRegressor(random_state=2))
+    cv_list = list(KFold(n_splits=3).split(np.arange(plr_data.n_obs)))
+
+    msg = r"cv as a list of pre-made \(train_idx, test_idx\) pairs is not supported"
+    with pytest.raises(TypeError, match=msg):
+        model.tune_ml_models(
+            ml_param_space={"ml_l": _small_tree_params, "ml_m": _small_tree_params},
+            cv=cv_list,
+            optuna_settings=_basic_optuna_settings(),
+        )
+
+
+@pytest.mark.ci
+def test_plr_scalar_tune_partial_space(plr_data):
+    """Tuning only a subset of learners leaves unspecified learners unchanged."""
+    model = PLR(plr_data)
+    model.set_learners(
+        ml_l=DecisionTreeRegressor(max_depth=5, random_state=1),
+        ml_m=DecisionTreeRegressor(max_depth=5, random_state=2),
+    )
+
+    tune_res = model.tune_ml_models(
+        ml_param_space={"ml_l": _small_tree_params},  # only ml_l
+        optuna_settings=_basic_optuna_settings(),
+        return_tune_res=True,
+    )
+
+    # Only ml_l was tuned
+    assert set(tune_res.keys()) == {"ml_l"}
+    _assert_tree_params(tune_res["ml_l"].best_params)
+    # ml_m max_depth is unchanged
+    assert model.get_params("ml_m")["max_depth"] == 5
+
+
+@pytest.mark.ci
+def test_plr_scalar_tune_ml_g_missing_ml_l_ml_m(plr_data):
+    """Tuning ml_g without ml_l and ml_m registered raises ValueError."""
+    model = PLR(plr_data, score="IV-type")
+    model.set_learners(ml_g=DecisionTreeRegressor(random_state=1))
+
+    msg = r"Tuning 'ml_g' requires 'ml_l' and 'ml_m' to be registered\."
+    with pytest.raises(ValueError, match=msg):
+        model.tune_ml_models(
+            ml_param_space={"ml_g": _small_tree_params},
+            optuna_settings=_basic_optuna_settings(),
+        )
diff --git a/doubleml/tests/test_scalar_tune_optuna_exceptions.py b/doubleml/tests/test_scalar_tune_optuna_exceptions.py
new file mode 100644
index 00000000..78e8c7b0
--- /dev/null
+++ b/doubleml/tests/test_scalar_tune_optuna_exceptions.py
@@ -0,0 +1,217 @@
+"""Tests for DoubleMLScalar.tune_ml_models() input validation and error handling."""
+
+import re
+
+import numpy as np
+import pytest
+from sklearn.model_selection import KFold
+from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
+
+from doubleml.irm.datasets import make_irm_data
+from doubleml.irm.irm_scalar import IRM
+from doubleml.plm.datasets import make_plr_CCDDHNR2018
+from doubleml.plm.plr_scalar import PLR
+from doubleml.tests._utils_tune_optuna import _basic_optuna_settings, _small_tree_params
+
+# ── Shared fixtures ────────────────────────────────────────────────────────────
+
+np.random.seed(42)
+_plr_data = make_plr_CCDDHNR2018(n_obs=100, dim_x=5)
+_irm_data = make_irm_data(n_obs=100, dim_x=5)
+
+
+@pytest.fixture(scope="module")
+def plr_model():
+    """Unfitted PLR scalar model with learners registered, reused across exception tests."""
+    model = PLR(_plr_data)
+    model.set_learners(
+        ml_l=DecisionTreeRegressor(random_state=1),
+        ml_m=DecisionTreeRegressor(random_state=2),
+    )
+    return model
+
+
+@pytest.fixture(scope="module")
+def irm_model():
+    """Unfitted IRM scalar model with learners registered, reused across exception tests."""
+    model = IRM(_irm_data)
+    model.set_learners(
+        ml_g=DecisionTreeRegressor(random_state=1),
+        ml_m=DecisionTreeClassifier(random_state=2),
+    )
+    return model
+
+
+# ── ml_param_space validation ──────────────────────────────────────────────────
+
+
+@pytest.mark.ci
+@pytest.mark.parametrize(
+    "ml_param_space, exc, msg",
+    [
+        (None, TypeError, "ml_param_space must be a dict. Got NoneType."),
+        ({}, ValueError, "ml_param_space must be a non-empty dictionary."),
+        (
+            {"ml_l": "not-callable"},
+            TypeError,
+            "Parameter space for 'ml_l' must be a callable function that takes a trial and returns a dict. Got str.",
+        ),
+    ],
+)
+def test_scalar_tune_invalid_param_space(plr_model, ml_param_space, exc, msg):
+    """tune_ml_models() raises on None, empty, or non-callable ml_param_space."""
+    with pytest.raises(exc, match=re.escape(msg)):
+        plr_model.tune_ml_models(ml_param_space, optuna_settings=_basic_optuna_settings())
+
+
+@pytest.mark.ci
+@pytest.mark.parametrize(
+    "bad_key, model_name",
+    [
+        ("ml_z", "PLR"),
+        ("ml_g0", "PLR"),
+    ],
+)
+def test_scalar_tune_invalid_param_space_key_plr(plr_model, bad_key, model_name):
+    """_expand_tuning_param_space() raises ValueError for keys not valid for PLR."""
+    with pytest.raises(ValueError, match=re.escape(f"Invalid key '{bad_key}' in ml_param_space")):
+        plr_model.tune_ml_models(
+            {bad_key: _small_tree_params},
+            optuna_settings=_basic_optuna_settings(),
+        )
+
+
+@pytest.mark.ci
+@pytest.mark.parametrize("bad_key", ["ml_l", "ml_z"])
+def test_scalar_tune_invalid_param_space_key_irm(irm_model, bad_key):
+    """_expand_tuning_param_space() raises ValueError for keys not valid for IRM."""
+    with pytest.raises(ValueError, match=re.escape(f"Invalid key '{bad_key}' in ml_param_space")):
+        irm_model.tune_ml_models(
+            {bad_key: _small_tree_params},
+            optuna_settings=_basic_optuna_settings(),
+        )
+
+
+# ── Boolean flag validation ────────────────────────────────────────────────────
+
+
+@pytest.mark.ci
+@pytest.mark.parametrize("set_as_params", ["invalid", None, 1])
+def test_scalar_tune_invalid_set_as_params(plr_model, set_as_params):
+    """tune_ml_models() raises TypeError for non-bool set_as_params."""
+    msg = re.escape(f"set_as_params must be True or False. Got {str(set_as_params)}.")
+    with pytest.raises(TypeError, match=msg):
+        plr_model.tune_ml_models(
+            {"ml_l": _small_tree_params},
+            set_as_params=set_as_params,
+            optuna_settings=_basic_optuna_settings(),
+        )
+
+
+@pytest.mark.ci
+@pytest.mark.parametrize("return_tune_res", ["invalid", None, 1])
+def test_scalar_tune_invalid_return_tune_res(plr_model, return_tune_res):
+    """tune_ml_models() raises TypeError for non-bool return_tune_res."""
+    msg = re.escape(f"return_tune_res must be True or False. Got {str(return_tune_res)}.")
+    with pytest.raises(TypeError, match=msg):
+        plr_model.tune_ml_models(
+            {"ml_l": _small_tree_params},
+            return_tune_res=return_tune_res,
+            optuna_settings=_basic_optuna_settings(),
+        )
+
+
+# ── optuna_settings validation ─────────────────────────────────────────────────
+
+
+@pytest.mark.ci
+@pytest.mark.parametrize(
+    "optuna_settings, exc, msg",
+    [
+        ("invalid", TypeError, "optuna_settings must be a dict or None. Got <class 'str'>."),
+        (
+            {"ml_g0": {"n_trials": 2}},
+            ValueError,
+            "Invalid optuna_settings keys for PLR: ml_g0. Valid learner-specific keys are:",
+        ),
+        ({"ml_l": "not-a-dict"}, TypeError, "Optuna settings for 'ml_l' must be a dict."),
+    ],
+)
+def test_scalar_tune_invalid_optuna_settings_plr(plr_model, optuna_settings, exc, msg):
+    """tune_ml_models() raises on non-dict, invalid learner key, or non-dict learner settings for PLR."""
+    with pytest.raises(exc, match=re.escape(msg)):
+        plr_model.tune_ml_models({"ml_l": _small_tree_params}, optuna_settings=optuna_settings)
+
+
+@pytest.mark.ci
+@pytest.mark.parametrize(
+    "invalid_key",
+    ["ml_l", "ml_z"],
+)
+def test_scalar_tune_invalid_optuna_settings_key_irm(irm_model, invalid_key):
+    """tune_ml_models() raises ValueError for optuna_settings keys not valid for IRM (message matched literally)."""
+    with pytest.raises(ValueError, match=re.escape(f"Invalid optuna_settings keys for IRM: {invalid_key}")):
+        irm_model.tune_ml_models(
+            {"ml_g": _small_tree_params, "ml_m": _small_tree_params},
+            optuna_settings={invalid_key: {"n_trials": 2}},
+        )
+
+
+# ── cv validation (delegated to resolve_optuna_cv) ────────────────────────────
+
+
+@pytest.mark.ci
+@pytest.mark.parametrize(
+    "cv, exc, msg",
+    [
+        ("invalid", TypeError, "cv must not be provided as a string."),
+        (1, ValueError, "The number of folds used for tuning must be at least two. 1 was passed."),
+    ],
+)
+def test_scalar_tune_invalid_cv(plr_model, cv, exc, msg):
+    """tune_ml_models() raises for string cv or cv < 2."""
+    with pytest.raises(exc, match=re.escape(msg)):
+        plr_model.tune_ml_models(
+            {"ml_l": _small_tree_params},
+            cv=cv,
+            optuna_settings=_basic_optuna_settings(),
+        )
+
+
+@pytest.mark.ci
+def test_scalar_tune_non_iterable_cv(plr_model):
+    """tune_ml_models() raises TypeError for a non-iterable cv object."""
+
+    class NonIterableCV:
+        pass
+
+    msg = (
+        "cv must be an integer >= 2, a scikit-learn cross-validation splitter, "
+        "or an iterable of (train_indices, test_indices) pairs."
+    )
+    with pytest.raises(TypeError, match=re.escape(msg)):
+        plr_model.tune_ml_models(
+            {"ml_l": _small_tree_params},
+            cv=NonIterableCV(),
+            optuna_settings=_basic_optuna_settings(),
+        )
+
+
+# ── cv variants (positive behavior) ───────────────────────────────────────────
+
+
+@pytest.mark.ci
+def test_scalar_tune_cv_variants(plr_model):
+    """tune_ml_models() accepts integer and KFold splitter as cv."""
+    param_space = {"ml_l": _small_tree_params, "ml_m": _small_tree_params}
+    settings = _basic_optuna_settings()
+
+    # integer cv
+    result = plr_model.tune_ml_models(param_space, cv=3, optuna_settings=settings, return_tune_res=True)
+    assert "ml_l" in result
+
+    # KFold splitter
+    result = plr_model.tune_ml_models(
+        param_space, cv=KFold(n_splits=3, shuffle=True, random_state=0), optuna_settings=settings, return_tune_res=True
+    )
+    assert "ml_l" in result
diff --git a/doubleml/tests/test_scalar_tune_pruning.py b/doubleml/tests/test_scalar_tune_pruning.py
new file mode 100644
index 00000000..75459139
--- /dev/null
+++ b/doubleml/tests/test_scalar_tune_pruning.py
@@ -0,0 +1,120 @@
+"""Tests for per-fold pruning support in DoubleMLScalar.tune_ml_models()."""
+
+import numpy as np
+import optuna
+import pytest
+from sklearn.tree import DecisionTreeRegressor
+
+from doubleml.plm.datasets import make_plr_CCDDHNR2018
+from doubleml.plm.plr_scalar import PLR
+from doubleml.tests._utils_tune_optuna import _small_tree_params
+
+# ── Shared fixtures ────────────────────────────────────────────────────────────
+
+np.random.seed(42)
+_plr_data = make_plr_CCDDHNR2018(n_obs=100, dim_x=5)
+
+
+@pytest.fixture(scope="module")
+def plr_model():
+    """PLR scalar model for reuse across pruning tests."""
+    model = PLR(_plr_data)
+    model.set_learners(
+        ml_l=DecisionTreeRegressor(random_state=1),
+        ml_m=DecisionTreeRegressor(random_state=2),
+    )
+    return model
+
+
+# ── Pruning tests ──────────────────────────────────────────────────────────────
+
+
+@pytest.mark.ci
+def test_scalar_tune_with_median_pruner(plr_model):
+    """tune_ml_models() completes successfully when MedianPruner is passed via study_kwargs."""
+    param_space = {"ml_l": _small_tree_params, "ml_m": _small_tree_params}
+    settings = {
+        "n_trials": 8,
+        "sampler": optuna.samplers.RandomSampler(seed=3141),
+        "study_kwargs": {"pruner": optuna.pruners.MedianPruner(n_startup_trials=1, n_warmup_steps=0)},
+        "verbosity": optuna.logging.WARNING,
+    }
+
+    result = plr_model.tune_ml_models(param_space, cv=3, optuna_settings=settings, return_tune_res=True)
+
+    for name in ("ml_l", "ml_m"):
+        assert name in result
+        assert result[name].tuned is True
+        assert isinstance(result[name].best_params, dict)
+        assert np.isfinite(result[name].best_score)
+        # At least one complete trial must exist (RuntimeError raised otherwise)
+        complete = [t for t in result[name].study.trials if t.state == optuna.trial.TrialState.COMPLETE]
+        assert len(complete) >= 1
+
+
+@pytest.mark.ci
+def test_scalar_tune_pruner_produces_pruned_trials(plr_model):
+    """MedianPruner with n_startup_trials=1 produces at least one pruned trial over enough trials."""
+    param_space = {"ml_l": _small_tree_params}
+    settings = {
+        "n_trials": 20,
+        "sampler": optuna.samplers.RandomSampler(seed=99),
+        "study_kwargs": {"pruner": optuna.pruners.MedianPruner(n_startup_trials=1, n_warmup_steps=0)},
+        "verbosity": optuna.logging.WARNING,
+    }
+
+    result = plr_model.tune_ml_models(param_space, cv=3, optuna_settings=settings, return_tune_res=True)
+
+    study = result["ml_l"].study
+    pruned = [t for t in study.trials if t.state == optuna.trial.TrialState.PRUNED]
+    assert len(pruned) >= 1, "Expected at least one pruned trial with MedianPruner(n_startup_trials=1) over 20 trials"
+
+
+@pytest.mark.ci
+def test_scalar_tune_all_trials_pruned_raises(plr_model):
+    """tune_ml_models() raises RuntimeError when a pruner eliminates all trials."""
+
+    class _AlwaysPruner(optuna.pruners.BasePruner):
+        """Prune every trial unconditionally (even step 0)."""
+
+        def prune(self, study: optuna.Study, trial: optuna.trial.FrozenTrial) -> bool:
+            return True
+
+    param_space = {"ml_l": _small_tree_params}
+    settings = {
+        "n_trials": 3,
+        "study_kwargs": {"pruner": _AlwaysPruner()},
+        "verbosity": optuna.logging.WARNING,
+    }
+
+    with pytest.raises(RuntimeError, match="Optuna optimization failed to produce any complete trials."):
+        plr_model.tune_ml_models(param_space, cv=3, optuna_settings=settings)
+
+
+@pytest.mark.ci
+def test_scalar_tune_pruner_per_learner(plr_model):
+    """Per-learner study_kwargs pruner applies only to that learner; the other learner is unaffected."""
+    param_space = {"ml_l": _small_tree_params, "ml_m": _small_tree_params}
+    settings = {
+        "n_trials": 20,
+        "sampler": optuna.samplers.RandomSampler(seed=3141),
+        "verbosity": optuna.logging.WARNING,
+        # ml_l: aggressive pruner → expect pruned trials
+        "ml_l": {
+            "study_kwargs": {"pruner": optuna.pruners.MedianPruner(n_startup_trials=1, n_warmup_steps=0)},
+        },
+        # ml_m: explicitly disable pruning → zero pruned trials
+        "ml_m": {
+            "study_kwargs": {"pruner": optuna.pruners.NopPruner()},
+        },
+    }
+
+    result = plr_model.tune_ml_models(param_space, cv=3, optuna_settings=settings, return_tune_res=True)
+
+    # ml_l: expect at least one pruned trial due to the per-learner MedianPruner
+    ml_l_pruned = [t for t in result["ml_l"].study.trials if t.state == optuna.trial.TrialState.PRUNED]
+    assert len(ml_l_pruned) >= 1, "Expected ml_l to have pruned trials with a per-learner MedianPruner"
+
+    # ml_m: NopPruner never prunes → no trial may end in the PRUNED state
+    ml_m_pruned = [t for t in result["ml_m"].study.trials if t.state == optuna.trial.TrialState.PRUNED]
+    assert len(ml_m_pruned) == 0, "Expected ml_m to have no pruned trials since NopPruner was configured"
diff --git a/doubleml/utils/_tune_optuna.py b/doubleml/utils/_tune_optuna.py
index 36d8f7e7..f3e2a821 100644
--- a/doubleml/utils/_tune_optuna.py
+++ b/doubleml/utils/_tune_optuna.py
@@ -27,7 +27,8 @@
 import numpy as np
 import optuna
 from sklearn.base import clone, is_classifier, is_regressor
-from sklearn.model_selection import BaseCrossValidator, KFold, cross_val_score
+from sklearn.metrics import check_scoring
+from sklearn.model_selection import BaseCrossValidator, KFold
 
 logger = logging.getLogger(__name__)
 
@@ -400,9 +401,9 @@ def _check_tuning_inputs(
 
     Returns
     -------
-    cross-validator or iterable
-        Cross-validation splitter compatible with
-        :func:`sklearn.model_selection.cross_val_score`.
+    cross-validator or list
+        Cross-validation splitter or pre-made list of ``(train, test)`` index
+        pairs as returned by :func:`resolve_optuna_cv`.
     """
 
     if y.shape[0] != x.shape[0]:
@@ -520,6 +521,10 @@ def _create_objective(param_grid_func, learner, x, y, cv, scoring_method):
     """
     Create an Optuna objective function for hyperparameter optimization.
 
+    Uses a manual fold loop with per-fold intermediate reporting so that
+    Optuna pruners (e.g. ``MedianPruner``, ``HyperbandPruner``) can stop
+    unpromising trials early after each cross-validation fold.
+
     Parameters
     ----------
     param_grid_func : callable
@@ -531,8 +536,10 @@ def _create_objective(param_grid_func, learner, x, y, cv, scoring_method):
         Features (full dataset).
     y : np.ndarray
         Target variable (full dataset).
-    cv : cross-validation generator
-        KFold or similar cross-validation splitter.
+    cv : cross-validation splitter or list of (train, test) pairs
+        A scikit-learn cross-validation splitter (has a ``.split()`` method) or
+        a pre-made list of ``(train_indices, test_indices)`` pairs as returned
+        by :func:`resolve_optuna_cv`.
     scoring_method : str, callable or None
         Scoring argument for cross-validation. ``None`` delegates to the
         estimator's default ``score`` implementation.
@@ -542,6 +549,10 @@ def _create_objective(param_grid_func, learner, x, y, cv, scoring_method):
     callable
         Objective function for Optuna optimization.
     """
+    # Build scorer once; scoring_method is already resolved (non-None) by _resolve_optuna_scoring
+    scorer = check_scoring(clone(learner), scoring=scoring_method)
+    # Pre-compute splits: cv may be a splitter (has .split) or a list of (train, test) pairs
+    splits = cv if isinstance(cv, list) else list(cv.split(x, y))
 
     def objective(trial):
         """Objective function for Optuna optimization."""
@@ -554,21 +565,19 @@ def objective(trial):
                 f"Example: def params(trial): return {{'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1)}}"
             )
 
-        # Clone learner and set parameters
-        estimator = clone(learner).set_params(**params)
-
-        # Perform cross-validation on full dataset
-        scores = cross_val_score(
-            estimator,
-            x,
-            y,
-            cv=cv,
-            scoring=scoring_method,
-            error_score="raise",
-        )
+        # Manual fold loop with per-fold intermediate reporting for pruning support
+        fold_scores = []
+        for step, (train_idx, test_idx) in enumerate(splits):
+            est = clone(learner).set_params(**params)
+            est.fit(x[train_idx], y[train_idx])
+            fold_scores.append(scorer(est, x[test_idx], y[test_idx]))
+
+            # Report running mean after each fold so pruners can act between folds
+            trial.report(float(np.nanmean(fold_scores)), step)
+            if trial.should_prune():
+                raise optuna.TrialPruned()
 
-        # Return mean test score
-        return np.nanmean(scores)
+        return float(np.nanmean(fold_scores))
 
     return objective
 

From b0026dabd59f4880e2e6d931ffda52c05b84b220 Mon Sep 17 00:00:00 2001
From: SvenKlaassen <sven.klaassen@uni-hamburg.de>
Date: Sun, 1 Mar 2026 08:58:52 +0100
Subject: [PATCH 20/38] add nuisance evaluation

---
 doubleml/double_ml_scalar.py                  | 171 +++++++++++
 doubleml/irm/irm_scalar.py                    |  14 +
 .../test_irm_scalar_evaluate_learners.py      | 265 ++++++++++++++++++
 doubleml/plm/plr_scalar.py                    |  39 ++-
 .../test_plr_scalar_evaluate_learners.py      | 251 +++++++++++++++++
 5 files changed, 729 insertions(+), 11 deletions(-)
 create mode 100644 doubleml/irm/tests/test_irm_scalar_evaluate_learners.py
 create mode 100644 doubleml/plm/tests/test_plr_scalar_evaluate_learners.py

diff --git a/doubleml/double_ml_scalar.py b/doubleml/double_ml_scalar.py
index bd39dc9e..f685cce9 100644
--- a/doubleml/double_ml_scalar.py
+++ b/doubleml/double_ml_scalar.py
@@ -9,6 +9,7 @@
     from .utils._tune_optuna import DMLOptunaResult
 
 import numpy as np
+from sklearn.metrics import log_loss, root_mean_squared_error
 
 from .data.base_data import DoubleMLBaseData
 from .double_ml_base import DoubleMLBase
@@ -103,6 +104,8 @@ def __init__(
 
         # Initialize storage for predictions and results
         self._predictions: dict[str, np.ndarray] | None = None
+        self._nuisance_targets: dict[str, np.ndarray] | None = None
+        self._nuisance_loss: dict[str, np.ndarray] | None = None
         self._all_thetas: np.ndarray | None = None
         self._all_ses: np.ndarray | None = None
         self._psi: np.ndarray | None = None
@@ -184,6 +187,51 @@ def predictions(self) -> dict[str, np.ndarray]:
             raise ValueError("Predictions not available. Call fit() first.")
         return self._predictions
 
+    @property
+    def nuisance_targets(self) -> dict[str, np.ndarray]:
+        """
+        Target arrays used for nuisance loss evaluation.
+
+        Returns
+        -------
+        dict[str, np.ndarray]
+            Dictionary with target arrays of shape ``(n_obs, n_rep)`` per learner.
+            Entries are all-NaN for learners whose targets cannot be recovered post-fit
+            (e.g. PLR ``ml_g``).
+
+        Raises
+        ------
+        ValueError
+            If the model has not been fitted yet.
+        """
+        if self._nuisance_targets is None:
+            raise ValueError("Nuisance targets not available. Call fit() or fit_nuisance_models() first.")
+        return self._nuisance_targets
+
+    @property
+    def nuisance_loss(self) -> dict[str, np.ndarray]:
+        """
+        Out-of-sample loss per learner, shape ``(n_rep,)``.
+
+        Uses RMSE for regressors and logloss for classifiers, determined automatically
+        from the registered learner type. Entries are NaN for learners whose targets are
+        unavailable or whose type cannot be determined (external predictions without a
+        registered learner).
+
+        Returns
+        -------
+        dict[str, np.ndarray]
+            Dictionary with loss arrays of shape ``(n_rep,)`` per learner.
+
+        Raises
+        ------
+        ValueError
+            If the model has not been fitted yet.
+        """
+        if self._nuisance_loss is None:
+            raise ValueError("Nuisance loss not available. Call fit() or fit_nuisance_models() first.")
+        return self._nuisance_loss
+
     @property
     def smpls(self) -> list:
         """
@@ -463,6 +511,16 @@ def fit_nuisance_models(
         # Post-nuisance prediction checks (model-specific)
         self._post_nuisance_checks()
 
+        # Build nuisance targets: _get_nuisance_targets() may return None for some learners
+        # (e.g. PLR ml_g whose target y - θ·d varies per rep). Convert None → all-NaN array
+        # so _nuisance_targets is always dict[str, np.ndarray].
+        raw_targets = self._get_nuisance_targets()
+        self._nuisance_targets = {}
+        for name in self.required_learners:
+            t = raw_targets.get(name)
+            self._nuisance_targets[name] = t if isinstance(t, np.ndarray) else np.full((self._n_obs, self.n_rep), np.nan)
+        self._nuisance_loss = self.evaluate_learners()
+
         return self
 
     def estimate_causal_parameters(self) -> Self:
@@ -744,6 +802,8 @@ def _construct_framework(self) -> DoubleMLFramework:
     def _reset_fit_state(self) -> None:
         """Clear fit-dependent state after changing the sample splitting."""
         self._predictions = None
+        self._nuisance_targets = None
+        self._nuisance_loss = None
         self._framework = None
         self._all_thetas = None
         self._all_ses = None
@@ -753,11 +813,122 @@ def _reset_fit_state(self) -> None:
         self._i_rep = None
         self._i_fold = None
 
+    def evaluate_learners(
+        self,
+        learners: list[str] | None = None,
+        metric: Callable | None = None,
+    ) -> dict[str, np.ndarray]:
+        """
+        Evaluate fitted learners on cross-validated predictions with a custom metric.
+
+        Parameters
+        ----------
+        learners : list of str or None, optional
+            Names of learners to evaluate. Default is all :attr:`required_learners`.
+        metric : callable or None, optional
+            Metric function with signature ``(y_true, y_pred) -> float``. Any sklearn
+            metric function (e.g. ``sklearn.metrics.root_mean_squared_error``,
+            ``sklearn.metrics.r2_score``, ``sklearn.metrics.log_loss``) or any custom
+            callable with the same signature can be passed.
+            If ``None``, automatically selects ``root_mean_squared_error`` for regressors
+            and ``log_loss`` for classifiers (inferred from 0/1 targets if no learner is registered).
+
+        Returns
+        -------
+        dict[str, np.ndarray]
+            Dictionary with loss arrays of shape ``(n_rep,)`` per learner.
+            Entries are NaN for repetitions with no valid (non-NaN) targets. Learners
+            without a registered type (external predictions) are still evaluated when
+            ``metric`` is ``None``: ``log_loss`` for 0/1 targets, RMSE otherwise.
+
+        Raises
+        ------
+        ValueError
+            If the model has not been fitted yet, or if a requested learner name is not
+            in :attr:`required_learners`.
+        TypeError
+            If ``metric`` is not callable.
+        ValueError
+            If the metric returns a non-finite value.
+
+        Examples
+        --------
+        >>> from sklearn.metrics import root_mean_squared_error, r2_score, log_loss
+        >>> model.evaluate_learners()
+        >>> model.evaluate_learners(metric=r2_score)
+        >>> model.evaluate_learners(learners=["ml_m"], metric=log_loss)
+        """
+        if self._nuisance_targets is None:
+            raise ValueError("Nuisance targets not available. Call fit() or fit_nuisance_models() first.")
+        if metric is not None and not callable(metric):
+            raise TypeError(f"metric must be callable or None. Got {type(metric).__name__}.")
+
+        if learners is None:
+            learners = self.required_learners
+
+        invalid = [name for name in learners if name not in self.required_learners]
+        if invalid:
+            raise ValueError(f"Invalid learner(s) {invalid}. Must be a subset of {self.required_learners}.")
+
+        n_rep = self.n_rep
+        result: dict[str, np.ndarray] = {}
+
+        for name in learners:
+            target = self._nuisance_targets[name]  # (n_obs, n_rep)
+            pred = self._predictions[name]  # (n_obs, n_rep)
+
+            loss_arr = np.full(n_rep, np.nan)
+            for i_rep in range(n_rep):
+                mask = ~np.isnan(target[:, i_rep])
+                if not mask.any():
+                    continue
+
+                t, p = target[mask, i_rep], pred[mask, i_rep]
+
+                if metric is None:
+                    if name not in self._learners:
+                        # No registered learner type (external predictions) — infer from target values
+                        unique_vals = np.unique(t)
+                        is_binary = len(unique_vals) <= 2 and np.all(np.isin(unique_vals, [0, 1]))
+                        fn: Callable = log_loss if is_binary else root_mean_squared_error
+                    else:
+                        fn = log_loss if self._learners[name].is_classifier else root_mean_squared_error
+                else:
+                    fn = metric
+
+                res = fn(t, p)
+                if not np.isfinite(res):
+                    raise ValueError(
+                        f"Evaluation of learner '{name}' for repetition {i_rep} returned " f"a non-finite value: {res}."
+                    )
+                loss_arr[i_rep] = res
+
+            result[name] = loss_arr
+
+        return result
+
     # ==================== Abstract Methods (Must be Implemented by Subclasses) ====================
 
     def _post_nuisance_checks(self) -> None:
         """Post-nuisance prediction validation hook. Override in subclasses for model-specific checks."""
 
+    @abstractmethod
+    def _get_nuisance_targets(self) -> dict[str, np.ndarray | None]:
+        """
+        Return target arrays for nuisance loss evaluation.
+
+        Subclasses must implement this to provide targets for each learner.
+        Return ``None`` for learners whose targets cannot be recovered post-fit
+        (e.g. PLR ``ml_g`` whose target ``y - θ·d`` varies per repetition).
+
+        Returns
+        -------
+        dict[str, np.ndarray or None]
+            Dictionary mapping learner names to target arrays of shape ``(n_obs, n_rep)``,
+            or ``None`` where targets are not available.
+        """
+        pass
+
     @abstractmethod
     def _nuisance_est(
         self,
diff --git a/doubleml/irm/irm_scalar.py b/doubleml/irm/irm_scalar.py
index f6983368..4305eb27 100644
--- a/doubleml/irm/irm_scalar.py
+++ b/doubleml/irm/irm_scalar.py
@@ -305,6 +305,20 @@ def _nuisance_est(
 
     # ==================== Score Elements ====================
 
+    def _get_nuisance_targets(self) -> dict[str, np.ndarray | None]:
+        """Return target arrays for nuisance loss evaluation.
+
+        ml_g0 and ml_g1 are fitted only on the d==0 and d==1 subgroups respectively,
+        so targets for the opposite group are NaN. ml_m target is d (binary treatment).
+        """
+        y = self._dml_data.y
+        d = self._dml_data.d
+        return {
+            "ml_g0": np.tile(np.where(d == 0, y, np.nan)[:, np.newaxis], (1, self.n_rep)),
+            "ml_g1": np.tile(np.where(d == 1, y, np.nan)[:, np.newaxis], (1, self.n_rep)),
+            "ml_m": np.tile(d[:, np.newaxis], (1, self.n_rep)),
+        }
+
     def _get_score_elements(self) -> dict[str, np.ndarray]:
         y = self._dml_data.y
         d = self._dml_data.d
diff --git a/doubleml/irm/tests/test_irm_scalar_evaluate_learners.py b/doubleml/irm/tests/test_irm_scalar_evaluate_learners.py
new file mode 100644
index 00000000..40b7a8bd
--- /dev/null
+++ b/doubleml/irm/tests/test_irm_scalar_evaluate_learners.py
@@ -0,0 +1,265 @@
+"""Tests for evaluate_learners(), nuisance_loss, and nuisance_targets on IRM scalar models."""
+
+import numpy as np
+import pytest
+from sklearn.linear_model import Lasso, LogisticRegression
+from sklearn.metrics import log_loss, mean_absolute_error, r2_score, root_mean_squared_error
+
+from doubleml.irm.datasets import make_irm_data
+from doubleml.irm.irm_scalar import IRM
+
+N_OBS = 500
+N_FOLDS = 5
+N_REP = 2
+
+
+@pytest.fixture(scope="module")
+def irm_data():
+    """Shared IRM dataset."""
+    np.random.seed(3141)
+    return make_irm_data(n_obs=N_OBS, dim_x=5)
+
+
+@pytest.fixture(scope="module", params=["ATE", "ATTE"])
+def score(request):
+    """Parametrize over IRM score functions."""
+    return request.param
+
+
+@pytest.fixture(scope="module")
+def fitted_irm(score, irm_data):
+    """Fit an IRM model for the given score."""
+    model = IRM(irm_data, score=score)
+    model.set_learners(ml_g=Lasso(), ml_m=LogisticRegression())
+    model.fit(n_folds=N_FOLDS, n_rep=N_REP)
+    return model
+
+
+# ==================== nuisance_loss ====================
+
+
+@pytest.mark.ci
+def test_nuisance_loss_type_and_shape(fitted_irm):
+    """nuisance_loss is a dict of (n_rep,) arrays; all entries are finite."""
+    loss = fitted_irm.nuisance_loss
+
+    assert isinstance(loss, dict)
+    for name in ["ml_g0", "ml_g1", "ml_m"]:
+        assert isinstance(loss[name], np.ndarray)
+        assert loss[name].shape == (N_REP,)
+        assert np.all(np.isfinite(loss[name]))
+
+
+@pytest.mark.ci
+def test_nuisance_loss_ml_m_is_logloss(fitted_irm):
+    """ml_m loss uses logloss (classifier path) — positive finite values."""
+    loss = fitted_irm.nuisance_loss
+    assert np.all(loss["ml_m"] > 0)
+
+
+@pytest.mark.ci
+def test_nuisance_loss_ml_g_is_rmse(fitted_irm):
+    """ml_g0 and ml_g1 loss uses RMSE (regressor path) — positive finite values."""
+    loss = fitted_irm.nuisance_loss
+    assert np.all(loss["ml_g0"] > 0)
+    assert np.all(loss["ml_g1"] > 0)
+
+
+# ==================== nuisance_targets ====================
+
+
+@pytest.mark.ci
+def test_nuisance_targets_type_and_shape(fitted_irm):
+    """nuisance_targets is a dict; all entries are (n_obs, n_rep) arrays."""
+    targets = fitted_irm.nuisance_targets
+
+    assert isinstance(targets, dict)
+    for name in ["ml_g0", "ml_g1", "ml_m"]:
+        assert isinstance(targets[name], np.ndarray)
+        assert targets[name].shape == (N_OBS, N_REP)
+
+
+@pytest.mark.ci
+def test_nuisance_targets_ml_g0_partial_nan(fitted_irm, irm_data):
+    """ml_g0 target is y where d==0 and NaN where d==1."""
+    targets = fitted_irm.nuisance_targets
+    d = irm_data.d
+
+    for i_rep in range(N_REP):
+        col = targets["ml_g0"][:, i_rep]
+        assert np.all(np.isnan(col[d == 1]))
+        assert np.all(np.isfinite(col[d == 0]))
+
+
+@pytest.mark.ci
+def test_nuisance_targets_ml_g1_partial_nan(fitted_irm, irm_data):
+    """ml_g1 target is y where d==1 and NaN where d==0."""
+    targets = fitted_irm.nuisance_targets
+    d = irm_data.d
+
+    for i_rep in range(N_REP):
+        col = targets["ml_g1"][:, i_rep]
+        assert np.all(np.isnan(col[d == 0]))
+        assert np.all(np.isfinite(col[d == 1]))
+
+
+@pytest.mark.ci
+def test_nuisance_targets_ml_m_equals_d(fitted_irm, irm_data):
+    """ml_m target is d broadcast across repetitions."""
+    targets = fitted_irm.nuisance_targets
+    d = irm_data.d
+    for i_rep in range(N_REP):
+        np.testing.assert_array_equal(targets["ml_m"][:, i_rep], d)
+
+
+# ==================== evaluate_learners ====================
+
+
+@pytest.mark.ci
+def test_evaluate_learners_default(fitted_irm):
+    """Default evaluate_learners() returns finite values with correct shape."""
+    result = fitted_irm.evaluate_learners()
+
+    assert isinstance(result, dict)
+    for name in ["ml_g0", "ml_g1", "ml_m"]:
+        assert isinstance(result[name], np.ndarray)
+        assert result[name].shape == (N_REP,)
+        assert np.all(np.isfinite(result[name]))
+
+
+@pytest.mark.ci
+def test_evaluate_learners_logloss_ml_m_matches_nuisance_loss(fitted_irm):
+    """evaluate_learners with log_loss on ml_m matches nuisance_loss['ml_m']."""
+    result = fitted_irm.evaluate_learners(learners=["ml_m"], metric=log_loss)
+    loss = fitted_irm.nuisance_loss
+
+    np.testing.assert_allclose(result["ml_m"], loss["ml_m"], rtol=1e-9)
+
+
+@pytest.mark.ci
+def test_evaluate_learners_rmse_ml_g_matches_nuisance_loss(fitted_irm):
+    """evaluate_learners with RMSE on ml_g0/g1 matches nuisance_loss."""
+    result = fitted_irm.evaluate_learners(learners=["ml_g0", "ml_g1"], metric=root_mean_squared_error)
+    loss = fitted_irm.nuisance_loss
+
+    np.testing.assert_allclose(result["ml_g0"], loss["ml_g0"], rtol=1e-9)
+    np.testing.assert_allclose(result["ml_g1"], loss["ml_g1"], rtol=1e-9)
+
+
+@pytest.mark.ci
+def test_evaluate_learners_partial_nans_ml_g(fitted_irm):
+    """RMSE for ml_g0/g1 is finite despite NaN targets for the opposite treatment group."""
+    result = fitted_irm.evaluate_learners(learners=["ml_g0", "ml_g1"], metric=root_mean_squared_error)
+
+    assert np.all(np.isfinite(result["ml_g0"]))
+    assert np.all(np.isfinite(result["ml_g1"]))
+
+
+@pytest.mark.ci
+def test_evaluate_learners_r2(fitted_irm):
+    """evaluate_learners with r2_score returns values <= 1 with correct shape."""
+    result = fitted_irm.evaluate_learners(learners=["ml_g0", "ml_g1"], metric=r2_score)
+
+    for name in ["ml_g0", "ml_g1"]:
+        assert result[name].shape == (N_REP,)
+        assert np.all(result[name] <= 1.0)
+
+
+@pytest.mark.ci
+def test_evaluate_learners_mae(fitted_irm):
+    """evaluate_learners with mean_absolute_error returns positive values."""
+    result = fitted_irm.evaluate_learners(learners=["ml_g0", "ml_g1"], metric=mean_absolute_error)
+
+    for name in ["ml_g0", "ml_g1"]:
+        assert result[name].shape == (N_REP,)
+        assert np.all(result[name] > 0)
+
+
+@pytest.mark.ci
+def test_evaluate_learners_subset(fitted_irm):
+    """Requesting only ml_m returns only the ml_m key."""
+    result = fitted_irm.evaluate_learners(learners=["ml_m"])
+
+    assert list(result.keys()) == ["ml_m"]
+    assert result["ml_m"].shape == (N_REP,)
+
+
+@pytest.mark.ci
+def test_evaluate_learners_custom_metric(fitted_irm):
+    """A custom lambda metric produces consistent results."""
+    custom_mae = lambda y_true, y_pred: np.mean(np.abs(y_true - y_pred))  # noqa: E731
+    result_custom = fitted_irm.evaluate_learners(learners=["ml_g0"], metric=custom_mae)
+    result_sklearn = fitted_irm.evaluate_learners(learners=["ml_g0"], metric=mean_absolute_error)
+
+    np.testing.assert_allclose(result_custom["ml_g0"], result_sklearn["ml_g0"], rtol=1e-9)
+
+
+# ==================== Before-fit errors ====================
+
+
+@pytest.mark.ci
+def test_evaluate_learners_before_fit_raises(irm_data):
+    """evaluate_learners() raises ValueError before fit()."""
+    model = IRM(irm_data)
+    model.set_learners(ml_g=Lasso(), ml_m=LogisticRegression())
+
+    msg = r"Call fit\(\) or fit_nuisance_models\(\) first"
+    with pytest.raises(ValueError, match=msg):
+        model.evaluate_learners()
+
+
+@pytest.mark.ci
+def test_nuisance_loss_before_fit_raises(irm_data):
+    """nuisance_loss raises ValueError before fit()."""
+    model = IRM(irm_data)
+    model.set_learners(ml_g=Lasso(), ml_m=LogisticRegression())
+
+    msg = r"Call fit\(\) or fit_nuisance_models\(\) first"
+    with pytest.raises(ValueError, match=msg):
+        _ = model.nuisance_loss
+
+
+@pytest.mark.ci
+def test_nuisance_targets_before_fit_raises(irm_data):
+    """nuisance_targets raises ValueError before fit()."""
+    model = IRM(irm_data)
+    model.set_learners(ml_g=Lasso(), ml_m=LogisticRegression())
+
+    msg = r"Call fit\(\) or fit_nuisance_models\(\) first"
+    with pytest.raises(ValueError, match=msg):
+        _ = model.nuisance_targets
+
+
+# ==================== Input validation ====================
+
+
+@pytest.mark.ci
+def test_evaluate_learners_invalid_learner(fitted_irm):
+    """Requesting an unknown learner name raises ValueError."""
+    with pytest.raises(ValueError, match=r"Invalid learner"):
+        fitted_irm.evaluate_learners(learners=["ml_g0", "ml_unknown"])
+
+
+@pytest.mark.ci
+def test_evaluate_learners_invalid_metric(fitted_irm):
+    """Passing a non-callable metric raises TypeError."""
+    with pytest.raises(TypeError, match=r"metric must be callable"):
+        fitted_irm.evaluate_learners(metric="rmse")
+
+
+# ==================== Reset behaviour ====================
+
+
+@pytest.mark.ci
+def test_reset_clears_nuisance(irm_data):
+    """After draw_sample_splitting(), nuisance_loss raises ValueError."""
+    model = IRM(irm_data)
+    model.set_learners(ml_g=Lasso(), ml_m=LogisticRegression())
+    model.fit(n_folds=N_FOLDS, n_rep=N_REP)
+    assert model.nuisance_loss is not None
+
+    model.draw_sample_splitting(n_folds=N_FOLDS, n_rep=N_REP)
+
+    msg = r"Call fit\(\) or fit_nuisance_models\(\) first"
+    with pytest.raises(ValueError, match=msg):
+        _ = model.nuisance_loss
diff --git a/doubleml/plm/plr_scalar.py b/doubleml/plm/plr_scalar.py
index caf290de..0e1c149e 100644
--- a/doubleml/plm/plr_scalar.py
+++ b/doubleml/plm/plr_scalar.py
@@ -5,7 +5,7 @@
 from __future__ import annotations
 
 import warnings
-from typing import Any, ClassVar, Dict, List, Optional, Self
+from typing import Any, ClassVar, Self
 
 import numpy as np
 from sklearn.base import clone
@@ -38,7 +38,7 @@ class PLR(LinearScoreMixin):
     """
 
     # Define learner specifications for PLR
-    _LEARNER_SPECS: ClassVar[Dict[str, LearnerSpec]] = {
+    _LEARNER_SPECS: ClassVar[dict[str, LearnerSpec]] = {
         "ml_l": LearnerSpec("ml_l", allow_regressor=True, allow_classifier=True, binary_data_check="outcome"),
         "ml_m": LearnerSpec("ml_m", allow_regressor=True, allow_classifier=True, binary_data_check="treatment"),
         "ml_g": LearnerSpec("ml_g", allow_regressor=True, allow_classifier=False),
@@ -48,9 +48,9 @@ def __init__(
         self,
         obj_dml_data: DoubleMLData,
         score: str = "partialling out",
-        ml_l: Optional[object] = None,
-        ml_m: Optional[object] = None,
-        ml_g: Optional[object] = None,
+        ml_l: object | None = None,
+        ml_m: object | None = None,
+        ml_g: object | None = None,
     ):
         """
         Initialize PLR model.
@@ -88,7 +88,7 @@ def __init__(
             self.set_learners(ml_l=ml_l, ml_m=ml_m, ml_g=ml_g)
 
     @property
-    def required_learners(self) -> List[str]:
+    def required_learners(self) -> list[str]:
         """Required learners for current score."""
         names = ["ml_l", "ml_m"]
         if self.score == "IV-type":
@@ -97,9 +97,9 @@ def required_learners(self) -> List[str]:
 
     def set_learners(
         self,
-        ml_l: Optional[object] = None,
-        ml_m: Optional[object] = None,
-        ml_g: Optional[object] = None,
+        ml_l: object | None = None,
+        ml_m: object | None = None,
+        ml_g: object | None = None,
     ) -> Self:
         """
         Set the learners for nuisance estimation.
@@ -220,7 +220,7 @@ def _nuisance_est(
         test_idx: np.ndarray,
         i_rep: int,
         i_fold: int,
-        external_predictions: Optional[Dict[str, np.ndarray]] = None,
+        external_predictions: dict[str, np.ndarray] | None = None,
     ) -> None:
         x = self._dml_data.x
         y = self._dml_data.y
@@ -356,7 +356,24 @@ def _get_tuning_data(
 
         raise ValueError(f"Unknown learner '{learner_name}' for PLR.")
 
-    def _get_score_elements(self) -> Dict[str, np.ndarray]:
+    def _get_nuisance_targets(self) -> dict[str, np.ndarray | None]:
+        """Return target arrays for nuisance loss evaluation.
+
+        Returns y for ml_l, d for ml_m. For IV-type score, ml_g target is None because
+        the adjusted outcome y - θ·d depends on the estimated parameter and varies per
+        repetition, so it cannot be recovered post-fit.
+        """
+        y = self._dml_data.y
+        d = self._dml_data.d
+        targets: dict[str, np.ndarray | None] = {
+            "ml_l": np.tile(y[:, np.newaxis], (1, self.n_rep)),
+            "ml_m": np.tile(d[:, np.newaxis], (1, self.n_rep)),
+        }
+        if "ml_g" in self.required_learners:
+            targets["ml_g"] = None
+        return targets
+
+    def _get_score_elements(self) -> dict[str, np.ndarray]:
         y = self._dml_data.y
         d = self._dml_data.d
 
diff --git a/doubleml/plm/tests/test_plr_scalar_evaluate_learners.py b/doubleml/plm/tests/test_plr_scalar_evaluate_learners.py
new file mode 100644
index 00000000..bb843274
--- /dev/null
+++ b/doubleml/plm/tests/test_plr_scalar_evaluate_learners.py
@@ -0,0 +1,251 @@
+"""Tests for evaluate_learners(), nuisance_loss, and nuisance_targets on PLR scalar models."""
+
+import numpy as np
+import pytest
+from sklearn.linear_model import Lasso
+from sklearn.metrics import mean_absolute_error, r2_score, root_mean_squared_error
+
+from doubleml.plm.datasets import make_plr_CCDDHNR2018
+from doubleml.plm.plr_scalar import PLR
+
+N_OBS = 500
+N_FOLDS = 5
+N_REP = 2
+
+
+@pytest.fixture(scope="module")
+def plr_data():
+    """Shared PLR dataset."""
+    np.random.seed(3141)
+    return make_plr_CCDDHNR2018(n_obs=N_OBS, dim_x=5)
+
+
+@pytest.fixture(scope="module", params=["partialling out", "IV-type"])
+def score(request):
+    """Parametrize over PLR score functions."""
+    return request.param
+
+
+@pytest.fixture(scope="module")
+def fitted_plr(score, plr_data):
+    """Fit a PLR model for the given score."""
+    model = PLR(plr_data, score=score)
+    model.set_learners(ml_l=Lasso(), ml_m=Lasso())
+    model.fit(n_folds=N_FOLDS, n_rep=N_REP)
+    return model
+
+
+# ==================== nuisance_loss ====================
+
+
+@pytest.mark.ci
+def test_nuisance_loss_type_and_shape(fitted_plr):
+    """nuisance_loss is a dict of (n_rep,) arrays; ml_l/ml_m are finite; ml_g is NaN."""
+    loss = fitted_plr.nuisance_loss
+
+    assert isinstance(loss, dict)
+    for name in ["ml_l", "ml_m"]:
+        assert isinstance(loss[name], np.ndarray)
+        assert loss[name].shape == (N_REP,)
+        assert np.all(np.isfinite(loss[name]))
+
+    if fitted_plr.score == "IV-type":
+        assert isinstance(loss["ml_g"], np.ndarray)
+        assert loss["ml_g"].shape == (N_REP,)
+        assert np.all(np.isnan(loss["ml_g"]))
+
+
+@pytest.mark.ci
+def test_nuisance_loss_positive(fitted_plr):
+    """RMSE values for ml_l and ml_m are strictly positive."""
+    loss = fitted_plr.nuisance_loss
+    assert np.all(loss["ml_l"] > 0)
+    assert np.all(loss["ml_m"] > 0)
+
+
+# ==================== nuisance_targets ====================
+
+
+@pytest.mark.ci
+def test_nuisance_targets_type_and_shape(fitted_plr):
+    """nuisance_targets is a dict; ml_l/ml_m have real values; ml_g is all-NaN (IV-type)."""
+    targets = fitted_plr.nuisance_targets
+
+    assert isinstance(targets, dict)
+    for name in ["ml_l", "ml_m"]:
+        assert isinstance(targets[name], np.ndarray)
+        assert targets[name].shape == (N_OBS, N_REP)
+        assert not np.all(np.isnan(targets[name]))
+
+    if fitted_plr.score == "IV-type":
+        assert isinstance(targets["ml_g"], np.ndarray)
+        assert targets["ml_g"].shape == (N_OBS, N_REP)
+        assert np.all(np.isnan(targets["ml_g"]))
+
+
+@pytest.mark.ci
+def test_nuisance_targets_ml_l_equals_y(fitted_plr, plr_data):
+    """ml_l target is y broadcast across repetitions."""
+    targets = fitted_plr.nuisance_targets
+    y = plr_data.y
+    for i_rep in range(N_REP):
+        np.testing.assert_array_equal(targets["ml_l"][:, i_rep], y)
+
+
+@pytest.mark.ci
+def test_nuisance_targets_ml_m_equals_d(fitted_plr, plr_data):
+    """ml_m target is d broadcast across repetitions."""
+    targets = fitted_plr.nuisance_targets
+    d = plr_data.d
+    for i_rep in range(N_REP):
+        np.testing.assert_array_equal(targets["ml_m"][:, i_rep], d)
+
+
+# ==================== evaluate_learners ====================
+
+
+@pytest.mark.ci
+def test_evaluate_learners_default(fitted_plr):
+    """Default evaluate_learners() returns RMSE for ml_l and ml_m."""
+    result = fitted_plr.evaluate_learners()
+
+    assert isinstance(result, dict)
+    for name in ["ml_l", "ml_m"]:
+        assert isinstance(result[name], np.ndarray)
+        assert result[name].shape == (N_REP,)
+        assert np.all(result[name] > 0)
+
+
+@pytest.mark.ci
+def test_evaluate_learners_rmse_matches_nuisance_loss(fitted_plr):
+    """evaluate_learners with root_mean_squared_error matches nuisance_loss for ml_l and ml_m."""
+    result = fitted_plr.evaluate_learners(metric=root_mean_squared_error)
+    loss = fitted_plr.nuisance_loss
+
+    np.testing.assert_allclose(result["ml_l"], loss["ml_l"], rtol=1e-9)
+    np.testing.assert_allclose(result["ml_m"], loss["ml_m"], rtol=1e-9)
+
+
+@pytest.mark.ci
+def test_evaluate_learners_r2(fitted_plr):
+    """evaluate_learners with r2_score returns values <= 1 with correct shape."""
+    result = fitted_plr.evaluate_learners(learners=["ml_l", "ml_m"], metric=r2_score)
+
+    for name in ["ml_l", "ml_m"]:
+        assert result[name].shape == (N_REP,)
+        assert np.all(result[name] <= 1.0)
+
+
+@pytest.mark.ci
+def test_evaluate_learners_mae(fitted_plr):
+    """evaluate_learners with mean_absolute_error returns positive values with correct shape."""
+    result = fitted_plr.evaluate_learners(learners=["ml_l", "ml_m"], metric=mean_absolute_error)
+
+    for name in ["ml_l", "ml_m"]:
+        assert result[name].shape == (N_REP,)
+        assert np.all(result[name] > 0)
+
+
+@pytest.mark.ci
+def test_evaluate_learners_subset(fitted_plr):
+    """Requesting only ml_l returns only the ml_l key."""
+    result = fitted_plr.evaluate_learners(learners=["ml_l"])
+
+    assert list(result.keys()) == ["ml_l"]
+    assert result["ml_l"].shape == (N_REP,)
+
+
+@pytest.mark.ci
+def test_evaluate_learners_custom_metric(fitted_plr):
+    """A custom lambda metric produces consistent results."""
+    custom_mae = lambda y_true, y_pred: np.mean(np.abs(y_true - y_pred))  # noqa: E731
+    result_custom = fitted_plr.evaluate_learners(learners=["ml_l"], metric=custom_mae)
+    result_sklearn = fitted_plr.evaluate_learners(learners=["ml_l"], metric=mean_absolute_error)
+
+    np.testing.assert_allclose(result_custom["ml_l"], result_sklearn["ml_l"], rtol=1e-9)
+
+
+# ==================== Before-fit errors ====================
+
+
+@pytest.mark.ci
+def test_evaluate_learners_before_fit_raises(plr_data):
+    """evaluate_learners() raises ValueError before fit_nuisance_models()."""
+    model = PLR(plr_data)
+    model.set_learners(ml_l=Lasso(), ml_m=Lasso())
+
+    msg = r"Call fit\(\) or fit_nuisance_models\(\) first"
+    with pytest.raises(ValueError, match=msg):
+        model.evaluate_learners()
+
+
+@pytest.mark.ci
+def test_evaluate_learners_after_reset_raises(plr_data):
+    """evaluate_learners() raises ValueError after draw_sample_splitting() resets fit state."""
+    model = PLR(plr_data)
+    model.set_learners(ml_l=Lasso(), ml_m=Lasso())
+    model.draw_sample_splitting(n_folds=N_FOLDS, n_rep=N_REP)
+    model.fit_nuisance_models()
+    # Re-drawing splits resets fit state
+    model.draw_sample_splitting(n_folds=N_FOLDS, n_rep=N_REP)
+
+    msg = r"Call fit\(\) or fit_nuisance_models\(\) first"
+    with pytest.raises(ValueError, match=msg):
+        model.evaluate_learners()
+
+
+@pytest.mark.ci
+def test_nuisance_loss_before_fit_raises(plr_data):
+    """nuisance_loss raises ValueError before fit_nuisance_models()."""
+    model = PLR(plr_data)
+    model.set_learners(ml_l=Lasso(), ml_m=Lasso())
+
+    msg = r"Call fit\(\) or fit_nuisance_models\(\) first"
+    with pytest.raises(ValueError, match=msg):
+        _ = model.nuisance_loss
+
+
+@pytest.mark.ci
+def test_nuisance_targets_before_fit_raises(plr_data):
+    """nuisance_targets raises ValueError before fit_nuisance_models()."""
+    model = PLR(plr_data)
+    model.set_learners(ml_l=Lasso(), ml_m=Lasso())
+
+    msg = r"Call fit\(\) or fit_nuisance_models\(\) first"
+    with pytest.raises(ValueError, match=msg):
+        _ = model.nuisance_targets
+
+
+# ==================== Input validation ====================
+
+
+@pytest.mark.ci
+def test_evaluate_learners_invalid_learner(fitted_plr):
+    """Requesting an unknown learner name raises ValueError."""
+    with pytest.raises(ValueError, match=r"Invalid learner"):
+        fitted_plr.evaluate_learners(learners=["ml_l", "ml_unknown"])
+
+
+@pytest.mark.ci
+def test_evaluate_learners_invalid_metric(fitted_plr):
+    """Passing a non-callable metric raises TypeError."""
+    with pytest.raises(TypeError, match=r"metric must be callable"):
+        fitted_plr.evaluate_learners(metric="rmse")
+
+
+# ==================== Reset behaviour ====================
+
+
+@pytest.mark.ci
+def test_reset_clears_nuisance(plr_data):
+    """After draw_sample_splitting(), nuisance_loss raises ValueError."""
+    model = PLR(plr_data)
+    model.set_learners(ml_l=Lasso(), ml_m=Lasso())
+    model.fit(n_folds=N_FOLDS, n_rep=N_REP)
+    assert model.nuisance_loss is not None
+
+    model.draw_sample_splitting(n_folds=N_FOLDS, n_rep=N_REP)
+
+    msg = r"Call fit\(\) or fit_nuisance_models\(\) first"
+    with pytest.raises(ValueError, match=msg):
+        _ = model.nuisance_loss

From 050fa27035c03bc3eb2816339ec018ba37d9add6 Mon Sep 17 00:00:00 2001
From: SvenKlaassen <sven.klaassen@uni-hamburg.de>
Date: Sun, 1 Mar 2026 12:00:02 +0100
Subject: [PATCH 21/38] Implement sensitivity analysis for scalar models in
 DoubleML

- Added `_sensitivity_element_est` method to `DoubleMLScalar`, `IRM`, and `PLR` classes to compute sensitivity elements including sigma2, nu2, and their influence functions.
- Introduced `sensitivity_elements` property to retrieve computed sensitivity elements after model fitting.
- Implemented validation checks for sensitivity elements in `DoubleMLScalar`.
- Added exception handling for sensitivity analysis methods in `IRM` and `PLR` classes to ensure proper input types and values.
- Created unit tests for sensitivity analysis, including checks for element shapes, bounds, and exception handling in both `IRM` and `PLR` models.
- Ensured compatibility of sensitivity elements between scalar and legacy models in comparison tests.
---
 .claude/rules/testing-conventions.md          | 61 +++++++++++++-
 doubleml/double_ml_scalar.py                  | 81 ++++++++++++++++++
 doubleml/irm/irm_scalar.py                    | 61 ++++++++++++++
 .../irm/tests/test_irm_scalar_exceptions.py   | 65 ++++++++++++++
 .../irm/tests/test_irm_scalar_return_types.py | 78 +++++++++++++++++
 .../irm/tests/test_irm_scalar_sensitivity.py  | 84 +++++++++++++++++++
 doubleml/irm/tests/test_irm_scalar_vs_irm.py  | 31 +++++++
 doubleml/plm/plr_scalar.py                    | 56 +++++++++++++
 .../plm/tests/test_plr_scalar_exceptions.py   | 66 +++++++++++++++
 .../plm/tests/test_plr_scalar_return_types.py | 75 +++++++++++++++++
 .../plm/tests/test_plr_scalar_sensitivity.py  | 81 ++++++++++++++++++
 doubleml/plm/tests/test_plr_scalar_vs_plr.py  | 31 +++++++
 12 files changed, 769 insertions(+), 1 deletion(-)
 create mode 100644 doubleml/irm/tests/test_irm_scalar_sensitivity.py
 create mode 100644 doubleml/plm/tests/test_plr_scalar_sensitivity.py

diff --git a/.claude/rules/testing-conventions.md b/.claude/rules/testing-conventions.md
index e48b5fa8..46a83b42 100644
--- a/.claude/rules/testing-conventions.md
+++ b/.claude/rules/testing-conventions.md
@@ -137,9 +137,66 @@ For models with `_LEARNER_PARAM_ALIASES` (e.g., IRM `"ml_g"` → `["ml_g0", "ml_
 
 ---
 
+## Evaluate Learners Tests (`test_<model>_scalar_evaluate_learners.py`)
+
+Scalar models with `evaluate_learners()` require a dedicated test file. Constants: `N_OBS=500`, `N_FOLDS=5`, `N_REP=2`. Score-parametrized fixture (same pattern as tuning tests).
+
+**Required tests:**
+
+| Test | Checks |
+|------|--------|
+| `test_nuisance_loss_type_and_shape` | `dict`; each value `shape == (N_REP,)`; finite or NaN as expected |
+| `test_nuisance_loss_positive` | RMSE > 0 for learners with real targets |
+| `test_nuisance_targets_type_and_shape` | `shape == (N_OBS, N_REP)`; NaN arrays for unknown targets |
+| `test_nuisance_targets_correct_values` | ml_l target == y; ml_m target == d (model-specific) |
+| `test_evaluate_learners_default` | Default metric returns finite positive values |
+| `test_evaluate_learners_rmse_matches_nuisance_loss` | `evaluate_learners(root_mean_squared_error)` equals `nuisance_loss` |
+| `test_evaluate_learners_r2` | R² ≤ 1; correct shape |
+| `test_evaluate_learners_mae` | MAE > 0; correct shape |
+| `test_evaluate_learners_subset` | `learners=["ml_l"]` returns only `"ml_l"` key |
+| `test_evaluate_learners_custom_metric` | Lambda metric matches sklearn equivalent |
+| `test_evaluate_learners_before_fit_raises` | `ValueError` before `fit_nuisance_models()` |
+| `test_evaluate_learners_after_reset_raises` | `ValueError` after `draw_sample_splitting()` |
+| `test_nuisance_loss_before_fit_raises` | `ValueError` on `.nuisance_loss` before fit |
+| `test_nuisance_targets_before_fit_raises` | `ValueError` on `.nuisance_targets` before fit |
+| `test_evaluate_learners_invalid_learner` | Unknown learner name raises `ValueError` |
+| `test_evaluate_learners_invalid_metric` | Non-callable metric raises `TypeError` |
+| `test_reset_clears_nuisance` | After `draw_sample_splitting()`, `nuisance_loss` raises |
+
+NaN conventions: PLR `ml_g` → all-NaN; IRM `ml_g0` → NaN for `d==1`; `ml_g1` → NaN for `d==0`.
+
+---
+
+## Sensitivity Tests (`test_<model>_scalar_sensitivity.py`)
+
+Scalar models with `_sensitivity_element_est()` require a dedicated test file. Constants: `N_OBS=500`, `N_FOLDS=5`, `N_REP=2`. Score-parametrized `fitted_<model>` fixture.
+
+**Exception tests** go in `test_<model>_scalar_exceptions.py` — not in this file:
+
+| Test | Input | Expected |
+|------|-------|----------|
+| `test_exception_sensitivity_before_fit` | Call before `fit()` | `ValueError` matching `"The framework is not yet initialized"` |
+| `test_exception_sensitivity_cf_y` | `cf_y=1` (int) / `cf_y=1.0` (boundary) | `TypeError` / `ValueError` |
+| `test_exception_sensitivity_cf_d` | `cf_d=1` / `cf_d=1.0` | `TypeError` / `ValueError` |
+| `test_exception_sensitivity_rho` | `rho=1` (int) / `rho=1.1` (out of range) | `TypeError` / `ValueError` |
+| `test_exception_sensitivity_level` | `level=1` (int) / `level=0.0` (boundary) | `TypeError` / `ValueError` |
+| `test_exception_sensitivity_null_hypothesis` | Wrong shape array | `ValueError` |
+
+**Required tests** (parametrize over all scores):
+
+| Test | Checks |
+|------|--------|
+| `test_sensitivity_elements_positive` | `sigma2 >= 0`, `nu2 > 0`, `max_bias >= 0` |
+| `test_sensitivity_params_structure` | After `sensitivity_analysis()`: `theta/se/ci` have `lower`/`upper`; `rv`/`rva` in [0, 1] |
+| `test_sensitivity_params_bounds_ordered` | `theta["lower"] <= coef <= theta["upper"]` |
+| `test_sensitivity_rho0` | `rho=0.0`: `se["lower"] ≈ se["upper"] ≈ model.se` (`rtol=1e-6`) |
+| `test_sensitivity_monotonicity_cf_y` | `cf_y=0.15` → theta bounds at least as wide as with `cf_y=0.03` (`width_high >= width_low`) |
+
+---
+
 ## Naming
 
-- Files: `test_<model>.py`, `test_<model>_scalar.py`, `test_<model>_scalar_exceptions.py`, `test_<model>_scalar_tune_ml_models.py`
+- Files: `test_<model>.py`, `test_<model>_scalar.py`, `test_<model>_scalar_exceptions.py`, `test_<model>_scalar_tune_ml_models.py`, `test_<model>_scalar_evaluate_learners.py`, `test_<model>_scalar_sensitivity.py`
 - Functions: `test_<what>` — e.g., `test_coef_within_3_sigma`, `test_exception_invalid_score`
 - Docstrings: Every test function gets a one-line docstring explaining what it verifies
 
@@ -152,3 +209,5 @@ For models with `_LEARNER_PARAM_ALIASES` (e.g., IRM `"ml_g"` → `["ml_g0", "ml_
 - [ ] Test functions have descriptive names and docstrings
 - [ ] New scalar models have all 5 required test files (see `dml-scalar-test-structure.md`)
 - [ ] If model has `tune_ml_models()`, add `test_<model>_scalar_tune_ml_models.py` with all required tuning tests
+- [ ] If model has `evaluate_learners()` / `nuisance_loss`, add `test_<model>_scalar_evaluate_learners.py`
+- [ ] If model has `_sensitivity_element_est()`, add sensitivity exception tests to `test_<model>_scalar_exceptions.py` and add `test_<model>_scalar_sensitivity.py`
diff --git a/doubleml/double_ml_scalar.py b/doubleml/double_ml_scalar.py
index f685cce9..d10c1c49 100644
--- a/doubleml/double_ml_scalar.py
+++ b/doubleml/double_ml_scalar.py
@@ -17,6 +17,7 @@
 from .double_ml_framework import DoubleMLFramework
 from .utils._checks import _check_sample_splitting
 from .utils._learner import LearnerInfo, LearnerSpec, validate_learner
+from .utils._sensitivity import _compute_sensitivity_bias
 from .utils._tune_optuna import OPTUNA_GLOBAL_SETTING_KEYS, _dml_tune_optuna, resolve_optuna_cv
 from .utils.resampling import DoubleMLClusterResampling, DoubleMLResampling
 
@@ -106,6 +107,7 @@ def __init__(
         self._predictions: dict[str, np.ndarray] | None = None
         self._nuisance_targets: dict[str, np.ndarray] | None = None
         self._nuisance_loss: dict[str, np.ndarray] | None = None
+        self._sensitivity_elements: dict[str, np.ndarray] | None = None
         self._all_thetas: np.ndarray | None = None
         self._all_ses: np.ndarray | None = None
         self._psi: np.ndarray | None = None
@@ -232,6 +234,22 @@ def nuisance_loss(self) -> dict[str, np.ndarray]:
             raise ValueError("Nuisance loss not available. Call fit() or fit_nuisance_models() first.")
         return self._nuisance_loss
 
+    @property
+    def sensitivity_elements(self) -> dict[str, np.ndarray] | None:
+        """
+        Raw sensitivity elements computed after :meth:`fit`.
+
+        Returns ``None`` if sensitivity analysis is not implemented for this model
+        or if the model has not been fitted yet.
+
+        Returns
+        -------
+        dict[str, np.ndarray] or None
+            Dictionary with keys ``'sigma2'``, ``'nu2'`` (shape ``(1, 1, n_rep)``),
+            ``'psi_sigma2'``, ``'psi_nu2'``, ``'riesz_rep'`` (shape ``(n_obs, 1, n_rep)``).
+        """
+        return self._sensitivity_elements
+
     @property
     def smpls(self) -> list:
         """
@@ -552,6 +570,10 @@ def estimate_causal_parameters(self) -> Self:
         # Estimate causal parameters - from score mixin
         self._est_causal_pars_and_se(psi_elements)
 
+        # Compute sensitivity elements (optional hook — None by default)
+        self._sensitivity_elements = self._sensitivity_element_est()
+        self._validate_sensitivity_elements()
+
         # Construct framework
         self._framework = self._construct_framework()
 
@@ -783,6 +805,22 @@ def _construct_framework(self) -> DoubleMLFramework:
                 "n_folds_per_cluster": self._n_folds_per_cluster,
             }
 
+        # Compute framework-ready sensitivity elements if available
+        sensitivity_elements_for_framework: dict[str, np.ndarray] | None = None
+        if self._sensitivity_elements is not None:
+            max_bias, psi_max_bias = _compute_sensitivity_bias(
+                sigma2=self._sensitivity_elements["sigma2"],
+                nu2=self._sensitivity_elements["nu2"],
+                psi_sigma2=self._sensitivity_elements["psi_sigma2"],
+                psi_nu2=self._sensitivity_elements["psi_nu2"],
+            )
+            sensitivity_elements_for_framework = {
+                "max_bias": max_bias,  # (1, 1, n_rep)
+                "psi_max_bias": psi_max_bias,  # (n_obs, 1, n_rep)
+                "sigma2": self._sensitivity_elements["sigma2"],  # (1, 1, n_rep)
+                "nu2": self._sensitivity_elements["nu2"],  # (1, 1, n_rep)
+            }
+
         # Create data container (no transpose needed - already in framework convention!)
         framework_data = DoubleMLCoreData(
             all_thetas=self._all_thetas,  # (n_thetas, n_rep)
@@ -791,6 +829,7 @@ def _construct_framework(self) -> DoubleMLFramework:
             scaled_psi=scaled_psi,  # (n_obs, n_thetas, n_rep)
             is_cluster_data=self._dml_data.is_cluster_data,
             cluster_dict=cluster_dict,
+            sensitivity_elements=sensitivity_elements_for_framework,
         )
 
         # Create and return framework
@@ -804,6 +843,7 @@ def _reset_fit_state(self) -> None:
         self._predictions = None
         self._nuisance_targets = None
         self._nuisance_loss = None
+        self._sensitivity_elements = None
         self._framework = None
         self._all_thetas = None
         self._all_ses = None
@@ -912,6 +952,47 @@ def evaluate_learners(
     def _post_nuisance_checks(self) -> None:
         """Post-nuisance prediction validation hook. Override in subclasses for model-specific checks."""
 
+    def _sensitivity_element_est(self) -> dict[str, np.ndarray] | None:
+        """
+        Compute sensitivity analysis elements after causal parameter estimation.
+
+        Optional hook called after :meth:`_est_causal_pars_and_se` in
+        :meth:`estimate_causal_parameters`. Override in subclasses to enable
+        sensitivity analysis via :meth:`sensitivity_analysis`.
+
+        Implementations should access ``self._predictions``, ``self._dml_data``,
+        and ``self._all_thetas`` directly and compute results vectorized over all
+        ``n_rep`` repetitions at once.
+
+        Returns
+        -------
+        dict[str, np.ndarray] or None
+            Dictionary with keys ``'sigma2'``, ``'nu2'`` (shape ``(1, 1, n_rep)``),
+            ``'psi_sigma2'``, ``'psi_nu2'``, ``'riesz_rep'`` (shape ``(n_obs, 1, n_rep)``).
+            Return ``None`` (default) if sensitivity analysis is not implemented.
+        """
+        return None
+
+    def _validate_sensitivity_elements(self) -> None:
+        """Warn and re-estimate nu2/psi_nu2 from the Riesz representer if any nu2 is non-positive (e.g. degenerate PS)."""
+        import warnings
+
+        if self._sensitivity_elements is None:
+            return
+        nu2 = self._sensitivity_elements["nu2"]  # (1, 1, n_rep)
+        rr = self._sensitivity_elements["riesz_rep"]  # (n_obs, 1, n_rep)
+        if np.any(nu2 <= 0):
+            treatment_name = self._dml_data.d_cols[0]
+            warnings.warn(
+                f"The estimated nu2 for treatment '{treatment_name}' is not positive. "
+                "Re-estimation based on riesz representer (non-orthogonal).",
+                UserWarning,
+            )
+            psi_nu2_new = rr**2
+            nu2_new = np.mean(psi_nu2_new, axis=0, keepdims=True)
+            self._sensitivity_elements["nu2"] = nu2_new
+            self._sensitivity_elements["psi_nu2"] = psi_nu2_new - nu2_new
+
     @abstractmethod
     def _get_nuisance_targets(self) -> dict[str, np.ndarray | None]:
         """
diff --git a/doubleml/irm/irm_scalar.py b/doubleml/irm/irm_scalar.py
index 4305eb27..f6fe85de 100644
--- a/doubleml/irm/irm_scalar.py
+++ b/doubleml/irm/irm_scalar.py
@@ -475,3 +475,64 @@ def _get_weights(self, m_hat: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
             weights_bar = np.divide(m_hat * w[:, np.newaxis], subgroup_probability)
 
         return weights, weights_bar
+
+    def _sensitivity_element_est(self) -> dict[str, np.ndarray] | None:
+        """
+        Compute IRM sensitivity elements vectorized over all repetitions.
+
+        Reproduces the propensity score processing and weight computation from
+        :meth:`_get_score_elements` to compute sigma2, nu2, their influence
+        functions, and the Riesz representer.
+
+        Returns
+        -------
+        dict[str, np.ndarray] or None
+            Dictionary with keys ``'sigma2'``, ``'nu2'`` (shape ``(1, 1, n_rep)``),
+            ``'psi_sigma2'``, ``'psi_nu2'``, ``'riesz_rep'`` (shape ``(n_obs, 1, n_rep)``).
+        """
+        y = self._dml_data.y  # (n_obs,)
+        d = self._dml_data.d  # (n_obs,)
+        g_hat0 = self._predictions["ml_g0"]  # (n_obs, n_rep)
+        g_hat1 = self._predictions["ml_g1"]  # (n_obs, n_rep)
+        m_hat_raw = self._predictions["ml_m"]  # (n_obs, n_rep)
+
+        # Reproduce PS processing (same per-rep loop as _get_score_elements)
+        m_hat = np.zeros_like(m_hat_raw)
+        for i_rep in range(self.n_rep):
+            m_hat[:, i_rep] = self._ps_processor.adjust_ps(m_hat_raw[:, i_rep], d, cv=self._smpls[i_rep], learner_name="ml_m")
+        m_hat_adj = np.zeros_like(m_hat)
+        for i_rep in range(self.n_rep):
+            m_hat_adj[:, i_rep] = _propensity_score_adjustment(
+                propensity_score=m_hat[:, i_rep],
+                treatment_indicator=d,
+                normalize_ipw=self.normalize_ipw,
+            )
+
+        d2d = d[:, np.newaxis]  # (n_obs, 1) for broadcasting
+
+        # sigma2: mean (over observations, per repetition) of the squared outcome-regression residual
+        sigma2_score = (y[:, np.newaxis] - d2d * g_hat1 - (1.0 - d2d) * g_hat0) ** 2  # (n_obs, n_rep)
+        sigma2_mean = np.mean(sigma2_score, axis=0)  # (n_rep,)
+        psi_sigma2 = sigma2_score - sigma2_mean[np.newaxis, :]  # (n_obs, n_rep)
+        sigma2 = sigma2_mean[np.newaxis, np.newaxis, :]  # (1, 1, n_rep)
+        psi_sigma2 = psi_sigma2[:, np.newaxis, :]  # (n_obs, 1, n_rep)
+
+        # Riesz representer and nu2 — uses _get_weights which vectorizes over n_rep
+        weights, weights_bar = self._get_weights(m_hat_adj)  # each (n_obs, n_rep)
+        rr_2d = weights_bar * (np.divide(d2d, m_hat_adj) - np.divide(1.0 - d2d, 1.0 - m_hat_adj))  # (n_obs, n_rep)
+        m_alpha = weights * weights_bar * (np.divide(1.0, m_hat_adj) + np.divide(1.0, 1.0 - m_hat_adj))  # (n_obs, n_rep)
+
+        nu2_score = 2.0 * m_alpha - rr_2d**2  # (n_obs, n_rep)
+        nu2_mean = np.mean(nu2_score, axis=0)  # (n_rep,)
+        psi_nu2 = nu2_score - nu2_mean[np.newaxis, :]  # (n_obs, n_rep)
+        nu2 = nu2_mean[np.newaxis, np.newaxis, :]  # (1, 1, n_rep)
+        psi_nu2 = psi_nu2[:, np.newaxis, :]  # (n_obs, 1, n_rep)
+        rr = rr_2d[:, np.newaxis, :]  # (n_obs, 1, n_rep)
+
+        return {
+            "sigma2": sigma2,
+            "nu2": nu2,
+            "psi_sigma2": psi_sigma2,
+            "psi_nu2": psi_nu2,
+            "riesz_rep": rr,
+        }
diff --git a/doubleml/irm/tests/test_irm_scalar_exceptions.py b/doubleml/irm/tests/test_irm_scalar_exceptions.py
index 59dc91a1..2fe5fd35 100644
--- a/doubleml/irm/tests/test_irm_scalar_exceptions.py
+++ b/doubleml/irm/tests/test_irm_scalar_exceptions.py
@@ -163,3 +163,68 @@ def test_irm_scalar_exception_binary_predictions_g():
     msg = r"For the binary variable .+, predictions .+ are also observed to be binary"
     with pytest.raises(ValueError, match=msg):
         dml_obj.fit_nuisance_models()
+
+
+# ==================== sensitivity_analysis exceptions ====================
+
+
+@pytest.fixture(scope="module")
+def fitted_irm_for_sensitivity():
+    """Fitted IRM model for sensitivity exception tests."""
+    dml_obj = IRM(obj_dml_data, ml_g=ml_g, ml_m=ml_m)
+    dml_obj.fit(n_folds=3, n_rep=1)
+    return dml_obj
+
+
+@pytest.mark.ci
+def test_exception_sensitivity_before_fit():
+    """sensitivity_analysis() raises ValueError before fit()."""
+    dml_obj = IRM(obj_dml_data)
+    msg = r"The framework is not yet initialized"
+    with pytest.raises(ValueError, match=msg):
+        dml_obj.sensitivity_analysis()
+
+
+@pytest.mark.ci
+def test_exception_sensitivity_cf_y(fitted_irm_for_sensitivity):
+    """cf_y must be a float in [0,1)."""
+    with pytest.raises(TypeError, match=r"cf_y must be of float type"):
+        fitted_irm_for_sensitivity.sensitivity_analysis(cf_y=1)
+    with pytest.raises(ValueError, match=r"cf_y must be in \[0,1\)"):
+        fitted_irm_for_sensitivity.sensitivity_analysis(cf_y=1.0)
+
+
+@pytest.mark.ci
+def test_exception_sensitivity_cf_d(fitted_irm_for_sensitivity):
+    """cf_d must be a float in [0,1)."""
+    with pytest.raises(TypeError, match=r"cf_d must be of float type"):
+        fitted_irm_for_sensitivity.sensitivity_analysis(cf_d=1)
+    with pytest.raises(ValueError, match=r"cf_d must be in \[0,1\)"):
+        fitted_irm_for_sensitivity.sensitivity_analysis(cf_d=1.0)
+
+
+@pytest.mark.ci
+def test_exception_sensitivity_rho(fitted_irm_for_sensitivity):
+    """rho must be a float with |rho| <= 1."""
+    with pytest.raises(TypeError, match=r"rho must be of float type"):
+        fitted_irm_for_sensitivity.sensitivity_analysis(rho=1)
+    with pytest.raises(ValueError, match=r"The absolute value of rho must be in \[0,1\]"):
+        fitted_irm_for_sensitivity.sensitivity_analysis(rho=1.1)
+
+
+@pytest.mark.ci
+def test_exception_sensitivity_level(fitted_irm_for_sensitivity):
+    """level must be a float in (0,1)."""
+    with pytest.raises(TypeError, match=r"The confidence level must be of float type"):
+        fitted_irm_for_sensitivity.sensitivity_analysis(level=1)
+    with pytest.raises(ValueError, match=r"The confidence level must be in \(0,1\)"):
+        fitted_irm_for_sensitivity.sensitivity_analysis(level=0.0)
+
+
+@pytest.mark.ci
+def test_exception_sensitivity_null_hypothesis(fitted_irm_for_sensitivity):
+    """null_hypothesis with wrong shape raises ValueError."""
+    import numpy as np
+
+    with pytest.raises(ValueError, match=r"null_hypothesis"):
+        fitted_irm_for_sensitivity.sensitivity_analysis(null_hypothesis=np.array([0.0, 0.0]))
diff --git a/doubleml/irm/tests/test_irm_scalar_return_types.py b/doubleml/irm/tests/test_irm_scalar_return_types.py
index a437f49d..4b755899 100644
--- a/doubleml/irm/tests/test_irm_scalar_return_types.py
+++ b/doubleml/irm/tests/test_irm_scalar_return_types.py
@@ -214,3 +214,81 @@ def test_reset_after_draw_sample_splitting():
         _ = dml_obj.coef
     with pytest.raises(ValueError, match="Predictions not available. Call fit"):
         _ = dml_obj.predictions
+
+
+@pytest.mark.ci
+def test_sensitivity_elements_type_and_shape(fitted_dml_obj):
+    """sensitivity_elements has correct keys, types, and shapes after fit."""
+    elems = fitted_dml_obj.sensitivity_elements
+    assert isinstance(elems, dict)
+    for key in ["sigma2", "nu2"]:
+        assert key in elems
+        assert isinstance(elems[key], np.ndarray)
+        assert elems[key].shape == (1, 1, N_REP)
+    for key in ["psi_sigma2", "psi_nu2", "riesz_rep"]:
+        assert key in elems
+        assert isinstance(elems[key], np.ndarray)
+        assert elems[key].shape == (N_OBS, 1, N_REP)
+
+
+@pytest.mark.ci
+def test_sensitivity_analysis_runs(fitted_dml_obj):
+    """sensitivity_analysis() completes without error and returns the model's framework object."""
+    result = fitted_dml_obj.sensitivity_analysis(cf_y=0.03, cf_d=0.03, rho=1.0)
+    assert result is fitted_dml_obj.framework
+
+
+@pytest.mark.ci
+def test_sensitivity_before_fit_is_none():
+    """sensitivity_elements returns None before fit()."""
+    dml_obj = IRM(obj_dml_data)
+    assert dml_obj.sensitivity_elements is None
+
+
+@pytest.mark.ci
+def test_sensitivity_reset_after_draw_sample_splitting():
+    """sensitivity_elements resets to None after draw_sample_splitting()."""
+    np.random.seed(3141)
+    dml_obj = IRM(obj_dml_data)
+    dml_obj.set_learners(
+        ml_g=RandomForestRegressor(n_estimators=10, max_depth=3, random_state=42),
+        ml_m=RandomForestClassifier(n_estimators=10, max_depth=3, random_state=42),
+    )
+    dml_obj.draw_sample_splitting(n_folds=N_FOLDS, n_rep=N_REP)
+    dml_obj.fit()
+    assert dml_obj.sensitivity_elements is not None
+    dml_obj.draw_sample_splitting(n_folds=N_FOLDS, n_rep=N_REP)
+    assert dml_obj.sensitivity_elements is None
+
+
+@pytest.mark.ci
+def test_sensitivity_params_structure(fitted_dml_obj):
+    """sensitivity_params has expected keys and finite rv/rva after sensitivity_analysis()."""
+    fitted_dml_obj.sensitivity_analysis(cf_y=0.03, cf_d=0.03)
+    params = fitted_dml_obj.framework.sensitivity_params
+    for key in ["theta", "se", "ci"]:
+        assert "lower" in params[key] and "upper" in params[key]
+    for key in ["rv", "rva"]:
+        assert np.all(np.isfinite(params[key]))
+        assert np.all(params[key] >= 0) and np.all(params[key] <= 1)
+
+
+@pytest.mark.ci
+def test_sensitivity_rho0_se_bounds(fitted_dml_obj):
+    """With rho=0, se lower and upper bounds equal the unadjusted se."""
+    fitted_dml_obj.sensitivity_analysis(cf_y=0.03, cf_d=0.03, rho=0.0)
+    params = fitted_dml_obj.framework.sensitivity_params
+    np.testing.assert_allclose(params["se"]["lower"], fitted_dml_obj.se, rtol=1e-6)
+    np.testing.assert_allclose(params["se"]["upper"], fitted_dml_obj.se, rtol=1e-6)
+
+
+@pytest.mark.ci
+def test_sensitivity_monotonicity_cf_y(fitted_dml_obj):
+    """Increasing cf_y widens the theta sensitivity bounds."""
+    fitted_dml_obj.sensitivity_analysis(cf_y=0.03, cf_d=0.03, rho=1.0)
+    params_low = fitted_dml_obj.framework.sensitivity_params
+    width_low = params_low["theta"]["upper"] - params_low["theta"]["lower"]
+    fitted_dml_obj.sensitivity_analysis(cf_y=0.15, cf_d=0.03, rho=1.0)
+    params_high = fitted_dml_obj.framework.sensitivity_params
+    width_high = params_high["theta"]["upper"] - params_high["theta"]["lower"]
+    assert np.all(width_high >= width_low)
diff --git a/doubleml/irm/tests/test_irm_scalar_sensitivity.py b/doubleml/irm/tests/test_irm_scalar_sensitivity.py
new file mode 100644
index 00000000..d3682e9e
--- /dev/null
+++ b/doubleml/irm/tests/test_irm_scalar_sensitivity.py
@@ -0,0 +1,84 @@
+"""Score-parametrized sensitivity analysis tests for IRM scalar models."""
+
+import numpy as np
+import pytest
+from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
+
+from doubleml.irm.datasets import make_irm_data
+from doubleml.irm.irm_scalar import IRM
+
+N_OBS = 500
+N_FOLDS = 5
+N_REP = 2
+
+
+@pytest.fixture(scope="module")
+def irm_data():
+    """Shared IRM dataset."""
+    np.random.seed(3141)
+    return make_irm_data(theta=0.5, n_obs=N_OBS, dim_x=5, return_type="DoubleMLData")
+
+
+@pytest.fixture(scope="module", params=["ATE", "ATTE"])
+def fitted_irm(request, irm_data):
+    """Fitted IRM model parametrized over both score variants."""
+    dml_obj = IRM(irm_data, score=request.param)
+    dml_obj.set_learners(
+        ml_g=RandomForestRegressor(n_estimators=10, max_depth=3, random_state=42),
+        ml_m=RandomForestClassifier(n_estimators=10, max_depth=3, random_state=42),
+    )
+    dml_obj.fit(n_folds=N_FOLDS, n_rep=N_REP)
+    return dml_obj
+
+
+@pytest.mark.ci
+def test_sensitivity_elements_positive(fitted_irm):
+    """sigma2 >= 0, nu2 > 0, and max_bias >= 0 for each score variant."""
+    elems = fitted_irm.sensitivity_elements
+    assert np.all(elems["sigma2"] >= 0)
+    assert np.all(elems["nu2"] > 0)
+    assert np.all(fitted_irm.framework.sensitivity_elements["max_bias"] >= 0)
+
+
+@pytest.mark.ci
+def test_sensitivity_params_structure(fitted_irm):
+    """After sensitivity_analysis(), theta/se/ci have lower/upper; rv/rva in [0,1]."""
+    fitted_irm.sensitivity_analysis(cf_y=0.03, cf_d=0.03, rho=1.0)
+    params = fitted_irm.framework.sensitivity_params
+    for key in ["theta", "se", "ci"]:
+        assert "lower" in params[key] and "upper" in params[key]
+    for key in ["rv", "rva"]:
+        assert np.all(np.isfinite(params[key]))
+        assert np.all(params[key] >= 0) and np.all(params[key] <= 1)
+
+
+@pytest.mark.ci
+def test_sensitivity_params_bounds_ordered(fitted_irm):
+    """theta lower bound <= estimated coef <= theta upper bound."""
+    fitted_irm.sensitivity_analysis(cf_y=0.03, cf_d=0.03, rho=1.0)
+    params = fitted_irm.framework.sensitivity_params
+    assert np.all(params["theta"]["lower"] <= fitted_irm.coef)
+    assert np.all(fitted_irm.coef <= params["theta"]["upper"])
+
+
+@pytest.mark.ci
+def test_sensitivity_rho0(fitted_irm):
+    """With rho=0, se lower and upper bounds equal the unadjusted se."""
+    fitted_irm.sensitivity_analysis(cf_y=0.03, cf_d=0.03, rho=0.0)
+    params = fitted_irm.framework.sensitivity_params
+    np.testing.assert_allclose(params["se"]["lower"], fitted_irm.se, rtol=1e-6)
+    np.testing.assert_allclose(params["se"]["upper"], fitted_irm.se, rtol=1e-6)
+
+
+@pytest.mark.ci
+def test_sensitivity_monotonicity_cf_y(fitted_irm):
+    """Increasing cf_y produces wider theta sensitivity bounds."""
+    fitted_irm.sensitivity_analysis(cf_y=0.03, cf_d=0.03, rho=1.0)
+    width_low = (
+        fitted_irm.framework.sensitivity_params["theta"]["upper"] - fitted_irm.framework.sensitivity_params["theta"]["lower"]
+    )
+    fitted_irm.sensitivity_analysis(cf_y=0.15, cf_d=0.03, rho=1.0)
+    width_high = (
+        fitted_irm.framework.sensitivity_params["theta"]["upper"] - fitted_irm.framework.sensitivity_params["theta"]["lower"]
+    )
+    assert np.all(width_high >= width_low)
diff --git a/doubleml/irm/tests/test_irm_scalar_vs_irm.py b/doubleml/irm/tests/test_irm_scalar_vs_irm.py
index 196385e8..adf578cc 100644
--- a/doubleml/irm/tests/test_irm_scalar_vs_irm.py
+++ b/doubleml/irm/tests/test_irm_scalar_vs_irm.py
@@ -80,3 +80,34 @@ def test_all_se_equal(comparison_fixture):
     old = comparison_fixture["old"]
     new = comparison_fixture["new"]
     np.testing.assert_allclose(new.all_ses, old.all_se, rtol=1e-9)
+
+
+@pytest.mark.ci
+def test_sensitivity_sigma2_equal(comparison_fixture):
+    """IRM scalar sigma2 matches DoubleMLIRM sensitivity_elements['sigma2']."""
+    old = comparison_fixture["old"]
+    new = comparison_fixture["new"]
+    # Legacy shape: (1, n_rep, 1); scalar shape: (1, 1, n_rep). Transpose to align.
+    old_sigma2 = np.transpose(old.sensitivity_elements["sigma2"], (0, 2, 1))
+    np.testing.assert_allclose(new.sensitivity_elements["sigma2"], old_sigma2, rtol=1e-9)
+
+
+@pytest.mark.ci
+def test_sensitivity_nu2_equal(comparison_fixture):
+    """IRM scalar nu2 matches DoubleMLIRM sensitivity_elements['nu2']."""
+    old = comparison_fixture["old"]
+    new = comparison_fixture["new"]
+    old_nu2 = np.transpose(old.sensitivity_elements["nu2"], (0, 2, 1))
+    np.testing.assert_allclose(new.sensitivity_elements["nu2"], old_nu2, rtol=1e-9)
+
+
+@pytest.mark.ci
+def test_sensitivity_max_bias_equal(comparison_fixture):
+    """IRM scalar framework max_bias matches DoubleMLIRM framework max_bias."""
+    old = comparison_fixture["old"]
+    new = comparison_fixture["new"]
+    np.testing.assert_allclose(
+        new.framework.sensitivity_elements["max_bias"],
+        old.framework.sensitivity_elements["max_bias"],
+        rtol=1e-9,
+    )
diff --git a/doubleml/plm/plr_scalar.py b/doubleml/plm/plr_scalar.py
index 0e1c149e..9d2da5eb 100644
--- a/doubleml/plm/plr_scalar.py
+++ b/doubleml/plm/plr_scalar.py
@@ -392,3 +392,59 @@ def _get_score_elements(self) -> dict[str, np.ndarray]:
             psi_b = v_hat * (y[:, np.newaxis] - g_hat)
 
         return {"psi_a": psi_a, "psi_b": psi_b}
+
+    def _sensitivity_element_est(self) -> dict[str, np.ndarray] | None:
+        """
+        Compute PLR sensitivity elements vectorized over all repetitions.
+
+        Computes sigma2 (outcome residual variance), nu2 (inverse of treatment
+        residual variance), their influence functions, and the Riesz representer.
+        Handles both ``'partialling out'`` and ``'IV-type'`` scores.
+
+        Returns
+        -------
+        dict[str, np.ndarray] or None
+            Dictionary with keys ``'sigma2'``, ``'nu2'`` (shape ``(1, 1, n_rep)``),
+            ``'psi_sigma2'``, ``'psi_nu2'``, ``'riesz_rep'`` (shape ``(n_obs, 1, n_rep)``).
+            Returns ``None`` for callable scores (no standard Riesz representer).
+        """
+        if callable(self.score):
+            return None
+
+        y = self._dml_data.y  # (n_obs,)
+        d = self._dml_data.d  # (n_obs,)
+        m_hat = self._predictions["ml_m"]  # (n_obs, n_rep)
+        theta = self._all_thetas  # (1, n_rep) — broadcasts with (n_obs, n_rep)
+
+        treatment_residual = d[:, np.newaxis] - m_hat  # (n_obs, n_rep)
+
+        if self.score == "partialling out":
+            l_hat = self._predictions["ml_l"]  # (n_obs, n_rep)
+            sigma2_score = (y[:, np.newaxis] - l_hat - theta * treatment_residual) ** 2
+        else:  # "IV-type"
+            g_hat = self._predictions["ml_g"]  # (n_obs, n_rep)
+            sigma2_score = (y[:, np.newaxis] - g_hat - theta * d[:, np.newaxis]) ** 2
+
+        # sigma2: mean across observations, reshaped to (1, 1, n_rep)
+        sigma2_mean = np.mean(sigma2_score, axis=0)  # (n_rep,)
+        psi_sigma2 = sigma2_score - sigma2_mean[np.newaxis, :]  # (n_obs, n_rep)
+        sigma2 = sigma2_mean[np.newaxis, np.newaxis, :]  # (1, 1, n_rep)
+        psi_sigma2 = psi_sigma2[:, np.newaxis, :]  # (n_obs, 1, n_rep)
+
+        # nu2 = 1 / E[(d - m_hat)^2], reshaped to (1, 1, n_rep)
+        tr_sq_mean = np.mean(treatment_residual**2, axis=0)  # (n_rep,)
+        nu2_val = 1.0 / tr_sq_mean  # (n_rep,)
+        psi_nu2 = nu2_val[np.newaxis, :] - treatment_residual**2 * nu2_val[np.newaxis, :] ** 2  # (n_obs, n_rep)
+        nu2 = nu2_val[np.newaxis, np.newaxis, :]  # (1, 1, n_rep)
+        psi_nu2 = psi_nu2[:, np.newaxis, :]  # (n_obs, 1, n_rep)
+
+        # Riesz representer: (d - m_hat) * nu2
+        rr = (treatment_residual * nu2_val[np.newaxis, :])[:, np.newaxis, :]  # (n_obs, 1, n_rep)
+
+        return {
+            "sigma2": sigma2,
+            "nu2": nu2,
+            "psi_sigma2": psi_sigma2,
+            "psi_nu2": psi_nu2,
+            "riesz_rep": rr,
+        }
diff --git a/doubleml/plm/tests/test_plr_scalar_exceptions.py b/doubleml/plm/tests/test_plr_scalar_exceptions.py
index d49d1902..7d2a57b9 100644
--- a/doubleml/plm/tests/test_plr_scalar_exceptions.py
+++ b/doubleml/plm/tests/test_plr_scalar_exceptions.py
@@ -136,3 +136,69 @@ def test_plr_scalar_warning_binary_outcome_classifier():
     msg = r"The ml_l learner .+ was identified as classifier\. Fitting an additive probability model\."
     with pytest.warns(UserWarning, match=msg):
         dml_obj.set_learners(ml_l=LogisticRegression(), ml_m=Lasso())
+
+
+# ==================== sensitivity_analysis exceptions ====================
+
+
+@pytest.fixture(scope="module")
+def fitted_plr_for_sensitivity():
+    """Fitted PLR model for sensitivity exception tests."""
+    dml_obj = PLR(obj_dml_data)
+    dml_obj.set_learners(ml_l=ml_l, ml_m=ml_m)
+    dml_obj.fit(n_folds=3, n_rep=1)
+    return dml_obj
+
+
+@pytest.mark.ci
+def test_exception_sensitivity_before_fit():
+    """sensitivity_analysis() raises ValueError before fit()."""
+    dml_obj = PLR(obj_dml_data)
+    msg = r"The framework is not yet initialized"
+    with pytest.raises(ValueError, match=msg):
+        dml_obj.sensitivity_analysis()
+
+
+@pytest.mark.ci
+def test_exception_sensitivity_cf_y(fitted_plr_for_sensitivity):
+    """cf_y must be a float in [0,1)."""
+    with pytest.raises(TypeError, match=r"cf_y must be of float type"):
+        fitted_plr_for_sensitivity.sensitivity_analysis(cf_y=1)
+    with pytest.raises(ValueError, match=r"cf_y must be in \[0,1\)"):
+        fitted_plr_for_sensitivity.sensitivity_analysis(cf_y=1.0)
+
+
+@pytest.mark.ci
+def test_exception_sensitivity_cf_d(fitted_plr_for_sensitivity):
+    """cf_d must be a float in [0,1)."""
+    with pytest.raises(TypeError, match=r"cf_d must be of float type"):
+        fitted_plr_for_sensitivity.sensitivity_analysis(cf_d=1)
+    with pytest.raises(ValueError, match=r"cf_d must be in \[0,1\)"):
+        fitted_plr_for_sensitivity.sensitivity_analysis(cf_d=1.0)
+
+
+@pytest.mark.ci
+def test_exception_sensitivity_rho(fitted_plr_for_sensitivity):
+    """rho must be a float with |rho| <= 1."""
+    with pytest.raises(TypeError, match=r"rho must be of float type"):
+        fitted_plr_for_sensitivity.sensitivity_analysis(rho=1)
+    with pytest.raises(ValueError, match=r"The absolute value of rho must be in \[0,1\]"):
+        fitted_plr_for_sensitivity.sensitivity_analysis(rho=1.1)
+
+
+@pytest.mark.ci
+def test_exception_sensitivity_level(fitted_plr_for_sensitivity):
+    """level must be a float in (0,1)."""
+    with pytest.raises(TypeError, match=r"The confidence level must be of float type"):
+        fitted_plr_for_sensitivity.sensitivity_analysis(level=1)
+    with pytest.raises(ValueError, match=r"The confidence level must be in \(0,1\)"):
+        fitted_plr_for_sensitivity.sensitivity_analysis(level=0.0)
+
+
+@pytest.mark.ci
+def test_exception_sensitivity_null_hypothesis(fitted_plr_for_sensitivity):
+    """null_hypothesis with wrong shape raises ValueError."""
+    import numpy as np
+
+    with pytest.raises(ValueError, match=r"null_hypothesis"):
+        fitted_plr_for_sensitivity.sensitivity_analysis(null_hypothesis=np.array([0.0, 0.0]))
diff --git a/doubleml/plm/tests/test_plr_scalar_return_types.py b/doubleml/plm/tests/test_plr_scalar_return_types.py
index 39fe77e6..3771eeca 100644
--- a/doubleml/plm/tests/test_plr_scalar_return_types.py
+++ b/doubleml/plm/tests/test_plr_scalar_return_types.py
@@ -191,3 +191,78 @@ def test_reset_after_draw_sample_splitting():
         _ = dml_obj.coef
     with pytest.raises(ValueError, match="Predictions not available. Call fit"):
         _ = dml_obj.predictions
+
+
+@pytest.mark.ci
+def test_sensitivity_elements_type_and_shape(fitted_dml_obj):
+    """sensitivity_elements has correct keys, types, and shapes after fit."""
+    elems = fitted_dml_obj.sensitivity_elements
+    assert isinstance(elems, dict)
+    for key in ["sigma2", "nu2"]:
+        assert key in elems
+        assert isinstance(elems[key], np.ndarray)
+        assert elems[key].shape == (1, 1, N_REP)
+    for key in ["psi_sigma2", "psi_nu2", "riesz_rep"]:
+        assert key in elems
+        assert isinstance(elems[key], np.ndarray)
+        assert elems[key].shape == (N_OBS, 1, N_REP)
+
+
+@pytest.mark.ci
+def test_sensitivity_analysis_runs(fitted_dml_obj):
+    """sensitivity_analysis() completes without error and returns the framework."""
+    result = fitted_dml_obj.sensitivity_analysis(cf_y=0.03, cf_d=0.03, rho=1.0)
+    assert result is fitted_dml_obj.framework
+
+
+@pytest.mark.ci
+def test_sensitivity_before_fit_is_none():
+    """sensitivity_elements returns None before fit()."""
+    dml_obj = PLR(obj_dml_data)
+    assert dml_obj.sensitivity_elements is None
+
+
+@pytest.mark.ci
+def test_sensitivity_reset_after_draw_sample_splitting():
+    """sensitivity_elements resets to None after draw_sample_splitting()."""
+    np.random.seed(3141)
+    dml_obj = PLR(obj_dml_data)
+    dml_obj.set_learners(ml_l=LinearRegression(), ml_m=LinearRegression())
+    dml_obj.draw_sample_splitting(n_folds=N_FOLDS, n_rep=N_REP)
+    dml_obj.fit()
+    assert dml_obj.sensitivity_elements is not None
+    dml_obj.draw_sample_splitting(n_folds=N_FOLDS, n_rep=N_REP)
+    assert dml_obj.sensitivity_elements is None
+
+
+@pytest.mark.ci
+def test_sensitivity_params_structure(fitted_dml_obj):
+    """sensitivity_params has expected keys and finite rv/rva after sensitivity_analysis()."""
+    fitted_dml_obj.sensitivity_analysis(cf_y=0.03, cf_d=0.03)
+    params = fitted_dml_obj.framework.sensitivity_params
+    for key in ["theta", "se", "ci"]:
+        assert "lower" in params[key] and "upper" in params[key]
+    for key in ["rv", "rva"]:
+        assert np.all(np.isfinite(params[key]))
+        assert np.all(params[key] >= 0) and np.all(params[key] <= 1)
+
+
+@pytest.mark.ci
+def test_sensitivity_rho0_se_bounds(fitted_dml_obj):
+    """With rho=0, se lower and upper bounds equal the unadjusted se."""
+    fitted_dml_obj.sensitivity_analysis(cf_y=0.03, cf_d=0.03, rho=0.0)
+    params = fitted_dml_obj.framework.sensitivity_params
+    np.testing.assert_allclose(params["se"]["lower"], fitted_dml_obj.se, rtol=1e-6)
+    np.testing.assert_allclose(params["se"]["upper"], fitted_dml_obj.se, rtol=1e-6)
+
+
+@pytest.mark.ci
+def test_sensitivity_monotonicity_cf_y(fitted_dml_obj):
+    """Increasing cf_y widens the theta sensitivity bounds."""
+    fitted_dml_obj.sensitivity_analysis(cf_y=0.03, cf_d=0.03, rho=1.0)
+    params_low = fitted_dml_obj.framework.sensitivity_params
+    width_low = params_low["theta"]["upper"] - params_low["theta"]["lower"]
+    fitted_dml_obj.sensitivity_analysis(cf_y=0.15, cf_d=0.03, rho=1.0)
+    params_high = fitted_dml_obj.framework.sensitivity_params
+    width_high = params_high["theta"]["upper"] - params_high["theta"]["lower"]
+    assert np.all(width_high >= width_low)
diff --git a/doubleml/plm/tests/test_plr_scalar_sensitivity.py b/doubleml/plm/tests/test_plr_scalar_sensitivity.py
new file mode 100644
index 00000000..2b358757
--- /dev/null
+++ b/doubleml/plm/tests/test_plr_scalar_sensitivity.py
@@ -0,0 +1,81 @@
+"""Score-parametrized sensitivity analysis tests for PLR scalar models."""
+
+import numpy as np
+import pytest
+from sklearn.linear_model import LinearRegression
+
+from doubleml.plm.datasets import make_plr_CCDDHNR2018
+from doubleml.plm.plr_scalar import PLR
+
+N_OBS = 500
+N_FOLDS = 5
+N_REP = 2
+
+
+@pytest.fixture(scope="module")
+def plr_data():
+    """Shared PLR dataset."""
+    np.random.seed(3141)
+    return make_plr_CCDDHNR2018(n_obs=N_OBS, dim_x=5)
+
+
+@pytest.fixture(scope="module", params=["partialling out", "IV-type"])
+def fitted_plr(request, plr_data):
+    """Fitted PLR model parametrized over both score variants."""
+    dml_obj = PLR(plr_data, score=request.param)
+    dml_obj.set_learners(ml_l=LinearRegression(), ml_m=LinearRegression())
+    dml_obj.fit(n_folds=N_FOLDS, n_rep=N_REP)
+    return dml_obj
+
+
+@pytest.mark.ci
+def test_sensitivity_elements_positive(fitted_plr):
+    """sigma2 >= 0, nu2 > 0, and max_bias >= 0 for each score variant."""
+    elems = fitted_plr.sensitivity_elements
+    assert np.all(elems["sigma2"] >= 0)
+    assert np.all(elems["nu2"] > 0)
+    assert np.all(fitted_plr.framework.sensitivity_elements["max_bias"] >= 0)
+
+
+@pytest.mark.ci
+def test_sensitivity_params_structure(fitted_plr):
+    """After sensitivity_analysis(), theta/se/ci have lower/upper; rv/rva in [0,1]."""
+    fitted_plr.sensitivity_analysis(cf_y=0.03, cf_d=0.03, rho=1.0)
+    params = fitted_plr.framework.sensitivity_params
+    for key in ["theta", "se", "ci"]:
+        assert "lower" in params[key] and "upper" in params[key]
+    for key in ["rv", "rva"]:
+        assert np.all(np.isfinite(params[key]))
+        assert np.all(params[key] >= 0) and np.all(params[key] <= 1)
+
+
+@pytest.mark.ci
+def test_sensitivity_params_bounds_ordered(fitted_plr):
+    """theta lower bound <= estimated coef <= theta upper bound."""
+    fitted_plr.sensitivity_analysis(cf_y=0.03, cf_d=0.03, rho=1.0)
+    params = fitted_plr.framework.sensitivity_params
+    assert np.all(params["theta"]["lower"] <= fitted_plr.coef)
+    assert np.all(fitted_plr.coef <= params["theta"]["upper"])
+
+
+@pytest.mark.ci
+def test_sensitivity_rho0(fitted_plr):
+    """With rho=0, se lower and upper bounds equal the unadjusted se."""
+    fitted_plr.sensitivity_analysis(cf_y=0.03, cf_d=0.03, rho=0.0)
+    params = fitted_plr.framework.sensitivity_params
+    np.testing.assert_allclose(params["se"]["lower"], fitted_plr.se, rtol=1e-6)
+    np.testing.assert_allclose(params["se"]["upper"], fitted_plr.se, rtol=1e-6)
+
+
+@pytest.mark.ci
+def test_sensitivity_monotonicity_cf_y(fitted_plr):
+    """Increasing cf_y produces wider theta sensitivity bounds."""
+    fitted_plr.sensitivity_analysis(cf_y=0.03, cf_d=0.03, rho=1.0)
+    width_low = (
+        fitted_plr.framework.sensitivity_params["theta"]["upper"] - fitted_plr.framework.sensitivity_params["theta"]["lower"]
+    )
+    fitted_plr.sensitivity_analysis(cf_y=0.15, cf_d=0.03, rho=1.0)
+    width_high = (
+        fitted_plr.framework.sensitivity_params["theta"]["upper"] - fitted_plr.framework.sensitivity_params["theta"]["lower"]
+    )
+    assert np.all(width_high >= width_low)
diff --git a/doubleml/plm/tests/test_plr_scalar_vs_plr.py b/doubleml/plm/tests/test_plr_scalar_vs_plr.py
index 15453c12..713acb6a 100644
--- a/doubleml/plm/tests/test_plr_scalar_vs_plr.py
+++ b/doubleml/plm/tests/test_plr_scalar_vs_plr.py
@@ -81,3 +81,34 @@ def test_all_se_equal(comparison_fixture):
     old = comparison_fixture["old"]
     new = comparison_fixture["new"]
     np.testing.assert_allclose(new.all_ses, old.all_se, rtol=1e-9)
+
+
+@pytest.mark.ci
+def test_sensitivity_sigma2_equal(comparison_fixture):
+    """PLR scalar sigma2 matches DoubleMLPLR sensitivity_elements['sigma2']."""
+    old = comparison_fixture["old"]
+    new = comparison_fixture["new"]
+    # Legacy shape: (1, n_rep, 1); scalar shape: (1, 1, n_rep). Transpose to align.
+    old_sigma2 = np.transpose(old.sensitivity_elements["sigma2"], (0, 2, 1))
+    np.testing.assert_allclose(new.sensitivity_elements["sigma2"], old_sigma2, rtol=1e-9)
+
+
+@pytest.mark.ci
+def test_sensitivity_nu2_equal(comparison_fixture):
+    """PLR scalar nu2 matches DoubleMLPLR sensitivity_elements['nu2']."""
+    old = comparison_fixture["old"]
+    new = comparison_fixture["new"]
+    old_nu2 = np.transpose(old.sensitivity_elements["nu2"], (0, 2, 1))
+    np.testing.assert_allclose(new.sensitivity_elements["nu2"], old_nu2, rtol=1e-9)
+
+
+@pytest.mark.ci
+def test_sensitivity_max_bias_equal(comparison_fixture):
+    """PLR scalar framework max_bias matches DoubleMLPLR framework max_bias."""
+    old = comparison_fixture["old"]
+    new = comparison_fixture["new"]
+    np.testing.assert_allclose(
+        new.framework.sensitivity_elements["max_bias"],
+        old.framework.sensitivity_elements["max_bias"],
+        rtol=1e-9,
+    )

From e980ccae37a03296665a913e53754a22581bd044 Mon Sep 17 00:00:00 2001
From: SvenKlaassen <sven.klaassen@uni-hamburg.de>
Date: Sun, 1 Mar 2026 21:23:31 +0100
Subject: [PATCH 22/38] add first dml vector class

---
 doubleml/double_ml_vector.py | 751 +++++++++++++++++++++++++++++++++++
 1 file changed, 751 insertions(+)
 create mode 100644 doubleml/double_ml_vector.py

diff --git a/doubleml/double_ml_vector.py b/doubleml/double_ml_vector.py
new file mode 100644
index 00000000..f6f1e376
--- /dev/null
+++ b/doubleml/double_ml_vector.py
@@ -0,0 +1,751 @@
+"""Abstract base class for multi-treatment DoubleML models (parameter vector estimation)."""
+
+from __future__ import annotations
+
+import copy
+from abc import ABC, abstractmethod
+from typing import TYPE_CHECKING, Any, Self
+
+if TYPE_CHECKING:
+    from .utils._tune_optuna import DMLOptunaResult
+
+import numpy as np
+import pandas as pd
+from joblib import Parallel, delayed
+
+from .data.base_data import DoubleMLData
+from .double_ml_base import DoubleMLBase
+from .double_ml_framework import concat
+from .double_ml_scalar import DoubleMLScalar
+from .utils._checks import _check_sample_splitting
+from .utils._tune_optuna import TUNE_ML_MODELS_DOC
+from .utils.gain_statistics import gain_statistics
+from .utils.resampling import DoubleMLResampling
+
+
+class DoubleMLVector(DoubleMLBase, ABC):
+    """
+    Abstract base class for multi-treatment DoubleML models.
+
+    Orchestrates multiple :class:`~doubleml.DoubleMLScalar` instances — one per
+    treatment column in ``d_cols`` — sharing a single set of sample splits and
+    concatenating their :class:`~doubleml.DoubleMLFramework` objects into one
+    unified result.
+
+    This class is intentionally general: by overriding :meth:`_initialize_models`
+    (and optionally :meth:`_get_data_for_model`), concrete subclasses can cover
+    any scenario where multiple scalar models must be fitted and combined:
+
+    * **Multiple treatment columns** (e.g., ``DoubleMLPLRVector``): each sub-model
+      receives a single-column data view created by :meth:`_get_data_for_model`.
+    * **Multiple treatment levels** (e.g., a future ``DoubleMLAPOSVector``): all
+      sub-models share the same data; each scalar carries its own ``treatment_level``
+      parameter.  Override :meth:`_get_data_for_model` to return ``self._dml_data``
+      unchanged, or bypass it entirely inside :meth:`_initialize_models`.
+
+    Parameters
+    ----------
+    obj_dml_data : DoubleMLData
+        The data object for the double machine learning model.
+    score : str, optional
+        The score function to use. Default is ``'default'``.
+
+    Attributes
+    ----------
+    n_folds : int
+        Number of cross-fitting folds.
+    n_rep : int
+        Number of sample-splitting repetitions.
+    score : str
+        The score function being used.
+    modellist : list of DoubleMLScalar
+        The scalar sub-models, one per treatment column (or model key).
+    """
+
+    def __init__(
+        self,
+        obj_dml_data: DoubleMLData,
+        score: str = "default",
+    ) -> None:
+        super().__init__(obj_dml_data)
+        self._dml_data: DoubleMLData = obj_dml_data  # narrow for attribute access
+        self._score = score
+
+        # Sample-splitting state
+        self._n_folds: int | None = None
+        self._n_folds_per_cluster: int | None = None
+        self._n_rep: int | None = None
+        self._smpls: list | None = None
+        self._smpls_cluster: list | None = None
+
+        # Sub-model list — populated by subclass via _initialize_models()
+        self._modellist: list[DoubleMLScalar] | None = None
+
+    # ==================== Properties ====================
+
+    @property
+    def n_rep(self) -> int:
+        """
+        Number of repetitions for sample splitting.
+
+        Returns
+        -------
+        int
+            Number of repetitions.
+
+        Raises
+        ------
+        ValueError
+            If sample splitting has not been drawn yet.
+        """
+        if self._n_rep is None:
+            raise ValueError("n_rep not set. Call draw_sample_splitting() first.")
+        return self._n_rep
+
+    @property
+    def n_folds(self) -> int:
+        """
+        Number of folds for cross-fitting.
+
+        Returns
+        -------
+        int
+            Number of folds.
+
+        Raises
+        ------
+        ValueError
+            If sample splitting has not been drawn yet.
+        """
+        if self._n_folds is None:
+            raise ValueError("n_folds not set. Call draw_sample_splitting() first.")
+        return self._n_folds
+
+    @property
+    def score(self) -> str:
+        """
+        The score function being used.
+
+        Returns
+        -------
+        str
+            Score function name.
+        """
+        return self._score
+
+    @property
+    def smpls(self) -> list:
+        """
+        Sample-splitting indices used for cross-fitting.
+
+        Returns
+        -------
+        list
+            List of sample-splitting indices for each repetition.
+
+        Raises
+        ------
+        ValueError
+            If sample splitting has not been drawn yet.
+        """
+        if self._smpls is None:
+            raise ValueError("Sample splitting has not been performed. Call draw_sample_splitting() first.")
+        return self._smpls
+
+    @property
+    def modellist(self) -> list[DoubleMLScalar] | None:
+        """
+        The scalar sub-models in the same order as ``d_cols``.
+
+        Returns
+        -------
+        list of DoubleMLScalar or None
+            ``None`` before :meth:`_initialize_models` has been called by the subclass.
+        """
+        return self._modellist
+
+    @property
+    def n_rep_boot(self) -> int | None:
+        """
+        The number of bootstrap replications, or ``None`` if not bootstrapped.
+
+        Returns
+        -------
+        int or None
+        """
+        return None if self._framework is None else self._framework.n_rep_boot
+
+    @property
+    def boot_method(self) -> str | None:
+        """
+        The bootstrap method used, or ``None`` if not bootstrapped.
+
+        Returns
+        -------
+        str or None
+        """
+        return None if self._framework is None else self._framework.boot_method
+
+    @property
+    def boot_t_stat(self) -> np.ndarray | None:
+        """
+        Bootstrapped t-statistics, or ``None`` if not bootstrapped.
+
+        Returns
+        -------
+        np.ndarray or None
+        """
+        return None if self._framework is None else self._framework.boot_t_stat
+
+    @property
+    def sensitivity_elements(self) -> dict[str, np.ndarray] | None:
+        """
+        Raw sensitivity elements after :meth:`fit`, or ``None`` if unavailable.
+
+        Returns
+        -------
+        dict or None
+        """
+        return None if self._framework is None else self._framework.sensitivity_elements
+
+    @property
+    def sensitivity_params(self) -> dict | None:
+        """
+        Sensitivity analysis parameters after :meth:`sensitivity_analysis`,
+        or ``None`` if not yet computed.
+
+        Returns
+        -------
+        dict or None
+        """
+        return None if self._framework is None else self._framework.sensitivity_params
+
+    @property
+    def sensitivity_summary(self) -> str:
+        """
+        Summary for the sensitivity analysis after :meth:`sensitivity_analysis`.
+
+        Returns
+        -------
+        str
+
+        Raises
+        ------
+        ValueError
+            If :meth:`fit` has not been called yet.
+        """
+        if self._framework is None:
+            raise ValueError("Apply fit() before accessing sensitivity_summary.")
+        return self._framework.sensitivity_summary
+
+    # ==================== Abstract Methods ====================
+
+    @property
+    @abstractmethod
+    def required_learners(self) -> list[str]:
+        """
+        Names of the required learners for the current configuration.
+
+        Returns
+        -------
+        list of str
+            Ordered list of required learner names.
+        """
+
+    @abstractmethod
+    def set_learners(self, **kwargs: object) -> Self:
+        """
+        Set the learners for nuisance estimation on all sub-models.
+
+        Subclasses must implement this method with explicit keyword arguments
+        matching their model's learners (e.g., ``ml_l``, ``ml_m`` for PLR).
+        The same learners (cloned per sub-model) are applied to every treatment.
+
+        Parameters
+        ----------
+        **kwargs
+            Learner keyword arguments specific to the subclass.
+
+        Returns
+        -------
+        self : Self
+        """
+
+    @abstractmethod
+    def _initialize_models(self) -> list[DoubleMLScalar]:
+        """
+        Create and return one scalar sub-model per treatment column.
+
+        Called once during ``__init__`` of concrete subclasses.  Use
+        :meth:`_get_data_for_model` to obtain a single-treatment data view for
+        each ``d_col``, or bypass it for scenarios where all sub-models share the
+        same data (e.g., APOS-like treatment-level orchestration).
+
+        Returns
+        -------
+        list of DoubleMLScalar
+            One configured scalar model per element of ``self._dml_data.d_cols``.
+        """
+
+    # ==================== Protected Helpers ====================
+
+    def _get_data_for_model(self, d_col: str) -> DoubleMLData:
+        """
+        Return a single-treatment :class:`~doubleml.data.DoubleMLData` for ``d_col``.
+
+        Creates a new :class:`~doubleml.data.DoubleMLData` that **shares the
+        underlying DataFrame** (zero additional memory for array data). Other
+        treatment columns are appended to ``x_cols`` so that the
+        :class:`DoubleMLScalar` single-treatment check passes.
+
+        Override in subclasses for non-d_col scenarios. For example, an APOS-like
+        class would override this to return ``self._dml_data`` unchanged (each APO
+        scalar stores its treatment level internally).
+
+        Parameters
+        ----------
+        d_col : str
+            The treatment column to make active.
+
+        Returns
+        -------
+        DoubleMLData
+            A :class:`~doubleml.data.DoubleMLData` with ``d_cols=[d_col]``
+            and all other treatment columns added to ``x_cols``.
+        """
+        other_d_cols = [c for c in self._dml_data.d_cols if c != d_col]
+        x_cols = list(self._dml_data.x_cols) + other_d_cols
+
+        return DoubleMLData(
+            data=self._dml_data.data,  # Shared DataFrame — zero copy overhead
+            y_col=self._dml_data.y_col,
+            d_cols=d_col,
+            x_cols=x_cols,
+            z_cols=self._dml_data.z_cols,
+            cluster_cols=self._dml_data.cluster_cols,
+            use_other_treat_as_covariate=False,  # Already handled above
+            force_all_x_finite=self._dml_data.force_all_x_finite,
+            force_all_d_finite=self._dml_data.force_all_d_finite,
+        )
+
+    def _reset_fit_state(self) -> None:
+        """Clear fit-dependent state when sample splitting changes."""
+        self._framework = None
+        if self._modellist is not None:
+            for model in self._modellist:
+                model._reset_fit_state()
+
+    def _propagate_splits_to_models(self) -> None:
+        """Push the vector's sample splits into each sub-model."""
+        if self._modellist is None:
+            raise ValueError("Sub-models are not initialized. Call _initialize_models() in the subclass __init__.")
+        for model in self._modellist:
+            model._smpls = self._smpls
+            model._smpls_cluster = self._smpls_cluster
+            model._n_folds = self._n_folds
+            model._n_folds_per_cluster = self._n_folds_per_cluster
+            model._n_rep = self._n_rep
+
+    def _fit_single_model(
+        self,
+        i_d: int,
+        n_jobs_cv: int | None,
+        ext_preds: dict[str, np.ndarray] | None,
+    ) -> DoubleMLScalar:
+        """Fit nuisance models and estimate causal parameters for one sub-model."""
+        if self._modellist is None:
+            raise ValueError("Sub-models are not initialized.")
+        model = self._modellist[i_d]
+        model.fit(n_jobs_cv=n_jobs_cv, external_predictions=ext_preds)
+        return model
+
+    # ==================== Sample Splitting ====================
+
+    def draw_sample_splitting(self, n_folds: int = 5, n_rep: int = 1) -> Self:
+        """
+        Draw sample splitting for cross-fitting.
+
+        Splits are drawn once for the vector and shared across all sub-models via
+        :meth:`_propagate_splits_to_models`.
+
+        Parameters
+        ----------
+        n_folds : int, optional
+            Number of folds. Default is ``5``.
+        n_rep : int, optional
+            Number of repetitions. Default is ``1``.
+
+        Returns
+        -------
+        self : Self
+
+        Raises
+        ------
+        ValueError
+            If ``n_folds`` is not an integer >= 2 or ``n_rep`` is not an integer >= 1.
+        """
+        if not isinstance(n_folds, int) or n_folds < 2:
+            raise ValueError(f"n_folds must be an integer >= 2. Got {n_folds}.")
+        if not isinstance(n_rep, int) or n_rep < 1:
+            raise ValueError(f"n_rep must be an integer >= 1. Got {n_rep}.")
+
+        resampler = DoubleMLResampling(
+            n_folds=n_folds,
+            n_rep=n_rep,
+            n_obs=self._n_obs,
+        )
+        self._smpls = resampler.split_samples()
+        self._smpls_cluster = None
+        self._n_folds = n_folds
+        self._n_folds_per_cluster = None
+        self._n_rep = n_rep
+
+        self._reset_fit_state()
+        return self
+
+    def set_sample_splitting(self, all_smpls: list, all_smpls_cluster: list | None = None) -> Self:
+        """
+        Set pre-computed sample splitting for all sub-models.
+
+        Parameters
+        ----------
+        all_smpls : list
+            List of ``(train_ind, test_ind)`` tuples per fold, or a list of such
+            lists for repeated sample splitting.
+        all_smpls_cluster : list or None, optional
+            Nested list for cluster sample splitting. Default is ``None``.
+
+        Returns
+        -------
+        self : Self
+
+        Raises
+        ------
+        TypeError
+            If ``all_smpls`` is not a list.
+        ValueError
+            If the partition is invalid.
+        """
+        if isinstance(all_smpls, tuple):
+            raise TypeError("all_smpls must be a list of folds; tuple shorthand is not supported for DoubleMLVector.")
+        if not isinstance(all_smpls, list):
+            raise TypeError(f"all_smpls must be of list type. " f"{str(all_smpls)} of type {str(type(all_smpls))} was passed.")
+
+        smpls, smpls_cluster, n_rep, n_folds = _check_sample_splitting(
+            all_smpls,
+            all_smpls_cluster,
+            self._dml_data,
+            self._dml_data.is_cluster_data,
+            n_obs=self._n_obs,
+        )
+
+        self._smpls = smpls
+        self._smpls_cluster = smpls_cluster
+        self._n_rep = n_rep
+        self._n_folds = n_folds
+        self._n_folds_per_cluster = None
+
+        self._reset_fit_state()
+        return self
+
+    # ==================== Fit ====================
+
+    def fit(
+        self,
+        n_folds: int = 5,
+        n_rep: int = 1,
+        n_jobs_models: int | None = None,
+        n_jobs_cv: int | None = None,
+        external_predictions: dict[str, dict[str, np.ndarray]] | None = None,
+        **kwargs: Any,
+    ) -> Self:
+        """
+        Estimate all sub-models and combine their results.
+
+        Calls :meth:`draw_sample_splitting` (if not yet done), fits each scalar
+        sub-model (optionally in parallel via joblib), and concatenates their
+        :class:`~doubleml.DoubleMLFramework` objects into one unified result.
+
+        Parameters
+        ----------
+        n_folds : int, optional
+            Number of cross-fitting folds. Default is ``5``.
+            Only used if sample splitting has not been drawn yet.
+        n_rep : int, optional
+            Number of repetitions. Default is ``1``.
+            Only used if sample splitting has not been drawn yet.
+        n_jobs_models : int or None, optional
+            Number of jobs for parallel sub-model fitting. ``None`` means
+            sequential. Default is ``None``.
+        n_jobs_cv : int or None, optional
+            Number of jobs for cross-validation inside each sub-model.
+            Default is ``None``.
+        external_predictions : dict or None, optional
+            Nested dictionary keyed by treatment column name. Each value is a dict
+            of external predictions passed to the corresponding sub-model's
+            :meth:`~doubleml.DoubleMLScalar.fit_nuisance_models`.
+            Default is ``None``.
+
+        Returns
+        -------
+        self : Self
+        """
+        if self._smpls is None:
+            self.draw_sample_splitting(n_folds=n_folds, n_rep=n_rep)
+
+        self._propagate_splits_to_models()
+
+        fitted_models = Parallel(n_jobs=n_jobs_models, verbose=0, pre_dispatch="2*n_jobs")(
+            delayed(self._fit_single_model)(
+                i_d,
+                n_jobs_cv,
+                external_predictions.get(d_col) if external_predictions is not None else None,
+            )
+            for i_d, d_col in enumerate(self._dml_data.d_cols)
+        )
+
+        self._modellist = list(fitted_models)
+
+        # Concatenate scalar frameworks into one unified multi-treatment framework
+        self._framework = concat([m.framework for m in self._modellist])
+        self._framework.treatment_names = list(self._dml_data.d_cols)
+
+        return self
+
+    # ==================== Learner Access ====================
+
+    def get_params(self, learner_name: str) -> list[dict]:
+        """
+        Get parameters of a learner across all sub-models.
+
+        Parameters
+        ----------
+        learner_name : str
+            Name of the learner.
+
+        Returns
+        -------
+        list of dict
+            One parameter dict per sub-model, in ``d_cols`` order.
+        """
+        if self._modellist is None:
+            raise ValueError("Sub-models are not initialized. Call _initialize_models() in the subclass __init__.")
+        return [model.get_params(learner_name) for model in self._modellist]
+
+    def set_params(self, learner_name: str, **params: object) -> Self:
+        """
+        Set parameters of a learner on all sub-models.
+
+        Parameters
+        ----------
+        learner_name : str
+            Name of the learner.
+        **params
+            Parameters to set on each sub-model's learner.
+
+        Returns
+        -------
+        self : Self
+        """
+        if self._modellist is None:
+            raise ValueError("Sub-models are not initialized. Call _initialize_models() in the subclass __init__.")
+        for model in self._modellist:
+            model.set_params(learner_name, **params)
+        return self
+
+    # ==================== Hyperparameter Tuning ====================
+
+    def tune_ml_models(
+        self,
+        ml_param_space: dict,
+        scoring_methods: dict | None = None,
+        cv: int = 5,
+        set_as_params: bool = True,
+        return_tune_res: bool = False,
+        optuna_settings: dict | None = None,
+    ) -> "Self | list[dict[str, DMLOptunaResult]]":
+        """Hyperparameter-tuning for DoubleML models using Optuna."""
+        if self._modellist is None:
+            raise ValueError("Sub-models are not initialized. Call _initialize_models() in the subclass __init__.")
+        tuning_kwargs: dict[str, Any] = {
+            "ml_param_space": ml_param_space,
+            "scoring_methods": scoring_methods,
+            "cv": cv,
+            "set_as_params": set_as_params,
+            "return_tune_res": return_tune_res,
+            "optuna_settings": optuna_settings,
+        }
+
+        tune_res: list = []
+        for model in self._modellist:
+            res = model.tune_ml_models(**tuning_kwargs)
+            if return_tune_res:
+                tune_res.append(res)
+
+        return tune_res if return_tune_res else self
+
+    tune_ml_models.__doc__ = TUNE_ML_MODELS_DOC
+
+    # ==================== Sensitivity ====================
+
+    def sensitivity_plot(
+        self,
+        idx_treatment: int = 0,
+        value: str = "theta",
+        rho: float = 1.0,
+        level: float = 0.95,
+        null_hypothesis: float = 0.0,
+        include_scenario: bool = True,
+        benchmarks: dict | None = None,
+        fill: bool = True,
+        grid_bounds: tuple[float, float] = (0.15, 0.15),
+        grid_size: int = 100,
+    ) -> object:
+        """
+        Contour plot of the sensitivity with respect to latent/confounding variables.
+
+        Parameters
+        ----------
+        idx_treatment : int, optional
+            Index of the treatment parameter to plot. Default is ``0``.
+        value : str, optional
+            Contour value: ``'theta'`` for bounds, ``'ci'`` for bounds including
+            statistical uncertainty. Default is ``'theta'``.
+        rho : float, optional
+            Correlation between confounders in the main regression and Riesz
+            representer. Default is ``1.0``.
+        level : float, optional
+            The confidence level. Default is ``0.95``.
+        null_hypothesis : float, optional
+            Null hypothesis for the direction of contour lines. Default is ``0.0``.
+        include_scenario : bool, optional
+            Whether to highlight the last :meth:`sensitivity_analysis` scenario.
+            Default is ``True``.
+        benchmarks : dict or None, optional
+            Benchmark dictionary with keys ``'cf_y'``, ``'cf_d'``, ``'name'``.
+            Default is ``None``.
+        fill : bool, optional
+            Heatmap style (``True``) vs. contour lines only (``False``).
+            Default is ``True``.
+        grid_bounds : tuple of float, optional
+            Evaluation bounds ``(cf_d_max, cf_y_max)`` in ``[0, 1)``.
+            Default is ``(0.15, 0.15)``.
+        grid_size : int, optional
+            Number of grid evaluation points. Default is ``100``.
+
+        Returns
+        -------
+        fig : plotly figure
+            Plotly figure of the sensitivity contours.
+
+        Raises
+        ------
+        ValueError
+            If :meth:`fit` has not been called yet.
+        """
+        if self._framework is None:
+            raise ValueError("Apply fit() before sensitivity_plot().")
+        return self._framework.sensitivity_plot(
+            idx_treatment=idx_treatment,
+            value=value,
+            rho=rho,
+            level=level,
+            null_hypothesis=null_hypothesis,
+            include_scenario=include_scenario,
+            benchmarks=benchmarks,
+            fill=fill,
+            grid_bounds=grid_bounds,
+            grid_size=grid_size,
+        )
+
+    def sensitivity_benchmark(self, benchmarking_set: list[str], fit_args: dict | None = None) -> pd.DataFrame:
+        """
+        Compute a benchmark for a given set of features.
+
+        Refits a short-form model excluding ``benchmarking_set`` from ``x_cols``
+        and computes gain statistics comparing long and short forms.
+
+        Parameters
+        ----------
+        benchmarking_set : list of str
+            Feature names to benchmark. Must be a non-empty subset of ``x_cols``.
+        fit_args : dict or None, optional
+            Additional keyword arguments passed to :meth:`fit` when refitting the
+            short-form model. Default is ``None``.
+
+        Returns
+        -------
+        pd.DataFrame
+            Benchmark results indexed by treatment column names with columns
+            ``'cf_y'``, ``'cf_d'``, ``'rho'``, and ``'delta_theta'``.
+
+        Raises
+        ------
+        NotImplementedError
+            If sensitivity analysis is not available for this model.
+        TypeError
+            If ``benchmarking_set`` or ``fit_args`` have the wrong type.
+        ValueError
+            If :meth:`fit` has not been called, or ``benchmarking_set`` is empty or not a subset of ``x_cols``.
+        """
+        if self._framework is None:
+            raise ValueError("Apply fit() before sensitivity_benchmark().")
+
+        x_list_long = self._dml_data.x_cols
+
+        if self.sensitivity_elements is None:
+            raise NotImplementedError(f"Sensitivity analysis not yet implemented for {self.__class__.__name__}.")
+        if not isinstance(benchmarking_set, list):
+            raise TypeError(
+                f"benchmarking_set must be a list. " f"{str(benchmarking_set)} of type {type(benchmarking_set)} was passed."
+            )
+        if len(benchmarking_set) == 0:
+            raise ValueError("benchmarking_set must not be empty.")
+        if not set(benchmarking_set) <= set(x_list_long):
+            raise ValueError(
+                f"benchmarking_set must be a subset of features {str(self._dml_data.x_cols)}. "
+                f"{str(benchmarking_set)} was passed."
+            )
+        if fit_args is not None and not isinstance(fit_args, dict):
+            raise TypeError(f"fit_args must be a dict. {str(fit_args)} of type {type(fit_args)} was passed.")
+
+        x_list_short = [x for x in x_list_long if x not in benchmarking_set]
+        dml_short = copy.deepcopy(self)
+        dml_short._dml_data.x_cols = x_list_short
+        # Sub-models each hold their own DoubleMLData — rebuild them from the updated _dml_data
+        # so that the short-form model actually uses the reduced feature set.
+        dml_short._modellist = dml_short._initialize_models()
+        dml_short._framework = None  # drop the fitted framework carried over by the deep copy
+
+        if fit_args is not None:
+            dml_short.fit(**fit_args)
+        else:
+            dml_short.fit()
+
+        benchmark_dict = gain_statistics(dml_long=self, dml_short=dml_short)
+        df_benchmark = pd.DataFrame(benchmark_dict, index=self._dml_data.d_cols)
+        return df_benchmark
+
+    # ==================== String Representation ====================
+
+    def __str__(self) -> str:
+        """
+        Build a text summary: class-name header, score, resampling and treatments.
+
+        Returns
+        -------
+        str
+            The header and model info, followed by ``summary`` once fitted, else a hint to call fit().
+        """
+        class_name = self.__class__.__name__
+        header = f"{'=' * 20} {class_name} Object {'=' * 20}"
+
+        info = f"Score function: {self.score}\n"
+        if self._n_folds is not None:
+            info += f"Resampling: {self._n_folds}-fold CV, {self._n_rep} repetitions\n"
+        info += f"Treatments: {list(self._dml_data.d_cols)}\n"
+
+        if self._framework is not None:
+            return f"{header}\n\n{info}\n{str(self.summary)}"
+        else:
+            return f"{header}\n\n{info}\nModel not yet fitted. Call fit() first."

From 17cf8f30b93907a9af90d5110a3c3730b6f503e5 Mon Sep 17 00:00:00 2001
From: SvenKlaassen <sven.klaassen@uni-hamburg.de>
Date: Wed, 25 Mar 2026 08:03:24 +0100
Subject: [PATCH 23/38] Add branch status and TODOs documentation for
 sk-refactoring

---
 .claude/CLAUDE.md |  2 ++
 .claude/STATUS.md | 74 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 76 insertions(+)
 create mode 100644 .claude/STATUS.md

diff --git a/.claude/CLAUDE.md b/.claude/CLAUDE.md
index 0dc51dca..69f4d98b 100644
--- a/.claude/CLAUDE.md
+++ b/.claude/CLAUDE.md
@@ -8,6 +8,8 @@ DoubleML is a Python package implementing Double/Debiased Machine Learning (DML)
 
 **Docs**: https://docs.doubleml.org | **Source**: https://github.com/DoubleML/doubleml-for-py
 
+**Branch status & TODOs**: `.claude/STATUS.md`
+
 ## Architecture
 
 ### Class Hierarchy
diff --git a/.claude/STATUS.md b/.claude/STATUS.md
new file mode 100644
index 00000000..ac0d5881
--- /dev/null
+++ b/.claude/STATUS.md
@@ -0,0 +1,74 @@
+# Branch Status & TODOs
+
+> Tracked in git so it syncs across machines. Update this file as work progresses.
+> Reference: `.claude/CLAUDE.md` points to this file from its "Branch status & TODOs" line.
+
+---
+
+## Branch: `sk-refactoring`
+
+**Goal**: Introduce a new `DoubleMLScalar` / `DoubleMLVector` hierarchy alongside
+the existing `DoubleML` API — cleaner design, better testability, explicit tuning,
+nuisance evaluation, and sensitivity analysis.
+
+### Completed
+
+- [x] **Claude tooling** — `.claude/` dir, `CLAUDE.md`, `rules/`, `agents/`, `skills/`
+- [x] **Architecture docs** — `doc/diagrams/architecture.md`, `doc/diagrams/testing_structure.md`
+- [x] **`DoubleMLBase`** — abstract base with shared properties (`coef`, `se`, `summary`) and inference delegation (`doubleml/double_ml_base.py`)
+- [x] **`LinearScoreMixin`** — closed-form θ = −E[ψ_b]/E[ψ_a] solver (`doubleml/double_ml_linear_score.py`)
+- [x] **`DoubleMLScalar`** — single-parameter orchestrator (`doubleml/double_ml_scalar.py`) with:
+  - `fit()` → `draw_sample_splitting()` + `fit_nuisance_models()` + `estimate_causal_parameters()`
+  - `tune_ml_models()` via Optuna (`_LEARNER_PARAM_ALIASES`, `_get_tuning_data()` hook)
+  - `nuisance_targets`, `nuisance_loss`, `evaluate_learners()`
+  - `_sensitivity_element_est()` hook + full sensitivity analysis pipeline
+- [x] **`DoubleMLPLRScalar`** — PLR scalar (`doubleml/plm/plr_scalar.py`) with its test files (base file plus 7 suffixed variants):
+  - `test_plr_scalar.py`, `_return_types`, `_exceptions`, `_vs_plr`, `_external_predictions`, `_tune_ml_models`, `_evaluate_learners`, `_sensitivity`
+- [x] **`DoubleMLIRMScalar`** — IRM scalar (`doubleml/irm/irm_scalar.py`) with all 7 test files (same structure)
+- [x] **`DoubleMLVector`** — multi-treatment base class first iteration (`doubleml/double_ml_vector.py`)
+- [x] **BLP multi-rep support** — `doubleml/utils/blp.py`
+
+### In Progress
+
+- [ ] **`DoubleMLVector`** — base class exists; no concrete subclass yet
+
+### Feature Gaps vs Legacy Classes
+
+Missing from `PLR` / `IRM` scalar compared to `DoubleMLPLR` / `DoubleMLIRM`:
+
+| Feature | Legacy location | Applies to | Notes |
+|---------|----------------|-----------|-------|
+| `cate()` | `plr.py:447`, `irm.py:564` | both | Depends on BLP (multi-rep already done) |
+| `gate()` | `plr.py:485`, `irm.py:598` | both | Delegates to `cate()` |
+| `_partial_out()` | `plr.py:522` | PLR only | Helper needed by PLR `cate()`/`gate()` |
+| `policy_tree()` | `irm.py:635` | IRM only | Not planned yet |
+
+Weighted effects in IRM (`weights` dict form):
+- Array weights: ✅ supported
+- Dict weights with `weights_bar`: ⚠️ **gap** — `_check_weights()` called at init with `n_rep=1` (`utils/_checks.py:276`) but `n_rep` is only determined at `draw_sample_splitting()`. Dict weights with `weights_bar.shape == (n_obs, n_rep > 1)` fail validation incorrectly.
+
+Intentionally **not ported**:
+- Callable score — design decision
+- `trimming_rule` / `trimming_threshold` deprecated props — use `ps_processor_config`
+
+### Planned
+
+| Item | Files | Notes |
+|------|-------|-------|
+| `cate()` + `gate()` for PLR scalar | `doubleml/plm/plr_scalar.py` | Needs `_partial_out()` first |
+| `cate()` + `gate()` for IRM scalar | `doubleml/irm/irm_scalar.py` | |
+| Fix dict `weights_bar` validation for multi-rep | `doubleml/irm/irm_scalar.py` | Defer n_rep shape check to `fit()` |
+| `DoubleMLPLRVector` | `doubleml/plm/plr_vector.py` + tests | First concrete Vector subclass |
+| `DoubleMLPLIVScalar` | `doubleml/plm/pliv_scalar.py` + 7 test files | Next scalar model |
+| `DoubleMLPLPRScalar` | `doubleml/plm/plpr_scalar.py` + 7 test files | |
+| DID scalar variants | `doubleml/did/*_scalar.py` | DID, DIDCSBinary, DIDMulti |
+| `DoubleMLVector` tests | `doubleml/tests/test_vector_*.py` | Base class tests |
+
+---
+
+## How to Update This File
+
+- Mark items `[x]` when complete
+- Move items between sections as work progresses
+- Add new planned items as they are identified
+- Commit this file with the relevant code changes so the status stays in sync

From 3818c2b86703dcae861e99497e143f9dc84c4931 Mon Sep 17 00:00:00 2001
From: SvenKlaassen <sven.klaassen@uni-hamburg.de>
Date: Wed, 25 Mar 2026 10:00:54 +0100
Subject: [PATCH 24/38] Refactor weight handling in IRM and add comprehensive
 exception tests for weights

---
 doubleml/irm/irm_scalar.py                    |  10 +-
 .../irm/tests/test_irm_scalar_exceptions.py   |  86 +++++++++++++
 .../tests/test_irm_scalar_weighted_scores.py  | 113 ++++++++++++++++++
 doubleml/utils/_checks.py                     |  17 ++-
 4 files changed, 217 insertions(+), 9 deletions(-)
 create mode 100644 doubleml/irm/tests/test_irm_scalar_weighted_scores.py

diff --git a/doubleml/irm/irm_scalar.py b/doubleml/irm/irm_scalar.py
index f6fe85de..44e62e96 100644
--- a/doubleml/irm/irm_scalar.py
+++ b/doubleml/irm/irm_scalar.py
@@ -147,8 +147,8 @@ def __init__(
             self._ps_processor_config = PSProcessorConfig()
             self._ps_processor = PSProcessor.from_config(self._ps_processor_config)
 
-        # Weights
-        _check_weights(weights, score, obj_dml_data.n_obs, n_rep=1)
+        # Weights — n_rep shape deferred to _get_weights() when n_rep is known
+        _check_weights(weights, score, obj_dml_data.n_obs)
         self._initialize_weights(weights)
 
         # Set learners if provided
@@ -460,8 +460,12 @@ def _get_weights(self, m_hat: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
             w = self._weights["weights"]
             weights = w[:, np.newaxis] * np.ones((1, self.n_rep))  # (n_obs, n_rep)
             if "weights_bar" in self._weights:
-                # weights_bar has shape (n_obs, n_rep) already
                 weights_bar = self._weights["weights_bar"]
+                if weights_bar.shape != (self.n_obs, self.n_rep):
+                    raise ValueError(
+                        f"weights_bar must have shape ({self.n_obs}, {self.n_rep}). "
+                        f"weights_bar of shape {weights_bar.shape} was passed."
+                    )
             else:
                 weights_bar = weights.copy()
         else:
diff --git a/doubleml/irm/tests/test_irm_scalar_exceptions.py b/doubleml/irm/tests/test_irm_scalar_exceptions.py
index 2fe5fd35..3f0ac0bb 100644
--- a/doubleml/irm/tests/test_irm_scalar_exceptions.py
+++ b/doubleml/irm/tests/test_irm_scalar_exceptions.py
@@ -165,6 +165,92 @@ def test_irm_scalar_exception_binary_predictions_g():
         dml_obj.fit_nuisance_models()
 
 
+# ==================== weights exceptions ====================
+
+_N_OBS = obj_dml_data.n_obs
+
+
+@pytest.mark.ci
+def test_exception_weights_wrong_type():
+    """weights of non-array, non-dict type raises TypeError."""
+    msg = r"weights must be a numpy array or dictionary\."
+    with pytest.raises(TypeError, match=msg):
+        IRM(obj_dml_data, weights="not_an_array")
+
+
+@pytest.mark.ci
+def test_exception_weights_wrong_shape():
+    """1D weights array with wrong length raises ValueError."""
+    msg = r"weights must have shape"
+    with pytest.raises(ValueError, match=msg):
+        IRM(obj_dml_data, weights=np.ones(_N_OBS + 1))
+
+
+@pytest.mark.ci
+def test_exception_weights_negative():
+    """weights array with a negative value raises ValueError."""
+    w = np.ones(_N_OBS)
+    w[0] = -1.0
+    msg = r"All weights values must be greater or equal 0\."
+    with pytest.raises(ValueError, match=msg):
+        IRM(obj_dml_data, weights=w)
+
+
+@pytest.mark.ci
+def test_exception_weights_atte_not_array():
+    """dict weights with score='ATTE' raises TypeError."""
+    dict_weights = {"weights": np.ones(_N_OBS), "weights_bar": np.ones((_N_OBS, 1))}
+    msg = r"weights must be a numpy array for ATTE score\."
+    with pytest.raises(TypeError, match=msg):
+        IRM(obj_dml_data, score="ATTE", weights=dict_weights)
+
+
+@pytest.mark.ci
+def test_exception_weights_atte_not_binary():
+    """Non-binary array weights with score='ATTE' raises ValueError."""
+    w = np.full(_N_OBS, 0.5)
+    msg = r"weights must be binary for ATTE score\."
+    with pytest.raises(ValueError, match=msg):
+        IRM(obj_dml_data, score="ATTE", weights=w)
+
+
+@pytest.mark.ci
+def test_exception_dict_weights_wrong_keys():
+    """Dict weights with unexpected keys raises ValueError."""
+    bad_dict = {"weights": np.ones(_N_OBS), "wrong_key": np.ones((_N_OBS, 1))}
+    msg = r"weights must have keys"
+    with pytest.raises(ValueError, match=msg):
+        IRM(obj_dml_data, weights=bad_dict)
+
+
+@pytest.mark.ci
+def test_exception_dict_weights_bar_wrong_n_obs():
+    """Dict weights_bar with wrong number of rows raises ValueError at init."""
+    dict_weights = {
+        "weights": np.ones(_N_OBS),
+        "weights_bar": np.ones((_N_OBS + 1, 1)),
+    }
+    msg = r"weights_bar must be a 2-dimensional array with"
+    with pytest.raises(ValueError, match=msg):
+        IRM(obj_dml_data, weights=dict_weights)
+
+
+@pytest.mark.ci
+def test_exception_dict_weights_bar_wrong_n_rep():
+    """Dict weights_bar with wrong n_rep column raises ValueError at estimate time."""
+    # weights_bar has 2 columns but n_rep=3 is used; mismatch detected in estimate_causal_parameters()
+    dict_weights = {
+        "weights": np.ones(_N_OBS),
+        "weights_bar": np.ones((_N_OBS, 2)),
+    }
+    dml_obj = IRM(obj_dml_data, ml_g=ml_g, ml_m=ml_m, weights=dict_weights)
+    dml_obj.draw_sample_splitting(n_folds=2, n_rep=3)
+    dml_obj.fit_nuisance_models()
+    msg = r"weights_bar must have shape"
+    with pytest.raises(ValueError, match=msg):
+        dml_obj.estimate_causal_parameters()
+
+
 # ==================== sensitivity_analysis exceptions ====================
 
 
diff --git a/doubleml/irm/tests/test_irm_scalar_weighted_scores.py b/doubleml/irm/tests/test_irm_scalar_weighted_scores.py
new file mode 100644
index 00000000..ee2586ae
--- /dev/null
+++ b/doubleml/irm/tests/test_irm_scalar_weighted_scores.py
@@ -0,0 +1,113 @@
+"""Test weighted score computation for IRM scalar, including dict weights with n_rep > 1.
+
+With constant dict weights c * ones (weights = c, weights_bar = c):
+  psi_a = -weights / mean(weights) = -c/c = -1  (same as unweighted)
+  psi_b = c * psi_b_unweighted
+  theta = -mean(psi_b) / mean(psi_a) = c * theta_unweighted
+  se    = c * se_unweighted  (psi scales by c, psi_a unchanged)
+"""
+
+import numpy as np
+import pytest
+from sklearn.base import clone
+from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
+
+from doubleml.irm.datasets import make_irm_data
+from doubleml.irm.irm_scalar import IRM
+
+_N_FOLDS = 5
+_N_OBS = 500
+_DIM_X = 10
+_WEIGHT_CONST = 0.5
+
+ml_g = RandomForestRegressor(n_estimators=10, max_depth=3, random_state=42)
+ml_m = RandomForestClassifier(n_estimators=10, max_depth=3, random_state=42)
+
+
+@pytest.fixture(scope="module", params=[1, 3])
+def n_rep(request):
+    """Number of repetitions — covers single- and multi-rep cases."""
+    return request.param
+
+
+@pytest.fixture(scope="module")
+def irm_data():
+    """Shared IRM dataset."""
+    np.random.seed(42)
+    return make_irm_data(theta=0.5, n_obs=_N_OBS, dim_x=_DIM_X, return_type="DoubleMLData")
+
+
+@pytest.fixture(scope="module")
+def constant_weights_fixture(irm_data, n_rep):
+    """Fit an unweighted and a constant-0.5-weighted IRM on shared splits; return them under "ref"/"weighted".
+
+    With weights = weights_bar = 0.5 * ones:
+      theta_weighted = 0.5 * theta_unweighted
+      se_weighted    = 0.5 * se_unweighted
+    """
+    n_obs = irm_data.n_obs
+    const_weights = {
+        "weights": np.full(n_obs, _WEIGHT_CONST),
+        "weights_bar": np.full((n_obs, n_rep), _WEIGHT_CONST),
+    }
+
+    # Unweighted reference
+    dml_ref = IRM(irm_data, score="ATE")
+    dml_ref.set_learners(ml_g=clone(ml_g), ml_m=clone(ml_m))
+    dml_ref.draw_sample_splitting(n_folds=_N_FOLDS, n_rep=n_rep)
+    dml_ref.fit_nuisance_models()
+    dml_ref.estimate_causal_parameters()
+
+    # Constant-weighted — assign dml_ref's splits to the private attributes (no re-draw) so both models use the same folds
+    dml_weighted = IRM(irm_data, score="ATE", weights=const_weights)
+    dml_weighted.set_learners(ml_g=clone(ml_g), ml_m=clone(ml_m))
+    dml_weighted._n_folds = _N_FOLDS
+    dml_weighted._n_rep = n_rep
+    dml_weighted._smpls = dml_ref.smpls
+    dml_weighted.fit_nuisance_models()
+    dml_weighted.estimate_causal_parameters()
+
+    return {"ref": dml_ref, "weighted": dml_weighted}
+
+
+@pytest.mark.ci
+def test_dict_weights_n_rep_gt1_succeeds():
+    """IRM scalar with weights_bar shape (n_obs, 3) and n_rep=3 fits without error."""
+    np.random.seed(42)
+    obj_dml_data = make_irm_data(theta=0.5, n_obs=200, dim_x=5, return_type="DoubleMLData")
+    n_obs = obj_dml_data.n_obs
+    n_rep = 3
+
+    dict_weights = {
+        "weights": np.full(n_obs, _WEIGHT_CONST),
+        "weights_bar": np.full((n_obs, n_rep), _WEIGHT_CONST),
+    }
+    dml_obj = IRM(obj_dml_data, score="ATE", weights=dict_weights)
+    dml_obj.set_learners(ml_g=clone(ml_g), ml_m=clone(ml_m))
+    dml_obj.draw_sample_splitting(n_folds=3, n_rep=n_rep)
+    dml_obj.fit_nuisance_models()
+    dml_obj.estimate_causal_parameters()
+
+
+@pytest.mark.ci
+def test_constant_weights_coef(constant_weights_fixture):
+    """theta (coef) with constant weights c equals c * theta_unweighted."""
+    np.testing.assert_allclose(
+        constant_weights_fixture["weighted"].coef,
+        _WEIGHT_CONST * constant_weights_fixture["ref"].coef,
+        rtol=1e-9,
+    )
+
+
+@pytest.mark.ci
+def test_constant_weights_se(constant_weights_fixture):
+    """se with constant weights c equals c * se_unweighted.
+
+    psi_weighted = c * psi_unweighted, psi_a unchanged (-1), so
+    se_weighted = sqrt(mean(c² * psi²)) / sqrt(n) = c * se_unweighted.
+    """
+    np.testing.assert_allclose(
+        constant_weights_fixture["weighted"].se,
+        _WEIGHT_CONST * constant_weights_fixture["ref"].se,
+        rtol=1e-9,
+    )
diff --git a/doubleml/utils/_checks.py b/doubleml/utils/_checks.py
index ad493e28..f6470d13 100644
--- a/doubleml/utils/_checks.py
+++ b/doubleml/utils/_checks.py
@@ -240,7 +240,7 @@ def _check_benchmarks(benchmarks):
     return
 
 
-def _check_weights(weights, score, n_obs, n_rep):
+def _check_weights(weights, score, n_obs, n_rep: int | None = None):
     if weights is not None:
         # check general type
         if (not isinstance(weights, np.ndarray)) and (not isinstance(weights, dict)):
@@ -273,14 +273,19 @@ def _check_weights(weights, score, n_obs, n_rep):
             if not set(weights.keys()) == set(expected_keys):
                 raise ValueError(f"weights must have keys {expected_keys}. keys {str(weights.keys())} were passed.")
 
-            expected_shapes = [(n_obs,), (n_obs, n_rep)]
-            if weights["weights"].shape != expected_shapes[0]:
+            if weights["weights"].shape != (n_obs,):
                 raise ValueError(
-                    f"weights must have shape {expected_shapes[0]}. weights of shape {weights['weights'].shape} was passed."
+                    f"weights must have shape ({n_obs},). weights of shape {weights['weights'].shape} was passed."
                 )
-            if weights["weights_bar"].shape != expected_shapes[1]:
+            # weights_bar must be 2D with n_obs rows; the n_rep column is validated later when n_rep is known
+            if weights["weights_bar"].ndim != 2 or weights["weights_bar"].shape[0] != n_obs:
                 raise ValueError(
-                    f"weights_bar must have shape {expected_shapes[1]}. "
+                    f"weights_bar must be a 2-dimensional array with {n_obs} rows. "
+                    f"weights_bar of shape {weights['weights_bar'].shape} was passed."
+                )
+            if n_rep is not None and weights["weights_bar"].shape[1] != n_rep:
+                raise ValueError(
+                    f"weights_bar must have shape ({n_obs}, {n_rep}). "
                     f"weights_bar of shape {weights['weights_bar'].shape} was passed."
                 )
             if (not np.all(weights["weights"] >= 0)) or (not np.all(weights["weights_bar"] >= 0)):

From 82d95a5cb3d8c2793458b1e74c2b2e797e4a0297 Mon Sep 17 00:00:00 2001
From: SvenKlaassen <sven.klaassen@uni-hamburg.de>
Date: Sat, 9 May 2026 08:55:55 +0200
Subject: [PATCH 25/38] refactor: enhance validation for weights_bar in IRM and
 update fit handling in DoubleMLScalar

---
 .claude/STATUS.md                             |  3 +-
 doubleml/double_ml_scalar.py                  | 60 +++++++++---
 doubleml/irm/irm_scalar.py                    | 15 ++-
 .../irm/tests/test_irm_scalar_exceptions.py   | 49 +++++++++-
 doubleml/tests/test_scalar_fit.py             | 93 +++++++++++++++++++
 5 files changed, 197 insertions(+), 23 deletions(-)
 create mode 100644 doubleml/tests/test_scalar_fit.py

diff --git a/.claude/STATUS.md b/.claude/STATUS.md
index ac0d5881..c3a73ecb 100644
--- a/.claude/STATUS.md
+++ b/.claude/STATUS.md
@@ -45,7 +45,7 @@ Missing from `PLR` / `IRM` scalar compared to `DoubleMLPLR` / `DoubleMLIRM`:
 
 Weighted effects in IRM (`weights` dict form):
 - Array weights: ✅ supported
-- Dict weights with `weights_bar`: ⚠️ **gap** — `_check_weights()` called at init with `n_rep=1` (`utils/_checks.py:276`) but `n_rep` is only determined at `draw_sample_splitting()`. Dict weights with `weights_bar.shape == (n_obs, n_rep > 1)` fail validation incorrectly.
+- Dict weights with `weights_bar`: ✅ supported — init defers the `n_rep` column check; `DoubleMLScalar._check_smpls_dependent_inputs()` hook validates `weights_bar.shape == (n_obs, n_rep)` from inside both `draw_sample_splitting()` and `set_sample_splitting()`. `fit(n_folds=..., n_rep=...)` re-draws splits with a `UserWarning` when args conflict with existing splits.
 
 Intentionally **not ported**:
 - Callable score — design decision
@@ -57,7 +57,6 @@ Intentionally **not ported**:
 |------|-------|-------|
 | `cate()` + `gate()` for PLR scalar | `doubleml/plm/plr_scalar.py` | Needs `_partial_out()` first |
 | `cate()` + `gate()` for IRM scalar | `doubleml/irm/irm_scalar.py` | |
-| Fix dict `weights_bar` validation for multi-rep | `doubleml/irm/irm_scalar.py` | Defer n_rep shape check to `fit()` |
 | `DoubleMLPLRVector` | `doubleml/plm/plr_vector.py` + tests | First concrete Vector subclass |
 | `DoubleMLPLIVScalar` | `doubleml/plm/pliv_scalar.py` + 7 test files | Next scalar model |
 | `DoubleMLPLPRScalar` | `doubleml/plm/plpr_scalar.py` + 7 test files | |
diff --git a/doubleml/double_ml_scalar.py b/doubleml/double_ml_scalar.py
index d10c1c49..d460b824 100644
--- a/doubleml/double_ml_scalar.py
+++ b/doubleml/double_ml_scalar.py
@@ -2,6 +2,7 @@
 Abstract base class for scalar DoubleML models (single parameter estimation).
 """
 
+import warnings
 from abc import ABC, abstractmethod
 from typing import TYPE_CHECKING, Any, Callable, ClassVar, Self
 
@@ -417,8 +418,8 @@ def set_learners(self, **kwargs: object) -> Self:
 
     def fit(
         self,
-        n_folds: int = 5,
-        n_rep: int = 1,
+        n_folds: int | None = None,
+        n_rep: int | None = None,
         n_jobs_cv: int | None = None,
         external_predictions: dict[str, np.ndarray] | None = None,
         **kwargs,
@@ -431,12 +432,18 @@ def fit(
 
         Parameters
         ----------
-        n_folds : int, optional
-            Number of folds for cross-fitting. Default is 5.
-            Only used if sample splitting has not been drawn yet.
-        n_rep : int, optional
-            Number of repetitions for sample splitting. Default is 1.
-            Only used if sample splitting has not been drawn yet.
+        n_folds : int or None, optional
+            Number of folds for cross-fitting. If sample splitting has not been
+            drawn yet, defaults to 5. If sample splitting already exists and
+            ``n_folds`` differs from ``self.n_folds``, the splits are re-drawn
+            (discarding existing splits and fit state) and a :class:`UserWarning`
+            is emitted. Default is ``None``.
+        n_rep : int or None, optional
+            Number of repetitions for sample splitting. If sample splitting has
+            not been drawn yet, defaults to 1. If sample splitting already exists
+            and ``n_rep`` differs from ``self.n_rep``, the splits are re-drawn
+            (discarding existing splits and fit state) and a :class:`UserWarning`
+            is emitted. Default is ``None``.
         n_jobs_cv : int, optional
             Number of jobs for parallel processing during cross-validation.
             Currently not used (reserved for future parallelization).
@@ -454,7 +461,26 @@ def fit(
             The fitted estimator.
         """
         if self._smpls is None:
-            self.draw_sample_splitting(n_folds=n_folds, n_rep=n_rep)
+            self.draw_sample_splitting(
+                n_folds=5 if n_folds is None else n_folds,
+                n_rep=1 if n_rep is None else n_rep,
+            )
+        else:
+            current_n_folds = self.n_folds
+            current_n_rep = self.n_rep
+            n_folds_conflict = n_folds is not None and n_folds != current_n_folds
+            n_rep_conflict = n_rep is not None and n_rep != current_n_rep
+            if n_folds_conflict or n_rep_conflict:
+                new_n_folds = n_folds if (n_folds is not None and n_folds_conflict) else current_n_folds
+                new_n_rep = n_rep if (n_rep is not None and n_rep_conflict) else current_n_rep
+                warnings.warn(
+                    f"Re-drawing sample splitting (was n_folds={current_n_folds}, n_rep={current_n_rep}; "
+                    f"now n_folds={new_n_folds}, n_rep={new_n_rep}). "
+                    "Existing splits and fit state are discarded.",
+                    UserWarning,
+                    stacklevel=2,
+                )
+                self.draw_sample_splitting(n_folds=new_n_folds, n_rep=new_n_rep)
         self.fit_nuisance_models(n_jobs_cv=n_jobs_cv, external_predictions=external_predictions)
         self.estimate_causal_parameters()
         return self
@@ -639,6 +665,7 @@ def draw_sample_splitting(self, n_folds: int = 5, n_rep: int = 1) -> Self:
             self._smpls_cluster = None
 
         self._reset_fit_state()
+        self._check_smpls_dependent_inputs()
 
         return self
 
@@ -696,6 +723,7 @@ def set_sample_splitting(self, all_smpls: list, all_smpls_cluster: list | None =
             self._n_folds_per_cluster = None
 
         self._reset_fit_state()
+        self._check_smpls_dependent_inputs()
 
         return self
 
@@ -853,6 +881,18 @@ def _reset_fit_state(self) -> None:
         self._i_rep = None
         self._i_fold = None
 
+    def _check_smpls_dependent_inputs(self) -> None:
+        """
+        Validate inputs whose shape depends on ``n_rep``.
+
+        Called by :meth:`draw_sample_splitting` and :meth:`set_sample_splitting`
+        after ``self._n_rep`` and ``self._smpls`` have been set and fit state has
+        been reset. Subclasses override this hook to validate user-supplied
+        objects whose shape only becomes meaningful once ``n_rep`` is known
+        (e.g. ``weights_bar`` in IRM). The default implementation is a no-op.
+        """
+        return
+
     def evaluate_learners(
         self,
         learners: list[str] | None = None,
@@ -975,8 +1015,6 @@ def _sensitivity_element_est(self) -> dict[str, np.ndarray] | None:
 
     def _validate_sensitivity_elements(self) -> None:
         """Re-estimate nu2 from riesz representer if nu2 is non-positive (degenerate PS)."""
-        import warnings
-
         if self._sensitivity_elements is None:
             return
         nu2 = self._sensitivity_elements["nu2"]  # (1, 1, n_rep)
diff --git a/doubleml/irm/irm_scalar.py b/doubleml/irm/irm_scalar.py
index 44e62e96..58f94c95 100644
--- a/doubleml/irm/irm_scalar.py
+++ b/doubleml/irm/irm_scalar.py
@@ -461,11 +461,6 @@ def _get_weights(self, m_hat: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
             weights = w[:, np.newaxis] * np.ones((1, self.n_rep))  # (n_obs, n_rep)
             if "weights_bar" in self._weights:
                 weights_bar = self._weights["weights_bar"]
-                if weights_bar.shape != (self.n_obs, self.n_rep):
-                    raise ValueError(
-                        f"weights_bar must have shape ({self.n_obs}, {self.n_rep}). "
-                        f"weights_bar of shape {weights_bar.shape} was passed."
-                    )
             else:
                 weights_bar = weights.copy()
         else:
@@ -480,6 +475,16 @@ def _get_weights(self, m_hat: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
 
         return weights, weights_bar
 
+    def _check_smpls_dependent_inputs(self) -> None:
+        """Raise ``ValueError`` unless ``weights_bar`` (when present) has shape ``(n_obs, n_rep)``."""
+        if "weights_bar" in self._weights:
+            weights_bar = self._weights["weights_bar"]
+            expected = (self.n_obs, self.n_rep)
+            if weights_bar.shape != expected:
+                raise ValueError(
+                    f"weights_bar must have shape {expected}. " f"weights_bar of shape {weights_bar.shape} was passed."
+                )
+
     def _sensitivity_element_est(self) -> dict[str, np.ndarray] | None:
         """
         Compute IRM sensitivity elements vectorized over all repetitions.
diff --git a/doubleml/irm/tests/test_irm_scalar_exceptions.py b/doubleml/irm/tests/test_irm_scalar_exceptions.py
index 3f0ac0bb..a1db7598 100644
--- a/doubleml/irm/tests/test_irm_scalar_exceptions.py
+++ b/doubleml/irm/tests/test_irm_scalar_exceptions.py
@@ -237,18 +237,57 @@ def test_exception_dict_weights_bar_wrong_n_obs():
 
 @pytest.mark.ci
 def test_exception_dict_weights_bar_wrong_n_rep():
-    """Dict weights_bar with wrong n_rep column raises ValueError at estimate time."""
-    # weights_bar has 2 columns but n_rep=3 is used; mismatch detected in estimate_causal_parameters()
+    """Dict weights_bar with wrong n_rep column raises ValueError at draw_sample_splitting()."""
+    # weights_bar has 2 columns but n_rep=3 is requested; the mismatch is detected as
+    # soon as n_rep becomes known (i.e. inside draw_sample_splitting), before any nuisance fitting.
     dict_weights = {
         "weights": np.ones(_N_OBS),
         "weights_bar": np.ones((_N_OBS, 2)),
     }
     dml_obj = IRM(obj_dml_data, ml_g=ml_g, ml_m=ml_m, weights=dict_weights)
-    dml_obj.draw_sample_splitting(n_folds=2, n_rep=3)
-    dml_obj.fit_nuisance_models()
     msg = r"weights_bar must have shape"
     with pytest.raises(ValueError, match=msg):
-        dml_obj.estimate_causal_parameters()
+        dml_obj.draw_sample_splitting(n_folds=2, n_rep=3)
+
+
+@pytest.mark.ci
+def test_exception_dict_weights_bar_wrong_n_rep_via_set_sample_splitting():
+    """Dict weights_bar mismatch is also caught when splits arrive via set_sample_splitting()."""
+    dict_weights = {
+        "weights": np.ones(_N_OBS),
+        "weights_bar": np.ones((_N_OBS, 1)),
+    }
+    dml_obj = IRM(obj_dml_data, ml_g=ml_g, ml_m=ml_m, weights=dict_weights)
+
+    # Build a manually constructed sample splitting list with n_rep=2.
+    rng = np.random.default_rng(0)
+    indices = np.arange(_N_OBS)
+    all_smpls = []
+    for _ in range(2):
+        perm = rng.permutation(indices)
+        fold_size = _N_OBS // 2
+        test1, test2 = perm[:fold_size], perm[fold_size:]
+        train1 = np.setdiff1d(indices, test1)
+        train2 = np.setdiff1d(indices, test2)
+        all_smpls.append([(train1, test1), (train2, test2)])
+
+    msg = r"weights_bar must have shape"
+    with pytest.raises(ValueError, match=msg):
+        dml_obj.set_sample_splitting(all_smpls)
+
+
+@pytest.mark.ci
+def test_exception_dict_weights_bar_after_redraw():
+    """Re-drawing splits with a different n_rep re-runs the weights_bar check."""
+    dict_weights = {
+        "weights": np.ones(_N_OBS),
+        "weights_bar": np.ones((_N_OBS, 1)),
+    }
+    dml_obj = IRM(obj_dml_data, ml_g=ml_g, ml_m=ml_m, weights=dict_weights)
+    dml_obj.draw_sample_splitting(n_folds=2, n_rep=1)  # OK
+    msg = r"weights_bar must have shape"
+    with pytest.raises(ValueError, match=msg):
+        dml_obj.draw_sample_splitting(n_folds=2, n_rep=2)
 
 
 # ==================== sensitivity_analysis exceptions ====================
diff --git a/doubleml/tests/test_scalar_fit.py b/doubleml/tests/test_scalar_fit.py
new file mode 100644
index 00000000..ceb0e76e
--- /dev/null
+++ b/doubleml/tests/test_scalar_fit.py
@@ -0,0 +1,93 @@
+"""Test fit() argument handling on DoubleMLScalar (vehicle: PLR scalar)."""
+
+import warnings
+
+import numpy as np
+import pytest
+from sklearn.linear_model import LinearRegression
+
+from doubleml.plm.datasets import make_plr_CCDDHNR2018
+from doubleml.plm.plr_scalar import PLR
+
+N_OBS = 200
+N_FOLDS = 3
+
+
+def _build_unfitted_plr() -> PLR:
+    np.random.seed(3141)
+    dml_data = make_plr_CCDDHNR2018(n_obs=N_OBS, dim_x=10, alpha=0.5)
+    dml_obj = PLR(dml_data)
+    dml_obj.set_learners(ml_l=LinearRegression(), ml_m=LinearRegression())
+    return dml_obj
+
+
+@pytest.mark.ci
+def test_fit_redraws_on_n_rep_mismatch():
+    """fit(n_rep=...) re-draws splits and warns when n_rep differs from existing splits."""
+    dml_obj = _build_unfitted_plr()
+    dml_obj.draw_sample_splitting(n_folds=N_FOLDS, n_rep=1)
+    msg = r"Re-drawing sample splitting"
+    with pytest.warns(UserWarning, match=msg):
+        dml_obj.fit(n_rep=3)
+    assert dml_obj.n_rep == 3
+    assert dml_obj.n_folds == N_FOLDS  # n_folds preserved
+    assert dml_obj.all_thetas.shape == (1, 3)
+
+
+@pytest.mark.ci
+def test_fit_redraws_on_n_folds_mismatch():
+    """fit(n_folds=...) re-draws splits and warns when n_folds differs from existing splits."""
+    dml_obj = _build_unfitted_plr()
+    dml_obj.draw_sample_splitting(n_folds=N_FOLDS, n_rep=2)
+    msg = r"Re-drawing sample splitting"
+    with pytest.warns(UserWarning, match=msg):
+        dml_obj.fit(n_folds=N_FOLDS + 2)
+    assert dml_obj.n_folds == N_FOLDS + 2
+    assert dml_obj.n_rep == 2  # n_rep preserved
+
+
+@pytest.mark.ci
+def test_fit_no_warning_when_consistent():
+    """fit(n_rep, n_folds) matching existing splits emits no UserWarning and keeps splits."""
+    dml_obj = _build_unfitted_plr()
+    dml_obj.draw_sample_splitting(n_folds=N_FOLDS, n_rep=2)
+    original_smpls = dml_obj.smpls
+    with warnings.catch_warnings():
+        warnings.simplefilter("error", UserWarning)
+        dml_obj.fit(n_folds=N_FOLDS, n_rep=2)
+    # smpls were not redrawn
+    assert dml_obj.smpls is original_smpls
+
+
+@pytest.mark.ci
+def test_fit_no_warning_when_args_omitted():
+    """fit() with no args emits no UserWarning even when splits differ from defaults."""
+    dml_obj = _build_unfitted_plr()
+    dml_obj.draw_sample_splitting(n_folds=N_FOLDS, n_rep=2)
+    original_smpls = dml_obj.smpls
+    with warnings.catch_warnings():
+        warnings.simplefilter("error", UserWarning)
+        dml_obj.fit()
+    assert dml_obj.n_rep == 2
+    assert dml_obj.n_folds == N_FOLDS
+    assert dml_obj.smpls is original_smpls
+
+
+@pytest.mark.ci
+def test_fit_draws_default_splits_when_none_set():
+    """fit() without prior draw_sample_splitting() falls back to default n_folds=5, n_rep=1."""
+    dml_obj = _build_unfitted_plr()
+    dml_obj.fit()
+    assert dml_obj.n_folds == 5
+    assert dml_obj.n_rep == 1
+
+
+@pytest.mark.ci
+def test_fit_draws_explicit_splits_when_none_set():
+    """fit(n_folds, n_rep) without prior draw_sample_splitting() honors the args without warning."""
+    dml_obj = _build_unfitted_plr()
+    with warnings.catch_warnings():
+        warnings.simplefilter("error", UserWarning)
+        dml_obj.fit(n_folds=4, n_rep=2)
+    assert dml_obj.n_folds == 4
+    assert dml_obj.n_rep == 2

From 10305aa7b68b8dee2a33845093c94e976d7fd8ae Mon Sep 17 00:00:00 2001
From: SvenKlaassen <sven.klaassen@uni-hamburg.de>
Date: Sat, 9 May 2026 10:25:19 +0200
Subject: [PATCH 26/38] feat: Add CATE and GATE methods to IRM and PLR scalar
 models

- Implemented `cate()` and `gate()` methods in `IRM` and `PLR` classes for estimating conditional average treatment effects.
- Enhanced `DoubleMLBLP` to support per-rep basis for multi-rep scenarios.
- Updated tests for `IRM` and `PLR` to validate new functionality, including checks for correct handling of multi-rep bases and group effects.
- Improved validation of basis inputs in `DoubleMLBLP` to accept both single DataFrame and list of DataFrames.
- Added new test cases to ensure robustness of the new features and backward compatibility with legacy models.
---
 .claude/STATUS.md                             |  11 +-
 doubleml/irm/irm_scalar.py                    |  73 +++++++
 .../irm/tests/test_irm_scalar_cate_gate.py    | 205 ++++++++++++++++++
 doubleml/plm/plr.py                           |   4 +-
 doubleml/plm/plr_scalar.py                    | 106 +++++++++
 doubleml/plm/tests/test_plr.py                |  28 +++
 .../plm/tests/test_plr_scalar_cate_gate.py    | 205 ++++++++++++++++++
 doubleml/utils/blp.py                         |  63 ++++--
 doubleml/utils/tests/test_blp.py              |  76 ++++++-
 9 files changed, 749 insertions(+), 22 deletions(-)
 create mode 100644 doubleml/irm/tests/test_irm_scalar_cate_gate.py
 create mode 100644 doubleml/plm/tests/test_plr_scalar_cate_gate.py

diff --git a/.claude/STATUS.md b/.claude/STATUS.md
index c3a73ecb..c358e91f 100644
--- a/.claude/STATUS.md
+++ b/.claude/STATUS.md
@@ -25,6 +25,9 @@ nuisance evaluation, and sensitivity analysis.
 - [x] **`DoubleMLPLRScalar`** — PLR scalar (`doubleml/plm/plr_scalar.py`) with all 7 test files:
   - `test_plr_scalar.py`, `_return_types`, `_exceptions`, `_vs_plr`, `_external_predictions`, `_tune_ml_models`, `_evaluate_learners`, `_sensitivity`
 - [x] **`DoubleMLIRMScalar`** — IRM scalar (`doubleml/irm/irm_scalar.py`) with all 7 test files (same structure)
+- [x] **`cate()` + `gate()` for IRM scalar** — `doubleml/irm/irm_scalar.py` + `test_irm_scalar_cate_gate.py`
+- [x] **`cate()` + `gate()` + `_partial_out()` for PLR scalar** — `doubleml/plm/plr_scalar.py` + `test_plr_scalar_cate_gate.py`. Multi-rep × multi-column basis fully supported.
+- [x] **`DoubleMLBLP` per-rep basis API** — `basis` may be a single `pd.DataFrame` (shared) or a `list[pd.DataFrame]` of length `n_rep`. Also fixes the legacy `DoubleMLPLR.cate()` multi-rep bug (`basis * D_tilde` mis-broadcast for `n_rep>1` and `d_basis>1`).
 - [x] **`DoubleMLVector`** — multi-treatment base class first iteration (`doubleml/double_ml_vector.py`)
 - [x] **BLP multi-rep support** — `doubleml/utils/blp.py`
 
@@ -38,9 +41,9 @@ Missing from `PLR` / `IRM` scalar compared to `DoubleMLPLR` / `DoubleMLIRM`:
 
 | Feature | Legacy location | Applies to | Notes |
 |---------|----------------|-----------|-------|
-| `cate()` | `plr.py:447`, `irm.py:564` | both | Depends on BLP (multi-rep already done) |
-| `gate()` | `plr.py:485`, `irm.py:598` | both | Delegates to `cate()` |
-| `_partial_out()` | `plr.py:522` | PLR only | Helper needed by PLR `cate()`/`gate()` |
+| `cate()` | `plr.py:447`, `irm.py:564` | — | ✅ ported for both IRM and PLR |
+| `gate()` | `plr.py:485`, `irm.py:598` | — | ✅ ported for both IRM and PLR |
+| `_partial_out()` | `plr.py:522` | — | ✅ ported for PLR scalar |
 | `policy_tree()` | `irm.py:635` | IRM only | Not planned yet |
 
 Weighted effects in IRM (`weights` dict form):
@@ -55,8 +58,6 @@ Intentionally **not ported**:
 
 | Item | Files | Notes |
 |------|-------|-------|
-| `cate()` + `gate()` for PLR scalar | `doubleml/plm/plr_scalar.py` | Needs `_partial_out()` first |
-| `cate()` + `gate()` for IRM scalar | `doubleml/irm/irm_scalar.py` | |
 | `DoubleMLPLRVector` | `doubleml/plm/plr_vector.py` + tests | First concrete Vector subclass |
 | `DoubleMLPLIVScalar` | `doubleml/plm/pliv_scalar.py` + 7 test files | Next scalar model |
 | `DoubleMLPLPRScalar` | `doubleml/plm/plpr_scalar.py` + 7 test files | |
diff --git a/doubleml/irm/irm_scalar.py b/doubleml/irm/irm_scalar.py
index 58f94c95..ac44a346 100644
--- a/doubleml/irm/irm_scalar.py
+++ b/doubleml/irm/irm_scalar.py
@@ -4,9 +4,11 @@
 
 from __future__ import annotations
 
+import warnings
 from typing import Any, ClassVar, Self
 
 import numpy as np
+import pandas as pd
 from sklearn.base import clone
 from sklearn.utils.multiclass import type_of_target
 
@@ -15,6 +17,7 @@
 from ..utils._checks import _check_binary_predictions, _check_finite_predictions, _check_score, _check_weights
 from ..utils._learner import LearnerSpec, predict_nuisance
 from ..utils._propensity_score import _propensity_score_adjustment
+from ..utils.blp import DoubleMLBLP
 from ..utils.propensity_score_processing import PSProcessor, PSProcessorConfig
 
 
@@ -357,6 +360,76 @@ def _get_score_elements(self) -> dict[str, np.ndarray]:
 
         return {"psi_a": psi_a, "psi_b": psi_b}
 
+    # ==================== Heterogeneous Effects ====================
+
+    def cate(self, basis: pd.DataFrame, is_gate: bool = False, **kwargs: Any) -> DoubleMLBLP:
+        """
+        Calculate conditional average treatment effects (CATE) for a given basis.
+
+        Parameters
+        ----------
+        basis : :class:`pandas.DataFrame`
+            The basis for estimating the best linear predictor. Has to have the shape ``(n_obs, d)``,
+            where ``n_obs`` is the number of observations and ``d`` is the number of predictors.
+        is_gate : bool
+            Indicates whether the basis is constructed for GATEs (dummy-basis).
+            Default is ``False``.
+        **kwargs : dict
+            Additional keyword arguments passed to :meth:`statsmodels.regression.linear_model.OLS.fit`,
+            e.g. ``cov_type``.
+
+        Returns
+        -------
+        model : :class:`doubleml.DoubleMLBLP`
+            Best linear predictor model.
+        """
+        if self.score != "ATE":
+            raise ValueError(f"Invalid score '{self.score}'. CATE is only implemented for score='ATE'.")
+        if self._predictions is None:
+            raise ValueError("CATE requires a fitted model. Call fit() first.")
+
+        orth_signal = self._get_score_elements()["psi_b"]
+
+        model = DoubleMLBLP(orth_signal, basis=basis, is_gate=is_gate)
+        model.fit(**kwargs)
+        return model
+
+    def gate(self, groups: pd.DataFrame, **kwargs: Any) -> DoubleMLBLP:
+        """
+        Calculate group average treatment effects (GATE) for mutually exclusive groups.
+
+        Parameters
+        ----------
+        groups : :class:`pandas.DataFrame`
+            The group indicator for estimating the best linear predictor. Groups should be mutually exclusive.
+            Has to be dummy coded (all bool or all int) with shape ``(n_obs, d)``, where ``d`` is the number
+            of groups, or have shape ``(n_obs, 1)`` containing the group labels (converted via get_dummies).
+        **kwargs : dict
+            Additional keyword arguments passed to :meth:`statsmodels.regression.linear_model.OLS.fit`,
+            e.g. ``cov_type``.
+
+        Returns
+        -------
+        model : :class:`doubleml.DoubleMLBLP`
+            Best linear predictor model for group effects.
+        """
+        if not isinstance(groups, pd.DataFrame):
+            raise TypeError(f"Groups must be of DataFrame type. Groups of type {str(type(groups))} was passed.")
+
+        if not (all(groups.dtypes == bool) or all(groups.dtypes == int)):  # neither all-bool nor all-int
+            if groups.shape[1] == 1:
+                groups = pd.get_dummies(groups, prefix="Group", prefix_sep="_")
+            else:
+                raise TypeError(
+                    "Columns of groups must be of bool type or int type (dummy coded). "
+                    "Alternatively, groups should only contain one column."
+                )
+
+        if any(groups.sum(0) <= 5):
+            warnings.warn("At least one group effect is estimated with less than 6 observations.")
+
+        return self.cate(groups, is_gate=True, **kwargs)
+
     # ==================== Private Helpers ====================
 
     @staticmethod
diff --git a/doubleml/irm/tests/test_irm_scalar_cate_gate.py b/doubleml/irm/tests/test_irm_scalar_cate_gate.py
new file mode 100644
index 00000000..dfa73924
--- /dev/null
+++ b/doubleml/irm/tests/test_irm_scalar_cate_gate.py
@@ -0,0 +1,205 @@
+"""Test cate() and gate() for the IRM scalar model."""
+
+import numpy as np
+import pandas as pd
+import pytest
+from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
+
+import doubleml as dml
+from doubleml.irm.datasets import make_irm_data
+from doubleml.irm.irm_scalar import IRM
+from doubleml.utils.blp import DoubleMLBLP
+
+N_OBS = 120
+N_FOLDS = 3
+BASIS_DIM = 5
+
+
+def _build_irm(n_rep: int, score: str = "ATE", random_state: int = 42) -> tuple[IRM, pd.DataFrame]:
+    """Build and fit an IRM scalar model with a random basis."""
+    np.random.seed(random_state)
+    data = make_irm_data(n_obs=N_OBS, dim_x=2, return_type="DoubleMLData")
+
+    ml_g = RandomForestRegressor(n_estimators=10, random_state=random_state)
+    ml_m = RandomForestClassifier(n_estimators=10, random_state=random_state)
+
+    model = IRM(data, score=score)
+    model.set_learners(ml_g=ml_g, ml_m=ml_m)
+    model.draw_sample_splitting(n_folds=N_FOLDS, n_rep=n_rep)
+    model.fit()
+
+    basis = pd.DataFrame(
+        np.random.normal(0, 1, size=(N_OBS, BASIS_DIM)),
+        columns=[f"b{i}" for i in range(BASIS_DIM)],
+    )
+    return model, basis
+
+
+@pytest.fixture(scope="module")
+def fitted_irm_single_rep() -> tuple[IRM, pd.DataFrame]:
+    return _build_irm(n_rep=1)
+
+
+@pytest.fixture(scope="module")
+def fitted_irm_multi_rep() -> tuple[IRM, pd.DataFrame]:
+    return _build_irm(n_rep=2)
+
+
+@pytest.mark.ci
+def test_cate_returns_blp(fitted_irm_single_rep):
+    """cate() returns a fitted DoubleMLBLP instance."""
+    model, basis = fitted_irm_single_rep
+    cate = model.cate(basis)
+    assert isinstance(cate, DoubleMLBLP)
+
+
+@pytest.mark.ci
+def test_cate_confint_shape(fitted_irm_single_rep):
+    """cate().confint() returns a DataFrame with one row per basis column."""
+    model, basis = fitted_irm_single_rep
+    cate = model.cate(basis)
+    ci = cate.confint()
+    assert isinstance(ci, pd.DataFrame)
+    assert ci.shape[0] == BASIS_DIM
+
+
+@pytest.mark.ci
+@pytest.mark.parametrize("cov_type", ["nonrobust", "HC1", "HC3"])
+def test_cate_cov_type_passthrough(fitted_irm_single_rep, cov_type):
+    """The cov_type kwarg propagates through to the underlying OLS fit."""
+    model, basis = fitted_irm_single_rep
+    cate = model.cate(basis, cov_type=cov_type)
+    assert cate.blp_model[0].cov_type == cov_type
+
+
+@pytest.mark.ci
+def test_cate_multi_rep_n_rep(fitted_irm_multi_rep):
+    """cate.n_rep matches the model's n_rep."""
+    model, basis = fitted_irm_multi_rep
+    cate = model.cate(basis)
+    assert cate.n_rep == 2
+    assert isinstance(cate.blp_model, list)
+    assert len(cate.blp_model) == 2
+
+
+@pytest.mark.ci
+def test_cate_multi_rep_shapes(fitted_irm_multi_rep):
+    """all_coef and all_se have shape (BASIS_DIM, n_rep) under multi-rep."""
+    model, basis = fitted_irm_multi_rep
+    cate = model.cate(basis)
+    assert cate.all_coef.shape == (BASIS_DIM, 2)
+    assert cate.all_se.shape == (BASIS_DIM, 2)
+    assert isinstance(cate.confint(), pd.DataFrame)
+    assert isinstance(cate.summary, pd.DataFrame)
+
+
+@pytest.mark.ci
+def test_gate_dummy_coded(fitted_irm_single_rep):
+    """gate() accepts a pre-dummy-coded boolean DataFrame."""
+    model, _ = fitted_irm_single_rep
+    x1 = model._dml_data.data["X1"]
+    groups = pd.DataFrame({"Group 1": x1 <= x1.median(), "Group 2": x1 > x1.median()})
+    gate = model.gate(groups)
+    assert isinstance(gate, DoubleMLBLP)
+    assert all(gate.confint().index == groups.columns.to_list())
+
+
+@pytest.mark.ci
+def test_gate_single_column_string(fitted_irm_single_rep):
+    """A single-column string DataFrame is auto-converted to dummies."""
+    model, _ = fitted_irm_single_rep
+    np.random.seed(0)
+    groups = pd.DataFrame(np.random.choice(["A", "B"], N_OBS))
+    gate = model.gate(groups)
+    assert isinstance(gate, DoubleMLBLP)
+    assert all(gate.confint().index == ["Group_A", "Group_B"])
+
+
+@pytest.mark.ci
+def test_gate_warns_small_group(fitted_irm_single_rep):
+    """A group with <= 5 observations triggers a UserWarning."""
+    model, _ = fitted_irm_single_rep
+    groups = pd.DataFrame(
+        {
+            "small": np.array([True] * 3 + [False] * (N_OBS - 3)),
+            "large": np.array([False] * 3 + [True] * (N_OBS - 3)),
+        }
+    )
+    with pytest.warns(UserWarning, match=r"At least one group effect is estimated with less than 6 observations"):
+        model.gate(groups)
+
+
+@pytest.mark.ci
+def test_cate_exception_atte():
+    """CATE on an ATTE model raises ValueError."""
+    model, basis = _build_irm(n_rep=1, score="ATTE")
+    with pytest.raises(ValueError, match=r"only implemented for score='ATE'"):
+        model.cate(basis)
+
+
+@pytest.mark.ci
+def test_cate_exception_before_fit():
+    """Calling cate() before fit() raises ValueError."""
+    np.random.seed(42)
+    data = make_irm_data(n_obs=N_OBS, dim_x=2, return_type="DoubleMLData")
+    model = IRM(data, score="ATE")
+    model.set_learners(ml_g=RandomForestRegressor(n_estimators=10), ml_m=RandomForestClassifier(n_estimators=10))
+    basis = pd.DataFrame(np.random.normal(0, 1, size=(N_OBS, BASIS_DIM)))
+    with pytest.raises(ValueError, match=r"requires a fitted model"):
+        model.cate(basis)
+
+
+@pytest.mark.ci
+def test_gate_exception_not_dataframe(fitted_irm_single_rep):
+    """gate() with a non-DataFrame raises TypeError."""
+    model, _ = fitted_irm_single_rep
+    with pytest.raises(TypeError, match=r"DataFrame type"):
+        model.gate(np.zeros((N_OBS, 2)))
+
+
+@pytest.mark.ci
+def test_gate_exception_bad_dtype(fitted_irm_single_rep):
+    """gate() with multi-column non-bool/int data raises TypeError."""
+    model, _ = fitted_irm_single_rep
+    groups = pd.DataFrame(
+        {
+            "g1": np.random.normal(0, 1, N_OBS),
+            "g2": np.random.normal(0, 1, N_OBS),
+        }
+    )
+    with pytest.raises(TypeError, match=r"bool type or int type"):
+        model.gate(groups)
+
+
+@pytest.mark.ci
+def test_cate_vs_legacy():
+    """CATE coefficients from the new IRM match the legacy DoubleMLIRM."""
+    n_obs = 200
+    np.random.seed(42)
+    data = make_irm_data(n_obs=n_obs, dim_x=5, return_type="DoubleMLData")
+
+    ml_g = RandomForestRegressor(n_estimators=10, max_depth=3, random_state=42)
+    ml_m = RandomForestClassifier(n_estimators=10, max_depth=3, random_state=42)
+
+    np.random.seed(3141)
+    dml_old = dml.DoubleMLIRM(data, ml_g, ml_m, n_folds=N_FOLDS, n_rep=1, score="ATE")
+    dml_old.fit()
+
+    dml_new = IRM(data, score="ATE")
+    dml_new.set_learners(ml_g=ml_g, ml_m=ml_m)
+    dml_new._n_folds = N_FOLDS
+    dml_new._n_rep = 1
+    dml_new._smpls = dml_old.smpls
+    dml_new.fit()
+
+    np.random.seed(0)
+    basis = pd.DataFrame(
+        np.random.normal(0, 1, size=(n_obs, BASIS_DIM)),
+        columns=[f"b{i}" for i in range(BASIS_DIM)],
+    )
+
+    cate_old = dml_old.cate(basis)
+    cate_new = dml_new.cate(basis)
+
+    np.testing.assert_allclose(cate_new.coef, cate_old.coef, rtol=1e-9)
+    np.testing.assert_allclose(cate_new.se, cate_old.se, rtol=1e-9)
diff --git a/doubleml/plm/plr.py b/doubleml/plm/plr.py
index ed96bf84..825ec845 100644
--- a/doubleml/plm/plr.py
+++ b/doubleml/plm/plr.py
@@ -473,10 +473,10 @@ def cate(self, basis, is_gate=False, **kwargs):
 
         Y_tilde, D_tilde = self._partial_out()
 
-        D_basis = basis * D_tilde
+        basis_per_rep = [basis.multiply(D_tilde[:, i_rep], axis=0) for i_rep in range(self.n_rep)]
         model = DoubleMLBLP(
             orth_signal=Y_tilde,
-            basis=D_basis,
+            basis=basis_per_rep,
             is_gate=is_gate,
         )
         model.fit(**kwargs)
diff --git a/doubleml/plm/plr_scalar.py b/doubleml/plm/plr_scalar.py
index 9d2da5eb..4451e267 100644
--- a/doubleml/plm/plr_scalar.py
+++ b/doubleml/plm/plr_scalar.py
@@ -8,6 +8,7 @@
 from typing import Any, ClassVar, Self
 
 import numpy as np
+import pandas as pd
 from sklearn.base import clone
 from sklearn.model_selection import cross_val_predict
 
@@ -15,6 +16,7 @@
 from ..double_ml_linear_score import LinearScoreMixin
 from ..utils._checks import _check_binary_predictions, _check_finite_predictions, _check_is_propensity
 from ..utils._learner import LearnerSpec, predict_nuisance
+from ..utils.blp import DoubleMLBLP
 
 
 class PLR(LinearScoreMixin):
@@ -393,6 +395,110 @@ def _get_score_elements(self) -> dict[str, np.ndarray]:
 
         return {"psi_a": psi_a, "psi_b": psi_b}
 
+    # ==================== Heterogeneous Effects ====================
+
+    def _partial_out(self) -> tuple[np.ndarray, np.ndarray]:
+        """
+        Return partialled-out residuals (Y_tilde, D_tilde), each of shape (n_obs, n_rep).
+
+        For score ``'partialling out'``: ``Y_tilde = y - ml_l``, ``D_tilde = d - ml_m``.
+        For score ``'IV-type'``: ``Y_tilde = y - theta * ml_m - ml_g`` and
+        ``D_tilde = d - ml_m`` where ``theta = self.coef[0]`` (aggregated across reps,
+        matching the legacy DoubleMLPLR behavior).
+
+        Returns
+        -------
+        Y_tilde, D_tilde : tuple[np.ndarray, np.ndarray]
+            Outcome and treatment residuals, each of shape ``(n_obs, n_rep)``.
+        """
+        if self._predictions is None:
+            raise ValueError("predictions are None. Call fit() first.")
+
+        y = self._dml_data.y[:, np.newaxis]
+        d = self._dml_data.d[:, np.newaxis]
+        m_hat = self._predictions["ml_m"]
+
+        if self.score == "partialling out":
+            Y_tilde = y - self._predictions["ml_l"]
+            D_tilde = d - m_hat
+        else:  # "IV-type"
+            Y_tilde = y - self.coef[0] * m_hat - self._predictions["ml_g"]
+            D_tilde = d - m_hat
+        return Y_tilde, D_tilde
+
+    def cate(self, basis: pd.DataFrame, is_gate: bool = False, **kwargs: Any) -> DoubleMLBLP:
+        """
+        Calculate conditional average treatment effects (CATE) for a given basis.
+
+        Builds one ``basis * D_tilde[:, i_rep]`` DataFrame per repetition, fits per-rep
+        OLS via :class:`DoubleMLBLP`, and aggregates coefficients across repetitions.
+
+        Parameters
+        ----------
+        basis : :class:`pandas.DataFrame`
+            The basis for estimating the best linear predictor. Has to have shape
+            ``(n_obs, d)``.
+        is_gate : bool
+            Indicates whether the basis is constructed for GATEs (dummy basis).
+            Default is ``False``.
+        **kwargs : dict
+            Additional keyword arguments passed to
+            :meth:`statsmodels.regression.linear_model.OLS.fit`, e.g. ``cov_type``.
+
+        Returns
+        -------
+        model : :class:`doubleml.DoubleMLBLP`
+            Best linear predictor model.
+        """
+        if self._dml_data.n_treat > 1:
+            raise NotImplementedError(
+                f"Only implemented for single treatment. Number of treatments is {self._dml_data.n_treat}."
+            )
+        if self._predictions is None:
+            raise ValueError("CATE requires a fitted model. Call fit() first.")
+
+        Y_tilde, D_tilde = self._partial_out()
+        basis_per_rep = [basis.multiply(D_tilde[:, i_rep], axis=0) for i_rep in range(self.n_rep)]
+
+        model = DoubleMLBLP(orth_signal=Y_tilde, basis=basis_per_rep, is_gate=is_gate)
+        model.fit(**kwargs)
+        return model
+
+    def gate(self, groups: pd.DataFrame, **kwargs: Any) -> DoubleMLBLP:
+        """
+        Calculate group average treatment effects (GATE) for mutually exclusive groups.
+
+        Parameters
+        ----------
+        groups : :class:`pandas.DataFrame`
+            The group indicator. Either dummy-coded (all bool or all int) with shape ``(n_obs, d)``
+            (one column per group) or ``(n_obs, 1)`` containing the group labels (as str).
+        **kwargs : dict
+            Additional keyword arguments passed to
+            :meth:`statsmodels.regression.linear_model.OLS.fit`, e.g. ``cov_type``.
+
+        Returns
+        -------
+        model : :class:`doubleml.DoubleMLBLP`
+            Best linear predictor model for group effects.
+        """
+        if not isinstance(groups, pd.DataFrame):
+            raise TypeError(f"Groups must be of DataFrame type. Groups of type {str(type(groups))} was passed.")
+
+        if not (all(groups.dtypes == bool) or all(groups.dtypes == int)):  # neither all-bool nor all-int
+            if groups.shape[1] == 1:
+                groups = pd.get_dummies(groups, prefix="Group", prefix_sep="_")
+            else:
+                raise TypeError(
+                    "Columns of groups must be of bool type or int type (dummy coded). "
+                    "Alternatively, groups should only contain one column."
+                )
+
+        if any(groups.sum(0) <= 5):
+            warnings.warn("At least one group effect is estimated with less than 6 observations.")
+
+        return self.cate(groups, is_gate=True, **kwargs)
+
     def _sensitivity_element_est(self) -> dict[str, np.ndarray] | None:
         """
         Compute PLR sensitivity elements vectorized over all repetitions.
diff --git a/doubleml/plm/tests/test_plr.py b/doubleml/plm/tests/test_plr.py
index 67e396c5..62461657 100644
--- a/doubleml/plm/tests/test_plr.py
+++ b/doubleml/plm/tests/test_plr.py
@@ -379,3 +379,31 @@ def test_dml_plr_cate_gate_multiple_rep(score, cov_type):
     assert gate.all_se.shape == (groups.shape[1], 2)
     assert isinstance(gate.confint(), pd.DataFrame)
     assert all(gate.confint().index == groups.columns.tolist())
+
+
+@pytest.mark.ci
+def test_dml_plr_cate_multi_rep_per_rep_correctness():
+    """For n_rep>1 with a multi-column basis, the per-rep BLP fit must use that rep's
+    own D_tilde residuals (not the global broadcasting that the previous expression
+    produced). Verify by comparing against a manual sm.OLS fit on rep 0."""
+    import statsmodels.api as sm
+
+    n = 150
+    np.random.seed(42)
+    obj_dml_data = dml.plm.datasets.make_plr_CCDDHNR2018(n_obs=n)
+    ml_l = LinearRegression()
+    ml_m = LinearRegression()
+    dml_plr_obj = dml.DoubleMLPLR(obj_dml_data, ml_l=ml_l, ml_m=ml_m, n_folds=3, n_rep=3, score="partialling out")
+    dml_plr_obj.fit()
+
+    np.random.seed(7)
+    basis = pd.DataFrame(np.random.normal(0, 1, size=(n, 4)), columns=[f"b{i}" for i in range(4)])
+    cate = dml_plr_obj.cate(basis)
+
+    # Manually replicate the per-rep BLP for rep 0
+    Y_tilde, D_tilde = dml_plr_obj._partial_out()
+    manual_basis_0 = basis.multiply(D_tilde[:, 0], axis=0)
+    manual_blp_0 = sm.OLS(Y_tilde[:, 0], manual_basis_0).fit(cov_type="HC0")
+
+    np.testing.assert_allclose(cate.all_coef[:, 0], manual_blp_0.params, rtol=1e-12)
+    np.testing.assert_allclose(cate.all_se[:, 0], manual_blp_0.bse, rtol=1e-12)
diff --git a/doubleml/plm/tests/test_plr_scalar_cate_gate.py b/doubleml/plm/tests/test_plr_scalar_cate_gate.py
new file mode 100644
index 00000000..7be0f84b
--- /dev/null
+++ b/doubleml/plm/tests/test_plr_scalar_cate_gate.py
@@ -0,0 +1,205 @@
+"""Test cate() and gate() for the PLR scalar model."""
+
+import numpy as np
+import pandas as pd
+import pytest
+from sklearn.linear_model import Lasso
+
+import doubleml as dml
+from doubleml.plm.datasets import make_plr_CCDDHNR2018
+from doubleml.plm.plr_scalar import PLR
+from doubleml.utils.blp import DoubleMLBLP
+
+N_OBS = 200
+N_FOLDS = 3
+BASIS_DIM = 4
+
+
+def _build_plr(n_rep: int, score: str = "partialling out", random_state: int = 42) -> tuple[PLR, pd.DataFrame]:
+    """Build and fit a PLR scalar model with a random basis."""
+    np.random.seed(random_state)
+    data = make_plr_CCDDHNR2018(n_obs=N_OBS, dim_x=5, alpha=0.5, return_type="DoubleMLData")
+
+    ml_l = Lasso(alpha=0.1)
+    ml_m = Lasso(alpha=0.1)
+    ml_g = Lasso(alpha=0.1)
+
+    model = PLR(data, score=score)
+    model.set_learners(ml_l=ml_l, ml_m=ml_m, ml_g=ml_g)
+    model.draw_sample_splitting(n_folds=N_FOLDS, n_rep=n_rep)
+    model.fit()
+
+    basis = pd.DataFrame(
+        np.random.normal(0, 1, size=(N_OBS, BASIS_DIM)),
+        columns=[f"b{i}" for i in range(BASIS_DIM)],
+    )
+    return model, basis
+
+
+@pytest.fixture(scope="module", params=["partialling out", "IV-type"])
+def score(request):
+    return request.param
+
+
+@pytest.fixture(scope="module")
+def fitted_plr_single_rep(score) -> tuple[PLR, pd.DataFrame]:
+    return _build_plr(n_rep=1, score=score)
+
+
+@pytest.fixture(scope="module")
+def fitted_plr_multi_rep(score) -> tuple[PLR, pd.DataFrame]:
+    return _build_plr(n_rep=2, score=score)
+
+
+@pytest.mark.ci
+def test_cate_returns_blp(fitted_plr_single_rep):
+    """cate() returns a fitted DoubleMLBLP instance."""
+    model, basis = fitted_plr_single_rep
+    cate = model.cate(basis)
+    assert isinstance(cate, DoubleMLBLP)
+
+
+@pytest.mark.ci
+def test_cate_confint_shape(fitted_plr_single_rep):
+    """cate().confint() returns a DataFrame with one row per basis column."""
+    model, basis = fitted_plr_single_rep
+    cate = model.cate(basis)
+    ci = cate.confint()
+    assert isinstance(ci, pd.DataFrame)
+    assert ci.shape[0] == BASIS_DIM
+
+
+@pytest.mark.ci
+@pytest.mark.parametrize("cov_type", ["nonrobust", "HC1", "HC3"])
+def test_cate_cov_type_passthrough(fitted_plr_single_rep, cov_type):
+    """The cov_type kwarg propagates through to the underlying OLS fit."""
+    model, basis = fitted_plr_single_rep
+    cate = model.cate(basis, cov_type=cov_type)
+    assert cate.blp_model[0].cov_type == cov_type
+
+
+@pytest.mark.ci
+def test_cate_multi_rep_n_rep(fitted_plr_multi_rep):
+    """cate.n_rep matches the model's n_rep under multi-rep."""
+    model, basis = fitted_plr_multi_rep
+    cate = model.cate(basis)
+    assert cate.n_rep == 2
+    assert isinstance(cate.blp_model, list)
+    assert len(cate.blp_model) == 2
+
+
+@pytest.mark.ci
+def test_cate_multi_rep_shapes(fitted_plr_multi_rep):
+    """all_coef and all_se have shape (BASIS_DIM, n_rep) under multi-rep."""
+    model, basis = fitted_plr_multi_rep
+    cate = model.cate(basis)
+    assert cate.all_coef.shape == (BASIS_DIM, 2)
+    assert cate.all_se.shape == (BASIS_DIM, 2)
+    assert isinstance(cate.confint(), pd.DataFrame)
+    assert isinstance(cate.summary, pd.DataFrame)
+
+
+@pytest.mark.ci
+def test_gate_dummy_coded(fitted_plr_single_rep):
+    """gate() accepts a pre-dummy-coded boolean DataFrame."""
+    model, _ = fitted_plr_single_rep
+    x1 = model._dml_data.x[:, 0]
+    groups = pd.DataFrame({"low": x1 <= np.median(x1), "high": x1 > np.median(x1)})
+    gate = model.gate(groups)
+    assert isinstance(gate, DoubleMLBLP)
+    assert all(gate.confint().index == groups.columns.to_list())
+
+
+@pytest.mark.ci
+def test_gate_single_column_string(fitted_plr_single_rep):
+    """A single-column string DataFrame is auto-converted to dummies."""
+    model, _ = fitted_plr_single_rep
+    np.random.seed(0)
+    groups = pd.DataFrame(np.random.choice(["A", "B"], N_OBS))
+    gate = model.gate(groups)
+    assert isinstance(gate, DoubleMLBLP)
+    assert all(gate.confint().index == ["Group_A", "Group_B"])
+
+
+@pytest.mark.ci
+def test_gate_warns_small_group(fitted_plr_single_rep):
+    """A group with <= 5 observations triggers a UserWarning."""
+    model, _ = fitted_plr_single_rep
+    groups = pd.DataFrame(
+        {
+            "small": np.array([True] * 3 + [False] * (N_OBS - 3)),
+            "large": np.array([False] * 3 + [True] * (N_OBS - 3)),
+        }
+    )
+    with pytest.warns(UserWarning, match=r"At least one group effect is estimated with less than 6 observations"):
+        model.gate(groups)
+
+
+@pytest.mark.ci
+def test_cate_exception_before_fit():
+    """Calling cate() before fit() raises ValueError."""
+    np.random.seed(42)
+    data = make_plr_CCDDHNR2018(n_obs=N_OBS, dim_x=5, alpha=0.5, return_type="DoubleMLData")
+    model = PLR(data, score="partialling out")
+    model.set_learners(ml_l=Lasso(alpha=0.1), ml_m=Lasso(alpha=0.1))
+    basis = pd.DataFrame(np.random.normal(0, 1, size=(N_OBS, BASIS_DIM)))
+    with pytest.raises(ValueError, match=r"requires a fitted model"):
+        model.cate(basis)
+
+
+@pytest.mark.ci
+def test_gate_exception_not_dataframe(fitted_plr_single_rep):
+    """gate() with a non-DataFrame raises TypeError."""
+    model, _ = fitted_plr_single_rep
+    with pytest.raises(TypeError, match=r"DataFrame type"):
+        model.gate(np.zeros((N_OBS, 2)))
+
+
+@pytest.mark.ci
+def test_gate_exception_bad_dtype(fitted_plr_single_rep):
+    """gate() with multi-column non-bool/int data raises TypeError."""
+    model, _ = fitted_plr_single_rep
+    groups = pd.DataFrame(
+        {
+            "g1": np.random.normal(0, 1, N_OBS),
+            "g2": np.random.normal(0, 1, N_OBS),
+        }
+    )
+    with pytest.raises(TypeError, match=r"bool type or int type"):
+        model.gate(groups)
+
+
+@pytest.mark.ci
+@pytest.mark.parametrize("score", ["partialling out", "IV-type"])
+@pytest.mark.parametrize("n_rep", [1, 2])
+def test_cate_vs_legacy(score, n_rep):
+    """CATE coefficients from the new PLR match the legacy DoubleMLPLR."""
+    np.random.seed(42)
+    data = make_plr_CCDDHNR2018(n_obs=N_OBS, dim_x=5, alpha=0.5, return_type="DoubleMLData")
+
+    ml_l = Lasso(alpha=0.1)
+    ml_m = Lasso(alpha=0.1)
+    ml_g = Lasso(alpha=0.1)
+
+    np.random.seed(3141)
+    dml_old = dml.DoubleMLPLR(data, ml_l, ml_m, ml_g, n_folds=N_FOLDS, n_rep=n_rep, score=score)
+    dml_old.fit()
+
+    dml_new = PLR(data, score=score)
+    dml_new.set_learners(ml_l=ml_l, ml_m=ml_m, ml_g=ml_g)
+    dml_new._n_folds = N_FOLDS
+    dml_new._n_rep = n_rep
+    dml_new._smpls = dml_old.smpls
+    dml_new.fit()
+
+    np.random.seed(0)
+    basis = pd.DataFrame(
+        np.random.normal(0, 1, size=(N_OBS, BASIS_DIM)),
+        columns=[f"b{i}" for i in range(BASIS_DIM)],
+    )
+
+    cate_old = dml_old.cate(basis)
+    cate_new = dml_new.cate(basis)
+
+    np.testing.assert_allclose(cate_new.coef, cate_old.coef, rtol=1e-9)
+    np.testing.assert_allclose(cate_new.se, cate_old.se, rtol=1e-9)
diff --git a/doubleml/utils/blp.py b/doubleml/utils/blp.py
index c0b11e18..c5e59d7e 100644
--- a/doubleml/utils/blp.py
+++ b/doubleml/utils/blp.py
@@ -19,9 +19,12 @@ class DoubleMLBLP:
         The orthogonal signal to be predicted. Has to be of shape ``(n_obs,)`` or ``(n_obs, n_rep)``,
         where ``n_obs`` is the number of observations and ``n_rep`` is the number of repetitions.
 
-    basis : :class:`pandas.DataFrame`
-        The basis for estimating the best linear predictor. Has to have the shape ``(n_obs, d)``,
-        where ``n_obs`` is the number of observations and ``d`` is the number of predictors.
+    basis : :class:`pandas.DataFrame` or list of :class:`pandas.DataFrame`
+        The basis for estimating the best linear predictor. Either a single DataFrame of shape
+        ``(n_obs, d)`` (shared across all repetitions) or a list of DataFrames of length ``n_rep``
+        (one basis per repetition, e.g. for PLR CATE where the basis is multiplied by per-rep
+        residuals). When a list is passed, every entry must have the same column names so per-rep
+        coefficients can be aggregated.
 
     is_gate : bool
         Indicates whether the basis is constructed for GATEs (dummy-basis).
@@ -44,16 +47,8 @@ def __init__(self, orth_signal, basis, is_gate=False):
         self._n_rep = self._orth_signal.shape[1]
         self._is_gate = is_gate
 
-        if not isinstance(basis, pd.DataFrame):
-            raise TypeError(f"The basis must be of DataFrame type. Basis of type {str(type(basis))} was passed.")
-        if not basis.columns.is_unique:
-            raise ValueError("Invalid pd.DataFrame: Contains duplicate column names.")
-        if self._orth_signal.shape[0] != basis.shape[0]:
-            raise ValueError(
-                "The number of observations in signal and basis does not match. "
-                f"Got {str(self._orth_signal.shape[0])} and {str(basis.shape[0])}."
-            )
-        self._basis = basis
+        self._basis_list = self._validate_basis(basis, self._orth_signal.shape[0], self._n_rep)
+        self._basis = self._basis_list[0]
 
         # initialize the score and the covariance
         self._blp_model = None
@@ -63,6 +58,46 @@ def __init__(self, orth_signal, basis, is_gate=False):
         self._coef = None
         self._se = None
 
+    @staticmethod
+    def _validate_basis(basis, n_obs, n_rep):
+        """Validate ``basis`` and return a list holding one ``pd.DataFrame`` per repetition.
+
+        ``basis`` is either one ``pd.DataFrame`` (reused for all ``n_rep`` repetitions) or a list of
+        ``n_rep`` DataFrames with identical column names. Raises ``TypeError`` for other types and
+        ``ValueError`` for a wrong list length, mismatched/duplicate columns or rows != ``n_obs``.
+        """
+        if isinstance(basis, pd.DataFrame):
+            basis_list = [basis] * n_rep
+        elif isinstance(basis, list):
+            if len(basis) != n_rep:
+                raise ValueError(f"When basis is a list it must have length n_rep={n_rep}. Got length {len(basis)}.")
+            if not all(isinstance(b, pd.DataFrame) for b in basis):
+                raise TypeError("All entries of basis list must be of DataFrame type.")
+            ref_cols = list(basis[0].columns)
+            for i, b in enumerate(basis[1:], start=1):
+                if list(b.columns) != ref_cols:
+                    raise ValueError(
+                        f"All per-rep bases must have the same column names. "
+                        f"Entry 0 columns: {ref_cols}, entry {i} columns: {list(b.columns)}."
+                    )
+            basis_list = basis
+        else:
+            raise TypeError(
+                f"The basis must be of DataFrame type or a list of DataFrames. "
+                f"Basis of type {str(type(basis))} was passed."
+            )
+
+        if not basis_list[0].columns.is_unique:
+            raise ValueError("Invalid pd.DataFrame: Contains duplicate column names.")
+
+        for i, b in enumerate(basis_list):
+            if b.shape[0] != n_obs:
+                # Only a caller-supplied list of several bases has an entry index worth reporting.
+                entry = f" (basis entry {i})" if isinstance(basis, list) and n_rep > 1 else ""
+                msg = f"The number of observations in signal and basis does not match. Got {n_obs} and {b.shape[0]}"
+                raise ValueError(f"{msg}{entry}.")
+        return basis_list
+
     def __str__(self):
         class_name = self.__class__.__name__
         header = f"================== {class_name} Object ==================\n"
@@ -188,7 +223,7 @@ def fit(self, cov_type="HC0", **kwargs):
         self._blp_model = []
 
         for i_rep in range(self.n_rep):
-            blp_model = sm.OLS(self._orth_signal[:, i_rep], self._basis).fit(cov_type=cov_type, **kwargs)
+            blp_model = sm.OLS(self._orth_signal[:, i_rep], self._basis_list[i_rep]).fit(cov_type=cov_type, **kwargs)
             self._blp_model.append(blp_model)
             self._all_coef[:, i_rep] = np.asarray(blp_model.params)
             self._all_se[:, i_rep] = np.asarray(blp_model.bse)
diff --git a/doubleml/utils/tests/test_blp.py b/doubleml/utils/tests/test_blp.py
index e05f850e..a89e2b0c 100644
--- a/doubleml/utils/tests/test_blp.py
+++ b/doubleml/utils/tests/test_blp.py
@@ -160,7 +160,7 @@ def test_doubleml_exception_blp():
     msg = "The signal must be one- or two-dimensional. Signal of dimensions 3 was passed."
     with pytest.raises(ValueError, match=msg):
         dml.DoubleMLBLP(orth_signal=np.array([[[1]], [[2]]]), basis=random_basis)
-    msg = "The basis must be of DataFrame type. Basis of type <class 'int'> was passed."
+    msg = r"The basis must be of DataFrame type or a list of DataFrames. Basis of type <class 'int'> was passed."
     with pytest.raises(TypeError, match=msg):
         dml.DoubleMLBLP(orth_signal=signal, basis=1)
     msg = "The number of observations in signal and basis does not match. Got 3 and 2."
@@ -200,3 +200,77 @@ def test_doubleml_exception_blp():
     msg = "Invalid basis: DataFrame has to have the exact same number and ordering of columns."
     with pytest.raises(ValueError, match=msg):
         dml_blp_confint.confint(basis=pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6]]), columns=["x_1", "x_2", "x_3"]))
+
+
+@pytest.mark.ci
+def test_blp_per_rep_basis_fits():
+    """A list-of-DataFrames basis fits and exposes per-rep coefficient shapes."""
+    n, d, n_rep = 50, 3, 3
+    np.random.seed(0)
+    signal = np.random.normal(0, 1, size=(n, n_rep))
+    cols = [f"b{i}" for i in range(d)]
+    basis_list = [pd.DataFrame(np.random.normal(0, 1, size=(n, d)), columns=cols) for _ in range(n_rep)]
+
+    blp = dml.DoubleMLBLP(signal, basis_list).fit()
+    assert blp.all_coef.shape == (d, n_rep)
+    assert blp.all_se.shape == (d, n_rep)
+    assert blp.coef.shape == (d,)
+    assert blp.se.shape == (d,)
+
+
+@pytest.mark.ci
+def test_blp_per_rep_basis_matches_shared():
+    """Per-rep list of identical bases yields the same fit as the shared-basis call."""
+    n, d, n_rep = 50, 3, 3
+    np.random.seed(1)
+    signal = np.random.normal(0, 1, size=(n, n_rep))
+    basis = pd.DataFrame(np.random.normal(0, 1, size=(n, d)), columns=[f"b{i}" for i in range(d)])
+
+    blp_shared = dml.DoubleMLBLP(signal, basis).fit()
+    blp_list = dml.DoubleMLBLP(signal, [basis] * n_rep).fit()
+
+    np.testing.assert_allclose(blp_list.all_coef, blp_shared.all_coef, rtol=1e-12)
+    np.testing.assert_allclose(blp_list.all_se, blp_shared.all_se, rtol=1e-12)
+    np.testing.assert_allclose(blp_list.coef, blp_shared.coef, rtol=1e-12)
+
+
+@pytest.mark.ci
+def test_blp_per_rep_basis_wrong_length():
+    """Wrong list length raises ValueError."""
+    n, n_rep = 30, 3
+    signal = np.zeros((n, n_rep))
+    basis = pd.DataFrame(np.zeros((n, 2)), columns=["a", "b"])
+    with pytest.raises(ValueError, match=r"length n_rep=3"):
+        dml.DoubleMLBLP(signal, [basis, basis])
+
+
+@pytest.mark.ci
+def test_blp_per_rep_basis_mismatched_columns():
+    """Per-rep bases with different column names raise ValueError."""
+    n, n_rep = 30, 2
+    signal = np.zeros((n, n_rep))
+    basis_a = pd.DataFrame(np.zeros((n, 2)), columns=["a", "b"])
+    basis_b = pd.DataFrame(np.zeros((n, 2)), columns=["a", "c"])
+    with pytest.raises(ValueError, match=r"same column names"):
+        dml.DoubleMLBLP(signal, [basis_a, basis_b])
+
+
+@pytest.mark.ci
+def test_blp_per_rep_basis_mismatched_n_obs():
+    """Per-rep basis with wrong row count raises ValueError."""
+    n, n_rep = 30, 2
+    signal = np.zeros((n, n_rep))
+    basis_ok = pd.DataFrame(np.zeros((n, 2)), columns=["a", "b"])
+    basis_bad = pd.DataFrame(np.zeros((n - 1, 2)), columns=["a", "b"])
+    with pytest.raises(ValueError, match=r"basis entry 1"):
+        dml.DoubleMLBLP(signal, [basis_ok, basis_bad])
+
+
+@pytest.mark.ci
+def test_blp_per_rep_basis_non_dataframe_entry():
+    """A non-DataFrame entry in the list raises TypeError."""
+    n, n_rep = 30, 2
+    signal = np.zeros((n, n_rep))
+    basis = pd.DataFrame(np.zeros((n, 2)), columns=["a", "b"])
+    with pytest.raises(TypeError, match=r"All entries of basis list must be of DataFrame type"):
+        dml.DoubleMLBLP(signal, [basis, np.zeros((n, 2))])

From 71ef4838d1dab85266bfc33ac869f588dce8bdef Mon Sep 17 00:00:00 2001
From: SvenKlaassen <sven.klaassen@uni-hamburg.de>
Date: Sat, 9 May 2026 14:46:58 +0200
Subject: [PATCH 27/38] feat: Implement PLRVector for multi-treatment partially
 linear regression and add comprehensive tests

---
 .claude/STATUS.md                             |   5 +-
 doubleml/plm/plr_vector.py                    | 139 +++++++++++
 doubleml/plm/tests/test_plr_vector.py         |  52 ++++
 .../plm/tests/test_plr_vector_exceptions.py   | 160 +++++++++++++
 .../test_plr_vector_external_predictions.py   |  83 +++++++
 .../plm/tests/test_plr_vector_return_types.py | 222 ++++++++++++++++++
 doubleml/plm/tests/test_plr_vector_vs_plr.py  | 122 ++++++++++
 7 files changed, 781 insertions(+), 2 deletions(-)
 create mode 100644 doubleml/plm/plr_vector.py
 create mode 100644 doubleml/plm/tests/test_plr_vector.py
 create mode 100644 doubleml/plm/tests/test_plr_vector_exceptions.py
 create mode 100644 doubleml/plm/tests/test_plr_vector_external_predictions.py
 create mode 100644 doubleml/plm/tests/test_plr_vector_return_types.py
 create mode 100644 doubleml/plm/tests/test_plr_vector_vs_plr.py

diff --git a/.claude/STATUS.md b/.claude/STATUS.md
index c358e91f..953b2543 100644
--- a/.claude/STATUS.md
+++ b/.claude/STATUS.md
@@ -30,10 +30,11 @@ nuisance evaluation, and sensitivity analysis.
 - [x] **`DoubleMLBLP` per-rep basis API** — `basis` may be a single `pd.DataFrame` (shared) or a `list[pd.DataFrame]` of length `n_rep`. Also fixes the legacy `DoubleMLPLR.cate()` multi-rep bug (`basis * D_tilde` mis-broadcast for `n_rep>1` and `d_basis>1`).
 - [x] **`DoubleMLVector`** — multi-treatment base class first iteration (`doubleml/double_ml_vector.py`)
 - [x] **BLP multi-rep support** — `doubleml/utils/blp.py`
+- [x] **`PLRVector`** — first concrete `DoubleMLVector` subclass (`doubleml/plm/plr_vector.py`) with 5 test files: `test_plr_vector.py`, `_return_types`, `_exceptions`, `_vs_plr`, `_external_predictions`. Validates exact equivalence with legacy `DoubleMLPLR` for multi-treatment.
 
 ### In Progress
 
-- [ ] **`DoubleMLVector`** — base class exists; no concrete subclass yet
+_(none)_
 
 ### Feature Gaps vs Legacy Classes
 
@@ -58,7 +59,7 @@ Intentionally **not ported**:
 
 | Item | Files | Notes |
 |------|-------|-------|
-| `DoubleMLPLRVector` | `doubleml/plm/plr_vector.py` + tests | First concrete Vector subclass |
+| `DoubleMLIRMVector` | `doubleml/irm/irm_vector.py` + tests | Next concrete Vector subclass |
 | `DoubleMLPLIVScalar` | `doubleml/plm/pliv_scalar.py` + 7 test files | Next scalar model |
 | `DoubleMLPLPRScalar` | `doubleml/plm/plpr_scalar.py` + 7 test files | |
 | DID scalar variants | `doubleml/did/*_scalar.py` | DID, DIDCSBinary, DIDMulti |
diff --git a/doubleml/plm/plr_vector.py b/doubleml/plm/plr_vector.py
new file mode 100644
index 00000000..6a3621bb
--- /dev/null
+++ b/doubleml/plm/plr_vector.py
@@ -0,0 +1,139 @@
+"""Partially Linear Regression (PLR) multi-treatment model based on the DoubleMLVector hierarchy."""
+
+from __future__ import annotations
+
+from typing import Any, Self
+
+from ..data.base_data import DoubleMLData
+from ..double_ml_scalar import DoubleMLScalar
+from ..double_ml_vector import DoubleMLVector
+from .plr_scalar import PLR
+
+
+class PLRVector(DoubleMLVector):
+    """Multi-treatment double machine learning for partially linear regression models.
+
+    Orchestrates one :class:`~doubleml.plm.plr_scalar.PLR` instance per treatment column
+    in ``d_cols``. Sample splits are drawn once and shared across all sub-models;
+    learners are propagated (and cloned per sub-model) via :meth:`set_learners`.
+    The scalar :class:`~doubleml.DoubleMLFramework` objects are concatenated into a
+    single multi-treatment framework after fit.
+
+    Parameters
+    ----------
+    obj_dml_data : DoubleMLData
+        The data object providing the data and specifying the variables for the causal
+        model. May contain multiple treatment columns in ``d_cols``.
+    score : str
+        The score function (``'partialling out'`` or ``'IV-type'``).
+        Default is ``'partialling out'``.
+    ml_l : estimator, optional
+        Learner for E[Y|X]. Can be regressor or classifier.
+    ml_m : estimator, optional
+        Learner for E[D|X]. Can be regressor or classifier.
+    ml_g : estimator, optional
+        Learner for E[Y - D*theta|X]. Only for IV-type. Must be regressor.
+    """
+
+    def __init__(
+        self,
+        obj_dml_data: DoubleMLData,
+        score: str = "partialling out",
+        ml_l: object | None = None,
+        ml_m: object | None = None,
+        ml_g: object | None = None,
+    ) -> None:
+        # Validate at the vector level so the error fires before sub-model construction.
+        self._check_data(obj_dml_data)
+        valid_scores = ["partialling out", "IV-type"]
+        if score not in valid_scores:
+            raise ValueError(f"Invalid score '{score}'. Valid scores: {valid_scores}.")
+        if score == "IV-type" and obj_dml_data.binary_outcome:
+            raise ValueError("For score = 'IV-type', additive probability models (binary outcomes) are not supported.")
+
+        super().__init__(obj_dml_data=obj_dml_data, score=score)
+        self._modellist = self._initialize_models()
+
+        if any(learner is not None for learner in (ml_l, ml_m, ml_g)):
+            self.set_learners(ml_l=ml_l, ml_m=ml_m, ml_g=ml_g)
+
+    @staticmethod
+    def _check_data(obj_dml_data: Any) -> None:
+        """Validate the data object for PLR vector estimation.
+
+        Parameters
+        ----------
+        obj_dml_data : Any
+            Data candidate. Must be a :class:`~doubleml.data.DoubleMLData` without
+            instrumental variables.
+
+        Raises
+        ------
+        TypeError
+            If ``obj_dml_data`` is not a :class:`~doubleml.data.DoubleMLData`.
+        ValueError
+            If ``obj_dml_data`` defines instrumental variables (``z_cols``).
+        """
+        if not isinstance(obj_dml_data, DoubleMLData):
+            raise TypeError(
+                f"The data must be of DoubleMLData type. {str(obj_dml_data)} of type {str(type(obj_dml_data))} was passed."
+            )
+        if obj_dml_data.z_cols is not None:
+            raise ValueError(
+                "Incompatible data. " + " and ".join(obj_dml_data.z_cols) + " have been set as instrumental variable(s). "
+                "PLRVector does not support instrumental variables."
+            )
+
+    @property
+    def required_learners(self) -> list[str]:
+        """Required learners for the current score."""
+        names = ["ml_l", "ml_m"]
+        if self.score == "IV-type":
+            names.append("ml_g")
+        return names
+
+    def set_learners(
+        self,
+        ml_l: object | None = None,
+        ml_m: object | None = None,
+        ml_g: object | None = None,
+    ) -> Self:
+        """Set the learners for nuisance estimation on every sub-model.
+
+        Parameters
+        ----------
+        ml_l : estimator or None, optional
+            Learner for :math:`\\ell_0(X) = E[Y|X]`.
+        ml_m : estimator or None, optional
+            Learner for :math:`m_0(X) = E[D|X]`.
+        ml_g : estimator or None, optional
+            Learner for :math:`g_0(X) = E[Y - D\\theta_0|X]`. Required for ``score='IV-type'``.
+
+        Returns
+        -------
+        self : PLRVector
+        """
+        if self._modellist is None:
+            raise RuntimeError("Sub-models are not initialized. _initialize_models() must run in __init__.")
+        for model in self._modellist:
+            model.set_learners(ml_l=ml_l, ml_m=ml_m, ml_g=ml_g)
+        self._reset_fit_state()
+        return self
+
+    def _initialize_models(self) -> list[DoubleMLScalar]:
+        """Create one PLR sub-model per treatment column."""
+        return [PLR(obj_dml_data=self._get_data_for_model(d_col), score=self.score) for d_col in self._dml_data.d_cols]
+
+    def cate(self, *args: Any, **kwargs: Any) -> Any:
+        """Not implemented for multi-treatment PLR."""
+        raise NotImplementedError(
+            "cate() is not defined for multi-treatment PLR. "
+            "Use the single-treatment PLR (doubleml.plm.plr_scalar.PLR) instead."
+        )
+
+    def gate(self, *args: Any, **kwargs: Any) -> Any:
+        """Not implemented for multi-treatment PLR."""
+        raise NotImplementedError(
+            "gate() is not defined for multi-treatment PLR. "
+            "Use the single-treatment PLR (doubleml.plm.plr_scalar.PLR) instead."
+        )
diff --git a/doubleml/plm/tests/test_plr_vector.py b/doubleml/plm/tests/test_plr_vector.py
new file mode 100644
index 00000000..8798b391
--- /dev/null
+++ b/doubleml/plm/tests/test_plr_vector.py
@@ -0,0 +1,52 @@
+"""Core multi-treatment estimation accuracy for PLRVector."""
+
+import numpy as np
+import pytest
+from sklearn.base import clone
+from sklearn.linear_model import Lasso
+
+import doubleml as dml
+from doubleml.plm.plr_vector import PLRVector
+
+
+@pytest.fixture(scope="module", params=["partialling out", "IV-type"])
+def score(request):
+    return request.param
+
+
+@pytest.fixture(scope="module")
+def fitted_plr_vector_bivariate(generate_data_bivariate, score):
+    """PLRVector fitted on bivariate data with theta = [0.5, 0.9]."""
+    data = generate_data_bivariate
+    x_cols = data.columns[data.columns.str.startswith("X")].tolist()
+    d_cols = data.columns[data.columns.str.startswith("d")].tolist()
+    obj_dml_data = dml.DoubleMLData(data, y_col="y", d_cols=d_cols, x_cols=x_cols)
+
+    learner = Lasso(alpha=0.1)
+    np.random.seed(3141)
+    dml_obj = PLRVector(obj_dml_data, score=score)
+    dml_obj.set_learners(ml_l=clone(learner), ml_m=clone(learner), ml_g=clone(learner) if score == "IV-type" else None)
+    dml_obj.draw_sample_splitting(n_folds=5, n_rep=1)
+    dml_obj.fit()
+    return dml_obj, np.array([0.5, 0.9])
+
+
+@pytest.mark.ci
+def test_coef_within_3_sigma(fitted_plr_vector_bivariate):
+    """All treatment coefficients fall within 3 SE of the true thetas."""
+    dml_obj, true_theta = fitted_plr_vector_bivariate
+    assert np.all(np.abs(dml_obj.coef - true_theta) <= 3.0 * dml_obj.se)
+
+
+@pytest.mark.ci
+def test_se_positive(fitted_plr_vector_bivariate):
+    """Standard errors are strictly positive for every treatment."""
+    dml_obj, _ = fitted_plr_vector_bivariate
+    assert np.all(dml_obj.se > 0)
+
+
+@pytest.mark.ci
+def test_coef_shape_matches_d_cols(fitted_plr_vector_bivariate):
+    """Coefficient vector has one entry per treatment column."""
+    dml_obj, _ = fitted_plr_vector_bivariate
+    assert dml_obj.coef.shape == (len(dml_obj._dml_data.d_cols),)
diff --git a/doubleml/plm/tests/test_plr_vector_exceptions.py b/doubleml/plm/tests/test_plr_vector_exceptions.py
new file mode 100644
index 00000000..0f018405
--- /dev/null
+++ b/doubleml/plm/tests/test_plr_vector_exceptions.py
@@ -0,0 +1,160 @@
+"""Validate PLRVector input validation and error handling."""
+
+import numpy as np
+import pandas as pd
+import pytest
+from sklearn.linear_model import Lasso
+
+import doubleml as dml
+from doubleml.plm.plr_vector import PLRVector
+
+
+def _make_bivariate_data(n_obs: int = 200, dim_x: int = 5) -> dml.DoubleMLData:
+    np.random.seed(42)
+    x = np.random.normal(size=(n_obs, dim_x))
+    d0 = np.random.normal(size=n_obs)
+    d1 = np.random.normal(size=n_obs)
+    y = 0.5 * d0 + 0.9 * d1 + x[:, 0] + np.random.normal(size=n_obs)
+    df = pd.DataFrame(
+        np.column_stack([x, y, d0, d1]),
+        columns=[f"X{i + 1}" for i in range(dim_x)] + ["y", "d1", "d2"],
+    )
+    return dml.DoubleMLData(df, y_col="y", d_cols=["d1", "d2"], x_cols=[f"X{i + 1}" for i in range(dim_x)])
+
+
+def _make_binary_outcome_bivariate_data(n_obs: int = 100) -> dml.DoubleMLData:
+    np.random.seed(11)
+    x = np.random.normal(size=(n_obs, 3))
+    d0 = (np.random.normal(size=n_obs) > 0).astype(float)
+    d1 = (np.random.normal(size=n_obs) > 0).astype(float)
+    y = (np.random.normal(size=n_obs) > 0).astype(float)
+    df = pd.DataFrame({"y": y, "d1": d0, "d2": d1, "X1": x[:, 0], "X2": x[:, 1], "X3": x[:, 2]})
+    return dml.DoubleMLData(df, y_col="y", d_cols=["d1", "d2"], x_cols=["X1", "X2", "X3"])
+
+
+def _make_iv_data(n_obs: int = 200, dim_x: int = 5) -> dml.DoubleMLData:
+    np.random.seed(42)
+    x = np.random.normal(size=(n_obs, dim_x))
+    d0 = np.random.normal(size=n_obs)
+    d1 = np.random.normal(size=n_obs)
+    z = np.random.normal(size=n_obs)
+    y = 0.5 * d0 + 0.9 * d1 + x[:, 0] + np.random.normal(size=n_obs)
+    df = pd.DataFrame(
+        np.column_stack([x, y, d0, d1, z]),
+        columns=[f"X{i + 1}" for i in range(dim_x)] + ["y", "d1", "d2", "Z1"],
+    )
+    return dml.DoubleMLData(
+        df,
+        y_col="y",
+        d_cols=["d1", "d2"],
+        x_cols=[f"X{i + 1}" for i in range(dim_x)],
+        z_cols="Z1",
+    )
+
+
+@pytest.mark.ci
+def test_exception_data_type():
+    """Non-DoubleMLData input is rejected with a TypeError."""
+    msg = r"The data must be of DoubleMLData type\."
+    with pytest.raises(TypeError, match=msg):
+        PLRVector(pd.DataFrame())
+
+
+@pytest.mark.ci
+def test_exception_instrument():
+    """Data carrying instrumental variables (z_cols) is rejected."""
+    msg = r"Incompatible data\. .* have been set as instrumental variable\(s\)\."
+    with pytest.raises(ValueError, match=msg):
+        PLRVector(_make_iv_data())
+
+
+@pytest.mark.ci
+def test_exception_invalid_score():
+    """Unknown score string is rejected at construction."""
+    msg = r"Invalid score 'invalid'\."
+    with pytest.raises(ValueError, match=msg):
+        PLRVector(_make_bivariate_data(), score="invalid")
+
+
+@pytest.mark.ci
+def test_exception_iv_type_binary_outcome():
+    """IV-type score with binary outcome is rejected."""
+    msg = r"For score = 'IV-type', additive probability models \(binary outcomes\) are not supported\."
+    with pytest.raises(ValueError, match=msg):
+        PLRVector(_make_binary_outcome_bivariate_data(), score="IV-type")
+
+
+@pytest.mark.ci
+def test_exception_n_folds():
+    """draw_sample_splitting rejects n_folds < 2."""
+    dml_obj = PLRVector(_make_bivariate_data())
+    msg = r"n_folds must be an integer >= 2\."
+    with pytest.raises(ValueError, match=msg):
+        dml_obj.draw_sample_splitting(n_folds=1)
+
+
+@pytest.mark.ci
+def test_exception_n_rep():
+    """draw_sample_splitting rejects n_rep < 1."""
+    dml_obj = PLRVector(_make_bivariate_data())
+    msg = r"n_rep must be an integer >= 1\."
+    with pytest.raises(ValueError, match=msg):
+        dml_obj.draw_sample_splitting(n_rep=0)
+
+
+@pytest.mark.ci
+def test_exception_missing_learner():
+    """fit() fails when no learners are registered."""
+    dml_obj = PLRVector(_make_bivariate_data())
+    dml_obj.draw_sample_splitting()
+    msg = r"Learner 'ml_l' is required but not set"
+    with pytest.raises(ValueError, match=msg):
+        dml_obj.fit()
+
+
+@pytest.mark.ci
+def test_exception_missing_partial_learner():
+    """fit() fails when ml_m is missing."""
+    dml_obj = PLRVector(_make_bivariate_data())
+    dml_obj.set_learners(ml_l=Lasso(alpha=0.1))
+    dml_obj.draw_sample_splitting()
+    msg = r"Learner 'ml_m' is required but not set"
+    with pytest.raises(ValueError, match=msg):
+        dml_obj.fit()
+
+
+@pytest.mark.ci
+def test_exception_invalid_learner_class():
+    """Passing a class instead of an instance raises TypeError."""
+    dml_obj = PLRVector(_make_bivariate_data())
+    msg = r"Invalid learner provided for ml_l: provide an instance"
+    with pytest.raises(TypeError, match=msg):
+        dml_obj.set_learners(ml_l=Lasso)
+
+
+@pytest.mark.ci
+def test_warning_ml_g_partialling_out():
+    """Passing ml_g with score='partialling out' triggers a UserWarning."""
+    dml_obj = PLRVector(_make_bivariate_data(), score="partialling out")
+    with pytest.warns(UserWarning, match=r"not required for score.*ignored"):
+        dml_obj.set_learners(ml_l=Lasso(alpha=0.1), ml_m=Lasso(alpha=0.1), ml_g=Lasso(alpha=0.1))
+
+
+@pytest.mark.ci
+def test_cate_not_implemented():
+    """cate() raises NotImplementedError on multi-treatment PLR."""
+    dml_obj = PLRVector(_make_bivariate_data())
+    dml_obj.set_learners(ml_l=Lasso(alpha=0.1), ml_m=Lasso(alpha=0.1))
+    dml_obj.fit(n_folds=3)
+    with pytest.raises(NotImplementedError, match=r"cate\(\) is not defined for multi-treatment PLR"):
+        dml_obj.cate(pd.DataFrame({"const": np.ones(200)}))
+
+
+@pytest.mark.ci
+def test_gate_not_implemented():
+    """gate() raises NotImplementedError on multi-treatment PLR."""
+    dml_obj = PLRVector(_make_bivariate_data())
+    dml_obj.set_learners(ml_l=Lasso(alpha=0.1), ml_m=Lasso(alpha=0.1))
+    dml_obj.fit(n_folds=3)
+    with pytest.raises(NotImplementedError, match=r"gate\(\) is not defined for multi-treatment PLR"):
+        dml_obj.gate(pd.DataFrame({"g": np.ones(200, dtype=bool)}))
diff --git a/doubleml/plm/tests/test_plr_vector_external_predictions.py b/doubleml/plm/tests/test_plr_vector_external_predictions.py
new file mode 100644
index 00000000..f4316c6f
--- /dev/null
+++ b/doubleml/plm/tests/test_plr_vector_external_predictions.py
@@ -0,0 +1,83 @@
+"""External predictions equivalence for PLRVector across multiple treatments."""
+
+import math
+
+import numpy as np
+import pandas as pd
+import pytest
+from sklearn.linear_model import LinearRegression
+
+import doubleml as dml
+from doubleml.plm.plr_vector import PLRVector
+
+
+def _make_bivariate_data(n_obs: int = 300, dim_x: int = 5) -> dml.DoubleMLData:
+    np.random.seed(42)
+    x = np.random.normal(size=(n_obs, dim_x))
+    d0 = np.random.normal(size=n_obs)
+    d1 = np.random.normal(size=n_obs)
+    y = 0.5 * d0 + 0.9 * d1 + x[:, 0] + np.random.normal(size=n_obs)
+    df = pd.DataFrame(
+        np.column_stack([x, y, d0, d1]),
+        columns=[f"X{i + 1}" for i in range(dim_x)] + ["y", "d1", "d2"],
+    )
+    return dml.DoubleMLData(df, y_col="y", d_cols=["d1", "d2"], x_cols=[f"X{i + 1}" for i in range(dim_x)])
+
+
+@pytest.fixture(scope="module", params=["partialling out", "IV-type"])
+def score(request):
+    return request.param
+
+
+@pytest.fixture(scope="module", params=[1, 3])
+def n_rep(request):
+    return request.param
+
+
+@pytest.fixture(scope="module")
+def external_predictions_fixture(score, n_rep):
+    """Fit a reference PLRVector and a second one consuming its predictions externally."""
+    n_folds = 3
+    obj_dml_data = _make_bivariate_data()
+    learner_kwargs: dict[str, object] = {"ml_l": LinearRegression(), "ml_m": LinearRegression()}
+    if score == "IV-type":
+        learner_kwargs["ml_g"] = LinearRegression()
+
+    np.random.seed(3141)
+    dml_ref = PLRVector(obj_dml_data, score=score)
+    dml_ref.set_learners(**learner_kwargs)
+    dml_ref.draw_sample_splitting(n_folds=n_folds, n_rep=n_rep)
+    dml_ref.fit()
+
+    # Build external predictions per treatment, replicating every required learner.
+    learner_names = ["ml_l", "ml_m"] + (["ml_g"] if score == "IV-type" else [])
+    external_predictions = {
+        d_col: {name: dml_ref.modellist[i]._predictions[name] for name in learner_names}
+        for i, d_col in enumerate(obj_dml_data.d_cols)
+    }
+
+    # Fit a fresh PLRVector consuming the external predictions on identical splits.
+    dml_ext = PLRVector(obj_dml_data, score=score)
+    dml_ext.set_learners(**learner_kwargs)
+    dml_ext.set_sample_splitting(dml_ref.smpls)
+    dml_ext.fit(external_predictions=external_predictions)
+
+    return {"ref": dml_ref, "ext": dml_ext}
+
+
+@pytest.mark.ci
+def test_coef_matches_external(external_predictions_fixture):
+    """Per-treatment coefficients match the reference fit when fed via external_predictions."""
+    ref = external_predictions_fixture["ref"]
+    ext = external_predictions_fixture["ext"]
+    for i in range(ref.coef.shape[0]):
+        assert math.isclose(ref.coef[i], ext.coef[i], rel_tol=1e-9, abs_tol=1e-4)
+
+
+@pytest.mark.ci
+def test_se_matches_external(external_predictions_fixture):
+    """Per-treatment standard errors match the reference fit when fed via external_predictions."""
+    ref = external_predictions_fixture["ref"]
+    ext = external_predictions_fixture["ext"]
+    for i in range(ref.se.shape[0]):
+        assert math.isclose(ref.se[i], ext.se[i], rel_tol=1e-9, abs_tol=1e-4)
diff --git a/doubleml/plm/tests/test_plr_vector_return_types.py b/doubleml/plm/tests/test_plr_vector_return_types.py
new file mode 100644
index 00000000..65346e17
--- /dev/null
+++ b/doubleml/plm/tests/test_plr_vector_return_types.py
@@ -0,0 +1,222 @@
+"""Validate PLRVector return types and reset behavior."""
+
+import numpy as np
+import pandas as pd
+import pytest
+from sklearn.linear_model import LinearRegression
+
+import doubleml as dml
+from doubleml.plm.plr_vector import PLRVector
+
+N_OBS = 200
+N_FOLDS = 3
+N_REP = 2
+N_REP_BOOT = 251
+
+
+def _make_data(n_obs: int = N_OBS, dim_x: int = 5) -> dml.DoubleMLData:
+    """Build a small bivariate-treatment DoubleMLData for return-type tests."""
+    np.random.seed(7)
+    x = np.random.normal(size=(n_obs, dim_x))
+    d0 = np.random.normal(size=n_obs)
+    d1 = np.random.normal(size=n_obs)
+    y = 0.5 * d0 + 0.9 * d1 + x[:, 0] + np.random.normal(size=n_obs)
+    df = pd.DataFrame(
+        np.column_stack([x, y, d0, d1]),
+        columns=[f"X{i + 1}" for i in range(dim_x)] + ["y", "d1", "d2"],
+    )
+    return dml.DoubleMLData(df, y_col="y", d_cols=["d1", "d2"], x_cols=[f"X{i + 1}" for i in range(dim_x)])
+
+
+N_TREAT = 2  # tied to _make_data
+
+
+@pytest.fixture(scope="module")
+def fitted_plr_vector():
+    """Fit a PLRVector once and share across tests."""
+    np.random.seed(3141)
+    obj_dml_data = _make_data()
+    dml_obj = PLRVector(obj_dml_data)
+    dml_obj.set_learners(ml_l=LinearRegression(), ml_m=LinearRegression())
+    dml_obj.draw_sample_splitting(n_folds=N_FOLDS, n_rep=N_REP)
+    dml_obj.fit()
+    dml_obj.bootstrap(n_rep_boot=N_REP_BOOT)
+    return dml_obj
+
+
+@pytest.mark.ci
+def test_coef_type_and_shape(fitted_plr_vector):
+    """coef is a 1D array with one entry per treatment."""
+    assert isinstance(fitted_plr_vector.coef, np.ndarray)
+    assert fitted_plr_vector.coef.shape == (N_TREAT,)
+
+
+@pytest.mark.ci
+def test_se_type_and_shape(fitted_plr_vector):
+    """se is a 1D array with one entry per treatment."""
+    assert isinstance(fitted_plr_vector.se, np.ndarray)
+    assert fitted_plr_vector.se.shape == (N_TREAT,)
+
+
+@pytest.mark.ci
+def test_all_thetas_shape(fitted_plr_vector):
+    """all_thetas is (n_treat, n_rep)."""
+    assert fitted_plr_vector.all_thetas.shape == (N_TREAT, N_REP)
+
+
+@pytest.mark.ci
+def test_all_ses_shape(fitted_plr_vector):
+    """all_ses is (n_treat, n_rep)."""
+    assert fitted_plr_vector.all_ses.shape == (N_TREAT, N_REP)
+
+
+@pytest.mark.ci
+def test_summary_index_matches_d_cols(fitted_plr_vector):
+    """summary is a DataFrame indexed by d_cols in declaration order."""
+    summary = fitted_plr_vector.summary
+    assert isinstance(summary, pd.DataFrame)
+    assert summary.shape[0] == N_TREAT
+    assert summary.index.tolist() == ["d1", "d2"]
+
+
+@pytest.mark.ci
+def test_confint_shape(fitted_plr_vector):
+    """confint returns (n_treat, 2) DataFrame."""
+    ci = fitted_plr_vector.confint()
+    assert isinstance(ci, pd.DataFrame)
+    assert ci.shape == (N_TREAT, 2)
+
+
+@pytest.mark.ci
+def test_confint_joint_shape(fitted_plr_vector):
+    """confint(joint=True) returns (n_treat, 2) DataFrame after bootstrap."""
+    ci = fitted_plr_vector.confint(joint=True)
+    assert isinstance(ci, pd.DataFrame)
+    assert ci.shape == (N_TREAT, 2)
+
+
+@pytest.mark.ci
+def test_psi_shape(fitted_plr_vector):
+    """psi has shape (n_obs, n_treat, n_rep)."""
+    assert fitted_plr_vector.psi.shape == (N_OBS, N_TREAT, N_REP)
+
+
+@pytest.mark.ci
+def test_modellist_length_and_type(fitted_plr_vector):
+    """modellist exposes one PLR scalar sub-model per treatment."""
+    from doubleml.plm.plr_scalar import PLR
+
+    models = fitted_plr_vector.modellist
+    assert isinstance(models, list)
+    assert len(models) == N_TREAT
+    assert all(isinstance(m, PLR) for m in models)
+
+
+@pytest.mark.ci
+def test_smpls_shared_across_submodels(fitted_plr_vector):
+    """Sample splits are propagated identically into each sub-model."""
+    parent_smpls = fitted_plr_vector.smpls
+    for model in fitted_plr_vector.modellist:
+        for i_rep in range(N_REP):
+            for j_fold in range(N_FOLDS):
+                np.testing.assert_array_equal(model.smpls[i_rep][j_fold][0], parent_smpls[i_rep][j_fold][0])
+                np.testing.assert_array_equal(model.smpls[i_rep][j_fold][1], parent_smpls[i_rep][j_fold][1])
+
+
+@pytest.mark.ci
+def test_n_properties(fitted_plr_vector):
+    """n_obs, n_folds, n_rep, score reflect configuration."""
+    assert fitted_plr_vector.n_obs == N_OBS
+    assert fitted_plr_vector.n_folds == N_FOLDS
+    assert fitted_plr_vector.n_rep == N_REP
+    assert fitted_plr_vector.score == "partialling out"
+
+
+@pytest.mark.ci
+def test_required_learners(fitted_plr_vector):
+    """required_learners is score-dependent and matches scalar PLR."""
+    assert fitted_plr_vector.required_learners == ["ml_l", "ml_m"]
+
+
+@pytest.mark.ci
+def test_get_params_returns_per_submodel_list(fitted_plr_vector):
+    """get_params returns one parameter dict per sub-model, in d_cols order."""
+    params = fitted_plr_vector.get_params("ml_l")
+    assert isinstance(params, list)
+    assert len(params) == N_TREAT
+    for p in params:
+        assert isinstance(p, dict)
+        assert "fit_intercept" in p
+
+
+@pytest.mark.ci
+def test_set_params_updates_all_submodels(fitted_plr_vector):
+    """set_params propagates to every sub-model and returns self."""
+    result = fitted_plr_vector.set_params("ml_l", fit_intercept=False)
+    assert result is fitted_plr_vector
+    params = fitted_plr_vector.get_params("ml_l")
+    assert all(p["fit_intercept"] is False for p in params)
+    fitted_plr_vector.set_params("ml_l", fit_intercept=True)
+
+
+@pytest.mark.ci
+def test_sensitivity_elements_shape(fitted_plr_vector):
+    """sensitivity_elements exposes framework-level keys with multi-treatment shapes."""
+    elems = fitted_plr_vector.sensitivity_elements
+    assert isinstance(elems, dict)
+    for key in ["sigma2", "nu2", "max_bias"]:
+        assert elems[key].shape == (1, N_TREAT, N_REP)
+    assert elems["psi_max_bias"].shape == (N_OBS, N_TREAT, N_REP)
+
+
+@pytest.mark.ci
+def test_treatment_names_set_on_framework(fitted_plr_vector):
+    """treatment_names on the framework match d_cols."""
+    assert fitted_plr_vector.framework.treatment_names == ["d1", "d2"]
+
+
+@pytest.mark.ci
+def test_before_fit_raises():
+    """Properties relying on framework raise before fit()."""
+    np.random.seed(3141)
+    dml_obj = PLRVector(_make_data())
+    with pytest.raises(ValueError, match="framework is not yet initialized"):
+        _ = dml_obj.coef
+
+
+@pytest.mark.ci
+def test_reset_after_draw_sample_splitting():
+    """draw_sample_splitting clears framework and fitted properties on vector and sub-models."""
+    np.random.seed(3141)
+    dml_obj = PLRVector(_make_data())
+    dml_obj.set_learners(ml_l=LinearRegression(), ml_m=LinearRegression())
+    dml_obj.draw_sample_splitting(n_folds=N_FOLDS, n_rep=N_REP)
+    dml_obj.fit()
+    _ = dml_obj.framework
+    _ = dml_obj.coef
+
+    dml_obj.draw_sample_splitting(n_folds=N_FOLDS, n_rep=N_REP)
+    with pytest.raises(ValueError, match="framework is not yet initialized"):
+        _ = dml_obj.framework
+    with pytest.raises(ValueError, match="framework is not yet initialized"):
+        _ = dml_obj.coef
+    for model in dml_obj.modellist:
+        with pytest.raises(ValueError, match="framework is not yet initialized"):
+            _ = model.framework
+
+
+@pytest.mark.ci
+def test_reset_after_set_learners():
+    """set_learners after fit clears the vector framework so stale results aren't returned."""
+    np.random.seed(3141)
+    dml_obj = PLRVector(_make_data())
+    dml_obj.set_learners(ml_l=LinearRegression(), ml_m=LinearRegression())
+    dml_obj.fit(n_folds=N_FOLDS, n_rep=N_REP)
+    _ = dml_obj.framework
+
+    dml_obj.set_learners(ml_l=LinearRegression(), ml_m=LinearRegression())
+    with pytest.raises(ValueError, match="framework is not yet initialized"):
+        _ = dml_obj.framework
+    for model in dml_obj.modellist:
+        with pytest.raises(ValueError, match="framework is not yet initialized"):
+            _ = model.framework
diff --git a/doubleml/plm/tests/test_plr_vector_vs_plr.py b/doubleml/plm/tests/test_plr_vector_vs_plr.py
new file mode 100644
index 00000000..58e28d4a
--- /dev/null
+++ b/doubleml/plm/tests/test_plr_vector_vs_plr.py
@@ -0,0 +1,122 @@
+"""Compare PLRVector against the legacy DoubleMLPLR implementation for multi-treatment data."""
+
+import numpy as np
+import pytest
+from sklearn.base import clone
+from sklearn.linear_model import Lasso, LinearRegression
+
+import doubleml as dml
+from doubleml.plm.plr_vector import PLRVector
+
+
+@pytest.fixture(scope="module", params=[LinearRegression(), Lasso(alpha=0.1)])
+def learner(request):
+    return request.param
+
+
+@pytest.fixture(scope="module", params=["partialling out", "IV-type"])
+def score(request):
+    return request.param
+
+
+@pytest.fixture(scope="module", params=[1, 3])
+def n_rep(request):
+    return request.param
+
+
+@pytest.fixture(scope="module")
+def comparison_fixture(generate_data_bivariate, learner, score, n_rep):
+    n_folds = 3
+    seed = 3141
+    data = generate_data_bivariate
+    x_cols = data.columns[data.columns.str.startswith("X")].tolist()
+    d_cols = data.columns[data.columns.str.startswith("d")].tolist()
+
+    obj_dml_data = dml.DoubleMLData(data, y_col="y", d_cols=d_cols, x_cols=x_cols)
+
+    ml_g_arg = clone(learner) if score == "IV-type" else None
+
+    # Legacy DoubleMLPLR draws splits in __init__
+    np.random.seed(seed)
+    dml_old = dml.DoubleMLPLR(
+        obj_dml_data,
+        clone(learner),
+        clone(learner),
+        ml_g_arg,
+        n_folds=n_folds,
+        n_rep=n_rep,
+        score=score,
+    )
+    dml_old.fit()
+
+    # New PLRVector draws splits explicitly via draw_sample_splitting
+    np.random.seed(seed)
+    dml_new = PLRVector(obj_dml_data, score=score)
+    dml_new.set_learners(ml_l=clone(learner), ml_m=clone(learner), ml_g=ml_g_arg)
+    dml_new.draw_sample_splitting(n_folds=n_folds, n_rep=n_rep)
+    dml_new.fit()
+
+    return {"old": dml_old, "new": dml_new}
+
+
+@pytest.mark.ci
+def test_coef_equal(comparison_fixture):
+    """PLRVector.coef matches legacy DoubleMLPLR.coef per treatment."""
+    old = comparison_fixture["old"]
+    new = comparison_fixture["new"]
+    np.testing.assert_allclose(new.coef, old.coef, rtol=1e-9)
+
+
+@pytest.mark.ci
+def test_se_equal(comparison_fixture):
+    """PLRVector.se matches legacy DoubleMLPLR.se per treatment."""
+    old = comparison_fixture["old"]
+    new = comparison_fixture["new"]
+    np.testing.assert_allclose(new.se, old.se, rtol=1e-9)
+
+
+@pytest.mark.ci
+def test_all_coef_equal(comparison_fixture):
+    """PLRVector.all_thetas matches legacy DoubleMLPLR.all_coef."""
+    old = comparison_fixture["old"]
+    new = comparison_fixture["new"]
+    np.testing.assert_allclose(new.all_thetas, old.all_coef, rtol=1e-9)
+
+
+@pytest.mark.ci
+def test_all_se_equal(comparison_fixture):
+    """PLRVector.all_ses matches legacy DoubleMLPLR.all_se."""
+    old = comparison_fixture["old"]
+    new = comparison_fixture["new"]
+    np.testing.assert_allclose(new.all_ses, old.all_se, rtol=1e-9)
+
+
+@pytest.mark.ci
+def test_sensitivity_sigma2_equal(comparison_fixture):
+    """PLRVector sigma2 matches legacy DoubleMLPLR sensitivity_elements['sigma2'] after axis swap."""
+    old = comparison_fixture["old"]
+    new = comparison_fixture["new"]
+    # Legacy shape: (1, n_rep, n_treat); vector shape: (1, n_treat, n_rep). Transpose to align.
+    old_sigma2 = np.transpose(old.sensitivity_elements["sigma2"], (0, 2, 1))
+    np.testing.assert_allclose(new.sensitivity_elements["sigma2"], old_sigma2, rtol=1e-9)
+
+
+@pytest.mark.ci
+def test_sensitivity_nu2_equal(comparison_fixture):
+    """PLRVector nu2 matches legacy DoubleMLPLR sensitivity_elements['nu2'] after axis swap."""
+    old = comparison_fixture["old"]
+    new = comparison_fixture["new"]
+    old_nu2 = np.transpose(old.sensitivity_elements["nu2"], (0, 2, 1))
+    np.testing.assert_allclose(new.sensitivity_elements["nu2"], old_nu2, rtol=1e-9)
+
+
+@pytest.mark.ci
+def test_sensitivity_max_bias_equal(comparison_fixture):
+    """PLRVector framework max_bias matches legacy DoubleMLPLR framework max_bias."""
+    old = comparison_fixture["old"]
+    new = comparison_fixture["new"]
+    np.testing.assert_allclose(
+        new.framework.sensitivity_elements["max_bias"],
+        old.framework.sensitivity_elements["max_bias"],
+        rtol=1e-9,
+    )

From 1ae721c9307b24082e99faf489305f86aff72c06 Mon Sep 17 00:00:00 2001
From: SvenKlaassen <sven.klaassen@uni-hamburg.de>
Date: Sat, 9 May 2026 16:44:17 +0200
Subject: [PATCH 28/38] refactor: move Self type hint import to
 typing_extensions for 3.10

---
 doubleml/double_ml_base.py   | 3 ++-
 doubleml/double_ml_scalar.py | 4 +++-
 doubleml/double_ml_vector.py | 4 +++-
 doubleml/irm/irm_scalar.py   | 3 ++-
 doubleml/plm/plr_scalar.py   | 3 ++-
 doubleml/plm/plr_vector.py   | 4 +++-
 6 files changed, 15 insertions(+), 6 deletions(-)

diff --git a/doubleml/double_ml_base.py b/doubleml/double_ml_base.py
index 645e3ed6..df0affa8 100644
--- a/doubleml/double_ml_base.py
+++ b/doubleml/double_ml_base.py
@@ -3,10 +3,11 @@
 """
 
 from abc import ABC, abstractmethod
-from typing import Dict, Optional, Self
+from typing import Dict, Optional
 
 import numpy as np
 import pandas as pd
+from typing_extensions import Self
 
 from .data.base_data import DoubleMLBaseData
 from .double_ml_framework import DoubleMLFramework
diff --git a/doubleml/double_ml_scalar.py b/doubleml/double_ml_scalar.py
index d460b824..39bd495e 100644
--- a/doubleml/double_ml_scalar.py
+++ b/doubleml/double_ml_scalar.py
@@ -4,7 +4,9 @@
 
 import warnings
 from abc import ABC, abstractmethod
-from typing import TYPE_CHECKING, Any, Callable, ClassVar, Self
+from typing import TYPE_CHECKING, Any, Callable, ClassVar
+
+from typing_extensions import Self
 
 if TYPE_CHECKING:
     from .utils._tune_optuna import DMLOptunaResult
diff --git a/doubleml/double_ml_vector.py b/doubleml/double_ml_vector.py
index f6f1e376..e2a7c7d7 100644
--- a/doubleml/double_ml_vector.py
+++ b/doubleml/double_ml_vector.py
@@ -4,7 +4,9 @@
 
 import copy
 from abc import ABC, abstractmethod
-from typing import TYPE_CHECKING, Any, Self
+from typing import TYPE_CHECKING, Any
+
+from typing_extensions import Self
 
 if TYPE_CHECKING:
     from .utils._tune_optuna import DMLOptunaResult
diff --git a/doubleml/irm/irm_scalar.py b/doubleml/irm/irm_scalar.py
index ac44a346..1ceac4af 100644
--- a/doubleml/irm/irm_scalar.py
+++ b/doubleml/irm/irm_scalar.py
@@ -5,12 +5,13 @@
 from __future__ import annotations
 
 import warnings
-from typing import Any, ClassVar, Self
+from typing import Any, ClassVar
 
 import numpy as np
 import pandas as pd
 from sklearn.base import clone
 from sklearn.utils.multiclass import type_of_target
+from typing_extensions import Self
 
 from ..data.base_data import DoubleMLData
 from ..double_ml_linear_score import LinearScoreMixin
diff --git a/doubleml/plm/plr_scalar.py b/doubleml/plm/plr_scalar.py
index 4451e267..2831453e 100644
--- a/doubleml/plm/plr_scalar.py
+++ b/doubleml/plm/plr_scalar.py
@@ -5,12 +5,13 @@
 from __future__ import annotations
 
 import warnings
-from typing import Any, ClassVar, Self
+from typing import Any, ClassVar
 
 import numpy as np
 import pandas as pd
 from sklearn.base import clone
 from sklearn.model_selection import cross_val_predict
+from typing_extensions import Self
 
 from ..data.base_data import DoubleMLData
 from ..double_ml_linear_score import LinearScoreMixin
diff --git a/doubleml/plm/plr_vector.py b/doubleml/plm/plr_vector.py
index 6a3621bb..87b55764 100644
--- a/doubleml/plm/plr_vector.py
+++ b/doubleml/plm/plr_vector.py
@@ -2,7 +2,9 @@
 
 from __future__ import annotations
 
-from typing import Any, Self
+from typing import Any
+
+from typing_extensions import Self
 
 from ..data.base_data import DoubleMLData
 from ..double_ml_scalar import DoubleMLScalar

From f1c0bcdaa74baede9c80dd2459031a906ffa7f6d Mon Sep 17 00:00:00 2001
From: SvenKlaassen <sven.klaassen@uni-hamburg.de>
Date: Sat, 9 May 2026 16:54:37 +0200
Subject: [PATCH 29/38] Fix high priority codacy issues: update set_learners
 method signature and enhance error handling in PLR and LearnerSpec validation

---
 doubleml/double_ml_scalar.py         |  7 +------
 doubleml/plm/plr_scalar.py           |  5 +++--
 doubleml/utils/_checks.py            |  4 +++-
 doubleml/utils/_learner.py           |  5 ++++-
 doubleml/utils/tests/test_learner.py | 13 +++++++++++++
 5 files changed, 24 insertions(+), 10 deletions(-)
 create mode 100644 doubleml/utils/tests/test_learner.py

diff --git a/doubleml/double_ml_scalar.py b/doubleml/double_ml_scalar.py
index 39bd495e..10f74365 100644
--- a/doubleml/double_ml_scalar.py
+++ b/doubleml/double_ml_scalar.py
@@ -397,18 +397,13 @@ def _register_learner(self, name: str, learner: object) -> None:
         self._learners[name] = info
 
     @abstractmethod
-    def set_learners(self, **kwargs: object) -> Self:
+    def set_learners(self) -> Self:
         """
         Set the learners for nuisance estimation.
 
         Subclasses must implement this method with explicit keyword arguments
         for each learner (e.g., ``ml_l``, ``ml_m``, ``ml_g`` for PLR).
 
-        Parameters
-        ----------
-        **kwargs
-            Learner keyword arguments specific to the subclass.
-
         Returns
         -------
         self : Self
diff --git a/doubleml/plm/plr_scalar.py b/doubleml/plm/plr_scalar.py
index 2831453e..77980881 100644
--- a/doubleml/plm/plr_scalar.py
+++ b/doubleml/plm/plr_scalar.py
@@ -388,11 +388,12 @@ def _get_score_elements(self) -> dict[str, np.ndarray]:
             u_hat = y[:, np.newaxis] - l_hat
             psi_a = -v_hat * v_hat
             psi_b = v_hat * u_hat
-        else:
-            assert self.score == "IV-type"
+        elif self.score == "IV-type":
             g_hat = self._predictions["ml_g"]
             psi_a = -v_hat * d[:, np.newaxis]
             psi_b = v_hat * (y[:, np.newaxis] - g_hat)
+        else:
+            raise ValueError(f"Invalid score '{self.score}'.")
 
         return {"psi_a": psi_a, "psi_b": psi_b}
 
diff --git a/doubleml/utils/_checks.py b/doubleml/utils/_checks.py
index 8adfafc1..3b065031 100644
--- a/doubleml/utils/_checks.py
+++ b/doubleml/utils/_checks.py
@@ -556,6 +556,9 @@ def _check_learner(learner, learner_name, regressor=True, classifier=True):
     err_msg_prefix = f"Invalid learner provided for {learner_name}: "
     warn_msg_prefix = f"Learner provided for {learner_name} is probably invalid: "
 
+    if not (regressor or classifier):
+        raise ValueError("At least one of regressor or classifier must be True.")
+
     if isinstance(learner, type):
         raise TypeError(err_msg_prefix + "provide an instance of a learner instead of a class.")
 
@@ -583,7 +586,6 @@ def _check_learner(learner, learner_name, regressor=True, classifier=True):
             warnings.warn(warn_msg_prefix + f"{str(learner)} is (probably) no classifier.")
         learner_is_classifier = True
     else:
-        assert regressor  # classifier, regressor or both must be True
         if not is_regressor(learner):
             warnings.warn(warn_msg_prefix + f"{str(learner)} is (probably) no regressor.")
         learner_is_classifier = False
diff --git a/doubleml/utils/_learner.py b/doubleml/utils/_learner.py
index 04659c98..5c31a16a 100644
--- a/doubleml/utils/_learner.py
+++ b/doubleml/utils/_learner.py
@@ -36,6 +36,10 @@ class LearnerSpec:
     allow_classifier: bool = True
     binary_data_check: Optional[Literal["outcome", "treatment"]] = None
 
+    def __post_init__(self) -> None:
+        if not (self.allow_regressor or self.allow_classifier):
+            raise ValueError(f"LearnerSpec '{self.name}': at least one of allow_regressor or allow_classifier must be True.")
+
 
 @dataclass
 class LearnerInfo:
@@ -127,7 +131,6 @@ def validate_learner(
             warnings.warn(warn_msg_prefix + f"{str(learner)} is (probably) no classifier.")
         learner_is_classifier = True
     else:
-        assert spec.allow_regressor  # At least one must be True
         if not is_regressor(learner):
             warnings.warn(warn_msg_prefix + f"{str(learner)} is (probably) no regressor.")
         learner_is_classifier = False
diff --git a/doubleml/utils/tests/test_learner.py b/doubleml/utils/tests/test_learner.py
new file mode 100644
index 00000000..07d67c74
--- /dev/null
+++ b/doubleml/utils/tests/test_learner.py
@@ -0,0 +1,13 @@
+"""Tests for LearnerSpec validation in doubleml.utils._learner."""
+
+import pytest
+
+from doubleml.utils._learner import LearnerSpec
+
+
+@pytest.mark.ci
+def test_learner_spec_requires_regressor_or_classifier():
+    """LearnerSpec must have at least one of allow_regressor / allow_classifier set to True."""
+    msg = r"LearnerSpec 'ml_x': at least one of allow_regressor or allow_classifier must be True\."
+    with pytest.raises(ValueError, match=msg):
+        LearnerSpec("ml_x", allow_regressor=False, allow_classifier=False)

From d74e9f935d21f4fa822072832a79919925dbf89c Mon Sep 17 00:00:00 2001
From: SvenKlaassen <sven.klaassen@uni-hamburg.de>
Date: Sat, 9 May 2026 17:23:51 +0200
Subject: [PATCH 30/38] fix medium codacy issues: streamline weights and
 learner validation by extracting checks into dedicated functions

---
 doubleml/utils/_checks.py  | 104 +++++++++++++++-------------
 doubleml/utils/_learner.py | 136 ++++++++++++++++++-------------------
 2 files changed, 123 insertions(+), 117 deletions(-)

diff --git a/doubleml/utils/_checks.py b/doubleml/utils/_checks.py
index 3b065031..2857823a 100644
--- a/doubleml/utils/_checks.py
+++ b/doubleml/utils/_checks.py
@@ -241,59 +241,65 @@ def _check_benchmarks(benchmarks):
     return
 
 
+def _check_weights_array(weights, n_obs):
+    if (weights.ndim != 1) or weights.shape[0] != n_obs:
+        raise ValueError(f"weights must have shape ({n_obs},). weights of shape {weights.shape} was passed.")
+    if not np.all(0 <= weights):
+        raise ValueError("All weights values must be greater or equal 0.")
+    if weights.sum() == 0:
+        raise ValueError("At least one weight must be non-zero.")
+
+
+def _check_weights_atte(weights):
+    if not isinstance(weights, np.ndarray):
+        raise TypeError(f"weights must be a numpy array for ATTE score. weights of type {str(type(weights))} was passed.")
+
+    is_binary = np.all((np.power(weights, 2) - weights) == 0)
+    if not is_binary:
+        raise ValueError("weights must be binary for ATTE score.")
+
+
+def _check_weights_dict(weights, score, n_obs, n_rep):
+    if score != "ATE":
+        raise ValueError(f"weights as a dictionary is only supported for ATE score, got '{score}'.")
+    expected_keys = ["weights", "weights_bar"]
+    if not set(weights.keys()) == set(expected_keys):
+        raise ValueError(f"weights must have keys {expected_keys}. keys {str(weights.keys())} were passed.")
+
+    if weights["weights"].shape != (n_obs,):
+        raise ValueError(f"weights must have shape ({n_obs},). weights of shape {weights['weights'].shape} was passed.")
+    # weights_bar must be 2D with n_obs rows; the n_rep column is validated later when n_rep is known
+    if weights["weights_bar"].ndim != 2 or weights["weights_bar"].shape[0] != n_obs:
+        raise ValueError(
+            f"weights_bar must be a 2-dimensional array with {n_obs} rows. "
+            f"weights_bar of shape {weights['weights_bar'].shape} was passed."
+        )
+    if n_rep is not None and weights["weights_bar"].shape[1] != n_rep:
+        raise ValueError(
+            f"weights_bar must have shape ({n_obs}, {n_rep}). "
+            f"weights_bar of shape {weights['weights_bar'].shape} was passed."
+        )
+    if (not np.all(weights["weights"] >= 0)) or (not np.all(weights["weights_bar"] >= 0)):
+        raise ValueError("All weights values must be greater or equal 0.")
+    if (weights["weights"].sum() == 0) or (weights["weights_bar"].sum() == 0):
+        raise ValueError("At least one weight must be non-zero.")
+
+
 def _check_weights(weights, score, n_obs, n_rep: int | None = None):
-    if weights is not None:
-        # check general type
-        if (not isinstance(weights, np.ndarray)) and (not isinstance(weights, dict)):
-            raise TypeError(f"weights must be a numpy array or dictionary. weights of type {str(type(weights))} was passed.")
-
-        # check shape
-        if isinstance(weights, np.ndarray):
-            if (weights.ndim != 1) or weights.shape[0] != n_obs:
-                raise ValueError(f"weights must have shape ({n_obs},). weights of shape {weights.shape} was passed.")
-            if not np.all(0 <= weights):
-                raise ValueError("All weights values must be greater or equal 0.")
-            if weights.sum() == 0:
-                raise ValueError("At least one weight must be non-zero.")
-
-        # check special form for ATTE score
-        if score == "ATTE":
-            if not isinstance(weights, np.ndarray):
-                raise TypeError(
-                    f"weights must be a numpy array for ATTE score. weights of type {str(type(weights))} was passed."
-                )
+    if weights is None:
+        return
 
-            is_binary = np.all((np.power(weights, 2) - weights) == 0)
-            if not is_binary:
-                raise ValueError("weights must be binary for ATTE score.")
+    if not isinstance(weights, (np.ndarray, dict)):
+        raise TypeError(f"weights must be a numpy array or dictionary. weights of type {str(type(weights))} was passed.")
 
-        # check general form for ATE score
-        if isinstance(weights, dict):
-            assert score == "ATE"
-            expected_keys = ["weights", "weights_bar"]
-            if not set(weights.keys()) == set(expected_keys):
-                raise ValueError(f"weights must have keys {expected_keys}. keys {str(weights.keys())} were passed.")
+    if isinstance(weights, np.ndarray):
+        _check_weights_array(weights, n_obs)
 
-            if weights["weights"].shape != (n_obs,):
-                raise ValueError(
-                    f"weights must have shape ({n_obs},). weights of shape {weights['weights'].shape} was passed."
-                )
-            # weights_bar must be 2D with n_obs rows; the n_rep column is validated later when n_rep is known
-            if weights["weights_bar"].ndim != 2 or weights["weights_bar"].shape[0] != n_obs:
-                raise ValueError(
-                    f"weights_bar must be a 2-dimensional array with {n_obs} rows. "
-                    f"weights_bar of shape {weights['weights_bar'].shape} was passed."
-                )
-            if n_rep is not None and weights["weights_bar"].shape[1] != n_rep:
-                raise ValueError(
-                    f"weights_bar must have shape ({n_obs}, {n_rep}). "
-                    f"weights_bar of shape {weights['weights_bar'].shape} was passed."
-                )
-            if (not np.all(weights["weights"] >= 0)) or (not np.all(weights["weights_bar"] >= 0)):
-                raise ValueError("All weights values must be greater or equal 0.")
-            if (weights["weights"].sum() == 0) or (weights["weights_bar"].sum() == 0):
-                raise ValueError("At least one weight must be non-zero.")
-    return
+    if score == "ATTE":
+        _check_weights_atte(weights)
+
+    if isinstance(weights, dict):
+        _check_weights_dict(weights, score, n_obs, n_rep)
 
 
 def _check_external_predictions(external_predictions, valid_treatments, valid_learners, n_obs, n_rep):
diff --git a/doubleml/utils/_learner.py b/doubleml/utils/_learner.py
index 5c31a16a..83c1537f 100644
--- a/doubleml/utils/_learner.py
+++ b/doubleml/utils/_learner.py
@@ -63,6 +63,65 @@ def predict_method(self) -> str:
         return "predict_proba" if self.is_classifier else "predict"
 
 
+def _check_learner_interface(learner: Any, err_prefix: str) -> None:
+    """Raise TypeError if learner is a class or lacks fit/set_params/get_params."""
+    if isinstance(learner, type):
+        raise TypeError(err_prefix + "provide an instance of a learner instead of a class.")
+    for method in ("fit", "set_params", "get_params"):
+        if not hasattr(learner, method):
+            raise TypeError(err_prefix + f"{str(learner)} has no method .{method}().")
+
+
+def _determine_learner_type(learner: Any, spec: LearnerSpec, warn_prefix: str) -> bool:
+    """Return True if learner should be treated as classifier; warn if type is ambiguous."""
+    if spec.allow_regressor and spec.allow_classifier:
+        if is_classifier(learner):
+            return True
+        if is_regressor(learner):
+            return False
+        warnings.warn(
+            warn_prefix
+            + f"{str(learner)} is (probably) neither a regressor nor a classifier. "
+            + "Method predict is used for prediction."
+        )
+        return False
+    if spec.allow_classifier:
+        if not is_classifier(learner):
+            warnings.warn(warn_prefix + f"{str(learner)} is (probably) no classifier.")
+        return True
+    if not is_regressor(learner):
+        warnings.warn(warn_prefix + f"{str(learner)} is (probably) no regressor.")
+    return False
+
+
+def _check_binary_data_compatibility(
+    learner: Any,
+    spec: LearnerSpec,
+    learner_is_classifier: bool,
+    binary_outcome: bool,
+    binary_treatment: bool,
+) -> None:
+    """Raise on classifier with non-binary data; warn on regressor with binary data."""
+    if not spec.binary_data_check:
+        return
+
+    is_outcome_check = spec.binary_data_check == "outcome"
+    data_is_binary = binary_outcome if is_outcome_check else binary_treatment
+    var_label = "outcome" if is_outcome_check else "treatment"
+
+    if learner_is_classifier and not data_is_binary:
+        raise ValueError(
+            f"The {spec.name} learner {str(learner)} was identified as classifier "
+            f"but the {var_label} variable is not binary with values 0 and 1."
+        )
+
+    if not learner_is_classifier and data_is_binary:
+        action = "fit an additive probability model" if is_outcome_check else "estimate propensity scores"
+        warnings.warn(
+            f"Binary {var_label} detected. Consider using a classifier for {spec.name} " f"with predict_proba() to {action}."
+        )
+
+
 def validate_learner(
     learner: Any,
     spec: LearnerSpec,
@@ -100,80 +159,21 @@ def validate_learner(
     err_msg_prefix = f"Invalid learner provided for {spec.name}: "
     warn_msg_prefix = f"Learner provided for {spec.name} is probably invalid: "
 
-    # Check it's an instance, not a class
-    if isinstance(learner, type):
-        raise TypeError(err_msg_prefix + "provide an instance of a learner instead of a class.")
-
-    # Check required methods
-    if not hasattr(learner, "fit"):
-        raise TypeError(err_msg_prefix + f"{str(learner)} has no method .fit().")
-    if not hasattr(learner, "set_params"):
-        raise TypeError(err_msg_prefix + f"{str(learner)} has no method .set_params().")
-    if not hasattr(learner, "get_params"):
-        raise TypeError(err_msg_prefix + f"{str(learner)} has no method .get_params().")
-
-    # Determine learner type
-    learner_is_classifier: bool
-    if spec.allow_regressor and spec.allow_classifier:
-        if is_classifier(learner):
-            learner_is_classifier = True
-        elif is_regressor(learner):
-            learner_is_classifier = False
-        else:
-            warnings.warn(
-                warn_msg_prefix
-                + f"{str(learner)} is (probably) neither a regressor nor a classifier. "
-                + "Method predict is used for prediction."
-            )
-            learner_is_classifier = False
-    elif spec.allow_classifier:
-        if not is_classifier(learner):
-            warnings.warn(warn_msg_prefix + f"{str(learner)} is (probably) no classifier.")
-        learner_is_classifier = True
-    else:
-        if not is_regressor(learner):
-            warnings.warn(warn_msg_prefix + f"{str(learner)} is (probably) no regressor.")
-        learner_is_classifier = False
-
-    # Check type is allowed
+    _check_learner_interface(learner, err_msg_prefix)
+    learner_is_classifier = _determine_learner_type(learner, spec, warn_msg_prefix)
+
+    # Check type is allowed by spec
     if learner_is_classifier and not spec.allow_classifier:
         raise ValueError(f"Classifier not allowed for {spec.name}. Use a regressor instead.")
     if not learner_is_classifier and not spec.allow_regressor:
         raise ValueError(f"Regressor not allowed for {spec.name}. Use a classifier instead.")
 
     # Check prediction method exists
-    if learner_is_classifier:
-        if not hasattr(learner, "predict_proba"):
-            raise TypeError(err_msg_prefix + f"{str(learner)} has no method .predict_proba().")
-    else:
-        if not hasattr(learner, "predict"):
-            raise TypeError(err_msg_prefix + f"{str(learner)} has no method .predict().")
-
-    # Check binary data compatibility for classifiers
-    if learner_is_classifier and spec.binary_data_check:
-        if spec.binary_data_check == "outcome" and not binary_outcome:
-            raise ValueError(
-                f"The {spec.name} learner {str(learner)} was identified as classifier "
-                "but the outcome variable is not binary with values 0 and 1."
-            )
-        if spec.binary_data_check == "treatment" and not binary_treatment:
-            raise ValueError(
-                f"The {spec.name} learner {str(learner)} was identified as classifier "
-                "but the treatment variable is not binary with values 0 and 1."
-            )
-
-    # Warn if regressor used with binary data
-    if not learner_is_classifier and spec.binary_data_check:
-        if spec.binary_data_check == "outcome" and binary_outcome:
-            warnings.warn(
-                f"Binary outcome detected. Consider using a classifier for {spec.name} "
-                "with predict_proba() to fit an additive probability model."
-            )
-        elif spec.binary_data_check == "treatment" and binary_treatment:
-            warnings.warn(
-                f"Binary treatment detected. Consider using a classifier for {spec.name} "
-                "with predict_proba() to estimate propensity scores."
-            )
+    predict_method = "predict_proba" if learner_is_classifier else "predict"
+    if not hasattr(learner, predict_method):
+        raise TypeError(err_msg_prefix + f"{str(learner)} has no method .{predict_method}().")
+
+    _check_binary_data_compatibility(learner, spec, learner_is_classifier, binary_outcome, binary_treatment)
 
     return LearnerInfo(
         learner=clone(learner),

From 4c0fe2ac274b5a8e1d29d6b587d397c9288dae0a Mon Sep 17 00:00:00 2001
From: SvenKlaassen <sven.klaassen@uni-hamburg.de>
Date: Sat, 9 May 2026 17:35:24 +0200
Subject: [PATCH 31/38] docs: fix docstring lint on new scalar/vector
 implementations

Apply ruff D200/D213/D413 auto-fixes and add __init__ docstrings to
DoubleMLVector and PLRVector.
---
 doubleml/double_ml_base.py         | 24 ++++++++++++++---
 doubleml/double_ml_framework.py    |  7 +++++
 doubleml/double_ml_linear_score.py |  8 +++---
 doubleml/double_ml_scalar.py       | 41 +++++++++++++++++++++++++++---
 doubleml/double_ml_vector.py       | 25 ++++++++++++++++++
 doubleml/irm/irm_scalar.py         | 18 +++++++++----
 doubleml/plm/plr.py                |  7 ++++-
 doubleml/plm/plr_scalar.py         | 18 +++++++++----
 doubleml/plm/plr_vector.py         | 13 +++++++---
 doubleml/utils/_checks.py          |  1 +
 doubleml/utils/_learner.py         |  8 +++---
 doubleml/utils/_tune_optuna.py     | 14 ++++++++--
 doubleml/utils/blp.py              |  9 +++++--
 13 files changed, 163 insertions(+), 30 deletions(-)

diff --git a/doubleml/double_ml_base.py b/doubleml/double_ml_base.py
index df0affa8..05e80061 100644
--- a/doubleml/double_ml_base.py
+++ b/doubleml/double_ml_base.py
@@ -1,6 +1,4 @@
-"""
-Abstract base class for Double Machine Learning estimators.
-"""
+"""Abstract base class for Double Machine Learning estimators."""
 
 from abc import ABC, abstractmethod
 from typing import Dict, Optional
@@ -46,6 +44,7 @@ class DoubleMLBase(ABC):
         Influence function values (shape: (n_obs, n_thetas, n_rep)).
     n_rep : int
         Number of repetitions for sample splitting.
+
     """
 
     def __init__(
@@ -59,6 +58,7 @@ def __init__(
         ----------
         obj_dml_data : DoubleMLBaseData
             The data object for the double machine learning model.
+
         """
         # Validate and store data
         if not isinstance(obj_dml_data, DoubleMLBaseData):
@@ -89,6 +89,7 @@ def framework(self) -> DoubleMLFramework:
         ------
         ValueError
             If framework is not yet initialized (fit() has not been called).
+
         """
         if self._framework is None:
             raise ValueError("The framework is not yet initialized. " "Call fit() before accessing estimation results.")
@@ -103,6 +104,7 @@ def thetas(self) -> np.ndarray:
         -------
         np.ndarray
             Parameter estimates (shape: (n_thetas,)).
+
         """
         return self.framework.thetas
 
@@ -115,6 +117,7 @@ def coef(self) -> np.ndarray:
         -------
         np.ndarray
             Parameter estimates (shape: (n_thetas,)).
+
         """
         return self.thetas
 
@@ -127,6 +130,7 @@ def all_thetas(self) -> np.ndarray:
         -------
         np.ndarray
             Parameter estimates for all repetitions (shape: (n_thetas, n_rep)).
+
         """
         return self.framework.all_thetas
 
@@ -139,6 +143,7 @@ def all_coef(self) -> np.ndarray:
         -------
         np.ndarray
             Parameter estimates for all repetitions (shape: (n_thetas, n_rep)).
+
         """
         return self.all_thetas
 
@@ -151,6 +156,7 @@ def se(self) -> np.ndarray:
         -------
         np.ndarray
             Standard errors (shape: (n_thetas,)).
+
         """
         return self.framework.ses
 
@@ -163,6 +169,7 @@ def all_ses(self) -> np.ndarray:
         -------
         np.ndarray
             Standard errors for all repetitions (shape: (n_thetas, n_rep)).
+
         """
         return self.framework.all_ses
 
@@ -175,6 +182,7 @@ def summary(self) -> pd.DataFrame:
         -------
         pd.DataFrame
             Summary statistics for all parameters.
+
         """
         return self.framework.summary
 
@@ -187,6 +195,7 @@ def psi(self) -> np.ndarray:
         -------
         np.ndarray
             Influence function values (shape: (n_obs, n_thetas, n_rep)).
+
         """
         return self.framework.scaled_psi
 
@@ -200,6 +209,7 @@ def n_rep(self) -> int:
         -------
         int
             Number of repetitions.
+
         """
         pass
 
@@ -212,6 +222,7 @@ def n_obs(self) -> int:
         -------
         int
             Number of observations in the dataset.
+
         """
         return self._n_obs
 
@@ -234,6 +245,7 @@ def confint(self, joint: bool = False, level: float = 0.95) -> pd.DataFrame:
         -------
         pd.DataFrame
             A DataFrame with confidence intervals.
+
         """
         return self.framework.confint(joint=joint, level=level)
 
@@ -254,6 +266,7 @@ def bootstrap(self, method: str = "normal", n_rep_boot: int = 500) -> Self:
         -------
         self : DoubleMLBase
             The DoubleML estimator with bootstrap results.
+
         """
         self.framework.bootstrap(method=method, n_rep_boot=n_rep_boot)
         return self
@@ -271,6 +284,7 @@ def p_adjust(self, method: str = "romano-wolf") -> pd.DataFrame:
         -------
         pd.DataFrame
             A DataFrame with adjusted p-values.
+
         """
         return self.framework.p_adjust(method=method)
 
@@ -307,6 +321,7 @@ def sensitivity_analysis(
         -------
         dict
             A dictionary with sensitivity analysis results.
+
         """
         return self.framework.sensitivity_analysis(
             cf_y=cf_y,
@@ -334,6 +349,7 @@ def fit(self, **kwargs) -> Self:
         -------
         self : DoubleMLBase
             The fitted DoubleML estimator.
+
         """
         pass
 
@@ -345,6 +361,7 @@ def __str__(self) -> str:
         -------
         str
             A formatted string summary of the model.
+
         """
         class_name = self.__class__.__name__
         header = f"{'=' * 20} {class_name} Object {'=' * 20}"
@@ -363,5 +380,6 @@ def __repr__(self) -> str:
         -------
         str
             A string representation of the object.
+
         """
         return self.__str__()
diff --git a/doubleml/double_ml_framework.py b/doubleml/double_ml_framework.py
index c82ad206..86927470 100644
--- a/doubleml/double_ml_framework.py
+++ b/doubleml/double_ml_framework.py
@@ -167,6 +167,7 @@ class DoubleMLFramework:
     ----------
     dml_core : DoubleMLCore
         A DoubleMLCore object providing the estimated parameters and scores.
+
     """
 
     def __init__(
@@ -383,6 +384,7 @@ def sensitivity_summary(self):
         -------
         res : str
             Summary for the sensitivity analysis.
+
         """
         header = "================== Sensitivity Analysis ==================\n"
         if self.sensitivity_params is None:
@@ -713,6 +715,7 @@ def sensitivity_analysis(self, cf_y=0.03, cf_d=0.03, rho=1.0, level=0.95, null_h
         Returns
         -------
         self : object
+
         """
         # check null_hypothesis
         if isinstance(null_hypothesis, float):
@@ -772,6 +775,7 @@ def confint(self, joint=False, level=0.95):
         -------
         df_ci : pd.DataFrame
             A data frame with the confidence interval(s).
+
         """
 
         if not isinstance(joint, bool):
@@ -822,6 +826,7 @@ def bootstrap(self, method="normal", n_rep_boot=500):
         Returns
         -------
         self : object
+
         """
 
         _check_bootstrap(method, n_rep_boot)
@@ -858,6 +863,7 @@ def p_adjust(self, method="romano-wolf"):
             A data frame with adjusted p-values.
         all_p_vals_corrected : np.ndarray
             A numpy array with all corrected p-values for each repetition.
+
         """
         if not isinstance(method, str):
             raise TypeError(f"The p_adjust method must be of str type. {str(method)} of type {str(type(method))} was passed.")
@@ -970,6 +976,7 @@ def sensitivity_plot(
         -------
         fig : object
             Plotly figure of the sensitivity contours.
+
         """
         _check_integer(idx_treatment, "idx_treatment", lower_bound=0, upper_bound=self.n_thetas - 1)
         if not isinstance(value, str):
diff --git a/doubleml/double_ml_linear_score.py b/doubleml/double_ml_linear_score.py
index 4bada4d8..df35f8ec 100644
--- a/doubleml/double_ml_linear_score.py
+++ b/doubleml/double_ml_linear_score.py
@@ -1,6 +1,4 @@
-"""
-Mixin for DoubleML models with linear score functions.
-"""
+"""Mixin for DoubleML models with linear score functions."""
 
 from typing import Dict
 
@@ -34,6 +32,7 @@ class LinearScoreMixin(DoubleMLScalar):
     Subclasses must implement:
     - _nuisance_est(): Estimate nuisance parameters for one fold
     - _get_score_elements(): Return dict with 'psi_a' and 'psi_b' arrays of shape (n_obs, n_rep)
+
     """
 
     def _est_causal_pars_and_se(self, psi_elements: Dict[str, np.ndarray]) -> None:
@@ -60,6 +59,7 @@ def _est_causal_pars_and_se(self, psi_elements: Dict[str, np.ndarray]) -> None:
         - self._psi: Influence function values (n_obs, n_thetas=1, n_rep)
         - self._psi_deriv: Score derivative w.r.t. θ (n_obs, n_thetas=1, n_rep)
         - self._var_scaling_factors: Variance scaling factors (n_thetas=1,)
+
         """
         # Extract score elements
         if "psi_a" not in psi_elements or "psi_b" not in psi_elements:
@@ -137,6 +137,7 @@ def _compute_score(self, psi_elements: Dict[str, np.ndarray], coef: float) -> np
         -------
         np.ndarray
             Score function values, shape (n_obs, n_rep).
+
         """
         psi_a = psi_elements["psi_a"]
         psi_b = psi_elements["psi_b"]
@@ -151,5 +152,6 @@ def _score_element_names(self) -> list:
         -------
         list
             List of score element names: ['psi_a', 'psi_b']
+
         """
         return ["psi_a", "psi_b"]
diff --git a/doubleml/double_ml_scalar.py b/doubleml/double_ml_scalar.py
index 10f74365..a00d415e 100644
--- a/doubleml/double_ml_scalar.py
+++ b/doubleml/double_ml_scalar.py
@@ -1,6 +1,4 @@
-"""
-Abstract base class for scalar DoubleML models (single parameter estimation).
-"""
+"""Abstract base class for scalar DoubleML models (single parameter estimation)."""
 
 import warnings
 from abc import ABC, abstractmethod
@@ -52,6 +50,7 @@ class DoubleMLScalar(DoubleMLBase, ABC):
         Number of repetitions for sample splitting (set via draw_sample_splitting).
     score : str
         The score function being used.
+
     """
 
     # Subclasses define all possible learners for the model
@@ -81,6 +80,7 @@ def __init__(
         ------
         ValueError
             If obj_dml_data contains more than one treatment column.
+
         """
         # Validate single treatment column
         if len(obj_dml_data.d_cols) != 1:
@@ -137,6 +137,7 @@ def n_folds(self) -> int:
         ------
         ValueError
             If sample splitting has not been performed yet.
+
         """
         if self._n_folds is None:
             raise ValueError("n_folds not set. Call draw_sample_splitting() first.")
@@ -156,6 +157,7 @@ def n_rep(self) -> int:
         ------
         ValueError
             If sample splitting has not been performed yet.
+
         """
         if self._n_rep is None:
             raise ValueError("n_rep not set. Call draw_sample_splitting() first.")
@@ -170,6 +172,7 @@ def score(self) -> str:
         -------
         str
             Score function name.
+
         """
         return self._score
 
@@ -187,6 +190,7 @@ def predictions(self) -> dict[str, np.ndarray]:
         ------
         ValueError
             If the model has not been fitted yet.
+
         """
         if self._predictions is None:
             raise ValueError("Predictions not available. Call fit() first.")
@@ -208,6 +212,7 @@ def nuisance_targets(self) -> dict[str, np.ndarray]:
         ------
         ValueError
             If the model has not been fitted yet.
+
         """
         if self._nuisance_targets is None:
             raise ValueError("Nuisance targets not available. Call fit() or fit_nuisance_models() first.")
@@ -232,6 +237,7 @@ def nuisance_loss(self) -> dict[str, np.ndarray]:
         ------
         ValueError
             If the model has not been fitted yet.
+
         """
         if self._nuisance_loss is None:
             raise ValueError("Nuisance loss not available. Call fit() or fit_nuisance_models() first.")
@@ -250,6 +256,7 @@ def sensitivity_elements(self) -> dict[str, np.ndarray] | None:
         dict[str, np.ndarray] or None
             Dictionary with keys ``'sigma2'``, ``'nu2'`` (shape ``(1, 1, n_rep)``),
             ``'psi_sigma2'``, ``'psi_nu2'``, ``'riesz_rep'`` (shape ``(n_obs, 1, n_rep)``).
+
         """
         return self._sensitivity_elements
 
@@ -262,6 +269,7 @@ def smpls(self) -> list:
         -------
         list
             List of sample splitting indices for each repetition.
+
         """
         if self._smpls is None:
             raise ValueError("Sample splitting has not been performed. Call draw_sample_splitting() first.")
@@ -281,6 +289,7 @@ def smpls_cluster(self) -> list | None:
         ------
         ValueError
             If cluster data is used but cluster splitting is not available.
+
         """
         if self._dml_data.is_cluster_data and self._smpls_cluster is None:
             raise ValueError("Cluster sample splitting has not been provided. Call set_sample_splitting() first.")
@@ -304,6 +313,7 @@ def required_learners(self) -> list[str]:
         -------
         list of str
             Ordered list of required learner names.
+
         """
         pass
 
@@ -316,6 +326,7 @@ def learners(self) -> dict[str, object]:
         -------
         dict
             Dictionary mapping learner names to estimator instances.
+
         """
         return {name: info.learner for name, info in self._learners.items()}
 
@@ -337,6 +348,7 @@ def get_params(self, learner_name: str) -> dict:
         ------
         ValueError
             If the learner is not registered.
+
         """
         if learner_name not in self._learners:
             raise ValueError(f"Learner '{learner_name}' not registered.")
@@ -362,6 +374,7 @@ def set_params(self, learner_name: str, **params: object) -> Self:
         ------
         ValueError
             If the learner is not registered.
+
         """
         if learner_name not in self._learners:
             raise ValueError(f"Learner '{learner_name}' not registered.")
@@ -383,6 +396,7 @@ def _register_learner(self, name: str, learner: object) -> None:
         ------
         ValueError
             If the learner name is not defined in _LEARNER_SPECS.
+
         """
         if name not in self._LEARNER_SPECS:
             raise ValueError(f"Learner '{name}' not defined for this model.")
@@ -408,6 +422,7 @@ def set_learners(self) -> Self:
         -------
         self : Self
             The estimator with learners set.
+
         """
         pass
 
@@ -456,6 +471,7 @@ def fit(
         -------
         self : Self
             The fitted estimator.
+
         """
         if self._smpls is None:
             self.draw_sample_splitting(
@@ -511,6 +527,7 @@ def fit_nuisance_models(
         ------
         ValueError
             If sample splitting has not been initialized.
+
         """
         if self._smpls is None:
             raise ValueError("Sample splitting has not been initialized. Call draw_sample_splitting() first.")
@@ -580,6 +597,7 @@ def estimate_causal_parameters(self) -> Self:
         ------
         ValueError
             If nuisance models have not been fitted yet.
+
         """
         if self._predictions is None:
             raise ValueError("Predictions not available. Call fit_nuisance_models() first.")
@@ -625,6 +643,7 @@ def draw_sample_splitting(self, n_folds: int = 5, n_rep: int = 1) -> Self:
         ------
         ValueError
             If n_folds or n_rep have invalid values.
+
         """
         if not isinstance(n_folds, int) or n_folds < 2:
             raise ValueError(f"n_folds must be an integer >= 2. Got {n_folds}.")
@@ -689,6 +708,7 @@ def set_sample_splitting(self, all_smpls: list, all_smpls_cluster: list | None =
             If ``all_smpls`` is not a list or if tuple shorthand is used.
         ValueError
             If the partition is invalid or cluster splitting is missing.
+
         """
         if isinstance(all_smpls, tuple):
             raise TypeError("all_smpls must be a list of folds; tuple shorthand is not supported for DoubleMLScalar.")
@@ -750,6 +770,7 @@ def _initialize_predictions_dict(self) -> dict[str, np.ndarray]:
         -------
         dict
             Dictionary mapping learner names to NaN-filled arrays.
+
         """
         n_obs = self._n_obs
         n_rep = self.n_rep
@@ -770,6 +791,7 @@ def _check_external_predictions(self, external_predictions: dict[str, np.ndarray
             If a value is not a numpy array.
         ValueError
             If a value does not match shape (n_obs, n_rep).
+
         """
         n_obs = self._n_obs
         n_rep = self.n_rep
@@ -798,6 +820,7 @@ def _check_learners_available(self, external_predictions: dict[str, np.ndarray]
         ------
         ValueError
             If a required learner is missing and not covered by external predictions.
+
         """
         ext_keys = set(external_predictions.keys()) if external_predictions is not None else set()
 
@@ -816,6 +839,7 @@ def _construct_framework(self) -> DoubleMLFramework:
         -------
         DoubleMLFramework
             The framework object with estimation results.
+
         """
         # Standardize the score function: psi / E[psi_deriv]
         # Both already in framework shape: (n_obs, n_thetas, n_rep)
@@ -934,6 +958,7 @@ def evaluate_learners(
         >>> model.evaluate_learners()
         >>> model.evaluate_learners(metric=r2_score)
         >>> model.evaluate_learners(learners=["ml_m"], metric=log_loss)
+
         """
         if self._nuisance_targets is None:
             raise ValueError("Nuisance targets not available. Call fit() or fit_nuisance_models() first.")
@@ -1007,6 +1032,7 @@ def _sensitivity_element_est(self) -> dict[str, np.ndarray] | None:
             Dictionary with keys ``'sigma2'``, ``'nu2'`` (shape ``(1, 1, n_rep)``),
             ``'psi_sigma2'``, ``'psi_nu2'``, ``'riesz_rep'`` (shape ``(n_obs, 1, n_rep)``).
             Return ``None`` (default) if sensitivity analysis is not implemented.
+
         """
         return None
 
@@ -1042,6 +1068,7 @@ def _get_nuisance_targets(self) -> dict[str, np.ndarray | None]:
         dict[str, np.ndarray or None]
             Dictionary mapping learner names to target arrays of shape ``(n_obs, n_rep)``,
             or ``None`` where targets are not available.
+
         """
         pass
 
@@ -1078,6 +1105,7 @@ def _nuisance_est(
             If provided, a dictionary of external predictions. Learners whose names
             appear as keys should not be fitted; their predictions are already
             pre-filled in self._predictions.
+
         """
         pass
 
@@ -1104,6 +1132,7 @@ def _get_score_elements(self) -> dict[str, np.ndarray]:
             psi_a = (D - m_hat) ** 2  # shape: (n_obs, n_rep)
             psi_b = (D - m_hat) * (Y - l_hat)  # shape: (n_obs, n_rep)
             return {'psi_a': psi_a, 'psi_b': psi_b}
+
         """
         pass
 
@@ -1133,6 +1162,7 @@ def _est_causal_pars_and_se(self, psi_elements: dict[str, np.ndarray]) -> None:
         - self._psi should have shape (n_obs, n_thetas, n_rep)
         - self._psi_deriv should have shape (n_obs, n_thetas, n_rep)
         - self._var_scaling_factors should have shape (n_thetas,)
+
         """
         pass
 
@@ -1188,6 +1218,7 @@ def tune_ml_models(
         tune_res : dict
             Dict of :class:`~doubleml.utils._tune_optuna.DMLOptunaResult` objects keyed by
             learner name. Returned when ``return_tune_res=True``.
+
         """
         if not isinstance(set_as_params, bool):
             raise TypeError(f"set_as_params must be True or False. Got {str(set_as_params)}.")
@@ -1267,6 +1298,7 @@ def _expand_tuning_param_space(self, ml_param_space: dict[str, Callable | None])
             alias nor a defined learner name.
         TypeError
             If a parameter space value is not callable.
+
         """
         if not isinstance(ml_param_space, dict):
             raise TypeError(f"ml_param_space must be a dict. Got {type(ml_param_space).__name__}.")
@@ -1317,6 +1349,7 @@ def _validate_optuna_setting_keys(self, optuna_settings: dict | None) -> None:
             value is not a dict.
         ValueError
             If a key is not a global Optuna setting and not a valid learner name or alias.
+
         """
         if optuna_settings is not None and not isinstance(optuna_settings, dict):
             raise TypeError(f"optuna_settings must be a dict or None. Got {str(type(optuna_settings))}.")
@@ -1375,6 +1408,7 @@ def _get_tuning_data(
         ------
         NotImplementedError
             Always; subclasses must override this method.
+
         """
         raise NotImplementedError(
             f"_get_tuning_data not implemented for {self.__class__.__name__}. " "Subclasses must override this method."
@@ -1388,6 +1422,7 @@ def __str__(self) -> str:
         -------
         str
             A formatted string summary of the model.
+
         """
         class_name = self.__class__.__name__
         header = f"{'=' * 20} {class_name} Object {'=' * 20}"
diff --git a/doubleml/double_ml_vector.py b/doubleml/double_ml_vector.py
index e2a7c7d7..d4cbe531 100644
--- a/doubleml/double_ml_vector.py
+++ b/doubleml/double_ml_vector.py
@@ -62,6 +62,7 @@ class DoubleMLVector(DoubleMLBase, ABC):
         The score function being used.
     modellist : list of DoubleMLScalar
         The scalar sub-models, one per treatment column (or model key).
+
     """
 
     def __init__(
@@ -69,6 +70,7 @@ def __init__(
         obj_dml_data: DoubleMLData,
         score: str = "default",
     ) -> None:
+        """Initialize DoubleMLVector. See class docstring for parameter details."""
         super().__init__(obj_dml_data)
         self._dml_data: DoubleMLData = obj_dml_data  # narrow for attribute access
         self._score = score
@@ -99,6 +101,7 @@ def n_rep(self) -> int:
         ------
         ValueError
             If sample splitting has not been drawn yet.
+
         """
         if self._n_rep is None:
             raise ValueError("n_rep not set. Call draw_sample_splitting() first.")
@@ -118,6 +121,7 @@ def n_folds(self) -> int:
         ------
         ValueError
             If sample splitting has not been drawn yet.
+
         """
         if self._n_folds is None:
             raise ValueError("n_folds not set. Call draw_sample_splitting() first.")
@@ -132,6 +136,7 @@ def score(self) -> str:
         -------
         str
             Score function name.
+
         """
         return self._score
 
@@ -149,6 +154,7 @@ def smpls(self) -> list:
         ------
         ValueError
             If sample splitting has not been drawn yet.
+
         """
         if self._smpls is None:
             raise ValueError("Sample splitting has not been performed. Call draw_sample_splitting() first.")
@@ -163,6 +169,7 @@ def modellist(self) -> list[DoubleMLScalar] | None:
         -------
         list of DoubleMLScalar or None
             ``None`` before :meth:`_initialize_models` has been called by the subclass.
+
         """
         return self._modellist
 
@@ -174,6 +181,7 @@ def n_rep_boot(self) -> int | None:
         Returns
         -------
         int or None
+
         """
         return None if self._framework is None else self._framework.n_rep_boot
 
@@ -185,6 +193,7 @@ def boot_method(self) -> str | None:
         Returns
         -------
         str or None
+
         """
         return None if self._framework is None else self._framework.boot_method
 
@@ -196,6 +205,7 @@ def boot_t_stat(self) -> np.ndarray | None:
         Returns
         -------
         np.ndarray or None
+
         """
         return None if self._framework is None else self._framework.boot_t_stat
 
@@ -207,6 +217,7 @@ def sensitivity_elements(self) -> dict[str, np.ndarray] | None:
         Returns
         -------
         dict or None
+
         """
         return None if self._framework is None else self._framework.sensitivity_elements
 
@@ -219,6 +230,7 @@ def sensitivity_params(self) -> dict | None:
         Returns
         -------
         dict or None
+
         """
         return None if self._framework is None else self._framework.sensitivity_params
 
@@ -235,6 +247,7 @@ def sensitivity_summary(self) -> str:
         ------
         ValueError
             If :meth:`fit` has not been called yet.
+
         """
         if self._framework is None:
             raise ValueError("Apply fit() before accessing sensitivity_summary.")
@@ -252,6 +265,7 @@ def required_learners(self) -> list[str]:
         -------
         list of str
             Ordered list of required learner names.
+
         """
 
     @abstractmethod
@@ -271,6 +285,7 @@ def set_learners(self, **kwargs: object) -> Self:
         Returns
         -------
         self : Self
+
         """
 
     @abstractmethod
@@ -287,6 +302,7 @@ def _initialize_models(self) -> list[DoubleMLScalar]:
         -------
         list of DoubleMLScalar
             One configured scalar model per element of ``self._dml_data.d_cols``.
+
         """
 
     # ==================== Protected Helpers ====================
@@ -314,6 +330,7 @@ class would override this to return ``self._dml_data`` unchanged (each APO
         DoubleMLData
             A :class:`~doubleml.data.DoubleMLData` with ``d_cols=[d_col]``
             and all other treatment columns added to ``x_cols``.
+
         """
         other_d_cols = [c for c in self._dml_data.d_cols if c != d_col]
         x_cols = list(self._dml_data.x_cols) + other_d_cols
@@ -385,6 +402,7 @@ def draw_sample_splitting(self, n_folds: int = 5, n_rep: int = 1) -> Self:
         ------
         ValueError
             If ``n_folds < 2`` or ``n_rep < 1``.
+
         """
         if not isinstance(n_folds, int) or n_folds < 2:
             raise ValueError(f"n_folds must be an integer >= 2. Got {n_folds}.")
@@ -427,6 +445,7 @@ def set_sample_splitting(self, all_smpls: list, all_smpls_cluster: list | None =
             If ``all_smpls`` is not a list.
         ValueError
             If the partition is invalid.
+
         """
         if isinstance(all_smpls, tuple):
             raise TypeError("all_smpls must be a list of folds; tuple shorthand is not supported for DoubleMLVector.")
@@ -491,6 +510,7 @@ def fit(
         Returns
         -------
         self : Self
+
         """
         if self._smpls is None:
             self.draw_sample_splitting(n_folds=n_folds, n_rep=n_rep)
@@ -529,6 +549,7 @@ def get_params(self, learner_name: str) -> list[dict]:
         -------
         list of dict
             One parameter dict per sub-model, in ``d_cols`` order.
+
         """
         if self._modellist is None:
             raise ValueError("Sub-models are not initialized. Call _initialize_models() in the subclass __init__.")
@@ -548,6 +569,7 @@ def set_params(self, learner_name: str, **params: object) -> Self:
         Returns
         -------
         self : Self
+
         """
         if self._modellist is None:
             raise ValueError("Sub-models are not initialized. Call _initialize_models() in the subclass __init__.")
@@ -644,6 +666,7 @@ def sensitivity_plot(
         ------
         ValueError
             If :meth:`fit` has not been called yet.
+
         """
         if self._framework is None:
             raise ValueError("Apply fit() before sensitivity_plot().")
@@ -689,6 +712,7 @@ def sensitivity_benchmark(self, benchmarking_set: list[str], fit_args: dict | No
             If ``benchmarking_set`` or ``fit_args`` have the wrong type.
         ValueError
             If ``benchmarking_set`` is empty or not a subset of ``x_cols``.
+
         """
         if self._framework is None:
             raise ValueError("Apply fit() before sensitivity_benchmark().")
@@ -738,6 +762,7 @@ def __str__(self) -> str:
         -------
         str
             A formatted string summary of the model.
+
         """
         class_name = self.__class__.__name__
         header = f"{'=' * 20} {class_name} Object {'=' * 20}"
diff --git a/doubleml/irm/irm_scalar.py b/doubleml/irm/irm_scalar.py
index 1ceac4af..3cac41b7 100644
--- a/doubleml/irm/irm_scalar.py
+++ b/doubleml/irm/irm_scalar.py
@@ -1,6 +1,4 @@
-"""
-Interactive Regression Model (IRM) based on the new DoubleMLScalar hierarchy.
-"""
+"""Interactive Regression Model (IRM) based on the new DoubleMLScalar hierarchy."""
 
 from __future__ import annotations
 
@@ -23,7 +21,8 @@
 
 
 class IRM(LinearScoreMixin):
-    """Double machine learning for interactive regression models.
+    """
+    Double machine learning for interactive regression models.
 
     Based on the DoubleMLScalar + LinearScoreMixin hierarchy.
 
@@ -78,6 +77,7 @@ class IRM(LinearScoreMixin):
     .. math::
 
         \\theta_0 = \\mathbb{E}[g_0(1, X) - g_0(0, X) | D=1].
+
     """
 
     # Define learner specifications for IRM
@@ -122,6 +122,7 @@ def __init__(
             Weights for weighted ATE.
         ps_processor_config : PSProcessorConfig, optional
             Configuration for propensity score processing.
+
         """
         # Validate data
         self._check_data(obj_dml_data)
@@ -215,6 +216,7 @@ def set_learners(
         -------
         self : IRM
             The estimator with learners set.
+
         """
         # ml_g convenience: clone to ml_g0/ml_g1 if not explicitly set
         if ml_g is not None:
@@ -310,7 +312,8 @@ def _nuisance_est(
     # ==================== Score Elements ====================
 
     def _get_nuisance_targets(self) -> dict[str, np.ndarray | None]:
-        """Return target arrays for nuisance loss evaluation.
+        """
+        Return target arrays for nuisance loss evaluation.
 
         ml_g0 and ml_g1 are fitted only on the d==0 and d==1 subgroups respectively,
         so targets for the opposite group are NaN. ml_m target is d (binary treatment).
@@ -383,6 +386,7 @@ def cate(self, basis: pd.DataFrame, is_gate: bool = False, **kwargs: Any) -> Dou
         -------
         model : :class:`doubleml.DoubleMLBLP`
             Best linear predictor model.
+
         """
         if self.score != "ATE":
             raise ValueError(f"Invalid score '{self.score}'. CATE is only implemented for score='ATE'.")
@@ -413,6 +417,7 @@ def gate(self, groups: pd.DataFrame, **kwargs: Any) -> DoubleMLBLP:
         -------
         model : :class:`doubleml.DoubleMLBLP`
             Best linear predictor model for group effects.
+
         """
         if not isinstance(groups, pd.DataFrame):
             raise TypeError(f"Groups must be of DataFrame type. Groups of type {str(type(groups))} was passed.")
@@ -485,6 +490,7 @@ def _get_tuning_data(
         ------
         ValueError
             If ``learner_name`` is not a valid IRM learner name.
+
         """
         y = self._dml_data.y
         d = self._dml_data.d
@@ -527,6 +533,7 @@ def _get_weights(self, m_hat: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
             Shape (n_obs, n_rep) or broadcastable.
         weights_bar : np.ndarray
             Shape (n_obs, n_rep) or broadcastable.
+
         """
         d = self._dml_data.d
 
@@ -572,6 +579,7 @@ def _sensitivity_element_est(self) -> dict[str, np.ndarray] | None:
         dict[str, np.ndarray] or None
             Dictionary with keys ``'sigma2'``, ``'nu2'`` (shape ``(1, 1, n_rep)``),
             ``'psi_sigma2'``, ``'psi_nu2'``, ``'riesz_rep'`` (shape ``(n_obs, 1, n_rep)``).
+
         """
         y = self._dml_data.y  # (n_obs,)
         d = self._dml_data.d  # (n_obs,)
diff --git a/doubleml/plm/plr.py b/doubleml/plm/plr.py
index 825ec845..13182745 100644
--- a/doubleml/plm/plr.py
+++ b/doubleml/plm/plr.py
@@ -16,7 +16,8 @@
 
 
 class DoubleMLPLR(LinearScoreMixin, DoubleML):
-    """Double machine learning for partially linear regression models
+    """
+    Double machine learning for partially linear regression models.
 
     Parameters
     ----------
@@ -87,6 +88,7 @@ class DoubleMLPLR(LinearScoreMixin, DoubleML):
     where :math:`Y` is the outcome variable and :math:`D` is the policy variable of interest.
     The high-dimensional vector :math:`X = (X_1, \\ldots, X_p)` consists of other confounding covariates,
     and :math:`\\zeta` and :math:`V` are stochastic errors.
+
     """
 
     def __init__(
@@ -465,6 +467,7 @@ def cate(self, basis, is_gate=False, **kwargs):
         -------
         model : :class:`doubleML.DoubleMLBLP`
             Best linear Predictor model.
+
         """
         if self._dml_data.n_treat > 1:
             raise NotImplementedError(
@@ -500,6 +503,7 @@ def gate(self, groups, **kwargs):
         -------
         model : :class:`doubleML.DoubleMLBLP`
             Best linear Predictor model for Group Effects.
+
         """
 
         if not isinstance(groups, pd.DataFrame):
@@ -530,6 +534,7 @@ def _partial_out(self):
             The residual of the regression of Y on X.
         D_tilde : :class:`numpy.ndarray`
             The residual of the regression of D on X.
+
         """
         if self.predictions is None:
             raise ValueError("predictions are None. Call .fit(store_predictions=True) to store the predictions.")
diff --git a/doubleml/plm/plr_scalar.py b/doubleml/plm/plr_scalar.py
index 77980881..7fbce66e 100644
--- a/doubleml/plm/plr_scalar.py
+++ b/doubleml/plm/plr_scalar.py
@@ -1,6 +1,4 @@
-"""
-Partially Linear Regression (PLR) model based on the new DoubleMLScalar hierarchy.
-"""
+"""Partially Linear Regression (PLR) model based on the new DoubleMLScalar hierarchy."""
 
 from __future__ import annotations
 
@@ -21,7 +19,8 @@
 
 
 class PLR(LinearScoreMixin):
-    """Double machine learning for partially linear regression models.
+    """
+    Double machine learning for partially linear regression models.
 
     Based on the DoubleMLScalar + LinearScoreMixin hierarchy.
 
@@ -38,6 +37,7 @@ class PLR(LinearScoreMixin):
         Learner for E[D|X]. Can be regressor or classifier.
     ml_g : estimator, optional
         Learner for E[Y - D*theta|X]. Only for IV-type. Must be regressor.
+
     """
 
     # Define learner specifications for PLR
@@ -70,6 +70,7 @@ def __init__(
             Learner for E[D|X]. Can be regressor or classifier.
         ml_g : estimator, optional
             Learner for E[Y - D*theta|X]. Only for IV-type. Must be regressor.
+
         """
         # Validate data
         self._check_data(obj_dml_data)
@@ -124,6 +125,7 @@ def set_learners(
         -------
         self : PLR
             The estimator with learners set.
+
         """
         for name, learner in [("ml_l", ml_l), ("ml_m", ml_m), ("ml_g", ml_g)]:
             if learner is None:
@@ -314,6 +316,7 @@ def _get_tuning_data(
         ------
         ValueError
             If ``learner_name`` is not a valid PLR learner name.
+
         """
         y = self._dml_data.y
         d = self._dml_data.d
@@ -360,7 +363,8 @@ def _get_tuning_data(
         raise ValueError(f"Unknown learner '{learner_name}' for PLR.")
 
     def _get_nuisance_targets(self) -> dict[str, np.ndarray | None]:
-        """Return target arrays for nuisance loss evaluation.
+        """
+        Return target arrays for nuisance loss evaluation.
 
         Returns y for ml_l, d for ml_m. For IV-type score, ml_g target is None because
         the adjusted outcome y - θ·d depends on the estimated parameter and varies per
@@ -412,6 +416,7 @@ def _partial_out(self) -> tuple[np.ndarray, np.ndarray]:
         -------
         Y_tilde, D_tilde : tuple[np.ndarray, np.ndarray]
             Outcome and treatment residuals, each of shape ``(n_obs, n_rep)``.
+
         """
         if self._predictions is None:
             raise ValueError("predictions are None. Call fit() first.")
@@ -451,6 +456,7 @@ def cate(self, basis: pd.DataFrame, is_gate: bool = False, **kwargs: Any) -> Dou
         -------
         model : :class:`doubleml.DoubleMLBLP`
             Best linear predictor model.
+
         """
         if self._dml_data.n_treat > 1:
             raise NotImplementedError(
@@ -483,6 +489,7 @@ def gate(self, groups: pd.DataFrame, **kwargs: Any) -> DoubleMLBLP:
         -------
         model : :class:`doubleml.DoubleMLBLP`
             Best linear predictor model for group effects.
+
         """
         if not isinstance(groups, pd.DataFrame):
             raise TypeError(f"Groups must be of DataFrame type. Groups of type {str(type(groups))} was passed.")
@@ -515,6 +522,7 @@ def _sensitivity_element_est(self) -> dict[str, np.ndarray] | None:
             Dictionary with keys ``'sigma2'``, ``'nu2'`` (shape ``(1, 1, n_rep)``),
             ``'psi_sigma2'``, ``'psi_nu2'``, ``'riesz_rep'`` (shape ``(n_obs, 1, n_rep)``).
             Returns ``None`` for callable scores (no standard Riesz representer).
+
         """
         if callable(self.score):
             return None
diff --git a/doubleml/plm/plr_vector.py b/doubleml/plm/plr_vector.py
index 87b55764..c32ee4f0 100644
--- a/doubleml/plm/plr_vector.py
+++ b/doubleml/plm/plr_vector.py
@@ -13,7 +13,8 @@
 
 
 class PLRVector(DoubleMLVector):
-    """Multi-treatment double machine learning for partially linear regression models.
+    """
+    Multi-treatment double machine learning for partially linear regression models.
 
     Orchestrates one :class:`~doubleml.plm.plr_scalar.PLR` instance per treatment column
     in ``d_cols``. Sample splits are drawn once and shared across all sub-models;
@@ -35,6 +36,7 @@ class PLRVector(DoubleMLVector):
         Learner for E[D|X]. Can be regressor or classifier.
     ml_g : estimator, optional
         Learner for E[Y - D*theta|X]. Only for IV-type. Must be regressor.
+
     """
 
     def __init__(
@@ -45,6 +47,7 @@ def __init__(
         ml_m: object | None = None,
         ml_g: object | None = None,
     ) -> None:
+        """Initialize PLRVector. See class docstring for parameter details."""
         # Validate at the vector level so the error fires before sub-model construction.
         self._check_data(obj_dml_data)
         valid_scores = ["partialling out", "IV-type"]
@@ -61,7 +64,8 @@ def __init__(
 
     @staticmethod
     def _check_data(obj_dml_data: Any) -> None:
-        """Validate the data object for PLR vector estimation.
+        """
+        Validate the data object for PLR vector estimation.
 
         Parameters
         ----------
@@ -75,6 +79,7 @@ def _check_data(obj_dml_data: Any) -> None:
             If ``obj_dml_data`` is not a :class:`~doubleml.data.DoubleMLData`.
         ValueError
             If ``obj_dml_data`` defines instrumental variables (``z_cols``).
+
         """
         if not isinstance(obj_dml_data, DoubleMLData):
             raise TypeError(
@@ -100,7 +105,8 @@ def set_learners(
         ml_m: object | None = None,
         ml_g: object | None = None,
     ) -> Self:
-        """Set the learners for nuisance estimation on every sub-model.
+        """
+        Set the learners for nuisance estimation on every sub-model.
 
         Parameters
         ----------
@@ -114,6 +120,7 @@ def set_learners(
         Returns
         -------
         self : PLRVector
+
         """
         if self._modellist is None:
             raise RuntimeError("Sub-models are not initialized. _initialize_models() must run in __init__.")
diff --git a/doubleml/utils/_checks.py b/doubleml/utils/_checks.py
index 2857823a..eaddbc45 100644
--- a/doubleml/utils/_checks.py
+++ b/doubleml/utils/_checks.py
@@ -558,6 +558,7 @@ def _check_learner(learner, learner_name, regressor=True, classifier=True):
     TypeError
         If the learner is a class instead of an instance, or lacks
         required methods (fit, set_params, get_params, predict/predict_proba).
+
     """
     err_msg_prefix = f"Invalid learner provided for {learner_name}: "
     warn_msg_prefix = f"Learner provided for {learner_name} is probably invalid: "
diff --git a/doubleml/utils/_learner.py b/doubleml/utils/_learner.py
index 83c1537f..642e02f1 100644
--- a/doubleml/utils/_learner.py
+++ b/doubleml/utils/_learner.py
@@ -1,6 +1,4 @@
-"""
-Learner specification and validation utilities for DoubleML.
-"""
+"""Learner specification and validation utilities for DoubleML."""
 
 from __future__ import annotations
 
@@ -29,6 +27,7 @@ class LearnerSpec:
         If specified, warns when using regressor with binary data.
         "outcome" checks binary_outcome, "treatment" checks binary_treatment.
         Default is ``None``.
+
     """
 
     name: str
@@ -52,6 +51,7 @@ class LearnerInfo:
         The learner object (already cloned).
     is_classifier : bool
         Whether the learner is a classifier.
+
     """
 
     learner: Any
@@ -155,6 +155,7 @@ def validate_learner(
     ValueError
         If the learner type is not allowed by the specification.
         If a classifier is used with non-binary data when required.
+
     """
     err_msg_prefix = f"Invalid learner provided for {spec.name}: "
     warn_msg_prefix = f"Learner provided for {spec.name} is probably invalid: "
@@ -198,6 +199,7 @@ def predict_nuisance(learner: Any, X: np.ndarray, is_classifier: bool) -> np.nda
     -------
     np.ndarray
         Predictions. For classifiers, returns probability of class 1.
+
     """
     if is_classifier:
         return learner.predict_proba(X)[:, 1]
diff --git a/doubleml/utils/_tune_optuna.py b/doubleml/utils/_tune_optuna.py
index f3e2a821..25eff3ba 100644
--- a/doubleml/utils/_tune_optuna.py
+++ b/doubleml/utils/_tune_optuna.py
@@ -15,6 +15,7 @@
     >>> import logging
     >>> logging.basicConfig(level=logging.INFO)
     >>> # Now you'll see tuning progress and information
+
 """
 
 import logging
@@ -85,6 +86,7 @@ class DMLOptunaResult:
 
     tuned : bool
         Indicates whether tuning was performed (True) or skipped (False).
+
     """
 
     learner_name: str
@@ -296,7 +298,8 @@ def _default_optuna_settings():
 
 
 def _resolve_optuna_scoring(scoring_method, learner, params_name):
-    """Resolve the scoring argument for an Optuna-tuned learner.
+    """
+    Resolve the scoring argument for an Optuna-tuned learner.
 
     Parameters
     ----------
@@ -315,6 +318,7 @@ def _resolve_optuna_scoring(scoring_method, learner, params_name):
     :func:`sklearn.model_selection.cross_val_score` (``None`` means use the
     estimator's default ``score``) and a human-readable message describing
     the decision for logging purposes.
+
     """
 
     if scoring_method is not None:
@@ -380,7 +384,8 @@ def _check_tuning_inputs(
     cv,
     params_name,
 ):
-    """Validate Optuna tuning inputs and normalize the cross-validation splitter.
+    """
+    Validate Optuna tuning inputs and normalize the cross-validation splitter.
 
     Parameters
     ----------
@@ -404,6 +409,7 @@ def _check_tuning_inputs(
     cross-validator or list
         Cross-validation splitter or pre-made list of ``(train, test)`` index
         pairs as returned by :func:`resolve_optuna_cv`.
+
     """
 
     if y.shape[0] != x.shape[0]:
@@ -444,6 +450,7 @@ def _get_optuna_settings(optuna_settings, params_name):
     -------
     dict
         Resolved settings dictionary.
+
     """
     default_settings = _default_optuna_settings()
 
@@ -497,6 +504,7 @@ def _create_study(settings, learner_name):
     -------
     optuna.study.Study
         The Optuna study object ready for optimization.
+
     """
 
     # Check if a study instance is provided directly
@@ -548,6 +556,7 @@ def _create_objective(param_grid_func, learner, x, y, cv, scoring_method):
     -------
     callable
         Objective function for Optuna optimization.
+
     """
     # Build scorer once; scoring_method is already resolved (non-None) by _resolve_optuna_scoring
     scorer = check_scoring(clone(learner), scoring=scoring_method)
@@ -627,6 +636,7 @@ def _dml_tune_optuna(
     -------
     DMLOptunaResult
         A tuning result containing the optuna.Study object and further information.
+
     """
 
     scoring_method, scoring_message = _resolve_optuna_scoring(scoring_method, learner, params_name)
diff --git a/doubleml/utils/blp.py b/doubleml/utils/blp.py
index c5e59d7e..406e2a7f 100644
--- a/doubleml/utils/blp.py
+++ b/doubleml/utils/blp.py
@@ -10,7 +10,8 @@
 
 
 class DoubleMLBLP:
-    """Best linear predictor (BLP) for DoubleML with orthogonal signals.
+    """
+    Best linear predictor (BLP) for DoubleML with orthogonal signals.
     Manily used for CATE and GATE estimation for IRM models.
 
     Parameters
@@ -29,6 +30,7 @@ class DoubleMLBLP:
     is_gate : bool
         Indicates whether the basis is constructed for GATEs (dummy-basis).
         Default is ``False``.
+
     """
 
     def __init__(self, orth_signal, basis, is_gate=False):
@@ -60,7 +62,8 @@ def __init__(self, orth_signal, basis, is_gate=False):
 
     @staticmethod
     def _validate_basis(basis, n_obs, n_rep):
-        """Validate ``basis`` and return a list of length ``n_rep``.
+        """
+        Validate ``basis`` and return a list of length ``n_rep``.
 
         ``basis`` may be a single ``pd.DataFrame`` (shared across reps) or a list of
         ``pd.DataFrame`` of length ``n_rep``. Per-rep DataFrames must share column names
@@ -213,6 +216,7 @@ def fit(self, cov_type="HC0", **kwargs):
         Returns
         -------
         self : object
+
         """
 
         # fit the best-linear-predictor of the orthogonal signal with respect to the grid
@@ -261,6 +265,7 @@ def confint(self, basis=None, joint=False, level=0.95, n_rep_boot=500):
         -------
         df_ci : pd.DataFrame
             A data frame with the confidence interval(s).
+
         """
         if not isinstance(joint, bool):
             raise TypeError(f"joint must be True or False. Got {str(joint)}.")

From b996235f8dc495909a14edc720cc74af7d11d6ba Mon Sep 17 00:00:00 2001
From: SvenKlaassen <sven.klaassen@uni-hamburg.de>
Date: Sat, 9 May 2026 17:39:02 +0200
Subject: [PATCH 32/38] refactor: simplify set_learners method signature by
 removing kwargs

---
 doubleml/double_ml_vector.py | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/doubleml/double_ml_vector.py b/doubleml/double_ml_vector.py
index d4cbe531..b6535d7a 100644
--- a/doubleml/double_ml_vector.py
+++ b/doubleml/double_ml_vector.py
@@ -269,7 +269,7 @@ def required_learners(self) -> list[str]:
         """
 
     @abstractmethod
-    def set_learners(self, **kwargs: object) -> Self:
+    def set_learners(self) -> Self:
         """
         Set the learners for nuisance estimation on all sub-models.
 
@@ -277,11 +277,6 @@ def set_learners(self, **kwargs: object) -> Self:
         matching their model's learners (e.g., ``ml_l``, ``ml_m`` for PLR).
         The same learners (cloned per sub-model) are applied to every treatment.
 
-        Parameters
-        ----------
-        **kwargs
-            Learner keyword arguments specific to the subclass.
-
         Returns
         -------
         self : Self

From 39a010131a29d4a665b9c559315380d732ba9ba5 Mon Sep 17 00:00:00 2001
From: SvenKlaassen <sven.klaassen@uni-hamburg.de>
Date: Sat, 9 May 2026 17:45:52 +0200
Subject: [PATCH 33/38] refactor: remove redundant pass statements in abstract
 methods and streamline sample comparison logic in tests

---
 doubleml/double_ml_base.py                         |  2 --
 doubleml/double_ml_scalar.py                       |  1 -
 doubleml/irm/tests/test_irm_scalar_exceptions.py   |  4 ----
 doubleml/plm/tests/test_plr_scalar_exceptions.py   |  2 --
 doubleml/tests/test_scalar_set_sample_splitting.py | 10 +++++-----
 5 files changed, 5 insertions(+), 14 deletions(-)

diff --git a/doubleml/double_ml_base.py b/doubleml/double_ml_base.py
index 05e80061..7892c8d4 100644
--- a/doubleml/double_ml_base.py
+++ b/doubleml/double_ml_base.py
@@ -211,7 +211,6 @@ def n_rep(self) -> int:
             Number of repetitions.
 
         """
-        pass
 
     @property
     def n_obs(self) -> int:
@@ -351,7 +350,6 @@ def fit(self, **kwargs) -> Self:
             The fitted DoubleML estimator.
 
         """
-        pass
 
     def __str__(self) -> str:
         """
diff --git a/doubleml/double_ml_scalar.py b/doubleml/double_ml_scalar.py
index a00d415e..22bcb694 100644
--- a/doubleml/double_ml_scalar.py
+++ b/doubleml/double_ml_scalar.py
@@ -315,7 +315,6 @@ def required_learners(self) -> list[str]:
             Ordered list of required learner names.
 
         """
-        pass
 
     @property
     def learners(self) -> dict[str, object]:
diff --git a/doubleml/irm/tests/test_irm_scalar_exceptions.py b/doubleml/irm/tests/test_irm_scalar_exceptions.py
index a1db7598..4cca6d2d 100644
--- a/doubleml/irm/tests/test_irm_scalar_exceptions.py
+++ b/doubleml/irm/tests/test_irm_scalar_exceptions.py
@@ -51,8 +51,6 @@ def test_irm_scalar_exception_instrument():
     df = plr_data.data.copy()
     x_cols = [c for c in df.columns if c.startswith("X")]
 
-    import doubleml as dml
-
     dml_data_iv = dml.DoubleMLData(df, y_col="y", d_cols="d", x_cols=x_cols[:-1], z_cols=x_cols[-1])
 
     msg = r"Incompatible data\. .* have been set as instrumental variable\(s\)\."
@@ -349,7 +347,5 @@ def test_exception_sensitivity_level(fitted_irm_for_sensitivity):
 @pytest.mark.ci
 def test_exception_sensitivity_null_hypothesis(fitted_irm_for_sensitivity):
     """null_hypothesis with wrong shape raises ValueError."""
-    import numpy as np
-
     with pytest.raises(ValueError, match=r"null_hypothesis"):
         fitted_irm_for_sensitivity.sensitivity_analysis(null_hypothesis=np.array([0.0, 0.0]))
diff --git a/doubleml/plm/tests/test_plr_scalar_exceptions.py b/doubleml/plm/tests/test_plr_scalar_exceptions.py
index 7d2a57b9..4bc205f1 100644
--- a/doubleml/plm/tests/test_plr_scalar_exceptions.py
+++ b/doubleml/plm/tests/test_plr_scalar_exceptions.py
@@ -198,7 +198,5 @@ def test_exception_sensitivity_level(fitted_plr_for_sensitivity):
 @pytest.mark.ci
 def test_exception_sensitivity_null_hypothesis(fitted_plr_for_sensitivity):
     """null_hypothesis with wrong shape raises ValueError."""
-    import numpy as np
-
     with pytest.raises(ValueError, match=r"null_hypothesis"):
         fitted_plr_for_sensitivity.sensitivity_analysis(null_hypothesis=np.array([0.0, 0.0]))
diff --git a/doubleml/tests/test_scalar_set_sample_splitting.py b/doubleml/tests/test_scalar_set_sample_splitting.py
index bc9abd84..a049a984 100644
--- a/doubleml/tests/test_scalar_set_sample_splitting.py
+++ b/doubleml/tests/test_scalar_set_sample_splitting.py
@@ -9,11 +9,11 @@
 
 def _assert_smpls_equal(smpls0, smpls1):
     assert len(smpls0) == len(smpls1)
-    for i_rep in range(len(smpls0)):
-        assert len(smpls0[i_rep]) == len(smpls1[i_rep])
-        for i_fold in range(len(smpls0[i_rep])):
-            assert np.array_equal(smpls0[i_rep][i_fold][0], smpls1[i_rep][i_fold][0])
-            assert np.array_equal(smpls0[i_rep][i_fold][1], smpls1[i_rep][i_fold][1])
+    for rep0, rep1 in zip(smpls0, smpls1):
+        assert len(rep0) == len(rep1)
+        for fold0, fold1 in zip(rep0, rep1):
+            assert np.array_equal(fold0[0], fold1[0])
+            assert np.array_equal(fold0[1], fold1[1])
 
 
 @pytest.mark.ci

From 9913dbf917c7504b0607e2fa9763b4f087dac2ea Mon Sep 17 00:00:00 2001
From: SvenKlaassen <sven.klaassen@uni-hamburg.de>
Date: Sat, 9 May 2026 20:09:04 +0200
Subject: [PATCH 34/38] refactor: remove redundant pass statement in
 DoubleMLScalar class

---
 doubleml/double_ml_scalar.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/doubleml/double_ml_scalar.py b/doubleml/double_ml_scalar.py
index 22bcb694..5ab55bdd 100644
--- a/doubleml/double_ml_scalar.py
+++ b/doubleml/double_ml_scalar.py
@@ -423,7 +423,6 @@ def set_learners(self) -> Self:
             The estimator with learners set.
 
         """
-        pass
 
     # ==================== Concrete fit() Method (Template) ====================
 

From bd75efb181018282dba4edc6996ee6b0e20b8058 Mon Sep 17 00:00:00 2001
From: SvenKlaassen <sven.klaassen@uni-hamburg.de>
Date: Sat, 9 May 2026 20:21:40 +0200
Subject: [PATCH 35/38] refactor: remove redundant pass statements in abstract
 methods of DoubleMLScalar class

---
 doubleml/double_ml_scalar.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/doubleml/double_ml_scalar.py b/doubleml/double_ml_scalar.py
index 5ab55bdd..d0b5e323 100644
--- a/doubleml/double_ml_scalar.py
+++ b/doubleml/double_ml_scalar.py
@@ -1068,7 +1068,6 @@ def _get_nuisance_targets(self) -> dict[str, np.ndarray | None]:
             or ``None`` where targets are not available.
 
         """
-        pass
 
     @abstractmethod
     def _nuisance_est(
@@ -1105,7 +1104,6 @@ def _nuisance_est(
             pre-filled in self._predictions.
 
         """
-        pass
 
     @abstractmethod
     def _get_score_elements(self) -> dict[str, np.ndarray]:
@@ -1132,7 +1130,6 @@ def _get_score_elements(self) -> dict[str, np.ndarray]:
             return {'psi_a': psi_a, 'psi_b': psi_b}
 
         """
-        pass
 
     @abstractmethod
     def _est_causal_pars_and_se(self, psi_elements: dict[str, np.ndarray]) -> None:
@@ -1162,7 +1159,6 @@ def _est_causal_pars_and_se(self, psi_elements: dict[str, np.ndarray]) -> None:
         - self._var_scaling_factors should have shape (n_thetas,)
 
         """
-        pass
 
     # ==================== Hyperparameter Tuning ====================
 

From 93247fda1ceceacfbc6150da4d54aa796c916bdb Mon Sep 17 00:00:00 2001
From: SvenKlaassen <sven.klaassen@uni-hamburg.de>
Date: Sat, 9 May 2026 20:57:49 +0200
Subject: [PATCH 36/38] refactor: simplify docstring for set_learners method in
 PLRVector class

---
 doubleml/plm/plr_vector.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doubleml/plm/plr_vector.py b/doubleml/plm/plr_vector.py
index c32ee4f0..dcc534bb 100644
--- a/doubleml/plm/plr_vector.py
+++ b/doubleml/plm/plr_vector.py
@@ -106,7 +106,7 @@ def set_learners(
         ml_g: object | None = None,
     ) -> Self:
         """
-        Set the learners for nuisance estimation on every sub-model.
+        Set learners for nuisance estimation on every sub-model.
 
         Parameters
         ----------

From d56d105bb16b4a79247bc9d0a89438b056cd4ece Mon Sep 17 00:00:00 2001
From: SvenKlaassen <sven.klaassen@uni-hamburg.de>
Date: Sat, 9 May 2026 21:00:35 +0200
Subject: [PATCH 37/38] refactor: add doctest skip directive to
 evaluate_learners examples in DoubleMLScalar class

---
 doubleml/double_ml_scalar.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/doubleml/double_ml_scalar.py b/doubleml/double_ml_scalar.py
index d0b5e323..265006b9 100644
--- a/doubleml/double_ml_scalar.py
+++ b/doubleml/double_ml_scalar.py
@@ -953,9 +953,9 @@ def evaluate_learners(
         Examples
         --------
         >>> from sklearn.metrics import root_mean_squared_error, r2_score, log_loss
-        >>> model.evaluate_learners()
-        >>> model.evaluate_learners(metric=r2_score)
-        >>> model.evaluate_learners(learners=["ml_m"], metric=log_loss)
+        >>> model.evaluate_learners()  # doctest: +SKIP
+        >>> model.evaluate_learners(metric=r2_score)  # doctest: +SKIP
+        >>> model.evaluate_learners(learners=["ml_m"], metric=log_loss)  # doctest: +SKIP
 
         """
         if self._nuisance_targets is None:

From 3ffa823754671b3375af0098e75a7a8d5b857d1a Mon Sep 17 00:00:00 2001
From: SvenKlaassen <sven.klaassen@uni-hamburg.de>
Date: Sat, 9 May 2026 21:58:08 +0200
Subject: [PATCH 38/38] refactor: enhance basis validation in DoubleMLPLR and
 PLR classes

---
 doubleml/plm/plr.py               | 16 +++++++++++++++-
 doubleml/plm/plr_scalar.py        | 16 +++++++++++++++-
 doubleml/tests/test_exceptions.py |  4 ++--
 3 files changed, 32 insertions(+), 4 deletions(-)

diff --git a/doubleml/plm/plr.py b/doubleml/plm/plr.py
index 13182745..cba17b7d 100644
--- a/doubleml/plm/plr.py
+++ b/doubleml/plm/plr.py
@@ -476,7 +476,21 @@ def cate(self, basis, is_gate=False, **kwargs):
 
         Y_tilde, D_tilde = self._partial_out()
 
-        basis_per_rep = [basis.multiply(D_tilde[:, i_rep], axis=0) for i_rep in range(self.n_rep)]
+        if isinstance(basis, pd.DataFrame):
+            basis_list = [basis] * self.n_rep
+        elif isinstance(basis, list):
+            if len(basis) != self.n_rep:
+                raise ValueError(f"When basis is a list it must have length n_rep={self.n_rep}. Got length {len(basis)}.")
+            if not all(isinstance(b, pd.DataFrame) for b in basis):
+                raise TypeError("All entries of basis list must be of DataFrame type.")
+            basis_list = basis
+        else:
+            raise TypeError(
+                f"The basis must be of DataFrame type or a list of DataFrames. "
+                f"Basis of type {str(type(basis))} was passed."
+            )
+
+        basis_per_rep = [basis_list[i_rep].multiply(D_tilde[:, i_rep], axis=0) for i_rep in range(self.n_rep)]
         model = DoubleMLBLP(
             orth_signal=Y_tilde,
             basis=basis_per_rep,
diff --git a/doubleml/plm/plr_scalar.py b/doubleml/plm/plr_scalar.py
index 7fbce66e..b11993fd 100644
--- a/doubleml/plm/plr_scalar.py
+++ b/doubleml/plm/plr_scalar.py
@@ -465,8 +465,22 @@ def cate(self, basis: pd.DataFrame, is_gate: bool = False, **kwargs: Any) -> Dou
         if self._predictions is None:
             raise ValueError("CATE requires a fitted model. Call fit() first.")
 
+        if isinstance(basis, pd.DataFrame):
+            basis_list = [basis] * self.n_rep
+        elif isinstance(basis, list):
+            if len(basis) != self.n_rep:
+                raise ValueError(f"When basis is a list it must have length n_rep={self.n_rep}. Got length {len(basis)}.")
+            if not all(isinstance(b, pd.DataFrame) for b in basis):
+                raise TypeError("All entries of basis list must be of DataFrame type.")
+            basis_list = basis
+        else:
+            raise TypeError(
+                f"The basis must be of DataFrame type or a list of DataFrames. "
+                f"Basis of type {str(type(basis))} was passed."
+            )
+
         Y_tilde, D_tilde = self._partial_out()
-        basis_per_rep = [basis.multiply(D_tilde[:, i_rep], axis=0) for i_rep in range(self.n_rep)]
+        basis_per_rep = [basis_list[i_rep].multiply(D_tilde[:, i_rep], axis=0) for i_rep in range(self.n_rep)]
 
         model = DoubleMLBLP(orth_signal=Y_tilde, basis=basis_per_rep, is_gate=is_gate)
         model.fit(**kwargs)
diff --git a/doubleml/tests/test_exceptions.py b/doubleml/tests/test_exceptions.py
index 7d164054..4fa99240 100644
--- a/doubleml/tests/test_exceptions.py
+++ b/doubleml/tests/test_exceptions.py
@@ -1418,7 +1418,7 @@ def test_doubleml_exception_cate():
         n_rep=2,
     )
     dml_irm_obj.fit()
-    msg = "The basis must be of DataFrame type. Basis of type <class 'int'> was passed."
+    msg = r"The basis must be of DataFrame type or a list of DataFrames\. Basis of type <class 'int'> was passed\."
     with pytest.raises(TypeError, match=msg):
         dml_irm_obj.cate(basis=2)
 
@@ -1427,7 +1427,7 @@ def test_doubleml_exception_cate():
 def test_doubleml_exception_plr_cate():
     dml_plr_obj = DoubleMLPLR(dml_data, ml_l=Lasso(), ml_m=Lasso(), n_folds=2, n_rep=2)
     dml_plr_obj.fit()
-    msg = "The basis must be of DataFrame type. Basis of type <class 'numpy.ndarray'> was passed."
+    msg = r"The basis must be of DataFrame type or a list of DataFrames\. Basis of type <class 'int'> was passed\."
     with pytest.raises(TypeError, match=msg):
         dml_plr_obj.cate(basis=2)