diff --git a/.github/workflows/testing.yml b/.github/workflows/testing.yml index 83242a1..6756cdc 100644 --- a/.github/workflows/testing.yml +++ b/.github/workflows/testing.yml @@ -16,7 +16,7 @@ jobs: fail-fast: false matrix: os: [windows-latest, ubuntu-latest, macos-latest] - python-version: ['3.9', '3.10', '3.11', '3.12'] + python-version: ['3.10', '3.11', '3.12', '3.13', '3.14'] runs-on: ${{ matrix.os }} env: PIP_NO_INPUT: "1" @@ -25,6 +25,9 @@ jobs: PIP_RETRIES: "5" steps: - uses: actions/checkout@v4 + - name: Install OpenMP + if: runner.os == 'macOS' + run: brew install libomp - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v5 with: diff --git a/README.md b/README.md index df4c465..fe29c57 100644 --- a/README.md +++ b/README.md @@ -5,27 +5,26 @@ # Probmetrics: Classification metrics and post-hoc calibration This package (PyTorch-based) currently contains -- classification metrics, especially also -metrics for assessing the quality of probabilistic predictions, and -- post-hoc calibration methods, especially - - a fast and accurate implementation of temperature scaling. - - an implementation of structured matrix scaling (SMS), - a regularized version of matrix scaling that outperforms other - logistic-based calibration functions. - -It accompanies our papers -[Rethinking Early Stopping: Refine, Then Calibrate](https://arxiv.org/abs/2501.19195) and [Structured Matrix Scaling for Multi-Class Calibration](https://arxiv.org/abs/2511.03685) -and [A Variational Estimator for Lp Calibration Errors](https://arxiv.org/abs/2602.24230). +- post-hoc calibration methods, in particular: + - a fast and accurate temperature scaling implementation described in [1] + - an implementation of structured matrix scaling (SMS), a regularized version of matrix scaling introduced in [2] + - implementations for all the post-hoc calibration methods in the CalArena benchmark [4] +- classification metrics, especially metrics for assessing the quality of probabilistic + predictions, in particular: + - our classifier based $L_p$ calibration error estimators from [3] + +It accompanies our papers +[1] [Rethinking Early Stopping: Refine, Then Calibrate](https://arxiv.org/abs/2501.19195) +(see also: [vision experiments](https://github.com/eugeneberta/RefineThenCalibrate-Vision), [tabular experiments](https://github.com/dholzmueller/pytabkit), [theory](https://github.com/eugeneberta/RefineThenCalibrate-Theory)) +[2] [Structured Matrix Scaling for Multi-Class Calibration](https://arxiv.org/abs/2511.03685) +(see also: [experiments](https://github.com/eugeneberta/LogisticCalibrationBenchmark)) +[3] [A Variational Estimator for Lp Calibration Errors](https://arxiv.org/abs/2602.24230) +(see also [all experiments](https://github.com/ElSacho/Evaluating_Lp_Calibration_Errors)) +[4] [CalArena: A Large-Scale Post-Hoc Calibration Benchmark](link_to_come) +(see also [leaderboards and experiments](https://github.com/probkit/CalArena)) + Please cite us if you use this repository for research purposes. -The experiments from the papers can be found here: -- Rethinking Early Stopping: - - [vision experiments](https://github.com/eugeneberta/RefineThenCalibrate-Vision). - - [tabular experiments](https://github.com/dholzmueller/pytabkit). - - [theory](https://github.com/eugeneberta/RefineThenCalibrate-Theory). -- Structured Matrix Scaling: - [all experiments](https://github.com/eugeneberta/LogisticCalibrationBenchmark). -- A Variational Estimator for Lp Calibration Errors: - [all experiments](https://github.com/ElSacho/Evaluating_Lp_Calibration_Errors). + ## Installation @@ -34,31 +33,40 @@ Probmetrics is available via pip install probmetrics ``` To obtain all functionality, install `probmetrics[extra,dev,dirichletcal]`. -- extra installs more packages for our CatBoost/LightGBM-based $L_p$ calibration - error metrics, smooth ECE (only works with scikit-learn versions <= 1.6), - Venn-Abers calibration, - centered isotonic regression, - and the temperature scaling implementation in NetCal. -- dev installs more packages for development (esp. testing) -- dirichletcal installs Dirichlet calibration, - which however only works for Python 3.12 upwards. +- extra installs: + - numba for SAGA based optimizers in logistic calibrators SVS, SMS and others. + - relplot for smooth ECE (only works with scikit-learn versions <= 1.6). + - catboost, lightgbm for our classifier-based $L_p$ calibration error metrics. + - other packages supporting more post-hoc calibration methods: netcal (BBQ, ENIR, Netcal TS), + uncertainty-calibration (Scaling-binning), betacal (Beta calibration), splinecalib + (Spline calibration), venn-abers (Venn-Abers calibration), cir-model (CIR), scipy + (ETS). +- dev installs more packages for development (especially testing) +- dirichletcal installs Dirichlet calibration, which however only works for Python 3.12 +upwards. ## Using post-hoc calibration methods -You can create a calibrator as follows: +You can create a calibrator by importing it from `probmetrics.calibrators` +```python +from probmetrics.calibrators import TemperatureScalingCalibrator + +calib = TemperatureScalingCalibrator() +``` + +or using the `get_calibrator` factory function: ```python from probmetrics.calibrators import get_calibrator -calib = get_calibrator('logistic') +calib = get_calibrator("logistic") ``` -These are the main supported methods: + + +You can find all post-hoc calibration functions implemented in `probmetrics/calibrators`. +A subset of those methods is supported by the `get_calibrator` function, find details in `probmetrics/calibrators/factory.py`. -More details on parameters and other methods can be found in the get_calibrator function -[here](https://github.com/dholzmueller/probmetrics/probmetrics/calibrators.py). -### Usage with `numpy` +### Usage with numpy ```python import numpy as np probas = np.asarray([[0.1, 0.9]]) # shape = (n_samples, n_classes) labels = np.asarray([1]) # shape = (n_samples,) + calib.fit(probas, labels) -calibrated_probas = calib.predict_proba(probas) +calibrated_probas = calib.predict_proba(probas) # shape = (n_samples, n_classes) ``` ### Usage with PyTorch -The PyTorch version can be used directly with GPU tensors, -which is leveraged by our temperature scaling implementation -but not by most other methods. -For temperature scaling, this could accelerate things, -but the CPU version can be faster +The PyTorch version can be used directly with GPU tensors, which is leveraged by our +temperature scaling implementation but not by most other methods. +For temperature scaling, this could accelerate things, but the CPU version can be faster for smaller validation sets (around 1K-10K samples). ```python @@ -116,74 +123,83 @@ result = calib.predict_proba_torch(CategoricalProbs(probas)) calibrated_probas = result.get_probs() ``` +## Using metrics -## Using our refinement and calibration metrics - -We provide estimators for refinement error -(loss after post-hoc calibration) -and calibration error -(loss improvement through post-hoc calibration). -They can be used as follows: +We provide classification and calibration metrics, which can be used as follows: ```python import torch from probmetrics.metrics import Metrics -# compute multiple metrics at once -# this is more efficient than computing them individually -metrics = Metrics.from_names(['logloss', - 'refinement_logloss_ts-mix_all', - 'calib-err_logloss_ts-mix_all']) +# computing multiple metrics at once is more efficient than computing them individually +metrics = Metrics.from_names([ + "logloss", + "brier", # for binary, this is 2x the brier from sklearn + "accuracy", + "class-error", + "auroc-ovr", # one-vs-rest + "auroc-ovo-sklearn", # one-vs-one (can be slow!) + "ece-15", # Expected calibration error with 15 bins + "rmsce-15", # Root mean squared calibration error with 15 bins + "mce-15", # Maximum calibration error with 15 bins + "smece", # Smooth ECE, requires the relplot package. +]) + y_true = torch.tensor(...) -y_logits = torch.tensor(...) -results = metrics.compute_all_from_labels_logits(y_true, y_logits) -print(results['refinement_logloss_ts-mix_all'].item()) +y_pred = torch.tensor(...) + +results = metrics.compute_all_from_labels_probs(y_true, y_pred) +print(results["brier"].item()) ``` -## Using more metrics +While there are some classes for regression metrics, they are not implemented yet. + +The following function returns a list of all metric names: +```python +from probmetrics.metrics import Metrics, MetricType +Metrics.get_available_names(metric_type=MetricType.CLASS) +``` + +### Using our refinement and calibration metrics + +We provide estimators for refinement error (loss after post-hoc calibration) and +calibration error (loss improvement through post-hoc calibration). +They can be used as follows: -In general, while some metrics can be -flexibly configured using the corresponding classes, -many metrics are available through their name. -Here are some relevant classification metrics: ```python from probmetrics.metrics import Metrics metrics = Metrics.from_names([ - 'logloss', - 'brier', # for binary, this is 2x the brier from sklearn - 'accuracy', 'class-error', - 'auroc-ovr', # one-vs-rest - 'auroc-ovo-sklearn', # one-vs-one (can be slow!) - # calibration metrics - 'ece-15', 'rmsce-15', 'mce-15', 'smece' - 'refinement_logloss_ts-mix_all', - 'calib-err_logloss_ts-mix_all', - 'refinement_brier_ts-mix_all', - 'calib-err_brier_ts-mix_all', - 'calib-err_proper-L1-binary-as-1d_WS_CatboostClassifier_all', - 'calib-err_proper-L2-binary-as-1d_WS_CatboostClassifier_all', - 'calib-err_proper-Linf-binary-as-1d_WS_CatboostClassifier_all', + # Mean test logloss after post-hoc calibration with ts-mix: + "refinement_logloss_ts-mix_all", + + # Difference in mean test logloss before/after calibration with ts-mix: + "calib-err_logloss_ts-mix_all", + + # Same thing for Brier score: + "refinement_brier_ts-mix_all", + "calib-err_brier_ts-mix_all", + + # Using the L1, L2 and Linf calibration error estimators (CatBoost based) described + # in [3]: + "calib-err_proper-L1-binary-as-1d_WS_CatboostClassifier_all", + "calib-err_proper-L2-binary-as-1d_WS_CatboostClassifier_all", + "calib-err_proper-Linf-binary-as-1d_WS_CatboostClassifier_all", ]) -``` -The following function returns a list of all metric names: -```python -from probmetrics.metrics import Metrics, MetricType -Metrics.get_available_names(metric_type=MetricType.CLASS) -``` +y_true = torch.tensor(...) +y_logits = torch.tensor(...) -While there are some classes for regression metrics, they are not implemented. +results = metrics.compute_all_from_labels_logits(y_true, y_logits) +print(results['refinement_logloss_ts-mix_all'].item()) +``` -## Advanced calibration, confidence, and top-class metrics +### Advanced calibration, confidence, and top-class metrics Beyond standard metrics, you can evaluate proper Lp calibration errors for any p, as well as isolate specific types of errors like over-confidence, under-confidence, and top-class errors. -**Note:** Over- and under-confidence metrics are designed for binary classification. -To use those for multi-class, please use `TopClassLoss(OverConfidenceLoss(your_metric))`. - ```python from probmetrics.metrics import ( ProperLpLoss, @@ -212,17 +228,23 @@ over_topclass_brier = TopClassLoss(OverConfidenceLoss(BrierLoss())) # Some metrics are listed by default, here are some of them metrics = metrics = Metrics.from_names([ - 'proper-L1-binary-as-1d', # use to estimate E[ \| Y - E[Y|f(X)] \|_1 ] and treat binary - # predictions as scalars with shapes (n,1) ) - 'proper-L2', # use to estimate E[ \| Y - E[Y|f(X)] \|_2 ] (and treat binary predictions - # as vector with shapes (n,2) ) + # Estimate E[ \| Y - E[Y|f(X)] \|_1 ] and treat binary predictions as scalars with + # shapes (n,1) + "proper-L1-binary-as-1d" + + # Estimate E[ \| Y - E[Y|f(X)] \|_2 ] and treat binary predictions as vectors with + # shapes (n,2) + "proper-L2", + "topclass-proper-L1-binary-as-1d", # Estimate L1 calibration error of top class "topclass-under-proper-L1-binary-as-1d", # Estimate L1-overconfidence of top class "topclass-over-proper-L1-binary-as-1d", # Estimate L1-underconfidence of top class ]) - ``` +> **Note:** Over- and under-confidence metrics are designed for binary classification. +For multi-class in a top-class fashion, please use `TopClassLoss(OverConfidenceLoss(your_metric))`. + Once those losses are defined, you can evaluate the calibration error by doing: ```python @@ -232,53 +254,51 @@ from probmetrics.splitters import CVSplitter loss = ProperLpLoss(p=2) -metrics = MetricsWithCalibration(loss, - calibrator=WS_CatboostClassifier(), # The classifier used to recalibrate the predictions - val_splitter=CVSplitter(n_cv=5) # cross-validation splitter - ) - -# or use combined metrics to evaluate multiple metrics -# while fitting the post-hoc calibrator only once -combined_losses = CombinedMetrics( - [ - ProperLpLoss(p=1), - OverConfidenceLoss.from_name("brier"), - OverConfidenceLoss.from_name("proper-L1") , - UnderConfidenceLoss.from_name("proper-L1" ), - UnderConfidenceLoss( BrierLoss() ), - BrierLoss() - ] - ) - -metrics = MetricsWithCalibration(combined_losses, - calibrator=WS_LGBMClassifier(), - val_splitter=CVSplitter(n_cv=5) - ) +metrics = MetricsWithCalibration( + loss, + calibrator=WS_CatboostClassifier(), # classifier used to recalibrate the predictions + val_splitter=CVSplitter(n_cv=5) # cross-validation splitter +) + +# Or use combined metrics to evaluate multiple metrics while fitting the post-hoc +# calibrator only once +combined_losses = CombinedMetrics([ + ProperLpLoss(p=1), + OverConfidenceLoss.from_name("brier"), + OverConfidenceLoss.from_name("proper-L1") , + UnderConfidenceLoss.from_name("proper-L1" ), + UnderConfidenceLoss(BrierLoss()), + BrierLoss() +]) + +metrics = MetricsWithCalibration( + combined_losses, + calibrator=WS_LGBMClassifier(), + val_splitter=CVSplitter(n_cv=5) +) y_true = torch.tensor(...) y_prob = torch.tensor(...) + results = metrics.compute_all_from_labels_probs(y_true, y_prob) ``` -The `calibrator` argument is a class used to recalibrate the original predictions. -Any estimator that inherits from sklearn.base.ClassifierMixin (i.e., follows the -scikit-learn classifier API) and implements `predict_proba()` can be used. +The `calibrator` argument is used to recalibrate the original predictions. +Any class inheriting from sklearn.base.ClassifierMixin (i.e., follows the scikit-learn +classifier API) and implementing `predict_proba()` can be used. We recommend using `WS_CatboostClassifier` with default parameters. -The "WS" stands for "Warm Start", as predictions are initialized at the -original predicted $f(x)$ values (see the paper [A Variational Estimator for Lp -Calibration Errors](https://arxiv.org/abs/2602.24230) for additional information). - +"WS" stands for "Warm Started", as predictions are initialized at the original predicted +$f(x)$ values (see [3]). -### Binary vs. multiclass formatting +#### Binary vs. multiclass formatting -The library internally stores predictions in a multiclass format -with shape `(n_samples, n_classes)`. -For binary classification, for some metrics -you can control whether to treat the output as a two-column distribution -or a single-column probability using the `binary_as_multiclass` parameter. -For example, for `BrierLoss()`, using `binary_as_multiclass=False` -will yield the scikit-learn formula, while `binary_as_multiclass=True` -will yield twice the value. +The library internally stores predictions in a multiclass format with shape +`(n_samples, n_classes)`. +For binary classification, for some metrics you can control whether to treat the output +as a two-column distribution or a single-column probability using the +`binary_as_multiclass` parameter. +For example, for `BrierLoss()`, using `binary_as_multiclass=False` will yield the +scikit-learn formula, while `binary_as_multiclass=True` will yield twice the value. Setting `binary_as_multiclass=False` tells the loss function to treat `(n_samples, 2)` predictions as a single-column `(n_samples, 1)` probability. @@ -306,9 +326,22 @@ based on $f(X)$ instead of $g(f(X))$ so the loss difference uses the same choice ## Releases +- v1.3.0 by [@eugeneberta](https://github.com/eugeneberta): + - Splitted calibrators.py into several subfiles and implemented many new calibrators, + most of which are benchmarked and ranked in the CalArena leaderboard. + Among others, added Binary histogram binning, Scaling-binning (from + uncertainty-calibration), BBQ (from netcal), Beta calibration (from betacal), ENIR + (from netcal), binary and multiclass Kernel calibration (using beta and dirichlet + kernels from ece_kde), Spline calibration (from splinecalib), CDF-Spline calibration, + Ensemble Temperature Scaling, tree based calibration with CatBoost, LightGBM and + XGBoost. + - Re-structured the base `Calibrator` class to differentiate + `_predict_proba_torch_impl` from `_predict_proba_impl`. + - Added Kuiper and Kolmogorov-Smirnov binary calibration metrics. + - Deprecated python 3.9, added python 3.13 and 3.14 support. - v1.2.0 by [@elsacho](https://github.com/elsacho): Added new proper loss functions: - - ProperLpLoss(p=p): Metrics to evaluate $E[ \Vert f(X) - E[Y|f(X)] \Vert_p ]$ where $f(X)$ are the - predictions of the classifier, $p >= 1$, including `p=float("inf")` + - ProperLpLoss(p=p): Metrics to evaluate $E[ \Vert f(X) - E[Y|f(X)] \Vert_p ]$ where + $f(X)$ are the predictions of the classifier, $p >= 1$, including `p=float("inf")` - TopClassLoss: A wrapper to variationally evaluate top-class errors. - OverConfidenceLoss & UnderConfidenceLoss: Wrappers to variationally evaluate over/under-confidence in binary predictors. @@ -316,7 +349,8 @@ based on $f(X)$ instead of $g(f(X))$ so the loss difference uses the same choice - New classifiers: Added `WS_CatboostClassifier` and `WS_LGBMClassifier` for evaluating calibration errors. - removed sklearn < 1.7 constraint. -- v1.1.0 by [@eugeneberta](https://github.com/eugeneberta): Improvements to the SVS and SMS calibrators: +- v1.1.0 by [@eugeneberta](https://github.com/eugeneberta): Improvements to the SVS and + SMS calibrators: - logit pre-processing with `'ts-mix'` is now automatic, and the global scaling parameter $\alpha$ is fixed to 1. This yields: - improved performance on our tabular and computer vision benchmarks @@ -329,10 +363,9 @@ based on $f(X)$ instead of $g(f(X))$ so the loss difference uses the same choice - the default binary calibrator in `LogisticCalibrator` is now quadratic scaling instead of affine scaling, this can be changed back by using `LogisticCalibrator(binary_type='affine')`. -- v1.0.0 by [@eugeneberta](https://github.com/eugeneberta): New post-hoc calibrators like `'logistic'` - including structured matrix scaling (SMS), - structured vector scaling (SVS), - affine scaling, and quadratic scaling. +- v1.0.0 by [@eugeneberta](https://github.com/eugeneberta): New post-hoc calibrators + like `'logistic'` including structured matrix scaling (SMS), structured vector scaling + (SVS), affine scaling, and quadratic scaling. - v0.0.2 by [@dholzmueller](https://github.com/dholzmueller): - Removed numpy<2.0 constraint - allow 1D vectors in CategoricalLogits / CategoricalProbs diff --git a/probmetrics/__about__.py b/probmetrics/__about__.py index 881244a..d63e145 100644 --- a/probmetrics/__about__.py +++ b/probmetrics/__about__.py @@ -1,2 +1,2 @@ # SPDX-License-Identifier: Apache-2.0 -__version__ = "1.2.0" +__version__ = "1.3.0" diff --git a/probmetrics/calibrators.py b/probmetrics/calibrators.py deleted file mode 100644 index ef01281..0000000 --- a/probmetrics/calibrators.py +++ /dev/null @@ -1,1858 +0,0 @@ -import warnings -from typing import Callable, Optional, List - -import torch -import sklearn -import logging -import torchmin -import numpy as np - -from probmetrics.saga import ( - fit_affine_scaling, - warm_up_affine_scaling, - fit_quadratic_scaling, - warm_up_quadratic_scaling, - fit_vector_scaling, - warm_up_vector_scaling, - fit_matrix_scaling, - warm_up_matrix_scaling, -) -from probmetrics.utils import ( - process_binary_probs, - binary_probs_to_logits, - multiclass_probs_to_logits, - validate_probabilities, -) -from sklearn.base import BaseEstimator, ClassifierMixin -from sklearn.calibration import CalibratedClassifierCV -from sklearn.model_selection import StratifiedKFold, GridSearchCV -from sklearn.preprocessing import LabelEncoder - -from torch import nn - -from probmetrics.distributions import ( - CategoricalDistribution, - CategoricalProbs, - CategoricalLogits, -) - -logger = logging.getLogger(__name__) - - -class Calibrator(BaseEstimator, ClassifierMixin): - """ - Calibrator base class. To implement, - - override at least one of _fit_impl and _fit_torch_impl - - override at least one of predict_proba and predict_proba_torch - """ - - def __init__(self): - assert self.__class__.fit == Calibrator.fit - assert self.__class__.fit_torch == Calibrator.fit_torch - - def fit(self, X, y) -> "Calibrator": - assert isinstance(X, np.ndarray) - assert isinstance(y, np.ndarray) - - self.classes_ = list(range(X.shape[-1])) - - if self.__class__._fit_impl != Calibrator._fit_impl: - self._fit_impl(X, y) - return self - - if self.__class__._fit_torch_impl != Calibrator._fit_torch_impl: - self._fit_torch_impl( - y_pred=CategoricalProbs(torch.as_tensor(X)), - y_true_labels=torch.as_tensor(y, dtype=torch.long), - ) - return self - - raise NotImplementedError() - - def _fit_impl(self, X: np.ndarray, y: np.ndarray) -> None: - raise NotImplementedError() - - def fit_torch( - self, y_pred: CategoricalDistribution, y_true_labels: torch.Tensor - ) -> "Calibrator": - assert isinstance(y_true_labels, torch.Tensor) - assert isinstance(y_pred, CategoricalDistribution) - # default implementation, using sklearn - self.classes_ = list(range(y_pred.get_n_classes())) - - if self.__class__._fit_torch_impl != Calibrator._fit_torch_impl: - self._fit_torch_impl(y_pred, y_true_labels) - return self - - if self.__class__._fit_impl != Calibrator._fit_impl: - self._fit_impl( - y_pred.get_probs().detach().cpu().numpy(), - y_true_labels.detach().cpu().numpy(), - ) - return self - - raise NotImplementedError() - - def _fit_torch_impl( - self, y_pred: CategoricalDistribution, y_true_labels: torch.Tensor - ): - raise NotImplementedError() - - def predict_proba(self, X: np.ndarray) -> np.ndarray: - if self.__class__.predict_proba_torch != Calibrator.predict_proba_torch: - return ( - self.predict_proba_torch(CategoricalProbs(torch.as_tensor(X))) - .get_probs() - .numpy() - ) - - raise NotImplementedError() - - def predict_proba_torch( - self, y_pred: CategoricalDistribution - ) -> CategoricalDistribution: - if self.__class__.predict_proba != Calibrator.predict_proba: - y_pred_probs = y_pred.get_probs() - probs = self.predict_proba(y_pred_probs.detach().cpu().numpy()) - return CategoricalProbs( - torch.as_tensor( - probs, device=y_pred_probs.device, dtype=y_pred_probs.dtype - ) - ) - - raise NotImplementedError() - - def predict(self, X): - y_probs = self.predict_proba(X) - class_idxs = np.argmax(y_probs, axis=-1) - return np.asarray(self.classes_)[class_idxs] - - -class ApplyToLogitsCalibrator(Calibrator): - def __init__(self, calib: BaseEstimator): - super().__init__() - self.calib = calib - - def _fit_impl(self, X: np.ndarray, y: np.ndarray) -> None: - self.calib_ = sklearn.base.clone(self.calib) - self.calib_.fit(np.log(X + 1e-10), y) - - def predict_proba(self, X): - return self.calib_.predict_proba(np.log(X + 1e-10)) - - -def bisection_search(f: Callable[[float], float], a: float, b: float, n_steps: int): - for _ in range(n_steps): - c = a + 0.5 * (b - a) - f_c = f(c) - if f_c > 0: - b = c - else: - a = c - - return 0.5 * (a + b) - - -class TemperatureScalingCalibrator(Calibrator): - def __init__( - self, - opt: str = "bisection", - max_bisection_steps: int = 30, - lr: float = 0.1, - max_iter: int = 200, - use_inv_temp: bool = True, - inv_temp_init: float = 1 / 1.5, - ): - super().__init__() - self.lr = lr - self.max_bisection_steps = max_bisection_steps - self.max_iter = max_iter - self.use_inv_temp = use_inv_temp - self.inv_temp_init = inv_temp_init - self.opt = opt - - def _get_loss_grad(self, invtemp: float, logits: torch.Tensor, y: torch.Tensor): - part_1 = torch.mean( - torch.sum(logits * torch.softmax(invtemp * logits, dim=-1), dim=-1) - ) - part_2 = torch.mean(logits[torch.arange(logits.shape[0]), y]) - return (part_1 - part_2).item() - - def _fit_torch_impl( - self, y_pred: CategoricalDistribution, y_true_labels: torch.Tensor - ): - logits = y_pred.get_logits() - labels = y_true_labels - - if self.opt in ["lbfgs", "lbfgs_line_search"]: - self._fit_lbfgs(logits, labels) - elif self.opt == "bisection": - self._fit_bisection(logits, labels) - else: - raise ValueError(f'Unknown optimizer "{self.opt}"') - - # print(f'{self.invtemp_=}') - - def _fit_lbfgs(self, logits: torch.Tensor, labels: torch.Tensor): - # following https://github.com/gpleiss/temperature_scaling/blob/master/temperature_scaling.py - param = nn.Parameter( - torch.ones(1, device=logits.device) - * (self.inv_temp_init if self.use_inv_temp else 1 / self.inv_temp_init) - ) - criterion = nn.CrossEntropyLoss() - optimizer = torch.optim.LBFGS( - [param], - lr=self.lr, - max_iter=self.max_iter, - line_search_fn="strong_wolfe" if self.opt == "lbfgs_line_search" else None, - ) - - def eval(): - optimizer.zero_grad() - y_pred = ( - logits * param[:, None] - if self.use_inv_temp - else logits / param[:, None] - ) - loss = criterion(y_pred, labels) - loss.backward() - return loss - - optimizer.step(eval) - - self.invtemp_ = param.item() if self.use_inv_temp else 1 / param.item() - - def _fit_bisection(self, logits: torch.Tensor, labels: torch.Tensor): - objective_grad = lambda u, l=logits, tar=labels: self._get_loss_grad( - np.exp(u), l, tar - ) - - # should reach about float32 accuracy - # need log_2(32) = 5 steps to get to length 1 and then 24 more steps to get to float32 epsilon (2^{-24}) - self.invtemp_ = np.exp( - bisection_search( - objective_grad, a=-16, b=16, n_steps=self.max_bisection_steps - ) - ) - # print(f'{self.invtemp_=}') - - def predict_proba_torch( - self, y_pred: CategoricalDistribution - ) -> CategoricalDistribution: - return CategoricalLogits(self.invtemp_ * y_pred.get_logits()) - - -### New binary calibrators ### - - -class BinaryLogisticCalibrator(Calibrator): - """ - Binary post-hoc calibration with linear, affine or quadratic scaling - on the binary logits using sklearn's LogisticRegression. - - This class fits a model of the form: - P(y=1) = sigma(calibrated_logit) - - Where `calibrated_logit` depends on the `type`: - - 'linear': calibrated_logit = w * logit (Temperature scaling with inverse temperature) - - 'affine': calibrated_logit = b + w * logit (Platt scaling) - - 'quadratic': calibrated_logit = b + w1 * logit + w2 * logit^2 - """ - - VALID_TYPES = ["linear", "affine", "quadratic", "beta"] - - def __init__(self, type: str = "affine"): - """ - Args: - type (str, optional): The type of scaling. - One of ['linear', 'affine', 'quadratic']. - Defaults to 'affine'. - """ - super().__init__() - - if type not in self.VALID_TYPES: - raise ValueError( - f"Invalid type '{type}'. Must be one of {self.VALID_TYPES}" - ) - self.type = type - self.cal_ = None - - def _get_logits(self, X: np.ndarray) -> np.ndarray: - log_p, log_1_minus_p = binary_probs_to_logits(X) - logits = log_p - log_1_minus_p - - if self.type == "linear": - return logits - - ones = np.ones_like(logits) - - if self.type == "affine": - return np.hstack([ones, logits]) - - elif self.type == "quadratic": - return np.hstack([ones, logits, np.square(logits)]) - - if self.type == "beta": # TODO Check that this is indeed beta - return np.hstack([ones, log_p, log_1_minus_p]) - - def _fit_impl(self, X: np.ndarray, y: np.ndarray) -> None: - """Fits the logistic regression calibrator.""" - from sklearn.linear_model import LogisticRegression - - self.cal_ = LogisticRegression( - penalty=None, - fit_intercept=False, - solver="lbfgs" - ) - - features = self._get_logits(X) - self.cal_.fit(features, y) - - def predict_proba(self, X: np.ndarray) -> np.ndarray: - """Generates calibrated probability estimates.""" - if self.cal_ is None: - raise RuntimeError( - "Calibrator has not been fitted yet. Call fit() before predict_proba()." - ) - - features = self._get_logits(X) - return self.cal_.predict_proba(features) - - -class RegularizedAffineScalingCalibrator(Calibrator): - """ - Binary post-hoc calibration with Platt scaling (~binary logistic regression) $sigmoid(ax+b)$ on the logits x. - Uses the SAGA algorithm to fit the scaling and intercept parameters a & b. - Numba functions are warmed-up the first time the class is initialized (~1s). - A penalty (one of ["mcp", "lasso", "ridge"]) can be applied to the intercept parameter b, with regularization strength lambda_intercept. - To apply no regularization, leave penalty to None. - """ - - _warmed_up = False - - def __init__( - self, - penalty: str = None, - lambda_intercept: float = 0.0, - max_iter: int = 200, - tol: float = 1e-4, - print_init_info: bool = True, - ): - super().__init__() - self.penalty = penalty - self.lambda_intercept = lambda_intercept - self.max_iter = max_iter - self.tol = tol - self.print_init_info = print_init_info - - self.w = None - - if not RegularizedAffineScalingCalibrator._warmed_up: - if self.print_init_info: - print( - "First RegularizedAffineScalingCalibrator instantiation - warming up Numba functions..." - ) - warm_up_affine_scaling() - RegularizedAffineScalingCalibrator._warmed_up = True - if self.print_init_info: - print("Warmed up!") - - def _get_logits(self, X: np.ndarray) -> np.ndarray: - log_p, log_1_minus_p = binary_probs_to_logits(X) - logits = log_p - log_1_minus_p - ones = np.ones_like(logits) - return np.hstack([ones, logits]) - - def _fit_impl(self, X: np.ndarray, y: np.ndarray) -> None: - logits = self._get_logits(X) - self.w = fit_affine_scaling( - logits, - y, - penalty=self.penalty, - reg_intercept=self.lambda_intercept, - max_iter=self.max_iter, - tol=self.tol, - ) - - def predict_proba(self, X: np.ndarray) -> np.ndarray: - if self.w is None: - raise RuntimeError( - "Calibrator has not been fitted yet. Call fit() before predict_proba()." - ) - - logits = self._get_logits(X) - logits = logits.dot(self.w).reshape(-1, 1) - probs = 1.0 / (1.0 + np.exp(-logits)) - return np.hstack([1 - probs, probs]) - - -class RegularizedQuadraticScalingCalibrator(Calibrator): - """ - Binary post-hoc calibration with quadratic scaling $sigmoid(a + bx + cx^2)$ on the logits x. - Uses the SAGA algorithm to fit the intercept, scaling and quadratic parameters a, b & c. - Numba functions are warmed-up the first time the class is initialized (~1s). - A penalty (one of ["mcp", "lasso", "ridge"]) can be applied to the intercept and quadratic parameters a & c, - with respective regularization strength lambda_intercept and lambda_quadratic. - To apply no regularization, leave penalty to None. - """ - - _warmed_up = False - - def __init__( - self, - penalty: str = None, - lambda_intercept: float = 0.0, - lambda_quadratic: float = 0.0, - max_iter: int = 200, - tol: float = 1e-4, - print_init_info: bool = True, - ): - super().__init__() - self.penalty = penalty - self.lambda_intercept = lambda_intercept - self.lambda_quadratic = lambda_quadratic - self.max_iter = max_iter - self.tol = tol - self.print_init_info = print_init_info - - self.w = None - - if not RegularizedQuadraticScalingCalibrator._warmed_up: - if self.print_init_info: - print( - "First RegularizedQuadraticScalingCalibrator instantiation - warming up Numba functions..." - ) - warm_up_quadratic_scaling() - RegularizedQuadraticScalingCalibrator._warmed_up = True - if self.print_init_info: - print("Warmed up!") - - def _get_logits(self, X: np.ndarray) -> np.ndarray: - log_p, log_1_minus_p = binary_probs_to_logits(X) - logits = log_p - log_1_minus_p - ones = np.ones_like(logits) - return np.hstack([ones, logits, np.square(logits)]) - - def _fit_impl(self, X: np.ndarray, y: np.ndarray) -> None: - logits = self._get_logits(X) - self.w = fit_quadratic_scaling( - logits, - y, - penalty=self.penalty, - reg_intercept=self.lambda_intercept, - reg_quadratic=self.lambda_quadratic, - max_iter=self.max_iter, - tol=self.tol, - ) - - def predict_proba(self, X: np.ndarray) -> np.ndarray: - if self.w is None: - raise RuntimeError( - "Calibrator has not been fitted yet. Call fit() before predict_proba()." - ) - - logits = self._get_logits(X) - logits = logits.dot(self.w).reshape(-1, 1) - probs = 1.0 / (1.0 + np.exp(-logits)) - return np.hstack([1 - probs, probs]) - - -################################## - - -### New multiclass calibrators ### - - -class SVSCalibrator(Calibrator): - """ - Multiclass post-hoc calibration with a structured scaling scheme $softmax((aI + diag(v))x + b)$ on the logits x. - - Numba functions are warmed-up the first time the class is initialized with the 'saga' optimizer. - - A penalty (one of [None, "mcp", "lasso", "ridge"]) is applied to the intercept (b) and vector scaling (v) parameters, - with respective regularization strength lambda_intercept*(k**rho)/(n**tau) and lambda_diagonal*(k**rho)/(n**tau). - - Instead of fitting the global scaling parameter jointly with the other parameters, the logits are pre-processed with - temperature scaling (with no regularization) and a is then fixed to a=1. - - [!] solver = 'bfgs' only supports 'ridge' penalty. - [!] For solver = 'bfgs', max_iter and tol are ignored and default torchmin parameters are used. - """ - - _warmed_up_saga = False - _ALLOWED_PENALTIES = [None, "mcp", "lasso", "ridge"] - _ALLOWED_OPTIMIZERS = ["saga", "bfgs"] - - def __init__( - self, - penalty: str = "ridge", - rho: float = 1.0, - tau: float = 1.0, - lambda_intercept: float = 1.0, - lambda_diagonal: float = 1.0, - opt: str = "bfgs", - max_iter: int = 500, - tol: float = 1e-5, - print_init_info: bool = True, - ): - super().__init__() - - if penalty not in self._ALLOWED_PENALTIES: - raise ValueError( - f"Penalty '{penalty}' not recognized. Choose from {self._ALLOWED_PENALTIES}." - ) - - if opt not in self._ALLOWED_OPTIMIZERS: - raise ValueError( - f"Optimizer '{opt}' not recognized. Choose from {self._ALLOWED_OPTIMIZERS}." - ) - - if opt == "bfgs" and penalty != "ridge": - raise ValueError( - "The 'bfgs' optimizer only supports the 'ridge' penalty, use 'saga' instead." - ) - - if opt == "saga" and not SVSCalibrator._warmed_up_saga: - if print_init_info: - logger.info( - "First SVSCalibrator instantiation with 'saga' - warming up Numba functions..." - ) - warm_up_vector_scaling() - SVSCalibrator._warmed_up_saga = True - if print_init_info: - logger.info("Warmed up!") - - self.penalty = penalty - self.rho = rho - self.tau = tau - self.lambda_intercept = lambda_intercept - self.lambda_diagonal = lambda_diagonal - self.max_iter = max_iter - self.tol = tol - self.opt = opt - self.print_init_info = print_init_info - - self.v = None - self.b = None - self.ts = None - - def _get_logits(self, X: np.ndarray) -> tuple[np.ndarray, int, int]: - logits = multiclass_probs_to_logits(X) - return logits - - def _fit_impl(self, X: np.ndarray, y: np.ndarray) -> None: - validate_probabilities(X) - - n, k = X.shape - - self.ts = get_calibrator("ts-mix") - self.ts.fit(X, y) - scaled_probs = self.ts.predict_proba(X) - - reg_intercept = self.lambda_intercept * (k**self.rho) / (n**self.tau) - reg_diagonal = self.lambda_diagonal * (k**self.rho) / (n**self.tau) - - logits = self._get_logits(scaled_probs) - - if self.opt == "saga": - self.v, self.b = fit_vector_scaling( - logits, - y, - penalty=self.penalty, - reg_intercept=reg_intercept, - reg_diagonal=reg_diagonal, - max_iter=self.max_iter, - tol=self.tol, - ) - - elif self.opt == "bfgs": - self.v, self.b = self._fit_bfgs( - torch.tensor(logits), - torch.tensor(y), - num_classes=k, - reg_intercept=reg_intercept, - reg_diagonal=reg_diagonal, - ) - - def _fit_bfgs( - self, - logits: torch.Tensor, - y: torch.Tensor, - num_classes, - reg_intercept, - reg_diagonal, - ): - K = num_classes - - initial_params = torch.zeros(2 * K, device=logits.device, dtype=logits.dtype) - - def loss(params): - v_delta = params[:K] - b = params[K:] - - scaled_logits = logits * (1.0 + v_delta) + b - ce = torch.nn.functional.cross_entropy(scaled_logits, y) - - r_b = reg_intercept * b.pow(2).sum() - r_v = reg_diagonal * v_delta.pow(2).sum() - - return ce + r_b + r_v - - method = "l-bfgs" if initial_params.numel() > 1000 else "bfgs" - res = torchmin.minimize(loss, initial_params, method=method) - - v_delta_final = res.x[:K] - b_final = res.x[K:] - - return ( - 1.0 + v_delta_final - ).detach().cpu().numpy(), b_final.detach().cpu().numpy() - - def predict_proba(self, X: np.ndarray) -> np.ndarray: - validate_probabilities(X) - - if self.v is None: - raise RuntimeError( - "Calibrator has not been fitted yet. Call fit() before predict_proba()." - ) - - scaled_probs = self.ts.predict_proba(X) - logits = self._get_logits(scaled_probs) - calibrated_logits = self.v.reshape(1, -1) * logits + self.b.reshape(1, -1) - calibrated_logits -= np.max(calibrated_logits, axis=1, keepdims=True) - exp_logits = np.exp(calibrated_logits) - return exp_logits / np.sum(exp_logits, axis=1, keepdims=True) - - -class SMSCalibrator(Calibrator): - """ - Multiclass post-hoc calibration with a structured scaling scheme $softmax( (aI + diag(v) + off_diag(M) )x + b)$ on the logits x. - - Numba functions are warmed-up the first time the class is initialized with the 'saga' optimizer. - - A penalty (one of ["mcp", "lasso", "group_lasso", "ridge"]) is applied to the intercept (b), vector scaling (v) and off diagonal (M) parameters, - with respective regularization strength lambda_intercept*(k**rho)/(n**tau), lambda_diagonal*(k**rho)/(n**tau) and lambda_off_diagonal*((k*(k-1))**rho)/(n**tau). - - Instead of fitting the global scaling parameter jointly with the other parameters, the logits are pre-processed with - temperature scaling (with no regularization) and a is then fixed to a=1. - - [!] The 'bfgs' solver only supports 'ridge' penalty. - [!] If 'bfgs' solver is used, max_iter and tol are ignored and default torchmin parameters are used. - """ - - _warmed_up_saga = False - _ALLOWED_PENALTIES = [None, "mcp", "lasso", "group_lasso", "ridge"] - _ALLOWED_OPTIMIZERS = ["saga", "bfgs"] - - def __init__( - self, - penalty: str = "ridge", - rho: float = 1.0, - tau: float = 1.0, - lambda_intercept: float = 1.0, - lambda_diagonal: float = 1.0, - lambda_off_diagonal: float = 1.0, - opt: str = "bfgs", - max_iter: int = 500, - tol: float = 1e-5, - print_init_info: bool = True, - ): - super().__init__() - - if penalty not in self._ALLOWED_PENALTIES: - raise ValueError( - f"Penalty '{penalty}' not recognized. Choose from {self._ALLOWED_PENALTIES}." - ) - - if opt not in self._ALLOWED_OPTIMIZERS: - raise ValueError( - f"Optimizer '{opt}' not recognized. Choose from {self._ALLOWED_OPTIMIZERS}." - ) - - if opt == "bfgs" and penalty != "ridge": - raise ValueError( - "The 'bfgs' optimizer only supports the 'ridge' penalty, use 'saga' instead." - ) - - if opt == "saga" and not SMSCalibrator._warmed_up_saga: - if print_init_info: - logger.info( - "First SMSCalibrator instantiation with 'saga' - warming up Numba functions..." - ) - warm_up_matrix_scaling() - SMSCalibrator._warmed_up_saga = True - if print_init_info: - logger.info("Warmed up!") - - self.penalty = penalty - self.rho = rho - self.tau = tau - self.lambda_intercept = lambda_intercept - self.lambda_diagonal = lambda_diagonal - self.lambda_off_diagonal = lambda_off_diagonal - self.max_iter = max_iter - self.tol = tol - self.opt = opt - self.print_init_info = print_init_info - - self.W = None - self.ts = None - - def _get_logits( - self, X: np.ndarray, append_bias: bool = True - ) -> tuple[np.ndarray, int, int]: - logits = multiclass_probs_to_logits(X) - if append_bias: - # Adding a constant term to logit vectors to fit the intercept. - logits = np.hstack([logits, np.ones((len(logits), 1))]) - return logits - - def _fit_impl(self, X: np.ndarray, y: np.ndarray) -> None: - """Fits the calibration matrix W.""" - validate_probabilities(X) - - n, k = X.shape - - self.ts = get_calibrator("ts-mix") - self.ts.fit(X, y) - scaled_probs = self.ts.predict_proba(X) - - reg_intercept = self.lambda_intercept * (k**self.rho) / (n**self.tau) - reg_diagonal = self.lambda_diagonal * (k**self.rho) / (n**self.tau) - reg_off_diagonal = ( - self.lambda_off_diagonal * ((k * (k - 1)) ** self.rho) / (n**self.tau) - ) - - if self.opt == "saga": - logits = self._get_logits(scaled_probs) - self.W = fit_matrix_scaling( - logits, - y, - penalty=self.penalty, - max_iter=self.max_iter, - tol=self.tol, - init_scaling=1.0, - reg_intercept=reg_intercept, - reg_diagonal=reg_diagonal, - reg_off_diagonal=reg_off_diagonal, - ) - - elif self.opt == "bfgs": - logits = self._get_logits(scaled_probs, append_bias=False) - self.W = self._fit_bfgs( - torch.tensor(logits), - torch.tensor(y), - num_classes=k, - reg_intercept=reg_intercept, - reg_diagonal=reg_diagonal, - reg_off_diagonal=reg_off_diagonal, - ) - - def _fit_bfgs( - self, - logits: torch.Tensor, - y: torch.Tensor, - num_classes, - reg_intercept, - reg_diagonal, - reg_off_diagonal, - ): - K = num_classes - - initial_params = torch.zeros( - K * (K + 1), device=logits.device, dtype=logits.dtype - ) - - def loss(params): - W_delta = params[: K * K].view(K, K) - b = params[K * K :] - - scaled_logits = logits + torch.nn.functional.linear(logits, W_delta, b) - ce = torch.nn.functional.cross_entropy(scaled_logits, y) - - W_diag = W_delta.diagonal() - r_i = reg_intercept * b.pow(2).sum() - r_d = reg_diagonal * W_diag.pow(2).sum() - r_o = reg_off_diagonal * (W_delta.pow(2).sum() - W_diag.pow(2).sum()) - - return ce + r_i + r_d + r_o - - method = "l-bfgs" if initial_params.numel() > 1000 else "bfgs" - res = torchmin.minimize(loss, initial_params, method=method) - - flat_result = res.x - W_delta_final = flat_result[: K * K].view(K, K) - b_final = flat_result[K * K :] - - with torch.no_grad(): - W_final = ( - torch.eye(K, device=logits.device, dtype=logits.dtype) + W_delta_final - ) - return torch.hstack([W_final, b_final.unsqueeze(1)]).detach().cpu().numpy() - - def predict_proba(self, X: np.ndarray) -> np.ndarray: - validate_probabilities(X) - - if self.W is None: - raise RuntimeError( - "Calibrator has not been fitted yet. Call fit() before predict_proba()." - ) - - scaled_probs = self.ts.predict_proba(X) - logits = self._get_logits(scaled_probs) - calibrated_logits = logits @ self.W.T - calibrated_logits -= np.max(calibrated_logits, axis=1, keepdims=True) - exp_logits = np.exp(calibrated_logits) - return exp_logits / np.sum(exp_logits, axis=1, keepdims=True) - - -################################## - - -### Default logistic calibrator, compatible with binary and multiclass ### - - -class LogisticCalibrator(Calibrator): - """ - By default, uses 'quadratic-scaling' for binary classification and 'sms' for multiclass classification. - """ - - VALID_BINARY_TYPES = ["linear", "affine", "quadratic"] - VALID_MULTICLASS_TYPES = ["svs", "sms"] - - def __init__(self, binary_type: str = "quadratic", multiclass_type: str = "sms"): - super().__init__() - - if binary_type not in self.VALID_BINARY_TYPES: - raise ValueError( - f"Invalid binary type '{binary_type}'. Must be one of {self.VALID_BINARY_TYPES}" - ) - self.binary_type = binary_type - - if multiclass_type not in self.VALID_MULTICLASS_TYPES: - raise ValueError( - f"Invalid multiclass type '{multiclass_type}'. Must be one of {self.VALID_MULTICLASS_TYPES}" - ) - self.multiclass_type = multiclass_type - - self.cal_ = None - - def _fit_impl(self, X: np.ndarray, y: np.ndarray) -> None: - n_classes = len(np.unique(y)) - - if n_classes == 2: - if self.binary_type == "affine": - self.cal_ = BinaryLogisticCalibrator(type="affine") - elif self.binary_type == "linear": - self.cal_ = BinaryLogisticCalibrator(type="linear") - elif self.binary_type == "quadratic": - self.cal_ = BinaryLogisticCalibrator(type="quadratic") - - else: - if self.multiclass_type == "svs": - self.cal_ = SVSCalibrator() - elif self.multiclass_type == "sms": - self.cal_ = SMSCalibrator() - - self.cal_.fit(X, y) - return self - - def predict(self, X): - if self.cal_ is None: - raise RuntimeError("LogisticCalibrator must be fitted before prediction.") - return self.cal_.predict(X) - - def predict_proba(self, X): - if self.cal_ is None: - raise RuntimeError("LogisticCalibrator must be fitted before prediction.") - return self.cal_.predict_proba(X) - - -################################## - - -class GuoTemperatureScalingCalibrator(Calibrator): - # adapted from https://github.com/gpleiss/temperature_scaling/blob/master/temperature_scaling.py - def __init__(self): - super().__init__() - self.temperature = nn.Parameter(torch.ones(1) * 1.5) - - def temperature_scale(self, logits): - """ - Perform temperature scaling on logits - """ - # Expand temperature to match the size of logits - temperature = self.temperature.unsqueeze(1).expand( - logits.size(0), logits.size(1) - ) - return logits / temperature - - def _fit_torch_impl( - self, y_pred: CategoricalDistribution, y_true_labels: torch.Tensor - ): - logits = y_pred.get_logits() - labels = y_true_labels - - nll_criterion = nn.CrossEntropyLoss().cuda() - - # Next: optimize the temperature w.r.t. NLL - optimizer = torch.optim.LBFGS([self.temperature], lr=0.01, max_iter=50) - - def eval(): - optimizer.zero_grad() - loss = nll_criterion(self.temperature_scale(logits), labels) - loss.backward() - return loss - - optimizer.step(eval) - - return self - - def predict_proba_torch( - self, y_pred: CategoricalDistribution - ) -> CategoricalDistribution: - with torch.no_grad(): - return CategoricalLogits(y_pred.get_logits() / self.temperature) - - -class AutoGluonTemperatureScalingCalibrator(Calibrator): - # adapted from - # https://github.com/autogluon/autogluon/blob/c1181326cf6b7e3b27a7420273f1a82808d939e2/core/src/autogluon/core/calibrate/temperature_scaling.py#L9 - # https://github.com/autogluon/autogluon/blob/28a242ebe8d55ba770c991b9db153ab4623c9abd/tabular/src/autogluon/tabular/trainer/abstract_trainer.py#L4433-L4457 - def __init__(self, init_val: float = 1, max_iter: int = 200, lr: float = 0.1): - super().__init__() - self.init_val = init_val - self.max_iter = max_iter - self.lr = lr - self.temperature = init_val - - def _fit_torch_impl( - self, y_pred: CategoricalDistribution, y_true_labels: torch.Tensor - ): - y_val_tensor = y_true_labels - temperature_param = torch.nn.Parameter(torch.ones(1).fill_(self.init_val)) - logits = y_pred.get_logits() - - is_invalid = torch.isinf(logits).any().tolist() - if is_invalid: - return - - nll_criterion = torch.nn.CrossEntropyLoss() - optimizer = torch.optim.LBFGS( - [temperature_param], lr=self.lr, max_iter=self.max_iter - ) - scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.99) - - optimizer_trajectory = [] - - if self.init_val != 1.0: - # need to check 1.0 as well since AutoGluon does it outside - optimizer_trajectory.append( - (nll_criterion(logits, y_val_tensor).item(), 1.0) - ) - - def temperature_scale_step(): - optimizer.zero_grad() - temp = temperature_param.unsqueeze(1).expand(logits.size(0), logits.size(1)) - new_logits = logits / temp - loss = nll_criterion(new_logits, y_val_tensor) - loss.backward() - scheduler.step() - optimizer_trajectory.append((loss.item(), temperature_param.item())) - return loss - - optimizer.step(temperature_scale_step) - - try: - best_loss_index = np.nanargmin(np.array(optimizer_trajectory)[:, 0]) - except ValueError: - self.temperature = 1.0 - return - temperature_scale = float(np.array(optimizer_trajectory)[best_loss_index, 1]) - - if np.isnan(temperature_scale) or temperature_scale <= 0.0: - self.temperature = 1.0 - return - - self.temperature = temperature_scale - - def predict_proba_torch( - self, y_pred: CategoricalDistribution - ) -> CategoricalDistribution: - with torch.no_grad(): - return CategoricalLogits(y_pred.get_logits() / self.temperature) - - -class AutoGluonInverseTemperatureScalingCalibrator(Calibrator): - # adapted from - # https://github.com/autogluon/autogluon/blob/c1181326cf6b7e3b27a7420273f1a82808d939e2/core/src/autogluon/core/calibrate/temperature_scaling.py#L9 - # but optimizing the inverse temperature instead - def __init__(self, init_val: float = 1, max_iter: int = 200, lr: float = 0.1): - super().__init__() - self.init_val = init_val - self.max_iter = max_iter - self.lr = lr - self.inv_temp = init_val - - def _fit_torch_impl( - self, y_pred: CategoricalDistribution, y_true_labels: torch.Tensor - ): - y_val_tensor = y_true_labels - inv_temperature_param = torch.nn.Parameter(torch.ones(1).fill_(self.init_val)) - logits = y_pred.get_logits() - - is_invalid = torch.isinf(logits).any().tolist() - if is_invalid: - return - - nll_criterion = torch.nn.CrossEntropyLoss() - optimizer = torch.optim.LBFGS( - [inv_temperature_param], lr=self.lr, max_iter=self.max_iter - ) - scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.99) - - optimizer_trajectory = [] - - def temperature_scale_step(): - optimizer.zero_grad() - inv_temp = inv_temperature_param.unsqueeze(1).expand( - logits.size(0), logits.size(1) - ) - new_logits = logits * inv_temp - loss = nll_criterion(new_logits, y_val_tensor) - loss.backward() - scheduler.step() - optimizer_trajectory.append((loss.item(), inv_temperature_param.item())) - return loss - - optimizer.step(temperature_scale_step) - - try: - best_loss_index = np.nanargmin(np.array(optimizer_trajectory)[:, 0]) - except ValueError: - return - inv_temperature_scale = float( - np.array(optimizer_trajectory)[best_loss_index, 1] - ) - - if np.isnan(inv_temperature_scale): - return - - self.inv_temp = inv_temperature_scale - - def predict_proba_torch( - self, y_pred: CategoricalDistribution - ) -> CategoricalDistribution: - with torch.no_grad(): - return CategoricalLogits(y_pred.get_logits() * self.inv_temp) - - -class TorchUncertaintyTemperatureScalingCalibrator(TemperatureScalingCalibrator): - # adapted from - # https://github.com/ENSTA-U2IS-AI/torch-uncertainty/blob/3a021d2e34e183b8aad3a0345e6d750c08c72af3/torch_uncertainty/post_processing/calibration/scaler.py#L1 - # https://github.com/ENSTA-U2IS-AI/torch-uncertainty/blob/3a021d2e34e183b8aad3a0345e6d750c08c72af3/torch_uncertainty/post_processing/calibration/temperature_scaler.py#L9 - def __init__(self, init_val: float = 1, lr: float = 0.1, max_iter: int = 100): - super().__init__( - opt="lbfgs", - lr=lr, - max_iter=max_iter, - use_inv_temp=False, - inv_temp_init=1.0 / init_val, - ) - # need to save these values here to comply with sklearn cloneability conventions - self.init_val = init_val - self.lr = lr - self.max_iter = max_iter - -class TabPFNCalibrator(Calibrator): - def __init__(self): - super().__init__() - - def _fit_impl(self, X: np.ndarray, y: np.ndarray) -> None: - from tabpfn import TabPFNClassifier - - self.cal_ = TabPFNClassifier() - - probs = process_binary_probs(X) - - self.cal_.fit(probs.reshape(-1,1), y) - - def predict_proba(self, X: np.ndarray) -> np.ndarray: - - probs = process_binary_probs(X) - - result = self.cal_.predict_proba(probs.reshape(-1,1)) - if len(result.shape) == 1: - return np.stack([1.0 - result, result], axis=1) - elif result.shape[1] == 1: - return np.concatenate([1.0 - result, result], axis=1) - - return result - -class BetacalCalibrator(Calibrator): - # From https://github.com/betacal/python/ - def __init__(self, parameters="abm"): - super().__init__() - - from betacal import BetaCalibration - - self.cal_ = BetaCalibration(parameters=parameters) - - def _fit_impl(self, X: np.ndarray, y: np.ndarray) -> None: - probs = process_binary_probs(X) - - self.cal_.fit(probs, y) - - def predict_proba(self, X: np.ndarray) -> np.ndarray: - probs = process_binary_probs(X) - - result = self.cal_.predict(probs) - - if len(result.shape) == 1: - return np.stack([1.0 - result, result], axis=1) - elif result.shape[1] == 1: - return np.concatenate([1.0 - result, result], axis=1) - - return result - -class MLISplineCalibrator(Calibrator): - # From https://github.com/numeristical/introspective - def __init__(self): - super().__init__() - - from ml_insights import SplineCalib - - self.cal_ = SplineCalib() - - def _fit_impl(self, X: np.ndarray, y: np.ndarray) -> None: - probs = process_binary_probs(X) - - self.cal_.fit(probs, y) - - def predict_proba(self, X: np.ndarray) -> np.ndarray: - probs = process_binary_probs(X) - - result = self.cal_.calibrate(probs) - - if len(result.shape) == 1: - return np.stack([1.0 - result, result], axis=1) - elif result.shape[1] == 1: - return np.concatenate([1.0 - result, result], axis=1) - - return result - -class NetcalTemperatureScalingCalibrator(Calibrator): - # this one does nothing due to https://github.com/EFS-OpenSource/calibration-framework/issues/61 - def __init__(self): - super().__init__() - from netcal.scaling import TemperatureScaling - - self.cal_ = TemperatureScaling() - - def _fit_impl(self, X: np.ndarray, y: np.ndarray) -> None: - if X.shape[1] == 2: - # binary, convert - X = X[:, 1] - self.cal_.fit(X, y, random_state=0) - - def predict_proba(self, X: np.ndarray) -> np.ndarray: - if X.shape[1] == 2: - # binary, convert - X = X[:, 1] - result = self.cal_.transform(X) - if len(result.shape) == 1: - return np.stack([1.0 - result, result], axis=1) - elif result.shape[1] == 1: - return np.concatenate([1.0 - result, result], axis=1) - - return result - - -class TorchcalTemperatureScalingCalibrator(Calibrator): - # adapted from - # https://github.com/rishabh-ranjan/torchcal/blob/3fb65f6423d33d680cd68c7f40a0259d41e8fb0b/torchcal.py#L8 - def __init__(self): - super().__init__() - self.temperature = 1.0 - - def _fit_torch_impl( - self, y_pred: CategoricalDistribution, y_true_labels: torch.Tensor - ): - y_pred_logits = y_pred.get_logits() - - temp = torch.ones(1, device=y_pred_logits.device) - - def loss(t): - return torch.nn.functional.cross_entropy(y_pred_logits / t, y_true_labels) - - res = torchmin.minimize(loss, temp, method="newton-exact") - if not res.success: - warnings.warn( - f"{self.__class__}: {res.message} Not updating calibrator params." - ) - else: - temp = res.x - - self.temperature = temp.item() - - def predict_proba_torch( - self, y_pred: CategoricalDistribution - ) -> CategoricalDistribution: - return CategoricalLogits(y_pred.get_logits() / self.temperature) - - -class TorchcalVectorScalingCalibrator(Calibrator): - # adapted from - # https://github.com/rishabh-ranjan/torchcal/blob/3fb65f6423d33d680cd68c7f40a0259d41e8fb0b/torchcal.py#L8 - def __init__(self): - super().__init__() - - def _fit_torch_impl( - self, y_pred: CategoricalDistribution, y_true_labels: torch.Tensor - ): - y_pred_logits = y_pred.get_logits() - - num_classes = y_pred_logits.shape[1] - - self.temp = torch.ones(num_classes, device=y_pred_logits.device) - self.bias = torch.zeros(num_classes, device=y_pred_logits.device) - - def loss(temp_bias): - temp, bias = temp_bias.split([num_classes, num_classes]) - return torch.nn.functional.cross_entropy( - y_pred_logits / temp + bias, y_true_labels - ) - - temp_bias = torch.cat([self.temp, self.bias]) - res = torchmin.minimize(loss, temp_bias, method="bfgs") - if not res.success: - warnings.warn( - f"{self.__class__}: {res.message} Not updating calibrator params." - ) - else: - self.temp, self.bias = res.x.split([num_classes, num_classes]) - - def predict_proba_torch( - self, y_pred: CategoricalDistribution - ) -> CategoricalDistribution: - return CategoricalLogits(y_pred.get_logits() / self.temp + self.bias) - - -class TorchcalMatrixScalingCalibrator(Calibrator): - def __init__(self): - super().__init__() - - def _fit_torch_impl( - self, y_pred: CategoricalDistribution, y_true_labels: torch.Tensor - ): - y_pred_logits = y_pred.get_logits() - - num_classes = y_pred_logits.shape[1] - - num_params = num_classes * (num_classes + 1) - if num_params <= 1000: - method = "bfgs" - else: - method = "l-bfgs" - - self.itemp = torch.eye( - num_classes, - num_classes, - device=y_pred_logits.device, - dtype=y_pred_logits.dtype, - ) - self.bias = torch.zeros( - num_classes, device=y_pred_logits.device, dtype=y_pred_logits.dtype - ) - - def loss(itemp_bias): - itemp, bias = itemp_bias.split([num_classes**2, num_classes]) - itemp = itemp.view(num_classes, num_classes) - return torch.nn.functional.cross_entropy( - y_pred_logits @ itemp + bias, y_true_labels - ) - - itemp_bias = torch.cat([self.itemp.view(-1), self.bias]) - res = torchmin.minimize(loss, itemp_bias, method=method) - if not res.success: - warnings.warn( - f"{self.__class__}: {res.message} Not updating calibrator params." - ) - else: - self.itemp, self.bias = res.x.split([num_classes**2, num_classes]) - self.itemp = self.itemp.view(num_classes, num_classes) - - def predict_proba_torch( - self, y_pred: CategoricalDistribution - ) -> CategoricalDistribution: - return CategoricalLogits(y_pred.get_logits() @ self.itemp + self.bias) - - -class CenteredIsotonicRegressionCalibrator(Calibrator): - def __init__(self): - super().__init__() - - from cir_model import CenteredIsotonicRegression - - self.cal_ = CenteredIsotonicRegression() - - def _fit_impl(self, X: np.ndarray, y: np.ndarray) -> None: - # TODO - # we have to use float64 since with float32 it can happen that somewhere internally - # a rounding error occurs (?) and interp1d thinks that it is asked to extrapolate at the boundary value, - # which causes it to output nan values, - # which are seen as a separate possible value but (y==value) is empty because nan==nan is false, - # so an error occurs because the method attempts to take an average of an empty set - - probs = process_binary_probs(X) - - probs = probs.astype(np.float64) - y = y.astype(np.float64) - - self.cal_.fit(probs, y) - self.min_ = np.min(probs) - self.max_ = np.max(probs) - - def predict_proba(self, X): - probs = process_binary_probs(X) - probs = probs.astype(np.float64) - - # have to clip since CenteredIsotonicRegression refuses to extrapolate (?) - probs = np.clip(probs, self.min_, self.max_) - - result = self.cal_.transform(probs) - result = np.clip(result, 0.0, 1.0) - return np.stack([1.0 - result, result], axis=-1) - - -class BinaryVennAbersCalibrator(Calibrator): - def __init__(self): - super().__init__() - - from venn_abers import VennAbers - - self.cal_ = VennAbers() - - def _fit_impl(self, X: np.ndarray, y: np.ndarray) -> None: - self.cal_.fit(X, y) - - def predict_proba(self, X): - return self.cal_.predict_proba(X)[0] - - -class VennAbersCalibrator(Calibrator): - def __init__(self, use_ovo: bool = False): - super().__init__() - self.use_ovo = use_ovo - - def _fit_impl(self, X: np.ndarray, y: np.ndarray) -> None: - self.X_ = np.copy(X) - self.y_ = np.copy(y) - - def predict_proba(self, X): - from venn_abers import VennAbersCalibrator - - va = VennAbersCalibrator() - return va.predict_proba( - p_cal=self.X_, - y_cal=self.y_, - p_test=X, - p0_p1_output=False, - va_type="one_vs_one" if self.use_ovo else "one_vs_all", - ) - - -class MulticlassOneVsOneCalibrator(Calibrator): - def __init__(self, binary_calibrator: BaseEstimator): - super().__init__() - self.binary_calibrator = binary_calibrator - - def _fit_impl(self, X: np.ndarray, y: np.ndarray) -> None: - self.n_classes_ = X.shape[-1] - self.bin_calibs_ = [] - - if self.n_classes_ == 2: - # binary classification - bin_calib = sklearn.base.clone(self.binary_calibrator) - bin_calib.fit(X, y) - self.bin_calibs_.append(bin_calib) - else: - for i in range(self.n_classes_): - for j in range(i + 1, self.n_classes_): - idxs = np.logical_or((y == i), (y == j)) - # idxs = np.arange(X.shape[0]) - bin_probs = np.stack([X[idxs, j], X[idxs, i]], axis=-1) - bin_probs += 1e-30 - bin_probs /= np.sum(bin_probs, axis=-1, keepdims=True) - # print(f'{np.any(np.isnan(bin_probs))=}') - bin_labels = (y[idxs] == i).astype(np.int32) - bin_calib = sklearn.base.clone(self.binary_calibrator) - bin_calib.fit(bin_probs, bin_labels) - self.bin_calibs_.append(bin_calib) - - def predict_proba(self, X): - if self.n_classes_ == 2: - # binary classification - return self.bin_calibs_[0].predict_proba(X) - else: - # use PKPD formula in http://proceedings.mlr.press/v60/manokhin17a/manokhin17a.pdf - pair_probs = [[None] * self.n_classes_ for i in range(self.n_classes_)] - multi_probs = [] - calib_idx = 0 - for i in range(self.n_classes_): - for j in range(i + 1, self.n_classes_): - bin_probs = np.stack([X[:, j], X[:, i]], axis=-1) - bin_probs += 1e-30 - bin_probs /= np.sum(bin_probs, axis=-1, keepdims=True) - pred_probs = self.bin_calibs_[calib_idx].predict_proba(bin_probs) - pair_probs[i][j] = pred_probs[:, 1] - pair_probs[j][i] = pred_probs[:, 0] - # if i == 0 and j == 1: - # print(f'{bin_probs=}') - # print(f'{pred_probs=}') - calib_idx += 1 - - for i in range(self.n_classes_): - sum_inv_probs = sum( - [ - 1.0 / (1e-30 + pair_probs[i][j]) - for j in range(self.n_classes_) - if j != i - ] - ) - multi_probs.append( - 1.0 / np.clip(sum_inv_probs - (self.n_classes_ - 2), 1e-30, np.inf) - ) - - multi_probs = np.stack(multi_probs, axis=-1) - multi_probs = np.clip(multi_probs, a_min=1e-30, a_max=np.inf) - multi_probs = multi_probs / np.sum(multi_probs, axis=-1, keepdims=True) - return multi_probs - - -class MulticlassOneVsRestCalibrator(Calibrator): - def __init__(self, binary_calibrator: BaseEstimator): - super().__init__() - self.binary_calibrator = binary_calibrator - - def _fit_impl(self, X: np.ndarray, y: np.ndarray) -> None: - self.n_classes_ = X.shape[-1] - self.bin_calibs_ = [] - - X[np.isnan(X)] = 0.0 - - if self.n_classes_ == 2: - # binary classification - bin_calib = sklearn.base.clone(self.binary_calibrator) - bin_calib.fit(X, y) - self.bin_calibs_.append(bin_calib) - else: - for i in range(self.n_classes_): - pos_probs = X[:, i] - neg_probs = 1.0 - pos_probs - bin_probs = np.stack([neg_probs, pos_probs], axis=-1) - bin_labels = (y == i).astype(np.int32) - bin_calib = sklearn.base.clone(self.binary_calibrator) - bin_calib.fit(bin_probs, bin_labels) - self.bin_calibs_.append(bin_calib) - - def predict_proba(self, X): - X[np.isnan(X)] = 0.0 - if self.n_classes_ == 2: - # binary classification - return self.bin_calibs_[0].predict_proba(X) - else: - multi_probs = [] - for i in range(self.n_classes_): - pos_probs = X[:, i] - neg_probs = 1.0 - pos_probs - bin_probs = np.stack([neg_probs, pos_probs], axis=-1) - pred_probs = self.bin_calibs_[i].predict_proba(bin_probs) - multi_probs.append(pred_probs[:, 1]) - multi_probs = np.stack(multi_probs, axis=-1) - multi_probs = np.clip(multi_probs, a_min=1e-30, a_max=np.inf) - multi_probs = multi_probs / np.sum(multi_probs, axis=-1, keepdims=True) - return multi_probs - - -class IdentityEstimator(ClassifierMixin, BaseEstimator): - def __init__(self, n_classes: int): - self.n_classes = n_classes - # somehow it can fail if we don't do the np.asarray() - self.classes_ = np.asarray(list(range(n_classes))) - - def fit(self, X, y): - # self.classes_ = np.unique(y) # if we do this we have a problem for missing classes - return self - - def predict_proba(self, X): - return X - - def predict(self, X): - # having this as a dummy here for the FrozenEstimator solution - # which hovewer doesn't work right now - raise NotImplementedError() - - -class PreprocessingCalibratorWrapper(Calibrator): - def __init__( - self, - calibrator: Calibrator, - pre_calib: Optional[Calibrator] = None, - l2_normalize: bool = False, - ): - super().__init__() - self.calibrator = calibrator - self.pre_calib = pre_calib - self.l2_normalize = l2_normalize - - # options: robust scaling - # pre-TS - - def _fit_torch_impl( - self, y_pred: CategoricalDistribution, y_true_labels: torch.Tensor - ) -> "Calibrator": - if self.pre_calib: - self.pre_calib_: Calibrator = sklearn.base.clone(self.pre_calib) - self.pre_calib_.fit_torch(y_pred, y_true_labels) - y_pred = self.pre_calib_.predict_proba_torch(y_pred) - if self.l2_normalize: - logits = y_pred.get_logits() - logits = logits - logits.mean(dim=-1, keepdim=True) - self.factor_ = 1.0 / (logits.square().mean().sqrt().item() + 1e-8) - logits = self.factor_ * logits - y_pred = CategoricalLogits(logits) - - self.calib_: Calibrator = sklearn.base.clone(self.calibrator) - self.calib_.fit_torch(y_pred, y_true_labels) - - def predict_proba_torch( - self, y_pred: CategoricalDistribution - ) -> CategoricalDistribution: - if self.pre_calib: - y_pred = self.pre_calib_.predict_proba_torch(y_pred) - if self.l2_normalize: - logits = y_pred.get_logits() - logits = logits - logits.mean(dim=-1, keepdim=True) - logits = self.factor_ * logits - y_pred = CategoricalLogits(logits) - - return self.calib_.predict_proba_torch(y_pred) - - -class SklearnCalibrator(Calibrator): - def __init__(self, method: str, cv: str = "prefit"): - assert cv == 'prefit', "Other methods than prefit not intended and not implemented for sklearn > 1.6" - super().__init__() - self.method = method - self.cv = cv - - @staticmethod - def _normalize_rows(A: np.ndarray) -> np.ndarray: - """Row-normalize so rows sum to 1; safe if a row sums to 0.""" - s = A.sum(axis=1, keepdims=True) - s = np.where(s == 0.0, 1.0, s) - return A / s - - def _fit_impl(self, X: np.ndarray, y: np.ndarray) -> None: - if X.ndim == 1: - X = np.stack((1.0 - X, X), axis=1) - - y = np.asarray(y) - - self.n_classes_ = X.shape[-1] - self.classes_ = np.arange(self.n_classes_) - - # Fit label encoder up-front (used for the missing-class path) - self.label_encoder_ = LabelEncoder() - y_mapped = self.label_encoder_.fit_transform(y) - self.present_classes_ = self.label_encoder_.classes_ - self.missing_classes_ = np.setdiff1d(self.classes_, self.present_classes_) - - # ---------- Case 1: no missing classes -> original formulation ---------- - if self.missing_classes_.size == 0: - est = IdentityEstimator(self.n_classes_) - est.fit(X, y) - - try: - # sklearn >= 1.6 - from sklearn.frozen import FrozenEstimator - self.calib_ = CalibratedClassifierCV(FrozenEstimator(est), method=self.method) - except ImportError: - # sklearn < 1.6 - self.calib_ = CalibratedClassifierCV(est, method=self.method, cv=self.cv) - - self.calib_.fit(X, y) - return - - # ---------- Case 2: missing classes -> slice + normalize X_present, use y_mapped ---------- - X_present = self._normalize_rows(X[:, self.present_classes_]) - - est = IdentityEstimator(len(self.present_classes_)) - est.fit(X_present, y_mapped) - - try: - from sklearn.frozen import FrozenEstimator - self.calib_ = CalibratedClassifierCV(FrozenEstimator(est), method=self.method) - except ImportError: - self.calib_ = CalibratedClassifierCV(est, method=self.method, cv=self.cv) - - self.calib_.fit(X_present, y_mapped) - - def predict_proba(self, X: np.ndarray) -> np.ndarray: - if X.ndim == 1: - X = np.stack((1.0 - X, X), axis=1) - - # If fit saw all classes, keep original behavior - if self.missing_classes_.size == 0: - return self.calib_.predict_proba(X) - - # Missing classes: normalize the same sliced X_present and zero-fill missing classes (Option A) - X_present = self._normalize_rows(X[:, self.present_classes_]) - p_present = self.calib_.predict_proba(X_present) - - p_full = np.zeros((X.shape[0], self.n_classes_), dtype=p_present.dtype) - p_full[:, self.present_classes_] = p_present - return p_full - - -class WrapCalibratorAsClassifier(BaseEstimator, ClassifierMixin): - def __init__(self, calib: BaseEstimator): - self.calib = calib - - def fit(self, X, y): - # print(f'Fitting calibrator: {self.calib}') - self.calib_ = sklearn.base.clone(self.calib) - self.calib_.fit(X, y) - self.classes_ = list(range(np.max(y) + 1)) - return self - - def predict_proba(self, X): - return self.calib_.predict_proba(X) - - -def logloss_np(y_true: np.ndarray, y_proba: np.ndarray): - return -np.mean(np.take_along_axis(np.log(y_proba), y_true[:, None], axis=1)) - - -class DirichletCalibrator(Calibrator): - def __init__( - self, - n_cv: int = 0, - use_odir: bool = False, - reg_lambda: float = 0.0, - reg_mu: Optional[float] = None, - reg_lambda_grid: Optional[List[float]] = None, - reg_mu_grid: Optional[List[float]] = None, - ): - super().__init__() - - self.n_cv = n_cv - self.use_odir = use_odir - self.reg_lambda = reg_lambda - self.reg_mu = reg_mu - self.reg_lambda_grid = reg_lambda_grid - self.reg_mu_grid = reg_mu_grid - - def _fit_impl(self, X: np.ndarray, y: np.ndarray) -> None: - from dirichletcal.calib.fulldirichlet import FullDirichletCalibrator - - if self.n_cv == 0: - self.cal_ = FullDirichletCalibrator( - reg_lambda=self.reg_lambda, reg_mu=self.reg_mu - ) - elif self.n_cv >= 2: - reg_lambda_grid = self.reg_lambda_grid or [1e-1, 1e-2, 1e-3, 1e-4, 1e-5] - reg_mu_grid = self.reg_mu_grid or [1e-1, 1e-2, 1e-3, 1e-4, 1e-5] - calibrator = FullDirichletCalibrator( - reg_lambda=reg_lambda_grid, - reg_mu=reg_mu_grid if self.use_odir else None, - ) - calibrator = WrapCalibratorAsClassifier(calibrator) - skf = StratifiedKFold(n_splits=self.n_cv, shuffle=True, random_state=0) - self.cal_ = GridSearchCV( - calibrator, - param_grid={ - "calib__reg_lambda": reg_lambda_grid, - "calib__reg_mu": reg_mu_grid if self.use_odir else [None], - }, - cv=skf, - # use this because scikit-learn's logloss scorer fails in case of missing classes - scoring=lambda est, X_test, y_test: -logloss_np( - y_test, est.predict_proba(X_test) - ), - ) - else: - raise ValueError(f"n_cv must be either 0 or >=2, but is {self.n_cv}") - - self.cal_.fit(X, y) - - def predict_proba(self, X: np.ndarray) -> np.ndarray: - return self.cal_.predict_proba(X) - - -class MixtureCalibrator(Calibrator): - def __init__( - self, - calibrator: Calibrator, - output_constant: float = 1.0, - input_constant: float = 0.0, - ): - super().__init__() - self.calibrator = calibrator - self.output_constant = output_constant - self.input_constant = input_constant - - def _get_mixture(self, dist_1: CategoricalDistribution, unif_coef: float): - probs = dist_1.get_probs() - unif_probs = (1.0 / probs.shape[-1]) * torch.ones_like(probs) - return CategoricalProbs((1.0 - unif_coef) * probs + unif_coef * unif_probs) - - def _fit_torch_impl( - self, y_pred: CategoricalDistribution, y_true_labels: torch.Tensor - ): - self.n_samples_ = y_pred.get_n_samples() - if self.input_constant != 0.0: - y_pred = self._get_mixture( - y_pred, self.input_constant / (self.n_samples_ + 1) - ) - self.cal_ = sklearn.base.clone(self.calibrator) - self.cal_.fit_torch(y_pred, y_true_labels) - - def predict_proba_torch( - self, y_pred: CategoricalDistribution - ) -> CategoricalDistribution: - if self.input_constant != 0.0: - y_pred = self._get_mixture( - y_pred, self.input_constant / (self.n_samples_ + 1) - ) - y_pred = self.cal_.predict_proba_torch(y_pred) - if self.output_constant != 0.0: - y_pred = self._get_mixture( - y_pred, self.output_constant / (self.n_samples_ + 1) - ) - - return y_pred - - -def get_calibrator( - calibration_method: str, - calibrate_with_mixture: bool = False, - cal_mixture_output_constant: float = 1.0, - cal_mixture_input_constant: float = 0.0, - **config, -) -> Calibrator: - if calibration_method == "platt": - cal = SklearnCalibrator(method="sigmoid", cv="prefit") - elif calibration_method == "platt-logits": - cal = ApplyToLogitsCalibrator(SklearnCalibrator(method="sigmoid", cv="prefit")) - elif calibration_method == "isotonic": - cal = SklearnCalibrator(method="isotonic", cv="prefit") - elif calibration_method == "ivap": - cal = VennAbersCalibrator(use_ovo=config.get("va_use_ovo", False)) - elif calibration_method == "ivap-ovr": - cal = MulticlassOneVsRestCalibrator(BinaryVennAbersCalibrator()) - elif calibration_method == "ivap-ovo": - cal = MulticlassOneVsOneCalibrator(BinaryVennAbersCalibrator()) - elif calibration_method == "isotonic-naive-ovr": - # should be the same as 'isotonic', this is just to check - cal = MulticlassOneVsRestCalibrator( - SklearnCalibrator(method="isotonic", cv="prefit") - ) - elif calibration_method == "cir": - cal = MulticlassOneVsRestCalibrator(CenteredIsotonicRegressionCalibrator()) - elif calibration_method in ["temp-scaling", "ts-mix"]: - cal = TemperatureScalingCalibrator( - opt=config.get("ts_opt", "bisection"), - max_bisection_steps=config.get("ts_max_bisection_steps", 30), - lr=config.get("ts_lr", 0.1), - max_iter=config.get("ts_max_iter", 200), - use_inv_temp=config.get("ts_use_inv_temp", True), - inv_temp_init=config.get("ts_inv_temp_init", 1 / 1.5), - ) - elif calibration_method == "linear-scaling": - cal = BinaryLogisticCalibrator(type="linear") - elif calibration_method == "affine-scaling": - cal = BinaryLogisticCalibrator(type="affine") - elif calibration_method == "quadratic-scaling": - cal = BinaryLogisticCalibrator(type="quadratic") - elif calibration_method == "beta": - cal = BetacalCalibrator( - parameters=config.get("beta_parameters", "abm") - ) - elif calibration_method == "svs": - cal = SVSCalibrator( - penalty=config.get("svs_penalty", "ridge"), - rho=config.get("svs_rho", 1.0), - tau=config.get("svs_tau", 1.0), - lambda_intercept=config.get("svs_lambda_intercept", 1.0), - lambda_diagonal=config.get("svs_lambda_diagonal", 1.0), - opt=config.get("svs_opt", "bfgs"), - max_iter=config.get("svs_max_iter", 500), - tol=config.get("svs_tol", 1e-5), - print_init_info=config.get("svs_print_init_info", True), - ) - elif calibration_method == "sms": - cal = SMSCalibrator( - penalty=config.get("sms_penalty", "ridge"), - rho=config.get("sms_rho", 1.0), - tau=config.get("sms_tau", 1.0), - lambda_intercept=config.get("sms_lambda_intercept", 1.0), - lambda_diagonal=config.get("sms_lambda_diagonal", 1.0), - lambda_off_diagonal=config.get("sms_lambda_off_diagonal", 1.0), - opt=config.get("sms_opt", "bfgs"), - max_iter=config.get("sms_max_iter", 500), - tol=config.get("sms_tol", 1e-5), - print_init_info=config.get("sms_print_init_info", True), - ) - elif calibration_method in ["logistic", "logistic-mix"]: - cal = LogisticCalibrator( - binary_type=config.get("logistic_binary_type", "affine"), - multiclass_type=config.get("logistic_multiclass_type", "sms"), - ) - elif calibration_method == "torchunc-ts": - cal = TorchUncertaintyTemperatureScalingCalibrator( - init_val=config.get("ts_init_val", 1), - lr=config.get("ts_lr", 0.1), - max_iter=config.get("ts_max_iter", 100), - ) - elif calibration_method == "guo-ts": - cal = GuoTemperatureScalingCalibrator() - elif calibration_method == "torchcal-ts": - cal = TorchcalTemperatureScalingCalibrator() - elif calibration_method == "autogluon-ts": - cal = AutoGluonTemperatureScalingCalibrator( - init_val=config.get("ts_init_val", 1), - max_iter=config.get("ts_max_iter", 200), - lr=config.get("ts_lr", 0.1), - ) - elif calibration_method == "autogluon-inv-ts": - cal = AutoGluonInverseTemperatureScalingCalibrator( - init_val=config.get("ts_init_val", 1), - max_iter=config.get("ts_max_iter", 200), - lr=config.get("ts_lr", 0.1), - ) - elif calibration_method == "dircal": - cal = DirichletCalibrator( - n_cv=0, - reg_lambda=config.get("dircal_reg_lambda", 0.0), - reg_mu=config.get("dircal_reg_mu", None), - ) - elif calibration_method == "dircal-cv": - cal = DirichletCalibrator( - n_cv=config.get("dircal_n_cv", 5), - use_odir=config.get("dircal_use_odir", False), - reg_lambda_grid=config.get("dircal_reg_lambda_grid", None), - reg_mu_grid=config.get("dircal_reg_mu_grid", None), - ) - else: - raise ValueError(f'Unknown calibration method "{calibration_method}"') - - if calibrate_with_mixture or calibration_method.endswith("-mix"): - cal = MixtureCalibrator( - cal, - output_constant=cal_mixture_output_constant, - input_constant=cal_mixture_input_constant, - ) - - return cal diff --git a/probmetrics/calibrators/__init__.py b/probmetrics/calibrators/__init__.py new file mode 100644 index 0000000..00fbe3d --- /dev/null +++ b/probmetrics/calibrators/__init__.py @@ -0,0 +1,110 @@ +# probmetrics/calibrators/__init__.py + +from .base import ( + Calibrator, + ApplyToLogitsCalibrator, + WrapCalibratorAsClassifier, + MulticlassOneVsOneCalibrator, + MulticlassOneVsRestCalibrator, + MixtureCalibrator, +) + +from .logistic import ( + VectorScalingCalibrator, + SVSCalibrator, + MatrixScalingCalibrator, + SMSCalibrator, + DirichletCalibrator, + LogisticCalibrator, + BinaryLogisticCalibrator, + RegularizedAffineScalingCalibrator, + RegularizedQuadraticScalingCalibrator, + BetacalCalibrator, +) + +from .temperature_scaling import ( + TemperatureScalingCalibrator, + GuoTemperatureScalingCalibrator, + AutoGluonTemperatureScalingCalibrator, + AutoGluonInverseTemperatureScalingCalibrator, + TorchUncertaintyTemperatureScalingCalibrator, + TorchcalTemperatureScalingCalibrator, + NetcalTemperatureScalingCalibrator, + ETSCalibrator, + SoftPlusCalibrator, +) + +from .nonparametric import ( + NetcalENIRCalibrator, + CenteredIsotonicRegressionCalibrator, + BinaryVennAbersCalibrator, + VennAbersCalibrator, + KernelCalibrator, +) + +from .binning import ( + BinaryHistogramBinningCalibrator, + BinaryPlattBinnerCalibrator, + NetcalBBQCalibrator, +) + +from .splines import ( + MLISplineCalibrator, + CDFSplineCalibrator, +) + +from .sklearn import ( + SklearnCalibrator, +) + +from .trees import ( + CatBoostCalibrator, + LightGBMCalibrator, + XGBoostCalibrator, + BinaryCatBoostCalibrator, +) + +from .factory import get_calibrator + + +__all__ = [ + "Calibrator", + "MulticlassOneVsOneCalibrator", + "MulticlassOneVsRestCalibrator", + "MixtureCalibrator", + "get_calibrator", + "VectorScalingCalibrator", + "SVSCalibrator", + "MatrixScalingCalibrator", + "SMSCalibrator", + "DirichletCalibrator", + "LogisticCalibrator", + "BinaryLogisticCalibrator", + "RegularizedAffineScalingCalibrator", + "RegularizedQuadraticScalingCalibrator", + "BetacalCalibrator", + "TemperatureScalingCalibrator", + "GuoTemperatureScalingCalibrator", + "AutoGluonTemperatureScalingCalibrator", + "AutoGluonInverseTemperatureScalingCalibrator", + "TorchUncertaintyTemperatureScalingCalibrator", + "TorchcalTemperatureScalingCalibrator", + "NetcalTemperatureScalingCalibrator", + "ETSCalibrator", + "SoftPlusCalibrator", + "NetcalENIRCalibrator", + "CenteredIsotonicRegressionCalibrator", + "BinaryVennAbersCalibrator", + "VennAbersCalibrator", + "KernelCalibrator", + "BinaryHistogramBinningCalibrator", + "BinaryPlattBinnerCalibrator", + "NetcalBBQCalibrator", + "MLISplineCalibrator", + "CDFSplineCalibrator", + "SklearnCalibrator", + "CatBoostCalibrator", + "LightGBMCalibrator", + "XGBoostCalibrator", + "BinaryCatBoostCalibrator", +] diff --git a/probmetrics/calibrators/base.py b/probmetrics/calibrators/base.py new file mode 100644 index 0000000..2ac6645 --- /dev/null +++ b/probmetrics/calibrators/base.py @@ -0,0 +1,357 @@ +import torch +import sklearn +import numpy as np + +from probmetrics.distributions import ( + CategoricalDistribution, + CategoricalProbs, +) +from probmetrics.utils import validate_probabilities + +from sklearn.base import BaseEstimator, ClassifierMixin +from sklearn.utils.validation import check_is_fitted + + +class Calibrator(BaseEstimator, ClassifierMixin): + """ + Calibrator base class. To implement, + - override at least one of (_fit_impl, _fit_torch_impl) + - override at least one of (_predict_proba_impl, _predict_proba_torch_impl) + """ + + def __init__(self): + assert self.__class__.fit == Calibrator.fit + assert self.__class__.fit_torch == Calibrator.fit_torch + assert self.__class__.predict_proba == Calibrator.predict_proba + assert self.__class__.predict_proba_torch == Calibrator.predict_proba_torch + + def fit(self, X: np.ndarray, y: np.ndarray) -> "Calibrator": + """Maps to _fit_impl if implemented, falls back on _fit_torch_impl otherwise. + """ + if not isinstance(X, np.ndarray): + raise TypeError(f"X must be a numpy array, got {type(X).__name__}") + if not isinstance(y, np.ndarray): + raise TypeError(f"y must be a numpy array, got {type(y).__name__}") + + validate_probabilities(X) + + self.classes_ = list(range(X.shape[-1])) + + # If subclass implemented the numpy version, use it + if self.__class__._fit_impl != Calibrator._fit_impl: + self._fit_impl(X, y) + return self + + # Fallback to torch routing if numpy is not implemented + if self.__class__._fit_torch_impl != Calibrator._fit_torch_impl: + self._fit_torch_impl( + y_pred=CategoricalProbs(torch.as_tensor(X)), + y_true_labels=torch.as_tensor(y, dtype=torch.long), + ) + return self + + raise NotImplementedError( + "Subclasses must implement _fit_impl or _fit_torch_impl." + ) + + def fit_torch( + self, y_pred: CategoricalDistribution, y_true_labels: torch.Tensor + ) -> "Calibrator": + """Maps to _fit_torch_impl if implemented, falls back on _fit_impl otherwise. + """ + if not isinstance(y_true_labels, torch.Tensor): + raise TypeError( + f"y_true_labels must be a torch.Tensor, got {type(y_true_labels).__name__}" + ) + if not isinstance(y_pred, CategoricalDistribution): + raise TypeError( + f"y_pred must be a CategoricalDistribution, got {type(y_pred).__name__}" + ) + # default implementation, using sklearn + self.classes_ = list(range(y_pred.get_n_classes())) + + # If subclass implemented the torch version, use it + if self.__class__._fit_torch_impl != Calibrator._fit_torch_impl: + self._fit_torch_impl(y_pred, y_true_labels) + return self + + # Fallback to numpy routing if torch is not implemented + if self.__class__._fit_impl != Calibrator._fit_impl: + self._fit_impl( + y_pred.get_probs().detach().cpu().numpy(), + y_true_labels.detach().cpu().numpy(), + ) + return self + + raise NotImplementedError( + "Subclasses must implement _fit_impl or _fit_torch_impl." + ) + + def _fit_impl(self, X: np.ndarray, y: np.ndarray) -> None: + raise NotImplementedError() + + def _fit_torch_impl( + self, y_pred: CategoricalDistribution, y_true_labels: torch.Tensor + ) -> None: + raise NotImplementedError() + + def predict_proba(self, X: np.ndarray) -> np.ndarray: + """Maps to _predict_proba_impl if implemented, falls back on + _predict_proba_torch_impl otherwise. + """ + check_is_fitted(self) + if not isinstance(X, np.ndarray): + raise TypeError(f"X must be a numpy array, got {type(X).__name__}") + validate_probabilities(X) + + # If subclass implemented the numpy version, use it + if self.__class__._predict_proba_impl != Calibrator._predict_proba_impl: + return self._predict_proba_impl(X) + + # Fallback to torch routing if numpy is not implemented + if ( + self.__class__._predict_proba_torch_impl + != Calibrator._predict_proba_torch_impl + ): + return ( + self._predict_proba_torch_impl(CategoricalProbs(torch.as_tensor(X))) + .get_probs() + .numpy() + ) + + raise NotImplementedError( + "Subclasses must implement _predict_proba_impl or _predict_proba_torch_impl." + ) + + def predict_proba_torch( + self, y_pred: CategoricalDistribution + ) -> CategoricalDistribution: + """Maps to _predict_proba_torch_impl if implemented, falls back on + _predict_proba_impl otherwise. + """ + check_is_fitted(self) + if not isinstance(y_pred, CategoricalDistribution): + raise TypeError( + f"y_pred must be a CategoricalDistribution, got {type(y_pred).__name__}" + ) + + # If subclass implemented the torch version, use it + if ( + self.__class__._predict_proba_torch_impl + != Calibrator._predict_proba_torch_impl + ): + return self._predict_proba_torch_impl(y_pred) + + # Fallback to numpy routing if torch is not implemented + if self.__class__._predict_proba_impl != Calibrator._predict_proba_impl: + y_pred_probs = y_pred.get_probs() + probs = self._predict_proba_impl(y_pred_probs.detach().cpu().numpy()) + return CategoricalProbs( + torch.as_tensor( + probs, device=y_pred_probs.device, dtype=y_pred_probs.dtype + ) + ) + + raise NotImplementedError( + "Subclasses must implement _predict_proba_impl or _predict_proba_torch_impl." + ) + + def _predict_proba_impl(self, X: np.ndarray) -> np.ndarray: + raise NotImplementedError() + + def _predict_proba_torch_impl( + self, y_pred: CategoricalDistribution + ) -> CategoricalDistribution: + raise NotImplementedError() + + def predict(self, X): + y_probs = self.predict_proba(X) + class_idxs = np.argmax(y_probs, axis=-1) + return np.asarray(self.classes_)[class_idxs] + + +class ApplyToLogitsCalibrator(Calibrator): + def __init__(self, calib: BaseEstimator): + super().__init__() + self.calib = calib + + def _fit_impl(self, X: np.ndarray, y: np.ndarray) -> None: + self.calib_ = sklearn.base.clone(self.calib) + self.calib_.fit(np.log(X + 1e-10), y) + + def _predict_proba_impl(self, X): + return self.calib_.predict_proba(np.log(X + 1e-10)) + + +class MulticlassOneVsOneCalibrator(Calibrator): + def __init__(self, binary_calibrator: BaseEstimator): + super().__init__() + self.binary_calibrator = binary_calibrator + + def _fit_impl(self, X: np.ndarray, y: np.ndarray) -> None: + self.n_classes_ = X.shape[-1] + self.bin_calibs_ = [] + + if self.n_classes_ == 2: + # binary classification + bin_calib = sklearn.base.clone(self.binary_calibrator) + bin_calib.fit(X, y) + self.bin_calibs_.append(bin_calib) + else: + for i in range(self.n_classes_): + for j in range(i + 1, self.n_classes_): + idxs = np.logical_or((y == i), (y == j)) + # idxs = np.arange(X.shape[0]) + bin_probs = np.stack([X[idxs, j], X[idxs, i]], axis=-1) + bin_probs += 1e-30 + bin_probs /= np.sum(bin_probs, axis=-1, keepdims=True) + # print(f'{np.any(np.isnan(bin_probs))=}') + bin_labels = (y[idxs] == i).astype(np.int32) + bin_calib = sklearn.base.clone(self.binary_calibrator) + bin_calib.fit(bin_probs, bin_labels) + self.bin_calibs_.append(bin_calib) + + def _predict_proba_impl(self, X): + if self.n_classes_ == 2: + # binary classification + return self.bin_calibs_[0].predict_proba(X) + else: + # use PKPD formula in http://proceedings.mlr.press/v60/manokhin17a/manokhin17a.pdf + pair_probs = [[None] * self.n_classes_ for i in range(self.n_classes_)] + multi_probs = [] + calib_idx = 0 + for i in range(self.n_classes_): + for j in range(i + 1, self.n_classes_): + bin_probs = np.stack([X[:, j], X[:, i]], axis=-1) + bin_probs += 1e-30 + bin_probs /= np.sum(bin_probs, axis=-1, keepdims=True) + pred_probs = self.bin_calibs_[calib_idx].predict_proba(bin_probs) + pair_probs[i][j] = pred_probs[:, 1] + pair_probs[j][i] = pred_probs[:, 0] + # if i == 0 and j == 1: + # print(f'{bin_probs=}') + # print(f'{pred_probs=}') + calib_idx += 1 + + for i in range(self.n_classes_): + sum_inv_probs = sum( + [ + 1.0 / (1e-30 + pair_probs[i][j]) + for j in range(self.n_classes_) + if j != i + ] + ) + multi_probs.append( + 1.0 / np.clip(sum_inv_probs - (self.n_classes_ - 2), 1e-30, np.inf) + ) + + multi_probs = np.stack(multi_probs, axis=-1) + multi_probs = np.clip(multi_probs, a_min=1e-30, a_max=np.inf) + multi_probs = multi_probs / np.sum(multi_probs, axis=-1, keepdims=True) + return multi_probs + + +class MulticlassOneVsRestCalibrator(Calibrator): + def __init__(self, binary_calibrator: BaseEstimator): + super().__init__() + self.binary_calibrator = binary_calibrator + + def _fit_impl(self, X: np.ndarray, y: np.ndarray) -> None: + self.n_classes_ = X.shape[-1] + self.bin_calibs_ = [] + + X = X.copy() + X[np.isnan(X)] = 0.0 + + if self.n_classes_ == 2: + # binary classification + bin_calib = sklearn.base.clone(self.binary_calibrator) + bin_calib.fit(X, y) + self.bin_calibs_.append(bin_calib) + else: + for i in range(self.n_classes_): + pos_probs = X[:, i] + neg_probs = 1.0 - pos_probs + bin_probs = np.stack([neg_probs, pos_probs], axis=-1) + bin_labels = (y == i).astype(np.int32) + bin_calib = sklearn.base.clone(self.binary_calibrator) + bin_calib.fit(bin_probs, bin_labels) + self.bin_calibs_.append(bin_calib) + + def _predict_proba_impl(self, X): + X = X.copy() + X[np.isnan(X)] = 0.0 + if self.n_classes_ == 2: + # binary classification + return self.bin_calibs_[0].predict_proba(X) + else: + multi_probs = [] + for i in range(self.n_classes_): + pos_probs = X[:, i] + neg_probs = 1.0 - pos_probs + bin_probs = np.stack([neg_probs, pos_probs], axis=-1) + pred_probs = self.bin_calibs_[i].predict_proba(bin_probs) + multi_probs.append(pred_probs[:, 1]) + multi_probs = np.stack(multi_probs, axis=-1) + multi_probs = np.clip(multi_probs, a_min=1e-30, a_max=np.inf) + multi_probs = multi_probs / np.sum(multi_probs, axis=-1, keepdims=True) + return multi_probs + + +class MixtureCalibrator(Calibrator): + def __init__( + self, + calibrator: Calibrator, + output_constant: float = 1.0, + input_constant: float = 0.0, + ): + super().__init__() + self.calibrator = calibrator + self.output_constant = output_constant + self.input_constant = input_constant + + def _get_mixture(self, dist_1: CategoricalDistribution, unif_coef: float): + probs = dist_1.get_probs() + unif_probs = (1.0 / probs.shape[-1]) * torch.ones_like(probs) + return CategoricalProbs((1.0 - unif_coef) * probs + unif_coef * unif_probs) + + def _fit_torch_impl( + self, y_pred: CategoricalDistribution, y_true_labels: torch.Tensor + ): + self.n_samples_ = y_pred.get_n_samples() + if self.input_constant != 0.0: + y_pred = self._get_mixture( + y_pred, self.input_constant / (self.n_samples_ + 1) + ) + self.cal_ = sklearn.base.clone(self.calibrator) + self.cal_.fit_torch(y_pred, y_true_labels) + + def _predict_proba_torch_impl( + self, y_pred: CategoricalDistribution + ) -> CategoricalDistribution: + if self.input_constant != 0.0: + y_pred = self._get_mixture( + y_pred, self.input_constant / (self.n_samples_ + 1) + ) + y_pred = self.cal_.predict_proba_torch(y_pred) + if self.output_constant != 0.0: + y_pred = self._get_mixture( + y_pred, self.output_constant / (self.n_samples_ + 1) + ) + + return y_pred + + +class WrapCalibratorAsClassifier(BaseEstimator, ClassifierMixin): + def __init__(self, calib: BaseEstimator): + self.calib = calib + + def fit(self, X, y): + # print(f'Fitting calibrator: {self.calib}') + self.calib_ = sklearn.base.clone(self.calib) + self.calib_.fit(X, y) + self.classes_ = list(range(np.max(y) + 1)) + return self + + def predict_proba(self, X): + return self.calib_.predict_proba(X) diff --git a/probmetrics/calibrators/binning.py b/probmetrics/calibrators/binning.py new file mode 100644 index 0000000..7a0a218 --- /dev/null +++ b/probmetrics/calibrators/binning.py @@ -0,0 +1,154 @@ +import numpy as np + +from .base import Calibrator +from probmetrics.utils import flatten_binary_probs, expand_binary_probs + + +class BinaryHistogramBinningCalibrator(Calibrator): + """ + Binary Histogram Binning Calibrator. + + This calibrator partitions the uncalibrated predicted probabilities into + discrete bins and calculates the empirical probability of the positive + class within each bin. During prediction, it maps uncalibrated probabilities + to the empirical probability of their corresponding bin. + + Parameters + ---------- + n_bins : int, default=10 + The number of bins to partition the probability space into. + strategy : {'uniform', 'quantile'}, default='uniform' + Strategy used to define the widths of the bins. + - 'uniform': Bins have identical widths (e.g., [0, 0.1), [0.1, 0.2)). + - 'quantile': Bins have varying widths but contain approximately the + same number of samples. + + Attributes + ---------- + bin_edges_ : ndarray of shape (n_bins + 1,) + The edges defining the bins. Learned during `fit`. + bin_values_ : ndarray of shape (n_bins,) + The calibrated probability assigned to each bin. Learned during `fit`. + + Reference + --------- + Bianca Zadrozny and Charles Elkan. Obtaining calibrated probability estimates from + decision trees and naive bayesian classifiers. International Conference on Machine + Learning, 2001. + """ + + def __init__(self, n_bins: int = 10, strategy: str = "uniform"): + super().__init__() + + if strategy not in ["uniform", "quantile"]: + raise ValueError( + f"Strategy must be 'uniform' or 'quantile', got {strategy!r}." + ) + + self.strategy = strategy + self.n_bins = n_bins + + def _fit_impl(self, X: np.ndarray, y: np.ndarray) -> None: + probs = flatten_binary_probs(X) + + if self.strategy == "uniform": + self.bin_edges_ = np.linspace(0.0, 1.0, self.n_bins + 1) + else: + self.bin_edges_ = np.unique( + np.quantile(probs, np.linspace(0.0, 1.0, self.n_bins + 1)) + ) + if len(self.bin_edges_) == 1: + # Fallback if variance is 0 + self.bin_edges_ = np.array([0.0, 1.0]) + + bin_indices = np.digitize(probs, self.bin_edges_, right=False) - 1 + max_idx = len(self.bin_edges_) - 2 + + # Ensures exact 1.0s fall in the final bin rather than creating an out-of-bounds + # index + bin_indices = np.clip(bin_indices, 0, max_idx) + + bin_sums = np.bincount( + bin_indices, weights=y, minlength=len(self.bin_edges_) - 1 + ) + bin_counts = np.bincount(bin_indices, minlength=len(self.bin_edges_) - 1) + + # Fallback for empty bins: Global prior probability of the positive class + # (Safer than midpoints if the underlying model is severely miscalibrated) + global_prior = np.mean(y) if len(y) > 0 else 0.5 + fallback_values = np.full(max_idx + 1, global_prior) + + self.bin_values_ = np.divide( + bin_sums, bin_counts, out=fallback_values, where=(bin_counts > 0) + ) + + def _predict_proba_impl(self, X: np.ndarray) -> np.ndarray: + probs = flatten_binary_probs(X) + bin_indices = np.digitize(probs, self.bin_edges_, right=False) - 1 + bin_indices = np.clip( + bin_indices, 0, len(self.bin_edges_) - 2 + ) # Ensures exact 1.0s fall in the final bin + probs = self.bin_values_[bin_indices] + probs = expand_binary_probs(probs) + return probs + + +class BinaryPlattBinnerCalibrator(Calibrator): + """ + From https://github.com/p-lambda/verified_calibration + + Reference + --------- + Ananya Kumar, Percy S. Liang, and Tengyu Ma. Verified uncertainty calibration. + Advances in neural information processing systems, 2019. + """ + + def __init__(self): + super().__init__() + try: + from calibration import PlattBinnerCalibrator + except ImportError as e: + raise ImportError("The 'calibration' package is required.") from e + + def _fit_impl(self, X: np.ndarray, y: np.ndarray) -> None: + probs = flatten_binary_probs(X) + from calibration import PlattBinnerCalibrator + + self.cal_ = PlattBinnerCalibrator(num_calibration=None, num_bins=10) + self.cal_.train_calibration(probs, y) + + def _predict_proba_impl(self, X: np.ndarray) -> np.ndarray: + probs = flatten_binary_probs(X) + probs = self.cal_.calibrate(probs) + probs = expand_binary_probs(probs) + return probs + + +class NetcalBBQCalibrator(Calibrator): + """ + From https://github.com/EFS-OpenSource/calibration-framework + + Reference + --------- + Mahdi Pakdaman Naeini, Gregory Cooper, and Milos Hauskrecht. Obtaining well + calibrated probabilities using bayesian binning. AAAI Conference on Artificial + Intelligence, 2015. + """ + + def __init__(self): + super().__init__() + try: + from netcal.binning import BBQ + except ImportError as e: + raise ImportError("The 'netcal' package is required.") from e + + def _fit_impl(self, X: np.ndarray, y: np.ndarray) -> None: + from netcal.binning import BBQ + + self.cal_ = BBQ() + self.cal_.fit(X, y) + + def _predict_proba_impl(self, X: np.ndarray) -> np.ndarray: + probs = self.cal_.transform(X) + probs = expand_binary_probs(probs) + return probs diff --git a/probmetrics/calibrators/factory.py b/probmetrics/calibrators/factory.py new file mode 100644 index 0000000..34ef1b5 --- /dev/null +++ b/probmetrics/calibrators/factory.py @@ -0,0 +1,180 @@ +from .base import ( + Calibrator, + ApplyToLogitsCalibrator, + WrapCalibratorAsClassifier, + MulticlassOneVsOneCalibrator, + MulticlassOneVsRestCalibrator, + MixtureCalibrator, +) + +from .logistic import ( + VectorScalingCalibrator, + SVSCalibrator, + MatrixScalingCalibrator, + SMSCalibrator, + DirichletCalibrator, + LogisticCalibrator, + BinaryLogisticCalibrator, + RegularizedAffineScalingCalibrator, + RegularizedQuadraticScalingCalibrator, + BetacalCalibrator, +) + +from .temperature_scaling import ( + TemperatureScalingCalibrator, + GuoTemperatureScalingCalibrator, + AutoGluonTemperatureScalingCalibrator, + AutoGluonInverseTemperatureScalingCalibrator, + TorchUncertaintyTemperatureScalingCalibrator, + TorchcalTemperatureScalingCalibrator, + NetcalTemperatureScalingCalibrator, + ETSCalibrator, +) + +from .nonparametric import ( + NetcalENIRCalibrator, + CenteredIsotonicRegressionCalibrator, + BinaryVennAbersCalibrator, + VennAbersCalibrator, + KernelCalibrator, +) + +from .binning import ( + BinaryHistogramBinningCalibrator, + BinaryPlattBinnerCalibrator, + NetcalBBQCalibrator, +) + +from .splines import ( + MLISplineCalibrator, + CDFSplineCalibrator, +) + +from .sklearn import ( + SklearnCalibrator, +) + +from .trees import ( + CatBoostCalibrator, +) + + +def get_calibrator( + calibration_method: str, + calibrate_with_mixture: bool = False, + cal_mixture_output_constant: float = 1.0, + cal_mixture_input_constant: float = 0.0, + **config, +) -> Calibrator: + if calibration_method == "platt": + cal = SklearnCalibrator(method="sigmoid", cv="prefit") + elif calibration_method == "platt-logits": + cal = ApplyToLogitsCalibrator(SklearnCalibrator(method="sigmoid", cv="prefit")) + elif calibration_method == "isotonic": + cal = SklearnCalibrator(method="isotonic", cv="prefit") + elif calibration_method == "ivap": + cal = VennAbersCalibrator(use_ovo=config.get("va_use_ovo", False)) + elif calibration_method == "ivap-ovr": + cal = MulticlassOneVsRestCalibrator(BinaryVennAbersCalibrator()) + elif calibration_method == "ivap-ovo": + cal = MulticlassOneVsOneCalibrator(BinaryVennAbersCalibrator()) + elif calibration_method == "isotonic-naive-ovr": + # should be the same as 'isotonic', this is just to check + cal = MulticlassOneVsRestCalibrator( + SklearnCalibrator(method="isotonic", cv="prefit") + ) + elif calibration_method == "cir": + cal = MulticlassOneVsRestCalibrator(CenteredIsotonicRegressionCalibrator()) + elif calibration_method in ["temp-scaling", "ts-mix"]: + cal = TemperatureScalingCalibrator( + opt=config.get("ts_opt", "bisection"), + max_bisection_steps=config.get("ts_max_bisection_steps", 30), + lr=config.get("ts_lr", 0.1), + max_iter=config.get("ts_max_iter", 200), + use_inv_temp=config.get("ts_use_inv_temp", True), + inv_temp_init=config.get("ts_inv_temp_init", 1 / 1.5), + ) + elif calibration_method == "linear-scaling": + cal = BinaryLogisticCalibrator(type="linear") + elif calibration_method == "affine-scaling": + cal = BinaryLogisticCalibrator(type="affine") + elif calibration_method == "quadratic-scaling": + cal = BinaryLogisticCalibrator(type="quadratic") + elif calibration_method == "beta": + cal = BetacalCalibrator(parameters=config.get("beta_parameters", "abm")) + elif calibration_method == "svs": + cal = SVSCalibrator( + penalty=config.get("svs_penalty", "ridge"), + rho=config.get("svs_rho", 1.0), + tau=config.get("svs_tau", 1.0), + lambda_intercept=config.get("svs_lambda_intercept", 1.0), + lambda_diagonal=config.get("svs_lambda_diagonal", 1.0), + opt=config.get("svs_opt", "bfgs"), + max_iter=config.get("svs_max_iter", 500), + tol=config.get("svs_tol", 1e-5), + print_init_info=config.get("svs_print_init_info", True), + ) + elif calibration_method == "sms": + cal = SMSCalibrator( + penalty=config.get("sms_penalty", "ridge"), + rho=config.get("sms_rho", 1.0), + tau=config.get("sms_tau", 1.0), + lambda_intercept=config.get("sms_lambda_intercept", 1.0), + lambda_diagonal=config.get("sms_lambda_diagonal", 1.0), + lambda_off_diagonal=config.get("sms_lambda_off_diagonal", 1.0), + opt=config.get("sms_opt", "bfgs"), + max_iter=config.get("sms_max_iter", 500), + tol=config.get("sms_tol", 1e-5), + print_init_info=config.get("sms_print_init_info", True), + ) + elif calibration_method in ["logistic", "logistic-mix"]: + cal = LogisticCalibrator( + binary_type=config.get("logistic_binary_type", "affine"), + multiclass_type=config.get("logistic_multiclass_type", "sms"), + ) + elif calibration_method == "torchunc-ts": + cal = TorchUncertaintyTemperatureScalingCalibrator( + init_val=config.get("ts_init_val", 1), + lr=config.get("ts_lr", 0.1), + max_iter=config.get("ts_max_iter", 100), + ) + elif calibration_method == "guo-ts": + cal = GuoTemperatureScalingCalibrator() + elif calibration_method == "torchcal-ts": + cal = TorchcalTemperatureScalingCalibrator() + elif calibration_method == "autogluon-ts": + cal = AutoGluonTemperatureScalingCalibrator( + init_val=config.get("ts_init_val", 1), + max_iter=config.get("ts_max_iter", 200), + lr=config.get("ts_lr", 0.1), + ) + elif calibration_method == "autogluon-inv-ts": + cal = AutoGluonInverseTemperatureScalingCalibrator( + init_val=config.get("ts_init_val", 1), + max_iter=config.get("ts_max_iter", 200), + lr=config.get("ts_lr", 0.1), + ) + elif calibration_method == "dircal": + cal = DirichletCalibrator( + n_cv=0, + reg_lambda=config.get("dircal_reg_lambda", 1e-3), + reg_mu=config.get("dircal_reg_mu", 1e-3), + ) + elif calibration_method == "dircal-cv": + cal = DirichletCalibrator( + n_cv=config.get("dircal_n_cv", 5), + use_odir=config.get("dircal_use_odir", False), + reg_lambda_grid=config.get("dircal_reg_lambda_grid", None), + reg_mu_grid=config.get("dircal_reg_mu_grid", None), + ) + else: + raise ValueError(f'Unknown calibration method "{calibration_method}"') + + if calibrate_with_mixture or calibration_method.endswith("-mix"): + cal = MixtureCalibrator( + cal, + output_constant=cal_mixture_output_constant, + input_constant=cal_mixture_input_constant, + ) + + return cal diff --git a/probmetrics/calibrators/logistic.py b/probmetrics/calibrators/logistic.py new file mode 100644 index 0000000..960ac69 --- /dev/null +++ b/probmetrics/calibrators/logistic.py @@ -0,0 +1,874 @@ +import torch +import logging +import torchmin +import warnings +import numpy as np +from typing import Optional, List +from sklearn.model_selection import StratifiedKFold, GridSearchCV + +from .base import Calibrator, WrapCalibratorAsClassifier +from probmetrics.distributions import ( + CategoricalDistribution, + CategoricalLogits, +) + +from probmetrics.saga import ( + fit_affine_scaling, + warm_up_affine_scaling, + fit_quadratic_scaling, + warm_up_quadratic_scaling, + fit_vector_scaling, + warm_up_vector_scaling, + fit_matrix_scaling, + warm_up_matrix_scaling, +) + +from probmetrics.utils import ( + validate_probabilities, + flatten_binary_probs, + expand_binary_probs, + binary_probs_to_logits, + multiclass_probs_to_logits, +) + +logger = logging.getLogger(__name__) + + +class VectorScalingCalibrator(Calibrator): + """ + Adapted from https://github.com/rishabh-ranjan/torchcal + + Reference + --------- + Chuan Guo, Geoff Pleiss, Yu Sun, and Kilian Q. Weinberger. On calibration of modern + neural networks. International Conference on Machine Learning, 2017. + """ + def __init__(self): + super().__init__() + + def _fit_torch_impl( + self, y_pred: CategoricalDistribution, y_true_labels: torch.Tensor + ): + y_pred_logits = y_pred.get_logits() + num_classes = y_pred_logits.shape[1] + + def loss(temp_bias): + temp, bias = temp_bias.split([num_classes, num_classes]) + return torch.nn.functional.cross_entropy( + y_pred_logits / temp + bias, y_true_labels + ) + + temp = torch.ones(num_classes, device=y_pred_logits.device) + bias = torch.zeros(num_classes, device=y_pred_logits.device) + temp_bias = torch.cat([temp, bias]) + res = torchmin.minimize(loss, temp_bias, method="bfgs") + if not res.success: + warnings.warn( + f"{self.__class__}: {res.message} Not updating calibrator params." + ) + self.temp_, self.bias_ = temp, bias + else: + self.temp_, self.bias_ = res.x.split([num_classes, num_classes]) + + def _predict_proba_torch_impl( + self, y_pred: CategoricalDistribution + ) -> CategoricalDistribution: + return CategoricalLogits(y_pred.get_logits() / self.temp_ + self.bias_) + + +class SVSCalibrator(Calibrator): + """ + Multiclass post-hoc calibration with a structured scaling scheme + $softmax((aI + diag(v))x + b)$ on the logits x. + + Numba functions are warmed-up the first time the class is initialized with the + 'saga' optimizer. + + A penalty (one of [None, "mcp", "lasso", "ridge"]) is applied to the intercept (b) + and vector scaling (v) parameters, with respective regularization strength + lambda_intercept*(k**rho)/(n**tau) and lambda_diagonal*(k**rho)/(n**tau). + + Instead of fitting the global scaling parameter jointly with the other parameters, + the logits are pre-processed with temperature scaling (with no regularization) and a + is then fixed to a=1. + + [!] solver = 'bfgs' only supports 'ridge' penalty. + [!] For solver = 'bfgs', max_iter and tol are ignored and default torchmin + parameters are used. + + Reference + --------- + Eugène Berta, David Holzmüller, Michael I. Jordan, and Francis Bach. Structured + matrix scaling for multi-class calibration. International Conference on Artificial + Intelligence and Statistics, 2026. + """ + + _warmed_up_saga = False + _ALLOWED_PENALTIES = [None, "mcp", "lasso", "ridge"] + _ALLOWED_OPTIMIZERS = ["saga", "bfgs"] + + def __init__( + self, + penalty: str = "ridge", + rho: float = 1.0, + tau: float = 1.0, + lambda_intercept: float = 1.0, + lambda_diagonal: float = 1.0, + opt: str = "bfgs", + max_iter: int = 500, + tol: float = 1e-5, + print_init_info: bool = True, + ): + super().__init__() + + if penalty not in self._ALLOWED_PENALTIES: + raise ValueError( + f"Penalty '{penalty}' not recognized. Choose from {self._ALLOWED_PENALTIES}." + ) + + if opt not in self._ALLOWED_OPTIMIZERS: + raise ValueError( + f"Optimizer '{opt}' not recognized. Choose from {self._ALLOWED_OPTIMIZERS}." + ) + + if opt == "bfgs" and penalty != "ridge": + raise ValueError( + "The 'bfgs' optimizer only supports the 'ridge' penalty, use 'saga' instead." + ) + + if opt == "saga": + try: + from numba import njit + except ImportError as e: + raise ImportError( + "The 'numba' package is required for optimizer 'saga'." + ) from e + + if not SVSCalibrator._warmed_up_saga: + if print_init_info: + logger.info( + "First SVSCalibrator instantiation with 'saga' - warming up Numba functions..." + ) + warm_up_vector_scaling() + SVSCalibrator._warmed_up_saga = True + if print_init_info: + logger.info("Warmed up!") + + self.penalty = penalty + self.rho = rho + self.tau = tau + self.lambda_intercept = lambda_intercept + self.lambda_diagonal = lambda_diagonal + self.max_iter = max_iter + self.tol = tol + self.opt = opt + self.print_init_info = print_init_info + + def _fit_impl(self, X: np.ndarray, y: np.ndarray) -> None: + n, k = X.shape + + from .factory import get_calibrator + + self.ts_ = get_calibrator("ts-mix") + self.ts_.fit(X, y) + scaled_probs = self.ts_.predict_proba(X) + + reg_intercept = self.lambda_intercept * (k**self.rho) / (n**self.tau) + reg_diagonal = self.lambda_diagonal * (k**self.rho) / (n**self.tau) + + logits = multiclass_probs_to_logits(scaled_probs) + + if self.opt == "saga": + self.v_, self.b_ = fit_vector_scaling( + logits, + y, + penalty=self.penalty, + reg_intercept=reg_intercept, + reg_diagonal=reg_diagonal, + max_iter=self.max_iter, + tol=self.tol, + ) + + elif self.opt == "bfgs": + self.v_, self.b_ = self._fit_bfgs( + torch.tensor(logits), + torch.as_tensor(y, dtype=torch.long), + num_classes=k, + reg_intercept=reg_intercept, + reg_diagonal=reg_diagonal, + ) + + def _fit_bfgs( + self, + logits: torch.Tensor, + y: torch.Tensor, + num_classes, + reg_intercept, + reg_diagonal, + ): + K = num_classes + + initial_params = torch.zeros(2 * K, device=logits.device, dtype=logits.dtype) + + def loss(params): + v_delta = params[:K] + b = params[K:] + + scaled_logits = logits * (1.0 + v_delta) + b + ce = torch.nn.functional.cross_entropy(scaled_logits, y) + + r_b = reg_intercept * b.pow(2).sum() + r_v = reg_diagonal * v_delta.pow(2).sum() + + return ce + r_b + r_v + + method = "l-bfgs" if initial_params.numel() > 1000 else "bfgs" + res = torchmin.minimize(loss, initial_params, method=method) + + v_delta_final = res.x[:K] + b_final = res.x[K:] + + return ( + 1.0 + v_delta_final + ).detach().cpu().numpy(), b_final.detach().cpu().numpy() + + def _predict_proba_impl(self, X: np.ndarray) -> np.ndarray: + scaled_probs = self.ts_.predict_proba(X) + logits = multiclass_probs_to_logits(scaled_probs) + calibrated_logits = self.v_.reshape(1, -1) * logits + self.b_.reshape(1, -1) + calibrated_logits -= np.max(calibrated_logits, axis=1, keepdims=True) + exp_logits = np.exp(calibrated_logits) + return exp_logits / np.sum(exp_logits, axis=1, keepdims=True) + + +class MatrixScalingCalibrator(Calibrator): + """ + Adapted from https://github.com/rishabh-ranjan/torchcal + + Reference + --------- + Chuan Guo, Geoff Pleiss, Yu Sun, and Kilian Q. Weinberger. On calibration of modern + neural networks. International Conference on Machine Learning, 2017. + """ + def __init__(self): + super().__init__() + + def _fit_torch_impl( + self, y_pred: CategoricalDistribution, y_true_labels: torch.Tensor + ): + y_pred_logits = y_pred.get_logits() + + num_classes = y_pred_logits.shape[1] + + num_params = num_classes * (num_classes + 1) + if num_params <= 1000: + method = "bfgs" + else: + method = "l-bfgs" + + def loss(itemp_bias): + itemp, bias = itemp_bias.split([num_classes**2, num_classes]) + itemp = itemp.view(num_classes, num_classes) + return torch.nn.functional.cross_entropy( + y_pred_logits @ itemp + bias, y_true_labels + ) + + itemp = torch.eye( + num_classes, + num_classes, + device=y_pred_logits.device, + dtype=y_pred_logits.dtype, + ) + bias = torch.zeros( + num_classes, device=y_pred_logits.device, dtype=y_pred_logits.dtype + ) + itemp_bias = torch.cat([itemp.view(-1), bias]) + res = torchmin.minimize(loss, itemp_bias, method=method) + if not res.success: + warnings.warn( + f"{self.__class__}: {res.message} Not updating calibrator params." + ) + self.itemp_, self.bias_ = itemp, bias + else: + self.itemp_, self.bias_ = res.x.split([num_classes**2, num_classes]) + self.itemp_ = self.itemp_.view(num_classes, num_classes) + + def _predict_proba_torch_impl( + self, y_pred: CategoricalDistribution + ) -> CategoricalDistribution: + return CategoricalLogits(y_pred.get_logits() @ self.itemp_ + self.bias_) + + +class SMSCalibrator(Calibrator): + """ + Multiclass post-hoc calibration with a structured scaling scheme + $softmax( (aI + diag(v) + off_diag(M) )x + b)$ on the logits x. + + Numba functions are warmed-up the first time the class is initialized with the + 'saga' optimizer. + + A penalty (one of ["mcp", "lasso", "group_lasso", "ridge"]) is applied to the + intercept (b), vector scaling (v) and off diagonal (M) parameters, with respective + regularization strength lambda_intercept*(k**rho)/(n**tau), + lambda_diagonal*(k**rho)/(n**tau) and lambda_off_diagonal*((k*(k-1))**rho)/(n**tau). + + Instead of fitting the global scaling parameter jointly with the other parameters, + the logits are pre-processed with temperature scaling (with no regularization) and a + is then fixed to a=1. + + [!] The 'bfgs' solver only supports 'ridge' penalty. + [!] If 'bfgs' solver is used, max_iter and tol are ignored and default torchmin + parameters are used. + + Reference + --------- + Eugène Berta, David Holzmüller, Michael I. Jordan, and Francis Bach. Structured + matrix scaling for multi-class calibration. International Conference on Artificial + Intelligence and Statistics, 2026. + """ + + _warmed_up_saga = False + _ALLOWED_PENALTIES = [None, "mcp", "lasso", "group_lasso", "ridge"] + _ALLOWED_OPTIMIZERS = ["saga", "bfgs"] + + def __init__( + self, + penalty: str = "ridge", + rho: float = 1.0, + tau: float = 1.0, + lambda_intercept: float = 1.0, + lambda_diagonal: float = 1.0, + lambda_off_diagonal: float = 1.0, + opt: str = "bfgs", + max_iter: int = 500, + tol: float = 1e-5, + print_init_info: bool = True, + ): + super().__init__() + + if penalty not in self._ALLOWED_PENALTIES: + raise ValueError( + f"Penalty '{penalty}' not recognized. Choose from {self._ALLOWED_PENALTIES}." + ) + + if opt not in self._ALLOWED_OPTIMIZERS: + raise ValueError( + f"Optimizer '{opt}' not recognized. Choose from {self._ALLOWED_OPTIMIZERS}." + ) + + if opt == "bfgs" and penalty != "ridge": + raise ValueError( + "The 'bfgs' optimizer only supports the 'ridge' penalty, use 'saga' instead." + ) + + if opt == "saga": + try: + from numba import njit + except ImportError as e: + raise ImportError( + "The 'numba' package is required for optimizer 'saga'." + ) from e + + if not SMSCalibrator._warmed_up_saga: + if print_init_info: + logger.info( + "First SMSCalibrator instantiation with 'saga' - warming up Numba functions..." + ) + warm_up_matrix_scaling() + SMSCalibrator._warmed_up_saga = True + if print_init_info: + logger.info("Warmed up!") + + self.penalty = penalty + self.rho = rho + self.tau = tau + self.lambda_intercept = lambda_intercept + self.lambda_diagonal = lambda_diagonal + self.lambda_off_diagonal = lambda_off_diagonal + self.max_iter = max_iter + self.tol = tol + self.opt = opt + self.print_init_info = print_init_info + + def _get_logits( + self, X: np.ndarray, append_bias: bool = True + ) -> tuple[np.ndarray, int, int]: + logits = multiclass_probs_to_logits(X) + if append_bias: + # Adding a constant term to logit vectors to fit the intercept. + logits = np.hstack([logits, np.ones((len(logits), 1))]) + return logits + + def _fit_impl(self, X: np.ndarray, y: np.ndarray) -> None: + """Fits the calibration matrix W.""" + validate_probabilities(X) + + n, k = X.shape + + from .factory import get_calibrator + + self.ts_ = get_calibrator("ts-mix") + self.ts_.fit(X, y) + scaled_probs = self.ts_.predict_proba(X) + + reg_intercept = self.lambda_intercept * (k**self.rho) / (n**self.tau) + reg_diagonal = self.lambda_diagonal * (k**self.rho) / (n**self.tau) + reg_off_diagonal = ( + self.lambda_off_diagonal * ((k * (k - 1)) ** self.rho) / (n**self.tau) + ) + + if self.opt == "saga": + logits = self._get_logits(scaled_probs) + self.W_ = fit_matrix_scaling( + logits, + y, + penalty=self.penalty, + max_iter=self.max_iter, + tol=self.tol, + init_scaling=1.0, + reg_intercept=reg_intercept, + reg_diagonal=reg_diagonal, + reg_off_diagonal=reg_off_diagonal, + ) + + elif self.opt == "bfgs": + logits = self._get_logits(scaled_probs, append_bias=False) + self.W_ = self._fit_bfgs( + torch.tensor(logits), + torch.as_tensor(y, dtype=torch.long), + num_classes=k, + reg_intercept=reg_intercept, + reg_diagonal=reg_diagonal, + reg_off_diagonal=reg_off_diagonal, + ) + + def _fit_bfgs( + self, + logits: torch.Tensor, + y: torch.Tensor, + num_classes, + reg_intercept, + reg_diagonal, + reg_off_diagonal, + ): + K = num_classes + + initial_params = torch.zeros( + K * (K + 1), device=logits.device, dtype=logits.dtype + ) + + def loss(params): + W_delta = params[: K * K].view(K, K) + b = params[K * K :] + + scaled_logits = logits + torch.nn.functional.linear(logits, W_delta, b) + ce = torch.nn.functional.cross_entropy(scaled_logits, y) + + W_diag = W_delta.diagonal() + r_i = reg_intercept * b.pow(2).sum() + r_d = reg_diagonal * W_diag.pow(2).sum() + r_o = reg_off_diagonal * (W_delta.pow(2).sum() - W_diag.pow(2).sum()) + + return ce + r_i + r_d + r_o + + method = "l-bfgs" if initial_params.numel() > 1000 else "bfgs" + res = torchmin.minimize(loss, initial_params, method=method) + + flat_result = res.x + W_delta_final = flat_result[: K * K].view(K, K) + b_final = flat_result[K * K :] + + with torch.no_grad(): + W_final = ( + torch.eye(K, device=logits.device, dtype=logits.dtype) + W_delta_final + ) + return torch.hstack([W_final, b_final.unsqueeze(1)]).detach().cpu().numpy() + + def _predict_proba_impl(self, X: np.ndarray) -> np.ndarray: + scaled_probs = self.ts_.predict_proba(X) + logits = self._get_logits(scaled_probs) + calibrated_logits = logits @ self.W_.T + calibrated_logits -= np.max(calibrated_logits, axis=1, keepdims=True) + exp_logits = np.exp(calibrated_logits) + return exp_logits / np.sum(exp_logits, axis=1, keepdims=True) + + +def logloss_np(y_true: np.ndarray, y_proba: np.ndarray): + return -np.mean(np.take_along_axis(np.log(y_proba), y_true[:, None], axis=1)) + + +class DirichletCalibrator(Calibrator): + """ + From https://github.com/dirichletcal/dirichlet_python + + Reference + --------- + Meelis Kull, Miquel Perello Nieto, Markus Kängsepp, Telmo Silva Filho, Hao Song, and + Peter Flach. Beyond temperature scaling: Obtaining well-calibrated multi-class + probabilities with Dirichlet calibration. Advances in Neural Information Processing + Systems, 2019. + """ + def __init__( + self, + n_cv: int = 0, + use_odir: bool = False, + reg_lambda: float = 0.0, + reg_mu: Optional[float] = None, + reg_lambda_grid: Optional[List[float]] = None, + reg_mu_grid: Optional[List[float]] = None, + ): + super().__init__() + try: + from dirichletcal.calib.fulldirichlet import FullDirichletCalibrator + except ImportError as e: + raise ImportError("The 'dirichletcal' package is required.") from e + + self.n_cv = n_cv + self.use_odir = use_odir + self.reg_lambda = reg_lambda + self.reg_mu = reg_mu + self.reg_lambda_grid = reg_lambda_grid + self.reg_mu_grid = reg_mu_grid + + def _fit_impl(self, X: np.ndarray, y: np.ndarray) -> None: + from dirichletcal.calib.fulldirichlet import FullDirichletCalibrator + + if self.n_cv == 0: + self.cal_ = FullDirichletCalibrator( + reg_lambda=self.reg_lambda, reg_mu=self.reg_mu + ) + elif self.n_cv >= 2: + reg_lambda_grid = self.reg_lambda_grid or [1e-1, 1e-2, 1e-3, 1e-4, 1e-5] + reg_mu_grid = self.reg_mu_grid or [1e-1, 1e-2, 1e-3, 1e-4, 1e-5] + calibrator = FullDirichletCalibrator( + reg_lambda=reg_lambda_grid, + reg_mu=reg_mu_grid if self.use_odir else None, + ) + calibrator = WrapCalibratorAsClassifier(calibrator) + skf = StratifiedKFold(n_splits=self.n_cv, shuffle=True, random_state=0) + self.cal_ = GridSearchCV( + calibrator, + param_grid={ + "calib__reg_lambda": reg_lambda_grid, + "calib__reg_mu": reg_mu_grid if self.use_odir else [None], + }, + cv=skf, + # use this because scikit-learn's logloss scorer fails in case of + # missing classes + scoring=lambda est, X_test, y_test: -logloss_np( + y_test, est.predict_proba(X_test) + ), + ) + else: + raise ValueError(f"n_cv must be either 0 or >=2, but is {self.n_cv}") + + self.cal_.fit(X, y) + + def _predict_proba_impl(self, X: np.ndarray) -> np.ndarray: + return self.cal_.predict_proba(X) + + +class LogisticCalibrator(Calibrator): + """ + By default, uses 'quadratic-scaling' for binary classification and 'sms' for + multiclass classification. + """ + + VALID_BINARY_TYPES = ["linear", "affine", "quadratic"] + VALID_MULTICLASS_TYPES = ["svs", "sms"] + + def __init__(self, binary_type: str = "quadratic", multiclass_type: str = "sms"): + super().__init__() + + if binary_type not in self.VALID_BINARY_TYPES: + raise ValueError( + f"Invalid binary type '{binary_type}'. Must be one of {self.VALID_BINARY_TYPES}" + ) + self.binary_type = binary_type + + if multiclass_type not in self.VALID_MULTICLASS_TYPES: + raise ValueError( + f"Invalid multiclass type '{multiclass_type}'. Must be one of {self.VALID_MULTICLASS_TYPES}" + ) + self.multiclass_type = multiclass_type + + def _fit_impl(self, X: np.ndarray, y: np.ndarray) -> None: + n_classes = len(np.unique(y)) + + if n_classes == 2: + if self.binary_type == "affine": + self.cal_ = BinaryLogisticCalibrator(type="affine") + elif self.binary_type == "linear": + self.cal_ = BinaryLogisticCalibrator(type="linear") + elif self.binary_type == "quadratic": + self.cal_ = BinaryLogisticCalibrator(type="quadratic") + + else: + if self.multiclass_type == "svs": + self.cal_ = SVSCalibrator() + elif self.multiclass_type == "sms": + self.cal_ = SMSCalibrator() + + self.cal_.fit(X, y) + return self + + def predict(self, X): + if self.cal_ is None: + raise RuntimeError("LogisticCalibrator must be fitted before prediction.") + return self.cal_.predict(X) + + def _predict_proba_impl(self, X): + if self.cal_ is None: + raise RuntimeError("LogisticCalibrator must be fitted before prediction.") + return self.cal_.predict_proba(X) + + +class BinaryLogisticCalibrator(Calibrator): + """ + Binary post-hoc calibration with linear, affine or quadratic scaling + on the binary logits using sklearn's LogisticRegression. + + This class fits a model of the form: + P(y=1) = sigma(calibrated_logit) + + Where `calibrated_logit` depends on the `type`: + - 'linear': calibrated_logit = w * logit (Temperature scaling with inverse + temperature) + - 'affine': calibrated_logit = b + w * logit (Platt scaling) + - 'quadratic': calibrated_logit = b + w1 * logit + w2 * logit^2 + """ + + VALID_TYPES = ["linear", "affine", "quadratic", "beta"] + + def __init__(self, type: str = "affine"): + """ + Args: + type (str, optional): The type of scaling. + One of ['linear', 'affine', 'quadratic']. + Defaults to 'affine'. + """ + super().__init__() + + if type not in self.VALID_TYPES: + raise ValueError( + f"Invalid type '{type}'. Must be one of {self.VALID_TYPES}" + ) + self.type = type + + def _get_logits(self, X: np.ndarray) -> np.ndarray: + log_p, log_1_minus_p = binary_probs_to_logits(X) + logits = log_p - log_1_minus_p + + if self.type == "linear": + return logits + + ones = np.ones_like(logits) + + if self.type == "affine": + return np.hstack([ones, logits]) + + elif self.type == "quadratic": + return np.hstack([ones, logits, np.square(logits)]) + + if self.type == "beta": # TODO Check that this is indeed beta + return np.hstack([ones, log_p, log_1_minus_p]) + + def _fit_impl(self, X: np.ndarray, y: np.ndarray) -> None: + """Fits the logistic regression calibrator.""" + from sklearn.linear_model import LogisticRegression + + self.cal_ = LogisticRegression( + penalty=None, fit_intercept=False, solver="lbfgs" + ) + + features = self._get_logits(X) + self.cal_.fit(features, y) + + def _predict_proba_impl(self, X: np.ndarray) -> np.ndarray: + features = self._get_logits(X) + return self.cal_.predict_proba(features) + + +class RegularizedAffineScalingCalibrator(Calibrator): + """ + Binary post-hoc calibration with Platt scaling (~binary logistic regression) + $sigmoid(ax+b)$ on the logits x. + Uses the SAGA algorithm to fit the scaling and intercept parameters a & b. + Numba functions are warmed-up the first time the class is initialized (~1s). + A penalty (one of ["mcp", "lasso", "ridge"]) can be applied to the intercept + parameter b, with regularization strength lambda_intercept. + To apply no regularization, leave penalty to None. + """ + + _warmed_up = False + + def __init__( + self, + penalty: str = None, + lambda_intercept: float = 0.0, + max_iter: int = 200, + tol: float = 1e-4, + print_init_info: bool = True, + ): + super().__init__() + self.penalty = penalty + self.lambda_intercept = lambda_intercept + self.max_iter = max_iter + self.tol = tol + self.print_init_info = print_init_info + + try: + from numba import njit + except ImportError as e: + raise ImportError("The 'numba' package is required.") from e + + if not RegularizedAffineScalingCalibrator._warmed_up: + if self.print_init_info: + print( + "First RegularizedAffineScalingCalibrator instantiation - warming up Numba functions..." + ) + warm_up_affine_scaling() + RegularizedAffineScalingCalibrator._warmed_up = True + if self.print_init_info: + print("Warmed up!") + + def _get_logits(self, X: np.ndarray) -> np.ndarray: + log_p, log_1_minus_p = binary_probs_to_logits(X) + logits = log_p - log_1_minus_p + ones = np.ones_like(logits) + return np.hstack([ones, logits]) + + def _fit_impl(self, X: np.ndarray, y: np.ndarray) -> None: + logits = self._get_logits(X) + self.w_ = fit_affine_scaling( + logits, + y, + penalty=self.penalty, + reg_intercept=self.lambda_intercept, + max_iter=self.max_iter, + tol=self.tol, + ) + + def _predict_proba_impl(self, X: np.ndarray) -> np.ndarray: + logits = self._get_logits(X) + logits = logits.dot(self.w_).reshape(-1, 1) + probs = 1.0 / (1.0 + np.exp(-logits)) + return np.hstack([1 - probs, probs]) + + +class RegularizedQuadraticScalingCalibrator(Calibrator): + """ + Binary post-hoc calibration with quadratic scaling $sigmoid(a + bx + cx^2)$ on the + logits x. + Uses the SAGA algorithm to fit the intercept, scaling and quadratic parameters a, b + & c. + Numba functions are warmed-up the first time the class is initialized (~1s). + A penalty (one of ["mcp", "lasso", "ridge"]) can be applied to the intercept and + quadratic parameters a & c, + with respective regularization strength lambda_intercept and lambda_quadratic. + To apply no regularization, leave penalty to None. + """ + + _warmed_up = False + + def __init__( + self, + penalty: str = None, + lambda_intercept: float = 0.0, + lambda_quadratic: float = 0.0, + max_iter: int = 200, + tol: float = 1e-4, + print_init_info: bool = True, + ): + super().__init__() + self.penalty = penalty + self.lambda_intercept = lambda_intercept + self.lambda_quadratic = lambda_quadratic + self.max_iter = max_iter + self.tol = tol + self.print_init_info = print_init_info + + try: + from numba import njit + except ImportError as e: + raise ImportError("The 'numba' package is required.") from e + + if not RegularizedQuadraticScalingCalibrator._warmed_up: + if self.print_init_info: + print( + "First RegularizedQuadraticScalingCalibrator instantiation - warming up Numba functions..." + ) + warm_up_quadratic_scaling() + RegularizedQuadraticScalingCalibrator._warmed_up = True + if self.print_init_info: + print("Warmed up!") + + def _get_logits(self, X: np.ndarray) -> np.ndarray: + log_p, log_1_minus_p = binary_probs_to_logits(X) + logits = log_p - log_1_minus_p + ones = np.ones_like(logits) + return np.hstack([ones, logits, np.square(logits)]) + + def _fit_impl(self, X: np.ndarray, y: np.ndarray) -> None: + logits = self._get_logits(X) + self.w_ = fit_quadratic_scaling( + logits, + y, + penalty=self.penalty, + reg_intercept=self.lambda_intercept, + reg_quadratic=self.lambda_quadratic, + max_iter=self.max_iter, + tol=self.tol, + ) + + def _predict_proba_impl(self, X: np.ndarray) -> np.ndarray: + logits = self._get_logits(X) + logits = logits.dot(self.w_).reshape(-1, 1) + probs = 1.0 / (1.0 + np.exp(-logits)) + return np.hstack([1 - probs, probs]) + + +class BetacalCalibrator(Calibrator): + """ + From https://github.com/betacal/python/ + + Reference + --------- + Meelis Kull, Telmo Silva Filho, and Peter Flach. Beta calibration: a well-founded + and easily implemented improvement on logistic calibration for binary classifiers. + International Conference on Artificial Intelligence and Statistics, 2017. + """ + def __init__(self, parameters="abm"): + super().__init__() + try: + from betacal import BetaCalibration + except ImportError as e: + raise ImportError("The 'betacal' package is required.") from e + self.parameters = parameters + + def _fit_impl(self, X: np.ndarray, y: np.ndarray) -> None: + probs = flatten_binary_probs(X) + from betacal import BetaCalibration + + self.cal_ = BetaCalibration(parameters=self.parameters) + self.cal_.fit(probs, y) + + def _predict_proba_impl(self, X: np.ndarray) -> np.ndarray: + probs = flatten_binary_probs(X) + # If logistic has coef[0] = 0 after .fit, an exeption occurs in .predict, + # very rare + try: + probs = self.cal_.predict(probs) + except Exception: + # Manually, following the intial implem, removing the problematic if + df = probs.reshape(-1, 1) + eps = np.finfo(df.dtype).eps + df = np.clip(df, eps, 1 - eps) + + x = np.hstack((df, 1.0 - df)) + x = np.log(x) + x[:, 1] *= -1 + probs = self.cal_.calibrator_.lr_.predict_proba(x)[:, 1] + + probs = expand_binary_probs(probs) + return probs diff --git a/probmetrics/calibrators/nonparametric.py b/probmetrics/calibrators/nonparametric.py new file mode 100644 index 0000000..6068956 --- /dev/null +++ b/probmetrics/calibrators/nonparametric.py @@ -0,0 +1,346 @@ +import torch +import warnings +import numpy as np + +from .base import Calibrator +from .sklearn import SklearnCalibrator + +from probmetrics.utils import flatten_binary_probs, expand_binary_probs +from probmetrics.distributions import CategoricalDistribution, CategoricalProbs + + +class CenteredIsotonicRegressionCalibrator(Calibrator): + """ + From https://github.com/mathijs02/cir-model + + Reference + --------- + Assaf P. Oron and Nancy Flournoy. Centered isotonic regression: point and interval + estimation for dose-response studies. Statistics in Biopharmaceutical Research, + 9(3):258-267, 2017. + """ + + def __init__(self): + super().__init__() + try: + from cir_model import CenteredIsotonicRegression + except ImportError as e: + raise ImportError("The 'cir_model' package is required.") from e + + def _fit_impl(self, X: np.ndarray, y: np.ndarray) -> None: + # TODO + # We have to use float64 since with float32 it can happen that somewhere + # internally a rounding error occurs (?) and interp1d thinks that it is asked to + # extrapolate at the boundary value, which causes it to output nan values, which + # are seen as a separate possible value but (y==value) is empty because nan==nan + # is false, so an error occurs because the method attempts to take an average of + # an empty set. + from cir_model import CenteredIsotonicRegression + + self.cal_ = CenteredIsotonicRegression() + + probs = flatten_binary_probs(X) + probs = probs.astype(np.float64) + y = y.astype(np.float64) + self.cal_.fit(probs, y) + self.min_ = np.min(probs) + self.max_ = np.max(probs) + + def _predict_proba_impl(self, X): + probs = flatten_binary_probs(X) + probs = probs.astype(np.float64) + # have to clip since CenteredIsotonicRegression refuses to extrapolate (?) + probs = np.clip(probs, self.min_, self.max_) + probs = self.cal_.transform(probs) + probs = np.clip(probs, 0.0, 1.0) + probs = expand_binary_probs(probs) + return probs + + +class BinaryVennAbersCalibrator(Calibrator): + """ + From https://github.com/ip200/venn-abers + + Reference + --------- + Vladimir Vovk, Ivan Petej, and Valentina Fedorova. Large-scale probabilistic + predictors with and without guarantees of validity. Advances in Neural Information + Processing Systems, 2015. + """ + + def __init__(self): + super().__init__() + try: + from venn_abers import VennAbers + except ImportError as e: + raise ImportError("The 'venn_abers' package is required.") from e + + def _fit_impl(self, X: np.ndarray, y: np.ndarray) -> None: + from venn_abers import VennAbers + + self.cal_ = VennAbers() + probs = flatten_binary_probs(X) + probs = expand_binary_probs(probs) + self.cal_.fit(probs, y) + + def _predict_proba_impl(self, X): + probs = flatten_binary_probs(X) + probs = expand_binary_probs(probs) + return self.cal_.predict_proba(probs)[0] + + +class VennAbersCalibrator(Calibrator): + """ + From https://github.com/ip200/venn-abers + + Reference + --------- + Vladimir Vovk, Ivan Petej, and Valentina Fedorova. Large-scale probabilistic + predictors with and without guarantees of validity. Advances in Neural Information + Processing Systems, 2015. + """ + + def __init__(self, use_ovo: bool = False): + super().__init__() + try: + from venn_abers import VennAbersCalibrator + except ImportError as e: + raise ImportError("The 'venn_abers' package is required.") from e + self.use_ovo = use_ovo + + def _fit_impl(self, X: np.ndarray, y: np.ndarray) -> None: + from venn_abers import VennAbersCalibrator + + self.cal_ = VennAbersCalibrator() + self.X_ = np.copy(X) + self.y_ = np.copy(y) + + def _predict_proba_impl(self, X): + return self.cal_.predict_proba( + p_cal=self.X_, + y_cal=self.y_, + p_test=X, + p0_p1_output=False, + va_type="one_vs_one" if self.use_ovo else "one_vs_all", + ) + + +class NetcalENIRCalibrator(Calibrator): + """ + From https://github.com/EFS-OpenSource/calibration-framework + + Reference + --------- + Naeini, Mahdi Pakdaman and Cooper, Gregory F.. Binary classifier calibration using + an ensemble of near isotonic regression models. International Conference on Data + Mining, 2016. + """ + def __init__(self): + super().__init__() + try: + from netcal.binning import ENIR + except ImportError as e: + raise ImportError("The 'netcal' package is required.") from e + self.defaulted_to_IR = False + + def _fit_impl(self, X: np.ndarray, y: np.ndarray) -> None: + from netcal.binning import ENIR + + self.cal_ = ENIR() + probs = flatten_binary_probs(X) + # netcal.binning.ENIR returns an error when .fit() is called with separable data + # (AUC=1). + try: + self.cal_.fit(probs, y) + except Exception as e: + warnings.warn( + f"ENIR failed during .fit(), defaulting to Isotonic regression." + ) + self.cal_ = SklearnCalibrator(method="isotonic", cv="prefit") + self.cal_.fit(probs, y) + self.defaulted_to_IR = True + + def _predict_proba_impl(self, X: np.ndarray) -> np.ndarray: + probs = flatten_binary_probs(X) + if self.defaulted_to_IR: + probs = self.cal_.predict_proba(probs) + else: + probs = self.cal_.transform(probs) + probs = expand_binary_probs(probs) + return probs + + +class KernelCalibrator(Calibrator): + """ + Beta / Dirichlet kernel from + https://github.com/tpopordanoska/ece-kde/blob/main/ece_kde.py, initially used to + estimate calibration errors, converted to a post-hoc calibration method here. + + Reference + --------- + Teodora Popordanoska, Raphael Sayer, and Matthew Blaschko. A consistent and + differentiable Lp canonical calibration error estimator. Advances in Neural + Information Processing Systems, 2022. + """ + + def __init__(self, bandwidth=None, device="cpu"): + """ + :param bandwidth: The bandwidth of the kernel. If None, LOO MLE is used to find + it during fit. + :param device: The device type: "cpu" or "cuda" + """ + super().__init__() + self.bandwidth = bandwidth + self.device = device + + def _get_heuristic_bandwidth(self, N: int, num_classes: int) -> float: + """Select a default bandwidth using Scott's Rule adapted for the variance of + Beta/Dirichlet kernels on a simplex.""" + # Intrinsic dimensionality: d = 1 for binary, d = C - 1 for multiclass + d = num_classes - 1 if num_classes > 1 else 1 + return N ** (-2.0 / (d + 4.0)) + + def _get_bandwidth(self, f): + """ + Select a bandwidth based on maximizing the leave-one-out likelihood (LOO MLE). + """ + bandwidths = torch.cat( + ( + torch.logspace(start=-5, end=-1, steps=15), + torch.linspace(0.2, 1, steps=5), + ) + ).to(self.device) + + max_b = -1 + max_l = float("-inf") + n = len(f) + + for b in bandwidths: + log_kern = self._get_kernel_matrix(f, f, b, loo=True) + log_fhat = torch.logsumexp(log_kern, 1) - torch.log( + torch.tensor(n - 1.0, device=self.device) + ) + l = torch.sum(log_fhat).item() + + if l > max_l: + max_l = l + max_b = b.item() + + return max_b + + def _get_kernel_matrix(self, z, zi, bandwidth, loo=False): + """ + Computes the pairwise log-kernel matrix between evaluation points `z` and fitted + points `zi`. + """ + # Clamp inputs safely to prevent log(0) + z_c = torch.clamp(z, 1e-10, 1 - 1e-10) + zi_c = torch.clamp(zi, 1e-10, 1 - 1e-10) + + if z.shape[1] == 1: + # Beta Kernel for Binary Classification + z_uns = z_c.unsqueeze(1) # Shape: (N, 1, 1) + zi_uns = zi_c.unsqueeze(0) # Shape: (1, M, 1) + p = zi_uns / bandwidth + 1 + q = (1 - zi_uns) / bandwidth + 1 + + log_beta = torch.lgamma(p) + torch.lgamma(q) - torch.lgamma(p + q) + log_num = (p - 1) * torch.log(z_uns) + (q - 1) * torch.log(1 - z_uns) + log_kern = (log_num - log_beta).squeeze(-1) # Shape: (N, M) + else: + # Dirichlet Kernel for Multiclass Classification + alphas = zi_c / bandwidth + 1 # Shape: (M, C) + log_beta = torch.sum(torch.lgamma(alphas), dim=1) - torch.lgamma( + torch.sum(alphas, dim=1) + ) # (M,) + log_num = torch.matmul(torch.log(z_c), (alphas - 1).T) # Shape: (N, M) + log_kern = log_num - log_beta.unsqueeze(0) # Shape: (N, M) + + if loo: + # Trick: set diagonal to -inf for leave-one-out calculation + diag_idx = torch.arange(len(z)) + log_kern[diag_idx, diag_idx] = torch.finfo(torch.float).min + + return log_kern + + def _fit_torch_impl( + self, + y_pred: CategoricalDistribution, + y_true_labels: torch.Tensor, + max_fit_samples: int = 10000, + ): + """ + Fits the kernel function by storing the calibration set and calculating the + bandwidth. + """ + with torch.no_grad(): + probs = y_pred.get_probs() + + if probs.ndim == 1: + probs = probs.unsqueeze(1) + + self.num_classes_ = probs.shape[1] + + # TODO: SPEED FIX: Subsample the fit set + N_total = probs.shape[0] + if N_total > max_fit_samples: + indices = torch.randperm(N_total, device=self.device)[:max_fit_samples] + fit_probs = probs[indices] + fit_labels = y_true_labels[indices] + else: + fit_probs = probs + fit_labels = y_true_labels + + # TODO: we use heuristic bandwith instead of tuned bandwidth + if self.bandwidth is None: + self.bandwidth = self._get_heuristic_bandwidth( + len(fit_probs), self.num_classes_ + ) + # if self.bandwidth is None: + # self.bandwidth = self._get_bandwidth(fit_probs) + + self.X_fit_ = fit_probs + self.y_fit_ = fit_labels + + def _predict_proba_torch_impl( + self, y_pred: CategoricalDistribution + ) -> CategoricalDistribution: + """ + Predicts calibrated probabilities by evaluating the Nadaraya-Watson estimator. + """ + with torch.no_grad(): + X_t = y_pred.get_probs() + is_1d = X_t.ndim == 1 + if is_1d: + X_t = X_t.unsqueeze(1) + + log_kern = self._get_kernel_matrix( + X_t, self.X_fit_, self.bandwidth, loo=False + ) + log_den = torch.logsumexp(log_kern, dim=1) + + if self.num_classes_ == 1: + # Binary Formulation + log_kern_y = log_kern.clone() + # Mask out entries where y != 1 (effectively multiplying by y in log + # space) + log_kern_y[:, self.y_fit_ == 0] = torch.finfo(torch.float).min + + log_num = torch.logsumexp(log_kern_y, dim=1) + prob_1 = torch.exp(log_num - log_den) + prob_0 = 1.0 - prob_1 + return CategoricalProbs(torch.stack([prob_0, prob_1], dim=1)) + + else: + # Canonical Multiclass Formulation + calibrated_probs = [] + for k in range(self.num_classes_): + log_kern_k = log_kern.clone() + # Mask out entries where y != k + log_kern_k[:, self.y_fit_ != k] = torch.finfo(torch.float).min + log_num_k = torch.logsumexp(log_kern_k, dim=1) + calibrated_probs.append(torch.exp(log_num_k - log_den)) + + res = torch.stack(calibrated_probs, dim=1) + # return res.cpu().numpy() + return CategoricalProbs(res) diff --git a/probmetrics/calibrators/orderpreserving.py b/probmetrics/calibrators/orderpreserving.py new file mode 100644 index 0000000..422ecc5 --- /dev/null +++ b/probmetrics/calibrators/orderpreserving.py @@ -0,0 +1,199 @@ +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim +from torch.utils.data import DataLoader, TensorDataset + +from .base import Calibrator + + +class _FCModel(nn.Module): + """ + From https://github.com/AmirooR/IntraOrderPreservingCalibration + + Standard Fully Connected base model as specified by 'config.json'. + Maps (n_classes -> hidden -> n_classes). + """ + + def __init__(self, in_features: int, hidden_sizes: list, out_features: int): + super().__init__() + layers = [] + prev = in_features + for h in hidden_sizes: + layers.append(nn.Linear(prev, h)) + layers.append(nn.ReLU()) + prev = h + layers.append(nn.Linear(prev, out_features)) + self.model = nn.Sequential(*layers) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return self.model(x) + + +class _OrderPreservingModel(nn.Module): + """ + From https://github.com/AmirooR/IntraOrderPreservingCalibration + + Internal PyTorch module adapting the provided OrderPreservingModel. + """ + + def __init__( + self, + base_model: nn.Module, + invariant: bool = True, + residual: bool = False, + num_classes: int = 10, + m_activation=F.softplus, + ): + super().__init__() + self.base_model = base_model + self.num_classes = num_classes + self.invariant = invariant + self.m_activation = m_activation + self.residual = residual + + def compute_u(self, sorted_logits: torch.Tensor) -> torch.Tensor: + diffs = sorted_logits[:, :-1] - sorted_logits[:, 1:] + diffs = torch.cat( + ( + diffs, + torch.ones((diffs.shape[0], 1), dtype=diffs.dtype, device=diffs.device), + ), + dim=1, + ) + assert torch.all(diffs >= 0), f"diffs should be positive: {diffs}" + return diffs.flip([1]) + + def forward(self, logits: torch.Tensor) -> torch.Tensor: + sorted_logits, sorted_indices = torch.sort(logits, descending=True) + _, unsorted_indices = torch.sort(sorted_indices, descending=False) + + # [B, C] + u = self.compute_u(sorted_logits) + inp = sorted_logits if self.invariant else logits + m = self.base_model(inp) + + m[:, 1:] = self.m_activation(m[:, 1:].clone()) + m[:, 0] = 0 + um = torch.cumsum(u * m, 1).flip([1]) + out = torch.gather(um, 1, unsorted_indices) + + if self.residual: + out = out + logits + return out + + +class IntraOrderPreservingCalibrator(Calibrator): + """ + TODO work in progress, slow for now. + + From https://github.com/AmirooR/IntraOrderPreservingCalibration + + Order-Preserving Calibrator using a neural network mapping. + + Inputs (X) must be probabilities. + Supports both multi-class (2D) and binary (1D or 2D) probabilities. + + Reference + --------- + Amir Rahimi, Amirreza Shaban, Ching-An Cheng, Richard Hartley, Byron Boots. Intra + Order-preserving Functions for Calibration of Multi-Class Neural Networks. Advances + in Neural Information Processing Systems, 2020. + """ + def __init__( + self, + invariant: bool = True, + residual: bool = False, + hidden_sizes: tuple = (150, 150), + num_epochs: int = 40, + lr: float = 0.005, + weight_decay: float = 0.0005, + batch_size: int = 256, + ): + super().__init__() + + self.invariant = invariant + self.residual = residual + self.hidden_sizes = hidden_sizes + self.num_epochs = num_epochs + self.lr = lr + self.weight_decay = weight_decay + self.batch_size = batch_size + self.device = "cuda" if torch.cuda.is_available() else "cpu" + + def _fit_impl(self, X: np.ndarray, y: np.ndarray) -> None: + """Fits the calibrator using Adam and CrossEntropyLoss.""" + + # Convert 1D binary probabilities to 2D + if X.ndim == 1 or (X.ndim == 2 and X.shape[1] == 1): + X = np.column_stack((1.0 - X.flatten(), X.flatten())) + + self.n_classes_ = X.shape[1] + + # CrossEntropyLoss expects 1D integer class indices + if y.ndim == 2 and y.shape[1] > 1: + y = np.argmax(y, axis=1) + + # Reverse-engineer logits safely from probabilities + logits = np.log(np.clip(X, 1e-15, 1.0)) + + X_tensor = torch.tensor(logits, dtype=torch.float32) + y_tensor = torch.tensor(y, dtype=torch.long) + + dataset = TensorDataset(X_tensor, y_tensor) + bs = self.batch_size if len(X) > self.batch_size else len(X) + loader = DataLoader(dataset, batch_size=bs, shuffle=True) + + # Build base FC model and wrap it in the Order Preserving Model + base_model = _FCModel( + in_features=self.n_classes_, + hidden_sizes=self.hidden_sizes, + out_features=self.n_classes_, + ) + + self.model_ = _OrderPreservingModel( + base_model=base_model, + invariant=self.invariant, + residual=self.residual, + num_classes=self.n_classes_, + ).to(self.device) + + # Optimizer & Loss as defined in config.json and calibrate.py + optimizer = optim.Adam( + self.model_.parameters(), lr=self.lr, weight_decay=self.weight_decay + ) + criterion = nn.CrossEntropyLoss() + + self.model_.train() + for epoch in range(self.num_epochs): + for batch_x, batch_y in loader: + batch_x, batch_y = batch_x.to(self.device), batch_y.to(self.device) + + optimizer.zero_grad() + output_batch = self.model_(batch_x) + loss = criterion(output_batch, batch_y) + loss.backward() + optimizer.step() + + def _predict_proba_impl(self, X: np.ndarray) -> np.ndarray: + """Returns calibrated probabilities.""" + if self.model_ is None: + raise RuntimeError( + "Calibrator must be fitted before calling predict_proba." + ) + + # Convert 1D binary probabilities to 2D + if X.ndim == 1 or (X.ndim == 2 and X.shape[1] == 1): + X = np.column_stack((1.0 - X.flatten(), X.flatten())) + + # Reverse-engineer logits safely from probabilities + logits = np.log(np.clip(X, 1e-15, 1.0)) + X_tensor = torch.tensor(logits, dtype=torch.float32).to(self.device) + + self.model_.eval() + with torch.no_grad(): + calibrated_logits = self.model_(X_tensor) + probs = torch.softmax(calibrated_logits, dim=1).cpu().numpy() + + return probs diff --git a/probmetrics/calibrators/sklearn.py b/probmetrics/calibrators/sklearn.py new file mode 100644 index 0000000..dad7e0a --- /dev/null +++ b/probmetrics/calibrators/sklearn.py @@ -0,0 +1,115 @@ +import sklearn +import numpy as np + +from .base import Calibrator + +from sklearn.preprocessing import LabelEncoder +from sklearn.calibration import CalibratedClassifierCV +from sklearn.base import BaseEstimator, ClassifierMixin + + +class IdentityEstimator(ClassifierMixin, BaseEstimator): + def __init__(self, n_classes: int): + self.n_classes = n_classes + + def fit(self, X, y): + # somehow it can fail if we don't do the np.asarray() + self.classes_ = np.asarray(list(range(self.n_classes))) + # self.classes_ = np.unique(y) # if we do this we have a problem for missing + # classes + return self + + def predict_proba(self, X): + return X + + def predict(self, X): + # having this as a dummy here for the FrozenEstimator solution + # which hovewer doesn't work right now + raise NotImplementedError() + + +class SklearnCalibrator(Calibrator): + def __init__(self, method: str, cv: str = "prefit"): + assert cv == "prefit", ( + "Other methods than prefit not intended and not implemented for sklearn > 1.6" + ) + super().__init__() + self.method = method + self.cv = cv + + @staticmethod + def _normalize_rows(A: np.ndarray) -> np.ndarray: + """Row-normalize so rows sum to 1; safe if a row sums to 0.""" + s = A.sum(axis=1, keepdims=True) + s = np.where(s == 0.0, 1.0, s) + return A / s + + def _fit_impl(self, X: np.ndarray, y: np.ndarray) -> None: + if X.ndim == 1: + X = np.stack((1.0 - X, X), axis=1) + + y = np.asarray(y) + + self.n_classes_ = X.shape[-1] + self.classes_ = np.arange(self.n_classes_) + + # Fit label encoder up-front (used for the missing-class path) + self.label_encoder_ = LabelEncoder() + y_mapped = self.label_encoder_.fit_transform(y) + self.present_classes_ = self.label_encoder_.classes_ + self.missing_classes_ = np.setdiff1d(self.classes_, self.present_classes_) + + # ---------- Case 1: no missing classes -> original formulation ---------- + if self.missing_classes_.size == 0: + est = IdentityEstimator(self.n_classes_) + est.fit(X, y) + + try: + # sklearn >= 1.6 + from sklearn.frozen import FrozenEstimator + + self.calib_ = CalibratedClassifierCV( + FrozenEstimator(est), method=self.method + ) + except ImportError: + # sklearn < 1.6 + self.calib_ = CalibratedClassifierCV( + est, method=self.method, cv=self.cv + ) + + self.calib_.fit(X, y) + return + + # --- Case 2: missing classes -> slice + normalize X_present, use y_mapped --- + X_present = self._normalize_rows(X[:, self.present_classes_]) + + est = IdentityEstimator(len(self.present_classes_)) + est.fit(X_present, y_mapped) + + try: + from sklearn.frozen import FrozenEstimator + + self.calib_ = CalibratedClassifierCV( + FrozenEstimator(est), method=self.method + ) + except ImportError: + self.calib_ = CalibratedClassifierCV(est, method=self.method, cv=self.cv) + + self.calib_.fit(X_present, y_mapped) + + def _predict_proba_impl(self, X: np.ndarray) -> np.ndarray: + if X.ndim == 1: + X = np.stack((1.0 - X, X), axis=1) + + # If fit saw all classes, keep original behavior + if self.missing_classes_.size == 0: + return self.calib_.predict_proba(X) + + # Missing classes: normalize the same sliced X_present and zero-fill missing + # classes (Option A) + X_present = self._normalize_rows(X[:, self.present_classes_]) + p_present = self.calib_.predict_proba(X_present) + + p_full = np.zeros((X.shape[0], self.n_classes_), dtype=p_present.dtype) + p_full[:, self.present_classes_] = p_present + return p_full diff --git a/probmetrics/calibrators/splines.py b/probmetrics/calibrators/splines.py new file mode 100644 index 0000000..5902685 --- /dev/null +++ b/probmetrics/calibrators/splines.py @@ -0,0 +1,204 @@ +import numpy as np + +from .base import Calibrator +from probmetrics.utils import flatten_binary_probs, expand_binary_probs + + +class MLISplineCalibrator(Calibrator): + """ + Uses smoothing splines to determine the calibration function. + From https://github.com/numeristical/splinecalib + + Reference + --------- + Brian Lucena. Spline-based probability calibration. arXiv preprint arXiv:1809.07751, + https://arxiv.org/abs/1809.07751, 2018. + """ + def __init__(self): + super().__init__() + try: + from splinecalib import SplineCalib + except ImportError as e: + raise ImportError("The 'splinecalib' package is required.") from e + + def _fit_impl(self, X: np.ndarray, y: np.ndarray) -> None: + from splinecalib import SplineCalib + + self.cal_ = SplineCalib() + if len(np.unique(X)) < 3: # This causes a bug. + self.cal_ = None + else: + self.cal_.fit(X, y) + + def _predict_proba_impl(self, X: np.ndarray) -> np.ndarray: + if self.cal_ is not None: + probs = self.cal_.calibrate(X) + else: + probs = X + probs = expand_binary_probs(probs) + return probs + + +class _GuptaSpline: + """ + Internal helper class for calculating smoothing splines. + From https://github.com/kartikgupta-at-anu/spline-calibration + """ + + def __init__( + self, x: np.ndarray, y: np.ndarray, kx: np.ndarray, runout: str = "parabolic" + ): + self.kx = kx + self.delta = kx[1] - kx[0] + self.nknots = len(kx) + self.runout = runout + + m_from_ky = self.ky_to_M() + my_from_ky = np.concatenate([m_from_ky, np.eye(len(kx))], axis=0) + y_from_my = self.my_to_y(x) + y_from_ky = y_from_my @ my_from_ky + + # Find the least squares solution + ky = np.linalg.lstsq(y_from_ky, y, rcond=-1)[0] + + self.ky = ky + self.my = my_from_ky @ ky + + def my_to_y(self, vecx: np.ndarray) -> np.ndarray: + ndata = len(vecx) + mM = np.zeros((ndata, self.nknots)) + my = np.zeros((ndata, self.nknots)) + + for i, xx in enumerate(vecx): + j = int(np.floor((xx - self.kx[0]) / self.delta)) + j = max(0, min(j, self.nknots - 2)) + x = xx - j * self.delta + + mM[i, j] = ( + -(x**3) / (6.0 * self.delta) + x**2 / 2.0 - 2.0 * self.delta * x / 6.0 + ) + mM[i, j + 1] = x**3 / (6.0 * self.delta) - self.delta * x / 6.0 + my[i, j] = -x / self.delta + 1.0 + my[i, j + 1] = x / self.delta + + return np.concatenate([mM, my], axis=1) + + def my_to_dy(self, vecx: np.ndarray) -> np.ndarray: + ndata = len(vecx) + mM = np.zeros((ndata, self.nknots)) + my = np.zeros((ndata, self.nknots)) + + for i, xx in enumerate(vecx): + j = int(np.floor((xx - self.kx[0]) / self.delta)) + j = max(0, min(j, self.nknots - 2)) + x = xx - j * self.delta + + mM[i, j] = -(x**2) / (2.0 * self.delta) + x - 2.0 * self.delta / 6.0 + mM[i, j + 1] = x**2 / (2.0 * self.delta) - self.delta / 6.0 + my[i, j] = -1.0 / self.delta + my[i, j + 1] = 1.0 / self.delta + + return np.concatenate([mM, my], axis=1) + + def ky_to_M(self) -> np.ndarray: + A = 4.0 * np.eye(self.nknots - 2) + for i in range(1, self.nknots - 2): + A[i - 1, i] = 1.0 + A[i, i - 1] = 1.0 + + if self.runout == "parabolic": + A[0, 0] = 5.0 + A[-1, -1] = 5.0 + elif self.runout == "cubic": + A[0, 0] = 6.0 + A[0, 1] = 0.0 + A[-1, -1] = 6.0 + A[-1, -2] = 0.0 + + B = np.zeros((self.nknots - 2, self.nknots)) + for i in range(0, self.nknots - 2): + B[i, i] = 1.0 + B[i, i + 1] = -2.0 + B[i, i + 2] = 1.0 + + B = B * (6 / self.delta**2) + + Ainv = np.linalg.inv(A) + AinvB = Ainv @ B + + if self.runout == "natural": + z0 = np.zeros((1, self.nknots)) + z1 = np.zeros((1, self.nknots)) + elif self.runout == "parabolic": + z0 = AinvB[0] + z1 = AinvB[-1] + elif self.runout == "cubic": + z0 = 2.0 * AinvB[0] - AinvB[1] + z1 = 2.0 * AinvB[-1] - AinvB[-2] + + z0 = z0.reshape((1, -1)) + z1 = z1.reshape((1, -1)) + + return np.concatenate([z0, AinvB, z1], axis=0) + + def evaluate(self, x: np.ndarray) -> np.ndarray: + return self.my_to_y(x) @ self.my + + def evaluate_deriv(self, x: np.ndarray) -> np.ndarray: + return self.my_to_dy(x) @ self.my + + +class CDFSplineCalibrator(Calibrator): + """ + Approximates the empirical cumulative distribution using a differentiable function + via splines. + + Based on https://github.com/kartikgupta-at-anu/spline-calibration + + Reference + --------- + Kartik Gupta, Amir Rahimi, Thalaiyasingam Ajanthan, Thomas Mensink, Cristian + Sminchisescu, and Richard Hartley. Calibration of neural networks using splines. + International Conference on Learning Representations, 2021. + """ + + def __init__(self, spline_method: str = "natural", splines: int = 6): + super().__init__() + self.spline_method = spline_method + self.splines = splines + + def _fit_impl(self, X: np.ndarray, y: np.ndarray) -> None: + probs = flatten_binary_probs(X) + + # Sort the data according to score + order = probs.argsort() + sorted_probs = probs[order] + sorted_labels = y[order] + + nsamples = len(sorted_probs) + integrated_accuracy = np.cumsum(sorted_labels) / nsamples + integrated_scores = np.cumsum(sorted_probs) / nsamples + percentiles = np.linspace(0.0, 1.0, nsamples) + + kx = np.linspace(0.0, 1.0, self.splines) + spline = _GuptaSpline( + percentiles, + integrated_accuracy - integrated_scores, + kx, + runout=self.spline_method, + ) + + # Evaluate the spline and store arrays for interpolation + acc = spline.evaluate_deriv(percentiles) + acc += sorted_probs + + self.calib_scores_ = sorted_probs + self.calib_acc_ = acc + + def _predict_proba_impl(self, X: np.ndarray) -> np.ndarray: + probs = flatten_binary_probs(X) + # Use numpy's optimized linear interpolation instead of a custom Python + # implementation + probs = np.interp(probs, self.calib_scores_, self.calib_acc_) + probs = expand_binary_probs(probs) + return probs diff --git a/probmetrics/calibrators/temperature_scaling.py b/probmetrics/calibrators/temperature_scaling.py new file mode 100644 index 0000000..157fecc --- /dev/null +++ b/probmetrics/calibrators/temperature_scaling.py @@ -0,0 +1,673 @@ +import torch +import torchmin +import warnings +import numpy as np +from torch import nn, optim +from torch.utils.data import DataLoader, TensorDataset + +from .base import Calibrator +from probmetrics.utils import expand_binary_probs, multiclass_probs_to_logits +from probmetrics.distributions import ( + CategoricalDistribution, + CategoricalLogits, +) + +from typing import Callable + + +def bisection_search(f: Callable[[float], float], a: float, b: float, n_steps: int): + for _ in range(n_steps): + c = a + 0.5 * (b - a) + f_c = f(c) + if f_c > 0: + b = c + else: + a = c + + return 0.5 * (a + b) + + +class TemperatureScalingCalibrator(Calibrator): + """ + Fast, optimized implementation. By default, uses bisection search on the gradient of + the cross entropy for a scaling (inverse temperature) parameter. + + Reference + --------- + Chuan Guo, Geoff Pleiss, Yu Sun, and Kilian Q. Weinberger. On calibration of modern + neural networks. International Conference on Machine Learning, 2017. + """ + def __init__( + self, + opt: str = "bisection", + max_bisection_steps: int = 30, + lr: float = 0.1, + max_iter: int = 200, + use_inv_temp: bool = True, + inv_temp_init: float = 1 / 1.5, + ): + super().__init__() + self.lr = lr + self.max_bisection_steps = max_bisection_steps + self.max_iter = max_iter + self.use_inv_temp = use_inv_temp + self.inv_temp_init = inv_temp_init + self.opt = opt + + def _get_loss_grad(self, invtemp: float, logits: torch.Tensor, y: torch.Tensor): + part_1 = torch.mean( + torch.sum(logits * torch.softmax(invtemp * logits, dim=-1), dim=-1) + ) + part_2 = torch.mean(logits[torch.arange(logits.shape[0]), y]) + return (part_1 - part_2).item() + + def _fit_torch_impl( + self, y_pred: CategoricalDistribution, y_true_labels: torch.Tensor + ): + logits = y_pred.get_logits() + labels = y_true_labels + + if self.opt in ["lbfgs", "lbfgs_line_search"]: + self._fit_lbfgs(logits, labels) + elif self.opt == "bisection": + self._fit_bisection(logits, labels) + else: + raise ValueError(f'Unknown optimizer "{self.opt}"') + + def _fit_lbfgs(self, logits: torch.Tensor, labels: torch.Tensor): + # following https://github.com/gpleiss/temperature_scaling/blob/master/temperature_scaling.py + param = nn.Parameter( + torch.ones(1, device=logits.device) + * (self.inv_temp_init if self.use_inv_temp else 1 / self.inv_temp_init) + ) + criterion = nn.CrossEntropyLoss() + optimizer = torch.optim.LBFGS( + [param], + lr=self.lr, + max_iter=self.max_iter, + line_search_fn="strong_wolfe" if self.opt == "lbfgs_line_search" else None, + ) + + def eval(): + optimizer.zero_grad() + y_pred = ( + logits * param[:, None] + if self.use_inv_temp + else logits / param[:, None] + ) + loss = criterion(y_pred, labels) + loss.backward() + return loss + + optimizer.step(eval) + + self.invtemp_ = param.item() if self.use_inv_temp else 1 / param.item() + + def _fit_bisection(self, logits: torch.Tensor, labels: torch.Tensor): + objective_grad = lambda u, l=logits, tar=labels: self._get_loss_grad( + np.exp(u), l, tar + ) + + # should reach about float32 accuracy + # need log_2(32) = 5 steps to get to length 1 and then 24 more steps to get to float32 epsilon (2^{-24}) + self.invtemp_ = np.exp( + bisection_search( + objective_grad, a=-16, b=16, n_steps=self.max_bisection_steps + ) + ) + + def _predict_proba_torch_impl( + self, y_pred: CategoricalDistribution + ) -> CategoricalDistribution: + return CategoricalLogits(self.invtemp_ * y_pred.get_logits()) + + +### Other temperature scaling implementations ### + + +class GuoTemperatureScalingCalibrator(Calibrator): + """ + Adapted from https://github.com/gpleiss/temperature_scaling/blob/master/temperature_scaling.py + + Reference + --------- + Chuan Guo, Geoff Pleiss, Yu Sun, and Kilian Q. Weinberger. On calibration of modern + neural networks. International Conference on Machine Learning, 2017. + """ + def __init__(self): + super().__init__() + + def _fit_torch_impl( + self, y_pred: CategoricalDistribution, y_true_labels: torch.Tensor + ): + self.temp_ = nn.Parameter(torch.ones(1) * 1.5) + + logits = y_pred.get_logits() + labels = y_true_labels + + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + nll_criterion = nn.CrossEntropyLoss().to(device) + + # Next: optimize the temperature w.r.t. NLL + optimizer = torch.optim.LBFGS([self.temp_], lr=0.01, max_iter=50) + + def eval(): + optimizer.zero_grad() + loss = nll_criterion(self.temperature_scale(logits), labels) + loss.backward() + return loss + + optimizer.step(eval) + + return self + + def _predict_proba_torch_impl( + self, y_pred: CategoricalDistribution + ) -> CategoricalDistribution: + with torch.no_grad(): + return CategoricalLogits(y_pred.get_logits() / self.temp_) + + def temperature_scale(self, logits): + """ + Perform temperature scaling on logits + """ + # Expand temperature to match the size of logits + temp = self.temp_.unsqueeze(1).expand(logits.size(0), logits.size(1)) + return logits / temp + + +class AutoGluonTemperatureScalingCalibrator(Calibrator): + """ + Adapted from + https://github.com/autogluon/autogluon/blob/c1181326cf6b7e3b27a7420273f1a82808d939e2/core/src/autogluon/core/calibrate/temperature_scaling.py#L9 + https://github.com/autogluon/autogluon/blob/28a242ebe8d55ba770c991b9db153ab4623c9abd/tabular/src/autogluon/tabular/trainer/abstract_trainer.py#L4433-L4457 + + Reference + --------- + Chuan Guo, Geoff Pleiss, Yu Sun, and Kilian Q. Weinberger. On calibration of modern + neural networks. International Conference on Machine Learning, 2017. + """ + def __init__(self, init_val: float = 1, max_iter: int = 200, lr: float = 0.1): + super().__init__() + self.init_val = init_val + self.max_iter = max_iter + self.lr = lr + + def _fit_torch_impl( + self, y_pred: CategoricalDistribution, y_true_labels: torch.Tensor + ): + y_val_tensor = y_true_labels + temperature_param = torch.nn.Parameter(torch.ones(1).fill_(self.init_val)) + logits = y_pred.get_logits() + + is_invalid = torch.isinf(logits).any().tolist() + if is_invalid: + return + + nll_criterion = torch.nn.CrossEntropyLoss() + optimizer = torch.optim.LBFGS( + [temperature_param], lr=self.lr, max_iter=self.max_iter + ) + scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.99) + + optimizer_trajectory = [] + + if self.init_val != 1.0: + # need to check 1.0 as well since AutoGluon does it outside + optimizer_trajectory.append( + (nll_criterion(logits, y_val_tensor).item(), 1.0) + ) + + def temperature_scale_step(): + optimizer.zero_grad() + temp = temperature_param.unsqueeze(1).expand(logits.size(0), logits.size(1)) + new_logits = logits / temp + loss = nll_criterion(new_logits, y_val_tensor) + loss.backward() + scheduler.step() + optimizer_trajectory.append((loss.item(), temperature_param.item())) + return loss + + optimizer.step(temperature_scale_step) + + try: + best_loss_index = np.nanargmin(np.array(optimizer_trajectory)[:, 0]) + except ValueError: + self.temperature = 1.0 + return + temperature_scale = float(np.array(optimizer_trajectory)[best_loss_index, 1]) + + if np.isnan(temperature_scale) or temperature_scale <= 0.0: + self.temperature = 1.0 + return + + self.temp_ = temperature_scale + + def _predict_proba_torch_impl( + self, y_pred: CategoricalDistribution + ) -> CategoricalDistribution: + with torch.no_grad(): + return CategoricalLogits(y_pred.get_logits() / self.temp_) + + +class AutoGluonInverseTemperatureScalingCalibrator(Calibrator): + # adapted from + # https://github.com/autogluon/autogluon/blob/c1181326cf6b7e3b27a7420273f1a82808d939e2/core/src/autogluon/core/calibrate/temperature_scaling.py#L9 + # but optimizing the inverse temperature instead + def __init__(self, init_val: float = 1, max_iter: int = 200, lr: float = 0.1): + super().__init__() + self.init_val = init_val + self.max_iter = max_iter + self.lr = lr + + def _fit_torch_impl( + self, y_pred: CategoricalDistribution, y_true_labels: torch.Tensor + ): + y_val_tensor = y_true_labels + inv_temperature_param = torch.nn.Parameter(torch.ones(1).fill_(self.init_val)) + logits = y_pred.get_logits() + + is_invalid = torch.isinf(logits).any().tolist() + if is_invalid: + return + + nll_criterion = torch.nn.CrossEntropyLoss() + optimizer = torch.optim.LBFGS( + [inv_temperature_param], lr=self.lr, max_iter=self.max_iter + ) + scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.99) + + optimizer_trajectory = [] + + def temperature_scale_step(): + optimizer.zero_grad() + inv_temp = inv_temperature_param.unsqueeze(1).expand( + logits.size(0), logits.size(1) + ) + new_logits = logits * inv_temp + loss = nll_criterion(new_logits, y_val_tensor) + loss.backward() + scheduler.step() + optimizer_trajectory.append((loss.item(), inv_temperature_param.item())) + return loss + + optimizer.step(temperature_scale_step) + + try: + best_loss_index = np.nanargmin(np.array(optimizer_trajectory)[:, 0]) + except ValueError: + return + inv_temperature_scale = float( + np.array(optimizer_trajectory)[best_loss_index, 1] + ) + + if np.isnan(inv_temperature_scale): + return + + self.inv_temp_ = inv_temperature_scale + + def _predict_proba_torch_impl( + self, y_pred: CategoricalDistribution + ) -> CategoricalDistribution: + with torch.no_grad(): + return CategoricalLogits(y_pred.get_logits() * self.inv_temp_) + + +class TorchUncertaintyTemperatureScalingCalibrator(TemperatureScalingCalibrator): + # adapted from + # https://github.com/ENSTA-U2IS-AI/torch-uncertainty/blob/3a021d2e34e183b8aad3a0345e6d750c08c72af3/torch_uncertainty/post_processing/calibration/scaler.py#L1 + # https://github.com/ENSTA-U2IS-AI/torch-uncertainty/blob/3a021d2e34e183b8aad3a0345e6d750c08c72af3/torch_uncertainty/post_processing/calibration/temperature_scaler.py#L9 + def __init__(self, init_val: float = 1, lr: float = 0.1, max_iter: int = 100): + super().__init__( + opt="lbfgs", + lr=lr, + max_iter=max_iter, + use_inv_temp=False, + inv_temp_init=1.0 / init_val, + ) + # need to save these values here to comply with sklearn cloneability conventions + self.init_val = init_val + self.lr = lr + self.max_iter = max_iter + + +class NetcalTemperatureScalingCalibrator(Calibrator): + # this one does nothing due to https://github.com/EFS-OpenSource/calibration-framework/issues/61 + def __init__(self): + super().__init__() + try: + from netcal.scaling import TemperatureScaling + except ImportError as e: + raise ImportError("The 'netcal' package is required.") from e + + def _fit_impl(self, X: np.ndarray, y: np.ndarray) -> None: + from netcal.scaling import TemperatureScaling + + self.cal_ = TemperatureScaling() + + if X.shape[1] == 2: + # binary, convert + X = X[:, 1] + self.cal_.fit(X, y, random_state=0) + + def _predict_proba_impl(self, X: np.ndarray) -> np.ndarray: + if X.shape[1] == 2: + # binary, convert + X = X[:, 1] + probs = self.cal_.transform(X) + probs = expand_binary_probs(probs) + return probs + + +class TorchcalTemperatureScalingCalibrator(Calibrator): + # adapted from + # https://github.com/rishabh-ranjan/torchcal/blob/3fb65f6423d33d680cd68c7f40a0259d41e8fb0b/torchcal.py#L8 + def __init__(self): + super().__init__() + + def _fit_torch_impl( + self, y_pred: CategoricalDistribution, y_true_labels: torch.Tensor + ): + y_pred_logits = y_pred.get_logits() + + temp = torch.ones(1, device=y_pred_logits.device) + + def loss(t): + return torch.nn.functional.cross_entropy(y_pred_logits / t, y_true_labels) + + res = torchmin.minimize(loss, temp, method="newton-exact") + if not res.success: + warnings.warn( + f"{self.__class__}: {res.message} Not updating calibrator params." + ) + else: + temp = res.x + + self.temp_ = temp.item() + + def _predict_proba_torch_impl( + self, y_pred: CategoricalDistribution + ) -> CategoricalDistribution: + return CategoricalLogits(y_pred.get_logits() / self.temp_) + + +class ETSCalibrator(Calibrator): + """ + From https://github.com/zhang64-llnl/Mix-n-Match-Calibration + Optimizes a temperature parameter and then finds the optimal ensemble weights + between the scaled predictions, original predictions, and a uniform distribution. + + Inputs (X) must be probabilities. + Supports both multi-class (2D) and binary (1D or 2D) probabilities. + + Reference + --------- + Jize Zhang, Bhavya Kailkhura, and T. Yong-Jin Han. Mix-n-match: Ensemble and + compositional methods for uncertainty calibration in deep learning. International + Conference on Machine Learning, 2020. + """ + + def __init__(self, loss: str = "mse"): + super().__init__() + try: + from scipy import optimize + except ImportError as e: + raise ImportError("The 'scipy' package is required.") from e + + if loss not in ["mse", "ce"]: + raise ValueError( + "Loss must be either 'mse' (Mean Squared Error) or 'ce' (Cross-Entropy)." + ) + self.loss = loss + + def _fit_impl(self, X: np.ndarray, y: np.ndarray) -> None: + """ + Fits the calibrator. + X: array of probabilities (n_samples, n_classes) or (n_samples,) for binary. + y: labels (n_samples,) or one-hot encoded (n_samples, n_classes). + """ + from scipy import optimize + # Convert 1D binary probabilities to 2D (Class 0: 1 - p, Class 1: p) + if X.ndim == 1 or (X.ndim == 2 and X.shape[1] == 1): + X = np.column_stack((1.0 - X.flatten(), X.flatten())) + self.n_classes_ = X.shape[1] + logits = multiclass_probs_to_logits(X) + + # Convert labels to one-hot if they are purely 1D indices + if y.ndim == 1 or (y.ndim == 2 and y.shape[1] == 1): + y_one_hot = np.eye(self.n_classes_)[y.flatten()] + else: + y_one_hot = y + + # 1. Optimize Temperature (t) using the derived logits + bnds_t = ((0.05, 5.0),) + t_target_func = self._ll_t if self.loss == "ce" else self._mse_t + res_t = optimize.minimize( + t_target_func, + x0=[1.0], + args=(logits, y_one_hot), + method="L-BFGS-B", + bounds=bnds_t, + tol=1e-12, + ) + self.t_ = res_t.x[0] + + # 2. Optimize Ensemble Weights (w0, w1, w2) + p1 = X # Original probabilities + p0 = self._stable_softmax(logits / self.t_) # Temperature-scaled probabilities + p2 = np.ones_like(p0) / self.n_classes_ # Uniform distribution + + bnds_w = ((0.0, 1.0), (0.0, 1.0), (0.0, 1.0)) + constraints = {"type": "eq", "fun": lambda w: np.sum(w) - 1.0} + + w_target_func = self._ll_w if self.loss == "ce" else self._mse_w + res_w = optimize.minimize( + w_target_func, + x0=[1.0, 0.0, 0.0], + args=(p0, p1, p2, y_one_hot), + method="SLSQP", + constraints=constraints, + bounds=bnds_w, + tol=1e-12, + ) + self.w_ = res_w.x + + def _predict_proba_impl(self, X: np.ndarray) -> np.ndarray: + """ + Predicts calibrated probabilities. + X: array of probabilities (n_samples, n_classes) or (n_samples,) for binary. + """ + # Convert 1D binary probabilities to 2D (Class 0: 1 - p, Class 1: p) + if X.ndim == 1 or (X.ndim == 2 and X.shape[1] == 1): + X = np.column_stack((1.0 - X.flatten(), X.flatten())) + self.n_classes_ = X.shape[1] + logits = multiclass_probs_to_logits(X) + + p1 = X + p0 = self._stable_softmax(logits / self.t_) + p2 = np.ones_like(p0) / self.n_classes_ + + p_calibrated = self.w_[0] * p0 + self.w_[1] * p1 + self.w_[2] * p2 + + # Ensure mathematical strictness of probabilities + return p_calibrated / np.sum(p_calibrated, axis=1, keepdims=True) + + @staticmethod + def _stable_softmax(x: np.ndarray) -> np.ndarray: + """Helper to prevent np.exp() overflow errors.""" + e_x = np.exp(x - np.max(x, axis=1, keepdims=True)) + return e_x / np.sum(e_x, axis=1, keepdims=True) + + @staticmethod + def _mse_t(t: float, logit: np.ndarray, label: np.ndarray) -> float: + p = ETSCalibrator._stable_softmax(logit / t) + return np.mean((p - label) ** 2) + + @staticmethod + def _ll_t(t: float, logit: np.ndarray, label: np.ndarray) -> float: + p = ETSCalibrator._stable_softmax(logit / t) + p = np.clip(p, 1e-20, 1 - 1e-20) + return -np.sum(label * np.log(p)) / p.shape[0] + + @staticmethod + def _mse_w( + w: np.ndarray, p0: np.ndarray, p1: np.ndarray, p2: np.ndarray, label: np.ndarray + ) -> float: + p = w[0] * p0 + w[1] * p1 + w[2] * p2 + p = p / np.sum(p, axis=1, keepdims=True) + return np.mean((p - label) ** 2) + + @staticmethod + def _ll_w( + w: np.ndarray, p0: np.ndarray, p1: np.ndarray, p2: np.ndarray, label: np.ndarray + ) -> float: + p = w[0] * p0 + w[1] * p1 + w[2] * p2 + p = np.clip(p, 1e-20, 1 - 1e-20) + return -np.sum(label * np.log(p)) / p.shape[0] + + +def _softplus_fn( + x: torch.Tensor, a: torch.Tensor, b: torch.Tensor, c: torch.Tensor, u: torch.Tensor +) -> torch.Tensor: + """Core mathematical mapping for the Softplus recalibration.""" + return -(c / a) * torch.log(u + torch.exp(-a * (x - b))) + + +class _PyTorchSoftplus(nn.Module): + """ + Internal PyTorch module adapting the original code for standard 2D + (n_samples, n_classes) input. + """ + + def __init__( + self, method: str, T: float = 1.0, use_shift: bool = False, n_classes: int = 2 + ): + super().__init__() + + # Initialize defaults + init_a = 0.135 if method == "SPU" else 1.0 + init_c = T if method == "SPTS" else 1.0 + + self.a = nn.Parameter(torch.ones(1) * init_a) + self.b = nn.Parameter(torch.ones(1) * 1.0) + self.c = nn.Parameter(torch.ones(1) * init_c) + self.u = nn.Parameter(torch.ones(1) * 0.1) + + # Apply specific freezing rules based on the method + if method in ["SP1", "SPTS"]: + self.c.requires_grad = False + + self.use_shift = use_shift + if self.use_shift: + # Learned weight vector for cross-class shifting + self.w = nn.Parameter(torch.zeros(n_classes)) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if self.use_shift: + # Dot product of x and w gives a shift per sample: (n_samples,) + # We unsqueeze to (n_samples, 1) so it broadcasts across all classes + shift = (x @ self.w).unsqueeze(1) + x = x + shift + + # The original code squares a, c, and u to enforce strict positivity + return _softplus_fn(x, self.a**2, self.b, self.c**2, self.u**2) + + +class SoftPlusCalibrator(Calibrator): + """ + TODO work in progress, very slow for now. + + Softplus Recalibrator using PyTorch for optimization. + Supports SPU, SP1, and SPTS, with or without the cross-class 'shift' mechanism. + + Inputs (X) must be probabilities. + Supports both multi-class (2D) and binary (1D or 2D) probabilities. + + Reference + --------- + Christopher Qian, Feng Liang, and Jason Adams. Extending Temperature Scaling with + Homogenizing Maps. Journal of Machine Learning Research 26(161):1-46, 2025 + """ + + def __init__( + self, + method: str = "SPU", + T: float = 1.0, + use_shift: bool = False, + n_epochs: int = 100, + lr: float = 0.01, + batch_size: int = 256, + ): + super().__init__() + if method not in ["SPU", "SP1", "SPTS"]: + raise ValueError("Method must be one of 'SPU', 'SP1', or 'SPTS'.") + + self.method = method + self.T = T + self.use_shift = use_shift + self.n_epochs = n_epochs + self.lr = lr + self.batch_size = batch_size + self.device = "cuda" if torch.cuda.is_available() else "cpu" + + def _fit_impl(self, X: np.ndarray, y: np.ndarray) -> None: + """Fits the calibrator using Adam and CrossEntropyLoss.""" + # Convert 1D binary probabilities to 2D (Class 0: 1 - p, Class 1: p) + if X.ndim == 1 or (X.ndim == 2 and X.shape[1] == 1): + X = np.column_stack((1.0 - X.flatten(), X.flatten())) + self.n_classes_ = X.shape[1] + logits = multiclass_probs_to_logits(X) + + # CrossEntropyLoss expects 1D integer class indices, not one-hot + if y.ndim == 2 and y.shape[1] > 1: + y = np.argmax(y, axis=1) + + X_tensor = torch.tensor(logits, dtype=torch.float32) + y_tensor = torch.tensor(y, dtype=torch.long) + + dataset = TensorDataset(X_tensor, y_tensor) + # Fall back to len(X) batch size if the dataset is tiny, matching original script logic + bs = self.batch_size if len(X) > self.batch_size else len(X) + loader = DataLoader(dataset, batch_size=bs, shuffle=True) + + self.model_ = _PyTorchSoftplus( + method=self.method, + T=self.T, + use_shift=self.use_shift, + n_classes=self.n_classes_, + ).to(self.device) + + optimizer = optim.Adam(self.model_.parameters(), lr=self.lr) + criterion = nn.CrossEntropyLoss() + + self.model_.train() + for _ in range(self.n_epochs): + for batch_x, batch_y in loader: + batch_x, batch_y = batch_x.to(self.device), batch_y.to(self.device) + + optimizer.zero_grad() + logits_calibrated = self.model_(batch_x) + loss = criterion(logits_calibrated, batch_y) + loss.backward() + optimizer.step() + + def _predict_proba_impl(self, X: np.ndarray) -> np.ndarray: + """Returns calibrated probabilities.""" + if self.model_ is None: + raise RuntimeError( + "Calibrator must be fitted before calling predict_proba." + ) + + # Convert 1D binary probabilities to 2D + if X.ndim == 1 or (X.ndim == 2 and X.shape[1] == 1): + X = np.column_stack((1.0 - X.flatten(), X.flatten())) + logits = multiclass_probs_to_logits(X) + X_tensor = torch.tensor(logits, dtype=torch.float32).to(self.device) + + self.model_.eval() + with torch.no_grad(): + calibrated_logits = self.model_(X_tensor) + # Apply softmax to get final probabilities + probs = torch.softmax(calibrated_logits, dim=1).cpu().numpy() + + return probs diff --git a/probmetrics/calibrators/trees.py b/probmetrics/calibrators/trees.py new file mode 100644 index 0000000..c9b6d03 --- /dev/null +++ b/probmetrics/calibrators/trees.py @@ -0,0 +1,377 @@ +import warnings +import numpy as np +from .base import Calibrator +from probmetrics.utils import binary_probs_to_logits, multiclass_probs_to_logits + + +class LightGBMCalibrator(Calibrator): + def __init__(self, use_cv: bool = True): + super().__init__() + try: + import lightgbm as lgb + from sklearn.model_selection import StratifiedKFold + except ImportError as e: + raise ImportError( + "The 'lightgbm' and 'scikit-learn' packages are required." + ) from e + self.use_cv = use_cv + + def _fit_impl(self, X: np.ndarray, y: np.ndarray) -> None: + import lightgbm as lgb + from sklearn.model_selection import StratifiedKFold + + self.n_classes_ = X.shape[1] if len(X.shape) > 1 and X.shape[1] > 1 else 2 + self.cal_models_ = [] + + if self.n_classes_ == 2: + log_p, log_1_minus_p = binary_probs_to_logits(X) + logits = log_p - log_1_minus_p + objective = "binary" + metric = "binary_logloss" + else: + logits = multiclass_probs_to_logits(X) + objective = "multiclass" + metric = "multi_logloss" + + params = { + "objective": objective, + "metric": metric, + "max_depth": 3, + "num_leaves": 8, + "verbose": -1, + } + if self.n_classes_ > 2: + params["num_class"] = self.n_classes_ + + if self.use_cv: + skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42) + for train_idx, val_idx in skf.split(logits, y): + train_data = lgb.Dataset(logits[train_idx], label=y[train_idx]) + val_data = lgb.Dataset( + logits[val_idx], label=y[val_idx], reference=train_data + ) + + model = lgb.train( + params, + train_data, + valid_sets=[val_data], + callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=False)], + ) + self.cal_models_.append(model) + else: + train_data = lgb.Dataset(logits, label=y) + + model = lgb.train(params, train_data) + self.cal_models_.append(model) + + def _predict_proba_impl(self, X: np.ndarray) -> np.ndarray: + if self.n_classes_ == 2: + log_p, log_1_minus_p = binary_probs_to_logits(X) + logits = log_p - log_1_minus_p + else: + logits = multiclass_probs_to_logits(X) + + all_preds = [] + for model in self.cal_models_: + pred = model.predict(logits) + if self.n_classes_ == 2: + pred = np.vstack([1 - pred, pred]).T + all_preds.append(pred) + + return np.mean(all_preds, axis=0) + + +class XGBoostCalibrator(Calibrator): + def __init__(self, use_cv: bool = True): + super().__init__() + try: + import xgboost as xgb + from sklearn.model_selection import StratifiedKFold + except ImportError as e: + raise ImportError( + "The 'xgboost' and 'scikit-learn' packages are required." + ) from e + self.use_cv = use_cv + + def _fit_impl(self, X: np.ndarray, y: np.ndarray) -> None: + import xgboost as xgb + from sklearn.model_selection import StratifiedKFold + + self.n_classes_ = X.shape[1] if len(X.shape) > 1 and X.shape[1] > 1 else 2 + self.n_observed_classes_ = len(np.unique(y)) + + if self.n_observed_classes_ != self.n_classes_: + warnings.warn( + "# classes X.shape[1] is different from # unique labels len(np.unique(y))" + ) + self.observed_classes_ = np.unique(y) + class_to_index = { + cls: idx for idx, cls in enumerate(self.observed_classes_) + } + y_remapped = np.array([class_to_index[c] for c in y]) + else: + y_remapped = y + + self.cal_models_ = [] + + if self.n_classes_ == 2: + log_p, log_1_minus_p = binary_probs_to_logits(X) + logits = log_p - log_1_minus_p + objective = "binary:logistic" + eval_metric = "logloss" + else: + logits = multiclass_probs_to_logits(X) + objective = "multi:softprob" + eval_metric = "mlogloss" + + params = { + "objective": objective, + "eval_metric": eval_metric, + "max_depth": 3, + } + if self.n_classes_ > 2: + params["num_class"] = self.n_observed_classes_ + + if self.use_cv: + skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42) + for train_idx, val_idx in skf.split(logits, y_remapped): + train_kwargs = {"data": logits[train_idx], "label": y_remapped[train_idx]} + val_kwargs = {"data": logits[val_idx], "label": y_remapped[val_idx]} + + dtrain = xgb.DMatrix(**train_kwargs) + dval = xgb.DMatrix(**val_kwargs) + + model = xgb.train( + params, + dtrain, + evals=[(dval, "eval")], + early_stopping_rounds=50, + verbose_eval=False, + ) + self.cal_models_.append(model) + else: + dtrain_kwargs = {"data": logits, "label": y_remapped} + dtrain = xgb.DMatrix(**dtrain_kwargs) + model = xgb.train(params, dtrain) + self.cal_models_.append(model) + + def _predict_proba_impl(self, X: np.ndarray) -> np.ndarray: + import xgboost as xgb + + if self.n_classes_ == 2: + log_p, log_1_minus_p = binary_probs_to_logits(X) + logits = log_p - log_1_minus_p + else: + logits = multiclass_probs_to_logits(X) + + dtest_kwargs = {"data": logits} + dtest = xgb.DMatrix(**dtest_kwargs) + + all_preds = [] + for model in self.cal_models_: + pred = model.predict(dtest) + if self.n_classes_ == 2: + pred = np.vstack([1 - pred, pred]).T + all_preds.append(pred) + preds = np.mean(all_preds, axis=0) + + if self.n_observed_classes_ != self.n_classes_: + full_preds = np.zeros((preds.shape[0], self.n_classes_)) + full_preds[:, self.observed_classes_] = preds + preds = full_preds + + return preds + + +class CatBoostCalibrator(Calibrator): + def __init__(self, use_cv: bool = True): + super().__init__() + try: + from catboost import CatBoostClassifier, Pool + from sklearn.model_selection import StratifiedKFold + except ImportError as e: + raise ImportError( + "The 'catboost' and 'scikit-learn' packages are required." + ) from e + self.use_cv = use_cv + + def _fit_impl(self, X: np.ndarray, y: np.ndarray) -> None: + from catboost import CatBoostClassifier, Pool + from sklearn.model_selection import StratifiedKFold + + self.n_classes_ = X.shape[1] if len(X.shape) > 1 and X.shape[1] > 1 else 2 + self.n_observed_classes_ = len(np.unique(y)) + + if self.n_observed_classes_ != self.n_classes_: + warnings.warn( + "# classes X.shape[1] is different from # unique labels len(np.unique(y))" + ) + self.observed_classes_ = np.unique(y) + class_to_index = { + cls: idx for idx, cls in enumerate(self.observed_classes_) + } + y_remapped = np.array([class_to_index[c] for c in y]) + else: + y_remapped = y + + self.cal_models_ = [] + + if self.n_classes_ == 2: + log_p, log_1_minus_p = binary_probs_to_logits(X) + logits = log_p - log_1_minus_p + else: + logits = multiclass_probs_to_logits(X) + + if self.use_cv: + skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42) + for train_idx, val_idx in skf.split(logits, y_remapped): + train_kwargs = {"data": logits[train_idx], "label": y_remapped[train_idx]} + val_kwargs = {"data": logits[val_idx], "label": y_remapped[val_idx]} + + train_pool = Pool(**train_kwargs) + val_pool = Pool(**val_kwargs) + + model = CatBoostClassifier( + depth=3, + loss_function="Logloss" if self.n_classes_ == 2 else "MultiClass", + iterations=1000, + early_stopping_rounds=50, + verbose=False, + ) + model.fit(train_pool, eval_set=val_pool, verbose=False) + self.cal_models_.append(model) + else: + pool_kwargs = {"data": logits, "label": y_remapped} + pool = Pool(**pool_kwargs) + + model = CatBoostClassifier(depth=3, verbose=False) + model.fit(pool) + self.cal_models_.append(model) + + def _predict_proba_impl(self, X: np.ndarray) -> np.ndarray: + from catboost import Pool + + if self.n_classes_ == 2: + log_p, log_1_minus_p = binary_probs_to_logits(X) + logits = log_p - log_1_minus_p + else: + logits = multiclass_probs_to_logits(X) + + pool_kwargs = {"data": logits} + pool = Pool(**pool_kwargs) + + preds = [model.predict_proba(pool) for model in self.cal_models_] + preds = np.mean(preds, axis=0) + + if self.n_observed_classes_ != self.n_classes_: + full_preds = np.zeros((preds.shape[0], self.n_classes_)) + full_preds[:, self.observed_classes_] = preds + preds = full_preds + + return preds + + +class BinaryCatBoostCalibrator(Calibrator): + def __init__(self, tiny=False, monotone=False): + super().__init__() + try: + from catboost import CatBoostClassifier, Pool + except ImportError as e: + raise ImportError("The 'catboost' packages is required.") from e + self.tiny = tiny + self.monotone = monotone + self.model_params = { + "verbose": 0, + "allow_writing_files": False, + "iterations": 100, + } + + if self.tiny: + self.model_params["depth"] = 3 + + if self.monotone: + self.model_params["monotone_constraints"] = "(1)" + + def _fit_impl(self, X: np.ndarray, y: np.ndarray) -> None: + from catboost import CatBoostClassifier, Pool + + self.n_classes_ = X.shape[1] if len(X.shape) > 1 and X.shape[1] > 1 else 2 + + if self.n_classes_ == 2: + log_p, log_1_minus_p = binary_probs_to_logits(X) + logits = log_p - log_1_minus_p + else: + raise ValueError("This calibrator is made for binary predictions only.") + + pool_kwargs = {"data": logits, "label": y} + pool = Pool(**pool_kwargs) + model = CatBoostClassifier(**self.model_params) + model.fit(pool) + self.cal_ = model + + def _predict_proba_impl(self, X: np.ndarray) -> np.ndarray: + from catboost import Pool + + if self.n_classes_ == 2: + log_p, log_1_minus_p = binary_probs_to_logits(X) + logits = log_p - log_1_minus_p + else: + raise ValueError("This calibrator is made for binary predictions only.") + + pool_kwargs = {"data": logits} + pool = Pool(**pool_kwargs) + + return self.cal_.predict_proba(pool) + + +class InitLogitCatBoostCalibrator(Calibrator): + def __init__(self, init_logits=True): + super().__init__() + try: + from catboost import CatBoostClassifier, Pool + except ImportError as e: + raise ImportError("The 'catboost' package is required.") from e + self.init_logits = init_logits + self.model_params = { + "iterations": 50, + "learning_rate": 0.1, + "depth": 4, + "verbose": 0, + "allow_writing_files": False, + } + + def _fit_impl(self, X: np.ndarray, y: np.ndarray) -> None: + from catboost import CatBoostClassifier, Pool + + self.n_classes_ = X.shape[1] if len(X.shape) > 1 and X.shape[1] > 1 else 2 + + if self.n_classes_ == 2: + log_p, log_1_minus_p = binary_probs_to_logits(X) + logits = log_p - log_1_minus_p + else: + logits = multiclass_probs_to_logits(X) + + pool_kwargs = {"data": logits, "label": y} + if self.init_logits: + pool_kwargs["baseline"] = logits + pool = Pool(**pool_kwargs) + + self.cal_ = CatBoostClassifier(**self.model_params) + self.cal_.fit(pool) + + def _predict_proba_impl(self, X: np.ndarray) -> np.ndarray: + from catboost import Pool + + if self.n_classes_ == 2: + log_p, log_1_minus_p = binary_probs_to_logits(X) + logits = log_p - log_1_minus_p + else: + logits = multiclass_probs_to_logits(X) + + pool_kwargs = {"data": logits} + if self.init_logits: + pool_kwargs["baseline"] = logits + + pool = Pool(**pool_kwargs) + + return self.cal_.predict_proba(pool) diff --git a/probmetrics/metrics.py b/probmetrics/metrics.py index 8856f63..a3e655e 100644 --- a/probmetrics/metrics.py +++ b/probmetrics/metrics.py @@ -113,7 +113,7 @@ def _get_candidate_metrics() -> List['Metrics']: TopClassLoss(ProperLpLoss(p=2)), TopClassLoss(ProperLpLoss(p=float('inf'))), BrierLoss(binary_as_multiclass=False), - SmoothCalibrationError(), + SmoothCalibrationError(), KuiperCalibrationMetric(), MSE(), RMSE(), NRMSE(), MAE(), NMAE()] cand_list.extend([MeanProbNormalizedMetric(metric) for metric in [ClippedLogLoss(), LogLoss(), BrierLoss(), ClassError(), Accuracy()]]) @@ -900,6 +900,59 @@ def _compute_mean(self, y_true: CategoricalDistribution, y_pred: CategoricalDist return torch.as_tensor(rp.smECE(f=conf, y=acc), device=labels_torch.device, dtype=torch.float32) +class KuiperCalibrationMetric(ClassificationMetric): + def __init__(self): + super().__init__(name='kuiper', is_lower_better=True) + + def _compute_mean(self, y_true: CategoricalDistribution, y_pred: CategoricalDistribution, **kwargs) -> Optional[torch.Tensor]: + labels = y_true.get_modes().flatten().float() + probs = y_pred.get_probs()[..., 1].flatten().float() + + N = labels.numel() + if N == 0: + return torch.tensor(0.0, device=labels.device) + + probs_sorted, sort_idx = torch.sort(probs) + labels_sorted = labels[sort_idx] + + cum_probs = torch.cumsum(probs_sorted, dim=0) / N + cum_labels = torch.cumsum(labels_sorted, dim=0) / N + + diffs = cum_labels - cum_probs + + max_dev = torch.max(diffs.max(), torch.tensor(0.0, device=diffs.device)) + min_dev = torch.min(diffs.min(), torch.tensor(0.0, device=diffs.device)) + + kuiper_score = max_dev - min_dev + + return kuiper_score + + +class KolmogorovSmirnovCalibrationMetric(ClassificationMetric): + def __init__(self): + super().__init__(name='kolmogorov-smirnov', is_lower_better=True) + + def _compute_mean(self, y_true: CategoricalDistribution, y_pred: CategoricalDistribution, **kwargs) -> Optional[torch.Tensor]: + labels = y_true.get_modes().flatten().float() + probs = y_pred.get_probs()[..., 1].flatten().float() + + N = labels.numel() + if N == 0: + return torch.tensor(0.0, device=labels.device) + + probs_sorted, sort_idx = torch.sort(probs) + labels_sorted = labels[sort_idx] + + cum_probs = torch.cumsum(probs_sorted, dim=0) / N + cum_labels = torch.cumsum(labels_sorted, dim=0) / N + + diffs = cum_labels - cum_probs + abs_diffs = torch.abs(diffs) + kolmogorovsmirnov = abs_diffs.max() + + return kolmogorovsmirnov + + class MetricsWithCalibration(Metrics): def __init__(self, metrics: Metrics, calibrator: sklearn.base.ClassifierMixin, val_splitter: Splitter, cal_name: Optional[str] = None, random_state: Optional[int] = None): diff --git a/probmetrics/utils.py b/probmetrics/utils.py index f7fb3b6..585f73d 100644 --- a/probmetrics/utils.py +++ b/probmetrics/utils.py @@ -12,41 +12,48 @@ def join_dicts(*dicts, allow_overlap: bool = True): def validate_probabilities(X: np.ndarray) -> None: """ - Checks if X is a valid 2D array of probability distributions. + Checks if X is a valid array of probabilities, either 1D (binary) or 2D (binary or multiclass). Raises ValueError if the validation fails. """ - if X.ndim != 2: - raise ValueError(f"Input array must be 2D, but got {X.ndim} dimensions.") + if X.ndim > 2: + raise ValueError(f"Input array must be 1D or 2D, but got {X.ndim} dimensions.") + if not np.all(np.isfinite(X)): + raise ValueError("Input array contains NaN or infinite values.") if np.any(X < 0) or np.any(X > 1): raise ValueError("All probability values must be within the [0, 1] range.") - if not np.all(np.isclose(X.sum(axis=1), 1.0)): + if X.ndim == 2 and not np.all(np.isclose(X.sum(axis=1), 1.0, atol=1e-6)): raise ValueError("Each row of the input array must sum to 1 to be a valid probability distribution.") -def process_binary_probs(probs: np.ndarray) -> np.ndarray: +def flatten_binary_probs(probs: np.ndarray) -> np.ndarray: probs = probs.astype(np.float32) - - if probs.ndim == 1: # 1D array of probabilities - return probs - elif probs.ndim == 2 and probs.shape[1] == 2: # 2D array of probability pairs [(p0, p1), ...] - return probs[:, 1] + if probs.ndim == 1: + p_pos = probs + elif probs.ndim == 2 and probs.shape[1] == 2: + p_pos = probs[:, 1] else: raise ValueError(f"Invalid input shape: {probs.shape}." f"1D array or 2D array with shape (n, 2)") + return p_pos + +def expand_binary_probs(probs: np.ndarray) -> np.ndarray: + if probs.ndim == 1: + return np.stack([1.0 - probs, probs], axis=1) + elif probs.ndim == 2 and probs.shape[1] == 1: + return np.concatenate([1.0 - probs, probs], axis=1) + else: + return probs def binary_probs_to_logits(probs: np.ndarray) -> np.ndarray: """Converts binary probabilities to logits using the logit function (inverse sigmoid). Clips logits to the log of the tiniest normal float32 to avoid infinite logit values. """ - probs = process_binary_probs(probs) - + probs = flatten_binary_probs(probs) with np.errstate(divide="ignore"): log_p = np.log(probs) log_1_minus_p = np.log1p(-probs) - thresh = np.log(np.finfo(np.float32).tiny) log_p = np.clip(log_p, a_min=thresh, a_max=-thresh).reshape(-1, 1) log_1_minus_p = np.clip(log_1_minus_p, a_min=thresh, a_max=-thresh).reshape(-1, 1) - return log_p, log_1_minus_p def multiclass_probs_to_logits(probs: np.ndarray) -> np.ndarray: diff --git a/pyproject.toml b/pyproject.toml index facf76c..d3949f8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,21 +5,24 @@ build-backend = "hatchling.build" [project] name = "probmetrics" dynamic = ["version"] -description = 'Probabilistic metrics and calibration methods' +description = "Probabilistic metrics and calibration methods" readme = "README.md" -requires-python = ">=3.9" +requires-python = ">=3.10" license = "Apache-2.0" -keywords = ['Metrics', 'Classification', 'Calibration', 'Refinement'] +keywords = ["Metrics", "Classification", "Calibration", "Refinement"] authors = [ - { name = "David Holzmüller" }, #, email = "a@b.org" }, + {name = "David Holzmüller", email = "a@b.org"}, + {name = "Eugène Berta", email = "eugene.berta@gmail.com"}, + {name = "Sacha Braun"} ] classifiers = [ "Development Status :: 4 - Beta", "Programming Language :: Python", - "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Programming Language :: Python :: 3.14", "Programming Language :: Python :: Implementation :: CPython", "Programming Language :: Python :: Implementation :: PyPy", "License :: OSI Approved :: Apache Software License", @@ -28,25 +31,28 @@ dependencies = [ "torch>=2.0", "numpy>=1.25", "scikit-learn>=1.3", - # older versions of torchmetrics (<1.2.1) have a bug that makes certain metrics slow: - # https://github.com/Lightning-AI/torchmetrics/pull/2184 - "torchmetrics>=1.2.1", - # added constraint due to some pytabkit failure case with github actions, needed for python >= 3.10 anyway - "numba>=0.59.0", + "torchmetrics>=1.2.1", # older versions of torchmetrics (<1.2.1) have a bug that + # makes certain metrics slow: https://github.com/Lightning-AI/torchmetrics/pull/2184 + "pytorch-minimize", ] [project.optional-dependencies] extra = [ + "numba>=0.59.0", # For SAGA based versions of logistic calibrators. Added constraint + # due to some pytabkit failure case with github actions, needed for python >= 3.10 + # anyway + "netcal", # For BBQ, ENIR, Netcal TS + "uncertainty-calibration", # For BinaryPlattBinnerCalibrator + "betacal", # For BetacalCalibrator + "splinecalib", # For MLISplineCalibrator "venn-abers", # Venn-Abers calibration "cir-model", # centered isotonic regression - "netcal", # for temperature scaling comparison - "relplot", # for smooth ECE - "pytorch-minimize", # for torchcal TS implementation - # relplot fails for sklearn 1.7: ModuleNotFoundError: No module named 'sklearn.utils._estimator_html_repr' - "scikit-learn>=1.3", - "catboost", # For the WS_CatboostClassifier - "lightgbm", # For the WS_LGBMClassifier - "scipy", # For the WS_LGBMClassifier to handle softmax + "scipy", # For WS_LGBMClassifier and ETSCalibrator + "relplot", # for smooth ECE. relplot fails for sklearn 1.7: ModuleNotFoundError: No + # module named 'sklearn.utils._estimator_html_repr' + "catboost", # For WS_CatboostClassifier and CatBoostCalibrator + "lightgbm", # For WS_LGBMClassifier and LightGBMCalibrator + "xgboost", # For XGBoostCalibrator ] dev = [ "matplotlib>=3.0", # plotting @@ -76,7 +82,7 @@ installer = "uv" features = ["extra", "dev"] [[tool.hatch.envs.test.matrix]] -python = ["3.9", "3.10", "3.11", "3.12"] +python = ["3.10", "3.11", "3.12", "3.13", "3.14"] [tool.hatch.envs.test.scripts] run-tests = "pytest {args}" @@ -118,4 +124,4 @@ exclude_lines = [ "no cov", "if __name__ == .__main__.:", "if TYPE_CHECKING:", -] \ No newline at end of file +] diff --git a/tests/test_calibrators.py b/tests/test_calibrators.py index aa96f68..dad91f1 100644 --- a/tests/test_calibrators.py +++ b/tests/test_calibrators.py @@ -1,5 +1,9 @@ from typing import Tuple, Optional +# Without this XGBoost gets a segmentation fault +import os +os.environ["OMP_NUM_THREADS"] = "1" + import tests # otherwise coverage warns import pytest @@ -9,32 +13,56 @@ import torch.autograd.graph from sklearn.model_selection import train_test_split -from probmetrics.calibrators import Calibrator, VennAbersCalibrator, MulticlassOneVsRestCalibrator, \ - BinaryVennAbersCalibrator, MulticlassOneVsOneCalibrator, SklearnCalibrator, get_calibrator, \ - TemperatureScalingCalibrator, PreprocessingCalibratorWrapper +from probmetrics.calibrators import * from probmetrics.distributions import CategoricalDirac, CategoricalProbs from probmetrics.metrics import BrierLoss, ClassificationMetric, SmoothCalibrationError, CalibrationError -binary_only_calibrators = {'linear-scaling', 'affine-scaling', 'quadratic-scaling'} +binary_only_calibrators = { + "linear-scaling", + "affine-scaling", + "quadratic-scaling", + "Hist-uniform", + "Hist-quantile", + "Scaling-Binning", + "BBQ", + "CIR", + "Venn-Abers", + "ENIR", + "Beta", + "Spline", + "CDF-Spline", +} calibrator_specs = [ - (name, get_calibrator(name)) for name in - [ - 'platt', 'isotonic', 'platt-logits', 'ivap-ovr', 'ivap-ovo', 'cir', 'temp-scaling', - 'autogluon-ts', 'torchunc-ts', - 'linear-scaling', 'affine-scaling', 'quadratic-scaling', 'logistic' - ] - ] + [ - ('svs', get_calibrator('svs', svs_opt='bfgs')), - # test with bfgs instead of saga since it is deterministic. - ('sms', get_calibrator('sms', sms_opt='bfgs')), - # test with bfgs instead of saga since it is deterministic. - ('temp-scaling-lbfgs', TemperatureScalingCalibrator(opt='lbfgs')), - ('temp-scaling-mixture', get_calibrator('temp-scaling', calibrate_with_mixture=True)), - ] - - -# [!] 'ivap' not running for test_calibrator_missing_class. + (name, get_calibrator(name)) for name in [ + "platt", "isotonic", + # "platt-logits", + "ivap-ovr", "ivap-ovo", "cir", + "temp-scaling", "autogluon-ts", "torchunc-ts", "linear-scaling", + "affine-scaling", "quadratic-scaling", "logistic", "svs", "sms" + ]] + [ + ("temp-scaling-lbfgs", TemperatureScalingCalibrator(opt="lbfgs")), + ("temp-scaling-mixture", get_calibrator("temp-scaling", calibrate_with_mixture=True)), + ("Hist-uniform", BinaryHistogramBinningCalibrator(strategy="uniform")), + ("Hist-quantile", BinaryHistogramBinningCalibrator(strategy="quantile")), + ("Scaling-Binning", BinaryPlattBinnerCalibrator()), + ("BBQ", NetcalBBQCalibrator()), + ("CIR", CenteredIsotonicRegressionCalibrator()), + ("Venn-Abers", BinaryVennAbersCalibrator()), + ("ENIR", NetcalENIRCalibrator()), + ("Beta", BetacalCalibrator()), + ("Spline", MLISplineCalibrator()), + ("CDF-Spline", CDFSplineCalibrator()), + ("ETS", ETSCalibrator()), + ("VS", VectorScalingCalibrator()), + ("MS", MatrixScalingCalibrator()), + ("Kernel", KernelCalibrator()), + ("XGBoost", XGBoostCalibrator()), + ("LightGBM", LightGBMCalibrator()), + ("CatBoost", CatBoostCalibrator()), +] + +# [!] "ivap" not running for test_calibrator_missing_class. def sample_labels(p: np.ndarray, random_state: Optional[int] = None) -> np.ndarray: """ @@ -49,10 +77,10 @@ def sample_labels(p: np.ndarray, random_state: Optional[int] = None) -> np.ndarr rng = np.random.default_rng(seed=random_state) cumulative_probs = np.cumsum(p, axis=-1) # Cumulative sum along each row random_values = rng.random(size=(*p.shape[:-1], 1)) # Uniform random values for each sample - # print(f'{p.shape=}, {cumulative_probs.shape=}, {random_values.shape=}') + # print(f"{p.shape=}, {cumulative_probs.shape=}, {random_values.shape=}") samples = (random_values < cumulative_probs).argmax( axis=-1) # Find the first index where cumulative probability exceeds random value - # print(f'{samples.shape=}') + # print(f"{samples.shape=}") return samples @@ -74,11 +102,11 @@ def calib_dataset(request) -> Tuple[np.ndarray, np.ndarray]: # test that they work with multiclass? # test that torch and numpy interfaces are equivalent? -# @pytest.mark.parametrize('calibrators', [ +# @pytest.mark.parametrize("calibrators", [ # (VennAbersCalibrator(use_ovo=False), MulticlassOneVsRestCalibrator(BinaryVennAbersCalibrator())), # (VennAbersCalibrator(use_ovo=True), MulticlassOneVsOneCalibrator(BinaryVennAbersCalibrator())), -# (SklearnCalibrator(method='isotonic', cv='prefit'), -# MulticlassOneVsRestCalibrator(SklearnCalibrator(method='isotonic', cv='prefit'))) +# (SklearnCalibrator(method="isotonic", cv="prefit"), +# MulticlassOneVsRestCalibrator(SklearnCalibrator(method="isotonic", cv="prefit"))) # ]) # def test_calibrators_equal(calibrators: Tuple[Calibrator, Calibrator], calib_dataset: Tuple[np.ndarray, np.ndarray]): # cal1, cal2 = calibrators @@ -94,16 +122,16 @@ def calib_dataset(request) -> Tuple[np.ndarray, np.ndarray]: # # assert np.allclose(cal1.predict_proba(X_test), cal2.predict_proba(X_test)) # np.testing.assert_allclose(cal1.predict_proba(X_test), cal2.predict_proba(X_test), atol=1e-7) -@pytest.mark.parametrize('metric', [BrierLoss(), SmoothCalibrationError(), CalibrationError()]) -@pytest.mark.parametrize(('calibrator_name', 'calibrator'), +@pytest.mark.parametrize("metric", [BrierLoss(), SmoothCalibrationError(), CalibrationError()]) +@pytest.mark.parametrize(("calibrator_name", "calibrator"), # torchunc-ts sometimes fails, so we exclude it here - [(name, calib) for name, calib in calibrator_specs if name != 'torchunc-ts']) + [(name, calib) for name, calib in calibrator_specs if name != "torchunc-ts"]) def test_calibrator_performance(metric: ClassificationMetric, calibrator_name: str, calibrator: Calibrator, calib_dataset: Tuple[np.ndarray, np.ndarray]): X, y = calib_dataset n_classes = X.shape[-1] - # don't test with train/test split since it might fail due to overfitting + # don"t test with train/test split since it might fail due to overfitting # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5) # # cal = sklearn.base.clone(calibrator) @@ -127,10 +155,10 @@ def test_calibrator_performance(metric: ClassificationMetric, calibrator_name: s with_cal = metric.compute(y_true=y_true, y_pred=CategoricalProbs(torch.as_tensor(y_pred_probs))).item() # loss after calibration should be better than before - assert with_cal < without_cal + assert with_cal <= without_cal -@pytest.mark.parametrize(('calibrator_name', 'calibrator'), calibrator_specs) +@pytest.mark.parametrize(("calibrator_name", "calibrator"), calibrator_specs) def test_calibrator_torch_vs_numpy(calibrator_name: str, calibrator: Calibrator, calib_dataset: Tuple[np.ndarray, np.ndarray]): X, y = calib_dataset @@ -160,7 +188,7 @@ def test_calibrator_torch_vs_numpy(calibrator_name: str, calibrator: Calibrator, np.testing.assert_allclose(preds[i], preds[0], atol=1e-7) -@pytest.mark.parametrize(('calibrator_name', 'calibrator'), calibrator_specs) +@pytest.mark.parametrize(("calibrator_name", "calibrator"), calibrator_specs) def test_calibrator_missing_class(calibrator_name: str, calibrator: Calibrator): n_samples = 1000 n_classes = 4