diff --git a/.github/workflows/testing.yml b/.github/workflows/testing.yml
index 83242a1..6756cdc 100644
--- a/.github/workflows/testing.yml
+++ b/.github/workflows/testing.yml
@@ -16,7 +16,7 @@ jobs:
       fail-fast: false
       matrix:
         os: [windows-latest, ubuntu-latest, macos-latest]
-        python-version: ['3.9', '3.10', '3.11', '3.12']
+        python-version: ['3.10', '3.11', '3.12', '3.13', '3.14']
     runs-on: ${{ matrix.os }}
     env:
       PIP_NO_INPUT: "1"
@@ -25,6 +25,9 @@ jobs:
       PIP_RETRIES: "5"
     steps:
       - uses: actions/checkout@v4
+      - name: Install OpenMP
+        if: runner.os == 'macOS'
+        run: brew install libomp
       - name: Set up Python ${{ matrix.python-version }}
         uses: actions/setup-python@v5
         with:
diff --git a/README.md b/README.md
index df4c465..fe29c57 100644
--- a/README.md
+++ b/README.md
@@ -5,27 +5,26 @@
 # Probmetrics: Classification metrics and post-hoc calibration
 
 This package (PyTorch-based) currently contains
-- classification metrics, especially also 
-metrics for assessing the quality of probabilistic predictions, and
-- post-hoc calibration methods, especially
-  - a fast and accurate implementation of temperature scaling.
-  - an implementation of structured matrix scaling (SMS), 
-    a regularized version of matrix scaling that outperforms other 
-    logistic-based calibration functions.
-
-It accompanies our papers
-[Rethinking Early Stopping: Refine, Then Calibrate](https://arxiv.org/abs/2501.19195) and [Structured Matrix Scaling for Multi-Class Calibration](https://arxiv.org/abs/2511.03685) 
-and [A Variational Estimator for Lp Calibration Errors](https://arxiv.org/abs/2602.24230).
+- post-hoc calibration methods, in particular:
+  - a fast and accurate temperature scaling implementation described in [1]
+  - an implementation of structured matrix scaling (SMS), a regularized version of matrix scaling introduced in [2]
+  - implementations for all the post-hoc calibration methods in the CalArena benchmark [4]
+- classification metrics, especially metrics for assessing the quality of probabilistic
+  predictions, in particular:
+  - our classifier based $L_p$ calibration error estimators from [3]
+
+It accompanies our papers  
+[1] [Rethinking Early Stopping: Refine, Then Calibrate](https://arxiv.org/abs/2501.19195)
+(see also: [vision experiments](https://github.com/eugeneberta/RefineThenCalibrate-Vision), [tabular experiments](https://github.com/dholzmueller/pytabkit), [theory](https://github.com/eugeneberta/RefineThenCalibrate-Theory))  
+[2] [Structured Matrix Scaling for Multi-Class Calibration](https://arxiv.org/abs/2511.03685)
+(see also: [experiments](https://github.com/eugeneberta/LogisticCalibrationBenchmark))  
+[3] [A Variational Estimator for Lp Calibration Errors](https://arxiv.org/abs/2602.24230)
+(see also [all experiments](https://github.com/ElSacho/Evaluating_Lp_Calibration_Errors))  
+[4] [CalArena: A Large-Scale Post-Hoc Calibration Benchmark](link_to_come)
+(see also [leaderboards and experiments](https://github.com/probkit/CalArena))  
+
 Please cite us if you use this repository for research purposes.
-The experiments from the papers can be found here: 
-- Rethinking Early Stopping:
-  - [vision experiments](https://github.com/eugeneberta/RefineThenCalibrate-Vision).
-  - [tabular experiments](https://github.com/dholzmueller/pytabkit).
-  - [theory](https://github.com/eugeneberta/RefineThenCalibrate-Theory).
-- Structured Matrix Scaling: 
-  [all experiments](https://github.com/eugeneberta/LogisticCalibrationBenchmark).
-- A Variational Estimator for Lp Calibration Errors: 
-  [all experiments](https://github.com/ElSacho/Evaluating_Lp_Calibration_Errors).
+
 
 ## Installation
 
@@ -34,31 +33,40 @@ Probmetrics is available via
 pip install probmetrics
 ```
 To obtain all functionality, install `probmetrics[extra,dev,dirichletcal]`.
-- extra installs more packages for our CatBoost/LightGBM-based $L_p$ calibration 
-  error metrics, smooth ECE (only works with scikit-learn versions <= 1.6), 
-  Venn-Abers calibration, 
-  centered isotonic regression, 
-  and the temperature scaling implementation in NetCal.
-- dev installs more packages for development (esp. testing)
-- dirichletcal installs Dirichlet calibration, 
-  which however only works for Python 3.12 upwards.
+- extra installs:
+  - numba for SAGA based optimizers in logistic calibrators SVS, SMS and others.
+  - relplot for smooth ECE (only works with scikit-learn versions <= 1.6).
+  - catboost, lightgbm for our classifier-based $L_p$ calibration error metrics.
+  - other packages supporting more post-hoc calibration methods: netcal (BBQ, ENIR, Netcal TS),
+    uncertainty-calibration (Scaling-binning), betacal (Beta calibration), splinecalib
+    (Spline calibration), venn-abers (Venn-Abers calibration), cir-model (CIR), scipy
+    (ETS).
+- dev installs more packages for development (especially testing)
+- dirichletcal installs Dirichlet calibration, which however only works for Python 3.12
+upwards.
 
 ## Using post-hoc calibration methods
 
-You can create a calibrator as follows:
+You can create a calibrator by importing it from `probmetrics.calibrators`
+```python
+from probmetrics.calibrators import TemperatureScalingCalibrator
+
+calib = TemperatureScalingCalibrator()
+```
+
+or using the `get_calibrator` factory function:
 ```python
 from probmetrics.calibrators import get_calibrator
 
-calib = get_calibrator('logistic')
+calib = get_calibrator("logistic")
 ```
 
-These are the main supported methods:
+<!-- Below are the main supported methods:
 - `'logistic'` defaults to structured matrix scaling (SMS) for multiclass 
   and quadratic scaling for binary calibration. 
-  We recommend using `'logistic'` for best results, 
-  especially on multiclass problems. 
+  We recommend using `'logistic'` for best results, especially on multiclass problems. 
   It can be slow for larger numbers of classes. Only runs on CPU. 
-  For the SAGA version (not the default), 
+  For the SAGA version (not the default),
   the first call is slower due to numba compilation.
 - `'svs'`: Structured vector scaling (SVS) for multiclass problems, 
   faster than SMS for multiclass while being almost as good in many cases.
@@ -66,8 +74,7 @@ These are the main supported methods:
   underperforms `'logistic'` (quadratic scaling) in our benchmarks but preserves AUC.
 - `'temp-scaling'`: Our 
   [highly efficient implementation of temperature scaling](https://arxiv.org/abs/2501.19195)
-  that, unlike some other implementations, 
-  does not suffer from optimization issues. 
+  that, unlike some other implementations, does not suffer from optimization issues. 
   Temperature scaling is not as expressive as matrix or vector scaling variants,
   but it is faster and has the least overfitting risk.
 - `'ts-mix'`: Same as `'temp-scaling'` but with Laplace smoothing 
@@ -78,29 +85,29 @@ These are the main supported methods:
 - `'ivap'` Inductive Venn-Abers predictor (a version of isotonic regression, slow but a bit better)
 - `'cir'` Centered isotonic regression (slightly better and slower than isotonic)
 - `'dircal'` Dirichlet calibration (slow, logistic performs better in our experiments)
-- `'dircal-cv'` Dirichlet calibration optimized with cross-validation (very slow)
+- `'dircal-cv'` Dirichlet calibration optimized with cross-validation (very slow) -->
+
+You can find all post-hoc calibration functions implemented in `probmetrics/calibrators`.
+A subset of those methods is supported by the `get_calibrator` function, find details in `probmetrics/calibrators/factory.py`.
 
-More details on parameters and other methods can be found in the get_calibrator function 
-[here](https://github.com/dholzmueller/probmetrics/probmetrics/calibrators.py).
 
-### Usage with `numpy`
+### Usage with numpy
 
 ```python
 import numpy as np
 
 probas = np.asarray([[0.1, 0.9]])  # shape = (n_samples, n_classes)
 labels = np.asarray([1])  # shape = (n_samples,)
+
 calib.fit(probas, labels)
-calibrated_probas = calib.predict_proba(probas)
+calibrated_probas = calib.predict_proba(probas) # shape = (n_samples, n_classes)
 ```
 
 ### Usage with PyTorch
 
-The PyTorch version can be used directly with GPU tensors, 
-which is leveraged by our temperature scaling implementation 
-but not by most other methods.
-For temperature scaling, this could accelerate things, 
-but the CPU version can be faster 
+The PyTorch version can be used directly with GPU tensors, which is leveraged by our
+temperature scaling implementation but not by most other methods.
+For temperature scaling, this could accelerate things, but the CPU version can be faster 
 for smaller validation sets (around 1K-10K samples).
 
 ```python
@@ -116,74 +123,83 @@ result = calib.predict_proba_torch(CategoricalProbs(probas))
 calibrated_probas = result.get_probs()
 ```
 
+## Using metrics
 
-## Using our refinement and calibration metrics
-
-We provide estimators for refinement error 
-(loss after post-hoc calibration)
-and calibration error 
-(loss improvement through post-hoc calibration). 
-They can be used as follows:
+We provide classification and calibration metrics, which can be used as follows:
 
 ```python
 import torch
 from probmetrics.metrics import Metrics
 
-# compute multiple metrics at once 
-# this is more efficient than computing them individually
-metrics = Metrics.from_names(['logloss', 
-                              'refinement_logloss_ts-mix_all', 
-                              'calib-err_logloss_ts-mix_all'])
+# computing multiple metrics at once is more efficient than computing them individually
+metrics = Metrics.from_names([
+    "logloss",
+    "brier", # for binary, this is 2x the brier from sklearn
+    "accuracy",
+    "class-error",
+    "auroc-ovr", # one-vs-rest
+    "auroc-ovo-sklearn", # one-vs-one (can be slow!)
+    "ece-15", # Expected calibration error with 15 bins
+    "rmsce-15", # Root mean squared calibration error with 15 bins
+    "mce-15", # Maximum calibration error with 15 bins
+    "smece", # Smooth ECE, requires the relplot package.
+])
+
 y_true = torch.tensor(...)
-y_logits = torch.tensor(...)
-results = metrics.compute_all_from_labels_logits(y_true, y_logits)
-print(results['refinement_logloss_ts-mix_all'].item())
+y_pred = torch.tensor(...)
+
+results = metrics.compute_all_from_labels_probs(y_true, y_pred)
+print(results["brier"].item())
 ```
 
-## Using more metrics
+While there are some classes for regression metrics, they are not implemented yet.
+
+The following function returns a list of all metric names:
+```python
+from probmetrics.metrics import Metrics, MetricType
+Metrics.get_available_names(metric_type=MetricType.CLASS)
+```
+
+### Using our refinement and calibration metrics
+
+We provide estimators for refinement error (loss after post-hoc calibration) and
+calibration error (loss improvement through post-hoc calibration).
+They can be used as follows:
 
-In general, while some metrics can be 
-flexibly configured using the corresponding classes,
-many metrics are available through their name. 
-Here are some relevant classification metrics:
 ```python
 from probmetrics.metrics import Metrics
 
 metrics = Metrics.from_names([
-    'logloss',
-    'brier',  # for binary, this is 2x the brier from sklearn
-    'accuracy', 'class-error',
-    'auroc-ovr', # one-vs-rest
-    'auroc-ovo-sklearn', # one-vs-one (can be slow!)
-    # calibration metrics
-    'ece-15', 'rmsce-15', 'mce-15', 'smece'
-    'refinement_logloss_ts-mix_all', 
-    'calib-err_logloss_ts-mix_all',
-    'refinement_brier_ts-mix_all', 
-    'calib-err_brier_ts-mix_all',
-    'calib-err_proper-L1-binary-as-1d_WS_CatboostClassifier_all',
-    'calib-err_proper-L2-binary-as-1d_WS_CatboostClassifier_all',
-    'calib-err_proper-Linf-binary-as-1d_WS_CatboostClassifier_all',
+    # Mean test logloss after post-hoc calibration with ts-mix:
+    "refinement_logloss_ts-mix_all",
+
+    # Difference in mean test logloss before/after calibration with ts-mix:
+    "calib-err_logloss_ts-mix_all",
+
+    # Same thing for Brier score:
+    "refinement_brier_ts-mix_all",
+    "calib-err_brier_ts-mix_all",
+
+    # Using the L1, L2 and Linf calibration error estimators (CatBoost based) described
+    # in [3]:
+    "calib-err_proper-L1-binary-as-1d_WS_CatboostClassifier_all",
+    "calib-err_proper-L2-binary-as-1d_WS_CatboostClassifier_all",
+    "calib-err_proper-Linf-binary-as-1d_WS_CatboostClassifier_all",
 ])
-```
 
-The following function returns a list of all metric names:
-```python
-from probmetrics.metrics import Metrics, MetricType
-Metrics.get_available_names(metric_type=MetricType.CLASS)
-```
+y_true = torch.tensor(...)
+y_logits = torch.tensor(...)
 
-While there are some classes for regression metrics, they are not implemented.
+results = metrics.compute_all_from_labels_logits(y_true, y_logits)
+print(results['refinement_logloss_ts-mix_all'].item())
+```
 
-## Advanced calibration, confidence, and top-class metrics
+### Advanced calibration, confidence, and top-class metrics
 
 Beyond standard metrics, you can evaluate proper Lp calibration errors for 
 any p, as well as isolate specific types of errors like over-confidence, 
 under-confidence, and top-class errors. 
 
-**Note:** Over- and under-confidence metrics are designed for binary classification.
-To use those for multi-class, please use `TopClassLoss(OverConfidenceLoss(your_metric))`.
-
 ```python
 from probmetrics.metrics import (
   ProperLpLoss,
@@ -212,17 +228,23 @@ over_topclass_brier = TopClassLoss(OverConfidenceLoss(BrierLoss()))
 
 # Some metrics are listed by default, here are some of them
 metrics = metrics = Metrics.from_names([
-    'proper-L1-binary-as-1d', # use to estimate  E[ \| Y - E[Y|f(X)] \|_1 ] and treat binary 
-                              # predictions as scalars with shapes (n,1) )
-    'proper-L2', # use to estimate  E[ \| Y - E[Y|f(X)] \|_2 ] (and treat binary predictions 
-                 # as vector with shapes (n,2) )
+    # Estimate  E[ \| Y - E[Y|f(X)] \|_1 ] and treat binary predictions as scalars with
+    # shapes (n,1)
+    "proper-L1-binary-as-1d"
+
+    # Estimate  E[ \| Y - E[Y|f(X)] \|_2 ] and treat binary predictions as vectors with
+    # shapes (n,2)
+    "proper-L2",
+
     "topclass-proper-L1-binary-as-1d", # Estimate L1 calibration error of top class 
     "topclass-under-proper-L1-binary-as-1d", # Estimate L1-overconfidence of top class 
     "topclass-over-proper-L1-binary-as-1d", # Estimate L1-underconfidence of top class
 ])
-
 ```
 
+> **Note:** Over- and under-confidence metrics are designed for binary classification.
+For multi-class in a top-class fashion, please use `TopClassLoss(OverConfidenceLoss(your_metric))`.
+
 Once those losses are defined, you can evaluate the calibration error by doing:
 
 ```python
@@ -232,53 +254,51 @@ from probmetrics.splitters import CVSplitter
 
 loss = ProperLpLoss(p=2) 
 
-metrics = MetricsWithCalibration(loss,
-                            calibrator=WS_CatboostClassifier(), # The classifier used to recalibrate the predictions
-                            val_splitter=CVSplitter(n_cv=5) # cross-validation splitter
-                            )
-
-# or use combined metrics to evaluate multiple metrics 
-# while fitting the post-hoc calibrator only once
-combined_losses = CombinedMetrics( 
-                                    [
-                                    ProperLpLoss(p=1), 
-                                    OverConfidenceLoss.from_name("brier"), 
-                                    OverConfidenceLoss.from_name("proper-L1") , 
-                                    UnderConfidenceLoss.from_name("proper-L1" ), 
-                                    UnderConfidenceLoss( BrierLoss() ),
-                                    BrierLoss()
-                                    ]
-                                  )
-
-metrics = MetricsWithCalibration(combined_losses,
-                            calibrator=WS_LGBMClassifier(), 
-                            val_splitter=CVSplitter(n_cv=5)
-                            )
+metrics = MetricsWithCalibration(
+    loss,
+    calibrator=WS_CatboostClassifier(), # classifier used to recalibrate the predictions
+    val_splitter=CVSplitter(n_cv=5) # cross-validation splitter
+)
+
+# Or use combined metrics to evaluate multiple metrics while fitting the post-hoc
+# calibrator only once
+combined_losses = CombinedMetrics([
+    ProperLpLoss(p=1), 
+    OverConfidenceLoss.from_name("brier"), 
+    OverConfidenceLoss.from_name("proper-L1") , 
+    UnderConfidenceLoss.from_name("proper-L1" ), 
+    UnderConfidenceLoss(BrierLoss()),
+    BrierLoss()
+])
+
+metrics = MetricsWithCalibration(
+    combined_losses,
+    calibrator=WS_LGBMClassifier(), 
+    val_splitter=CVSplitter(n_cv=5)
+)
 
 y_true = torch.tensor(...)
 y_prob = torch.tensor(...)
+
 results = metrics.compute_all_from_labels_probs(y_true, y_prob)
 ```
 
-The `calibrator` argument is a class used to recalibrate the original predictions. 
-Any estimator that inherits from sklearn.base.ClassifierMixin (i.e., follows the 
-scikit-learn classifier API) and implements `predict_proba()` can be used.
+The `calibrator` argument is used to recalibrate the original predictions. 
+Any class inheriting from sklearn.base.ClassifierMixin (i.e., follows the scikit-learn 
+classifier API) and implementing `predict_proba()` can be used.
 We recommend using `WS_CatboostClassifier` with default parameters. 
-The "WS" stands for "Warm Start", as predictions are initialized at the 
-original predicted $f(x)$ values (see the paper [A Variational Estimator for Lp 
-Calibration Errors](https://arxiv.org/abs/2602.24230) for additional information). 
-
+"WS" stands for "Warm Started", as predictions are initialized at the original predicted
+$f(x)$ values (see [3]). 
 
-### Binary vs. multiclass formatting
+#### Binary vs. multiclass formatting
 
-The library internally stores predictions in a multiclass format 
-with shape `(n_samples, n_classes)`.
-For binary classification, for some metrics 
-you can control whether to treat the output as a two-column distribution 
-or a single-column probability using the `binary_as_multiclass` parameter.
-For example, for `BrierLoss()`, using `binary_as_multiclass=False` 
-will yield the scikit-learn formula, while `binary_as_multiclass=True` 
-will yield twice the value.
+The library internally stores predictions in a multiclass format with shape
+`(n_samples, n_classes)`.
+For binary classification, for some metrics you can control whether to treat the output
+as a two-column distribution or a single-column probability using the
+`binary_as_multiclass` parameter.
+For example, for `BrierLoss()`, using `binary_as_multiclass=False` will yield the
+scikit-learn formula, while `binary_as_multiclass=True` will yield twice the value.
 
 Setting `binary_as_multiclass=False` tells the loss function to treat 
 `(n_samples, 2)` predictions as a single-column `(n_samples, 1)` probability.
@@ -306,9 +326,22 @@ based on $f(X)$ instead of $g(f(X))$ so the loss difference uses the same choice
 
 ## Releases
 
+- v1.3.0 by [@eugeneberta](https://github.com/eugeneberta):
+  - Splitted calibrators.py into several subfiles and implemented many new calibrators,
+  most of which are benchmarked and ranked in the CalArena leaderboard.
+  Among others, added Binary histogram binning, Scaling-binning (from
+  uncertainty-calibration), BBQ (from netcal), Beta calibration (from betacal), ENIR
+  (from netcal), binary and multiclass Kernel calibration (using beta and dirichlet
+  kernels from ece_kde), Spline calibration (from splinecalib), CDF-Spline calibration,
+  Ensemble Temperature Scaling, tree based calibration with CatBoost, LightGBM and
+  XGBoost.  
+  - Re-structured the base `Calibrator` class to differentiate
+  `_predict_proba_torch_impl` from `_predict_proba_impl`.
+  - Added Kuiper and Kolmogorov-Smirnov binary calibration metrics.
+  - Deprecated python 3.9, added python 3.13 and 3.14 support.
 - v1.2.0 by [@elsacho](https://github.com/elsacho): Added new proper loss functions:
-  - ProperLpLoss(p=p): Metrics to evaluate $E[ \Vert f(X) - E[Y|f(X)] \Vert_p ]$ where $f(X)$ are the 
-    predictions of the classifier, $p >= 1$, including `p=float("inf")`
+  - ProperLpLoss(p=p): Metrics to evaluate $E[ \Vert f(X) - E[Y|f(X)] \Vert_p ]$ where
+    $f(X)$ are the predictions of the classifier, $p >= 1$, including `p=float("inf")`
   - TopClassLoss: A wrapper to variationally evaluate top-class errors.
   - OverConfidenceLoss & UnderConfidenceLoss: Wrappers to variationally evaluate 
     over/under-confidence in binary predictors.
@@ -316,7 +349,8 @@ based on $f(X)$ instead of $g(f(X))$ so the loss difference uses the same choice
   - New classifiers:  Added `WS_CatboostClassifier` and `WS_LGBMClassifier` for 
     evaluating calibration errors.
   - removed sklearn < 1.7 constraint.
-- v1.1.0 by [@eugeneberta](https://github.com/eugeneberta): Improvements to the SVS and SMS calibrators:
+- v1.1.0 by [@eugeneberta](https://github.com/eugeneberta): Improvements to the SVS and
+  SMS calibrators:
   - logit pre-processing with `'ts-mix'` is now automatic, 
     and the global scaling parameter $\alpha$ is fixed to 1. This yields:
     - improved performance on our tabular and computer vision benchmarks 
@@ -329,10 +363,9 @@ based on $f(X)$ instead of $g(f(X))$ so the loss difference uses the same choice
   - the default binary calibrator in `LogisticCalibrator` is now quadratic scaling 
     instead of affine scaling, this can be changed back by using 
     `LogisticCalibrator(binary_type='affine')`.
-- v1.0.0 by [@eugeneberta](https://github.com/eugeneberta): New post-hoc calibrators like `'logistic'` 
-  including structured matrix scaling (SMS), 
-  structured vector scaling (SVS), 
-  affine scaling, and quadratic scaling.
+- v1.0.0 by [@eugeneberta](https://github.com/eugeneberta): New post-hoc calibrators
+  like `'logistic'` including structured matrix scaling (SMS), structured vector scaling
+  (SVS), affine scaling, and quadratic scaling.
 - v0.0.2 by [@dholzmueller](https://github.com/dholzmueller):
   - Removed numpy<2.0 constraint
   - allow 1D vectors in CategoricalLogits / CategoricalProbs
diff --git a/probmetrics/__about__.py b/probmetrics/__about__.py
index 881244a..d63e145 100644
--- a/probmetrics/__about__.py
+++ b/probmetrics/__about__.py
@@ -1,2 +1,2 @@
 # SPDX-License-Identifier: Apache-2.0
-__version__ = "1.2.0"
+__version__ = "1.3.0"
diff --git a/probmetrics/calibrators.py b/probmetrics/calibrators.py
deleted file mode 100644
index ef01281..0000000
--- a/probmetrics/calibrators.py
+++ /dev/null
@@ -1,1858 +0,0 @@
-import warnings
-from typing import Callable, Optional, List
-
-import torch
-import sklearn
-import logging
-import torchmin
-import numpy as np
-
-from probmetrics.saga import (
-    fit_affine_scaling,
-    warm_up_affine_scaling,
-    fit_quadratic_scaling,
-    warm_up_quadratic_scaling,
-    fit_vector_scaling,
-    warm_up_vector_scaling,
-    fit_matrix_scaling,
-    warm_up_matrix_scaling,
-)
-from probmetrics.utils import (
-    process_binary_probs,
-    binary_probs_to_logits,
-    multiclass_probs_to_logits,
-    validate_probabilities,
-)
-from sklearn.base import BaseEstimator, ClassifierMixin
-from sklearn.calibration import CalibratedClassifierCV
-from sklearn.model_selection import StratifiedKFold, GridSearchCV
-from sklearn.preprocessing import LabelEncoder
-
-from torch import nn
-
-from probmetrics.distributions import (
-    CategoricalDistribution,
-    CategoricalProbs,
-    CategoricalLogits,
-)
-
-logger = logging.getLogger(__name__)
-
-
-class Calibrator(BaseEstimator, ClassifierMixin):
-    """
-    Calibrator base class. To implement,
-    - override at least one of _fit_impl and _fit_torch_impl
-    - override at least one of predict_proba and predict_proba_torch
-    """
-
-    def __init__(self):
-        assert self.__class__.fit == Calibrator.fit
-        assert self.__class__.fit_torch == Calibrator.fit_torch
-
-    def fit(self, X, y) -> "Calibrator":
-        assert isinstance(X, np.ndarray)
-        assert isinstance(y, np.ndarray)
-
-        self.classes_ = list(range(X.shape[-1]))
-
-        if self.__class__._fit_impl != Calibrator._fit_impl:
-            self._fit_impl(X, y)
-            return self
-
-        if self.__class__._fit_torch_impl != Calibrator._fit_torch_impl:
-            self._fit_torch_impl(
-                y_pred=CategoricalProbs(torch.as_tensor(X)),
-                y_true_labels=torch.as_tensor(y, dtype=torch.long),
-            )
-            return self
-
-        raise NotImplementedError()
-
-    def _fit_impl(self, X: np.ndarray, y: np.ndarray) -> None:
-        raise NotImplementedError()
-
-    def fit_torch(
-        self, y_pred: CategoricalDistribution, y_true_labels: torch.Tensor
-    ) -> "Calibrator":
-        assert isinstance(y_true_labels, torch.Tensor)
-        assert isinstance(y_pred, CategoricalDistribution)
-        # default implementation, using sklearn
-        self.classes_ = list(range(y_pred.get_n_classes()))
-
-        if self.__class__._fit_torch_impl != Calibrator._fit_torch_impl:
-            self._fit_torch_impl(y_pred, y_true_labels)
-            return self
-
-        if self.__class__._fit_impl != Calibrator._fit_impl:
-            self._fit_impl(
-                y_pred.get_probs().detach().cpu().numpy(),
-                y_true_labels.detach().cpu().numpy(),
-            )
-            return self
-
-        raise NotImplementedError()
-
-    def _fit_torch_impl(
-        self, y_pred: CategoricalDistribution, y_true_labels: torch.Tensor
-    ):
-        raise NotImplementedError()
-
-    def predict_proba(self, X: np.ndarray) -> np.ndarray:
-        if self.__class__.predict_proba_torch != Calibrator.predict_proba_torch:
-            return (
-                self.predict_proba_torch(CategoricalProbs(torch.as_tensor(X)))
-                .get_probs()
-                .numpy()
-            )
-
-        raise NotImplementedError()
-
-    def predict_proba_torch(
-        self, y_pred: CategoricalDistribution
-    ) -> CategoricalDistribution:
-        if self.__class__.predict_proba != Calibrator.predict_proba:
-            y_pred_probs = y_pred.get_probs()
-            probs = self.predict_proba(y_pred_probs.detach().cpu().numpy())
-            return CategoricalProbs(
-                torch.as_tensor(
-                    probs, device=y_pred_probs.device, dtype=y_pred_probs.dtype
-                )
-            )
-
-        raise NotImplementedError()
-
-    def predict(self, X):
-        y_probs = self.predict_proba(X)
-        class_idxs = np.argmax(y_probs, axis=-1)
-        return np.asarray(self.classes_)[class_idxs]
-
-
-class ApplyToLogitsCalibrator(Calibrator):
-    def __init__(self, calib: BaseEstimator):
-        super().__init__()
-        self.calib = calib
-
-    def _fit_impl(self, X: np.ndarray, y: np.ndarray) -> None:
-        self.calib_ = sklearn.base.clone(self.calib)
-        self.calib_.fit(np.log(X + 1e-10), y)
-
-    def predict_proba(self, X):
-        return self.calib_.predict_proba(np.log(X + 1e-10))
-
-
-def bisection_search(f: Callable[[float], float], a: float, b: float, n_steps: int):
-    for _ in range(n_steps):
-        c = a + 0.5 * (b - a)
-        f_c = f(c)
-        if f_c > 0:
-            b = c
-        else:
-            a = c
-
-    return 0.5 * (a + b)
-
-
-class TemperatureScalingCalibrator(Calibrator):
-    def __init__(
-        self,
-        opt: str = "bisection",
-        max_bisection_steps: int = 30,
-        lr: float = 0.1,
-        max_iter: int = 200,
-        use_inv_temp: bool = True,
-        inv_temp_init: float = 1 / 1.5,
-    ):
-        super().__init__()
-        self.lr = lr
-        self.max_bisection_steps = max_bisection_steps
-        self.max_iter = max_iter
-        self.use_inv_temp = use_inv_temp
-        self.inv_temp_init = inv_temp_init
-        self.opt = opt
-
-    def _get_loss_grad(self, invtemp: float, logits: torch.Tensor, y: torch.Tensor):
-        part_1 = torch.mean(
-            torch.sum(logits * torch.softmax(invtemp * logits, dim=-1), dim=-1)
-        )
-        part_2 = torch.mean(logits[torch.arange(logits.shape[0]), y])
-        return (part_1 - part_2).item()
-
-    def _fit_torch_impl(
-        self, y_pred: CategoricalDistribution, y_true_labels: torch.Tensor
-    ):
-        logits = y_pred.get_logits()
-        labels = y_true_labels
-
-        if self.opt in ["lbfgs", "lbfgs_line_search"]:
-            self._fit_lbfgs(logits, labels)
-        elif self.opt == "bisection":
-            self._fit_bisection(logits, labels)
-        else:
-            raise ValueError(f'Unknown optimizer "{self.opt}"')
-
-        # print(f'{self.invtemp_=}')
-
-    def _fit_lbfgs(self, logits: torch.Tensor, labels: torch.Tensor):
-        # following https://github.com/gpleiss/temperature_scaling/blob/master/temperature_scaling.py
-        param = nn.Parameter(
-            torch.ones(1, device=logits.device)
-            * (self.inv_temp_init if self.use_inv_temp else 1 / self.inv_temp_init)
-        )
-        criterion = nn.CrossEntropyLoss()
-        optimizer = torch.optim.LBFGS(
-            [param],
-            lr=self.lr,
-            max_iter=self.max_iter,
-            line_search_fn="strong_wolfe" if self.opt == "lbfgs_line_search" else None,
-        )
-
-        def eval():
-            optimizer.zero_grad()
-            y_pred = (
-                logits * param[:, None]
-                if self.use_inv_temp
-                else logits / param[:, None]
-            )
-            loss = criterion(y_pred, labels)
-            loss.backward()
-            return loss
-
-        optimizer.step(eval)
-
-        self.invtemp_ = param.item() if self.use_inv_temp else 1 / param.item()
-
-    def _fit_bisection(self, logits: torch.Tensor, labels: torch.Tensor):
-        objective_grad = lambda u, l=logits, tar=labels: self._get_loss_grad(
-            np.exp(u), l, tar
-        )
-
-        # should reach about float32 accuracy
-        # need log_2(32) = 5 steps to get to length 1 and then 24 more steps to get to float32 epsilon (2^{-24})
-        self.invtemp_ = np.exp(
-            bisection_search(
-                objective_grad, a=-16, b=16, n_steps=self.max_bisection_steps
-            )
-        )
-        # print(f'{self.invtemp_=}')
-
-    def predict_proba_torch(
-        self, y_pred: CategoricalDistribution
-    ) -> CategoricalDistribution:
-        return CategoricalLogits(self.invtemp_ * y_pred.get_logits())
-
-
-### New binary calibrators ###
-
-
-class BinaryLogisticCalibrator(Calibrator):
-    """
-    Binary post-hoc calibration with linear, affine or quadratic scaling
-    on the binary logits using sklearn's LogisticRegression.
-
-    This class fits a model of the form:
-    P(y=1) = sigma(calibrated_logit)
-
-    Where `calibrated_logit` depends on the `type`:
-    - 'linear':    calibrated_logit = w * logit (Temperature scaling with inverse temperature)
-    - 'affine':    calibrated_logit = b + w * logit (Platt scaling)
-    - 'quadratic': calibrated_logit = b + w1 * logit + w2 * logit^2
-    """
-
-    VALID_TYPES = ["linear", "affine", "quadratic", "beta"]
-
-    def __init__(self, type: str = "affine"):
-        """
-        Args:
-            type (str, optional): The type of scaling.
-                One of ['linear', 'affine', 'quadratic'].
-                Defaults to 'affine'.
-        """
-        super().__init__()
-
-        if type not in self.VALID_TYPES:
-            raise ValueError(
-                f"Invalid type '{type}'. Must be one of {self.VALID_TYPES}"
-            )
-        self.type = type
-        self.cal_ = None
-
-    def _get_logits(self, X: np.ndarray) -> np.ndarray:
-        log_p, log_1_minus_p = binary_probs_to_logits(X)
-        logits = log_p - log_1_minus_p
-        
-        if self.type == "linear":
-            return logits
-        
-        ones = np.ones_like(logits)
-        
-        if self.type == "affine":
-            return np.hstack([ones, logits])
-        
-        elif self.type == "quadratic":
-            return np.hstack([ones, logits, np.square(logits)])
-        
-        if self.type == "beta": # TODO Check that this is indeed beta
-            return np.hstack([ones, log_p, log_1_minus_p])
-
-    def _fit_impl(self, X: np.ndarray, y: np.ndarray) -> None:
-        """Fits the logistic regression calibrator."""
-        from sklearn.linear_model import LogisticRegression
-
-        self.cal_ = LogisticRegression(
-            penalty=None,
-            fit_intercept=False,
-            solver="lbfgs"
-        )
-
-        features = self._get_logits(X)
-        self.cal_.fit(features, y)
-
-    def predict_proba(self, X: np.ndarray) -> np.ndarray:
-        """Generates calibrated probability estimates."""
-        if self.cal_ is None:
-            raise RuntimeError(
-                "Calibrator has not been fitted yet. Call fit() before predict_proba()."
-            )
-
-        features = self._get_logits(X)
-        return self.cal_.predict_proba(features)
-
-
-class RegularizedAffineScalingCalibrator(Calibrator):
-    """
-    Binary post-hoc calibration with Platt scaling (~binary logistic regression) $sigmoid(ax+b)$ on the logits x.
-    Uses the SAGA algorithm to fit the scaling and intercept parameters a & b.
-    Numba functions are warmed-up the first time the class is initialized (~1s).
-    A penalty (one of ["mcp", "lasso", "ridge"]) can be applied to the intercept parameter b, with regularization strength lambda_intercept.
-    To apply no regularization, leave penalty to None.
-    """
-
-    _warmed_up = False
-
-    def __init__(
-        self,
-        penalty: str = None,
-        lambda_intercept: float = 0.0,
-        max_iter: int = 200,
-        tol: float = 1e-4,
-        print_init_info: bool = True,
-    ):
-        super().__init__()
-        self.penalty = penalty
-        self.lambda_intercept = lambda_intercept
-        self.max_iter = max_iter
-        self.tol = tol
-        self.print_init_info = print_init_info
-
-        self.w = None
-
-        if not RegularizedAffineScalingCalibrator._warmed_up:
-            if self.print_init_info:
-                print(
-                    "First RegularizedAffineScalingCalibrator instantiation - warming up Numba functions..."
-                )
-            warm_up_affine_scaling()
-            RegularizedAffineScalingCalibrator._warmed_up = True
-            if self.print_init_info:
-                print("Warmed up!")
-
-    def _get_logits(self, X: np.ndarray) -> np.ndarray:
-        log_p, log_1_minus_p = binary_probs_to_logits(X)
-        logits = log_p - log_1_minus_p
-        ones = np.ones_like(logits)
-        return np.hstack([ones, logits])
-
-    def _fit_impl(self, X: np.ndarray, y: np.ndarray) -> None:
-        logits = self._get_logits(X)
-        self.w = fit_affine_scaling(
-            logits,
-            y,
-            penalty=self.penalty,
-            reg_intercept=self.lambda_intercept,
-            max_iter=self.max_iter,
-            tol=self.tol,
-        )
-
-    def predict_proba(self, X: np.ndarray) -> np.ndarray:
-        if self.w is None:
-            raise RuntimeError(
-                "Calibrator has not been fitted yet. Call fit() before predict_proba()."
-            )
-
-        logits = self._get_logits(X)
-        logits = logits.dot(self.w).reshape(-1, 1)
-        probs = 1.0 / (1.0 + np.exp(-logits))
-        return np.hstack([1 - probs, probs])
-
-
-class RegularizedQuadraticScalingCalibrator(Calibrator):
-    """
-    Binary post-hoc calibration with quadratic scaling $sigmoid(a + bx + cx^2)$ on the logits x.
-    Uses the SAGA algorithm to fit the intercept, scaling and quadratic parameters a, b & c.
-    Numba functions are warmed-up the first time the class is initialized (~1s).
-    A penalty (one of ["mcp", "lasso", "ridge"]) can be applied to the intercept and quadratic parameters a & c,
-    with respective regularization strength lambda_intercept and lambda_quadratic.
-    To apply no regularization, leave penalty to None.
-    """
-
-    _warmed_up = False
-
-    def __init__(
-        self,
-        penalty: str = None,
-        lambda_intercept: float = 0.0,
-        lambda_quadratic: float = 0.0,
-        max_iter: int = 200,
-        tol: float = 1e-4,
-        print_init_info: bool = True,
-    ):
-        super().__init__()
-        self.penalty = penalty
-        self.lambda_intercept = lambda_intercept
-        self.lambda_quadratic = lambda_quadratic
-        self.max_iter = max_iter
-        self.tol = tol
-        self.print_init_info = print_init_info
-
-        self.w = None
-
-        if not RegularizedQuadraticScalingCalibrator._warmed_up:
-            if self.print_init_info:
-                print(
-                    "First RegularizedQuadraticScalingCalibrator instantiation - warming up Numba functions..."
-                )
-            warm_up_quadratic_scaling()
-            RegularizedQuadraticScalingCalibrator._warmed_up = True
-            if self.print_init_info:
-                print("Warmed up!")
-
-    def _get_logits(self, X: np.ndarray) -> np.ndarray:
-        log_p, log_1_minus_p = binary_probs_to_logits(X)
-        logits = log_p - log_1_minus_p
-        ones = np.ones_like(logits)
-        return np.hstack([ones, logits, np.square(logits)])
-
-    def _fit_impl(self, X: np.ndarray, y: np.ndarray) -> None:
-        logits = self._get_logits(X)
-        self.w = fit_quadratic_scaling(
-            logits,
-            y,
-            penalty=self.penalty,
-            reg_intercept=self.lambda_intercept,
-            reg_quadratic=self.lambda_quadratic,
-            max_iter=self.max_iter,
-            tol=self.tol,
-        )
-
-    def predict_proba(self, X: np.ndarray) -> np.ndarray:
-        if self.w is None:
-            raise RuntimeError(
-                "Calibrator has not been fitted yet. Call fit() before predict_proba()."
-            )
-
-        logits = self._get_logits(X)
-        logits = logits.dot(self.w).reshape(-1, 1)
-        probs = 1.0 / (1.0 + np.exp(-logits))
-        return np.hstack([1 - probs, probs])
-
-
-##################################
-
-
-### New multiclass calibrators ###
-
-
-class SVSCalibrator(Calibrator):
-    """
-    Multiclass post-hoc calibration with a structured scaling scheme $softmax((aI + diag(v))x + b)$ on the logits x.
-
-    Numba functions are warmed-up the first time the class is initialized with the 'saga' optimizer.
-
-    A penalty (one of [None, "mcp", "lasso", "ridge"]) is applied to the intercept (b) and vector scaling (v) parameters,
-    with respective regularization strength lambda_intercept*(k**rho)/(n**tau) and lambda_diagonal*(k**rho)/(n**tau).
-
-    Instead of fitting the global scaling parameter jointly with the other parameters, the logits are pre-processed with
-    temperature scaling (with no regularization) and a is then fixed to a=1.
-
-    [!] solver = 'bfgs' only supports 'ridge' penalty.
-    [!] For solver = 'bfgs', max_iter and tol are ignored and default torchmin parameters are used.
-    """
-
-    _warmed_up_saga = False
-    _ALLOWED_PENALTIES = [None, "mcp", "lasso", "ridge"]
-    _ALLOWED_OPTIMIZERS = ["saga", "bfgs"]
-
-    def __init__(
-        self,
-        penalty: str = "ridge",
-        rho: float = 1.0,
-        tau: float = 1.0,
-        lambda_intercept: float = 1.0,
-        lambda_diagonal: float = 1.0,
-        opt: str = "bfgs",
-        max_iter: int = 500,
-        tol: float = 1e-5,
-        print_init_info: bool = True,
-    ):
-        super().__init__()
-
-        if penalty not in self._ALLOWED_PENALTIES:
-            raise ValueError(
-                f"Penalty '{penalty}' not recognized. Choose from {self._ALLOWED_PENALTIES}."
-            )
-
-        if opt not in self._ALLOWED_OPTIMIZERS:
-            raise ValueError(
-                f"Optimizer '{opt}' not recognized. Choose from {self._ALLOWED_OPTIMIZERS}."
-            )
-
-        if opt == "bfgs" and penalty != "ridge":
-            raise ValueError(
-                "The 'bfgs' optimizer only supports the 'ridge' penalty, use 'saga' instead."
-            )
-
-        if opt == "saga" and not SVSCalibrator._warmed_up_saga:
-            if print_init_info:
-                logger.info(
-                    "First SVSCalibrator instantiation with 'saga' - warming up Numba functions..."
-                )
-            warm_up_vector_scaling()
-            SVSCalibrator._warmed_up_saga = True
-            if print_init_info:
-                logger.info("Warmed up!")
-
-        self.penalty = penalty
-        self.rho = rho
-        self.tau = tau
-        self.lambda_intercept = lambda_intercept
-        self.lambda_diagonal = lambda_diagonal
-        self.max_iter = max_iter
-        self.tol = tol
-        self.opt = opt
-        self.print_init_info = print_init_info
-
-        self.v = None
-        self.b = None
-        self.ts = None
-
-    def _get_logits(self, X: np.ndarray) -> tuple[np.ndarray, int, int]:
-        logits = multiclass_probs_to_logits(X)
-        return logits
-
-    def _fit_impl(self, X: np.ndarray, y: np.ndarray) -> None:
-        validate_probabilities(X)
-
-        n, k = X.shape
-
-        self.ts = get_calibrator("ts-mix")
-        self.ts.fit(X, y)
-        scaled_probs = self.ts.predict_proba(X)
-
-        reg_intercept = self.lambda_intercept * (k**self.rho) / (n**self.tau)
-        reg_diagonal = self.lambda_diagonal * (k**self.rho) / (n**self.tau)
-
-        logits = self._get_logits(scaled_probs)
-
-        if self.opt == "saga":
-            self.v, self.b = fit_vector_scaling(
-                logits,
-                y,
-                penalty=self.penalty,
-                reg_intercept=reg_intercept,
-                reg_diagonal=reg_diagonal,
-                max_iter=self.max_iter,
-                tol=self.tol,
-            )
-
-        elif self.opt == "bfgs":
-            self.v, self.b = self._fit_bfgs(
-                torch.tensor(logits),
-                torch.tensor(y),
-                num_classes=k,
-                reg_intercept=reg_intercept,
-                reg_diagonal=reg_diagonal,
-            )
-
-    def _fit_bfgs(
-        self,
-        logits: torch.Tensor,
-        y: torch.Tensor,
-        num_classes,
-        reg_intercept,
-        reg_diagonal,
-    ):
-        K = num_classes
-
-        initial_params = torch.zeros(2 * K, device=logits.device, dtype=logits.dtype)
-
-        def loss(params):
-            v_delta = params[:K]
-            b = params[K:]
-
-            scaled_logits = logits * (1.0 + v_delta) + b
-            ce = torch.nn.functional.cross_entropy(scaled_logits, y)
-
-            r_b = reg_intercept * b.pow(2).sum()
-            r_v = reg_diagonal * v_delta.pow(2).sum()
-
-            return ce + r_b + r_v
-
-        method = "l-bfgs" if initial_params.numel() > 1000 else "bfgs"
-        res = torchmin.minimize(loss, initial_params, method=method)
-
-        v_delta_final = res.x[:K]
-        b_final = res.x[K:]
-
-        return (
-            1.0 + v_delta_final
-        ).detach().cpu().numpy(), b_final.detach().cpu().numpy()
-
-    def predict_proba(self, X: np.ndarray) -> np.ndarray:
-        validate_probabilities(X)
-
-        if self.v is None:
-            raise RuntimeError(
-                "Calibrator has not been fitted yet. Call fit() before predict_proba()."
-            )
-
-        scaled_probs = self.ts.predict_proba(X)
-        logits = self._get_logits(scaled_probs)
-        calibrated_logits = self.v.reshape(1, -1) * logits + self.b.reshape(1, -1)
-        calibrated_logits -= np.max(calibrated_logits, axis=1, keepdims=True)
-        exp_logits = np.exp(calibrated_logits)
-        return exp_logits / np.sum(exp_logits, axis=1, keepdims=True)
-
-
-class SMSCalibrator(Calibrator):
-    """
-    Multiclass post-hoc calibration with a structured scaling scheme $softmax( (aI + diag(v) + off_diag(M) )x + b)$ on the logits x.
-
-    Numba functions are warmed-up the first time the class is initialized with the 'saga' optimizer.
-
-    A penalty (one of ["mcp", "lasso", "group_lasso", "ridge"]) is applied to the intercept (b), vector scaling (v) and off diagonal (M) parameters,
-    with respective regularization strength lambda_intercept*(k**rho)/(n**tau), lambda_diagonal*(k**rho)/(n**tau) and lambda_off_diagonal*((k*(k-1))**rho)/(n**tau).
-
-    Instead of fitting the global scaling parameter jointly with the other parameters, the logits are pre-processed with
-    temperature scaling (with no regularization) and a is then fixed to a=1.
-
-    [!] The 'bfgs' solver only supports 'ridge' penalty.
-    [!] If 'bfgs' solver is used, max_iter and tol are ignored and default torchmin parameters are used.
-    """
-
-    _warmed_up_saga = False
-    _ALLOWED_PENALTIES = [None, "mcp", "lasso", "group_lasso", "ridge"]
-    _ALLOWED_OPTIMIZERS = ["saga", "bfgs"]
-
-    def __init__(
-        self,
-        penalty: str = "ridge",
-        rho: float = 1.0,
-        tau: float = 1.0,
-        lambda_intercept: float = 1.0,
-        lambda_diagonal: float = 1.0,
-        lambda_off_diagonal: float = 1.0,
-        opt: str = "bfgs",
-        max_iter: int = 500,
-        tol: float = 1e-5,
-        print_init_info: bool = True,
-    ):
-        super().__init__()
-
-        if penalty not in self._ALLOWED_PENALTIES:
-            raise ValueError(
-                f"Penalty '{penalty}' not recognized. Choose from {self._ALLOWED_PENALTIES}."
-            )
-
-        if opt not in self._ALLOWED_OPTIMIZERS:
-            raise ValueError(
-                f"Optimizer '{opt}' not recognized. Choose from {self._ALLOWED_OPTIMIZERS}."
-            )
-
-        if opt == "bfgs" and penalty != "ridge":
-            raise ValueError(
-                "The 'bfgs' optimizer only supports the 'ridge' penalty, use 'saga' instead."
-            )
-
-        if opt == "saga" and not SMSCalibrator._warmed_up_saga:
-            if print_init_info:
-                logger.info(
-                    "First SMSCalibrator instantiation with 'saga' - warming up Numba functions..."
-                )
-            warm_up_matrix_scaling()
-            SMSCalibrator._warmed_up_saga = True
-            if print_init_info:
-                logger.info("Warmed up!")
-
-        self.penalty = penalty
-        self.rho = rho
-        self.tau = tau
-        self.lambda_intercept = lambda_intercept
-        self.lambda_diagonal = lambda_diagonal
-        self.lambda_off_diagonal = lambda_off_diagonal
-        self.max_iter = max_iter
-        self.tol = tol
-        self.opt = opt
-        self.print_init_info = print_init_info
-
-        self.W = None
-        self.ts = None
-
-    def _get_logits(
-        self, X: np.ndarray, append_bias: bool = True
-    ) -> tuple[np.ndarray, int, int]:
-        logits = multiclass_probs_to_logits(X)
-        if append_bias:
-            # Adding a constant term to logit vectors to fit the intercept.
-            logits = np.hstack([logits, np.ones((len(logits), 1))])
-        return logits
-
-    def _fit_impl(self, X: np.ndarray, y: np.ndarray) -> None:
-        """Fits the calibration matrix W."""
-        validate_probabilities(X)
-
-        n, k = X.shape
-
-        self.ts = get_calibrator("ts-mix")
-        self.ts.fit(X, y)
-        scaled_probs = self.ts.predict_proba(X)
-
-        reg_intercept = self.lambda_intercept * (k**self.rho) / (n**self.tau)
-        reg_diagonal = self.lambda_diagonal * (k**self.rho) / (n**self.tau)
-        reg_off_diagonal = (
-            self.lambda_off_diagonal * ((k * (k - 1)) ** self.rho) / (n**self.tau)
-        )
-
-        if self.opt == "saga":
-            logits = self._get_logits(scaled_probs)
-            self.W = fit_matrix_scaling(
-                logits,
-                y,
-                penalty=self.penalty,
-                max_iter=self.max_iter,
-                tol=self.tol,
-                init_scaling=1.0,
-                reg_intercept=reg_intercept,
-                reg_diagonal=reg_diagonal,
-                reg_off_diagonal=reg_off_diagonal,
-            )
-
-        elif self.opt == "bfgs":
-            logits = self._get_logits(scaled_probs, append_bias=False)
-            self.W = self._fit_bfgs(
-                torch.tensor(logits),
-                torch.tensor(y),
-                num_classes=k,
-                reg_intercept=reg_intercept,
-                reg_diagonal=reg_diagonal,
-                reg_off_diagonal=reg_off_diagonal,
-            )
-
-    def _fit_bfgs(
-        self,
-        logits: torch.Tensor,
-        y: torch.Tensor,
-        num_classes,
-        reg_intercept,
-        reg_diagonal,
-        reg_off_diagonal,
-    ):
-        K = num_classes
-
-        initial_params = torch.zeros(
-            K * (K + 1), device=logits.device, dtype=logits.dtype
-        )
-
-        def loss(params):
-            W_delta = params[: K * K].view(K, K)
-            b = params[K * K :]
-
-            scaled_logits = logits + torch.nn.functional.linear(logits, W_delta, b)
-            ce = torch.nn.functional.cross_entropy(scaled_logits, y)
-
-            W_diag = W_delta.diagonal()
-            r_i = reg_intercept * b.pow(2).sum()
-            r_d = reg_diagonal * W_diag.pow(2).sum()
-            r_o = reg_off_diagonal * (W_delta.pow(2).sum() - W_diag.pow(2).sum())
-
-            return ce + r_i + r_d + r_o
-
-        method = "l-bfgs" if initial_params.numel() > 1000 else "bfgs"
-        res = torchmin.minimize(loss, initial_params, method=method)
-
-        flat_result = res.x
-        W_delta_final = flat_result[: K * K].view(K, K)
-        b_final = flat_result[K * K :]
-
-        with torch.no_grad():
-            W_final = (
-                torch.eye(K, device=logits.device, dtype=logits.dtype) + W_delta_final
-            )
-            return torch.hstack([W_final, b_final.unsqueeze(1)]).detach().cpu().numpy()
-
-    def predict_proba(self, X: np.ndarray) -> np.ndarray:
-        validate_probabilities(X)
-
-        if self.W is None:
-            raise RuntimeError(
-                "Calibrator has not been fitted yet. Call fit() before predict_proba()."
-            )
-
-        scaled_probs = self.ts.predict_proba(X)
-        logits = self._get_logits(scaled_probs)
-        calibrated_logits = logits @ self.W.T
-        calibrated_logits -= np.max(calibrated_logits, axis=1, keepdims=True)
-        exp_logits = np.exp(calibrated_logits)
-        return exp_logits / np.sum(exp_logits, axis=1, keepdims=True)
-
-
-##################################
-
-
-### Default logistic calibrator, compatible with binary and multiclass ###
-
-
-class LogisticCalibrator(Calibrator):
-    """
-    By default, uses 'quadratic-scaling' for binary classification and 'sms' for multiclass classification.
-    """
-
-    VALID_BINARY_TYPES = ["linear", "affine", "quadratic"]
-    VALID_MULTICLASS_TYPES = ["svs", "sms"]
-
-    def __init__(self, binary_type: str = "quadratic", multiclass_type: str = "sms"):
-        super().__init__()
-
-        if binary_type not in self.VALID_BINARY_TYPES:
-            raise ValueError(
-                f"Invalid binary type '{binary_type}'. Must be one of {self.VALID_BINARY_TYPES}"
-            )
-        self.binary_type = binary_type
-
-        if multiclass_type not in self.VALID_MULTICLASS_TYPES:
-            raise ValueError(
-                f"Invalid multiclass type '{multiclass_type}'. Must be one of {self.VALID_MULTICLASS_TYPES}"
-            )
-        self.multiclass_type = multiclass_type
-
-        self.cal_ = None
-
-    def _fit_impl(self, X: np.ndarray, y: np.ndarray) -> None:
-        n_classes = len(np.unique(y))
-
-        if n_classes == 2:
-            if self.binary_type == "affine":
-                self.cal_ = BinaryLogisticCalibrator(type="affine")
-            elif self.binary_type == "linear":
-                self.cal_ = BinaryLogisticCalibrator(type="linear")
-            elif self.binary_type == "quadratic":
-                self.cal_ = BinaryLogisticCalibrator(type="quadratic")
-
-        else:
-            if self.multiclass_type == "svs":
-                self.cal_ = SVSCalibrator()
-            elif self.multiclass_type == "sms":
-                self.cal_ = SMSCalibrator()
-
-        self.cal_.fit(X, y)
-        return self
-
-    def predict(self, X):
-        if self.cal_ is None:
-            raise RuntimeError("LogisticCalibrator must be fitted before prediction.")
-        return self.cal_.predict(X)
-
-    def predict_proba(self, X):
-        if self.cal_ is None:
-            raise RuntimeError("LogisticCalibrator must be fitted before prediction.")
-        return self.cal_.predict_proba(X)
-
-
-##################################
-
-
-class GuoTemperatureScalingCalibrator(Calibrator):
-    # adapted from https://github.com/gpleiss/temperature_scaling/blob/master/temperature_scaling.py
-    def __init__(self):
-        super().__init__()
-        self.temperature = nn.Parameter(torch.ones(1) * 1.5)
-
-    def temperature_scale(self, logits):
-        """
-        Perform temperature scaling on logits
-        """
-        # Expand temperature to match the size of logits
-        temperature = self.temperature.unsqueeze(1).expand(
-            logits.size(0), logits.size(1)
-        )
-        return logits / temperature
-
-    def _fit_torch_impl(
-        self, y_pred: CategoricalDistribution, y_true_labels: torch.Tensor
-    ):
-        logits = y_pred.get_logits()
-        labels = y_true_labels
-
-        nll_criterion = nn.CrossEntropyLoss().cuda()
-
-        # Next: optimize the temperature w.r.t. NLL
-        optimizer = torch.optim.LBFGS([self.temperature], lr=0.01, max_iter=50)
-
-        def eval():
-            optimizer.zero_grad()
-            loss = nll_criterion(self.temperature_scale(logits), labels)
-            loss.backward()
-            return loss
-
-        optimizer.step(eval)
-
-        return self
-
-    def predict_proba_torch(
-        self, y_pred: CategoricalDistribution
-    ) -> CategoricalDistribution:
-        with torch.no_grad():
-            return CategoricalLogits(y_pred.get_logits() / self.temperature)
-
-
-class AutoGluonTemperatureScalingCalibrator(Calibrator):
-    # adapted from
-    # https://github.com/autogluon/autogluon/blob/c1181326cf6b7e3b27a7420273f1a82808d939e2/core/src/autogluon/core/calibrate/temperature_scaling.py#L9
-    # https://github.com/autogluon/autogluon/blob/28a242ebe8d55ba770c991b9db153ab4623c9abd/tabular/src/autogluon/tabular/trainer/abstract_trainer.py#L4433-L4457
-    def __init__(self, init_val: float = 1, max_iter: int = 200, lr: float = 0.1):
-        super().__init__()
-        self.init_val = init_val
-        self.max_iter = max_iter
-        self.lr = lr
-        self.temperature = init_val
-
-    def _fit_torch_impl(
-        self, y_pred: CategoricalDistribution, y_true_labels: torch.Tensor
-    ):
-        y_val_tensor = y_true_labels
-        temperature_param = torch.nn.Parameter(torch.ones(1).fill_(self.init_val))
-        logits = y_pred.get_logits()
-
-        is_invalid = torch.isinf(logits).any().tolist()
-        if is_invalid:
-            return
-
-        nll_criterion = torch.nn.CrossEntropyLoss()
-        optimizer = torch.optim.LBFGS(
-            [temperature_param], lr=self.lr, max_iter=self.max_iter
-        )
-        scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.99)
-
-        optimizer_trajectory = []
-
-        if self.init_val != 1.0:
-            # need to check 1.0 as well since AutoGluon does it outside
-            optimizer_trajectory.append(
-                (nll_criterion(logits, y_val_tensor).item(), 1.0)
-            )
-
-        def temperature_scale_step():
-            optimizer.zero_grad()
-            temp = temperature_param.unsqueeze(1).expand(logits.size(0), logits.size(1))
-            new_logits = logits / temp
-            loss = nll_criterion(new_logits, y_val_tensor)
-            loss.backward()
-            scheduler.step()
-            optimizer_trajectory.append((loss.item(), temperature_param.item()))
-            return loss
-
-        optimizer.step(temperature_scale_step)
-
-        try:
-            best_loss_index = np.nanargmin(np.array(optimizer_trajectory)[:, 0])
-        except ValueError:
-            self.temperature = 1.0
-            return
-        temperature_scale = float(np.array(optimizer_trajectory)[best_loss_index, 1])
-
-        if np.isnan(temperature_scale) or temperature_scale <= 0.0:
-            self.temperature = 1.0
-            return
-
-        self.temperature = temperature_scale
-
-    def predict_proba_torch(
-        self, y_pred: CategoricalDistribution
-    ) -> CategoricalDistribution:
-        with torch.no_grad():
-            return CategoricalLogits(y_pred.get_logits() / self.temperature)
-
-
-class AutoGluonInverseTemperatureScalingCalibrator(Calibrator):
-    # adapted from
-    # https://github.com/autogluon/autogluon/blob/c1181326cf6b7e3b27a7420273f1a82808d939e2/core/src/autogluon/core/calibrate/temperature_scaling.py#L9
-    # but optimizing the inverse temperature instead
-    def __init__(self, init_val: float = 1, max_iter: int = 200, lr: float = 0.1):
-        super().__init__()
-        self.init_val = init_val
-        self.max_iter = max_iter
-        self.lr = lr
-        self.inv_temp = init_val
-
-    def _fit_torch_impl(
-        self, y_pred: CategoricalDistribution, y_true_labels: torch.Tensor
-    ):
-        y_val_tensor = y_true_labels
-        inv_temperature_param = torch.nn.Parameter(torch.ones(1).fill_(self.init_val))
-        logits = y_pred.get_logits()
-
-        is_invalid = torch.isinf(logits).any().tolist()
-        if is_invalid:
-            return
-
-        nll_criterion = torch.nn.CrossEntropyLoss()
-        optimizer = torch.optim.LBFGS(
-            [inv_temperature_param], lr=self.lr, max_iter=self.max_iter
-        )
-        scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.99)
-
-        optimizer_trajectory = []
-
-        def temperature_scale_step():
-            optimizer.zero_grad()
-            inv_temp = inv_temperature_param.unsqueeze(1).expand(
-                logits.size(0), logits.size(1)
-            )
-            new_logits = logits * inv_temp
-            loss = nll_criterion(new_logits, y_val_tensor)
-            loss.backward()
-            scheduler.step()
-            optimizer_trajectory.append((loss.item(), inv_temperature_param.item()))
-            return loss
-
-        optimizer.step(temperature_scale_step)
-
-        try:
-            best_loss_index = np.nanargmin(np.array(optimizer_trajectory)[:, 0])
-        except ValueError:
-            return
-        inv_temperature_scale = float(
-            np.array(optimizer_trajectory)[best_loss_index, 1]
-        )
-
-        if np.isnan(inv_temperature_scale):
-            return
-
-        self.inv_temp = inv_temperature_scale
-
-    def predict_proba_torch(
-        self, y_pred: CategoricalDistribution
-    ) -> CategoricalDistribution:
-        with torch.no_grad():
-            return CategoricalLogits(y_pred.get_logits() * self.inv_temp)
-
-
-class TorchUncertaintyTemperatureScalingCalibrator(TemperatureScalingCalibrator):
-    # adapted from
-    # https://github.com/ENSTA-U2IS-AI/torch-uncertainty/blob/3a021d2e34e183b8aad3a0345e6d750c08c72af3/torch_uncertainty/post_processing/calibration/scaler.py#L1
-    # https://github.com/ENSTA-U2IS-AI/torch-uncertainty/blob/3a021d2e34e183b8aad3a0345e6d750c08c72af3/torch_uncertainty/post_processing/calibration/temperature_scaler.py#L9
-    def __init__(self, init_val: float = 1, lr: float = 0.1, max_iter: int = 100):
-        super().__init__(
-            opt="lbfgs",
-            lr=lr,
-            max_iter=max_iter,
-            use_inv_temp=False,
-            inv_temp_init=1.0 / init_val,
-        )
-        # need to save these values here to comply with sklearn cloneability conventions
-        self.init_val = init_val
-        self.lr = lr
-        self.max_iter = max_iter
-
-class TabPFNCalibrator(Calibrator):
-    def __init__(self):
-        super().__init__()
-
-    def _fit_impl(self, X: np.ndarray, y: np.ndarray) -> None:
-        from tabpfn import TabPFNClassifier
-        
-        self.cal_ = TabPFNClassifier()
-        
-        probs = process_binary_probs(X)
-
-        self.cal_.fit(probs.reshape(-1,1), y)
-
-    def predict_proba(self, X: np.ndarray) -> np.ndarray:
-        
-        probs = process_binary_probs(X)
-
-        result = self.cal_.predict_proba(probs.reshape(-1,1))
-        if len(result.shape) == 1:
-            return np.stack([1.0 - result, result], axis=1)
-        elif result.shape[1] == 1:
-            return np.concatenate([1.0 - result, result], axis=1)
-
-        return result
-
-class BetacalCalibrator(Calibrator):
-    # From https://github.com/betacal/python/
-    def __init__(self, parameters="abm"):
-        super().__init__()
-        
-        from betacal import BetaCalibration
-        
-        self.cal_ = BetaCalibration(parameters=parameters)
-
-    def _fit_impl(self, X: np.ndarray, y: np.ndarray) -> None:
-        probs = process_binary_probs(X)
-        
-        self.cal_.fit(probs, y)
-
-    def predict_proba(self, X: np.ndarray) -> np.ndarray:
-        probs = process_binary_probs(X)
-        
-        result = self.cal_.predict(probs)
-        
-        if len(result.shape) == 1:
-            return np.stack([1.0 - result, result], axis=1)
-        elif result.shape[1] == 1:
-            return np.concatenate([1.0 - result, result], axis=1)
-
-        return result
-    
-class MLISplineCalibrator(Calibrator):
-    # From https://github.com/numeristical/introspective
-    def __init__(self):
-        super().__init__()
-
-        from ml_insights import SplineCalib
-
-        self.cal_ = SplineCalib()
-
-    def _fit_impl(self, X: np.ndarray, y: np.ndarray) -> None:
-        probs = process_binary_probs(X)
-        
-        self.cal_.fit(probs, y)
-
-    def predict_proba(self, X: np.ndarray) -> np.ndarray:
-        probs = process_binary_probs(X)
-
-        result = self.cal_.calibrate(probs)
-        
-        if len(result.shape) == 1:
-            return np.stack([1.0 - result, result], axis=1)
-        elif result.shape[1] == 1:
-            return np.concatenate([1.0 - result, result], axis=1)
-
-        return result
-
-class NetcalTemperatureScalingCalibrator(Calibrator):
-    # this one does nothing due to https://github.com/EFS-OpenSource/calibration-framework/issues/61
-    def __init__(self):
-        super().__init__()
-        from netcal.scaling import TemperatureScaling
-
-        self.cal_ = TemperatureScaling()
-
-    def _fit_impl(self, X: np.ndarray, y: np.ndarray) -> None:
-        if X.shape[1] == 2:
-            # binary, convert
-            X = X[:, 1]
-        self.cal_.fit(X, y, random_state=0)
-
-    def predict_proba(self, X: np.ndarray) -> np.ndarray:
-        if X.shape[1] == 2:
-            # binary, convert
-            X = X[:, 1]
-        result = self.cal_.transform(X)
-        if len(result.shape) == 1:
-            return np.stack([1.0 - result, result], axis=1)
-        elif result.shape[1] == 1:
-            return np.concatenate([1.0 - result, result], axis=1)
-
-        return result
-
-
-class TorchcalTemperatureScalingCalibrator(Calibrator):
-    # adapted from
-    # https://github.com/rishabh-ranjan/torchcal/blob/3fb65f6423d33d680cd68c7f40a0259d41e8fb0b/torchcal.py#L8
-    def __init__(self):
-        super().__init__()
-        self.temperature = 1.0
-
-    def _fit_torch_impl(
-        self, y_pred: CategoricalDistribution, y_true_labels: torch.Tensor
-    ):
-        y_pred_logits = y_pred.get_logits()
-
-        temp = torch.ones(1, device=y_pred_logits.device)
-
-        def loss(t):
-            return torch.nn.functional.cross_entropy(y_pred_logits / t, y_true_labels)
-
-        res = torchmin.minimize(loss, temp, method="newton-exact")
-        if not res.success:
-            warnings.warn(
-                f"{self.__class__}: {res.message} Not updating calibrator params."
-            )
-        else:
-            temp = res.x
-
-        self.temperature = temp.item()
-
-    def predict_proba_torch(
-        self, y_pred: CategoricalDistribution
-    ) -> CategoricalDistribution:
-        return CategoricalLogits(y_pred.get_logits() / self.temperature)
-
-
-class TorchcalVectorScalingCalibrator(Calibrator):
-    # adapted from
-    # https://github.com/rishabh-ranjan/torchcal/blob/3fb65f6423d33d680cd68c7f40a0259d41e8fb0b/torchcal.py#L8
-    def __init__(self):
-        super().__init__()
-
-    def _fit_torch_impl(
-        self, y_pred: CategoricalDistribution, y_true_labels: torch.Tensor
-    ):
-        y_pred_logits = y_pred.get_logits()
-
-        num_classes = y_pred_logits.shape[1]
-
-        self.temp = torch.ones(num_classes, device=y_pred_logits.device)
-        self.bias = torch.zeros(num_classes, device=y_pred_logits.device)
-
-        def loss(temp_bias):
-            temp, bias = temp_bias.split([num_classes, num_classes])
-            return torch.nn.functional.cross_entropy(
-                y_pred_logits / temp + bias, y_true_labels
-            )
-
-        temp_bias = torch.cat([self.temp, self.bias])
-        res = torchmin.minimize(loss, temp_bias, method="bfgs")
-        if not res.success:
-            warnings.warn(
-                f"{self.__class__}: {res.message} Not updating calibrator params."
-            )
-        else:
-            self.temp, self.bias = res.x.split([num_classes, num_classes])
-
-    def predict_proba_torch(
-        self, y_pred: CategoricalDistribution
-    ) -> CategoricalDistribution:
-        return CategoricalLogits(y_pred.get_logits() / self.temp + self.bias)
-
-
-class TorchcalMatrixScalingCalibrator(Calibrator):
-    def __init__(self):
-        super().__init__()
-
-    def _fit_torch_impl(
-        self, y_pred: CategoricalDistribution, y_true_labels: torch.Tensor
-    ):
-        y_pred_logits = y_pred.get_logits()
-
-        num_classes = y_pred_logits.shape[1]
-
-        num_params = num_classes * (num_classes + 1)
-        if num_params <= 1000:
-            method = "bfgs"
-        else:
-            method = "l-bfgs"
-
-        self.itemp = torch.eye(
-            num_classes,
-            num_classes,
-            device=y_pred_logits.device,
-            dtype=y_pred_logits.dtype,
-        )
-        self.bias = torch.zeros(
-            num_classes, device=y_pred_logits.device, dtype=y_pred_logits.dtype
-        )
-
-        def loss(itemp_bias):
-            itemp, bias = itemp_bias.split([num_classes**2, num_classes])
-            itemp = itemp.view(num_classes, num_classes)
-            return torch.nn.functional.cross_entropy(
-                y_pred_logits @ itemp + bias, y_true_labels
-            )
-
-        itemp_bias = torch.cat([self.itemp.view(-1), self.bias])
-        res = torchmin.minimize(loss, itemp_bias, method=method)
-        if not res.success:
-            warnings.warn(
-                f"{self.__class__}: {res.message} Not updating calibrator params."
-            )
-        else:
-            self.itemp, self.bias = res.x.split([num_classes**2, num_classes])
-            self.itemp = self.itemp.view(num_classes, num_classes)
-
-    def predict_proba_torch(
-        self, y_pred: CategoricalDistribution
-    ) -> CategoricalDistribution:
-        return CategoricalLogits(y_pred.get_logits() @ self.itemp + self.bias)
-
-
-class CenteredIsotonicRegressionCalibrator(Calibrator):
-    def __init__(self):
-        super().__init__()
-
-        from cir_model import CenteredIsotonicRegression
-
-        self.cal_ = CenteredIsotonicRegression()
-
-    def _fit_impl(self, X: np.ndarray, y: np.ndarray) -> None:
-        # TODO
-        # we have to use float64 since with float32 it can happen that somewhere internally
-        # a rounding error occurs (?) and interp1d thinks that it is asked to extrapolate at the boundary value,
-        # which causes it to output nan values,
-        # which are seen as a separate possible value but (y==value) is empty because nan==nan is false,
-        # so an error occurs because the method attempts to take an average of an empty set
-
-        probs = process_binary_probs(X)
-
-        probs = probs.astype(np.float64)
-        y = y.astype(np.float64)
-
-        self.cal_.fit(probs, y)
-        self.min_ = np.min(probs)
-        self.max_ = np.max(probs)
-
-    def predict_proba(self, X):
-        probs = process_binary_probs(X)
-        probs = probs.astype(np.float64)
-
-        # have to clip since CenteredIsotonicRegression refuses to extrapolate (?)
-        probs = np.clip(probs, self.min_, self.max_)
-        
-        result = self.cal_.transform(probs)
-        result = np.clip(result, 0.0, 1.0)
-        return np.stack([1.0 - result, result], axis=-1)
-
-
-class BinaryVennAbersCalibrator(Calibrator):
-    def __init__(self):
-        super().__init__()
-
-        from venn_abers import VennAbers
-
-        self.cal_ = VennAbers()
-
-    def _fit_impl(self, X: np.ndarray, y: np.ndarray) -> None:
-        self.cal_.fit(X, y)
-
-    def predict_proba(self, X):
-        return self.cal_.predict_proba(X)[0]
-
-
-class VennAbersCalibrator(Calibrator):
-    def __init__(self, use_ovo: bool = False):
-        super().__init__()
-        self.use_ovo = use_ovo
-
-    def _fit_impl(self, X: np.ndarray, y: np.ndarray) -> None:
-        self.X_ = np.copy(X)
-        self.y_ = np.copy(y)
-
-    def predict_proba(self, X):
-        from venn_abers import VennAbersCalibrator
-
-        va = VennAbersCalibrator()
-        return va.predict_proba(
-            p_cal=self.X_,
-            y_cal=self.y_,
-            p_test=X,
-            p0_p1_output=False,
-            va_type="one_vs_one" if self.use_ovo else "one_vs_all",
-        )
-
-
-class MulticlassOneVsOneCalibrator(Calibrator):
-    def __init__(self, binary_calibrator: BaseEstimator):
-        super().__init__()
-        self.binary_calibrator = binary_calibrator
-
-    def _fit_impl(self, X: np.ndarray, y: np.ndarray) -> None:
-        self.n_classes_ = X.shape[-1]
-        self.bin_calibs_ = []
-
-        if self.n_classes_ == 2:
-            # binary classification
-            bin_calib = sklearn.base.clone(self.binary_calibrator)
-            bin_calib.fit(X, y)
-            self.bin_calibs_.append(bin_calib)
-        else:
-            for i in range(self.n_classes_):
-                for j in range(i + 1, self.n_classes_):
-                    idxs = np.logical_or((y == i), (y == j))
-                    # idxs = np.arange(X.shape[0])
-                    bin_probs = np.stack([X[idxs, j], X[idxs, i]], axis=-1)
-                    bin_probs += 1e-30
-                    bin_probs /= np.sum(bin_probs, axis=-1, keepdims=True)
-                    # print(f'{np.any(np.isnan(bin_probs))=}')
-                    bin_labels = (y[idxs] == i).astype(np.int32)
-                    bin_calib = sklearn.base.clone(self.binary_calibrator)
-                    bin_calib.fit(bin_probs, bin_labels)
-                    self.bin_calibs_.append(bin_calib)
-
-    def predict_proba(self, X):
-        if self.n_classes_ == 2:
-            # binary classification
-            return self.bin_calibs_[0].predict_proba(X)
-        else:
-            # use PKPD formula in http://proceedings.mlr.press/v60/manokhin17a/manokhin17a.pdf
-            pair_probs = [[None] * self.n_classes_ for i in range(self.n_classes_)]
-            multi_probs = []
-            calib_idx = 0
-            for i in range(self.n_classes_):
-                for j in range(i + 1, self.n_classes_):
-                    bin_probs = np.stack([X[:, j], X[:, i]], axis=-1)
-                    bin_probs += 1e-30
-                    bin_probs /= np.sum(bin_probs, axis=-1, keepdims=True)
-                    pred_probs = self.bin_calibs_[calib_idx].predict_proba(bin_probs)
-                    pair_probs[i][j] = pred_probs[:, 1]
-                    pair_probs[j][i] = pred_probs[:, 0]
-                    # if i == 0 and j == 1:
-                    #     print(f'{bin_probs=}')
-                    #     print(f'{pred_probs=}')
-                    calib_idx += 1
-
-            for i in range(self.n_classes_):
-                sum_inv_probs = sum(
-                    [
-                        1.0 / (1e-30 + pair_probs[i][j])
-                        for j in range(self.n_classes_)
-                        if j != i
-                    ]
-                )
-                multi_probs.append(
-                    1.0 / np.clip(sum_inv_probs - (self.n_classes_ - 2), 1e-30, np.inf)
-                )
-
-            multi_probs = np.stack(multi_probs, axis=-1)
-            multi_probs = np.clip(multi_probs, a_min=1e-30, a_max=np.inf)
-            multi_probs = multi_probs / np.sum(multi_probs, axis=-1, keepdims=True)
-            return multi_probs
-
-
-class MulticlassOneVsRestCalibrator(Calibrator):
-    def __init__(self, binary_calibrator: BaseEstimator):
-        super().__init__()
-        self.binary_calibrator = binary_calibrator
-
-    def _fit_impl(self, X: np.ndarray, y: np.ndarray) -> None:
-        self.n_classes_ = X.shape[-1]
-        self.bin_calibs_ = []
-
-        X[np.isnan(X)] = 0.0
-
-        if self.n_classes_ == 2:
-            # binary classification
-            bin_calib = sklearn.base.clone(self.binary_calibrator)
-            bin_calib.fit(X, y)
-            self.bin_calibs_.append(bin_calib)
-        else:
-            for i in range(self.n_classes_):
-                pos_probs = X[:, i]
-                neg_probs = 1.0 - pos_probs
-                bin_probs = np.stack([neg_probs, pos_probs], axis=-1)
-                bin_labels = (y == i).astype(np.int32)
-                bin_calib = sklearn.base.clone(self.binary_calibrator)
-                bin_calib.fit(bin_probs, bin_labels)
-                self.bin_calibs_.append(bin_calib)
-
-    def predict_proba(self, X):
-        X[np.isnan(X)] = 0.0
-        if self.n_classes_ == 2:
-            # binary classification
-            return self.bin_calibs_[0].predict_proba(X)
-        else:
-            multi_probs = []
-            for i in range(self.n_classes_):
-                pos_probs = X[:, i]
-                neg_probs = 1.0 - pos_probs
-                bin_probs = np.stack([neg_probs, pos_probs], axis=-1)
-                pred_probs = self.bin_calibs_[i].predict_proba(bin_probs)
-                multi_probs.append(pred_probs[:, 1])
-            multi_probs = np.stack(multi_probs, axis=-1)
-            multi_probs = np.clip(multi_probs, a_min=1e-30, a_max=np.inf)
-            multi_probs = multi_probs / np.sum(multi_probs, axis=-1, keepdims=True)
-            return multi_probs
-
-
-class IdentityEstimator(ClassifierMixin, BaseEstimator):
-    def __init__(self, n_classes: int):
-        self.n_classes = n_classes
-        # somehow it can fail if we don't do the np.asarray()
-        self.classes_ = np.asarray(list(range(n_classes)))
-
-    def fit(self, X, y):
-        # self.classes_ = np.unique(y)  # if we do this we have a problem for missing classes
-        return self
-
-    def predict_proba(self, X):
-        return X
-
-    def predict(self, X):
-        # having this as a dummy here for the FrozenEstimator solution
-        # which hovewer doesn't work right now
-        raise NotImplementedError()
-
-
-class PreprocessingCalibratorWrapper(Calibrator):
-    def __init__(
-        self,
-        calibrator: Calibrator,
-        pre_calib: Optional[Calibrator] = None,
-        l2_normalize: bool = False,
-    ):
-        super().__init__()
-        self.calibrator = calibrator
-        self.pre_calib = pre_calib
-        self.l2_normalize = l2_normalize
-
-        # options: robust scaling
-        # pre-TS
-
-    def _fit_torch_impl(
-        self, y_pred: CategoricalDistribution, y_true_labels: torch.Tensor
-    ) -> "Calibrator":
-        if self.pre_calib:
-            self.pre_calib_: Calibrator = sklearn.base.clone(self.pre_calib)
-            self.pre_calib_.fit_torch(y_pred, y_true_labels)
-            y_pred = self.pre_calib_.predict_proba_torch(y_pred)
-        if self.l2_normalize:
-            logits = y_pred.get_logits()
-            logits = logits - logits.mean(dim=-1, keepdim=True)
-            self.factor_ = 1.0 / (logits.square().mean().sqrt().item() + 1e-8)
-            logits = self.factor_ * logits
-            y_pred = CategoricalLogits(logits)
-
-        self.calib_: Calibrator = sklearn.base.clone(self.calibrator)
-        self.calib_.fit_torch(y_pred, y_true_labels)
-
-    def predict_proba_torch(
-        self, y_pred: CategoricalDistribution
-    ) -> CategoricalDistribution:
-        if self.pre_calib:
-            y_pred = self.pre_calib_.predict_proba_torch(y_pred)
-        if self.l2_normalize:
-            logits = y_pred.get_logits()
-            logits = logits - logits.mean(dim=-1, keepdim=True)
-            logits = self.factor_ * logits
-            y_pred = CategoricalLogits(logits)
-
-        return self.calib_.predict_proba_torch(y_pred)
-
-
-class SklearnCalibrator(Calibrator):
-    def __init__(self, method: str, cv: str = "prefit"):
-        assert cv == 'prefit', "Other methods than prefit not intended and not implemented for sklearn > 1.6"
-        super().__init__()
-        self.method = method
-        self.cv = cv
-
-    @staticmethod
-    def _normalize_rows(A: np.ndarray) -> np.ndarray:
-        """Row-normalize so rows sum to 1; safe if a row sums to 0."""
-        s = A.sum(axis=1, keepdims=True)
-        s = np.where(s == 0.0, 1.0, s)
-        return A / s
-
-    def _fit_impl(self, X: np.ndarray, y: np.ndarray) -> None:
-        if X.ndim == 1:
-            X = np.stack((1.0 - X, X), axis=1)
-
-        y = np.asarray(y)
-
-        self.n_classes_ = X.shape[-1]
-        self.classes_ = np.arange(self.n_classes_)
-
-        # Fit label encoder up-front (used for the missing-class path)
-        self.label_encoder_ = LabelEncoder()
-        y_mapped = self.label_encoder_.fit_transform(y)
-        self.present_classes_ = self.label_encoder_.classes_
-        self.missing_classes_ = np.setdiff1d(self.classes_, self.present_classes_)
-
-        # ---------- Case 1: no missing classes -> original formulation ----------
-        if self.missing_classes_.size == 0:
-            est = IdentityEstimator(self.n_classes_)
-            est.fit(X, y)
-
-            try:
-                # sklearn >= 1.6
-                from sklearn.frozen import FrozenEstimator
-                self.calib_ = CalibratedClassifierCV(FrozenEstimator(est), method=self.method)
-            except ImportError:
-                # sklearn < 1.6
-                self.calib_ = CalibratedClassifierCV(est, method=self.method, cv=self.cv)
-
-            self.calib_.fit(X, y)
-            return
-
-        # ---------- Case 2: missing classes -> slice + normalize X_present, use y_mapped ----------
-        X_present = self._normalize_rows(X[:, self.present_classes_])
-
-        est = IdentityEstimator(len(self.present_classes_))
-        est.fit(X_present, y_mapped)
-
-        try:
-            from sklearn.frozen import FrozenEstimator
-            self.calib_ = CalibratedClassifierCV(FrozenEstimator(est), method=self.method)
-        except ImportError:
-            self.calib_ = CalibratedClassifierCV(est, method=self.method, cv=self.cv)
-
-        self.calib_.fit(X_present, y_mapped)
-
-    def predict_proba(self, X: np.ndarray) -> np.ndarray:
-        if X.ndim == 1:
-            X = np.stack((1.0 - X, X), axis=1)
-
-        # If fit saw all classes, keep original behavior
-        if self.missing_classes_.size == 0:
-            return self.calib_.predict_proba(X)
-
-        # Missing classes: normalize the same sliced X_present and zero-fill missing classes (Option A)
-        X_present = self._normalize_rows(X[:, self.present_classes_])
-        p_present = self.calib_.predict_proba(X_present)
-
-        p_full = np.zeros((X.shape[0], self.n_classes_), dtype=p_present.dtype)
-        p_full[:, self.present_classes_] = p_present
-        return p_full
-
-
-class WrapCalibratorAsClassifier(BaseEstimator, ClassifierMixin):
-    def __init__(self, calib: BaseEstimator):
-        self.calib = calib
-
-    def fit(self, X, y):
-        # print(f'Fitting calibrator: {self.calib}')
-        self.calib_ = sklearn.base.clone(self.calib)
-        self.calib_.fit(X, y)
-        self.classes_ = list(range(np.max(y) + 1))
-        return self
-
-    def predict_proba(self, X):
-        return self.calib_.predict_proba(X)
-
-
-def logloss_np(y_true: np.ndarray, y_proba: np.ndarray):
-    return -np.mean(np.take_along_axis(np.log(y_proba), y_true[:, None], axis=1))
-
-
-class DirichletCalibrator(Calibrator):
-    def __init__(
-        self,
-        n_cv: int = 0,
-        use_odir: bool = False,
-        reg_lambda: float = 0.0,
-        reg_mu: Optional[float] = None,
-        reg_lambda_grid: Optional[List[float]] = None,
-        reg_mu_grid: Optional[List[float]] = None,
-    ):
-        super().__init__()
-
-        self.n_cv = n_cv
-        self.use_odir = use_odir
-        self.reg_lambda = reg_lambda
-        self.reg_mu = reg_mu
-        self.reg_lambda_grid = reg_lambda_grid
-        self.reg_mu_grid = reg_mu_grid
-
-    def _fit_impl(self, X: np.ndarray, y: np.ndarray) -> None:
-        from dirichletcal.calib.fulldirichlet import FullDirichletCalibrator
-
-        if self.n_cv == 0:
-            self.cal_ = FullDirichletCalibrator(
-                reg_lambda=self.reg_lambda, reg_mu=self.reg_mu
-            )
-        elif self.n_cv >= 2:
-            reg_lambda_grid = self.reg_lambda_grid or [1e-1, 1e-2, 1e-3, 1e-4, 1e-5]
-            reg_mu_grid = self.reg_mu_grid or [1e-1, 1e-2, 1e-3, 1e-4, 1e-5]
-            calibrator = FullDirichletCalibrator(
-                reg_lambda=reg_lambda_grid,
-                reg_mu=reg_mu_grid if self.use_odir else None,
-            )
-            calibrator = WrapCalibratorAsClassifier(calibrator)
-            skf = StratifiedKFold(n_splits=self.n_cv, shuffle=True, random_state=0)
-            self.cal_ = GridSearchCV(
-                calibrator,
-                param_grid={
-                    "calib__reg_lambda": reg_lambda_grid,
-                    "calib__reg_mu": reg_mu_grid if self.use_odir else [None],
-                },
-                cv=skf,
-                # use this because scikit-learn's logloss scorer fails in case of missing classes
-                scoring=lambda est, X_test, y_test: -logloss_np(
-                    y_test, est.predict_proba(X_test)
-                ),
-            )
-        else:
-            raise ValueError(f"n_cv must be either 0 or >=2, but is {self.n_cv}")
-
-        self.cal_.fit(X, y)
-
-    def predict_proba(self, X: np.ndarray) -> np.ndarray:
-        return self.cal_.predict_proba(X)
-
-
-class MixtureCalibrator(Calibrator):
-    def __init__(
-        self,
-        calibrator: Calibrator,
-        output_constant: float = 1.0,
-        input_constant: float = 0.0,
-    ):
-        super().__init__()
-        self.calibrator = calibrator
-        self.output_constant = output_constant
-        self.input_constant = input_constant
-
-    def _get_mixture(self, dist_1: CategoricalDistribution, unif_coef: float):
-        probs = dist_1.get_probs()
-        unif_probs = (1.0 / probs.shape[-1]) * torch.ones_like(probs)
-        return CategoricalProbs((1.0 - unif_coef) * probs + unif_coef * unif_probs)
-
-    def _fit_torch_impl(
-        self, y_pred: CategoricalDistribution, y_true_labels: torch.Tensor
-    ):
-        self.n_samples_ = y_pred.get_n_samples()
-        if self.input_constant != 0.0:
-            y_pred = self._get_mixture(
-                y_pred, self.input_constant / (self.n_samples_ + 1)
-            )
-        self.cal_ = sklearn.base.clone(self.calibrator)
-        self.cal_.fit_torch(y_pred, y_true_labels)
-
-    def predict_proba_torch(
-        self, y_pred: CategoricalDistribution
-    ) -> CategoricalDistribution:
-        if self.input_constant != 0.0:
-            y_pred = self._get_mixture(
-                y_pred, self.input_constant / (self.n_samples_ + 1)
-            )
-        y_pred = self.cal_.predict_proba_torch(y_pred)
-        if self.output_constant != 0.0:
-            y_pred = self._get_mixture(
-                y_pred, self.output_constant / (self.n_samples_ + 1)
-            )
-
-        return y_pred
-
-
-def get_calibrator(
-    calibration_method: str,
-    calibrate_with_mixture: bool = False,
-    cal_mixture_output_constant: float = 1.0,
-    cal_mixture_input_constant: float = 0.0,
-    **config,
-) -> Calibrator:
-    if calibration_method == "platt":
-        cal = SklearnCalibrator(method="sigmoid", cv="prefit")
-    elif calibration_method == "platt-logits":
-        cal = ApplyToLogitsCalibrator(SklearnCalibrator(method="sigmoid", cv="prefit"))
-    elif calibration_method == "isotonic":
-        cal = SklearnCalibrator(method="isotonic", cv="prefit")
-    elif calibration_method == "ivap":
-        cal = VennAbersCalibrator(use_ovo=config.get("va_use_ovo", False))
-    elif calibration_method == "ivap-ovr":
-        cal = MulticlassOneVsRestCalibrator(BinaryVennAbersCalibrator())
-    elif calibration_method == "ivap-ovo":
-        cal = MulticlassOneVsOneCalibrator(BinaryVennAbersCalibrator())
-    elif calibration_method == "isotonic-naive-ovr":
-        # should be the same as 'isotonic', this is just to check
-        cal = MulticlassOneVsRestCalibrator(
-            SklearnCalibrator(method="isotonic", cv="prefit")
-        )
-    elif calibration_method == "cir":
-        cal = MulticlassOneVsRestCalibrator(CenteredIsotonicRegressionCalibrator())
-    elif calibration_method in ["temp-scaling", "ts-mix"]:
-        cal = TemperatureScalingCalibrator(
-            opt=config.get("ts_opt", "bisection"),
-            max_bisection_steps=config.get("ts_max_bisection_steps", 30),
-            lr=config.get("ts_lr", 0.1),
-            max_iter=config.get("ts_max_iter", 200),
-            use_inv_temp=config.get("ts_use_inv_temp", True),
-            inv_temp_init=config.get("ts_inv_temp_init", 1 / 1.5),
-        )
-    elif calibration_method == "linear-scaling":
-        cal = BinaryLogisticCalibrator(type="linear")
-    elif calibration_method == "affine-scaling":
-        cal = BinaryLogisticCalibrator(type="affine")
-    elif calibration_method == "quadratic-scaling":
-        cal = BinaryLogisticCalibrator(type="quadratic")
-    elif calibration_method == "beta":
-        cal = BetacalCalibrator(
-            parameters=config.get("beta_parameters", "abm")
-        )
-    elif calibration_method == "svs":
-        cal = SVSCalibrator(
-            penalty=config.get("svs_penalty", "ridge"),
-            rho=config.get("svs_rho", 1.0),
-            tau=config.get("svs_tau", 1.0),
-            lambda_intercept=config.get("svs_lambda_intercept", 1.0),
-            lambda_diagonal=config.get("svs_lambda_diagonal", 1.0),
-            opt=config.get("svs_opt", "bfgs"),
-            max_iter=config.get("svs_max_iter", 500),
-            tol=config.get("svs_tol", 1e-5),
-            print_init_info=config.get("svs_print_init_info", True),
-        )
-    elif calibration_method == "sms":
-        cal = SMSCalibrator(
-            penalty=config.get("sms_penalty", "ridge"),
-            rho=config.get("sms_rho", 1.0),
-            tau=config.get("sms_tau", 1.0),
-            lambda_intercept=config.get("sms_lambda_intercept", 1.0),
-            lambda_diagonal=config.get("sms_lambda_diagonal", 1.0),
-            lambda_off_diagonal=config.get("sms_lambda_off_diagonal", 1.0),
-            opt=config.get("sms_opt", "bfgs"),
-            max_iter=config.get("sms_max_iter", 500),
-            tol=config.get("sms_tol", 1e-5),
-            print_init_info=config.get("sms_print_init_info", True),
-        )
-    elif calibration_method in ["logistic", "logistic-mix"]:
-        cal = LogisticCalibrator(
-            binary_type=config.get("logistic_binary_type", "affine"),
-            multiclass_type=config.get("logistic_multiclass_type", "sms"),
-        )
-    elif calibration_method == "torchunc-ts":
-        cal = TorchUncertaintyTemperatureScalingCalibrator(
-            init_val=config.get("ts_init_val", 1),
-            lr=config.get("ts_lr", 0.1),
-            max_iter=config.get("ts_max_iter", 100),
-        )
-    elif calibration_method == "guo-ts":
-        cal = GuoTemperatureScalingCalibrator()
-    elif calibration_method == "torchcal-ts":
-        cal = TorchcalTemperatureScalingCalibrator()
-    elif calibration_method == "autogluon-ts":
-        cal = AutoGluonTemperatureScalingCalibrator(
-            init_val=config.get("ts_init_val", 1),
-            max_iter=config.get("ts_max_iter", 200),
-            lr=config.get("ts_lr", 0.1),
-        )
-    elif calibration_method == "autogluon-inv-ts":
-        cal = AutoGluonInverseTemperatureScalingCalibrator(
-            init_val=config.get("ts_init_val", 1),
-            max_iter=config.get("ts_max_iter", 200),
-            lr=config.get("ts_lr", 0.1),
-        )
-    elif calibration_method == "dircal":
-        cal = DirichletCalibrator(
-            n_cv=0,
-            reg_lambda=config.get("dircal_reg_lambda", 0.0),
-            reg_mu=config.get("dircal_reg_mu", None),
-        )
-    elif calibration_method == "dircal-cv":
-        cal = DirichletCalibrator(
-            n_cv=config.get("dircal_n_cv", 5),
-            use_odir=config.get("dircal_use_odir", False),
-            reg_lambda_grid=config.get("dircal_reg_lambda_grid", None),
-            reg_mu_grid=config.get("dircal_reg_mu_grid", None),
-        )
-    else:
-        raise ValueError(f'Unknown calibration method "{calibration_method}"')
-
-    if calibrate_with_mixture or calibration_method.endswith("-mix"):
-        cal = MixtureCalibrator(
-            cal,
-            output_constant=cal_mixture_output_constant,
-            input_constant=cal_mixture_input_constant,
-        )
-
-    return cal
diff --git a/probmetrics/calibrators/__init__.py b/probmetrics/calibrators/__init__.py
new file mode 100644
index 0000000..00fbe3d
--- /dev/null
+++ b/probmetrics/calibrators/__init__.py
@@ -0,0 +1,110 @@
+# probmetrics/calibrators/__init__.py
+
+from .base import (
+    Calibrator,
+    ApplyToLogitsCalibrator,
+    WrapCalibratorAsClassifier,
+    MulticlassOneVsOneCalibrator,
+    MulticlassOneVsRestCalibrator,
+    MixtureCalibrator,
+)
+
+from .logistic import (
+    VectorScalingCalibrator,
+    SVSCalibrator,
+    MatrixScalingCalibrator,
+    SMSCalibrator,
+    DirichletCalibrator,
+    LogisticCalibrator,
+    BinaryLogisticCalibrator,
+    RegularizedAffineScalingCalibrator,
+    RegularizedQuadraticScalingCalibrator,
+    BetacalCalibrator,
+)
+
+from .temperature_scaling import (
+    TemperatureScalingCalibrator,
+    GuoTemperatureScalingCalibrator,
+    AutoGluonTemperatureScalingCalibrator,
+    AutoGluonInverseTemperatureScalingCalibrator,
+    TorchUncertaintyTemperatureScalingCalibrator,
+    TorchcalTemperatureScalingCalibrator,
+    NetcalTemperatureScalingCalibrator,
+    ETSCalibrator,
+    SoftPlusCalibrator,
+)
+
+from .nonparametric import (
+    NetcalENIRCalibrator,
+    CenteredIsotonicRegressionCalibrator,
+    BinaryVennAbersCalibrator,
+    VennAbersCalibrator,
+    KernelCalibrator,
+)
+
+from .binning import (
+    BinaryHistogramBinningCalibrator,
+    BinaryPlattBinnerCalibrator,
+    NetcalBBQCalibrator,
+)
+
+from .splines import (
+    MLISplineCalibrator,
+    CDFSplineCalibrator,
+)
+
+from .sklearn import (
+    SklearnCalibrator,
+)
+
+from .trees import (
+    CatBoostCalibrator,
+    LightGBMCalibrator,
+    XGBoostCalibrator,
+    BinaryCatBoostCalibrator,
+)
+
+from .factory import get_calibrator
+
+
+__all__ = [
+    "Calibrator",
+    "MulticlassOneVsOneCalibrator",
+    "MulticlassOneVsRestCalibrator",
+    "MixtureCalibrator",
+    "get_calibrator",
+    "VectorScalingCalibrator",
+    "SVSCalibrator",
+    "MatrixScalingCalibrator",
+    "SMSCalibrator",
+    "DirichletCalibrator",
+    "LogisticCalibrator",
+    "BinaryLogisticCalibrator",
+    "RegularizedAffineScalingCalibrator",
+    "RegularizedQuadraticScalingCalibrator",
+    "BetacalCalibrator",
+    "TemperatureScalingCalibrator",
+    "GuoTemperatureScalingCalibrator",
+    "AutoGluonTemperatureScalingCalibrator",
+    "AutoGluonInverseTemperatureScalingCalibrator",
+    "TorchUncertaintyTemperatureScalingCalibrator",
+    "TorchcalTemperatureScalingCalibrator",
+    "NetcalTemperatureScalingCalibrator",
+    "ETSCalibrator",
+    "SoftPlusCalibrator",
+    "NetcalENIRCalibrator",
+    "CenteredIsotonicRegressionCalibrator",
+    "BinaryVennAbersCalibrator",
+    "VennAbersCalibrator",
+    "KernelCalibrator",
+    "BinaryHistogramBinningCalibrator",
+    "BinaryPlattBinnerCalibrator",
+    "NetcalBBQCalibrator",
+    "MLISplineCalibrator",
+    "CDFSplineCalibrator",
+    "SklearnCalibrator",
+    "CatBoostCalibrator",
+    "LightGBMCalibrator",
+    "XGBoostCalibrator",
+    "BinaryCatBoostCalibrator",
+]
diff --git a/probmetrics/calibrators/base.py b/probmetrics/calibrators/base.py
new file mode 100644
index 0000000..2ac6645
--- /dev/null
+++ b/probmetrics/calibrators/base.py
@@ -0,0 +1,357 @@
+import torch
+import sklearn
+import numpy as np
+
+from probmetrics.distributions import (
+    CategoricalDistribution,
+    CategoricalProbs,
+)
+from probmetrics.utils import validate_probabilities
+
+from sklearn.base import BaseEstimator, ClassifierMixin
+from sklearn.utils.validation import check_is_fitted
+
+
+class Calibrator(BaseEstimator, ClassifierMixin):
+    """
+    Calibrator base class. To implement,
+    - override at least one of (_fit_impl, _fit_torch_impl)
+    - override at least one of (_predict_proba_impl, _predict_proba_torch_impl)
+    """
+
+    def __init__(self):
+        assert self.__class__.fit == Calibrator.fit
+        assert self.__class__.fit_torch == Calibrator.fit_torch
+        assert self.__class__.predict_proba == Calibrator.predict_proba
+        assert self.__class__.predict_proba_torch == Calibrator.predict_proba_torch
+
+    def fit(self, X: np.ndarray, y: np.ndarray) -> "Calibrator":
+        """Maps to _fit_impl if implemented, falls back on _fit_torch_impl otherwise.
+        """
+        if not isinstance(X, np.ndarray):
+            raise TypeError(f"X must be a numpy array, got {type(X).__name__}")
+        if not isinstance(y, np.ndarray):
+            raise TypeError(f"y must be a numpy array, got {type(y).__name__}")
+
+        validate_probabilities(X)
+
+        self.classes_ = list(range(X.shape[-1]))
+
+        # If subclass implemented the numpy version, use it
+        if self.__class__._fit_impl != Calibrator._fit_impl:
+            self._fit_impl(X, y)
+            return self
+
+        # Fallback to torch routing if numpy is not implemented
+        if self.__class__._fit_torch_impl != Calibrator._fit_torch_impl:
+            self._fit_torch_impl(
+                y_pred=CategoricalProbs(torch.as_tensor(X)),
+                y_true_labels=torch.as_tensor(y, dtype=torch.long),
+            )
+            return self
+
+        raise NotImplementedError(
+            "Subclasses must implement _fit_impl or _fit_torch_impl."
+        )
+
+    def fit_torch(
+        self, y_pred: CategoricalDistribution, y_true_labels: torch.Tensor
+    ) -> "Calibrator":
+        """Maps to _fit_torch_impl if implemented, falls back on _fit_impl otherwise.
+        """
+        if not isinstance(y_true_labels, torch.Tensor):
+            raise TypeError(
+                f"y_true_labels must be a torch.Tensor, got {type(y_true_labels).__name__}"
+            )
+        if not isinstance(y_pred, CategoricalDistribution):
+            raise TypeError(
+                f"y_pred must be a CategoricalDistribution, got {type(y_pred).__name__}"
+            )
+        # default implementation, using sklearn
+        self.classes_ = list(range(y_pred.get_n_classes()))
+
+        # If subclass implemented the torch version, use it
+        if self.__class__._fit_torch_impl != Calibrator._fit_torch_impl:
+            self._fit_torch_impl(y_pred, y_true_labels)
+            return self
+
+        # Fallback to numpy routing if torch is not implemented
+        if self.__class__._fit_impl != Calibrator._fit_impl:
+            self._fit_impl(
+                y_pred.get_probs().detach().cpu().numpy(),
+                y_true_labels.detach().cpu().numpy(),
+            )
+            return self
+
+        raise NotImplementedError(
+            "Subclasses must implement _fit_impl or _fit_torch_impl."
+        )
+
+    def _fit_impl(self, X: np.ndarray, y: np.ndarray) -> None:
+        raise NotImplementedError()
+
+    def _fit_torch_impl(
+        self, y_pred: CategoricalDistribution, y_true_labels: torch.Tensor
+    ) -> None:
+        raise NotImplementedError()
+
+    def predict_proba(self, X: np.ndarray) -> np.ndarray:
+        """Maps to _predict_proba_impl if implemented, falls back on
+        _predict_proba_torch_impl otherwise.
+        """
+        check_is_fitted(self)
+        if not isinstance(X, np.ndarray):
+            raise TypeError(f"X must be a numpy array, got {type(X).__name__}")
+        validate_probabilities(X)
+
+        # If subclass implemented the numpy version, use it
+        if self.__class__._predict_proba_impl != Calibrator._predict_proba_impl:
+            return self._predict_proba_impl(X)
+
+        # Fallback to torch routing if numpy is not implemented
+        if (
+            self.__class__._predict_proba_torch_impl
+            != Calibrator._predict_proba_torch_impl
+        ):
+            return (
+                self._predict_proba_torch_impl(CategoricalProbs(torch.as_tensor(X)))
+                .get_probs()
+                .numpy()
+            )
+
+        raise NotImplementedError(
+            "Subclasses must implement _predict_proba_impl or _predict_proba_torch_impl."
+        )
+
+    def predict_proba_torch(
+        self, y_pred: CategoricalDistribution
+    ) -> CategoricalDistribution:
+        """Maps to _predict_proba_torch_impl if implemented, falls back on
+        _predict_proba_impl otherwise.
+        """
+        check_is_fitted(self)
+        if not isinstance(y_pred, CategoricalDistribution):
+            raise TypeError(
+                f"y_pred must be a CategoricalDistribution, got {type(y_pred).__name__}"
+            )
+
+        # If subclass implemented the torch version, use it
+        if (
+            self.__class__._predict_proba_torch_impl
+            != Calibrator._predict_proba_torch_impl
+        ):
+            return self._predict_proba_torch_impl(y_pred)
+
+        # Fallback to numpy routing if torch is not implemented
+        if self.__class__._predict_proba_impl != Calibrator._predict_proba_impl:
+            y_pred_probs = y_pred.get_probs()
+            probs = self._predict_proba_impl(y_pred_probs.detach().cpu().numpy())
+            return CategoricalProbs(
+                torch.as_tensor(
+                    probs, device=y_pred_probs.device, dtype=y_pred_probs.dtype
+                )
+            )
+
+        raise NotImplementedError(
+            "Subclasses must implement _predict_proba_impl or _predict_proba_torch_impl."
+        )
+    
+    def _predict_proba_impl(self, X: np.ndarray) -> np.ndarray:
+        raise NotImplementedError()
+
+    def _predict_proba_torch_impl(
+        self, y_pred: CategoricalDistribution
+    ) -> CategoricalDistribution:
+        raise NotImplementedError()
+
+    def predict(self, X):
+        y_probs = self.predict_proba(X)
+        class_idxs = np.argmax(y_probs, axis=-1)
+        return np.asarray(self.classes_)[class_idxs]
+
+
+class ApplyToLogitsCalibrator(Calibrator):
+    def __init__(self, calib: BaseEstimator):
+        super().__init__()
+        self.calib = calib
+
+    def _fit_impl(self, X: np.ndarray, y: np.ndarray) -> None:
+        self.calib_ = sklearn.base.clone(self.calib)
+        self.calib_.fit(np.log(X + 1e-10), y)
+
+    def _predict_proba_impl(self, X):
+        return self.calib_.predict_proba(np.log(X + 1e-10))
+
+
+class MulticlassOneVsOneCalibrator(Calibrator):
+    def __init__(self, binary_calibrator: BaseEstimator):
+        super().__init__()
+        self.binary_calibrator = binary_calibrator
+
+    def _fit_impl(self, X: np.ndarray, y: np.ndarray) -> None:
+        self.n_classes_ = X.shape[-1]
+        self.bin_calibs_ = []
+
+        if self.n_classes_ == 2:
+            # binary classification
+            bin_calib = sklearn.base.clone(self.binary_calibrator)
+            bin_calib.fit(X, y)
+            self.bin_calibs_.append(bin_calib)
+        else:
+            for i in range(self.n_classes_):
+                for j in range(i + 1, self.n_classes_):
+                    idxs = np.logical_or((y == i), (y == j))
+                    # idxs = np.arange(X.shape[0])
+                    bin_probs = np.stack([X[idxs, j], X[idxs, i]], axis=-1)
+                    bin_probs += 1e-30
+                    bin_probs /= np.sum(bin_probs, axis=-1, keepdims=True)
+                    # print(f'{np.any(np.isnan(bin_probs))=}')
+                    bin_labels = (y[idxs] == i).astype(np.int32)
+                    bin_calib = sklearn.base.clone(self.binary_calibrator)
+                    bin_calib.fit(bin_probs, bin_labels)
+                    self.bin_calibs_.append(bin_calib)
+
+    def _predict_proba_impl(self, X):
+        if self.n_classes_ == 2:
+            # binary classification
+            return self.bin_calibs_[0].predict_proba(X)
+        else:
+            # use PKPD formula in http://proceedings.mlr.press/v60/manokhin17a/manokhin17a.pdf
+            pair_probs = [[None] * self.n_classes_ for i in range(self.n_classes_)]
+            multi_probs = []
+            calib_idx = 0
+            for i in range(self.n_classes_):
+                for j in range(i + 1, self.n_classes_):
+                    bin_probs = np.stack([X[:, j], X[:, i]], axis=-1)
+                    bin_probs += 1e-30
+                    bin_probs /= np.sum(bin_probs, axis=-1, keepdims=True)
+                    pred_probs = self.bin_calibs_[calib_idx].predict_proba(bin_probs)
+                    pair_probs[i][j] = pred_probs[:, 1]
+                    pair_probs[j][i] = pred_probs[:, 0]
+                    # if i == 0 and j == 1:
+                    #     print(f'{bin_probs=}')
+                    #     print(f'{pred_probs=}')
+                    calib_idx += 1
+
+            for i in range(self.n_classes_):
+                sum_inv_probs = sum(
+                    [
+                        1.0 / (1e-30 + pair_probs[i][j])
+                        for j in range(self.n_classes_)
+                        if j != i
+                    ]
+                )
+                multi_probs.append(
+                    1.0 / np.clip(sum_inv_probs - (self.n_classes_ - 2), 1e-30, np.inf)
+                )
+
+            multi_probs = np.stack(multi_probs, axis=-1)
+            multi_probs = np.clip(multi_probs, a_min=1e-30, a_max=np.inf)
+            multi_probs = multi_probs / np.sum(multi_probs, axis=-1, keepdims=True)
+            return multi_probs
+
+
+class MulticlassOneVsRestCalibrator(Calibrator):
+    def __init__(self, binary_calibrator: BaseEstimator):
+        super().__init__()
+        self.binary_calibrator = binary_calibrator
+
+    def _fit_impl(self, X: np.ndarray, y: np.ndarray) -> None:
+        self.n_classes_ = X.shape[-1]
+        self.bin_calibs_ = []
+
+        X = X.copy()
+        X[np.isnan(X)] = 0.0
+
+        if self.n_classes_ == 2:
+            # binary classification
+            bin_calib = sklearn.base.clone(self.binary_calibrator)
+            bin_calib.fit(X, y)
+            self.bin_calibs_.append(bin_calib)
+        else:
+            for i in range(self.n_classes_):
+                pos_probs = X[:, i]
+                neg_probs = 1.0 - pos_probs
+                bin_probs = np.stack([neg_probs, pos_probs], axis=-1)
+                bin_labels = (y == i).astype(np.int32)
+                bin_calib = sklearn.base.clone(self.binary_calibrator)
+                bin_calib.fit(bin_probs, bin_labels)
+                self.bin_calibs_.append(bin_calib)
+
+    def _predict_proba_impl(self, X):
+        X = X.copy()
+        X[np.isnan(X)] = 0.0
+        if self.n_classes_ == 2:
+            # binary classification
+            return self.bin_calibs_[0].predict_proba(X)
+        else:
+            multi_probs = []
+            for i in range(self.n_classes_):
+                pos_probs = X[:, i]
+                neg_probs = 1.0 - pos_probs
+                bin_probs = np.stack([neg_probs, pos_probs], axis=-1)
+                pred_probs = self.bin_calibs_[i].predict_proba(bin_probs)
+                multi_probs.append(pred_probs[:, 1])
+            multi_probs = np.stack(multi_probs, axis=-1)
+            multi_probs = np.clip(multi_probs, a_min=1e-30, a_max=np.inf)
+            multi_probs = multi_probs / np.sum(multi_probs, axis=-1, keepdims=True)
+            return multi_probs
+
+
+class MixtureCalibrator(Calibrator):
+    def __init__(
+        self,
+        calibrator: Calibrator,
+        output_constant: float = 1.0,
+        input_constant: float = 0.0,
+    ):
+        super().__init__()
+        self.calibrator = calibrator
+        self.output_constant = output_constant
+        self.input_constant = input_constant
+
+    def _get_mixture(self, dist_1: CategoricalDistribution, unif_coef: float):
+        probs = dist_1.get_probs()
+        unif_probs = (1.0 / probs.shape[-1]) * torch.ones_like(probs)
+        return CategoricalProbs((1.0 - unif_coef) * probs + unif_coef * unif_probs)
+
+    def _fit_torch_impl(
+        self, y_pred: CategoricalDistribution, y_true_labels: torch.Tensor
+    ):
+        self.n_samples_ = y_pred.get_n_samples()
+        if self.input_constant != 0.0:
+            y_pred = self._get_mixture(
+                y_pred, self.input_constant / (self.n_samples_ + 1)
+            )
+        self.cal_ = sklearn.base.clone(self.calibrator)
+        self.cal_.fit_torch(y_pred, y_true_labels)
+
+    def _predict_proba_torch_impl(
+        self, y_pred: CategoricalDistribution
+    ) -> CategoricalDistribution:
+        if self.input_constant != 0.0:
+            y_pred = self._get_mixture(
+                y_pred, self.input_constant / (self.n_samples_ + 1)
+            )
+        y_pred = self.cal_.predict_proba_torch(y_pred)
+        if self.output_constant != 0.0:
+            y_pred = self._get_mixture(
+                y_pred, self.output_constant / (self.n_samples_ + 1)
+            )
+
+        return y_pred
+
+
+class WrapCalibratorAsClassifier(BaseEstimator, ClassifierMixin):
+    def __init__(self, calib: BaseEstimator):
+        self.calib = calib
+
+    def fit(self, X, y):
+        # print(f'Fitting calibrator: {self.calib}')
+        self.calib_ = sklearn.base.clone(self.calib)
+        self.calib_.fit(X, y)
+        self.classes_ = list(range(np.max(y) + 1))
+        return self
+
+    def predict_proba(self, X):
+        return self.calib_.predict_proba(X)
diff --git a/probmetrics/calibrators/binning.py b/probmetrics/calibrators/binning.py
new file mode 100644
index 0000000..7a0a218
--- /dev/null
+++ b/probmetrics/calibrators/binning.py
@@ -0,0 +1,154 @@
+import numpy as np
+
+from .base import Calibrator
+from probmetrics.utils import flatten_binary_probs, expand_binary_probs
+
+
+class BinaryHistogramBinningCalibrator(Calibrator):
+    """
+    Binary Histogram Binning Calibrator.
+
+    This calibrator partitions the uncalibrated predicted probabilities into
+    discrete bins and calculates the empirical probability of the positive
+    class within each bin. During prediction, it maps uncalibrated probabilities
+    to the empirical probability of their corresponding bin.
+
+    Parameters
+    ----------
+    n_bins : int, default=10
+        The number of bins to partition the probability space into.
+    strategy : {'uniform', 'quantile'}, default='uniform'
+        Strategy used to define the widths of the bins.
+        - 'uniform': Bins have identical widths (e.g., [0, 0.1), [0.1, 0.2)).
+        - 'quantile': Bins have varying widths but contain approximately the
+          same number of samples.
+
+    Attributes
+    ----------
+    bin_edges_ : ndarray of shape (n_bins + 1,)
+        The edges defining the bins. Learned during `fit`.
+    bin_values_ : ndarray of shape (n_bins,)
+        The calibrated probability assigned to each bin. Learned during `fit`.
+
+    Reference
+    ---------
+    Bianca Zadrozny and Charles Elkan. Obtaining calibrated probability estimates from
+    decision trees and naive bayesian classifiers. International Conference on Machine
+    Learning, 2001.
+    """
+
+    def __init__(self, n_bins: int = 10, strategy: str = "uniform"):
+        super().__init__()
+
+        if strategy not in ["uniform", "quantile"]:
+            raise ValueError(
+                f"Strategy must be 'uniform' or 'quantile', got {strategy!r}."
+            )
+
+        self.strategy = strategy
+        self.n_bins = n_bins
+
+    def _fit_impl(self, X: np.ndarray, y: np.ndarray) -> None:
+        probs = flatten_binary_probs(X)
+
+        if self.strategy == "uniform":
+            self.bin_edges_ = np.linspace(0.0, 1.0, self.n_bins + 1)
+        else:
+            self.bin_edges_ = np.unique(
+                np.quantile(probs, np.linspace(0.0, 1.0, self.n_bins + 1))
+            )
+            if len(self.bin_edges_) == 1:
+                # Fallback if variance is 0
+                self.bin_edges_ = np.array([0.0, 1.0])
+
+        bin_indices = np.digitize(probs, self.bin_edges_, right=False) - 1
+        max_idx = len(self.bin_edges_) - 2
+
+        # Ensures exact 1.0s fall in the final bin rather than creating an out-of-bounds
+        # index
+        bin_indices = np.clip(bin_indices, 0, max_idx)
+
+        bin_sums = np.bincount(
+            bin_indices, weights=y, minlength=len(self.bin_edges_) - 1
+        )
+        bin_counts = np.bincount(bin_indices, minlength=len(self.bin_edges_) - 1)
+
+        # Fallback for empty bins: Global prior probability of the positive class
+        # (Safer than midpoints if the underlying model is severely miscalibrated)
+        global_prior = np.mean(y) if len(y) > 0 else 0.5
+        fallback_values = np.full(max_idx + 1, global_prior)
+
+        self.bin_values_ = np.divide(
+            bin_sums, bin_counts, out=fallback_values, where=(bin_counts > 0)
+        )
+
+    def _predict_proba_impl(self, X: np.ndarray) -> np.ndarray:
+        probs = flatten_binary_probs(X)
+        bin_indices = np.digitize(probs, self.bin_edges_, right=False) - 1
+        bin_indices = np.clip(
+            bin_indices, 0, len(self.bin_edges_) - 2
+        )  # Ensures exact 1.0s fall in the final bin
+        probs = self.bin_values_[bin_indices]
+        probs = expand_binary_probs(probs)
+        return probs
+
+
+class BinaryPlattBinnerCalibrator(Calibrator):
+    """
+    From https://github.com/p-lambda/verified_calibration
+
+    Reference
+    ---------
+    Ananya Kumar, Percy S. Liang, and Tengyu Ma. Verified uncertainty calibration.
+    Advances in neural information processing systems, 2019.
+    """
+
+    def __init__(self):
+        super().__init__()
+        try:
+            from calibration import PlattBinnerCalibrator
+        except ImportError as e:
+            raise ImportError("The 'calibration' package is required.") from e
+
+    def _fit_impl(self, X: np.ndarray, y: np.ndarray) -> None:
+        probs = flatten_binary_probs(X)
+        from calibration import PlattBinnerCalibrator
+
+        self.cal_ = PlattBinnerCalibrator(num_calibration=None, num_bins=10)
+        self.cal_.train_calibration(probs, y)
+
+    def _predict_proba_impl(self, X: np.ndarray) -> np.ndarray:
+        probs = flatten_binary_probs(X)
+        probs = self.cal_.calibrate(probs)
+        probs = expand_binary_probs(probs)
+        return probs
+
+
+class NetcalBBQCalibrator(Calibrator):
+    """
+    From https://github.com/EFS-OpenSource/calibration-framework
+
+    Reference
+    ---------
+    Mahdi Pakdaman Naeini, Gregory Cooper, and Milos Hauskrecht. Obtaining well
+    calibrated probabilities using bayesian binning. AAAI Conference on Artificial
+    Intelligence, 2015.
+    """
+
+    def __init__(self):
+        super().__init__()
+        try:
+            from netcal.binning import BBQ
+        except ImportError as e:
+            raise ImportError("The 'netcal' package is required.") from e
+
+    def _fit_impl(self, X: np.ndarray, y: np.ndarray) -> None:
+        from netcal.binning import BBQ
+
+        self.cal_ = BBQ()
+        self.cal_.fit(X, y)
+
+    def _predict_proba_impl(self, X: np.ndarray) -> np.ndarray:
+        probs = self.cal_.transform(X)
+        probs = expand_binary_probs(probs)
+        return probs
diff --git a/probmetrics/calibrators/factory.py b/probmetrics/calibrators/factory.py
new file mode 100644
index 0000000..34ef1b5
--- /dev/null
+++ b/probmetrics/calibrators/factory.py
@@ -0,0 +1,180 @@
+from .base import (
+    Calibrator,
+    ApplyToLogitsCalibrator,
+    WrapCalibratorAsClassifier,
+    MulticlassOneVsOneCalibrator,
+    MulticlassOneVsRestCalibrator,
+    MixtureCalibrator,
+)
+
+from .logistic import (
+    VectorScalingCalibrator,
+    SVSCalibrator,
+    MatrixScalingCalibrator,
+    SMSCalibrator,
+    DirichletCalibrator,
+    LogisticCalibrator,
+    BinaryLogisticCalibrator,
+    RegularizedAffineScalingCalibrator,
+    RegularizedQuadraticScalingCalibrator,
+    BetacalCalibrator,
+)
+
+from .temperature_scaling import (
+    TemperatureScalingCalibrator,
+    GuoTemperatureScalingCalibrator,
+    AutoGluonTemperatureScalingCalibrator,
+    AutoGluonInverseTemperatureScalingCalibrator,
+    TorchUncertaintyTemperatureScalingCalibrator,
+    TorchcalTemperatureScalingCalibrator,
+    NetcalTemperatureScalingCalibrator,
+    ETSCalibrator,
+)
+
+from .nonparametric import (
+    NetcalENIRCalibrator,
+    CenteredIsotonicRegressionCalibrator,
+    BinaryVennAbersCalibrator,
+    VennAbersCalibrator,
+    KernelCalibrator,
+)
+
+from .binning import (
+    BinaryHistogramBinningCalibrator,
+    BinaryPlattBinnerCalibrator,
+    NetcalBBQCalibrator,
+)
+
+from .splines import (
+    MLISplineCalibrator,
+    CDFSplineCalibrator,
+)
+
+from .sklearn import (
+    SklearnCalibrator,
+)
+
+from .trees import (
+    CatBoostCalibrator,
+)
+
+
+def get_calibrator(
+    calibration_method: str,
+    calibrate_with_mixture: bool = False,
+    cal_mixture_output_constant: float = 1.0,
+    cal_mixture_input_constant: float = 0.0,
+    **config,
+) -> Calibrator:
+    if calibration_method == "platt":
+        cal = SklearnCalibrator(method="sigmoid", cv="prefit")
+    elif calibration_method == "platt-logits":
+        cal = ApplyToLogitsCalibrator(SklearnCalibrator(method="sigmoid", cv="prefit"))
+    elif calibration_method == "isotonic":
+        cal = SklearnCalibrator(method="isotonic", cv="prefit")
+    elif calibration_method == "ivap":
+        cal = VennAbersCalibrator(use_ovo=config.get("va_use_ovo", False))
+    elif calibration_method == "ivap-ovr":
+        cal = MulticlassOneVsRestCalibrator(BinaryVennAbersCalibrator())
+    elif calibration_method == "ivap-ovo":
+        cal = MulticlassOneVsOneCalibrator(BinaryVennAbersCalibrator())
+    elif calibration_method == "isotonic-naive-ovr":
+        # should be the same as 'isotonic', this is just to check
+        cal = MulticlassOneVsRestCalibrator(
+            SklearnCalibrator(method="isotonic", cv="prefit")
+        )
+    elif calibration_method == "cir":
+        cal = MulticlassOneVsRestCalibrator(CenteredIsotonicRegressionCalibrator())
+    elif calibration_method in ["temp-scaling", "ts-mix"]:
+        cal = TemperatureScalingCalibrator(
+            opt=config.get("ts_opt", "bisection"),
+            max_bisection_steps=config.get("ts_max_bisection_steps", 30),
+            lr=config.get("ts_lr", 0.1),
+            max_iter=config.get("ts_max_iter", 200),
+            use_inv_temp=config.get("ts_use_inv_temp", True),
+            inv_temp_init=config.get("ts_inv_temp_init", 1 / 1.5),
+        )
+    elif calibration_method == "linear-scaling":
+        cal = BinaryLogisticCalibrator(type="linear")
+    elif calibration_method == "affine-scaling":
+        cal = BinaryLogisticCalibrator(type="affine")
+    elif calibration_method == "quadratic-scaling":
+        cal = BinaryLogisticCalibrator(type="quadratic")
+    elif calibration_method == "beta":
+        cal = BetacalCalibrator(parameters=config.get("beta_parameters", "abm"))
+    elif calibration_method == "svs":
+        cal = SVSCalibrator(
+            penalty=config.get("svs_penalty", "ridge"),
+            rho=config.get("svs_rho", 1.0),
+            tau=config.get("svs_tau", 1.0),
+            lambda_intercept=config.get("svs_lambda_intercept", 1.0),
+            lambda_diagonal=config.get("svs_lambda_diagonal", 1.0),
+            opt=config.get("svs_opt", "bfgs"),
+            max_iter=config.get("svs_max_iter", 500),
+            tol=config.get("svs_tol", 1e-5),
+            print_init_info=config.get("svs_print_init_info", True),
+        )
+    elif calibration_method == "sms":
+        cal = SMSCalibrator(
+            penalty=config.get("sms_penalty", "ridge"),
+            rho=config.get("sms_rho", 1.0),
+            tau=config.get("sms_tau", 1.0),
+            lambda_intercept=config.get("sms_lambda_intercept", 1.0),
+            lambda_diagonal=config.get("sms_lambda_diagonal", 1.0),
+            lambda_off_diagonal=config.get("sms_lambda_off_diagonal", 1.0),
+            opt=config.get("sms_opt", "bfgs"),
+            max_iter=config.get("sms_max_iter", 500),
+            tol=config.get("sms_tol", 1e-5),
+            print_init_info=config.get("sms_print_init_info", True),
+        )
+    elif calibration_method in ["logistic", "logistic-mix"]:
+        cal = LogisticCalibrator(
+            binary_type=config.get("logistic_binary_type", "affine"),
+            multiclass_type=config.get("logistic_multiclass_type", "sms"),
+        )
+    elif calibration_method == "torchunc-ts":
+        cal = TorchUncertaintyTemperatureScalingCalibrator(
+            init_val=config.get("ts_init_val", 1),
+            lr=config.get("ts_lr", 0.1),
+            max_iter=config.get("ts_max_iter", 100),
+        )
+    elif calibration_method == "guo-ts":
+        cal = GuoTemperatureScalingCalibrator()
+    elif calibration_method == "torchcal-ts":
+        cal = TorchcalTemperatureScalingCalibrator()
+    elif calibration_method == "autogluon-ts":
+        cal = AutoGluonTemperatureScalingCalibrator(
+            init_val=config.get("ts_init_val", 1),
+            max_iter=config.get("ts_max_iter", 200),
+            lr=config.get("ts_lr", 0.1),
+        )
+    elif calibration_method == "autogluon-inv-ts":
+        cal = AutoGluonInverseTemperatureScalingCalibrator(
+            init_val=config.get("ts_init_val", 1),
+            max_iter=config.get("ts_max_iter", 200),
+            lr=config.get("ts_lr", 0.1),
+        )
+    elif calibration_method == "dircal":
+        cal = DirichletCalibrator(
+            n_cv=0,
+            reg_lambda=config.get("dircal_reg_lambda", 1e-3),
+            reg_mu=config.get("dircal_reg_mu", 1e-3),
+        )
+    elif calibration_method == "dircal-cv":
+        cal = DirichletCalibrator(
+            n_cv=config.get("dircal_n_cv", 5),
+            use_odir=config.get("dircal_use_odir", False),
+            reg_lambda_grid=config.get("dircal_reg_lambda_grid", None),
+            reg_mu_grid=config.get("dircal_reg_mu_grid", None),
+        )
+    else:
+        raise ValueError(f'Unknown calibration method "{calibration_method}"')
+
+    if calibrate_with_mixture or calibration_method.endswith("-mix"):
+        cal = MixtureCalibrator(
+            cal,
+            output_constant=cal_mixture_output_constant,
+            input_constant=cal_mixture_input_constant,
+        )
+
+    return cal
diff --git a/probmetrics/calibrators/logistic.py b/probmetrics/calibrators/logistic.py
new file mode 100644
index 0000000..960ac69
--- /dev/null
+++ b/probmetrics/calibrators/logistic.py
@@ -0,0 +1,874 @@
+import torch
+import logging
+import torchmin
+import warnings
+import numpy as np
+from typing import Optional, List
+from sklearn.model_selection import StratifiedKFold, GridSearchCV
+
+from .base import Calibrator, WrapCalibratorAsClassifier
+from probmetrics.distributions import (
+    CategoricalDistribution,
+    CategoricalLogits,
+)
+
+from probmetrics.saga import (
+    fit_affine_scaling,
+    warm_up_affine_scaling,
+    fit_quadratic_scaling,
+    warm_up_quadratic_scaling,
+    fit_vector_scaling,
+    warm_up_vector_scaling,
+    fit_matrix_scaling,
+    warm_up_matrix_scaling,
+)
+
+from probmetrics.utils import (
+    validate_probabilities,
+    flatten_binary_probs,
+    expand_binary_probs,
+    binary_probs_to_logits,
+    multiclass_probs_to_logits,
+)
+
+logger = logging.getLogger(__name__)
+
+
+class VectorScalingCalibrator(Calibrator):
+    """
+    Adapted from https://github.com/rishabh-ranjan/torchcal
+
+    Reference
+    ---------
+    Chuan Guo, Geoff Pleiss, Yu Sun, and Kilian Q. Weinberger. On calibration of modern
+    neural networks. International Conference on Machine Learning, 2017.
+    """
+    def __init__(self):
+        super().__init__()
+
+    def _fit_torch_impl(
+        self, y_pred: CategoricalDistribution, y_true_labels: torch.Tensor
+    ):
+        y_pred_logits = y_pred.get_logits()
+        num_classes = y_pred_logits.shape[1]
+
+        def loss(temp_bias):
+            temp, bias = temp_bias.split([num_classes, num_classes])
+            return torch.nn.functional.cross_entropy(
+                y_pred_logits / temp + bias, y_true_labels
+            )
+
+        temp = torch.ones(num_classes, device=y_pred_logits.device)
+        bias = torch.zeros(num_classes, device=y_pred_logits.device)
+        temp_bias = torch.cat([temp, bias])
+        res = torchmin.minimize(loss, temp_bias, method="bfgs")
+        if not res.success:
+            warnings.warn(
+                f"{self.__class__}: {res.message} Not updating calibrator params."
+            )
+            self.temp_, self.bias_ = temp, bias
+        else:
+            self.temp_, self.bias_ = res.x.split([num_classes, num_classes])
+
+    def _predict_proba_torch_impl(
+        self, y_pred: CategoricalDistribution
+    ) -> CategoricalDistribution:
+        return CategoricalLogits(y_pred.get_logits() / self.temp_ + self.bias_)
+
+
+class SVSCalibrator(Calibrator):
+    """
+    Multiclass post-hoc calibration with a structured scaling scheme
+    $softmax((aI + diag(v))x + b)$ on the logits x.
+
+    Numba functions are warmed-up the first time the class is initialized with the
+    'saga' optimizer.
+
+    A penalty (one of [None, "mcp", "lasso", "ridge"]) is applied to the intercept (b)
+    and vector scaling (v) parameters, with respective regularization strength
+    lambda_intercept*(k**rho)/(n**tau) and lambda_diagonal*(k**rho)/(n**tau).
+
+    Instead of fitting the global scaling parameter jointly with the other parameters,
+    the logits are pre-processed with temperature scaling (with no regularization) and a
+    is then fixed to a=1.
+
+    [!] solver = 'bfgs' only supports 'ridge' penalty.
+    [!] For solver = 'bfgs', max_iter and tol are ignored and default torchmin
+    parameters are used.
+
+    Reference
+    ---------
+    Eugène Berta, David Holzmüller, Michael I. Jordan, and Francis Bach. Structured
+    matrix scaling for multi-class calibration. International Conference on Artificial
+    Intelligence and Statistics, 2026.
+    """
+
+    _warmed_up_saga = False
+    _ALLOWED_PENALTIES = [None, "mcp", "lasso", "ridge"]
+    _ALLOWED_OPTIMIZERS = ["saga", "bfgs"]
+
+    def __init__(
+        self,
+        penalty: str = "ridge",
+        rho: float = 1.0,
+        tau: float = 1.0,
+        lambda_intercept: float = 1.0,
+        lambda_diagonal: float = 1.0,
+        opt: str = "bfgs",
+        max_iter: int = 500,
+        tol: float = 1e-5,
+        print_init_info: bool = True,
+    ):
+        super().__init__()
+
+        if penalty not in self._ALLOWED_PENALTIES:
+            raise ValueError(
+                f"Penalty '{penalty}' not recognized. Choose from {self._ALLOWED_PENALTIES}."
+            )
+
+        if opt not in self._ALLOWED_OPTIMIZERS:
+            raise ValueError(
+                f"Optimizer '{opt}' not recognized. Choose from {self._ALLOWED_OPTIMIZERS}."
+            )
+
+        if opt == "bfgs" and penalty != "ridge":
+            raise ValueError(
+                "The 'bfgs' optimizer only supports the 'ridge' penalty, use 'saga' instead."
+            )
+
+        if opt == "saga":
+            try:
+                from numba import njit
+            except ImportError as e:
+                raise ImportError(
+                    "The 'numba' package is required for optimizer 'saga'."
+                ) from e
+
+            if not SVSCalibrator._warmed_up_saga:
+                if print_init_info:
+                    logger.info(
+                        "First SVSCalibrator instantiation with 'saga' - warming up Numba functions..."
+                    )
+                warm_up_vector_scaling()
+                SVSCalibrator._warmed_up_saga = True
+                if print_init_info:
+                    logger.info("Warmed up!")
+
+        self.penalty = penalty
+        self.rho = rho
+        self.tau = tau
+        self.lambda_intercept = lambda_intercept
+        self.lambda_diagonal = lambda_diagonal
+        self.max_iter = max_iter
+        self.tol = tol
+        self.opt = opt
+        self.print_init_info = print_init_info
+
+    def _fit_impl(self, X: np.ndarray, y: np.ndarray) -> None:
+        n, k = X.shape
+
+        from .factory import get_calibrator
+
+        self.ts_ = get_calibrator("ts-mix")
+        self.ts_.fit(X, y)
+        scaled_probs = self.ts_.predict_proba(X)
+
+        reg_intercept = self.lambda_intercept * (k**self.rho) / (n**self.tau)
+        reg_diagonal = self.lambda_diagonal * (k**self.rho) / (n**self.tau)
+
+        logits = multiclass_probs_to_logits(scaled_probs)
+
+        if self.opt == "saga":
+            self.v_, self.b_ = fit_vector_scaling(
+                logits,
+                y,
+                penalty=self.penalty,
+                reg_intercept=reg_intercept,
+                reg_diagonal=reg_diagonal,
+                max_iter=self.max_iter,
+                tol=self.tol,
+            )
+
+        elif self.opt == "bfgs":
+            self.v_, self.b_ = self._fit_bfgs(
+                torch.tensor(logits),
+                torch.as_tensor(y, dtype=torch.long),
+                num_classes=k,
+                reg_intercept=reg_intercept,
+                reg_diagonal=reg_diagonal,
+            )
+
+    def _fit_bfgs(
+        self,
+        logits: torch.Tensor,
+        y: torch.Tensor,
+        num_classes,
+        reg_intercept,
+        reg_diagonal,
+    ):
+        K = num_classes
+
+        initial_params = torch.zeros(2 * K, device=logits.device, dtype=logits.dtype)
+
+        def loss(params):
+            v_delta = params[:K]
+            b = params[K:]
+
+            scaled_logits = logits * (1.0 + v_delta) + b
+            ce = torch.nn.functional.cross_entropy(scaled_logits, y)
+
+            r_b = reg_intercept * b.pow(2).sum()
+            r_v = reg_diagonal * v_delta.pow(2).sum()
+
+            return ce + r_b + r_v
+
+        method = "l-bfgs" if initial_params.numel() > 1000 else "bfgs"
+        res = torchmin.minimize(loss, initial_params, method=method)
+
+        v_delta_final = res.x[:K]
+        b_final = res.x[K:]
+
+        return (
+            1.0 + v_delta_final
+        ).detach().cpu().numpy(), b_final.detach().cpu().numpy()
+
+    def _predict_proba_impl(self, X: np.ndarray) -> np.ndarray:
+        scaled_probs = self.ts_.predict_proba(X)
+        logits = multiclass_probs_to_logits(scaled_probs)
+        calibrated_logits = self.v_.reshape(1, -1) * logits + self.b_.reshape(1, -1)
+        calibrated_logits -= np.max(calibrated_logits, axis=1, keepdims=True)
+        exp_logits = np.exp(calibrated_logits)
+        return exp_logits / np.sum(exp_logits, axis=1, keepdims=True)
+
+
+class MatrixScalingCalibrator(Calibrator):
+    """
+    Adapted from https://github.com/rishabh-ranjan/torchcal
+
+    Reference
+    ---------
+    Chuan Guo, Geoff Pleiss, Yu Sun, and Kilian Q. Weinberger. On calibration of modern
+    neural networks. International Conference on Machine Learning, 2017.
+    """
+    def __init__(self):
+        super().__init__()
+
+    def _fit_torch_impl(
+        self, y_pred: CategoricalDistribution, y_true_labels: torch.Tensor
+    ):
+        y_pred_logits = y_pred.get_logits()
+
+        num_classes = y_pred_logits.shape[1]
+
+        num_params = num_classes * (num_classes + 1)
+        if num_params <= 1000:
+            method = "bfgs"
+        else:
+            method = "l-bfgs"
+
+        def loss(itemp_bias):
+            itemp, bias = itemp_bias.split([num_classes**2, num_classes])
+            itemp = itemp.view(num_classes, num_classes)
+            return torch.nn.functional.cross_entropy(
+                y_pred_logits @ itemp + bias, y_true_labels
+            )
+
+        itemp = torch.eye(
+            num_classes,
+            num_classes,
+            device=y_pred_logits.device,
+            dtype=y_pred_logits.dtype,
+        )
+        bias = torch.zeros(
+            num_classes, device=y_pred_logits.device, dtype=y_pred_logits.dtype
+        )
+        itemp_bias = torch.cat([itemp.view(-1), bias])
+        res = torchmin.minimize(loss, itemp_bias, method=method)
+        if not res.success:
+            warnings.warn(
+                f"{self.__class__}: {res.message} Not updating calibrator params."
+            )
+            self.itemp_, self.bias_ = itemp, bias
+        else:
+            self.itemp_, self.bias_ = res.x.split([num_classes**2, num_classes])
+            self.itemp_ = self.itemp_.view(num_classes, num_classes)
+
+    def _predict_proba_torch_impl(
+        self, y_pred: CategoricalDistribution
+    ) -> CategoricalDistribution:
+        return CategoricalLogits(y_pred.get_logits() @ self.itemp_ + self.bias_)
+
+
+class SMSCalibrator(Calibrator):
+    """
+    Multiclass post-hoc calibration with a structured scaling scheme
+    $softmax( (aI + diag(v) + off_diag(M) )x + b)$ on the logits x.
+
+    Numba functions are warmed-up the first time the class is initialized with the
+    'saga' optimizer.
+
+    A penalty (one of ["mcp", "lasso", "group_lasso", "ridge"]) is applied to the
+    intercept (b), vector scaling (v) and off diagonal (M) parameters, with respective
+    regularization strength lambda_intercept*(k**rho)/(n**tau),
+    lambda_diagonal*(k**rho)/(n**tau) and lambda_off_diagonal*((k*(k-1))**rho)/(n**tau).
+
+    Instead of fitting the global scaling parameter jointly with the other parameters,
+    the logits are pre-processed with temperature scaling (with no regularization) and a
+    is then fixed to a=1.
+
+    [!] The 'bfgs' solver only supports 'ridge' penalty.
+    [!] If 'bfgs' solver is used, max_iter and tol are ignored and default torchmin
+    parameters are used.
+
+    Reference
+    ---------
+    Eugène Berta, David Holzmüller, Michael I. Jordan, and Francis Bach. Structured
+    matrix scaling for multi-class calibration. International Conference on Artificial
+    Intelligence and Statistics, 2026.
+    """
+
+    _warmed_up_saga = False
+    _ALLOWED_PENALTIES = [None, "mcp", "lasso", "group_lasso", "ridge"]
+    _ALLOWED_OPTIMIZERS = ["saga", "bfgs"]
+
+    def __init__(
+        self,
+        penalty: str = "ridge",
+        rho: float = 1.0,
+        tau: float = 1.0,
+        lambda_intercept: float = 1.0,
+        lambda_diagonal: float = 1.0,
+        lambda_off_diagonal: float = 1.0,
+        opt: str = "bfgs",
+        max_iter: int = 500,
+        tol: float = 1e-5,
+        print_init_info: bool = True,
+    ):
+        super().__init__()
+
+        if penalty not in self._ALLOWED_PENALTIES:
+            raise ValueError(
+                f"Penalty '{penalty}' not recognized. Choose from {self._ALLOWED_PENALTIES}."
+            )
+
+        if opt not in self._ALLOWED_OPTIMIZERS:
+            raise ValueError(
+                f"Optimizer '{opt}' not recognized. Choose from {self._ALLOWED_OPTIMIZERS}."
+            )
+
+        if opt == "bfgs" and penalty != "ridge":
+            raise ValueError(
+                "The 'bfgs' optimizer only supports the 'ridge' penalty, use 'saga' instead."
+            )
+
+        if opt == "saga":
+            try:
+                from numba import njit
+            except ImportError as e:
+                raise ImportError(
+                    "The 'numba' package is required for optimizer 'saga'."
+                ) from e
+        
+            if not SMSCalibrator._warmed_up_saga:
+                if print_init_info:
+                    logger.info(
+                        "First SMSCalibrator instantiation with 'saga' - warming up Numba functions..."
+                    )
+                warm_up_matrix_scaling()
+                SMSCalibrator._warmed_up_saga = True
+                if print_init_info:
+                    logger.info("Warmed up!")
+
+        self.penalty = penalty
+        self.rho = rho
+        self.tau = tau
+        self.lambda_intercept = lambda_intercept
+        self.lambda_diagonal = lambda_diagonal
+        self.lambda_off_diagonal = lambda_off_diagonal
+        self.max_iter = max_iter
+        self.tol = tol
+        self.opt = opt
+        self.print_init_info = print_init_info
+
+    def _get_logits(
+        self, X: np.ndarray, append_bias: bool = True
+    ) -> tuple[np.ndarray, int, int]:
+        logits = multiclass_probs_to_logits(X)
+        if append_bias:
+            # Adding a constant term to logit vectors to fit the intercept.
+            logits = np.hstack([logits, np.ones((len(logits), 1))])
+        return logits
+
+    def _fit_impl(self, X: np.ndarray, y: np.ndarray) -> None:
+        """Fits the calibration matrix W."""
+        validate_probabilities(X)
+
+        n, k = X.shape
+
+        from .factory import get_calibrator
+
+        self.ts_ = get_calibrator("ts-mix")
+        self.ts_.fit(X, y)
+        scaled_probs = self.ts_.predict_proba(X)
+
+        reg_intercept = self.lambda_intercept * (k**self.rho) / (n**self.tau)
+        reg_diagonal = self.lambda_diagonal * (k**self.rho) / (n**self.tau)
+        reg_off_diagonal = (
+            self.lambda_off_diagonal * ((k * (k - 1)) ** self.rho) / (n**self.tau)
+        )
+
+        if self.opt == "saga":
+            logits = self._get_logits(scaled_probs)
+            self.W_ = fit_matrix_scaling(
+                logits,
+                y,
+                penalty=self.penalty,
+                max_iter=self.max_iter,
+                tol=self.tol,
+                init_scaling=1.0,
+                reg_intercept=reg_intercept,
+                reg_diagonal=reg_diagonal,
+                reg_off_diagonal=reg_off_diagonal,
+            )
+
+        elif self.opt == "bfgs":
+            logits = self._get_logits(scaled_probs, append_bias=False)
+            self.W_ = self._fit_bfgs(
+                torch.tensor(logits),
+                torch.as_tensor(y, dtype=torch.long),
+                num_classes=k,
+                reg_intercept=reg_intercept,
+                reg_diagonal=reg_diagonal,
+                reg_off_diagonal=reg_off_diagonal,
+            )
+
+    def _fit_bfgs(
+        self,
+        logits: torch.Tensor,
+        y: torch.Tensor,
+        num_classes,
+        reg_intercept,
+        reg_diagonal,
+        reg_off_diagonal,
+    ):
+        K = num_classes
+
+        initial_params = torch.zeros(
+            K * (K + 1), device=logits.device, dtype=logits.dtype
+        )
+
+        def loss(params):
+            W_delta = params[: K * K].view(K, K)
+            b = params[K * K :]
+
+            scaled_logits = logits + torch.nn.functional.linear(logits, W_delta, b)
+            ce = torch.nn.functional.cross_entropy(scaled_logits, y)
+
+            W_diag = W_delta.diagonal()
+            r_i = reg_intercept * b.pow(2).sum()
+            r_d = reg_diagonal * W_diag.pow(2).sum()
+            r_o = reg_off_diagonal * (W_delta.pow(2).sum() - W_diag.pow(2).sum())
+
+            return ce + r_i + r_d + r_o
+
+        method = "l-bfgs" if initial_params.numel() > 1000 else "bfgs"
+        res = torchmin.minimize(loss, initial_params, method=method)
+
+        flat_result = res.x
+        W_delta_final = flat_result[: K * K].view(K, K)
+        b_final = flat_result[K * K :]
+
+        with torch.no_grad():
+            W_final = (
+                torch.eye(K, device=logits.device, dtype=logits.dtype) + W_delta_final
+            )
+            return torch.hstack([W_final, b_final.unsqueeze(1)]).detach().cpu().numpy()
+
+    def _predict_proba_impl(self, X: np.ndarray) -> np.ndarray:
+        scaled_probs = self.ts_.predict_proba(X)
+        logits = self._get_logits(scaled_probs)
+        calibrated_logits = logits @ self.W_.T
+        calibrated_logits -= np.max(calibrated_logits, axis=1, keepdims=True)
+        exp_logits = np.exp(calibrated_logits)
+        return exp_logits / np.sum(exp_logits, axis=1, keepdims=True)
+
+
+def logloss_np(y_true: np.ndarray, y_proba: np.ndarray):
+    return -np.mean(np.take_along_axis(np.log(y_proba), y_true[:, None], axis=1))
+
+
+class DirichletCalibrator(Calibrator):
+    """
+    From https://github.com/dirichletcal/dirichlet_python
+
+    Reference
+    ---------
+    Meelis Kull, Miquel Perello Nieto, Markus Kängsepp, Telmo Silva Filho, Hao Song, and
+    Peter Flach. Beyond temperature scaling: Obtaining well-calibrated multi-class
+    probabilities with Dirichlet calibration. Advances in Neural Information Processing
+    Systems, 2019.
+    """
+    def __init__(
+        self,
+        n_cv: int = 0,
+        use_odir: bool = False,
+        reg_lambda: float = 0.0,
+        reg_mu: Optional[float] = None,
+        reg_lambda_grid: Optional[List[float]] = None,
+        reg_mu_grid: Optional[List[float]] = None,
+    ):
+        super().__init__()
+        try:
+            from dirichletcal.calib.fulldirichlet import FullDirichletCalibrator
+        except ImportError as e:
+            raise ImportError("The 'dirichletcal' package is required.") from e
+
+        self.n_cv = n_cv
+        self.use_odir = use_odir
+        self.reg_lambda = reg_lambda
+        self.reg_mu = reg_mu
+        self.reg_lambda_grid = reg_lambda_grid
+        self.reg_mu_grid = reg_mu_grid
+
+    def _fit_impl(self, X: np.ndarray, y: np.ndarray) -> None:
+        from dirichletcal.calib.fulldirichlet import FullDirichletCalibrator
+
+        if self.n_cv == 0:
+            self.cal_ = FullDirichletCalibrator(
+                reg_lambda=self.reg_lambda, reg_mu=self.reg_mu
+            )
+        elif self.n_cv >= 2:
+            reg_lambda_grid = self.reg_lambda_grid or [1e-1, 1e-2, 1e-3, 1e-4, 1e-5]
+            reg_mu_grid = self.reg_mu_grid or [1e-1, 1e-2, 1e-3, 1e-4, 1e-5]
+            calibrator = FullDirichletCalibrator(
+                reg_lambda=reg_lambda_grid,
+                reg_mu=reg_mu_grid if self.use_odir else None,
+            )
+            calibrator = WrapCalibratorAsClassifier(calibrator)
+            skf = StratifiedKFold(n_splits=self.n_cv, shuffle=True, random_state=0)
+            self.cal_ = GridSearchCV(
+                calibrator,
+                param_grid={
+                    "calib__reg_lambda": reg_lambda_grid,
+                    "calib__reg_mu": reg_mu_grid if self.use_odir else [None],
+                },
+                cv=skf,
+                # use this because scikit-learn's logloss scorer fails in case of
+                # missing classes
+                scoring=lambda est, X_test, y_test: -logloss_np(
+                    y_test, est.predict_proba(X_test)
+                ),
+            )
+        else:
+            raise ValueError(f"n_cv must be either 0 or >=2, but is {self.n_cv}")
+
+        self.cal_.fit(X, y)
+
+    def _predict_proba_impl(self, X: np.ndarray) -> np.ndarray:
+        return self.cal_.predict_proba(X)
+
+
+class LogisticCalibrator(Calibrator):
+    """
+    By default, uses 'quadratic-scaling' for binary classification and 'sms' for
+    multiclass classification.
+    """
+
+    VALID_BINARY_TYPES = ["linear", "affine", "quadratic"]
+    VALID_MULTICLASS_TYPES = ["svs", "sms"]
+
+    def __init__(self, binary_type: str = "quadratic", multiclass_type: str = "sms"):
+        super().__init__()
+
+        if binary_type not in self.VALID_BINARY_TYPES:
+            raise ValueError(
+                f"Invalid binary type '{binary_type}'. Must be one of {self.VALID_BINARY_TYPES}"
+            )
+        self.binary_type = binary_type
+
+        if multiclass_type not in self.VALID_MULTICLASS_TYPES:
+            raise ValueError(
+                f"Invalid multiclass type '{multiclass_type}'. Must be one of {self.VALID_MULTICLASS_TYPES}"
+            )
+        self.multiclass_type = multiclass_type
+
+    def _fit_impl(self, X: np.ndarray, y: np.ndarray) -> None:
+        n_classes = len(np.unique(y))
+
+        if n_classes == 2:
+            if self.binary_type == "affine":
+                self.cal_ = BinaryLogisticCalibrator(type="affine")
+            elif self.binary_type == "linear":
+                self.cal_ = BinaryLogisticCalibrator(type="linear")
+            elif self.binary_type == "quadratic":
+                self.cal_ = BinaryLogisticCalibrator(type="quadratic")
+
+        else:
+            if self.multiclass_type == "svs":
+                self.cal_ = SVSCalibrator()
+            elif self.multiclass_type == "sms":
+                self.cal_ = SMSCalibrator()
+
+        self.cal_.fit(X, y)
+        return self
+
+    def predict(self, X):
+        if self.cal_ is None:
+            raise RuntimeError("LogisticCalibrator must be fitted before prediction.")
+        return self.cal_.predict(X)
+
+    def _predict_proba_impl(self, X):
+        if self.cal_ is None:
+            raise RuntimeError("LogisticCalibrator must be fitted before prediction.")
+        return self.cal_.predict_proba(X)
+
+
+class BinaryLogisticCalibrator(Calibrator):
+    """
+    Binary post-hoc calibration with linear, affine or quadratic scaling
+    on the binary logits using sklearn's LogisticRegression.
+
+    This class fits a model of the form:
+    P(y=1) = sigma(calibrated_logit)
+
+    Where `calibrated_logit` depends on the `type`:
+    - 'linear':    calibrated_logit = w * logit (Temperature scaling with inverse
+    temperature)
+    - 'affine':    calibrated_logit = b + w * logit (Platt scaling)
+    - 'quadratic': calibrated_logit = b + w1 * logit + w2 * logit^2
+    """
+
+    VALID_TYPES = ["linear", "affine", "quadratic", "beta"]
+
+    def __init__(self, type: str = "affine"):
+        """
+        Args:
+            type (str, optional): The type of scaling.
+                One of ['linear', 'affine', 'quadratic'].
+                Defaults to 'affine'.
+        """
+        super().__init__()
+
+        if type not in self.VALID_TYPES:
+            raise ValueError(
+                f"Invalid type '{type}'. Must be one of {self.VALID_TYPES}"
+            )
+        self.type = type
+
+    def _get_logits(self, X: np.ndarray) -> np.ndarray:
+        log_p, log_1_minus_p = binary_probs_to_logits(X)
+        logits = log_p - log_1_minus_p
+
+        if self.type == "linear":
+            return logits
+
+        ones = np.ones_like(logits)
+
+        if self.type == "affine":
+            return np.hstack([ones, logits])
+
+        elif self.type == "quadratic":
+            return np.hstack([ones, logits, np.square(logits)])
+
+        if self.type == "beta":  # TODO Check that this is indeed beta
+            return np.hstack([ones, log_p, log_1_minus_p])
+
+    def _fit_impl(self, X: np.ndarray, y: np.ndarray) -> None:
+        """Fits the logistic regression calibrator."""
+        from sklearn.linear_model import LogisticRegression
+
+        self.cal_ = LogisticRegression(
+            penalty=None, fit_intercept=False, solver="lbfgs"
+        )
+
+        features = self._get_logits(X)
+        self.cal_.fit(features, y)
+
+    def _predict_proba_impl(self, X: np.ndarray) -> np.ndarray:
+        features = self._get_logits(X)
+        return self.cal_.predict_proba(features)
+
+
+class RegularizedAffineScalingCalibrator(Calibrator):
+    """
+    Binary post-hoc calibration with Platt scaling (~binary logistic regression)
+    $sigmoid(ax+b)$ on the logits x.
+    Uses the SAGA algorithm to fit the scaling and intercept parameters a & b.
+    Numba functions are warmed-up the first time the class is initialized (~1s).
+    A penalty (one of ["mcp", "lasso", "ridge"]) can be applied to the intercept
+    parameter b, with regularization strength lambda_intercept.
+    To apply no regularization, leave penalty to None.
+    """
+
+    _warmed_up = False
+
+    def __init__(
+        self,
+        penalty: str = None,
+        lambda_intercept: float = 0.0,
+        max_iter: int = 200,
+        tol: float = 1e-4,
+        print_init_info: bool = True,
+    ):
+        super().__init__()
+        self.penalty = penalty
+        self.lambda_intercept = lambda_intercept
+        self.max_iter = max_iter
+        self.tol = tol
+        self.print_init_info = print_init_info
+
+        try:
+            from numba import njit
+        except ImportError as e:
+            raise ImportError("The 'numba' package is required.") from e
+
+        if not RegularizedAffineScalingCalibrator._warmed_up:
+            if self.print_init_info:
+                print(
+                    "First RegularizedAffineScalingCalibrator instantiation - warming up Numba functions..."
+                )
+            warm_up_affine_scaling()
+            RegularizedAffineScalingCalibrator._warmed_up = True
+            if self.print_init_info:
+                print("Warmed up!")
+
+    def _get_logits(self, X: np.ndarray) -> np.ndarray:
+        log_p, log_1_minus_p = binary_probs_to_logits(X)
+        logits = log_p - log_1_minus_p
+        ones = np.ones_like(logits)
+        return np.hstack([ones, logits])
+
+    def _fit_impl(self, X: np.ndarray, y: np.ndarray) -> None:
+        logits = self._get_logits(X)
+        self.w_ = fit_affine_scaling(
+            logits,
+            y,
+            penalty=self.penalty,
+            reg_intercept=self.lambda_intercept,
+            max_iter=self.max_iter,
+            tol=self.tol,
+        )
+
+    def _predict_proba_impl(self, X: np.ndarray) -> np.ndarray:
+        logits = self._get_logits(X)
+        logits = logits.dot(self.w_).reshape(-1, 1)
+        probs = 1.0 / (1.0 + np.exp(-logits))
+        return np.hstack([1 - probs, probs])
+
+
+class RegularizedQuadraticScalingCalibrator(Calibrator):
+    """
+    Binary post-hoc calibration with quadratic scaling $sigmoid(a + bx + cx^2)$ on the
+    logits x.
+    Uses the SAGA algorithm to fit the intercept, scaling and quadratic parameters a, b
+    & c.
+    Numba functions are warmed-up the first time the class is initialized (~1s).
+    A penalty (one of ["mcp", "lasso", "ridge"]) can be applied to the intercept and
+    quadratic parameters a & c,
+    with respective regularization strength lambda_intercept and lambda_quadratic.
+    To apply no regularization, leave penalty to None.
+    """
+
+    _warmed_up = False
+
+    def __init__(
+        self,
+        penalty: str = None,
+        lambda_intercept: float = 0.0,
+        lambda_quadratic: float = 0.0,
+        max_iter: int = 200,
+        tol: float = 1e-4,
+        print_init_info: bool = True,
+    ):
+        super().__init__()
+        self.penalty = penalty
+        self.lambda_intercept = lambda_intercept
+        self.lambda_quadratic = lambda_quadratic
+        self.max_iter = max_iter
+        self.tol = tol
+        self.print_init_info = print_init_info
+
+        try:
+            from numba import njit
+        except ImportError as e:
+            raise ImportError("The 'numba' package is required.") from e
+
+        if not RegularizedQuadraticScalingCalibrator._warmed_up:
+            if self.print_init_info:
+                print(
+                    "First RegularizedQuadraticScalingCalibrator instantiation - warming up Numba functions..."
+                )
+            warm_up_quadratic_scaling()
+            RegularizedQuadraticScalingCalibrator._warmed_up = True
+            if self.print_init_info:
+                print("Warmed up!")
+
+    def _get_logits(self, X: np.ndarray) -> np.ndarray:
+        log_p, log_1_minus_p = binary_probs_to_logits(X)
+        logits = log_p - log_1_minus_p
+        ones = np.ones_like(logits)
+        return np.hstack([ones, logits, np.square(logits)])
+
+    def _fit_impl(self, X: np.ndarray, y: np.ndarray) -> None:
+        logits = self._get_logits(X)
+        self.w_ = fit_quadratic_scaling(
+            logits,
+            y,
+            penalty=self.penalty,
+            reg_intercept=self.lambda_intercept,
+            reg_quadratic=self.lambda_quadratic,
+            max_iter=self.max_iter,
+            tol=self.tol,
+        )
+
+    def _predict_proba_impl(self, X: np.ndarray) -> np.ndarray:
+        logits = self._get_logits(X)
+        logits = logits.dot(self.w_).reshape(-1, 1)
+        probs = 1.0 / (1.0 + np.exp(-logits))
+        return np.hstack([1 - probs, probs])
+
+
+class BetacalCalibrator(Calibrator):
+    """
+    From https://github.com/betacal/python/
+
+    Reference
+    ---------
+    Meelis Kull, Telmo Silva Filho, and Peter Flach. Beta calibration: a well-founded
+    and easily implemented improvement on logistic calibration for binary classifiers.
+    International Conference on Artificial Intelligence and Statistics, 2017.
+    """
+    def __init__(self, parameters="abm"):
+        super().__init__()
+        try:
+            from betacal import BetaCalibration
+        except ImportError as e:
+            raise ImportError("The 'betacal' package is required.") from e
+        self.parameters = parameters
+
+    def _fit_impl(self, X: np.ndarray, y: np.ndarray) -> None:
+        probs = flatten_binary_probs(X)
+        from betacal import BetaCalibration
+
+        self.cal_ = BetaCalibration(parameters=self.parameters)
+        self.cal_.fit(probs, y)
+
+    def _predict_proba_impl(self, X: np.ndarray) -> np.ndarray:
+        probs = flatten_binary_probs(X)
+        # If logistic has coef[0] = 0 after .fit, an exeption occurs in .predict,
+        # very rare
+        try:
+            probs = self.cal_.predict(probs)
+        except Exception:
+            # Manually, following the intial implem, removing the problematic if
+            df = probs.reshape(-1, 1)
+            eps = np.finfo(df.dtype).eps
+            df = np.clip(df, eps, 1 - eps)
+
+            x = np.hstack((df, 1.0 - df))
+            x = np.log(x)
+            x[:, 1] *= -1
+            probs = self.cal_.calibrator_.lr_.predict_proba(x)[:, 1]
+
+        probs = expand_binary_probs(probs)
+        return probs
diff --git a/probmetrics/calibrators/nonparametric.py b/probmetrics/calibrators/nonparametric.py
new file mode 100644
index 0000000..6068956
--- /dev/null
+++ b/probmetrics/calibrators/nonparametric.py
@@ -0,0 +1,346 @@
+import torch
+import warnings
+import numpy as np
+
+from .base import Calibrator
+from .sklearn import SklearnCalibrator
+
+from probmetrics.utils import flatten_binary_probs, expand_binary_probs
+from probmetrics.distributions import CategoricalDistribution, CategoricalProbs
+
+
+class CenteredIsotonicRegressionCalibrator(Calibrator):
+    """
+    From https://github.com/mathijs02/cir-model
+
+    Reference
+    ---------
+    Assaf P. Oron and Nancy Flournoy. Centered isotonic regression: point and interval
+    estimation for dose-response studies. Statistics in Biopharmaceutical Research,
+    9(3):258-267, 2017.
+    """
+
+    def __init__(self):
+        super().__init__()
+        try:
+            from cir_model import CenteredIsotonicRegression
+        except ImportError as e:
+            raise ImportError("The 'cir_model' package is required.") from e
+
+    def _fit_impl(self, X: np.ndarray, y: np.ndarray) -> None:
+        # TODO
+        # We have to use float64 since with float32 it can happen that somewhere
+        # internally a rounding error occurs (?) and interp1d thinks that it is asked to
+        # extrapolate at the boundary value, which causes it to output nan values, which
+        # are seen as a separate possible value but (y==value) is empty because nan==nan
+        # is false, so an error occurs because the method attempts to take an average of
+        # an empty set.
+        from cir_model import CenteredIsotonicRegression
+
+        self.cal_ = CenteredIsotonicRegression()
+
+        probs = flatten_binary_probs(X)
+        probs = probs.astype(np.float64)
+        y = y.astype(np.float64)
+        self.cal_.fit(probs, y)
+        self.min_ = np.min(probs)
+        self.max_ = np.max(probs)
+
+    def _predict_proba_impl(self, X):
+        probs = flatten_binary_probs(X)
+        probs = probs.astype(np.float64)
+        # have to clip since CenteredIsotonicRegression refuses to extrapolate (?)
+        probs = np.clip(probs, self.min_, self.max_)
+        probs = self.cal_.transform(probs)
+        probs = np.clip(probs, 0.0, 1.0)
+        probs = expand_binary_probs(probs)
+        return probs
+
+
+class BinaryVennAbersCalibrator(Calibrator):
+    """
+    From https://github.com/ip200/venn-abers
+
+    Reference
+    ---------
+    Vladimir Vovk, Ivan Petej, and Valentina Fedorova. Large-scale probabilistic
+    predictors with and without guarantees of validity. Advances in Neural Information
+    Processing Systems, 2015.
+    """
+
+    def __init__(self):
+        super().__init__()
+        try:
+            from venn_abers import VennAbers
+        except ImportError as e:
+            raise ImportError("The 'venn_abers' package is required.") from e
+
+    def _fit_impl(self, X: np.ndarray, y: np.ndarray) -> None:
+        from venn_abers import VennAbers
+
+        self.cal_ = VennAbers()
+        probs = flatten_binary_probs(X)
+        probs = expand_binary_probs(probs)
+        self.cal_.fit(probs, y)
+
+    def _predict_proba_impl(self, X):
+        probs = flatten_binary_probs(X)
+        probs = expand_binary_probs(probs)
+        return self.cal_.predict_proba(probs)[0]
+
+
+class VennAbersCalibrator(Calibrator):
+    """
+    From https://github.com/ip200/venn-abers
+
+    Reference
+    ---------
+    Vladimir Vovk, Ivan Petej, and Valentina Fedorova. Large-scale probabilistic
+    predictors with and without guarantees of validity. Advances in Neural Information
+    Processing Systems, 2015.
+    """
+
+    def __init__(self, use_ovo: bool = False):
+        super().__init__()
+        try:
+            from venn_abers import VennAbersCalibrator
+        except ImportError as e:
+            raise ImportError("The 'venn_abers' package is required.") from e
+        self.use_ovo = use_ovo
+
+    def _fit_impl(self, X: np.ndarray, y: np.ndarray) -> None:
+        from venn_abers import VennAbersCalibrator
+
+        self.cal_ = VennAbersCalibrator()
+        self.X_ = np.copy(X)
+        self.y_ = np.copy(y)
+
+    def _predict_proba_impl(self, X):
+        return self.cal_.predict_proba(
+            p_cal=self.X_,
+            y_cal=self.y_,
+            p_test=X,
+            p0_p1_output=False,
+            va_type="one_vs_one" if self.use_ovo else "one_vs_all",
+        )
+
+
+class NetcalENIRCalibrator(Calibrator):
+    """
+    From https://github.com/EFS-OpenSource/calibration-framework
+
+    Reference
+    ---------
+    Naeini, Mahdi Pakdaman and Cooper, Gregory F.. Binary classifier calibration using
+    an ensemble of near isotonic regression models. International Conference on Data
+    Mining, 2016.
+    """
+    def __init__(self):
+        super().__init__()
+        try:
+            from netcal.binning import ENIR
+        except ImportError as e:
+            raise ImportError("The 'netcal' package is required.") from e
+        self.defaulted_to_IR = False
+
+    def _fit_impl(self, X: np.ndarray, y: np.ndarray) -> None:
+        from netcal.binning import ENIR
+
+        self.cal_ = ENIR()
+        probs = flatten_binary_probs(X)
+        # netcal.binning.ENIR returns an error when .fit() is called with separable data
+        # (AUC=1).
+        try:
+            self.cal_.fit(probs, y)
+        except Exception as e:
+            warnings.warn(
+                f"ENIR failed during .fit(), defaulting to Isotonic regression."
+            )
+            self.cal_ = SklearnCalibrator(method="isotonic", cv="prefit")
+            self.cal_.fit(probs, y)
+            self.defaulted_to_IR = True
+
+    def _predict_proba_impl(self, X: np.ndarray) -> np.ndarray:
+        probs = flatten_binary_probs(X)
+        if self.defaulted_to_IR:
+            probs = self.cal_.predict_proba(probs)
+        else:
+            probs = self.cal_.transform(probs)
+        probs = expand_binary_probs(probs)
+        return probs
+
+
+class KernelCalibrator(Calibrator):
+    """
+    Beta / Dirichlet kernel from
+    https://github.com/tpopordanoska/ece-kde/blob/main/ece_kde.py, initially used to
+    estimate calibration errors, converted to a post-hoc calibration method here.
+
+    Reference
+    ---------
+    Teodora Popordanoska, Raphael Sayer, and Matthew Blaschko. A consistent and
+    differentiable Lp canonical calibration error estimator. Advances in Neural
+    Information Processing Systems, 2022.
+    """
+
+    def __init__(self, bandwidth=None, device="cpu"):
+        """
+        :param bandwidth: The bandwidth of the kernel. If None, LOO MLE is used to find
+        it during fit.
+        :param device: The device type: "cpu" or "cuda"
+        """
+        super().__init__()
+        self.bandwidth = bandwidth
+        self.device = device
+
+    def _get_heuristic_bandwidth(self, N: int, num_classes: int) -> float:
+        """Select a default bandwidth using Scott's Rule adapted for the variance of
+        Beta/Dirichlet kernels on a simplex."""
+        # Intrinsic dimensionality: d = 1 for binary, d = C - 1 for multiclass
+        d = num_classes - 1 if num_classes > 1 else 1
+        return N ** (-2.0 / (d + 4.0))
+
+    def _get_bandwidth(self, f):
+        """
+        Select a bandwidth based on maximizing the leave-one-out likelihood (LOO MLE).
+        """
+        bandwidths = torch.cat(
+            (
+                torch.logspace(start=-5, end=-1, steps=15),
+                torch.linspace(0.2, 1, steps=5),
+            )
+        ).to(self.device)
+
+        max_b = -1
+        max_l = float("-inf")
+        n = len(f)
+
+        for b in bandwidths:
+            log_kern = self._get_kernel_matrix(f, f, b, loo=True)
+            log_fhat = torch.logsumexp(log_kern, 1) - torch.log(
+                torch.tensor(n - 1.0, device=self.device)
+            )
+            l = torch.sum(log_fhat).item()
+
+            if l > max_l:
+                max_l = l
+                max_b = b.item()
+
+        return max_b
+
+    def _get_kernel_matrix(self, z, zi, bandwidth, loo=False):
+        """
+        Computes the pairwise log-kernel matrix between evaluation points `z` and fitted
+        points `zi`.
+        """
+        # Clamp inputs safely to prevent log(0)
+        z_c = torch.clamp(z, 1e-10, 1 - 1e-10)
+        zi_c = torch.clamp(zi, 1e-10, 1 - 1e-10)
+
+        if z.shape[1] == 1:
+            # Beta Kernel for Binary Classification
+            z_uns = z_c.unsqueeze(1)  # Shape: (N, 1, 1)
+            zi_uns = zi_c.unsqueeze(0)  # Shape: (1, M, 1)
+            p = zi_uns / bandwidth + 1
+            q = (1 - zi_uns) / bandwidth + 1
+
+            log_beta = torch.lgamma(p) + torch.lgamma(q) - torch.lgamma(p + q)
+            log_num = (p - 1) * torch.log(z_uns) + (q - 1) * torch.log(1 - z_uns)
+            log_kern = (log_num - log_beta).squeeze(-1)  # Shape: (N, M)
+        else:
+            # Dirichlet Kernel for Multiclass Classification
+            alphas = zi_c / bandwidth + 1  # Shape: (M, C)
+            log_beta = torch.sum(torch.lgamma(alphas), dim=1) - torch.lgamma(
+                torch.sum(alphas, dim=1)
+            )  # (M,)
+            log_num = torch.matmul(torch.log(z_c), (alphas - 1).T)  # Shape: (N, M)
+            log_kern = log_num - log_beta.unsqueeze(0)  # Shape: (N, M)
+
+        if loo:
+            # Trick: set diagonal to -inf for leave-one-out calculation
+            diag_idx = torch.arange(len(z))
+            log_kern[diag_idx, diag_idx] = torch.finfo(torch.float).min
+
+        return log_kern
+
+    def _fit_torch_impl(
+        self,
+        y_pred: CategoricalDistribution,
+        y_true_labels: torch.Tensor,
+        max_fit_samples: int = 10000,
+    ):
+        """
+        Fits the kernel function by storing the calibration set and calculating the
+        bandwidth.
+        """
+        with torch.no_grad():
+            probs = y_pred.get_probs()
+
+            if probs.ndim == 1:
+                probs = probs.unsqueeze(1)
+
+            self.num_classes_ = probs.shape[1]
+
+            # TODO: SPEED FIX: Subsample the fit set
+            N_total = probs.shape[0]
+            if N_total > max_fit_samples:
+                indices = torch.randperm(N_total, device=self.device)[:max_fit_samples]
+                fit_probs = probs[indices]
+                fit_labels = y_true_labels[indices]
+            else:
+                fit_probs = probs
+                fit_labels = y_true_labels
+
+            # TODO: we use heuristic bandwith instead of tuned bandwidth
+            if self.bandwidth is None:
+                self.bandwidth = self._get_heuristic_bandwidth(
+                    len(fit_probs), self.num_classes_
+                )
+            # if self.bandwidth is None:
+            #     self.bandwidth = self._get_bandwidth(fit_probs)
+
+            self.X_fit_ = fit_probs
+            self.y_fit_ = fit_labels
+
+    def _predict_proba_torch_impl(
+        self, y_pred: CategoricalDistribution
+    ) -> CategoricalDistribution:
+        """
+        Predicts calibrated probabilities by evaluating the Nadaraya-Watson estimator.
+        """
+        with torch.no_grad():
+            X_t = y_pred.get_probs()
+            is_1d = X_t.ndim == 1
+            if is_1d:
+                X_t = X_t.unsqueeze(1)
+
+            log_kern = self._get_kernel_matrix(
+                X_t, self.X_fit_, self.bandwidth, loo=False
+            )
+            log_den = torch.logsumexp(log_kern, dim=1)
+
+            if self.num_classes_ == 1:
+                # Binary Formulation
+                log_kern_y = log_kern.clone()
+                # Mask out entries where y != 1 (effectively multiplying by y in log
+                # space)
+                log_kern_y[:, self.y_fit_ == 0] = torch.finfo(torch.float).min
+
+                log_num = torch.logsumexp(log_kern_y, dim=1)
+                prob_1 = torch.exp(log_num - log_den)
+                prob_0 = 1.0 - prob_1
+                return CategoricalProbs(torch.stack([prob_0, prob_1], dim=1))
+
+            else:
+                # Canonical Multiclass Formulation
+                calibrated_probs = []
+                for k in range(self.num_classes_):
+                    log_kern_k = log_kern.clone()
+                    # Mask out entries where y != k
+                    log_kern_k[:, self.y_fit_ != k] = torch.finfo(torch.float).min
+                    log_num_k = torch.logsumexp(log_kern_k, dim=1)
+                    calibrated_probs.append(torch.exp(log_num_k - log_den))
+
+                res = torch.stack(calibrated_probs, dim=1)
+                # return res.cpu().numpy()
+                return CategoricalProbs(res)
diff --git a/probmetrics/calibrators/orderpreserving.py b/probmetrics/calibrators/orderpreserving.py
new file mode 100644
index 0000000..422ecc5
--- /dev/null
+++ b/probmetrics/calibrators/orderpreserving.py
@@ -0,0 +1,199 @@
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.optim as optim
+from torch.utils.data import DataLoader, TensorDataset
+
+from .base import Calibrator
+
+
+class _FCModel(nn.Module):
+    """
+    From https://github.com/AmirooR/IntraOrderPreservingCalibration
+
+    Standard Fully Connected base model as specified by 'config.json'.
+    Maps (n_classes -> hidden -> n_classes).
+    """
+
+    def __init__(self, in_features: int, hidden_sizes: list, out_features: int):
+        super().__init__()
+        layers = []
+        prev = in_features
+        for h in hidden_sizes:
+            layers.append(nn.Linear(prev, h))
+            layers.append(nn.ReLU())
+            prev = h
+        layers.append(nn.Linear(prev, out_features))
+        self.model = nn.Sequential(*layers)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.model(x)
+
+
+class _OrderPreservingModel(nn.Module):
+    """
+    From https://github.com/AmirooR/IntraOrderPreservingCalibration
+
+    Internal PyTorch module adapting the provided OrderPreservingModel.
+    """
+
+    def __init__(
+        self,
+        base_model: nn.Module,
+        invariant: bool = True,
+        residual: bool = False,
+        num_classes: int = 10,
+        m_activation=F.softplus,
+    ):
+        super().__init__()
+        self.base_model = base_model
+        self.num_classes = num_classes
+        self.invariant = invariant
+        self.m_activation = m_activation
+        self.residual = residual
+
+    def compute_u(self, sorted_logits: torch.Tensor) -> torch.Tensor:
+        diffs = sorted_logits[:, :-1] - sorted_logits[:, 1:]
+        diffs = torch.cat(
+            (
+                diffs,
+                torch.ones((diffs.shape[0], 1), dtype=diffs.dtype, device=diffs.device),
+            ),
+            dim=1,
+        )
+        assert torch.all(diffs >= 0), f"diffs should be positive: {diffs}"
+        return diffs.flip([1])
+
+    def forward(self, logits: torch.Tensor) -> torch.Tensor:
+        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
+        _, unsorted_indices = torch.sort(sorted_indices, descending=False)
+
+        # [B, C]
+        u = self.compute_u(sorted_logits)
+        inp = sorted_logits if self.invariant else logits
+        m = self.base_model(inp)
+
+        m[:, 1:] = self.m_activation(m[:, 1:].clone())
+        m[:, 0] = 0
+        um = torch.cumsum(u * m, 1).flip([1])
+        out = torch.gather(um, 1, unsorted_indices)
+
+        if self.residual:
+            out = out + logits
+        return out
+
+
+class IntraOrderPreservingCalibrator(Calibrator):
+    """
+    TODO work in progress, slow for now.
+
+    From https://github.com/AmirooR/IntraOrderPreservingCalibration
+
+    Order-Preserving Calibrator using a neural network mapping.
+
+    Inputs (X) must be probabilities.
+    Supports both multi-class (2D) and binary (1D or 2D) probabilities.
+
+    Reference
+    ---------
+    Amir Rahimi, Amirreza Shaban, Ching-An Cheng, Richard Hartley, Byron Boots. Intra
+    Order-preserving Functions for Calibration of Multi-Class Neural Networks. Advances
+    in Neural Information Processing Systems, 2020.
+    """
+    def __init__(
+        self,
+        invariant: bool = True,
+        residual: bool = False,
+        hidden_sizes: tuple = (150, 150),
+        num_epochs: int = 40,
+        lr: float = 0.005,
+        weight_decay: float = 0.0005,
+        batch_size: int = 256,
+    ):
+        super().__init__()
+
+        self.invariant = invariant
+        self.residual = residual
+        self.hidden_sizes = hidden_sizes
+        self.num_epochs = num_epochs
+        self.lr = lr
+        self.weight_decay = weight_decay
+        self.batch_size = batch_size
+        self.device = "cuda" if torch.cuda.is_available() else "cpu"
+
+    def _fit_impl(self, X: np.ndarray, y: np.ndarray) -> None:
+        """Fits the calibrator using Adam and CrossEntropyLoss."""
+
+        # Convert 1D binary probabilities to 2D
+        if X.ndim == 1 or (X.ndim == 2 and X.shape[1] == 1):
+            X = np.column_stack((1.0 - X.flatten(), X.flatten()))
+
+        self.n_classes_ = X.shape[1]
+
+        # CrossEntropyLoss expects 1D integer class indices
+        if y.ndim == 2 and y.shape[1] > 1:
+            y = np.argmax(y, axis=1)
+
+        # Reverse-engineer logits safely from probabilities
+        logits = np.log(np.clip(X, 1e-15, 1.0))
+
+        X_tensor = torch.tensor(logits, dtype=torch.float32)
+        y_tensor = torch.tensor(y, dtype=torch.long)
+
+        dataset = TensorDataset(X_tensor, y_tensor)
+        bs = self.batch_size if len(X) > self.batch_size else len(X)
+        loader = DataLoader(dataset, batch_size=bs, shuffle=True)
+
+        # Build base FC model and wrap it in the Order Preserving Model
+        base_model = _FCModel(
+            in_features=self.n_classes_,
+            hidden_sizes=self.hidden_sizes,
+            out_features=self.n_classes_,
+        )
+
+        self.model_ = _OrderPreservingModel(
+            base_model=base_model,
+            invariant=self.invariant,
+            residual=self.residual,
+            num_classes=self.n_classes_,
+        ).to(self.device)
+
+        # Optimizer & Loss as defined in config.json and calibrate.py
+        optimizer = optim.Adam(
+            self.model_.parameters(), lr=self.lr, weight_decay=self.weight_decay
+        )
+        criterion = nn.CrossEntropyLoss()
+
+        self.model_.train()
+        for epoch in range(self.num_epochs):
+            for batch_x, batch_y in loader:
+                batch_x, batch_y = batch_x.to(self.device), batch_y.to(self.device)
+
+                optimizer.zero_grad()
+                output_batch = self.model_(batch_x)
+                loss = criterion(output_batch, batch_y)
+                loss.backward()
+                optimizer.step()
+
+    def _predict_proba_impl(self, X: np.ndarray) -> np.ndarray:
+        """Returns calibrated probabilities."""
+        if self.model_ is None:
+            raise RuntimeError(
+                "Calibrator must be fitted before calling predict_proba."
+            )
+
+        # Convert 1D binary probabilities to 2D
+        if X.ndim == 1 or (X.ndim == 2 and X.shape[1] == 1):
+            X = np.column_stack((1.0 - X.flatten(), X.flatten()))
+
+        # Reverse-engineer logits safely from probabilities
+        logits = np.log(np.clip(X, 1e-15, 1.0))
+        X_tensor = torch.tensor(logits, dtype=torch.float32).to(self.device)
+
+        self.model_.eval()
+        with torch.no_grad():
+            calibrated_logits = self.model_(X_tensor)
+            probs = torch.softmax(calibrated_logits, dim=1).cpu().numpy()
+
+        return probs
diff --git a/probmetrics/calibrators/sklearn.py b/probmetrics/calibrators/sklearn.py
new file mode 100644
index 0000000..dad7e0a
--- /dev/null
+++ b/probmetrics/calibrators/sklearn.py
@@ -0,0 +1,115 @@
+import sklearn
+import numpy as np
+
+from .base import Calibrator
+
+from sklearn.preprocessing import LabelEncoder
+from sklearn.calibration import CalibratedClassifierCV
+from sklearn.base import BaseEstimator, ClassifierMixin
+
+
+class IdentityEstimator(ClassifierMixin, BaseEstimator):
+    def __init__(self, n_classes: int):
+        self.n_classes = n_classes
+
+    def fit(self, X, y):
+        # somehow it can fail if we don't do the np.asarray()
+        self.classes_ = np.asarray(list(range(self.n_classes)))
+        # self.classes_ = np.unique(y)  # if we do this we have a problem for missing
+        # classes
+        return self
+
+    def predict_proba(self, X):
+        return X
+
+    def predict(self, X):
+        # having this as a dummy here for the FrozenEstimator solution
+        # which hovewer doesn't work right now
+        raise NotImplementedError()
+
+
+class SklearnCalibrator(Calibrator):
+    def __init__(self, method: str, cv: str = "prefit"):
+        assert cv == "prefit", (
+            "Other methods than prefit not intended and not implemented for sklearn > 1.6"
+        )
+        super().__init__()
+        self.method = method
+        self.cv = cv
+
+    @staticmethod
+    def _normalize_rows(A: np.ndarray) -> np.ndarray:
+        """Row-normalize so rows sum to 1; safe if a row sums to 0."""
+        s = A.sum(axis=1, keepdims=True)
+        s = np.where(s == 0.0, 1.0, s)
+        return A / s
+
+    def _fit_impl(self, X: np.ndarray, y: np.ndarray) -> None:
+        if X.ndim == 1:
+            X = np.stack((1.0 - X, X), axis=1)
+
+        y = np.asarray(y)
+
+        self.n_classes_ = X.shape[-1]
+        self.classes_ = np.arange(self.n_classes_)
+
+        # Fit label encoder up-front (used for the missing-class path)
+        self.label_encoder_ = LabelEncoder()
+        y_mapped = self.label_encoder_.fit_transform(y)
+        self.present_classes_ = self.label_encoder_.classes_
+        self.missing_classes_ = np.setdiff1d(self.classes_, self.present_classes_)
+
+        # ---------- Case 1: no missing classes -> original formulation ----------
+        if self.missing_classes_.size == 0:
+            est = IdentityEstimator(self.n_classes_)
+            est.fit(X, y)
+
+            try:
+                # sklearn >= 1.6
+                from sklearn.frozen import FrozenEstimator
+
+                self.calib_ = CalibratedClassifierCV(
+                    FrozenEstimator(est), method=self.method
+                )
+            except ImportError:
+                # sklearn < 1.6
+                self.calib_ = CalibratedClassifierCV(
+                    est, method=self.method, cv=self.cv
+                )
+
+            self.calib_.fit(X, y)
+            return
+
+        # --- Case 2: missing classes -> slice + normalize X_present, use y_mapped ---
+        X_present = self._normalize_rows(X[:, self.present_classes_])
+
+        est = IdentityEstimator(len(self.present_classes_))
+        est.fit(X_present, y_mapped)
+
+        try:
+            from sklearn.frozen import FrozenEstimator
+
+            self.calib_ = CalibratedClassifierCV(
+                FrozenEstimator(est), method=self.method
+            )
+        except ImportError:
+            self.calib_ = CalibratedClassifierCV(est, method=self.method, cv=self.cv)
+
+        self.calib_.fit(X_present, y_mapped)
+
+    def _predict_proba_impl(self, X: np.ndarray) -> np.ndarray:
+        if X.ndim == 1:
+            X = np.stack((1.0 - X, X), axis=1)
+
+        # If fit saw all classes, keep original behavior
+        if self.missing_classes_.size == 0:
+            return self.calib_.predict_proba(X)
+
+        # Missing classes: normalize the same sliced X_present and zero-fill missing
+        # classes (Option A)
+        X_present = self._normalize_rows(X[:, self.present_classes_])
+        p_present = self.calib_.predict_proba(X_present)
+
+        p_full = np.zeros((X.shape[0], self.n_classes_), dtype=p_present.dtype)
+        p_full[:, self.present_classes_] = p_present
+        return p_full
diff --git a/probmetrics/calibrators/splines.py b/probmetrics/calibrators/splines.py
new file mode 100644
index 0000000..5902685
--- /dev/null
+++ b/probmetrics/calibrators/splines.py
@@ -0,0 +1,204 @@
+import numpy as np
+
+from .base import Calibrator
+from probmetrics.utils import flatten_binary_probs, expand_binary_probs
+
+
+class MLISplineCalibrator(Calibrator):
+    """
+    Uses smoothing splines to determine the calibration function.
+    From https://github.com/numeristical/splinecalib
+
+    Reference
+    ---------
+    Brian Lucena. Spline-based probability calibration. arXiv preprint arXiv:1809.07751,
+    https://arxiv.org/abs/1809.07751, 2018.
+    """
+    def __init__(self):
+        super().__init__()
+        try:
+            from splinecalib import SplineCalib
+        except ImportError as e:
+            raise ImportError("The 'splinecalib' package is required.") from e
+
+    def _fit_impl(self, X: np.ndarray, y: np.ndarray) -> None:
+        from splinecalib import SplineCalib
+
+        self.cal_ = SplineCalib()
+        if len(np.unique(X)) < 3:  # This causes a bug.
+            self.cal_ = None
+        else:
+            self.cal_.fit(X, y)
+
+    def _predict_proba_impl(self, X: np.ndarray) -> np.ndarray:
+        if self.cal_ is not None:
+            probs = self.cal_.calibrate(X)
+        else:
+            probs = X
+        probs = expand_binary_probs(probs)
+        return probs
+
+
+class _GuptaSpline:
+    """
+    Internal helper class for calculating smoothing splines.
+    From https://github.com/kartikgupta-at-anu/spline-calibration
+    """
+
+    def __init__(
+        self, x: np.ndarray, y: np.ndarray, kx: np.ndarray, runout: str = "parabolic"
+    ):
+        self.kx = kx
+        self.delta = kx[1] - kx[0]
+        self.nknots = len(kx)
+        self.runout = runout
+
+        m_from_ky = self.ky_to_M()
+        my_from_ky = np.concatenate([m_from_ky, np.eye(len(kx))], axis=0)
+        y_from_my = self.my_to_y(x)
+        y_from_ky = y_from_my @ my_from_ky
+
+        # Find the least squares solution
+        ky = np.linalg.lstsq(y_from_ky, y, rcond=-1)[0]
+
+        self.ky = ky
+        self.my = my_from_ky @ ky
+
+    def my_to_y(self, vecx: np.ndarray) -> np.ndarray:
+        ndata = len(vecx)
+        mM = np.zeros((ndata, self.nknots))
+        my = np.zeros((ndata, self.nknots))
+
+        for i, xx in enumerate(vecx):
+            j = int(np.floor((xx - self.kx[0]) / self.delta))
+            j = max(0, min(j, self.nknots - 2))
+            x = xx - j * self.delta
+
+            mM[i, j] = (
+                -(x**3) / (6.0 * self.delta) + x**2 / 2.0 - 2.0 * self.delta * x / 6.0
+            )
+            mM[i, j + 1] = x**3 / (6.0 * self.delta) - self.delta * x / 6.0
+            my[i, j] = -x / self.delta + 1.0
+            my[i, j + 1] = x / self.delta
+
+        return np.concatenate([mM, my], axis=1)
+
+    def my_to_dy(self, vecx: np.ndarray) -> np.ndarray:
+        ndata = len(vecx)
+        mM = np.zeros((ndata, self.nknots))
+        my = np.zeros((ndata, self.nknots))
+
+        for i, xx in enumerate(vecx):
+            j = int(np.floor((xx - self.kx[0]) / self.delta))
+            j = max(0, min(j, self.nknots - 2))
+            x = xx - j * self.delta
+
+            mM[i, j] = -(x**2) / (2.0 * self.delta) + x - 2.0 * self.delta / 6.0
+            mM[i, j + 1] = x**2 / (2.0 * self.delta) - self.delta / 6.0
+            my[i, j] = -1.0 / self.delta
+            my[i, j + 1] = 1.0 / self.delta
+
+        return np.concatenate([mM, my], axis=1)
+
+    def ky_to_M(self) -> np.ndarray:
+        A = 4.0 * np.eye(self.nknots - 2)
+        for i in range(1, self.nknots - 2):
+            A[i - 1, i] = 1.0
+            A[i, i - 1] = 1.0
+
+        if self.runout == "parabolic":
+            A[0, 0] = 5.0
+            A[-1, -1] = 5.0
+        elif self.runout == "cubic":
+            A[0, 0] = 6.0
+            A[0, 1] = 0.0
+            A[-1, -1] = 6.0
+            A[-1, -2] = 0.0
+
+        B = np.zeros((self.nknots - 2, self.nknots))
+        for i in range(0, self.nknots - 2):
+            B[i, i] = 1.0
+            B[i, i + 1] = -2.0
+            B[i, i + 2] = 1.0
+
+        B = B * (6 / self.delta**2)
+
+        Ainv = np.linalg.inv(A)
+        AinvB = Ainv @ B
+
+        if self.runout == "natural":
+            z0 = np.zeros((1, self.nknots))
+            z1 = np.zeros((1, self.nknots))
+        elif self.runout == "parabolic":
+            z0 = AinvB[0]
+            z1 = AinvB[-1]
+        elif self.runout == "cubic":
+            z0 = 2.0 * AinvB[0] - AinvB[1]
+            z1 = 2.0 * AinvB[-1] - AinvB[-2]
+
+        z0 = z0.reshape((1, -1))
+        z1 = z1.reshape((1, -1))
+
+        return np.concatenate([z0, AinvB, z1], axis=0)
+
+    def evaluate(self, x: np.ndarray) -> np.ndarray:
+        return self.my_to_y(x) @ self.my
+
+    def evaluate_deriv(self, x: np.ndarray) -> np.ndarray:
+        return self.my_to_dy(x) @ self.my
+
+
+class CDFSplineCalibrator(Calibrator):
+    """
+    Approximates the empirical cumulative distribution using a differentiable function
+    via splines.
+
+    Based on https://github.com/kartikgupta-at-anu/spline-calibration
+
+    Reference
+    ---------
+    Kartik Gupta, Amir Rahimi, Thalaiyasingam Ajanthan, Thomas Mensink, Cristian
+    Sminchisescu, and Richard Hartley. Calibration of neural networks using splines.
+    International Conference on Learning Representations, 2021.
+    """
+
+    def __init__(self, spline_method: str = "natural", splines: int = 6):
+        super().__init__()
+        self.spline_method = spline_method
+        self.splines = splines
+
+    def _fit_impl(self, X: np.ndarray, y: np.ndarray) -> None:
+        probs = flatten_binary_probs(X)
+
+        # Sort the data according to score
+        order = probs.argsort()
+        sorted_probs = probs[order]
+        sorted_labels = y[order]
+
+        nsamples = len(sorted_probs)
+        integrated_accuracy = np.cumsum(sorted_labels) / nsamples
+        integrated_scores = np.cumsum(sorted_probs) / nsamples
+        percentiles = np.linspace(0.0, 1.0, nsamples)
+
+        kx = np.linspace(0.0, 1.0, self.splines)
+        spline = _GuptaSpline(
+            percentiles,
+            integrated_accuracy - integrated_scores,
+            kx,
+            runout=self.spline_method,
+        )
+
+        # Evaluate the spline and store arrays for interpolation
+        acc = spline.evaluate_deriv(percentiles)
+        acc += sorted_probs
+
+        self.calib_scores_ = sorted_probs
+        self.calib_acc_ = acc
+
+    def _predict_proba_impl(self, X: np.ndarray) -> np.ndarray:
+        probs = flatten_binary_probs(X)
+        # Use numpy's optimized linear interpolation instead of a custom Python
+        # implementation
+        probs = np.interp(probs, self.calib_scores_, self.calib_acc_)
+        probs = expand_binary_probs(probs)
+        return probs
diff --git a/probmetrics/calibrators/temperature_scaling.py b/probmetrics/calibrators/temperature_scaling.py
new file mode 100644
index 0000000..157fecc
--- /dev/null
+++ b/probmetrics/calibrators/temperature_scaling.py
@@ -0,0 +1,673 @@
+import torch
+import torchmin
+import warnings
+import numpy as np
+from torch import nn, optim
+from torch.utils.data import DataLoader, TensorDataset
+
+from .base import Calibrator
+from probmetrics.utils import expand_binary_probs, multiclass_probs_to_logits
+from probmetrics.distributions import (
+    CategoricalDistribution,
+    CategoricalLogits,
+)
+
+from typing import Callable
+
+
+def bisection_search(f: Callable[[float], float], a: float, b: float, n_steps: int):
+    for _ in range(n_steps):
+        c = a + 0.5 * (b - a)
+        f_c = f(c)
+        if f_c > 0:
+            b = c
+        else:
+            a = c
+
+    return 0.5 * (a + b)
+
+
+class TemperatureScalingCalibrator(Calibrator):
+    """
+    Fast, optimized implementation. By default, uses bisection search on the gradient of
+    the cross entropy for a scaling (inverse temperature) parameter.
+
+    Reference
+    ---------
+    Chuan Guo, Geoff Pleiss, Yu Sun, and Kilian Q. Weinberger. On calibration of modern
+    neural networks. International Conference on Machine Learning, 2017.
+    """
+    def __init__(
+        self,
+        opt: str = "bisection",
+        max_bisection_steps: int = 30,
+        lr: float = 0.1,
+        max_iter: int = 200,
+        use_inv_temp: bool = True,
+        inv_temp_init: float = 1 / 1.5,
+    ):
+        super().__init__()
+        self.lr = lr
+        self.max_bisection_steps = max_bisection_steps
+        self.max_iter = max_iter
+        self.use_inv_temp = use_inv_temp
+        self.inv_temp_init = inv_temp_init
+        self.opt = opt
+
+    def _get_loss_grad(self, invtemp: float, logits: torch.Tensor, y: torch.Tensor):
+        part_1 = torch.mean(
+            torch.sum(logits * torch.softmax(invtemp * logits, dim=-1), dim=-1)
+        )
+        part_2 = torch.mean(logits[torch.arange(logits.shape[0]), y])
+        return (part_1 - part_2).item()
+
+    def _fit_torch_impl(
+        self, y_pred: CategoricalDistribution, y_true_labels: torch.Tensor
+    ):
+        logits = y_pred.get_logits()
+        labels = y_true_labels
+
+        if self.opt in ["lbfgs", "lbfgs_line_search"]:
+            self._fit_lbfgs(logits, labels)
+        elif self.opt == "bisection":
+            self._fit_bisection(logits, labels)
+        else:
+            raise ValueError(f'Unknown optimizer "{self.opt}"')
+
+    def _fit_lbfgs(self, logits: torch.Tensor, labels: torch.Tensor):
+        # following https://github.com/gpleiss/temperature_scaling/blob/master/temperature_scaling.py
+        param = nn.Parameter(
+            torch.ones(1, device=logits.device)
+            * (self.inv_temp_init if self.use_inv_temp else 1 / self.inv_temp_init)
+        )
+        criterion = nn.CrossEntropyLoss()
+        optimizer = torch.optim.LBFGS(
+            [param],
+            lr=self.lr,
+            max_iter=self.max_iter,
+            line_search_fn="strong_wolfe" if self.opt == "lbfgs_line_search" else None,
+        )
+
+        def eval():
+            optimizer.zero_grad()
+            y_pred = (
+                logits * param[:, None]
+                if self.use_inv_temp
+                else logits / param[:, None]
+            )
+            loss = criterion(y_pred, labels)
+            loss.backward()
+            return loss
+
+        optimizer.step(eval)
+
+        self.invtemp_ = param.item() if self.use_inv_temp else 1 / param.item()
+
+    def _fit_bisection(self, logits: torch.Tensor, labels: torch.Tensor):
+        objective_grad = lambda u, l=logits, tar=labels: self._get_loss_grad(
+            np.exp(u), l, tar
+        )
+
+        # should reach about float32 accuracy
+        # need log_2(32) = 5 steps to get to length 1 and then 24 more steps to get to float32 epsilon (2^{-24})
+        self.invtemp_ = np.exp(
+            bisection_search(
+                objective_grad, a=-16, b=16, n_steps=self.max_bisection_steps
+            )
+        )
+
+    def _predict_proba_torch_impl(
+        self, y_pred: CategoricalDistribution
+    ) -> CategoricalDistribution:
+        return CategoricalLogits(self.invtemp_ * y_pred.get_logits())
+
+
+### Other temperature scaling implementations ###
+
+
+class GuoTemperatureScalingCalibrator(Calibrator):
+    """
+    Adapted from https://github.com/gpleiss/temperature_scaling/blob/master/temperature_scaling.py
+
+    Reference
+    ---------
+    Chuan Guo, Geoff Pleiss, Yu Sun, and Kilian Q. Weinberger. On calibration of modern
+    neural networks. International Conference on Machine Learning, 2017.
+    """
+    def __init__(self):
+        super().__init__()
+
+    def _fit_torch_impl(
+        self, y_pred: CategoricalDistribution, y_true_labels: torch.Tensor
+    ):
+        self.temp_ = nn.Parameter(torch.ones(1) * 1.5)
+
+        logits = y_pred.get_logits()
+        labels = y_true_labels
+
+        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        nll_criterion = nn.CrossEntropyLoss().to(device)
+
+        # Next: optimize the temperature w.r.t. NLL
+        optimizer = torch.optim.LBFGS([self.temp_], lr=0.01, max_iter=50)
+
+        def eval():
+            optimizer.zero_grad()
+            loss = nll_criterion(self.temperature_scale(logits), labels)
+            loss.backward()
+            return loss
+
+        optimizer.step(eval)
+
+        return self
+
+    def _predict_proba_torch_impl(
+        self, y_pred: CategoricalDistribution
+    ) -> CategoricalDistribution:
+        with torch.no_grad():
+            return CategoricalLogits(y_pred.get_logits() / self.temp_)
+
+    def temperature_scale(self, logits):
+        """
+        Perform temperature scaling on logits
+        """
+        # Expand temperature to match the size of logits
+        temp = self.temp_.unsqueeze(1).expand(logits.size(0), logits.size(1))
+        return logits / temp
+
+
+class AutoGluonTemperatureScalingCalibrator(Calibrator):
+    """
+    Adapted from
+    https://github.com/autogluon/autogluon/blob/c1181326cf6b7e3b27a7420273f1a82808d939e2/core/src/autogluon/core/calibrate/temperature_scaling.py#L9
+    https://github.com/autogluon/autogluon/blob/28a242ebe8d55ba770c991b9db153ab4623c9abd/tabular/src/autogluon/tabular/trainer/abstract_trainer.py#L4433-L4457
+
+    Reference
+    ---------
+    Chuan Guo, Geoff Pleiss, Yu Sun, and Kilian Q. Weinberger. On calibration of modern
+    neural networks. International Conference on Machine Learning, 2017.
+    """
+    def __init__(self, init_val: float = 1, max_iter: int = 200, lr: float = 0.1):
+        super().__init__()
+        self.init_val = init_val
+        self.max_iter = max_iter
+        self.lr = lr
+
+    def _fit_torch_impl(
+        self, y_pred: CategoricalDistribution, y_true_labels: torch.Tensor
+    ):
+        y_val_tensor = y_true_labels
+        temperature_param = torch.nn.Parameter(torch.ones(1).fill_(self.init_val))
+        logits = y_pred.get_logits()
+
+        is_invalid = torch.isinf(logits).any().tolist()
+        if is_invalid:
+            return
+
+        nll_criterion = torch.nn.CrossEntropyLoss()
+        optimizer = torch.optim.LBFGS(
+            [temperature_param], lr=self.lr, max_iter=self.max_iter
+        )
+        scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.99)
+
+        optimizer_trajectory = []
+
+        if self.init_val != 1.0:
+            # need to check 1.0 as well since AutoGluon does it outside
+            optimizer_trajectory.append(
+                (nll_criterion(logits, y_val_tensor).item(), 1.0)
+            )
+
+        def temperature_scale_step():
+            optimizer.zero_grad()
+            temp = temperature_param.unsqueeze(1).expand(logits.size(0), logits.size(1))
+            new_logits = logits / temp
+            loss = nll_criterion(new_logits, y_val_tensor)
+            loss.backward()
+            scheduler.step()
+            optimizer_trajectory.append((loss.item(), temperature_param.item()))
+            return loss
+
+        optimizer.step(temperature_scale_step)
+
+        try:
+            best_loss_index = np.nanargmin(np.array(optimizer_trajectory)[:, 0])
+        except ValueError:
+            self.temperature = 1.0
+            return
+        temperature_scale = float(np.array(optimizer_trajectory)[best_loss_index, 1])
+
+        if np.isnan(temperature_scale) or temperature_scale <= 0.0:
+            self.temperature = 1.0
+            return
+
+        self.temp_ = temperature_scale
+
+    def _predict_proba_torch_impl(
+        self, y_pred: CategoricalDistribution
+    ) -> CategoricalDistribution:
+        with torch.no_grad():
+            return CategoricalLogits(y_pred.get_logits() / self.temp_)
+
+
+class AutoGluonInverseTemperatureScalingCalibrator(Calibrator):
+    # adapted from
+    # https://github.com/autogluon/autogluon/blob/c1181326cf6b7e3b27a7420273f1a82808d939e2/core/src/autogluon/core/calibrate/temperature_scaling.py#L9
+    # but optimizing the inverse temperature instead
+    def __init__(self, init_val: float = 1, max_iter: int = 200, lr: float = 0.1):
+        super().__init__()
+        self.init_val = init_val
+        self.max_iter = max_iter
+        self.lr = lr
+
+    def _fit_torch_impl(
+        self, y_pred: CategoricalDistribution, y_true_labels: torch.Tensor
+    ):
+        y_val_tensor = y_true_labels
+        inv_temperature_param = torch.nn.Parameter(torch.ones(1).fill_(self.init_val))
+        logits = y_pred.get_logits()
+
+        is_invalid = torch.isinf(logits).any().tolist()
+        if is_invalid:
+            return
+
+        nll_criterion = torch.nn.CrossEntropyLoss()
+        optimizer = torch.optim.LBFGS(
+            [inv_temperature_param], lr=self.lr, max_iter=self.max_iter
+        )
+        scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.99)
+
+        optimizer_trajectory = []
+
+        def temperature_scale_step():
+            optimizer.zero_grad()
+            inv_temp = inv_temperature_param.unsqueeze(1).expand(
+                logits.size(0), logits.size(1)
+            )
+            new_logits = logits * inv_temp
+            loss = nll_criterion(new_logits, y_val_tensor)
+            loss.backward()
+            scheduler.step()
+            optimizer_trajectory.append((loss.item(), inv_temperature_param.item()))
+            return loss
+
+        optimizer.step(temperature_scale_step)
+
+        try:
+            best_loss_index = np.nanargmin(np.array(optimizer_trajectory)[:, 0])
+        except ValueError:
+            return
+        inv_temperature_scale = float(
+            np.array(optimizer_trajectory)[best_loss_index, 1]
+        )
+
+        if np.isnan(inv_temperature_scale):
+            return
+
+        self.inv_temp_ = inv_temperature_scale
+
+    def _predict_proba_torch_impl(
+        self, y_pred: CategoricalDistribution
+    ) -> CategoricalDistribution:
+        with torch.no_grad():
+            return CategoricalLogits(y_pred.get_logits() * self.inv_temp_)
+
+
+class TorchUncertaintyTemperatureScalingCalibrator(TemperatureScalingCalibrator):
+    # adapted from
+    # https://github.com/ENSTA-U2IS-AI/torch-uncertainty/blob/3a021d2e34e183b8aad3a0345e6d750c08c72af3/torch_uncertainty/post_processing/calibration/scaler.py#L1
+    # https://github.com/ENSTA-U2IS-AI/torch-uncertainty/blob/3a021d2e34e183b8aad3a0345e6d750c08c72af3/torch_uncertainty/post_processing/calibration/temperature_scaler.py#L9
+    def __init__(self, init_val: float = 1, lr: float = 0.1, max_iter: int = 100):
+        super().__init__(
+            opt="lbfgs",
+            lr=lr,
+            max_iter=max_iter,
+            use_inv_temp=False,
+            inv_temp_init=1.0 / init_val,
+        )
+        # need to save these values here to comply with sklearn cloneability conventions
+        self.init_val = init_val
+        self.lr = lr
+        self.max_iter = max_iter
+
+
+class NetcalTemperatureScalingCalibrator(Calibrator):
+    # this one does nothing due to https://github.com/EFS-OpenSource/calibration-framework/issues/61
+    def __init__(self):
+        super().__init__()
+        try:
+            from netcal.scaling import TemperatureScaling
+        except ImportError as e:
+            raise ImportError("The 'netcal' package is required.") from e
+
+    def _fit_impl(self, X: np.ndarray, y: np.ndarray) -> None:
+        from netcal.scaling import TemperatureScaling
+
+        self.cal_ = TemperatureScaling()
+
+        if X.shape[1] == 2:
+            # binary, convert
+            X = X[:, 1]
+        self.cal_.fit(X, y, random_state=0)
+
+    def _predict_proba_impl(self, X: np.ndarray) -> np.ndarray:
+        if X.shape[1] == 2:
+            # binary, convert
+            X = X[:, 1]
+        probs = self.cal_.transform(X)
+        probs = expand_binary_probs(probs)
+        return probs
+
+
+class TorchcalTemperatureScalingCalibrator(Calibrator):
+    # adapted from
+    # https://github.com/rishabh-ranjan/torchcal/blob/3fb65f6423d33d680cd68c7f40a0259d41e8fb0b/torchcal.py#L8
+    def __init__(self):
+        super().__init__()
+
+    def _fit_torch_impl(
+        self, y_pred: CategoricalDistribution, y_true_labels: torch.Tensor
+    ):
+        y_pred_logits = y_pred.get_logits()
+
+        temp = torch.ones(1, device=y_pred_logits.device)
+
+        def loss(t):
+            return torch.nn.functional.cross_entropy(y_pred_logits / t, y_true_labels)
+
+        res = torchmin.minimize(loss, temp, method="newton-exact")
+        if not res.success:
+            warnings.warn(
+                f"{self.__class__}: {res.message} Not updating calibrator params."
+            )
+        else:
+            temp = res.x
+
+        self.temp_ = temp.item()
+
+    def _predict_proba_torch_impl(
+        self, y_pred: CategoricalDistribution
+    ) -> CategoricalDistribution:
+        return CategoricalLogits(y_pred.get_logits() / self.temp_)
+
+
+class ETSCalibrator(Calibrator):
+    """
+    From https://github.com/zhang64-llnl/Mix-n-Match-Calibration
+    Optimizes a temperature parameter and then finds the optimal ensemble weights
+    between the scaled predictions, original predictions, and a uniform distribution.
+
+    Inputs (X) must be probabilities.
+    Supports both multi-class (2D) and binary (1D or 2D) probabilities.
+
+    Reference
+    ---------
+    Jize Zhang, Bhavya Kailkhura, and T. Yong-Jin Han. Mix-n-match: Ensemble and
+    compositional methods for uncertainty calibration in deep learning. International
+    Conference on Machine Learning, 2020.
+    """
+
+    def __init__(self, loss: str = "mse"):
+        super().__init__()
+        try:
+            from scipy import optimize
+        except ImportError as e:
+            raise ImportError("The 'scipy' package is required.") from e
+        
+        if loss not in ["mse", "ce"]:
+            raise ValueError(
+                "Loss must be either 'mse' (Mean Squared Error) or 'ce' (Cross-Entropy)."
+            )
+        self.loss = loss
+
+    def _fit_impl(self, X: np.ndarray, y: np.ndarray) -> None:
+        """
+        Fits the calibrator.
+        X: array of probabilities (n_samples, n_classes) or (n_samples,) for binary.
+        y: labels (n_samples,) or one-hot encoded (n_samples, n_classes).
+        """
+        from scipy import optimize
+        # Convert 1D binary probabilities to 2D (Class 0: 1 - p, Class 1: p)
+        if X.ndim == 1 or (X.ndim == 2 and X.shape[1] == 1):
+            X = np.column_stack((1.0 - X.flatten(), X.flatten()))
+        self.n_classes_ = X.shape[1]
+        logits = multiclass_probs_to_logits(X)
+
+        # Convert labels to one-hot if they are purely 1D indices
+        if y.ndim == 1 or (y.ndim == 2 and y.shape[1] == 1):
+            y_one_hot = np.eye(self.n_classes_)[y.flatten()]
+        else:
+            y_one_hot = y
+
+        # 1. Optimize Temperature (t) using the derived logits
+        bnds_t = ((0.05, 5.0),)
+        t_target_func = self._ll_t if self.loss == "ce" else self._mse_t
+        res_t = optimize.minimize(
+            t_target_func,
+            x0=[1.0],
+            args=(logits, y_one_hot),
+            method="L-BFGS-B",
+            bounds=bnds_t,
+            tol=1e-12,
+        )
+        self.t_ = res_t.x[0]
+
+        # 2. Optimize Ensemble Weights (w0, w1, w2)
+        p1 = X  # Original probabilities
+        p0 = self._stable_softmax(logits / self.t_)  # Temperature-scaled probabilities
+        p2 = np.ones_like(p0) / self.n_classes_  # Uniform distribution
+
+        bnds_w = ((0.0, 1.0), (0.0, 1.0), (0.0, 1.0))
+        constraints = {"type": "eq", "fun": lambda w: np.sum(w) - 1.0}
+
+        w_target_func = self._ll_w if self.loss == "ce" else self._mse_w
+        res_w = optimize.minimize(
+            w_target_func,
+            x0=[1.0, 0.0, 0.0],
+            args=(p0, p1, p2, y_one_hot),
+            method="SLSQP",
+            constraints=constraints,
+            bounds=bnds_w,
+            tol=1e-12,
+        )
+        self.w_ = res_w.x
+
+    def _predict_proba_impl(self, X: np.ndarray) -> np.ndarray:
+        """
+        Predicts calibrated probabilities.
+        X: array of probabilities (n_samples, n_classes) or (n_samples,) for binary.
+        """
+        # Convert 1D binary probabilities to 2D (Class 0: 1 - p, Class 1: p)
+        if X.ndim == 1 or (X.ndim == 2 and X.shape[1] == 1):
+            X = np.column_stack((1.0 - X.flatten(), X.flatten()))
+        self.n_classes_ = X.shape[1]
+        logits = multiclass_probs_to_logits(X)
+
+        p1 = X
+        p0 = self._stable_softmax(logits / self.t_)
+        p2 = np.ones_like(p0) / self.n_classes_
+
+        p_calibrated = self.w_[0] * p0 + self.w_[1] * p1 + self.w_[2] * p2
+
+        # Ensure mathematical strictness of probabilities
+        return p_calibrated / np.sum(p_calibrated, axis=1, keepdims=True)
+
+    @staticmethod
+    def _stable_softmax(x: np.ndarray) -> np.ndarray:
+        """Helper to prevent np.exp() overflow errors."""
+        e_x = np.exp(x - np.max(x, axis=1, keepdims=True))
+        return e_x / np.sum(e_x, axis=1, keepdims=True)
+
+    @staticmethod
+    def _mse_t(t: float, logit: np.ndarray, label: np.ndarray) -> float:
+        p = ETSCalibrator._stable_softmax(logit / t)
+        return np.mean((p - label) ** 2)
+
+    @staticmethod
+    def _ll_t(t: float, logit: np.ndarray, label: np.ndarray) -> float:
+        p = ETSCalibrator._stable_softmax(logit / t)
+        p = np.clip(p, 1e-20, 1 - 1e-20)
+        return -np.sum(label * np.log(p)) / p.shape[0]
+
+    @staticmethod
+    def _mse_w(
+        w: np.ndarray, p0: np.ndarray, p1: np.ndarray, p2: np.ndarray, label: np.ndarray
+    ) -> float:
+        p = w[0] * p0 + w[1] * p1 + w[2] * p2
+        p = p / np.sum(p, axis=1, keepdims=True)
+        return np.mean((p - label) ** 2)
+
+    @staticmethod
+    def _ll_w(
+        w: np.ndarray, p0: np.ndarray, p1: np.ndarray, p2: np.ndarray, label: np.ndarray
+    ) -> float:
+        p = w[0] * p0 + w[1] * p1 + w[2] * p2
+        p = np.clip(p, 1e-20, 1 - 1e-20)
+        return -np.sum(label * np.log(p)) / p.shape[0]
+
+
+def _softplus_fn(
+    x: torch.Tensor, a: torch.Tensor, b: torch.Tensor, c: torch.Tensor, u: torch.Tensor
+) -> torch.Tensor:
+    """Core mathematical mapping for the Softplus recalibration."""
+    return -(c / a) * torch.log(u + torch.exp(-a * (x - b)))
+
+
+class _PyTorchSoftplus(nn.Module):
+    """
+    Internal PyTorch module adapting the original code for standard 2D
+    (n_samples, n_classes) input.
+    """
+
+    def __init__(
+        self, method: str, T: float = 1.0, use_shift: bool = False, n_classes: int = 2
+    ):
+        super().__init__()
+
+        # Initialize defaults
+        init_a = 0.135 if method == "SPU" else 1.0
+        init_c = T if method == "SPTS" else 1.0
+
+        self.a = nn.Parameter(torch.ones(1) * init_a)
+        self.b = nn.Parameter(torch.ones(1) * 1.0)
+        self.c = nn.Parameter(torch.ones(1) * init_c)
+        self.u = nn.Parameter(torch.ones(1) * 0.1)
+
+        # Apply specific freezing rules based on the method
+        if method in ["SP1", "SPTS"]:
+            self.c.requires_grad = False
+
+        self.use_shift = use_shift
+        if self.use_shift:
+            # Learned weight vector for cross-class shifting
+            self.w = nn.Parameter(torch.zeros(n_classes))
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        if self.use_shift:
+            # Dot product of x and w gives a shift per sample: (n_samples,)
+            # We unsqueeze to (n_samples, 1) so it broadcasts across all classes
+            shift = (x @ self.w).unsqueeze(1)
+            x = x + shift
+
+        # The original code squares a, c, and u to enforce strict positivity
+        return _softplus_fn(x, self.a**2, self.b, self.c**2, self.u**2)
+
+
+class SoftPlusCalibrator(Calibrator):
+    """
+    TODO work in progress, very slow for now.
+
+    Softplus Recalibrator using PyTorch for optimization.
+    Supports SPU, SP1, and SPTS, with or without the cross-class 'shift' mechanism.
+
+    Inputs (X) must be probabilities.
+    Supports both multi-class (2D) and binary (1D or 2D) probabilities.
+
+    Reference
+    ---------
+    Christopher Qian, Feng Liang, and Jason Adams. Extending Temperature Scaling with
+    Homogenizing Maps. Journal of Machine Learning Research 26(161):1-46, 2025
+    """
+
+    def __init__(
+        self,
+        method: str = "SPU",
+        T: float = 1.0,
+        use_shift: bool = False,
+        n_epochs: int = 100,
+        lr: float = 0.01,
+        batch_size: int = 256,
+    ):
+        super().__init__()
+        if method not in ["SPU", "SP1", "SPTS"]:
+            raise ValueError("Method must be one of 'SPU', 'SP1', or 'SPTS'.")
+
+        self.method = method
+        self.T = T
+        self.use_shift = use_shift
+        self.n_epochs = n_epochs
+        self.lr = lr
+        self.batch_size = batch_size
+        self.device = "cuda" if torch.cuda.is_available() else "cpu"
+
+    def _fit_impl(self, X: np.ndarray, y: np.ndarray) -> None:
+        """Fits the calibrator using Adam and CrossEntropyLoss."""
+        # Convert 1D binary probabilities to 2D (Class 0: 1 - p, Class 1: p)
+        if X.ndim == 1 or (X.ndim == 2 and X.shape[1] == 1):
+            X = np.column_stack((1.0 - X.flatten(), X.flatten()))
+        self.n_classes_ = X.shape[1]
+        logits = multiclass_probs_to_logits(X)
+
+        # CrossEntropyLoss expects 1D integer class indices, not one-hot
+        if y.ndim == 2 and y.shape[1] > 1:
+            y = np.argmax(y, axis=1)
+
+        X_tensor = torch.tensor(logits, dtype=torch.float32)
+        y_tensor = torch.tensor(y, dtype=torch.long)
+
+        dataset = TensorDataset(X_tensor, y_tensor)
+        # Fall back to len(X) batch size if the dataset is tiny, matching original script logic
+        bs = self.batch_size if len(X) > self.batch_size else len(X)
+        loader = DataLoader(dataset, batch_size=bs, shuffle=True)
+
+        self.model_ = _PyTorchSoftplus(
+            method=self.method,
+            T=self.T,
+            use_shift=self.use_shift,
+            n_classes=self.n_classes_,
+        ).to(self.device)
+
+        optimizer = optim.Adam(self.model_.parameters(), lr=self.lr)
+        criterion = nn.CrossEntropyLoss()
+
+        self.model_.train()
+        for _ in range(self.n_epochs):
+            for batch_x, batch_y in loader:
+                batch_x, batch_y = batch_x.to(self.device), batch_y.to(self.device)
+
+                optimizer.zero_grad()
+                logits_calibrated = self.model_(batch_x)
+                loss = criterion(logits_calibrated, batch_y)
+                loss.backward()
+                optimizer.step()
+
+    def _predict_proba_impl(self, X: np.ndarray) -> np.ndarray:
+        """Returns calibrated probabilities."""
+        if self.model_ is None:
+            raise RuntimeError(
+                "Calibrator must be fitted before calling predict_proba."
+            )
+
+        # Convert 1D binary probabilities to 2D
+        if X.ndim == 1 or (X.ndim == 2 and X.shape[1] == 1):
+            X = np.column_stack((1.0 - X.flatten(), X.flatten()))
+        logits = multiclass_probs_to_logits(X)
+        X_tensor = torch.tensor(logits, dtype=torch.float32).to(self.device)
+
+        self.model_.eval()
+        with torch.no_grad():
+            calibrated_logits = self.model_(X_tensor)
+            # Apply softmax to get final probabilities
+            probs = torch.softmax(calibrated_logits, dim=1).cpu().numpy()
+
+        return probs
diff --git a/probmetrics/calibrators/trees.py b/probmetrics/calibrators/trees.py
new file mode 100644
index 0000000..c9b6d03
--- /dev/null
+++ b/probmetrics/calibrators/trees.py
@@ -0,0 +1,377 @@
+import warnings
+import numpy as np
+from .base import Calibrator
+from probmetrics.utils import binary_probs_to_logits, multiclass_probs_to_logits
+
+
+class LightGBMCalibrator(Calibrator):
+    def __init__(self, use_cv: bool = True):
+        super().__init__()
+        try:
+            import lightgbm as lgb
+            from sklearn.model_selection import StratifiedKFold
+        except ImportError as e:
+            raise ImportError(
+                "The 'lightgbm' and 'scikit-learn' packages are required."
+            ) from e
+        self.use_cv = use_cv
+
+    def _fit_impl(self, X: np.ndarray, y: np.ndarray) -> None:
+        import lightgbm as lgb
+        from sklearn.model_selection import StratifiedKFold
+
+        self.n_classes_ = X.shape[1] if len(X.shape) > 1 and X.shape[1] > 1 else 2
+        self.cal_models_ = []
+
+        if self.n_classes_ == 2:
+            log_p, log_1_minus_p = binary_probs_to_logits(X)
+            logits = log_p - log_1_minus_p
+            objective = "binary"
+            metric = "binary_logloss"
+        else:
+            logits = multiclass_probs_to_logits(X)
+            objective = "multiclass"
+            metric = "multi_logloss"
+
+        params = {
+            "objective": objective,
+            "metric": metric,
+            "max_depth": 3,
+            "num_leaves": 8,
+            "verbose": -1,
+        }
+        if self.n_classes_ > 2:
+            params["num_class"] = self.n_classes_
+
+        if self.use_cv:
+            skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
+            for train_idx, val_idx in skf.split(logits, y):
+                train_data = lgb.Dataset(logits[train_idx], label=y[train_idx])
+                val_data = lgb.Dataset(
+                    logits[val_idx], label=y[val_idx], reference=train_data
+                )
+
+                model = lgb.train(
+                    params,
+                    train_data,
+                    valid_sets=[val_data],
+                    callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=False)],
+                )
+                self.cal_models_.append(model)
+        else:
+            train_data = lgb.Dataset(logits, label=y)
+
+            model = lgb.train(params, train_data)
+            self.cal_models_.append(model)
+
+    def _predict_proba_impl(self, X: np.ndarray) -> np.ndarray:
+        if self.n_classes_ == 2:
+            log_p, log_1_minus_p = binary_probs_to_logits(X)
+            logits = log_p - log_1_minus_p
+        else:
+            logits = multiclass_probs_to_logits(X)
+
+        all_preds = []
+        for model in self.cal_models_:
+            pred = model.predict(logits)
+            if self.n_classes_ == 2:
+                pred = np.vstack([1 - pred, pred]).T
+            all_preds.append(pred)
+
+        return np.mean(all_preds, axis=0)
+
+
+class XGBoostCalibrator(Calibrator):
+    def __init__(self, use_cv: bool = True):
+        super().__init__()
+        try:
+            import xgboost as xgb
+            from sklearn.model_selection import StratifiedKFold
+        except ImportError as e:
+            raise ImportError(
+                "The 'xgboost' and 'scikit-learn' packages are required."
+            ) from e
+        self.use_cv = use_cv
+
+    def _fit_impl(self, X: np.ndarray, y: np.ndarray) -> None:
+        import xgboost as xgb
+        from sklearn.model_selection import StratifiedKFold
+
+        self.n_classes_ = X.shape[1] if len(X.shape) > 1 and X.shape[1] > 1 else 2
+        self.n_observed_classes_ = len(np.unique(y))
+
+        if self.n_observed_classes_ != self.n_classes_:
+            warnings.warn(
+                "# classes X.shape[1] is different from # unique labels len(np.unique(y))"
+            )
+            self.observed_classes_ = np.unique(y)
+            class_to_index = {
+                cls: idx for idx, cls in enumerate(self.observed_classes_)
+            }
+            y_remapped = np.array([class_to_index[c] for c in y])
+        else:
+            y_remapped = y
+
+        self.cal_models_ = []
+
+        if self.n_classes_ == 2:
+            log_p, log_1_minus_p = binary_probs_to_logits(X)
+            logits = log_p - log_1_minus_p
+            objective = "binary:logistic"
+            eval_metric = "logloss"
+        else:
+            logits = multiclass_probs_to_logits(X)
+            objective = "multi:softprob"
+            eval_metric = "mlogloss"
+
+        params = {
+            "objective": objective,
+            "eval_metric": eval_metric,
+            "max_depth": 3,
+        }
+        if self.n_classes_ > 2:
+            params["num_class"] = self.n_observed_classes_
+
+        if self.use_cv:
+            skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
+            for train_idx, val_idx in skf.split(logits, y_remapped):
+                train_kwargs = {"data": logits[train_idx], "label": y_remapped[train_idx]}
+                val_kwargs = {"data": logits[val_idx], "label": y_remapped[val_idx]}
+
+                dtrain = xgb.DMatrix(**train_kwargs)
+                dval = xgb.DMatrix(**val_kwargs)
+
+                model = xgb.train(
+                    params,
+                    dtrain,
+                    evals=[(dval, "eval")],
+                    early_stopping_rounds=50,
+                    verbose_eval=False,
+                )
+                self.cal_models_.append(model)
+        else:
+            dtrain_kwargs = {"data": logits, "label": y_remapped}
+            dtrain = xgb.DMatrix(**dtrain_kwargs)
+            model = xgb.train(params, dtrain)
+            self.cal_models_.append(model)
+
+    def _predict_proba_impl(self, X: np.ndarray) -> np.ndarray:
+        import xgboost as xgb
+
+        if self.n_classes_ == 2:
+            log_p, log_1_minus_p = binary_probs_to_logits(X)
+            logits = log_p - log_1_minus_p
+        else:
+            logits = multiclass_probs_to_logits(X)
+
+        dtest_kwargs = {"data": logits}
+        dtest = xgb.DMatrix(**dtest_kwargs)
+
+        all_preds = []
+        for model in self.cal_models_:
+            pred = model.predict(dtest)
+            if self.n_classes_ == 2:
+                pred = np.vstack([1 - pred, pred]).T
+            all_preds.append(pred)
+        preds = np.mean(all_preds, axis=0)
+
+        if self.n_observed_classes_ != self.n_classes_:
+            full_preds = np.zeros((preds.shape[0], self.n_classes_))
+            full_preds[:, self.observed_classes_] = preds
+            preds = full_preds
+
+        return preds
+
+
+class CatBoostCalibrator(Calibrator):
+    def __init__(self, use_cv: bool = True):
+        super().__init__()
+        try:
+            from catboost import CatBoostClassifier, Pool
+            from sklearn.model_selection import StratifiedKFold
+        except ImportError as e:
+            raise ImportError(
+                "The 'catboost' and 'scikit-learn' packages are required."
+            ) from e
+        self.use_cv = use_cv
+
+    def _fit_impl(self, X: np.ndarray, y: np.ndarray) -> None:
+        from catboost import CatBoostClassifier, Pool
+        from sklearn.model_selection import StratifiedKFold
+
+        self.n_classes_ = X.shape[1] if len(X.shape) > 1 and X.shape[1] > 1 else 2
+        self.n_observed_classes_ = len(np.unique(y))
+
+        if self.n_observed_classes_ != self.n_classes_:
+            warnings.warn(
+                "# classes X.shape[1] is different from # unique labels len(np.unique(y))"
+            )
+            self.observed_classes_ = np.unique(y)
+            class_to_index = {
+                cls: idx for idx, cls in enumerate(self.observed_classes_)
+            }
+            y_remapped = np.array([class_to_index[c] for c in y])
+        else:
+            y_remapped = y
+
+        self.cal_models_ = []
+
+        if self.n_classes_ == 2:
+            log_p, log_1_minus_p = binary_probs_to_logits(X)
+            logits = log_p - log_1_minus_p
+        else:
+            logits = multiclass_probs_to_logits(X)
+
+        if self.use_cv:
+            skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
+            for train_idx, val_idx in skf.split(logits, y_remapped):
+                train_kwargs = {"data": logits[train_idx], "label": y_remapped[train_idx]}
+                val_kwargs = {"data": logits[val_idx], "label": y_remapped[val_idx]}
+
+                train_pool = Pool(**train_kwargs)
+                val_pool = Pool(**val_kwargs)
+
+                model = CatBoostClassifier(
+                    depth=3,
+                    loss_function="Logloss" if self.n_classes_ == 2 else "MultiClass",
+                    iterations=1000,
+                    early_stopping_rounds=50,
+                    verbose=False,
+                )
+                model.fit(train_pool, eval_set=val_pool, verbose=False)
+                self.cal_models_.append(model)
+        else:
+            pool_kwargs = {"data": logits, "label": y_remapped}
+            pool = Pool(**pool_kwargs)
+
+            model = CatBoostClassifier(depth=3, verbose=False)
+            model.fit(pool)
+            self.cal_models_.append(model)
+
+    def _predict_proba_impl(self, X: np.ndarray) -> np.ndarray:
+        from catboost import Pool
+
+        if self.n_classes_ == 2:
+            log_p, log_1_minus_p = binary_probs_to_logits(X)
+            logits = log_p - log_1_minus_p
+        else:
+            logits = multiclass_probs_to_logits(X)
+
+        pool_kwargs = {"data": logits}
+        pool = Pool(**pool_kwargs)
+
+        preds = [model.predict_proba(pool) for model in self.cal_models_]
+        preds = np.mean(preds, axis=0)
+    
+        if self.n_observed_classes_ != self.n_classes_:
+            full_preds = np.zeros((preds.shape[0], self.n_classes_))
+            full_preds[:, self.observed_classes_] = preds
+            preds = full_preds
+
+        return preds
+
+
+class BinaryCatBoostCalibrator(Calibrator):
+    def __init__(self, tiny=False, monotone=False):
+        super().__init__()
+        try:
+            from catboost import CatBoostClassifier, Pool
+        except ImportError as e:
+            raise ImportError("The 'catboost' packages is required.") from e
+        self.tiny = tiny
+        self.monotone = monotone
+        self.model_params = {
+            "verbose": 0,
+            "allow_writing_files": False,
+            "iterations": 100,
+        }
+
+        if self.tiny:
+            self.model_params["depth"] = 3
+
+        if self.monotone:
+            self.model_params["monotone_constraints"] = "(1)"
+
+    def _fit_impl(self, X: np.ndarray, y: np.ndarray) -> None:
+        from catboost import CatBoostClassifier, Pool
+
+        self.n_classes_ = X.shape[1] if len(X.shape) > 1 and X.shape[1] > 1 else 2
+
+        if self.n_classes_ == 2:
+            log_p, log_1_minus_p = binary_probs_to_logits(X)
+            logits = log_p - log_1_minus_p
+        else:
+            raise ValueError("This calibrator is made for binary predictions only.")
+
+        pool_kwargs = {"data": logits, "label": y}
+        pool = Pool(**pool_kwargs)
+        model = CatBoostClassifier(**self.model_params)
+        model.fit(pool)
+        self.cal_ = model
+
+    def _predict_proba_impl(self, X: np.ndarray) -> np.ndarray:
+        from catboost import Pool
+
+        if self.n_classes_ == 2:
+            log_p, log_1_minus_p = binary_probs_to_logits(X)
+            logits = log_p - log_1_minus_p
+        else:
+            raise ValueError("This calibrator is made for binary predictions only.")
+
+        pool_kwargs = {"data": logits}
+        pool = Pool(**pool_kwargs)
+
+        return self.cal_.predict_proba(pool)
+
+
+class InitLogitCatBoostCalibrator(Calibrator):
+    def __init__(self, init_logits=True):
+        super().__init__()
+        try:
+            from catboost import CatBoostClassifier, Pool
+        except ImportError as e:
+            raise ImportError("The 'catboost' package is required.") from e
+        self.init_logits = init_logits
+        self.model_params = {
+            "iterations": 50,
+            "learning_rate": 0.1,
+            "depth": 4,
+            "verbose": 0,
+            "allow_writing_files": False,
+        }
+
+    def _fit_impl(self, X: np.ndarray, y: np.ndarray) -> None:
+        from catboost import CatBoostClassifier, Pool
+
+        self.n_classes_ = X.shape[1] if len(X.shape) > 1 and X.shape[1] > 1 else 2
+
+        if self.n_classes_ == 2:
+            log_p, log_1_minus_p = binary_probs_to_logits(X)
+            logits = log_p - log_1_minus_p
+        else:
+            logits = multiclass_probs_to_logits(X)
+
+        pool_kwargs = {"data": logits, "label": y}
+        if self.init_logits:
+            pool_kwargs["baseline"] = logits
+        pool = Pool(**pool_kwargs)
+
+        self.cal_ = CatBoostClassifier(**self.model_params)
+        self.cal_.fit(pool)
+
+    def _predict_proba_impl(self, X: np.ndarray) -> np.ndarray:
+        from catboost import Pool
+
+        if self.n_classes_ == 2:
+            log_p, log_1_minus_p = binary_probs_to_logits(X)
+            logits = log_p - log_1_minus_p
+        else:
+            logits = multiclass_probs_to_logits(X)
+
+        pool_kwargs = {"data": logits}
+        if self.init_logits:
+            pool_kwargs["baseline"] = logits
+
+        pool = Pool(**pool_kwargs)
+
+        return self.cal_.predict_proba(pool)
diff --git a/probmetrics/metrics.py b/probmetrics/metrics.py
index 8856f63..a3e655e 100644
--- a/probmetrics/metrics.py
+++ b/probmetrics/metrics.py
@@ -113,7 +113,7 @@ def _get_candidate_metrics() -> List['Metrics']:
                      TopClassLoss(ProperLpLoss(p=2)),
                      TopClassLoss(ProperLpLoss(p=float('inf'))),
                      BrierLoss(binary_as_multiclass=False),
-                     SmoothCalibrationError(),
+                     SmoothCalibrationError(), KuiperCalibrationMetric(),
                      MSE(), RMSE(), NRMSE(), MAE(), NMAE()]
         cand_list.extend([MeanProbNormalizedMetric(metric) for metric in
                           [ClippedLogLoss(), LogLoss(), BrierLoss(), ClassError(), Accuracy()]])
@@ -900,6 +900,59 @@ def _compute_mean(self, y_true: CategoricalDistribution, y_pred: CategoricalDist
         return torch.as_tensor(rp.smECE(f=conf, y=acc), device=labels_torch.device, dtype=torch.float32)
 
 
+class KuiperCalibrationMetric(ClassificationMetric):
+    def __init__(self):
+        super().__init__(name='kuiper', is_lower_better=True)
+
+    def _compute_mean(self, y_true: CategoricalDistribution, y_pred: CategoricalDistribution, **kwargs) -> Optional[torch.Tensor]:
+        labels = y_true.get_modes().flatten().float()
+        probs = y_pred.get_probs()[..., 1].flatten().float()
+
+        N = labels.numel()
+        if N == 0:
+            return torch.tensor(0.0, device=labels.device)
+
+        probs_sorted, sort_idx = torch.sort(probs)
+        labels_sorted = labels[sort_idx]
+
+        cum_probs = torch.cumsum(probs_sorted, dim=0) / N
+        cum_labels = torch.cumsum(labels_sorted, dim=0) / N
+
+        diffs = cum_labels - cum_probs
+
+        max_dev = torch.max(diffs.max(), torch.tensor(0.0, device=diffs.device))
+        min_dev = torch.min(diffs.min(), torch.tensor(0.0, device=diffs.device))
+
+        kuiper_score = max_dev - min_dev
+
+        return kuiper_score
+    
+
+class KolmogorovSmirnovCalibrationMetric(ClassificationMetric):
+    def __init__(self):
+        super().__init__(name='kolmogorov-smirnov', is_lower_better=True)
+
+    def _compute_mean(self, y_true: CategoricalDistribution, y_pred: CategoricalDistribution, **kwargs) -> Optional[torch.Tensor]:
+        labels = y_true.get_modes().flatten().float()
+        probs = y_pred.get_probs()[..., 1].flatten().float()
+
+        N = labels.numel()
+        if N == 0:
+            return torch.tensor(0.0, device=labels.device)
+
+        probs_sorted, sort_idx = torch.sort(probs)
+        labels_sorted = labels[sort_idx]
+
+        cum_probs = torch.cumsum(probs_sorted, dim=0) / N
+        cum_labels = torch.cumsum(labels_sorted, dim=0) / N
+
+        diffs = cum_labels - cum_probs
+        abs_diffs = torch.abs(diffs)
+        kolmogorovsmirnov = abs_diffs.max()
+
+        return kolmogorovsmirnov
+
+
 class MetricsWithCalibration(Metrics):
     def __init__(self, metrics: Metrics, calibrator: sklearn.base.ClassifierMixin, val_splitter: Splitter,
                  cal_name: Optional[str] = None, random_state: Optional[int] = None):
diff --git a/probmetrics/utils.py b/probmetrics/utils.py
index f7fb3b6..585f73d 100644
--- a/probmetrics/utils.py
+++ b/probmetrics/utils.py
@@ -12,41 +12,48 @@ def join_dicts(*dicts, allow_overlap: bool = True):
 
 def validate_probabilities(X: np.ndarray) -> None:
     """
-    Checks if X is a valid 2D array of probability distributions.
+    Checks if X is a valid array of probabilities, either 1D (binary) or 2D (binary or multiclass).
     Raises ValueError if the validation fails.
     """
-    if X.ndim != 2:
-        raise ValueError(f"Input array must be 2D, but got {X.ndim} dimensions.")
+    if X.ndim > 2:
+        raise ValueError(f"Input array must be 1D or 2D, but got {X.ndim} dimensions.")
+    if not np.all(np.isfinite(X)):
+        raise ValueError("Input array contains NaN or infinite values.")
     if np.any(X < 0) or np.any(X > 1):
         raise ValueError("All probability values must be within the [0, 1] range.")
-    if not np.all(np.isclose(X.sum(axis=1), 1.0)):
+    if X.ndim == 2 and not np.all(np.isclose(X.sum(axis=1), 1.0, atol=1e-6)):
         raise ValueError("Each row of the input array must sum to 1 to be a valid probability distribution.")
 
-def process_binary_probs(probs: np.ndarray) -> np.ndarray:
+def flatten_binary_probs(probs: np.ndarray) -> np.ndarray:
     probs = probs.astype(np.float32)
-
-    if probs.ndim == 1: # 1D array of probabilities
-        return probs
-    elif probs.ndim == 2 and probs.shape[1] == 2: # 2D array of probability pairs [(p0, p1), ...]
-        return probs[:, 1]
+    if probs.ndim == 1:
+        p_pos = probs
+    elif probs.ndim == 2 and probs.shape[1] == 2:
+        p_pos = probs[:, 1]
     else:
         raise ValueError(f"Invalid input shape: {probs.shape}."
                          f"1D array or 2D array with shape (n, 2)")
+    return p_pos
+
+def expand_binary_probs(probs: np.ndarray) -> np.ndarray:
+    if probs.ndim == 1:
+        return np.stack([1.0 - probs, probs], axis=1)
+    elif probs.ndim == 2 and probs.shape[1] == 1:
+        return np.concatenate([1.0 - probs, probs], axis=1)
+    else:
+        return probs
 
 def binary_probs_to_logits(probs: np.ndarray) -> np.ndarray:
     """Converts binary probabilities to logits using the logit function (inverse sigmoid).
     Clips logits to the log of the tiniest normal float32 to avoid infinite logit values.
     """
-    probs = process_binary_probs(probs)
-    
+    probs = flatten_binary_probs(probs)
     with np.errstate(divide="ignore"):
         log_p = np.log(probs)
         log_1_minus_p = np.log1p(-probs)
-    
     thresh = np.log(np.finfo(np.float32).tiny)
     log_p = np.clip(log_p, a_min=thresh, a_max=-thresh).reshape(-1, 1)
     log_1_minus_p = np.clip(log_1_minus_p, a_min=thresh, a_max=-thresh).reshape(-1, 1)
-    
     return log_p, log_1_minus_p
 
 def multiclass_probs_to_logits(probs: np.ndarray) -> np.ndarray:
diff --git a/pyproject.toml b/pyproject.toml
index facf76c..d3949f8 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -5,21 +5,24 @@ build-backend = "hatchling.build"
 [project]
 name = "probmetrics"
 dynamic = ["version"]
-description = 'Probabilistic metrics and calibration methods'
+description = "Probabilistic metrics and calibration methods"
 readme = "README.md"
-requires-python = ">=3.9"
+requires-python = ">=3.10"
 license = "Apache-2.0"
-keywords = ['Metrics', 'Classification', 'Calibration', 'Refinement']
+keywords = ["Metrics", "Classification", "Calibration", "Refinement"]
 authors = [
-    { name = "David Holzmüller" }, #, email = "a@b.org" },
+    {name = "David Holzmüller", email = "a@b.org"},
+    {name = "Eugène Berta", email = "eugene.berta@gmail.com"},
+    {name = "Sacha Braun"}
 ]
 classifiers = [
     "Development Status :: 4 - Beta",
     "Programming Language :: Python",
-    "Programming Language :: Python :: 3.9",
     "Programming Language :: Python :: 3.10",
     "Programming Language :: Python :: 3.11",
     "Programming Language :: Python :: 3.12",
+    "Programming Language :: Python :: 3.13",
+    "Programming Language :: Python :: 3.14",
     "Programming Language :: Python :: Implementation :: CPython",
     "Programming Language :: Python :: Implementation :: PyPy",
     "License :: OSI Approved :: Apache Software License",
@@ -28,25 +31,28 @@ dependencies = [
     "torch>=2.0",
     "numpy>=1.25",
     "scikit-learn>=1.3",
-    # older versions of torchmetrics (<1.2.1) have a bug that makes certain metrics slow:
-    # https://github.com/Lightning-AI/torchmetrics/pull/2184
-    "torchmetrics>=1.2.1",
-    # added constraint due to some pytabkit failure case with github actions, needed for python >= 3.10 anyway
-    "numba>=0.59.0",
+    "torchmetrics>=1.2.1", # older versions of torchmetrics (<1.2.1) have a bug that
+    # makes certain metrics slow: https://github.com/Lightning-AI/torchmetrics/pull/2184
+    "pytorch-minimize",
 ]
 
 [project.optional-dependencies]
 extra = [
+    "numba>=0.59.0", # For SAGA based versions of logistic calibrators. Added constraint
+    # due to some pytabkit failure case with github actions, needed for python >= 3.10
+    # anyway
+    "netcal", # For BBQ, ENIR, Netcal TS
+    "uncertainty-calibration", # For BinaryPlattBinnerCalibrator
+    "betacal", # For BetacalCalibrator
+    "splinecalib", # For MLISplineCalibrator
     "venn-abers", # Venn-Abers calibration
     "cir-model", # centered isotonic regression
-    "netcal", # for temperature scaling comparison
-    "relplot", # for smooth ECE
-    "pytorch-minimize", # for torchcal TS implementation
-    # relplot fails for sklearn 1.7: ModuleNotFoundError: No module named 'sklearn.utils._estimator_html_repr'
-    "scikit-learn>=1.3",
-    "catboost", # For the WS_CatboostClassifier
-    "lightgbm", # For the WS_LGBMClassifier
-    "scipy",    # For the WS_LGBMClassifier to handle softmax 
+    "scipy",    # For WS_LGBMClassifier and ETSCalibrator
+    "relplot", # for smooth ECE. relplot fails for sklearn 1.7: ModuleNotFoundError: No
+    # module named 'sklearn.utils._estimator_html_repr'
+    "catboost", # For WS_CatboostClassifier and CatBoostCalibrator
+    "lightgbm", # For WS_LGBMClassifier and LightGBMCalibrator
+    "xgboost", # For XGBoostCalibrator
 ]
 dev = [
     "matplotlib>=3.0", # plotting
@@ -76,7 +82,7 @@ installer = "uv"
 features = ["extra", "dev"]
 
 [[tool.hatch.envs.test.matrix]]
-python = ["3.9", "3.10", "3.11", "3.12"]
+python = ["3.10", "3.11", "3.12", "3.13", "3.14"]
 
 [tool.hatch.envs.test.scripts]
 run-tests = "pytest {args}"
@@ -118,4 +124,4 @@ exclude_lines = [
     "no cov",
     "if __name__ == .__main__.:",
     "if TYPE_CHECKING:",
-]
\ No newline at end of file
+]
diff --git a/tests/test_calibrators.py b/tests/test_calibrators.py
index aa96f68..dad91f1 100644
--- a/tests/test_calibrators.py
+++ b/tests/test_calibrators.py
@@ -1,5 +1,9 @@
 from typing import Tuple, Optional
 
+# Without this XGBoost gets a segmentation fault
+import os
+os.environ["OMP_NUM_THREADS"] = "1"
+
 import tests  # otherwise coverage warns
 
 import pytest
@@ -9,32 +13,56 @@
 import torch.autograd.graph
 from sklearn.model_selection import train_test_split
 
-from probmetrics.calibrators import Calibrator, VennAbersCalibrator, MulticlassOneVsRestCalibrator, \
-    BinaryVennAbersCalibrator, MulticlassOneVsOneCalibrator, SklearnCalibrator, get_calibrator, \
-    TemperatureScalingCalibrator, PreprocessingCalibratorWrapper
+from probmetrics.calibrators import *
 from probmetrics.distributions import CategoricalDirac, CategoricalProbs
 from probmetrics.metrics import BrierLoss, ClassificationMetric, SmoothCalibrationError, CalibrationError
 
-binary_only_calibrators = {'linear-scaling', 'affine-scaling', 'quadratic-scaling'}
+binary_only_calibrators = {
+    "linear-scaling",
+    "affine-scaling",
+    "quadratic-scaling",
+    "Hist-uniform",
+    "Hist-quantile",
+    "Scaling-Binning",
+    "BBQ",
+    "CIR",
+    "Venn-Abers",
+    "ENIR",
+    "Beta",
+    "Spline",
+    "CDF-Spline",
+}
 
 calibrator_specs = [
-                       (name, get_calibrator(name)) for name in
-                       [
-                           'platt', 'isotonic', 'platt-logits', 'ivap-ovr', 'ivap-ovo', 'cir', 'temp-scaling',
-                           'autogluon-ts', 'torchunc-ts',
-                           'linear-scaling', 'affine-scaling', 'quadratic-scaling', 'logistic'
-                       ]
-                   ] + [
-                       ('svs', get_calibrator('svs', svs_opt='bfgs')),
-                       # test with bfgs instead of saga since it is deterministic.
-                       ('sms', get_calibrator('sms', sms_opt='bfgs')),
-                       # test with bfgs instead of saga since it is deterministic.
-                       ('temp-scaling-lbfgs', TemperatureScalingCalibrator(opt='lbfgs')),
-                       ('temp-scaling-mixture', get_calibrator('temp-scaling', calibrate_with_mixture=True)),
-                   ]
-
-
-# [!] 'ivap' not running for test_calibrator_missing_class.
+    (name, get_calibrator(name)) for name in [
+        "platt", "isotonic",
+        # "platt-logits",
+        "ivap-ovr", "ivap-ovo", "cir",
+        "temp-scaling", "autogluon-ts", "torchunc-ts", "linear-scaling",
+        "affine-scaling", "quadratic-scaling", "logistic", "svs", "sms"
+    ]] + [
+    ("temp-scaling-lbfgs", TemperatureScalingCalibrator(opt="lbfgs")),
+    ("temp-scaling-mixture", get_calibrator("temp-scaling", calibrate_with_mixture=True)),
+    ("Hist-uniform", BinaryHistogramBinningCalibrator(strategy="uniform")),
+    ("Hist-quantile", BinaryHistogramBinningCalibrator(strategy="quantile")),
+    ("Scaling-Binning", BinaryPlattBinnerCalibrator()),
+    ("BBQ", NetcalBBQCalibrator()),
+    ("CIR", CenteredIsotonicRegressionCalibrator()),
+    ("Venn-Abers", BinaryVennAbersCalibrator()),
+    ("ENIR", NetcalENIRCalibrator()),
+    ("Beta", BetacalCalibrator()),
+    ("Spline", MLISplineCalibrator()),
+    ("CDF-Spline", CDFSplineCalibrator()),
+    ("ETS", ETSCalibrator()),
+    ("VS", VectorScalingCalibrator()),
+    ("MS", MatrixScalingCalibrator()),
+    ("Kernel", KernelCalibrator()),
+    ("XGBoost", XGBoostCalibrator()),
+    ("LightGBM", LightGBMCalibrator()),
+    ("CatBoost", CatBoostCalibrator()),
+]
+
+# [!] "ivap" not running for test_calibrator_missing_class.
 
 def sample_labels(p: np.ndarray, random_state: Optional[int] = None) -> np.ndarray:
     """
@@ -49,10 +77,10 @@ def sample_labels(p: np.ndarray, random_state: Optional[int] = None) -> np.ndarr
     rng = np.random.default_rng(seed=random_state)
     cumulative_probs = np.cumsum(p, axis=-1)  # Cumulative sum along each row
     random_values = rng.random(size=(*p.shape[:-1], 1))  # Uniform random values for each sample
-    # print(f'{p.shape=}, {cumulative_probs.shape=}, {random_values.shape=}')
+    # print(f"{p.shape=}, {cumulative_probs.shape=}, {random_values.shape=}")
     samples = (random_values < cumulative_probs).argmax(
         axis=-1)  # Find the first index where cumulative probability exceeds random value
-    # print(f'{samples.shape=}')
+    # print(f"{samples.shape=}")
     return samples
 
 
@@ -74,11 +102,11 @@ def calib_dataset(request) -> Tuple[np.ndarray, np.ndarray]:
 # test that they work with multiclass?
 # test that torch and numpy interfaces are equivalent?
 
-# @pytest.mark.parametrize('calibrators', [
+# @pytest.mark.parametrize("calibrators", [
 #     (VennAbersCalibrator(use_ovo=False), MulticlassOneVsRestCalibrator(BinaryVennAbersCalibrator())),
 #     (VennAbersCalibrator(use_ovo=True), MulticlassOneVsOneCalibrator(BinaryVennAbersCalibrator())),
-#     (SklearnCalibrator(method='isotonic', cv='prefit'),
-#      MulticlassOneVsRestCalibrator(SklearnCalibrator(method='isotonic', cv='prefit')))
+#     (SklearnCalibrator(method="isotonic", cv="prefit"),
+#      MulticlassOneVsRestCalibrator(SklearnCalibrator(method="isotonic", cv="prefit")))
 # ])
 # def test_calibrators_equal(calibrators: Tuple[Calibrator, Calibrator], calib_dataset: Tuple[np.ndarray, np.ndarray]):
 #     cal1, cal2 = calibrators
@@ -94,16 +122,16 @@ def calib_dataset(request) -> Tuple[np.ndarray, np.ndarray]:
 #     # assert np.allclose(cal1.predict_proba(X_test), cal2.predict_proba(X_test))
 #     np.testing.assert_allclose(cal1.predict_proba(X_test), cal2.predict_proba(X_test), atol=1e-7)
 
-@pytest.mark.parametrize('metric', [BrierLoss(), SmoothCalibrationError(), CalibrationError()])
-@pytest.mark.parametrize(('calibrator_name', 'calibrator'),
+@pytest.mark.parametrize("metric", [BrierLoss(), SmoothCalibrationError(), CalibrationError()])
+@pytest.mark.parametrize(("calibrator_name", "calibrator"),
                          # torchunc-ts sometimes fails, so we exclude it here
-                         [(name, calib) for name, calib in calibrator_specs if name != 'torchunc-ts'])
+                         [(name, calib) for name, calib in calibrator_specs if name != "torchunc-ts"])
 def test_calibrator_performance(metric: ClassificationMetric, calibrator_name: str, calibrator: Calibrator,
                                 calib_dataset: Tuple[np.ndarray, np.ndarray]):
     X, y = calib_dataset
     n_classes = X.shape[-1]
 
-    # don't test with train/test split since it might fail due to overfitting
+    # don"t test with train/test split since it might fail due to overfitting
     # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)
     #
     # cal = sklearn.base.clone(calibrator)
@@ -127,10 +155,10 @@ def test_calibrator_performance(metric: ClassificationMetric, calibrator_name: s
     with_cal = metric.compute(y_true=y_true, y_pred=CategoricalProbs(torch.as_tensor(y_pred_probs))).item()
 
     # loss after calibration should be better than before
-    assert with_cal < without_cal
+    assert with_cal <= without_cal
 
 
-@pytest.mark.parametrize(('calibrator_name', 'calibrator'), calibrator_specs)
+@pytest.mark.parametrize(("calibrator_name", "calibrator"), calibrator_specs)
 def test_calibrator_torch_vs_numpy(calibrator_name: str, calibrator: Calibrator,
                                    calib_dataset: Tuple[np.ndarray, np.ndarray]):
     X, y = calib_dataset
@@ -160,7 +188,7 @@ def test_calibrator_torch_vs_numpy(calibrator_name: str, calibrator: Calibrator,
         np.testing.assert_allclose(preds[i], preds[0], atol=1e-7)
 
 
-@pytest.mark.parametrize(('calibrator_name', 'calibrator'), calibrator_specs)
+@pytest.mark.parametrize(("calibrator_name", "calibrator"), calibrator_specs)
 def test_calibrator_missing_class(calibrator_name: str, calibrator: Calibrator):
     n_samples = 1000
     n_classes = 4