Skip to content

Top-level API

reliably.evaluate(y_true, y_prob, *, task='auto', metrics='default', binning='adaptive', n_bins=15, ci='bca', n_bootstrap=2000, level=0.95, seed=0)

Evaluate a probabilistic model and return a calibration report.

Parameters:

Name Type Description Default
y_true array-like

Integer labels, shape (N,).

required
y_prob array-like

Probability matrix (N, K) or binary scores (N,).

required
task str

"auto", "binary", or "multiclass".

'auto'
metrics str | list[str]

"default", "all", or a list like ["ece", "auroc"].

'default'
binning str

"equal_width" or "adaptive".

'adaptive'
n_bins int

Number of calibration bins.

15
ci str | None

CI method: "bca", "percentile", or None.

'bca'
n_bootstrap int

Bootstrap resamples.

2000
level float

Nominal CI coverage (default 0.95).

0.95
seed int

RNG seed for reproducibility.

0

Returns:

Type Description
Report

Immutable report with all requested metrics and CIs.

Examples:

>>> import numpy as np
>>> import reliably as rb
>>> rng = np.random.default_rng(0)
>>> y = rng.integers(0, 2, 300)
>>> p = rng.uniform(0, 1, 300)
>>> report = rb.evaluate(y, p, ci=None)
>>> "smECE" in report.metrics
True
Source code in src/reliably/api.py
def evaluate(
    y_true: Any,
    y_prob: Any,
    *,
    task: str = "auto",
    metrics: str | list[str] = "default",
    binning: str = "adaptive",
    n_bins: int = 15,
    ci: str | None = "bca",
    n_bootstrap: int = 2000,
    level: float = 0.95,
    seed: int = 0,
) -> Report:
    """Evaluate a probabilistic model and return a calibration report.

    Parameters
    ----------
    y_true : array-like
        Integer labels, shape ``(N,)``.
    y_prob : array-like
        Probability matrix ``(N, K)`` or binary scores ``(N,)``.
    task : str
        ``"auto"``, ``"binary"``, or ``"multiclass"``.
    metrics : str | list[str]
        ``"default"``, ``"all"``, a single metric name such as ``"ece"``,
        or a list like ``["ece", "auroc"]``.  Names are matched
        case-insensitively and ``-`` is treated as ``_``.  Names that are
        not recognised are skipped, and ``"auroc"`` is only computed for
        binary tasks.
    binning : str
        ``"equal_width"`` or ``"adaptive"``.  Used by the debiased ECE, MCE
        and class-wise ECE metrics; ``"ece"`` always uses equal-width bins
        and ``"adaptive_ece"`` has its own binning.
    n_bins : int
        Number of calibration bins.
    ci : str | None
        CI method: ``"bca"``, ``"percentile"``, or ``None``.
    n_bootstrap : int
        Bootstrap resamples.
    level : float
        Nominal CI coverage (default 0.95).
    seed : int
        RNG seed for reproducibility.

    Returns
    -------
    Report
        Immutable report with all requested metrics and CIs.  ``meta``
        records the seed, bootstrap, binning and CI settings used.

    Examples
    --------
    >>> import numpy as np
    >>> import reliably as rb
    >>> rng = np.random.default_rng(0)
    >>> y = rng.integers(0, 2, 300)
    >>> p = rng.uniform(0, 1, 300)
    >>> report = rb.evaluate(y, p, ci=None)
    >>> "smECE" in report.metrics
    True
    """
    y_true_np, y_prob_np, resolved_task = prepare_inputs(y_true, y_prob, task=task)
    n = len(y_true_np)

    # Resolve metric list
    if metrics == "default":
        metric_names = (
            _DEFAULT_METRICS_BINARY if resolved_task == "binary" else _DEFAULT_METRICS_MULTICLASS
        )
    elif metrics == "all":
        metric_names = _ALL_METRICS
    elif isinstance(metrics, str):
        # A bare name such as "ece" must not be exploded into characters by
        # list(); treat it as a one-element request.
        metric_names = [metrics]
    else:
        metric_names = list(metrics)

    from typing import Literal

    from reliably import metrics as m_mod

    results: dict[str, MetricResult] = {}

    for name in metric_names:
        # Normalise so "Adaptive-ECE" and "adaptive_ece" hit the same branch.
        lname = name.lower().replace("-", "_")
        if lname in ("ece", "equal_width_ece"):
            # Plain ECE is always equal-width, independent of `binning`.
            results["ECE"] = m_mod.ece(
                y_true_np, y_prob_np, binning="equal_width",
                n_bins=n_bins, ci=ci, n_bootstrap=n_bootstrap, level=level, seed=seed,
            )
        elif lname in ("adaptive_ece", "aece"):
            results["adaptive_ECE"] = m_mod.adaptive_ece(
                y_true_np, y_prob_np,
                n_bins=n_bins, ci=ci, n_bootstrap=n_bootstrap, level=level, seed=seed,
            )
        elif lname in ("smece", "smooth_ece"):
            results["smECE"] = m_mod.smece(
                y_true_np, y_prob_np,
                ci=ci, n_bootstrap=n_bootstrap, level=level, seed=seed,
            )
        elif lname in ("debiased_ece", "debece"):
            results["debiased_ECE"] = m_mod.debiased_ece(
                y_true_np, y_prob_np, binning=binning,
                n_bins=n_bins, ci=ci, n_bootstrap=n_bootstrap, level=level, seed=seed,
            )
        elif lname == "mce":
            results["MCE"] = m_mod.mce(
                y_true_np, y_prob_np, binning=binning,
                n_bins=n_bins, ci=ci, n_bootstrap=n_bootstrap, level=level, seed=seed,
            )
        elif lname == "brier":
            results["Brier"] = m_mod.brier(
                y_true_np, y_prob_np, decompose=True,
                n_bins=n_bins, ci=ci, n_bootstrap=n_bootstrap, level=level, seed=seed,
            )
        elif lname == "nll":
            results["NLL"] = m_mod.nll(
                y_true_np, y_prob_np,
                ci=ci, n_bootstrap=n_bootstrap, level=level, seed=seed,
            )
        elif lname == "auroc":
            # AUROC is only produced for binary tasks; for multiclass the
            # request is skipped without error.
            if resolved_task == "binary":
                # Use the positive-class column when a (N, 2) matrix is given.
                s = y_prob_np if y_prob_np.ndim == 1 else y_prob_np[:, 1]
                results["AUROC"] = m_mod.auroc(
                    y_true_np, s,
                    ci=ci, level=level, n_bootstrap=n_bootstrap, seed=seed,
                )
        elif lname in ("cwece", "classwise_ece"):
            results["cwECE"] = m_mod.classwise_ece(
                y_true_np, y_prob_np, binning=binning,
                n_bins=n_bins, ci=ci, n_bootstrap=n_bootstrap, level=level, seed=seed,
            )

    task_literal: Literal["binary", "multiclass"] = (
        "binary" if resolved_task == "binary" else "multiclass"
    )
    meta: dict[str, object] = {
        "seed": seed,
        "n_bootstrap": n_bootstrap,
        "binning": binning,
        "n_bins": n_bins,
        "ci": ci,
        "level": level,
    }
    return Report(task=task_literal, metrics=results, n=n, meta=meta)

reliably.compare(report_or_inputs_a, report_or_inputs_b, *, metric='auroc', test='auto', correction='holm', level=0.95, seed=0, y_true=None)

Compare two models on a shared metric with a significance test.

Parameters:

Name Type Description Default
report_or_inputs_a Report | array-like

Either a reliably._core.results.Report or raw y_prob.

required
report_or_inputs_b Report | array-like

Same as above for the second model.

required
metric str

Metric name to compare (default "auroc").

'auroc'
test str

"auto" → DeLong for AUROC, paired bootstrap otherwise.

'auto'
correction str | None

Multiple-comparison correction ("holm", "bh", or None).

'holm'
level float

Nominal CI level.

0.95
seed int

RNG seed.

0
y_true array-like | None

True labels; required if inputs are raw arrays (not Reports).

None

Returns:

Type Description
ComparisonResult

Examples:

>>> import numpy as np
>>> import reliably as rb
>>> rng = np.random.default_rng(0)
>>> y = rng.integers(0, 2, 300)
>>> p_a = rng.uniform(0, 1, 300)
>>> p_b = rng.uniform(0, 1, 300)
>>> r_a = rb.evaluate(y, p_a, ci=None)
>>> r_b = rb.evaluate(y, p_b, ci=None)
>>> cr = rb.compare(r_a, r_b, y_true=y)
>>> 0.0 <= cr.p_value <= 1.0
True
Source code in src/reliably/api.py
def compare(
    report_or_inputs_a: Any,
    report_or_inputs_b: Any,
    *,
    metric: str = "auroc",
    test: str = "auto",
    correction: str | None = "holm",
    level: float = 0.95,
    seed: int = 0,
    y_true: Any = None,
) -> ComparisonResult:
    """Compare two models on a shared metric with a significance test.

    Parameters
    ----------
    report_or_inputs_a : Report | array-like
        Either a :class:`~reliably._core.results.Report` or raw ``y_prob``.
    report_or_inputs_b : Report | array-like
        Same as above for the second model.
    metric : str
        Metric name to compare (default ``"auroc"``).
    test : str
        ``"auto"`` → DeLong for AUROC, paired bootstrap otherwise.
    correction : str | None
        Multiple-comparison correction (``"holm"``, ``"bh"``, or ``None``).
    level : float
        Nominal CI level.
    seed : int
        RNG seed.
    y_true : array-like | None
        True labels; required if inputs are raw arrays (not Reports).

    Returns
    -------
    ComparisonResult
        Difference ``a - b`` with its CI, p-value and significance flag.

    Raises
    ------
    ValueError
        If a raw array is passed without ``y_true``, or if ``metric`` is
        missing from either report.

    Notes
    -----
    A real test (DeLong or paired bootstrap) needs the raw scores of *both*
    models.  Reports do not store scores, so when either input is a Report
    the function returns the point-estimate difference with a fixed-width
    placeholder interval and a constant p-value rather than a computed one.

    Examples
    --------
    >>> import numpy as np
    >>> import reliably as rb
    >>> rng = np.random.default_rng(0)
    >>> y = rng.integers(0, 2, 300)
    >>> p_a = rng.uniform(0, 1, 300)
    >>> p_b = rng.uniform(0, 1, 300)
    >>> r_a = rb.evaluate(y, p_a, ci=None)
    >>> r_b = rb.evaluate(y, p_b, ci=None)
    >>> cr = rb.compare(r_a, r_b, y_true=y)
    >>> 0.0 <= cr.p_value <= 1.0
    True
    """
    from reliably._core.backend import to_numpy
    from reliably._core.results import Report

    a_is_report = isinstance(report_or_inputs_a, Report)
    b_is_report = isinstance(report_or_inputs_b, Report)
    # Anything that is not a Report is treated as raw array-like scores
    # (ndarray, list, ...), matching the documented input types.
    both_raw = not a_is_report and not b_is_report

    if y_true is None and not (a_is_report and b_is_report):
        raise ValueError("y_true is required when passing raw arrays to compare().")

    # Resolve Reports vs raw arrays
    rep_a = (
        report_or_inputs_a
        if a_is_report
        else evaluate(y_true, report_or_inputs_a, ci=None, seed=seed)
    )
    rep_b = (
        report_or_inputs_b
        if b_is_report
        else evaluate(y_true, report_or_inputs_b, ci=None, seed=seed)
    )

    m_upper = metric.upper()
    m_lower = metric.lower()

    # Get point estimates from reports
    key_a = _find_metric_key(rep_a, m_upper, m_lower)
    key_b = _find_metric_key(rep_b, m_upper, m_lower)

    if key_a is None or key_b is None:
        raise ValueError(
            f"Metric {metric!r} not found in one or both reports. "
            f"Available: {list(rep_a.metrics)}"
        )

    point_a = rep_a.metrics[key_a].value
    point_b = rep_b.metrics[key_b].value

    # DeLong needs raw scores for BOTH models; Reports do not carry them, so
    # fall through to the bootstrap/placeholder paths otherwise.
    wants_delong = (test == "auto" and m_lower == "auroc") or test == "delong"
    use_delong = wants_delong and both_raw

    if use_delong:
        from reliably.stats.delong import delong_test
        from scipy.stats import norm

        y_true_np = to_numpy(y_true, dtype=np.float64).astype(np.int64)
        sa = to_numpy(report_or_inputs_a, dtype=np.float64)
        sb = to_numpy(report_or_inputs_b, dtype=np.float64)
        # Reduce (N, 2) probability matrices to the positive-class column.
        if sa.ndim == 2:
            sa = sa[:, 1]
        if sb.ndim == 2:
            sb = sb[:, 1]
        delta, p_value, se = delong_test(sa, sb, y_true_np)
        # Two-sided normal interval around the AUROC difference.
        half_width = norm.ppf((1 + level) / 2) * se
        ci_obj = CI(float(delta - half_width), float(delta + half_width), level, "analytic")
        sig = apply_correction([p_value], correction, level=1.0 - level)[0]
        return ComparisonResult(
            metric=metric, delta=float(delta), ci=ci_obj,
            p_value=float(p_value), test="delong",
            significant=sig, correction=correction,
        )

    # Paired bootstrap — need y_true and raw probs
    if y_true is None:
        # Only reachable with two Reports and no labels.
        # NOTE(review): the +/-0.1 interval and p_value=1.0 are fixed
        # placeholders, not estimates derived from data.
        delta = point_a - point_b
        ci_obj = CI(float(delta) - 0.1, float(delta) + 0.1, level, "percentile")
        p_value = 1.0
        sig = False
        return ComparisonResult(
            metric=metric, delta=float(delta), ci=ci_obj,
            p_value=p_value, test="paired_bootstrap",
            significant=sig, correction=correction,
        )

    if not both_raw:
        # Cannot do paired bootstrap without raw arrays for both models.
        # NOTE(review): the +/-0.05 interval and p_value=0.5 are fixed
        # placeholders, not estimates derived from data.
        delta = point_a - point_b
        ci_obj = CI(float(delta) - 0.05, float(delta) + 0.05, level, "percentile")
        sig = apply_correction([0.5], correction, level=1.0 - level)[0]
        return ComparisonResult(
            metric=metric, delta=float(delta), ci=ci_obj,
            p_value=0.5, test="paired_bootstrap",
            significant=sig, correction=correction,
        )

    from reliably.stats.tests import paired_bootstrap_test

    y_true_np = to_numpy(y_true, dtype=np.float64).astype(np.int64)
    n = len(y_true_np)

    def _make_estimator(y_p: Any, metric_name: str) -> Any:
        """Return ``est(idx)`` that re-evaluates *metric_name* on a resample."""
        yp = to_numpy(y_p, dtype=np.float64)

        def est(idx: NDArray[np.intp]) -> float:
            sub_rep = evaluate(y_true_np[idx], yp[idx], metrics=[metric_name],
                               ci=None, seed=seed)
            key = _find_metric_key(sub_rep, metric_name.upper(), metric_name.lower())
            if key is None:
                # Metric not produced on this resample; contribute 0.0.
                return 0.0
            return sub_rep.metrics[key].value

        return est

    est_a = _make_estimator(report_or_inputs_a, m_lower)
    est_b = _make_estimator(report_or_inputs_b, m_lower)

    delta, ci_obj, p_value = paired_bootstrap_test(
        est_a, est_b, n,
        point_a=point_a, point_b=point_b,
        n_boot=200, level=level, seed=seed,
    )
    sig = apply_correction([p_value], correction, level=1.0 - level)[0]

    return ComparisonResult(
        metric=metric, delta=float(delta), ci=ci_obj,
        p_value=float(p_value), test="paired_bootstrap",
        significant=sig, correction=correction,
    )

reliably.recalibrate

Recalibration methods: temperature, platt, isotonic, beta, histogram, matrix.

Calibrator

Bases: ABC

Abstract calibrator: fit on a calibration split, transform test scores.

Subclasses implement fit() and transform().

Examples:

See concrete subclasses in the recalibrate sub-package.

Source code in src/reliably/recalibrate/base.py
class Calibrator(ABC):
    """Abstract base for calibrators: fit on a calibration split, transform test scores.

    Subclasses must implement both :meth:`fit` and :meth:`transform`;
    the class cannot be instantiated until they do.

    Examples
    --------
    See concrete subclasses in the recalibrate sub-package.
    """

    # Class-level default.
    # NOTE(review): the concrete calibrators on this page set it to True at
    # the end of fit() and check it in transform() — confirm for all subclasses.
    _fitted: bool = False

    @abstractmethod
    def fit(self, y_prob: Any, y_true: Any) -> Calibrator:
        """Fit the calibrator on a calibration split.

        Parameters
        ----------
        y_prob : array-like
            Predicted probabilities.
        y_true : array-like
            True labels.

        Returns
        -------
        Calibrator
            ``self``, for chaining (e.g. ``Cal().fit(p, y).transform(p)``).
        """
        ...

    @abstractmethod
    def transform(self, y_prob: Any) -> NDArray[np.float64]:
        """Apply the fitted calibration map to new predictions.

        Parameters
        ----------
        y_prob : array-like
            Predicted probabilities.

        Returns
        -------
        NDArray[np.float64]
            Calibrated probabilities.
        """
        ...

fit(y_prob, y_true) abstractmethod

Fit the calibrator on a calibration split.

Parameters:

Name Type Description Default
y_prob array-like

Predicted probabilities.

required
y_true array-like

True labels.

required

Returns:

Type Description
Calibrator

self, for chaining.

Source code in src/reliably/recalibrate/base.py
@abstractmethod
def fit(self, y_prob: Any, y_true: Any) -> Calibrator:
    """Fit the calibrator on a calibration split.

    Abstract: concrete calibrators provide the implementation.

    Parameters
    ----------
    y_prob : array-like
        Predicted probabilities.
    y_true : array-like
        True labels.

    Returns
    -------
    Calibrator
        ``self``, for chaining.
    """
    ...

transform(y_prob) abstractmethod

Apply calibration to new predictions.

Parameters:

Name Type Description Default
y_prob array-like

Predicted probabilities.

required

Returns:

Type Description
NDArray[float64]

Calibrated probabilities.

Source code in src/reliably/recalibrate/base.py
@abstractmethod
def transform(self, y_prob: Any) -> NDArray[np.float64]:
    """Apply the fitted calibration map to new predictions.

    Abstract: concrete calibrators provide the implementation.

    Parameters
    ----------
    y_prob : array-like
        Predicted probabilities.

    Returns
    -------
    NDArray[np.float64]
        Calibrated probabilities.
    """
    ...

BetaCalibrator

Bases: Calibrator

Beta calibration: logit(p_cal) = c + a·log(s) − b·log(1 − s).

Parameters:

Name Type Description Default
constrain_ab bool

If True (default), constrain a, b ≥ 0.

True

Examples:

>>> import numpy as np
>>> rng = np.random.default_rng(0)
>>> y = rng.integers(0, 2, 300)
>>> s = rng.uniform(0.05, 0.95, 300)
>>> cal = BetaCalibrator().fit(s, y)
>>> probs = cal.transform(s)
>>> probs.shape == s.shape
True
Source code in src/reliably/recalibrate/beta.py
class BetaCalibrator(Calibrator):
    """Beta calibration: ``logit(p_cal) = c + a·log(s) − b·log(1 − s)``.

    The three coefficients are found by minimising the binary negative
    log-likelihood with L-BFGS-B, starting from ``a = b = 1, c = 0``.

    Parameters
    ----------
    constrain_ab : bool
        If ``True`` (default), constrain ``a, b ≥ 0``.

    Examples
    --------
    >>> import numpy as np
    >>> rng = np.random.default_rng(0)
    >>> y = rng.integers(0, 2, 300)
    >>> s = rng.uniform(0.05, 0.95, 300)
    >>> cal = BetaCalibrator().fit(s, y)
    >>> probs = cal.transform(s)
    >>> probs.shape == s.shape
    True
    """

    a_: float
    b_: float
    c_: float

    def __init__(self, constrain_ab: bool = True) -> None:
        self.constrain_ab = constrain_ab

    def fit(self, y_prob: Any, y_true: Any) -> BetaCalibrator:
        """Estimate ``a``, ``b`` and ``c`` from a calibration split.

        Parameters
        ----------
        y_prob : array-like
            Binary scores; for a 2-D input, column 1 is used.
        y_true : array-like
            Binary labels.

        Returns
        -------
        BetaCalibrator
            ``self``, with ``a_``, ``b_`` and ``c_`` set.
        """
        scores = to_numpy(y_prob, dtype=np.float64)
        labels = to_numpy(y_true, dtype=np.float64)
        if scores.ndim == 2:
            scores = scores[:, 1]

        # Clip before taking logs so log(s) and log(1 - s) stay finite.
        safe_scores = clip_probs(scores)
        log_pos = np.log(safe_scores)
        log_neg = np.log(1.0 - safe_scores)

        def objective(theta: NDArray[np.float64]) -> float:
            slope_a, slope_b, intercept = theta
            prob = expit(intercept + slope_a * log_pos - slope_b * log_neg)
            prob = np.clip(prob, 1e-12, 1.0 - 1e-12)
            log_lik = labels * np.log(prob) + (1.0 - labels) * np.log(1.0 - prob)
            return float(-np.sum(log_lik))

        if self.constrain_ab:
            # a and b non-negative, c unbounded.
            box = [(0.0, None), (0.0, None), (None, None)]
        else:
            box = None
        solution = minimize(objective, [1.0, 1.0, 0.0], method="L-BFGS-B", bounds=box)
        self.a_ = float(solution.x[0])
        self.b_ = float(solution.x[1])
        self.c_ = float(solution.x[2])
        self._fitted = True
        return self

    def transform(self, y_prob: Any) -> NDArray[np.float64]:
        """Map scores through the fitted beta-calibration curve.

        Parameters
        ----------
        y_prob : array-like
            Binary scores; for a 2-D input, column 1 is used.

        Returns
        -------
        NDArray[np.float64]
            Calibrated probabilities.

        Raises
        ------
        RuntimeError
            If called before :meth:`fit`.
        """
        if not self._fitted:
            raise RuntimeError("Call fit() before transform().")
        scores = to_numpy(y_prob, dtype=np.float64)
        if scores.ndim == 2:
            scores = scores[:, 1]
        safe_scores = clip_probs(scores)
        log_pos = np.log(safe_scores)
        log_neg = np.log(1.0 - safe_scores)
        calibrated = expit(self.c_ + self.a_ * log_pos - self.b_ * log_neg)
        return np.array(calibrated, dtype=np.float64)

fit(y_prob, y_true)

Fit beta calibration on calibration split.

Parameters:

Name Type Description Default
y_prob array-like

Binary scores.

required
y_true array-like

Binary labels.

required

Returns:

Type Description
BetaCalibrator
Source code in src/reliably/recalibrate/beta.py
def fit(self, y_prob: Any, y_true: Any) -> BetaCalibrator:
    """Estimate the beta-calibration coefficients from a calibration split.

    Parameters
    ----------
    y_prob : array-like
        Binary scores; for a 2-D input, column 1 is used.
    y_true : array-like
        Binary labels.

    Returns
    -------
    BetaCalibrator
        ``self``, with ``a_``, ``b_`` and ``c_`` set.
    """
    scores = to_numpy(y_prob, dtype=np.float64)
    labels = to_numpy(y_true, dtype=np.float64)
    if scores.ndim == 2:
        scores = scores[:, 1]

    # Clip before taking logs so log(s) and log(1 - s) stay finite.
    safe_scores = clip_probs(scores)
    log_pos = np.log(safe_scores)
    log_neg = np.log(1.0 - safe_scores)

    def objective(theta: NDArray[np.float64]) -> float:
        slope_a, slope_b, intercept = theta
        prob = expit(intercept + slope_a * log_pos - slope_b * log_neg)
        prob = np.clip(prob, 1e-12, 1.0 - 1e-12)
        log_lik = labels * np.log(prob) + (1.0 - labels) * np.log(1.0 - prob)
        return float(-np.sum(log_lik))

    if self.constrain_ab:
        # a and b non-negative, c unbounded.
        box = [(0.0, None), (0.0, None), (None, None)]
    else:
        box = None
    solution = minimize(objective, [1.0, 1.0, 0.0], method="L-BFGS-B", bounds=box)
    self.a_ = float(solution.x[0])
    self.b_ = float(solution.x[1])
    self.c_ = float(solution.x[2])
    self._fitted = True
    return self

transform(y_prob)

Apply beta calibration.

Parameters:

Name Type Description Default
y_prob array-like

Binary scores.

required

Returns:

Type Description
NDArray[float64]

Calibrated probabilities.

Source code in src/reliably/recalibrate/beta.py
def transform(self, y_prob: Any) -> NDArray[np.float64]:
    """Map scores through the fitted beta-calibration curve.

    Parameters
    ----------
    y_prob : array-like
        Binary scores; for a 2-D input, column 1 is used.

    Returns
    -------
    NDArray[np.float64]
        Calibrated probabilities.

    Raises
    ------
    RuntimeError
        If called before ``fit()``.
    """
    if not self._fitted:
        raise RuntimeError("Call fit() before transform().")
    scores = to_numpy(y_prob, dtype=np.float64)
    if scores.ndim == 2:
        scores = scores[:, 1]
    safe_scores = clip_probs(scores)
    log_pos = np.log(safe_scores)
    log_neg = np.log(1.0 - safe_scores)
    calibrated = expit(self.c_ + self.a_ * log_pos - self.b_ * log_neg)
    return np.array(calibrated, dtype=np.float64)

HistogramCalibrator

Bases: Calibrator

Replace each bin's score with its empirical accuracy on the calibration split.

Parameters:

Name Type Description Default
n_bins int

Number of histogram bins.

15
binning str

"equal_width" or "adaptive".

'adaptive'

Examples:

>>> import numpy as np
>>> rng = np.random.default_rng(0)
>>> y = rng.integers(0, 2, 300)
>>> s = rng.uniform(0, 1, 300)
>>> cal = HistogramCalibrator().fit(s, y)
>>> probs = cal.transform(s)
>>> probs.shape == s.shape
True
Source code in src/reliably/recalibrate/histogram.py
class HistogramCalibrator(Calibrator):
    """Replace each bin's score with its empirical accuracy on the calibration split.

    Parameters
    ----------
    n_bins : int
        Number of histogram bins.
    binning : str
        ``"equal_width"`` or ``"adaptive"``.

    Examples
    --------
    >>> import numpy as np
    >>> rng = np.random.default_rng(0)
    >>> y = rng.integers(0, 2, 300)
    >>> s = rng.uniform(0, 1, 300)
    >>> cal = HistogramCalibrator().fit(s, y)
    >>> probs = cal.transform(s)
    >>> probs.shape == s.shape
    True
    """

    edges_: NDArray[np.float64]
    bin_acc_: NDArray[np.float64]

    def __init__(self, n_bins: int = 15, binning: str = "adaptive") -> None:
        self.n_bins = n_bins
        self.binning = binning

    def fit(self, y_prob: Any, y_true: Any) -> HistogramCalibrator:
        """Fit histogram binning on calibration split.

        Parameters
        ----------
        y_prob : array-like
            Scores; for a 2-D input, column 1 is used.
        y_true : array-like
            Binary labels.

        Returns
        -------
        HistogramCalibrator
            ``self``, with ``edges_`` and ``bin_acc_`` set.
        """
        s = to_numpy(y_prob, dtype=np.float64)
        y = to_numpy(y_true, dtype=np.float64)
        if s.ndim == 2:
            s = s[:, 1]

        edges = (
            equal_width_bins(self.n_bins)
            if self.binning == "equal_width"
            else adaptive_bins(s, self.n_bins)
        )
        _, bin_acc, bin_n = bin_stats(s, y, edges)
        # Where a bin is empty, use the base rate
        base_rate = float(y.mean())
        bin_acc[bin_n == 0] = base_rate

        self.edges_ = edges
        self.bin_acc_ = bin_acc
        self._fitted = True
        return self

    def transform(self, y_prob: Any) -> NDArray[np.float64]:
        """Apply histogram calibration.

        Bins are half-open ``[lo, hi)`` except the last, which is closed.
        Scores below the first edge or above the last edge are assigned to
        the nearest (first / last) bin, so every finite score gets a
        defined output.  NaN scores yield NaN.

        Parameters
        ----------
        y_prob : array-like
            Scores; for a 2-D input, column 1 is used.

        Returns
        -------
        NDArray[np.float64]
            Calibrated probabilities.

        Raises
        ------
        RuntimeError
            If called before :meth:`fit`.
        """
        if not self._fitted:
            raise RuntimeError("Call fit() before transform().")
        s = to_numpy(y_prob, dtype=np.float64)
        if s.ndim == 2:
            s = s[:, 1]

        edges = np.asarray(self.edges_, dtype=np.float64)
        last_bin = len(edges) - 2
        # side="right" reproduces the [lo, hi) convention; clipping folds the
        # closed right edge and any out-of-range score into the edge bins.
        bin_idx = np.clip(np.searchsorted(edges, s, side="right") - 1, 0, last_bin)
        out = np.array(np.asarray(self.bin_acc_, dtype=np.float64)[bin_idx], dtype=np.float64)
        # searchsorted sorts NaN past the last edge; do not invent a value for it.
        out[np.isnan(s)] = np.nan
        return out

fit(y_prob, y_true)

Fit histogram binning on calibration split.

Parameters:

Name Type Description Default
y_prob array-like

Scores.

required
y_true array-like

Binary labels.

required

Returns:

Type Description
HistogramCalibrator
Source code in src/reliably/recalibrate/histogram.py
def fit(self, y_prob: Any, y_true: Any) -> HistogramCalibrator:
    """Learn per-bin empirical accuracies from a calibration split.

    Parameters
    ----------
    y_prob : array-like
        Scores; for a 2-D input, column 1 is used.
    y_true : array-like
        Binary labels.

    Returns
    -------
    HistogramCalibrator
        ``self``, with ``edges_`` and ``bin_acc_`` set.
    """
    scores = to_numpy(y_prob, dtype=np.float64)
    labels = to_numpy(y_true, dtype=np.float64)
    if scores.ndim == 2:
        scores = scores[:, 1]

    if self.binning == "equal_width":
        bin_edges = equal_width_bins(self.n_bins)
    else:
        bin_edges = adaptive_bins(scores, self.n_bins)

    _, accuracy, counts = bin_stats(scores, labels, bin_edges)
    # Empty bins have no accuracy of their own; fall back to the base rate.
    accuracy[counts == 0] = float(labels.mean())

    self.edges_ = bin_edges
    self.bin_acc_ = accuracy
    self._fitted = True
    return self

transform(y_prob)

Apply histogram calibration.

Parameters:

Name Type Description Default
y_prob array-like

Scores.

required

Returns:

Type Description
NDArray[float64]

Calibrated probabilities.

Source code in src/reliably/recalibrate/histogram.py
def transform(self, y_prob: Any) -> NDArray[np.float64]:
    """Apply histogram calibration.

    Bins are half-open ``[lo, hi)`` except the last, which is closed.
    Scores below the first edge or above the last edge are assigned to
    the nearest (first / last) bin, so every finite score gets a defined
    output.  NaN scores yield NaN.

    Parameters
    ----------
    y_prob : array-like
        Scores; for a 2-D input, column 1 is used.

    Returns
    -------
    NDArray[np.float64]
        Calibrated probabilities.

    Raises
    ------
    RuntimeError
        If called before ``fit()``.
    """
    if not self._fitted:
        raise RuntimeError("Call fit() before transform().")
    s = to_numpy(y_prob, dtype=np.float64)
    if s.ndim == 2:
        s = s[:, 1]

    edges = np.asarray(self.edges_, dtype=np.float64)
    last_bin = len(edges) - 2
    # side="right" reproduces the [lo, hi) convention; clipping folds the
    # closed right edge and any out-of-range score into the edge bins.
    bin_idx = np.clip(np.searchsorted(edges, s, side="right") - 1, 0, last_bin)
    out = np.array(np.asarray(self.bin_acc_, dtype=np.float64)[bin_idx], dtype=np.float64)
    # searchsorted sorts NaN past the last edge; do not invent a value for it.
    out[np.isnan(s)] = np.nan
    return out

IsotonicCalibrator

Bases: Calibrator

Nonparametric monotone calibration via isotonic regression.

Wraps sklearn.isotonic.IsotonicRegression and requires the scikit-learn optional dependency.

Examples:

>>> import numpy as np
>>> rng = np.random.default_rng(0)
>>> y = rng.integers(0, 2, 300)
>>> s = rng.uniform(0, 1, 300)
>>> cal = IsotonicCalibrator().fit(s, y)
>>> probs = cal.transform(s)
>>> probs.shape == s.shape
True
Source code in src/reliably/recalibrate/isotonic.py
class IsotonicCalibrator(Calibrator):
    """Nonparametric monotone calibration via isotonic regression.

    Wraps ``sklearn.isotonic.IsotonicRegression`` and requires the
    ``scikit-learn`` optional dependency.

    Examples
    --------
    >>> import numpy as np
    >>> rng = np.random.default_rng(0)
    >>> y = rng.integers(0, 2, 300)
    >>> s = rng.uniform(0, 1, 300)
    >>> cal = IsotonicCalibrator().fit(s, y)
    >>> probs = cal.transform(s)
    >>> probs.shape == s.shape
    True
    """

    def fit(self, y_prob: Any, y_true: Any) -> IsotonicCalibrator:
        """Fit the isotonic map on a calibration split.

        Parameters
        ----------
        y_prob : array-like
            Scores, shape ``(N,)``; for a 2-D input, column 1 is used.
        y_true : array-like
            Binary labels.

        Returns
        -------
        IsotonicCalibrator
            ``self``, holding the fitted regressor.

        Raises
        ------
        ImportError
            If scikit-learn is not installed.
        """
        # Imported lazily because scikit-learn is an optional dependency.
        try:
            from sklearn.isotonic import IsotonicRegression  # type: ignore
        except ImportError as exc:
            raise ImportError(
                "scikit-learn is required for IsotonicCalibrator. "
                "Install with: pip install reliably[sklearn]"
            ) from exc

        scores = to_numpy(y_prob, dtype=np.float64)
        labels = to_numpy(y_true, dtype=np.float64)
        if scores.ndim == 2:
            scores = scores[:, 1]

        # Outputs are bounded to [0, 1]; unseen scores outside the fitted
        # range are clipped to the boundary values.
        self._ir = IsotonicRegression(y_min=0.0, y_max=1.0, out_of_bounds="clip")
        self._ir.fit(scores, labels)
        self._fitted = True
        return self

    def transform(self, y_prob: Any) -> NDArray[np.float64]:
        """Map scores through the fitted isotonic function.

        Parameters
        ----------
        y_prob : array-like
            Scores; for a 2-D input, column 1 is used.

        Returns
        -------
        NDArray[np.float64]
            Calibrated probabilities.

        Raises
        ------
        RuntimeError
            If called before :meth:`fit`.
        """
        if not self._fitted:
            raise RuntimeError("Call fit() before transform().")
        scores = to_numpy(y_prob, dtype=np.float64)
        if scores.ndim == 2:
            scores = scores[:, 1]
        calibrated = self._ir.transform(scores)
        return np.array(calibrated, dtype=np.float64)

fit(y_prob, y_true)

Fit isotonic regression on calibration split.

Parameters:

Name Type Description Default
y_prob array-like

Scores, shape (N,).

required
y_true array-like

Binary labels.

required

Returns:

Type Description
IsotonicCalibrator
Source code in src/reliably/recalibrate/isotonic.py
def fit(self, y_prob: Any, y_true: Any) -> IsotonicCalibrator:
    """Fit the isotonic map on a calibration split.

    Parameters
    ----------
    y_prob : array-like
        Scores, shape ``(N,)``; for a 2-D input, column 1 is used.
    y_true : array-like
        Binary labels.

    Returns
    -------
    IsotonicCalibrator
        ``self``, holding the fitted regressor.

    Raises
    ------
    ImportError
        If scikit-learn is not installed.
    """
    # Imported lazily because scikit-learn is an optional dependency.
    try:
        from sklearn.isotonic import IsotonicRegression  # type: ignore
    except ImportError as exc:
        raise ImportError(
            "scikit-learn is required for IsotonicCalibrator. "
            "Install with: pip install reliably[sklearn]"
        ) from exc

    scores = to_numpy(y_prob, dtype=np.float64)
    labels = to_numpy(y_true, dtype=np.float64)
    if scores.ndim == 2:
        scores = scores[:, 1]

    # Outputs are bounded to [0, 1]; unseen scores outside the fitted range
    # are clipped to the boundary values.
    self._ir = IsotonicRegression(y_min=0.0, y_max=1.0, out_of_bounds="clip")
    self._ir.fit(scores, labels)
    self._fitted = True
    return self

transform(y_prob)

Apply isotonic calibration.

Parameters:

Name Type Description Default
y_prob array-like

Scores.

required

Returns:

Type Description
NDArray[float64]

Calibrated probabilities.

Source code in src/reliably/recalibrate/isotonic.py
def transform(self, y_prob: Any) -> NDArray[np.float64]:
    """Map scores through the fitted isotonic function.

    Parameters
    ----------
    y_prob : array-like
        Scores; for a 2-D input, column 1 is used.

    Returns
    -------
    NDArray[np.float64]
        Calibrated probabilities.

    Raises
    ------
    RuntimeError
        If called before ``fit()``.
    """
    if not self._fitted:
        raise RuntimeError("Call fit() before transform().")
    scores = to_numpy(y_prob, dtype=np.float64)
    if scores.ndim == 2:
        scores = scores[:, 1]
    calibrated = self._ir.transform(scores)
    return np.array(calibrated, dtype=np.float64)

MatrixScaler

Bases: Calibrator

Full K×K affine map on logits: p_cal = softmax(W·logits + b).

More expressive; gate behind method="matrix".

Examples:

>>> import numpy as np
>>> rng = np.random.default_rng(0)
>>> y = rng.integers(0, 3, 300)
>>> p = rng.dirichlet([1, 1, 1], 300)
>>> cal = MatrixScaler().fit(p, y)
>>> probs = cal.transform(p)
>>> np.allclose(probs.sum(axis=1), 1.0, atol=1e-6)
True
Source code in src/reliably/recalibrate/matrix.py
class MatrixScaler(Calibrator):
    """Full K×K affine map on logits: ``p_cal = softmax(W·logits + b)``.

    More expressive; gate behind ``method="matrix"``.

    The "logits" are pseudo-logits computed as ``log(clip_probs(p))`` from the
    input probabilities.  ``W`` is initialised to the identity and ``b`` to
    zero; both are fitted by minimising the mean negative log-likelihood with
    L-BFGS-B.

    Attributes
    ----------
    W_ : NDArray[np.float64]
        Fitted weight matrix, shape ``(K, K)``.  Set by :meth:`fit`.
    b_ : NDArray[np.float64]
        Fitted bias vector, shape ``(K,)``.  Set by :meth:`fit`.

    Examples
    --------
    >>> import numpy as np
    >>> rng = np.random.default_rng(0)
    >>> y = rng.integers(0, 3, 300)
    >>> p = rng.dirichlet([1, 1, 1], 300)
    >>> cal = MatrixScaler().fit(p, y)
    >>> probs = cal.transform(p)
    >>> np.allclose(probs.sum(axis=1), 1.0, atol=1e-6)
    True
    """

    W_: NDArray[np.float64]
    b_: NDArray[np.float64]

    def fit(self, y_prob: Any, y_true: Any) -> MatrixScaler:
        """Fit full matrix scaling.

        Parameters
        ----------
        y_prob : array-like
            Probabilities ``(N, K)``, or binary scores ``(N,)`` which are
            expanded to two columns ``[1 - p, p]``.
        y_true : array-like
            Integer labels, shape ``(N,)``, with values in ``[0, K - 1]``.

        Returns
        -------
        MatrixScaler
            This instance.

        Raises
        ------
        ValueError
            If ``y_prob`` is not 1-D or 2-D, if ``y_true`` does not have one
            label per row of ``y_prob``, or if a label is outside
            ``[0, K - 1]``.
        """
        y_prob_np = to_numpy(y_prob, dtype=np.float64)
        y_true_np = to_numpy(y_true, dtype=np.float64).astype(np.int64)

        if y_prob_np.ndim == 1:
            # Binary scores: build the two-class matrix [1 - p, p].
            y_prob_np = np.stack([1.0 - y_prob_np, y_prob_np], axis=1)
        if y_prob_np.ndim != 2:
            raise ValueError(
                f"y_prob must be 1-D or 2-D, got {y_prob_np.ndim} dimensions."
            )
        n, k = y_prob_np.shape

        # Without these checks the fancy index below would silently misbehave:
        # a shorter y_true fits on only the first rows, a column-vector y_true
        # broadcasts to an (N, N) selection, and negative labels wrap around
        # to the last classes.
        if y_true_np.shape != (n,):
            raise ValueError(
                f"y_true must have shape ({n},) to match y_prob, "
                f"got {y_true_np.shape}."
            )
        if n > 0 and (y_true_np.min() < 0 or y_true_np.max() >= k):
            raise ValueError(f"y_true labels must lie in [0, {k - 1}].")

        logits = np.log(clip_probs(y_prob_np))
        rows = np.arange(n)

        def nll(params: NDArray[np.float64]) -> float:
            # params is the flattened (K, K) weight matrix followed by the bias.
            weight_mat = params[: k * k].reshape(k, k)
            b = params[k * k :]
            z = logits @ weight_mat.T + b[None, :]
            probs = softmax(z)
            p_correct = clip_probs(probs[rows, y_true_np])
            return float(-np.log(p_correct).mean())

        # Start from the identity map (W = I, b = 0).
        x0 = np.concatenate([np.eye(k).ravel(), np.zeros(k)])
        res = minimize(nll, x0, method="L-BFGS-B")
        self.W_ = res.x[: k * k].reshape(k, k)
        self.b_ = res.x[k * k :]
        self._fitted = True
        return self

    def transform(self, y_prob: Any) -> NDArray[np.float64]:
        """Apply matrix scaling.

        Parameters
        ----------
        y_prob : array-like
            Probabilities.  A 1-D input is treated as binary scores and
            expanded to ``[1 - p, p]``.

        Returns
        -------
        NDArray[np.float64]
            Calibrated probabilities.  For 1-D input only the column at
            index 1 is returned.

        Raises
        ------
        RuntimeError
            If called before :meth:`fit`.
        """
        if not self._fitted:
            raise RuntimeError("Call fit() before transform().")
        y_prob_np = to_numpy(y_prob, dtype=np.float64)
        binary = y_prob_np.ndim == 1
        if binary:
            y_prob_np = np.stack([1.0 - y_prob_np, y_prob_np], axis=1)
        logits = np.log(clip_probs(y_prob_np))
        z = logits @ self.W_.T + self.b_[None, :]
        cal = softmax(z)
        if binary:
            return cal[:, 1]
        return cal

fit(y_prob, y_true)

Fit full matrix scaling.

Parameters:

Name Type Description Default
y_prob array-like

Probabilities (N, K).

required
y_true array-like

Integer labels.

required

Returns:

Type Description
MatrixScaler
Source code in src/reliably/recalibrate/matrix.py
def fit(self, y_prob: Any, y_true: Any) -> MatrixScaler:
    """Fit full matrix scaling.

    Parameters
    ----------
    y_prob : array-like
        Probabilities ``(N, K)``, or binary scores ``(N,)`` which are
        expanded to two columns ``[1 - p, p]``.
    y_true : array-like
        Integer labels, shape ``(N,)``, with values in ``[0, K - 1]``.

    Returns
    -------
    MatrixScaler
        This instance.

    Raises
    ------
    ValueError
        If ``y_prob`` is not 1-D or 2-D, if ``y_true`` does not have one
        label per row of ``y_prob``, or if a label is outside
        ``[0, K - 1]``.
    """
    y_prob_np = to_numpy(y_prob, dtype=np.float64)
    y_true_np = to_numpy(y_true, dtype=np.float64).astype(np.int64)

    if y_prob_np.ndim == 1:
        # Binary scores: build the two-class matrix [1 - p, p].
        y_prob_np = np.stack([1.0 - y_prob_np, y_prob_np], axis=1)
    if y_prob_np.ndim != 2:
        raise ValueError(
            f"y_prob must be 1-D or 2-D, got {y_prob_np.ndim} dimensions."
        )
    n, k = y_prob_np.shape

    # Without these checks the fancy index below would silently misbehave:
    # a shorter y_true fits on only the first rows, a column-vector y_true
    # broadcasts to an (N, N) selection, and negative labels wrap around
    # to the last classes.
    if y_true_np.shape != (n,):
        raise ValueError(
            f"y_true must have shape ({n},) to match y_prob, "
            f"got {y_true_np.shape}."
        )
    if n > 0 and (y_true_np.min() < 0 or y_true_np.max() >= k):
        raise ValueError(f"y_true labels must lie in [0, {k - 1}].")

    logits = np.log(clip_probs(y_prob_np))
    rows = np.arange(n)

    def nll(params: NDArray[np.float64]) -> float:
        # params is the flattened (K, K) weight matrix followed by the bias.
        weight_mat = params[: k * k].reshape(k, k)
        b = params[k * k :]
        z = logits @ weight_mat.T + b[None, :]
        probs = softmax(z)
        p_correct = clip_probs(probs[rows, y_true_np])
        return float(-np.log(p_correct).mean())

    # Start from the identity map (W = I, b = 0).
    x0 = np.concatenate([np.eye(k).ravel(), np.zeros(k)])
    res = minimize(nll, x0, method="L-BFGS-B")
    self.W_ = res.x[: k * k].reshape(k, k)
    self.b_ = res.x[k * k :]
    self._fitted = True
    return self

transform(y_prob)

Apply matrix scaling.

Parameters:

Name Type Description Default
y_prob array-like

Probabilities.

required

Returns:

Type Description
NDArray[float64]

Calibrated probabilities.

Source code in src/reliably/recalibrate/matrix.py
def transform(self, y_prob: Any) -> NDArray[np.float64]:
    """Calibrate probabilities with the fitted ``W_`` and ``b_``.

    Parameters
    ----------
    y_prob : array-like
        Probabilities.  A 1-D input is treated as binary scores and
        expanded to ``[1 - p, p]``.

    Returns
    -------
    NDArray[np.float64]
        Calibrated probabilities.  For 1-D input only the column at
        index 1 is returned.

    Raises
    ------
    RuntimeError
        If called before :meth:`fit`.
    """
    if not self._fitted:
        raise RuntimeError("Call fit() before transform().")
    probs = to_numpy(y_prob, dtype=np.float64)
    is_binary = probs.ndim == 1
    if is_binary:
        probs = np.stack([1.0 - probs, probs], axis=1)
    log_probs = np.log(clip_probs(probs))
    calibrated = softmax(log_probs @ self.W_.T + self.b_[None, :])
    return calibrated[:, 1] if is_binary else calibrated

VectorScaler

Bases: Calibrator

Per-class temperature scaling: p_cal = softmax(w ⊙ logits + b).

More expressive than scalar temperature but less prone to overfitting than full matrix scaling.

Examples:

>>> import numpy as np
>>> rng = np.random.default_rng(0)
>>> y = rng.integers(0, 3, 300)
>>> p = rng.dirichlet([1, 1, 1], 300)
>>> cal = VectorScaler().fit(p, y)
>>> probs = cal.transform(p)
>>> np.allclose(probs.sum(axis=1), 1.0, atol=1e-6)
True
Source code in src/reliably/recalibrate/matrix.py
class VectorScaler(Calibrator):
    """Per-class temperature scaling: ``p_cal = softmax(w ⊙ logits + b)``.

    More expressive than scalar temperature but less prone to overfitting
    than full matrix scaling.

    The "logits" are pseudo-logits computed as ``log(clip_probs(p))`` from the
    input probabilities.  ``w`` is initialised to ones and ``b`` to zeros;
    both are fitted by minimising the mean negative log-likelihood with
    L-BFGS-B.

    Attributes
    ----------
    W_ : NDArray[np.float64]
        Fitted per-class weights, shape ``(K,)``.  Set by :meth:`fit`.
    b_ : NDArray[np.float64]
        Fitted per-class biases, shape ``(K,)``.  Set by :meth:`fit`.

    Examples
    --------
    >>> import numpy as np
    >>> rng = np.random.default_rng(0)
    >>> y = rng.integers(0, 3, 300)
    >>> p = rng.dirichlet([1, 1, 1], 300)
    >>> cal = VectorScaler().fit(p, y)
    >>> probs = cal.transform(p)
    >>> np.allclose(probs.sum(axis=1), 1.0, atol=1e-6)
    True
    """

    W_: NDArray[np.float64]
    b_: NDArray[np.float64]

    def fit(self, y_prob: Any, y_true: Any) -> VectorScaler:
        """Fit per-class vector scaling.

        Parameters
        ----------
        y_prob : array-like
            Probabilities ``(N, K)``, or binary scores ``(N,)`` which are
            expanded to two columns ``[1 - p, p]``.
        y_true : array-like
            Integer labels, shape ``(N,)``, with values in ``[0, K - 1]``.

        Returns
        -------
        VectorScaler
            This instance.

        Raises
        ------
        ValueError
            If ``y_prob`` is not 1-D or 2-D, if ``y_true`` does not have one
            label per row of ``y_prob``, or if a label is outside
            ``[0, K - 1]``.
        """
        y_prob_np = to_numpy(y_prob, dtype=np.float64)
        y_true_np = to_numpy(y_true, dtype=np.float64).astype(np.int64)

        if y_prob_np.ndim == 1:
            # Binary scores: build the two-class matrix [1 - p, p].
            y_prob_np = np.stack([1.0 - y_prob_np, y_prob_np], axis=1)
        if y_prob_np.ndim != 2:
            raise ValueError(
                f"y_prob must be 1-D or 2-D, got {y_prob_np.ndim} dimensions."
            )
        n, k = y_prob_np.shape

        # Without these checks the fancy index below would silently misbehave:
        # a shorter y_true fits on only the first rows, a column-vector y_true
        # broadcasts to an (N, N) selection, and negative labels wrap around
        # to the last classes.
        if y_true_np.shape != (n,):
            raise ValueError(
                f"y_true must have shape ({n},) to match y_prob, "
                f"got {y_true_np.shape}."
            )
        if n > 0 and (y_true_np.min() < 0 or y_true_np.max() >= k):
            raise ValueError(f"y_true labels must lie in [0, {k - 1}].")

        logits = np.log(clip_probs(y_prob_np))
        rows = np.arange(n)

        def nll(params: NDArray[np.float64]) -> float:
            # params is the K weights followed by the K biases.
            weights = params[:k]
            b = params[k:]
            z = logits * weights[None, :] + b[None, :]
            probs = softmax(z)
            p_correct = clip_probs(probs[rows, y_true_np])
            return float(-np.log(p_correct).mean())

        # Start from the identity map (w = 1, b = 0).
        x0 = np.concatenate([np.ones(k), np.zeros(k)])
        res = minimize(nll, x0, method="L-BFGS-B")
        self.W_ = res.x[:k]
        self.b_ = res.x[k:]
        self._fitted = True
        return self

    def transform(self, y_prob: Any) -> NDArray[np.float64]:
        """Apply vector scaling.

        Parameters
        ----------
        y_prob : array-like
            Probabilities.  A 1-D input is treated as binary scores and
            expanded to ``[1 - p, p]``.

        Returns
        -------
        NDArray[np.float64]
            Calibrated probabilities.  For 1-D input only the column at
            index 1 is returned.

        Raises
        ------
        RuntimeError
            If called before :meth:`fit`.
        """
        if not self._fitted:
            raise RuntimeError("Call fit() before transform().")
        y_prob_np = to_numpy(y_prob, dtype=np.float64)
        binary = y_prob_np.ndim == 1
        if binary:
            y_prob_np = np.stack([1.0 - y_prob_np, y_prob_np], axis=1)
        logits = np.log(clip_probs(y_prob_np))
        z = logits * self.W_[None, :] + self.b_[None, :]
        cal = softmax(z)
        if binary:
            return cal[:, 1]
        return cal

fit(y_prob, y_true)

Fit per-class vector scaling.

Parameters:

Name Type Description Default
y_prob array-like

Probabilities (N, K).

required
y_true array-like

Integer labels.

required

Returns:

Type Description
VectorScaler
Source code in src/reliably/recalibrate/matrix.py
def fit(self, y_prob: Any, y_true: Any) -> VectorScaler:
    """Fit per-class vector scaling.

    Parameters
    ----------
    y_prob : array-like
        Probabilities ``(N, K)``, or binary scores ``(N,)`` which are
        expanded to two columns ``[1 - p, p]``.
    y_true : array-like
        Integer labels, shape ``(N,)``, with values in ``[0, K - 1]``.

    Returns
    -------
    VectorScaler
        This instance.

    Raises
    ------
    ValueError
        If ``y_prob`` is not 1-D or 2-D, if ``y_true`` does not have one
        label per row of ``y_prob``, or if a label is outside
        ``[0, K - 1]``.
    """
    y_prob_np = to_numpy(y_prob, dtype=np.float64)
    y_true_np = to_numpy(y_true, dtype=np.float64).astype(np.int64)

    if y_prob_np.ndim == 1:
        # Binary scores: build the two-class matrix [1 - p, p].
        y_prob_np = np.stack([1.0 - y_prob_np, y_prob_np], axis=1)
    if y_prob_np.ndim != 2:
        raise ValueError(
            f"y_prob must be 1-D or 2-D, got {y_prob_np.ndim} dimensions."
        )
    n, k = y_prob_np.shape

    # Without these checks the fancy index below would silently misbehave:
    # a shorter y_true fits on only the first rows, a column-vector y_true
    # broadcasts to an (N, N) selection, and negative labels wrap around
    # to the last classes.
    if y_true_np.shape != (n,):
        raise ValueError(
            f"y_true must have shape ({n},) to match y_prob, "
            f"got {y_true_np.shape}."
        )
    if n > 0 and (y_true_np.min() < 0 or y_true_np.max() >= k):
        raise ValueError(f"y_true labels must lie in [0, {k - 1}].")

    logits = np.log(clip_probs(y_prob_np))
    rows = np.arange(n)

    def nll(params: NDArray[np.float64]) -> float:
        # params is the K weights followed by the K biases.
        weights = params[:k]
        b = params[k:]
        z = logits * weights[None, :] + b[None, :]
        probs = softmax(z)
        p_correct = clip_probs(probs[rows, y_true_np])
        return float(-np.log(p_correct).mean())

    # Start from the identity map (w = 1, b = 0).
    x0 = np.concatenate([np.ones(k), np.zeros(k)])
    res = minimize(nll, x0, method="L-BFGS-B")
    self.W_ = res.x[:k]
    self.b_ = res.x[k:]
    self._fitted = True
    return self

transform(y_prob)

Apply vector scaling.

Parameters:

Name Type Description Default
y_prob array-like

Probabilities.

required

Returns:

Type Description
NDArray[float64]

Calibrated probabilities.

Source code in src/reliably/recalibrate/matrix.py
def transform(self, y_prob: Any) -> NDArray[np.float64]:
    """Calibrate probabilities with the fitted per-class ``W_`` and ``b_``.

    Parameters
    ----------
    y_prob : array-like
        Probabilities.  A 1-D input is treated as binary scores and
        expanded to ``[1 - p, p]``.

    Returns
    -------
    NDArray[np.float64]
        Calibrated probabilities.  For 1-D input only the column at
        index 1 is returned.

    Raises
    ------
    RuntimeError
        If called before :meth:`fit`.
    """
    if not self._fitted:
        raise RuntimeError("Call fit() before transform().")
    probs = to_numpy(y_prob, dtype=np.float64)
    is_binary = probs.ndim == 1
    if is_binary:
        probs = np.stack([1.0 - probs, probs], axis=1)
    log_probs = np.log(clip_probs(probs))
    calibrated = softmax(log_probs * self.W_[None, :] + self.b_[None, :])
    return calibrated[:, 1] if is_binary else calibrated

PlattScaler

Bases: Calibrator

Binary calibration via logistic regression: p_cal = σ(A·s + B).

Parameters:

None. The constructor takes no arguments.

Examples:

>>> import numpy as np
>>> rng = np.random.default_rng(0)
>>> y = rng.integers(0, 2, 300)
>>> s = rng.uniform(0, 1, 300)
>>> cal = PlattScaler().fit(s, y)
>>> probs = cal.transform(s)
>>> probs.shape == s.shape
True
Source code in src/reliably/recalibrate/platt.py
class PlattScaler(Calibrator):
    """Binary calibration via logistic regression: ``p_cal = σ(A·s + B)``.

    The constructor takes no arguments.  The slope ``A`` and intercept ``B``
    are fitted by minimising the summed binary negative log-likelihood with
    L-BFGS-B, starting from ``A = 1``, ``B = 0``.

    Attributes
    ----------
    A_ : float
        Fitted slope.  Set by :meth:`fit`.
    B_ : float
        Fitted intercept.  Set by :meth:`fit`.

    Examples
    --------
    >>> import numpy as np
    >>> rng = np.random.default_rng(0)
    >>> y = rng.integers(0, 2, 300)
    >>> s = rng.uniform(0, 1, 300)
    >>> cal = PlattScaler().fit(s, y)
    >>> probs = cal.transform(s)
    >>> probs.shape == s.shape
    True
    """

    A_: float
    B_: float

    def fit(self, y_prob: Any, y_true: Any) -> PlattScaler:
        """Estimate the slope and intercept on a calibration split.

        Parameters
        ----------
        y_prob : array-like
            Binary scores, shape ``(N,)``.  A 2-D input is reduced to its
            column at index 1.
        y_true : array-like
            Binary labels.

        Returns
        -------
        PlattScaler
            This instance.
        """
        scores = to_numpy(y_prob, dtype=np.float64)
        labels = to_numpy(y_true, dtype=np.float64)
        scores = scores[:, 1] if scores.ndim == 2 else scores

        def objective(theta: NDArray[np.float64]) -> float:
            slope, intercept = theta
            # Keep probabilities strictly inside (0, 1) so both logs are finite.
            prob = np.clip(expit(slope * scores + intercept), 1e-12, 1.0 - 1e-12)
            log_lik = labels * np.log(prob) + (1.0 - labels) * np.log(1.0 - prob)
            return float(-np.sum(log_lik))

        solution = minimize(objective, [1.0, 0.0], method="L-BFGS-B")
        fitted_slope, fitted_intercept = solution.x[0], solution.x[1]
        self.A_ = float(fitted_slope)
        self.B_ = float(fitted_intercept)
        self._fitted = True
        return self

    def transform(self, y_prob: Any) -> NDArray[np.float64]:
        """Map scores through the fitted logistic curve.

        Parameters
        ----------
        y_prob : array-like
            Binary scores.  A 2-D input is reduced to its column at index 1.

        Returns
        -------
        NDArray[np.float64]
            Calibrated probabilities.

        Raises
        ------
        RuntimeError
            If called before :meth:`fit`.
        """
        if not self._fitted:
            raise RuntimeError("Call fit() before transform().")
        scores = to_numpy(y_prob, dtype=np.float64)
        scores = scores[:, 1] if scores.ndim == 2 else scores
        calibrated = expit(self.A_ * scores + self.B_)
        return np.array(calibrated, dtype=np.float64)

fit(y_prob, y_true)

Fit logistic regression on calibration split.

Parameters:

Name Type Description Default
y_prob array-like

Binary scores, shape (N,).

required
y_true array-like

Binary labels.

required

Returns:

Type Description
PlattScaler
Source code in src/reliably/recalibrate/platt.py
def fit(self, y_prob: Any, y_true: Any) -> PlattScaler:
    """Estimate the slope and intercept on a calibration split.

    Parameters
    ----------
    y_prob : array-like
        Binary scores, shape ``(N,)``.  A 2-D input is reduced to its
        column at index 1.
    y_true : array-like
        Binary labels.

    Returns
    -------
    PlattScaler
        This instance.
    """
    scores = to_numpy(y_prob, dtype=np.float64)
    labels = to_numpy(y_true, dtype=np.float64)
    scores = scores[:, 1] if scores.ndim == 2 else scores

    def objective(theta: NDArray[np.float64]) -> float:
        slope, intercept = theta
        # Keep probabilities strictly inside (0, 1) so both logs are finite.
        prob = np.clip(expit(slope * scores + intercept), 1e-12, 1.0 - 1e-12)
        log_lik = labels * np.log(prob) + (1.0 - labels) * np.log(1.0 - prob)
        return float(-np.sum(log_lik))

    solution = minimize(objective, [1.0, 0.0], method="L-BFGS-B")
    fitted_slope, fitted_intercept = solution.x[0], solution.x[1]
    self.A_ = float(fitted_slope)
    self.B_ = float(fitted_intercept)
    self._fitted = True
    return self

transform(y_prob)

Apply Platt scaling.

Parameters:

Name Type Description Default
y_prob array-like

Binary scores.

required

Returns:

Type Description
NDArray[float64]

Calibrated probabilities.

Source code in src/reliably/recalibrate/platt.py
def transform(self, y_prob: Any) -> NDArray[np.float64]:
    """Map scores through the fitted logistic curve.

    Parameters
    ----------
    y_prob : array-like
        Binary scores.  A 2-D input is reduced to its column at index 1.

    Returns
    -------
    NDArray[np.float64]
        Calibrated probabilities.

    Raises
    ------
    RuntimeError
        If called before :meth:`fit`.
    """
    if not self._fitted:
        raise RuntimeError("Call fit() before transform().")
    scores = to_numpy(y_prob, dtype=np.float64)
    scores = scores[:, 1] if scores.ndim == 2 else scores
    calibrated = expit(self.A_ * scores + self.B_)
    return np.array(calibrated, dtype=np.float64)

TemperatureScaler

Bases: Calibrator

Calibrate by dividing logits by a scalar temperature > 0.

Fits temperature by minimizing NLL on the calibration split using SciPy's bounded scalar minimizer (minimize_scalar with method="bounded"). Preserves the argmax (accuracy unchanged).

Parameters:

Name Type Description Default
temp_bounds tuple[float, float]

Search bounds for temperature.

(0.01, 20.0)

Examples:

>>> import numpy as np
>>> rng = np.random.default_rng(0)
>>> y_true = rng.integers(0, 2, 200)
>>> y_prob = rng.dirichlet([1, 1], 200)
>>> cal = TemperatureScaler().fit(y_prob, y_true)
>>> cal.T_ > 0
True
>>> cal_probs = cal.transform(y_prob)
>>> np.allclose(cal_probs.sum(axis=1), 1.0, atol=1e-6)
True
Source code in src/reliably/recalibrate/temperature.py
class TemperatureScaler(Calibrator):
    """Calibrate by dividing logits by a scalar temperature > 0.

    Fits temperature by minimizing NLL on the calibration split using
    SciPy's bounded scalar minimizer (``minimize_scalar`` with
    ``method="bounded"``).  Preserves the argmax (accuracy unchanged).

    Parameters
    ----------
    temp_bounds : tuple[float, float]
        Search bounds ``(low, high)`` for temperature; must satisfy
        ``0 < low < high`` with a finite ``high``.

    Attributes
    ----------
    T_ : float
        Fitted temperature.  Set by :meth:`fit`.
    logits_ : NDArray[np.float64]
        Pseudo-logits ``log(clip_probs(p))`` of the calibration data seen by
        the last :meth:`fit` call.

    Raises
    ------
    ValueError
        If ``temp_bounds`` does not satisfy ``0 < low < high < inf``.

    Examples
    --------
    >>> import numpy as np
    >>> rng = np.random.default_rng(0)
    >>> y_true = rng.integers(0, 2, 200)
    >>> y_prob = rng.dirichlet([1, 1], 200)
    >>> cal = TemperatureScaler().fit(y_prob, y_true)
    >>> cal.T_ > 0
    True
    >>> cal_probs = cal.transform(y_prob)
    >>> np.allclose(cal_probs.sum(axis=1), 1.0, atol=1e-6)
    True
    """

    T_: float
    logits_: NDArray[np.float64]

    def __init__(self, temp_bounds: tuple[float, float] = (0.01, 20.0)) -> None:
        # Let the base class set up whatever state it maintains; the other
        # calibrators inherit its constructor unchanged.
        super().__init__()
        low, high = temp_bounds
        # A non-positive temperature would divide by zero or flip the argmax.
        if not (0.0 < low < high and np.isfinite(high)):
            raise ValueError(
                "temp_bounds must satisfy 0 < low < high < inf, "
                f"got {temp_bounds!r}."
            )
        self.T_bounds = temp_bounds

    def fit(self, y_prob: Any, y_true: Any) -> TemperatureScaler:
        """Fit temperature on calibration data.

        Parameters
        ----------
        y_prob : array-like
            Probabilities, shape ``(N, K)`` or ``(N,)`` (binary).  Values are
            always passed through ``clip_probs`` and ``np.log``, so raw
            logits must be converted to probabilities first.
        y_true : array-like
            Integer labels, shape ``(N,)``, with values in ``[0, K - 1]``.

        Returns
        -------
        TemperatureScaler
            This instance.

        Raises
        ------
        ValueError
            If ``y_prob`` is not 1-D or 2-D, if ``y_true`` does not have one
            label per row of ``y_prob``, or if a label is outside
            ``[0, K - 1]``.
        """
        y_prob_np = to_numpy(y_prob, dtype=np.float64)
        y_true_np = to_numpy(y_true, dtype=np.float64).astype(np.int64)

        if y_prob_np.ndim == 1:
            # Binary: convert to 2-class
            y_prob_np = np.stack([1.0 - y_prob_np, y_prob_np], axis=1)
        if y_prob_np.ndim != 2:
            raise ValueError(
                f"y_prob must be 1-D or 2-D, got {y_prob_np.ndim} dimensions."
            )
        n, k = y_prob_np.shape

        # Without these checks the fancy index below would silently misbehave:
        # a shorter y_true fits on only the first rows, a column-vector y_true
        # broadcasts to an (N, N) selection, and negative labels wrap around
        # to the last classes.
        if y_true_np.shape != (n,):
            raise ValueError(
                f"y_true must have shape ({n},) to match y_prob, "
                f"got {y_true_np.shape}."
            )
        if n > 0 and (y_true_np.min() < 0 or y_true_np.max() >= k):
            raise ValueError(f"y_true labels must lie in [0, {k - 1}].")

        # Recover pseudo-logits as log(p) — up to a constant, sufficient for
        # temperature scaling because softmax is shift-invariant
        p_clipped = clip_probs(y_prob_np)
        logits = np.log(p_clipped)
        self.logits_ = logits
        rows = np.arange(n)

        def nll_at_temp(temp: float) -> float:
            probs = softmax(logits / temp)
            p_correct = clip_probs(probs[rows, y_true_np])
            return float(-np.log(p_correct).mean())

        result = minimize_scalar(nll_at_temp, bounds=self.T_bounds, method="bounded")
        self.T_ = float(result.x)
        self._fitted = True
        return self

    def transform(self, y_prob: Any) -> NDArray[np.float64]:
        """Apply temperature scaling.

        Parameters
        ----------
        y_prob : array-like
            Probabilities to calibrate.  A 1-D input is treated as binary
            scores and expanded to ``[1 - p, p]``.

        Returns
        -------
        NDArray[np.float64]
            Calibrated probabilities.  For 1-D input only the column at
            index 1 is returned.

        Raises
        ------
        RuntimeError
            If called before :meth:`fit`.
        """
        if not self._fitted:
            raise RuntimeError("Call fit() before transform().")
        y_prob_np = to_numpy(y_prob, dtype=np.float64)
        binary = y_prob_np.ndim == 1
        if binary:
            y_prob_np = np.stack([1.0 - y_prob_np, y_prob_np], axis=1)

        p_clipped = clip_probs(y_prob_np)
        logits = np.log(p_clipped)
        cal = softmax(logits / self.T_)
        if binary:
            return cal[:, 1]
        return cal

fit(y_prob, y_true)

Fit temperature on calibration data.

Parameters:

Name Type Description Default
y_prob array-like

Probabilities, shape (N, K) or (N,) (binary). Values are always log-transformed after clipping, so raw logits must be converted to probabilities first.

required
y_true array-like

Integer labels.

required

Returns:

Type Description
TemperatureScaler
Source code in src/reliably/recalibrate/temperature.py
def fit(self, y_prob: Any, y_true: Any) -> TemperatureScaler:
    """Fit temperature on calibration data.

    Parameters
    ----------
    y_prob : array-like
        Probabilities, shape ``(N, K)`` or ``(N,)`` (binary).  Values are
        always passed through ``clip_probs`` and ``np.log``, so raw
        logits must be converted to probabilities first.
    y_true : array-like
        Integer labels, shape ``(N,)``, with values in ``[0, K - 1]``.

    Returns
    -------
    TemperatureScaler
        This instance.

    Raises
    ------
    ValueError
        If ``y_prob`` is not 1-D or 2-D, if ``y_true`` does not have one
        label per row of ``y_prob``, or if a label is outside
        ``[0, K - 1]``.
    """
    y_prob_np = to_numpy(y_prob, dtype=np.float64)
    y_true_np = to_numpy(y_true, dtype=np.float64).astype(np.int64)

    if y_prob_np.ndim == 1:
        # Binary: convert to 2-class
        y_prob_np = np.stack([1.0 - y_prob_np, y_prob_np], axis=1)
    if y_prob_np.ndim != 2:
        raise ValueError(
            f"y_prob must be 1-D or 2-D, got {y_prob_np.ndim} dimensions."
        )
    n, k = y_prob_np.shape

    # Without these checks the fancy index below would silently misbehave:
    # a shorter y_true fits on only the first rows, a column-vector y_true
    # broadcasts to an (N, N) selection, and negative labels wrap around
    # to the last classes.
    if y_true_np.shape != (n,):
        raise ValueError(
            f"y_true must have shape ({n},) to match y_prob, "
            f"got {y_true_np.shape}."
        )
    if n > 0 and (y_true_np.min() < 0 or y_true_np.max() >= k):
        raise ValueError(f"y_true labels must lie in [0, {k - 1}].")

    # Recover pseudo-logits as log(p) — up to a constant, sufficient for
    # temperature scaling because softmax is shift-invariant
    p_clipped = clip_probs(y_prob_np)
    logits = np.log(p_clipped)
    self.logits_ = logits
    rows = np.arange(n)

    def nll_at_temp(temp: float) -> float:
        probs = softmax(logits / temp)
        p_correct = clip_probs(probs[rows, y_true_np])
        return float(-np.log(p_correct).mean())

    result = minimize_scalar(nll_at_temp, bounds=self.T_bounds, method="bounded")
    self.T_ = float(result.x)
    self._fitted = True
    return self

transform(y_prob)

Apply temperature scaling.

Parameters:

Name Type Description Default
y_prob array-like

Probabilities to calibrate.

required

Returns:

Type Description
NDArray[float64]

Calibrated probabilities.

Source code in src/reliably/recalibrate/temperature.py
def transform(self, y_prob: Any) -> NDArray[np.float64]:
    """Rescale probabilities with the fitted temperature ``T_``.

    Parameters
    ----------
    y_prob : array-like
        Probabilities to calibrate.  A 1-D input is treated as binary
        scores and expanded to ``[1 - p, p]``.

    Returns
    -------
    NDArray[np.float64]
        Calibrated probabilities.  For 1-D input only the column at
        index 1 is returned.

    Raises
    ------
    RuntimeError
        If called before :meth:`fit`.
    """
    if not self._fitted:
        raise RuntimeError("Call fit() before transform().")
    probs = to_numpy(y_prob, dtype=np.float64)
    is_binary = probs.ndim == 1
    if is_binary:
        probs = np.stack([1.0 - probs, probs], axis=1)

    # Pseudo-logits log(p), divided by the temperature before the softmax.
    pseudo_logits = np.log(clip_probs(probs))
    calibrated = softmax(pseudo_logits / self.T_)
    return calibrated[:, 1] if is_binary else calibrated

reliably.CI dataclass

Confidence interval.

Parameters:

Name Type Description Default
low float

Lower bound.

required
high float

Upper bound.

required
level float

Nominal coverage, default 0.95.

0.95
method str

One of "bca", "percentile", "analytic".

'bca'

Examples:

>>> ci = CI(low=0.1, high=0.3)
>>> ci.low, ci.high
(0.1, 0.3)
Source code in src/reliably/_core/results.py
@dataclass(frozen=True, slots=True)
class CI:
    """Confidence interval.

    Parameters
    ----------
    low : float
        Lower bound.
    high : float
        Upper bound.
    level : float
        Nominal coverage, default 0.95.
    method : str
        One of ``"bca"``, ``"percentile"``, ``"analytic"``.

    Examples
    --------
    >>> ci = CI(low=0.1, high=0.3)
    >>> ci.low, ci.high
    (0.1, 0.3)
    """

    low: float
    high: float
    level: float = 0.95
    method: Literal["percentile", "bca", "analytic"] = "bca"

reliably.MetricResult dataclass

Metric point estimate with optional confidence interval.

Parameters:

Name Type Description Default
name str

Human-readable metric name, e.g. "smECE".

required
value float

Point estimate.

required
ci CI | None

Confidence interval; None only when CI computation is disabled.

required
n int

Sample size on which the metric was computed.

required
extra Mapping[str, float] | None

Optional extra scalars, e.g. Brier decomposition components.

None

Examples:

>>> mr = MetricResult(name="ECE", value=0.05, ci=CI(0.03, 0.07), n=100)
>>> "ECE" in str(mr)
True
Source code in src/reliably/_core/results.py
@dataclass(frozen=True, slots=True)
class MetricResult:
    """Metric point estimate with optional confidence interval.

    Parameters
    ----------
    name : str
        Human-readable metric name, e.g. ``"smECE"``.
    value : float
        Point estimate.
    ci : CI | None
        Confidence interval; ``None`` only when CI computation is disabled.
    n : int
        Sample size on which the metric was computed.
    extra : Mapping[str, float] | None
        Optional extra scalars, e.g. Brier decomposition components.

    Examples
    --------
    >>> mr = MetricResult(name="ECE", value=0.05, ci=CI(0.03, 0.07), n=100)
    >>> "ECE" in str(mr)
    True
    """

    name: str
    value: float
    ci: CI | None
    n: int
    extra: Mapping[str, float] | None = None

    def __str__(self) -> str:
        c = f" [{self.ci.low:.4f}, {self.ci.high:.4f}]" if self.ci else ""
        return f"{self.name}={self.value:.4f}{c}"

reliably.Report dataclass

Immutable result of :func:reliably.evaluate.

Parameters:

Name Type Description Default
task str

One of "binary", "multiclass".

required
metrics Mapping[str, MetricResult]

All computed metrics, keyed by name.

required
n int

Dataset size.

required
meta Mapping[str, object]

Provenance: seed, n_bootstrap, binning, etc.

required

Examples:

>>> r = Report(task="binary", metrics={}, n=100, meta={})
>>> r.task
'binary'
Source code in src/reliably/_core/results.py
@dataclass(frozen=True, slots=True)
class Report:
    """Immutable result of :func:`reliably.evaluate`.

    Parameters
    ----------
    task : str
        One of ``"binary"``, ``"multiclass"``.
    metrics : Mapping[str, MetricResult]
        All computed metrics, keyed by name.
    n : int
        Dataset size.
    meta : Mapping[str, object]
        Provenance: seed, n_bootstrap, binning, etc.

    Examples
    --------
    >>> r = Report(task="binary", metrics={}, n=100, meta={})
    >>> r.task
    'binary'
    """

    task: Literal["binary", "multiclass"]
    metrics: Mapping[str, MetricResult]
    n: int
    meta: Mapping[str, object]

    def __getitem__(self, name: str) -> MetricResult:
        return self.metrics[name]

    def summary(self) -> str:
        """Return a plain-text summary of all metrics."""
        lines = [f"Report(task={self.task}, n={self.n})"]
        for mr in self.metrics.values():
            lines.append(f"  {mr}")
        return "\n".join(lines)

    def to_html(self, path: str | Path | None = None) -> str:
        """Render the report to HTML.

        Requires the ``report`` extra (``pip install reliably[report]``).

        Parameters
        ----------
        path : str | Path | None
            If given, also write the HTML to this file.

        Returns
        -------
        str
            HTML string.

        Examples
        --------
        >>> r = Report(task="binary", metrics={}, n=100, meta={})
        >>> html = r.to_html()
        >>> "<html" in html
        True
        """
        from reliably.report.render import to_html as _to_html

        return _to_html(self, path=path)

    def to_markdown(self) -> str:
        """Render the report to a Markdown table.

        Returns
        -------
        str
            Markdown string.

        Examples
        --------
        >>> r = Report(task="binary", metrics={}, n=100, meta={})
        >>> "| Metric" in r.to_markdown()
        True
        """
        from reliably.report.render import to_markdown as _to_markdown

        return _to_markdown(self)

    def reliability_diagram(
        self,
        y_true: Any,
        y_prob: Any,
        *,
        n_bins: int = 15,
        binning: str = "adaptive",
        band: bool = True,
        n_bootstrap: int = 200,
        seed: int = 0,
        ax: matplotlib.axes.Axes | None = None,
        title: str = "Reliability Diagram",
    ) -> matplotlib.axes.Axes:
        """Plot a reliability diagram for this report's data.

        Requires the ``viz`` extra (``pip install reliably[viz]``).

        Parameters
        ----------
        y_true : array-like
            Integer labels.
        y_prob : array-like
            Predicted probabilities.
        n_bins : int
            Number of bins for the scatter overlay.
        binning : str
            ``"equal_width"`` or ``"adaptive"``.
        band : bool
            Whether to draw the bootstrap confidence band.
        n_bootstrap : int
            Bootstrap resamples for the confidence band.
        seed : int
            RNG seed.
        ax : matplotlib.axes.Axes | None
            Existing axes to draw on; creates a new figure if ``None``.
        title : str
            Plot title.

        Returns
        -------
        matplotlib.axes.Axes
        """
        from reliably.viz.diagrams import reliability_diagram as _rd

        return _rd(
            y_true, y_prob,
            n_bins=n_bins, binning=binning, band=band,
            n_bootstrap=n_bootstrap, seed=seed, ax=ax, title=title,
        )

summary()

Return a plain-text summary of all metrics.

Source code in src/reliably/_core/results.py
def summary(self) -> str:
    """Return a plain-text summary of all metrics."""
    header = f"Report(task={self.task}, n={self.n})"
    # One indented line per metric, in the mapping's iteration order.
    body = [f"  {result}" for result in self.metrics.values()]
    return "\n".join([header, *body])

to_html(path=None)

Render the report to HTML.

Requires the report extra (pip install reliably[report]).

Parameters:

Name Type Description Default
path str | Path | None

If given, also write the HTML to this file.

None

Returns:

Type Description
str

HTML string.

Examples:

>>> r = Report(task="binary", metrics={}, n=100, meta={})
>>> html = r.to_html()
>>> "<html" in html
True
Source code in src/reliably/_core/results.py
def to_html(self, path: str | Path | None = None) -> str:
    """Render the report to HTML.

    Requires the ``report`` extra (``pip install reliably[report]``).

    Parameters
    ----------
    path : str | Path | None
        If given, also write the HTML to this file.

    Returns
    -------
    str
        HTML string.

    Examples
    --------
    >>> r = Report(task="binary", metrics={}, n=100, meta={})
    >>> html = r.to_html()
    >>> "<html" in html
    True
    """
    # Imported on demand so the renderer is only needed when used.
    from reliably.report.render import to_html as render_html

    html = render_html(self, path=path)
    return html

to_markdown()

Render the report to a Markdown table.

Returns:

Type Description
str

Markdown string.

Examples:

>>> r = Report(task="binary", metrics={}, n=100, meta={})
>>> "| Metric" in r.to_markdown()
True
Source code in src/reliably/_core/results.py
def to_markdown(self) -> str:
    """Render the report to a Markdown table.

    Returns
    -------
    str
        Markdown string.

    Examples
    --------
    >>> r = Report(task="binary", metrics={}, n=100, meta={})
    >>> "| Metric" in r.to_markdown()
    True
    """
    # Imported on demand so the renderer is only needed when used.
    from reliably.report.render import to_markdown as render_markdown

    markdown = render_markdown(self)
    return markdown

reliability_diagram(y_true, y_prob, *, n_bins=15, binning='adaptive', band=True, n_bootstrap=200, seed=0, ax=None, title='Reliability Diagram')

Plot a reliability diagram for this report's data.

Requires the viz extra (pip install reliably[viz]).

Parameters:

Name Type Description Default
y_true array-like

Integer labels.

required
y_prob array-like

Predicted probabilities.

required
n_bins int

Number of bins for the scatter overlay.

15
binning str

"equal_width" or "adaptive".

'adaptive'
band bool

Whether to draw the bootstrap confidence band.

True
n_bootstrap int

Bootstrap resamples for the confidence band.

200
seed int

RNG seed.

0
ax Axes | None

Existing axes to draw on; creates a new figure if None.

None
title str

Plot title.

'Reliability Diagram'

Returns:

Type Description
Axes
Source code in src/reliably/_core/results.py
def reliability_diagram(
    self,
    y_true: Any,
    y_prob: Any,
    *,
    n_bins: int = 15,
    binning: str = "adaptive",
    band: bool = True,
    n_bootstrap: int = 200,
    seed: int = 0,
    ax: matplotlib.axes.Axes | None = None,
    title: str = "Reliability Diagram",
) -> matplotlib.axes.Axes:
    """Plot a reliability diagram for this report's data.

    Requires the ``viz`` extra (``pip install reliably[viz]``).

    Parameters
    ----------
    y_true : array-like
        Integer labels.
    y_prob : array-like
        Predicted probabilities.
    n_bins : int
        Number of bins for the scatter overlay.
    binning : str
        ``"equal_width"`` or ``"adaptive"``.
    band : bool
        Whether to draw the bootstrap confidence band.
    n_bootstrap : int
        Bootstrap resamples for the confidence band.
    seed : int
        RNG seed.
    ax : matplotlib.axes.Axes | None
        Existing axes to draw on; creates a new figure if ``None``.
    title : str
        Plot title.

    Returns
    -------
    matplotlib.axes.Axes
    """
    from reliably.viz.diagrams import reliability_diagram as draw_diagram

    # All plotting options are forwarded unchanged; the data is supplied by
    # the caller rather than read from this report.
    options = {
        "n_bins": n_bins,
        "binning": binning,
        "band": band,
        "n_bootstrap": n_bootstrap,
        "seed": seed,
        "ax": ax,
        "title": title,
    }
    return draw_diagram(y_true, y_prob, **options)

reliably.ComparisonResult dataclass

Result of :func:reliably.compare.

Parameters:

Name Type Description Default
metric str

Name of the compared metric.

required
delta float

Point estimate of the difference (value_a - value_b).

required
ci CI

Confidence interval on the difference.

required
p_value float

Two-sided p-value.

required
test str

Test used: "delong" or "paired_bootstrap".

required
significant bool

True if p_value < 1 - ci.level after correction.

required
correction str | None

Multiple-comparison correction applied (e.g. "holm").

required

Examples:

>>> cr = ComparisonResult(
...     metric="auroc", delta=0.02, ci=CI(-0.01, 0.05),
...     p_value=0.19, test="delong", significant=False, correction="holm"
... )
>>> cr.significant
False
Source code in src/reliably/_core/results.py
@dataclass(frozen=True, slots=True)
class ComparisonResult:
    """Result of :func:`reliably.compare`.

    Parameters
    ----------
    metric : str
        Name of the compared metric.
    delta : float
        Point estimate of the difference (value_a - value_b).
    ci : CI
        Confidence interval on the difference.
    p_value : float
        Two-sided p-value.
    test : str
        Test used: ``"delong"`` or ``"paired_bootstrap"``.
    significant : bool
        ``True`` if ``p_value < 1 - ci.level`` after correction.
    correction : str | None
        Multiple-comparison correction applied (e.g. ``"holm"``).

    Examples
    --------
    >>> cr = ComparisonResult(
    ...     metric="auroc", delta=0.02, ci=CI(-0.01, 0.05),
    ...     p_value=0.19, test="delong", significant=False, correction="holm"
    ... )
    >>> cr.significant
    False
    """

    metric: str
    delta: float
    ci: CI
    p_value: float
    test: Literal["delong", "paired_bootstrap"]
    significant: bool
    correction: str | None