Skip to content

Metrics

Calibration

reliably.metrics.calibration.ece(y_true, y_prob, *, n_bins=15, binning='equal_width', ci='bca', n_bootstrap=2000, level=0.95, seed=0)

Expected Calibration Error (ECE) with equal-width or adaptive binning.

Parameters:

Name Type Description Default
y_true array-like

Integer labels, shape (N,).

required
y_prob array-like

Probability matrix (N, K) or binary scores (N,).

required
n_bins int

Number of bins (default 15).

15
binning str

"equal_width" or "adaptive".

'equal_width'
ci str | None

CI method: "bca", "percentile", or None.

'bca'
n_bootstrap int

Bootstrap resamples.

2000
level float

Nominal coverage.

0.95
seed int

RNG seed.

0

Returns:

Type Description
MetricResult

Named "ECE" or "adaptive_ECE".

Examples:

>>> import numpy as np
>>> rng = np.random.default_rng(0)
>>> y = rng.integers(0, 2, 200)
>>> p = rng.uniform(0, 1, 200)
>>> result = ece(y, p, ci=None)
>>> 0.0 <= result.value <= 1.0
True
Source code in src/reliably/metrics/calibration.py
def ece(
    y_true: Any,
    y_prob: Any,
    *,
    n_bins: int = 15,
    binning: str = "equal_width",
    ci: str | None = "bca",
    n_bootstrap: int = 2000,
    level: float = 0.95,
    seed: int = 0,
) -> MetricResult:
    """Binned Expected Calibration Error (ECE) with an optional bootstrap CI.

    Parameters
    ----------
    y_true : array-like
        Integer labels of shape ``(N,)``.
    y_prob : array-like
        Either an ``(N, K)`` probability matrix or ``(N,)`` binary scores.
    n_bins : int
        How many bins to use (15 unless overridden).
    binning : str
        ``"equal_width"`` for fixed-width bins; any other value (normally
        ``"adaptive"``) builds the edges with ``adaptive_bins`` from the
        confidences being scored.
    ci : str | None
        Confidence-interval method (``"bca"`` or ``"percentile"``);
        ``None`` returns the point estimate only.
    n_bootstrap : int
        Number of bootstrap resamples.
    level : float
        Nominal coverage of the interval.
    seed : int
        Seed for the bootstrap RNG.

    Returns
    -------
    MetricResult
        Result named ``"ECE"`` for equal-width binning and
        ``"adaptive_ECE"`` otherwise.

    Examples
    --------
    >>> import numpy as np
    >>> rng = np.random.default_rng(0)
    >>> y = rng.integers(0, 2, 200)
    >>> p = rng.uniform(0, 1, 200)
    >>> result = ece(y, p, ci=None)
    >>> 0.0 <= result.value <= 1.0
    True
    """
    labels = to_numpy(y_true, dtype=np.float64).astype(np.int64)
    probs = to_numpy(y_prob, dtype=np.float64)
    n = len(labels)

    conf, acc = _top_label_conf_acc(labels, probs)
    use_equal_width = binning == "equal_width"
    name = "ECE" if use_equal_width else "adaptive_ECE"

    def _binned_ece(
        c: NDArray[np.float64], a: NDArray[np.float64], total: int
    ) -> float:
        # Adaptive edges are rebuilt from the sample being scored, so every
        # bootstrap resample gets its own bin boundaries.
        if use_equal_width:
            bin_edges = equal_width_bins(n_bins)
        else:
            bin_edges = adaptive_bins(c, n_bins)
        bin_conf, bin_acc, bin_count = bin_stats(c, a, bin_edges)
        return _ece_from_bins(bin_conf, bin_acc, bin_count, total)

    def _est(idx: NDArray[np.intp]) -> float:
        return _binned_ece(conf[idx], acc[idx], len(idx))

    point = _binned_ece(conf, acc, n)

    if ci is None:
        return MetricResult(name=name, value=point, ci=None, n=n)

    # Binned metrics go through the generic index-resampling bootstrap.
    from reliably.stats.bootstrap import bootstrap_ci

    interval = bootstrap_ci(
        _est,
        n,
        point=point,
        n_boot=n_bootstrap,
        level=level,
        method=ci,
        seed=seed,
    )
    return MetricResult(name=name, value=point, ci=interval, n=n)

reliably.metrics.calibration.adaptive_ece(y_true, y_prob, *, n_bins=15, ci='bca', n_bootstrap=2000, level=0.95, seed=0)

Adaptive ECE (equal-mass / quantile binning).

Parameters:

Name Type Description Default
y_true array-like

Integer labels.

required
y_prob array-like

Probability matrix or binary scores.

required
n_bins int

Number of bins.

15
ci str | None

CI method.

'bca'
n_bootstrap int

Bootstrap resamples.

2000
level float

Nominal coverage.

0.95
seed int

RNG seed.

0

Returns:

Type Description
MetricResult

Examples:

>>> import numpy as np
>>> rng = np.random.default_rng(1)
>>> y = rng.integers(0, 2, 300)
>>> p = rng.uniform(0, 1, 300)
>>> r = adaptive_ece(y, p, ci=None)
>>> 0.0 <= r.value <= 1.0
True
Source code in src/reliably/metrics/calibration.py
def adaptive_ece(
    y_true: Any,
    y_prob: Any,
    *,
    n_bins: int = 15,
    ci: str | None = "bca",
    n_bootstrap: int = 2000,
    level: float = 0.95,
    seed: int = 0,
) -> MetricResult:
    """ECE computed with adaptive (equal-mass / quantile) bins.

    Convenience wrapper: forwards every argument to :func:`ece` with
    ``binning="adaptive"``.

    Parameters
    ----------
    y_true : array-like
        Integer labels.
    y_prob : array-like
        Probability matrix or binary scores.
    n_bins : int
        Number of bins.
    ci : str | None
        Confidence-interval method, or ``None`` for no interval.
    n_bootstrap : int
        Number of bootstrap resamples.
    level : float
        Nominal coverage of the interval.
    seed : int
        Seed for the bootstrap RNG.

    Returns
    -------
    MetricResult
        Whatever :func:`ece` returns for adaptive binning.

    Examples
    --------
    >>> import numpy as np
    >>> rng = np.random.default_rng(1)
    >>> y = rng.integers(0, 2, 300)
    >>> p = rng.uniform(0, 1, 300)
    >>> r = adaptive_ece(y, p, ci=None)
    >>> 0.0 <= r.value <= 1.0
    True
    """
    result = ece(
        y_true,
        y_prob,
        binning="adaptive",
        n_bins=n_bins,
        ci=ci,
        n_bootstrap=n_bootstrap,
        level=level,
        seed=seed,
    )
    return result

reliably.metrics.calibration.smece(y_true, y_prob, *, bandwidth=None, ci='bca', n_bootstrap=2000, level=0.95, seed=0)

Smooth (kernel) Expected Calibration Error (smECE).

Uses Gaussian kernel regression of correctness on confidence. Bandwidth is selected by a Silverman-style rule-of-thumb if not provided.

Parameters:

Name Type Description Default
y_true array-like

Integer labels.

required
y_prob array-like

Probability matrix or binary scores.

required
bandwidth float | None

Kernel bandwidth h; None uses h ∝ N^{-1/5}.

None
ci str | None

CI method.

'bca'
n_bootstrap int

Bootstrap resamples.

2000
level float

Nominal coverage.

0.95
seed int

RNG seed.

0

Returns:

Type Description
MetricResult

Named "smECE".

Examples:

>>> import numpy as np
>>> rng = np.random.default_rng(4)
>>> y = rng.integers(0, 2, 400)
>>> p = rng.uniform(0, 1, 400)
>>> r = smece(y, p, ci=None)
>>> 0.0 <= r.value <= 1.0
True
Source code in src/reliably/metrics/calibration.py
def smece(
    y_true: Any,
    y_prob: Any,
    *,
    bandwidth: float | None = None,
    ci: str | None = "bca",
    n_bootstrap: int = 2000,
    level: float = 0.95,
    seed: int = 0,
) -> MetricResult:
    """Smooth (kernel) Expected Calibration Error (smECE).

    Correctness is regressed on confidence with a Gaussian kernel, and the
    mean absolute gap between the smoothed accuracy and the confidence is
    reported. When no bandwidth is supplied, ``max(0.1 * N**-0.2, 0.01)``
    is used, with ``N`` the size of the sample being scored.

    Parameters
    ----------
    y_true : array-like
        Integer labels.
    y_prob : array-like
        Probability matrix or binary scores.
    bandwidth : float | None
        Kernel bandwidth ``h``; ``None`` selects the ``h ∝ N^{-1/5}`` rule
        described above.
    ci : str | None
        Confidence-interval method, or ``None`` for no interval.
    n_bootstrap : int
        Number of bootstrap resamples.
    level : float
        Nominal coverage of the interval.
    seed : int
        Seed for the bootstrap RNG.

    Returns
    -------
    MetricResult
        Result named ``"smECE"``.

    Examples
    --------
    >>> import numpy as np
    >>> rng = np.random.default_rng(4)
    >>> y = rng.integers(0, 2, 400)
    >>> p = rng.uniform(0, 1, 400)
    >>> r = smece(y, p, ci=None)
    >>> 0.0 <= r.value <= 1.0
    True
    """
    labels = to_numpy(y_true, dtype=np.float64).astype(np.int64)
    probs = to_numpy(y_prob, dtype=np.float64)
    n = len(labels)

    conf, acc = _top_label_conf_acc(labels, probs)

    def _smece_from_arrays(c: NDArray[np.float64], a: NDArray[np.float64]) -> float:
        size = len(c)
        if bandwidth is not None:
            h = bandwidth
        else:
            # Rule-of-thumb bandwidth, floored at 0.01.
            h = max(0.1 * size ** (-0.2), 0.01)
        # Pairwise confidence differences -> (size, size) kernel weights.
        pairwise = c[:, np.newaxis] - c[np.newaxis, :]
        weights = np.exp(-(pairwise**2) / (2.0 * h**2))
        row_totals = weights.sum(axis=1)
        # Kernel-weighted average of correctness around each confidence.
        smoothed_acc = weights @ a / row_totals
        return float(np.abs(smoothed_acc - c).mean())

    point = _smece_from_arrays(conf, acc)

    if ci is None:
        return MetricResult(name="smECE", value=point, ci=None, n=n)

    from reliably.stats.bootstrap import bootstrap_ci

    def _est(idx: NDArray[np.intp]) -> float:
        return _smece_from_arrays(conf[idx], acc[idx])

    interval = bootstrap_ci(
        _est,
        n,
        point=point,
        n_boot=n_bootstrap,
        level=level,
        method=ci,
        seed=seed,
    )
    return MetricResult(name="smECE", value=point, ci=interval, n=n)

reliably.metrics.calibration.mce(y_true, y_prob, *, n_bins=15, binning='adaptive', ci='bca', n_bootstrap=2000, level=0.95, seed=0)

Maximum Calibration Error (MCE).

Parameters:

Name Type Description Default
y_true array-like

Integer labels.

required
y_prob array-like

Probability matrix or binary scores.

required
n_bins int

Number of bins.

15
binning str

"equal_width" or "adaptive".

'adaptive'
ci str | None

CI method.

'bca'
n_bootstrap int

Bootstrap resamples.

2000
level float

Nominal coverage.

0.95
seed int

RNG seed.

0

Returns:

Type Description
MetricResult

Named "MCE".

Examples:

>>> import numpy as np
>>> rng = np.random.default_rng(2)
>>> y = rng.integers(0, 2, 200)
>>> p = rng.uniform(0, 1, 200)
>>> r = mce(y, p, ci=None)
>>> 0.0 <= r.value <= 1.0
True
Source code in src/reliably/metrics/calibration.py
def mce(
    y_true: Any,
    y_prob: Any,
    *,
    n_bins: int = 15,
    binning: str = "adaptive",
    ci: str | None = "bca",
    n_bootstrap: int = 2000,
    level: float = 0.95,
    seed: int = 0,
) -> MetricResult:
    """Maximum Calibration Error (MCE): the largest per-bin calibration gap.

    Parameters
    ----------
    y_true : array-like
        Integer labels.
    y_prob : array-like
        Probability matrix or binary scores.
    n_bins : int
        Number of bins.
    binning : str
        ``"equal_width"`` for fixed-width bins; any other value (normally
        ``"adaptive"``) uses ``adaptive_bins``.
    ci : str | None
        Confidence-interval method, or ``None`` for no interval.
    n_bootstrap : int
        Number of bootstrap resamples.
    level : float
        Nominal coverage of the interval.
    seed : int
        Seed for the bootstrap RNG.

    Returns
    -------
    MetricResult
        Result named ``"MCE"``.

    Examples
    --------
    >>> import numpy as np
    >>> rng = np.random.default_rng(2)
    >>> y = rng.integers(0, 2, 200)
    >>> p = rng.uniform(0, 1, 200)
    >>> r = mce(y, p, ci=None)
    >>> 0.0 <= r.value <= 1.0
    True
    """
    labels = to_numpy(y_true, dtype=np.float64).astype(np.int64)
    probs = to_numpy(y_prob, dtype=np.float64)
    n = len(labels)

    conf, acc = _top_label_conf_acc(labels, probs)

    def _max_gap(c: NDArray[np.float64], a: NDArray[np.float64]) -> float:
        if binning == "equal_width":
            bin_edges = equal_width_bins(n_bins)
        else:
            bin_edges = adaptive_bins(c, n_bins)
        bin_conf, bin_acc, bin_count = bin_stats(c, a, bin_edges)
        # Empty bins are forced to a zero gap so they can never be the max.
        gaps = np.where(bin_count == 0, 0.0, np.abs(bin_acc - bin_conf))
        return float(gaps.max())

    def _est(idx: NDArray[np.intp]) -> float:
        return _max_gap(conf[idx], acc[idx])

    point = _max_gap(conf, acc)

    if ci is None:
        return MetricResult(name="MCE", value=point, ci=None, n=n)

    from reliably.stats.bootstrap import bootstrap_ci

    interval = bootstrap_ci(
        _est,
        n,
        point=point,
        n_boot=n_bootstrap,
        level=level,
        method=ci,
        seed=seed,
    )
    return MetricResult(name="MCE", value=point, ci=interval, n=n)

reliably.metrics.calibration.debiased_ece(y_true, y_prob, *, n_bins=15, binning='adaptive', ci='bca', n_bootstrap=2000, level=0.95, seed=0)

Debiased ECE² (bias-corrected squared calibration error).

Removes the finite-sample positive bias of the plug-in estimator.

Parameters:

Name Type Description Default
y_true array-like

Integer labels.

required
y_prob array-like

Probability matrix or binary scores.

required
n_bins int

Number of bins.

15
binning str

"equal_width" or "adaptive".

'adaptive'
ci str | None

CI method.

'bca'
n_bootstrap int

Bootstrap resamples.

2000
level float

Nominal coverage.

0.95
seed int

RNG seed.

0

Returns:

Type Description
MetricResult

Named "debiased_ECE2" (square-root reported as value).

Examples:

>>> import numpy as np
>>> rng = np.random.default_rng(3)
>>> y = rng.integers(0, 2, 500)
>>> p = rng.uniform(0, 1, 500)
>>> r = debiased_ece(y, p, ci=None)
>>> r.value >= 0.0
True
Source code in src/reliably/metrics/calibration.py
def debiased_ece(
    y_true: Any,
    y_prob: Any,
    *,
    n_bins: int = 15,
    binning: str = "adaptive",
    ci: str | None = "bca",
    n_bootstrap: int = 2000,
    level: float = 0.95,
    seed: int = 0,
) -> MetricResult:
    """Debiased ECE² (bias-corrected squared calibration error).

    Removes the finite-sample positive bias of the plug-in estimator: for
    each bin the term ``acc * (1 - acc) / (|B| - 1)`` is subtracted from the
    squared gap between bin accuracy and bin confidence, the result is
    clipped at zero, and the bins are combined weighted by their share of
    the sample. The square root of that sum is reported.

    Parameters
    ----------
    y_true : array-like
        Integer labels.
    y_prob : array-like
        Probability matrix or binary scores.
    n_bins : int
        Number of bins.
    binning : str
        ``"equal_width"`` or ``"adaptive"``. Any value other than
        ``"equal_width"`` is treated as adaptive.
    ci : str | None
        CI method, or ``None`` to skip the interval.
    n_bootstrap : int
        Bootstrap resamples.
    level : float
        Nominal coverage.
    seed : int
        RNG seed.

    Returns
    -------
    MetricResult
        Named ``"debiased_ECE2"`` (square-root reported as value).

    Examples
    --------
    >>> import numpy as np
    >>> rng = np.random.default_rng(3)
    >>> y = rng.integers(0, 2, 500)
    >>> p = rng.uniform(0, 1, 500)
    >>> r = debiased_ece(y, p, ci=None)
    >>> r.value >= 0.0
    True
    """
    y_true_np = to_numpy(y_true, dtype=np.float64).astype(np.int64)
    y_prob_np = to_numpy(y_prob, dtype=np.float64)
    n = len(y_true_np)

    conf, acc = _top_label_conf_acc(y_true_np, y_prob_np)

    def _est_sq(idx: NDArray[np.intp]) -> float:
        c, a = conf[idx], acc[idx]
        nn = len(idx)
        edges = (
            equal_width_bins(n_bins) if binning == "equal_width" else adaptive_bins(c, n_bins)
        )
        bc, ba, bn = bin_stats(c, a, edges)
        sq_gap = (ba - bc) ** 2

        # Bias term acc*(1-acc)/(|B|-1), evaluated only where |B| > 1.
        # ``np.where`` would compute the division for every bin first, so
        # singleton bins would divide by zero and emit RuntimeWarnings.
        multi = bn > 1
        bias = np.zeros(np.shape(sq_gap), dtype=np.float64)
        bias[multi] = ba[multi] * (1.0 - ba[multi]) / (bn[multi] - 1)

        debiased = np.maximum(sq_gap - bias, 0.0)
        # Empty bins carry zero weight; zero them explicitly (as ``mce``
        # does) so a NaN bin statistic cannot turn the whole sum into NaN.
        debiased[bn == 0] = 0.0
        return float(np.sum(bn / nn * debiased))

    # The point estimate is the same estimator applied to the full sample.
    point = float(np.sqrt(_est_sq(np.arange(n))))

    if ci is None:
        return MetricResult(name="debiased_ECE2", value=point, ci=None, n=n)

    from reliably.stats.bootstrap import bootstrap_ci

    ci_result = bootstrap_ci(
        lambda idx: float(np.sqrt(_est_sq(idx))),
        n,
        point=point,
        n_boot=n_bootstrap,
        level=level,
        method=ci,
        seed=seed,
    )
    return MetricResult(name="debiased_ECE2", value=point, ci=ci_result, n=n)

reliably.metrics.calibration.classwise_ece(y_true, y_prob, *, n_bins=15, binning='adaptive', ci='bca', n_bootstrap=2000, level=0.95, seed=0)

Classwise / marginal ECE for multiclass models.

Parameters:

Name Type Description Default
y_true array-like

Integer labels.

required
y_prob array-like

Probability matrix (N, K).

required
n_bins int

Number of bins per class.

15
binning str

"equal_width" or "adaptive".

'adaptive'
ci str | None

CI method.

'bca'
n_bootstrap int

Bootstrap resamples.

2000
level float

Nominal coverage.

0.95
seed int

RNG seed.

0

Returns:

Type Description
MetricResult

Named "cwECE".

Examples:

>>> import numpy as np
>>> rng = np.random.default_rng(5)
>>> y = rng.integers(0, 3, 300)
>>> p = rng.dirichlet([1, 1, 1], 300)
>>> r = classwise_ece(y, p, ci=None)
>>> 0.0 <= r.value <= 1.0
True
Source code in src/reliably/metrics/calibration.py
def classwise_ece(
    y_true: Any,
    y_prob: Any,
    *,
    n_bins: int = 15,
    binning: str = "adaptive",
    ci: str | None = "bca",
    n_bootstrap: int = 2000,
    level: float = 0.95,
    seed: int = 0,
) -> MetricResult:
    """Classwise (marginal) ECE: per-class binned ECE averaged over classes.

    Each class column is scored one-vs-rest against its own bins and the
    per-class values are averaged. One-dimensional scores ``p`` are expanded
    to the two-column matrix ``[1 - p, p]`` first.

    Parameters
    ----------
    y_true : array-like
        Integer labels.
    y_prob : array-like
        Probability matrix ``(N, K)``.
    n_bins : int
        Number of bins used for each class.
    binning : str
        ``"equal_width"`` for fixed-width bins; any other value (normally
        ``"adaptive"``) uses ``adaptive_bins``.
    ci : str | None
        Confidence-interval method, or ``None`` for no interval.
    n_bootstrap : int
        Number of bootstrap resamples.
    level : float
        Nominal coverage of the interval.
    seed : int
        Seed for the bootstrap RNG.

    Returns
    -------
    MetricResult
        Result named ``"cwECE"``.

    Examples
    --------
    >>> import numpy as np
    >>> rng = np.random.default_rng(5)
    >>> y = rng.integers(0, 3, 300)
    >>> p = rng.dirichlet([1, 1, 1], 300)
    >>> r = classwise_ece(y, p, ci=None)
    >>> 0.0 <= r.value <= 1.0
    True
    """
    labels = to_numpy(y_true, dtype=np.float64).astype(np.int64)
    probs = to_numpy(y_prob, dtype=np.float64)
    n = len(labels)

    if probs.ndim == 1:
        # Binary scores: build explicit [negative, positive] columns.
        probs = np.stack([1.0 - probs, probs], axis=1)

    n_classes = probs.shape[1]

    def _cw_ece(idx: NDArray[np.intp]) -> float:
        sample_labels = labels[idx]
        sample_probs = probs[idx]
        sample_size = len(idx)

        def _class_term(cls: int) -> float:
            class_conf = sample_probs[:, cls]
            class_hit = (sample_labels == cls).astype(np.float64)
            if binning == "equal_width":
                bin_edges = equal_width_bins(n_bins)
            else:
                bin_edges = adaptive_bins(class_conf, n_bins)
            bin_conf, bin_acc, bin_count = bin_stats(class_conf, class_hit, bin_edges)
            return _ece_from_bins(bin_conf, bin_acc, bin_count, sample_size)

        total = sum((_class_term(cls) for cls in range(n_classes)), 0.0)
        return float(total / n_classes)

    point = _cw_ece(np.arange(n))

    if ci is None:
        return MetricResult(name="cwECE", value=point, ci=None, n=n)

    from reliably.stats.bootstrap import bootstrap_ci

    interval = bootstrap_ci(
        _cw_ece,
        n,
        point=point,
        n_boot=n_bootstrap,
        level=level,
        method=ci,
        seed=seed,
    )
    return MetricResult(name="cwECE", value=point, ci=interval, n=n)

Scoring

reliably.metrics.scoring.brier(y_true, y_prob, *, decompose=False, n_bins=15, ci='bca', n_bootstrap=2000, level=0.95, seed=0)

Brier score with optional Murphy decomposition.

Parameters:

Name Type Description Default
y_true array-like

Integer labels, shape (N,).

required
y_prob array-like

Probability matrix (N, K) or binary scores (N,).

required
decompose bool

If True, include the Murphy decomposition (binary only) in MetricResult.extra.

False
n_bins int

Bins for the Murphy decomposition.

15
ci str | None

CI method.

'bca'
n_bootstrap int

Bootstrap resamples.

2000
level float

Nominal coverage.

0.95
seed int

RNG seed.

0

Returns:

Type Description
MetricResult

Named "Brier". If decompose=True, extra contains {"reliability", "resolution", "uncertainty"}.

Examples:

>>> import numpy as np
>>> rng = np.random.default_rng(0)
>>> y = rng.integers(0, 2, 300)
>>> p = rng.uniform(0, 1, 300)
>>> r = brier(y, p, ci=None)
>>> 0.0 <= r.value <= 1.0
True
Source code in src/reliably/metrics/scoring.py
def brier(
    y_true: Any,
    y_prob: Any,
    *,
    decompose: bool = False,
    n_bins: int = 15,
    ci: str | None = "bca",
    n_bootstrap: int = 2000,
    level: float = 0.95,
    seed: int = 0,
) -> MetricResult:
    """Brier score, optionally with its Murphy decomposition.

    For one-dimensional scores the per-sample loss is ``(p - y) ** 2``; for
    a probability matrix it is the squared difference to the one-hot label,
    summed over classes. The score is the mean per-sample loss.

    Parameters
    ----------
    y_true : array-like
        Integer labels, shape ``(N,)``.
    y_prob : array-like
        Probability matrix ``(N, K)`` or binary scores ``(N,)``.
    decompose : bool
        When ``True`` and ``y_prob`` is one-dimensional, attach the Murphy
        decomposition as ``MetricResult.extra``. It is ignored for matrix
        input.
    n_bins : int
        Number of bins used by the Murphy decomposition.
    ci : str | None
        Confidence-interval method, or ``None`` for no interval.
    n_bootstrap : int
        Number of bootstrap resamples.
    level : float
        Nominal coverage of the interval.
    seed : int
        Seed for the bootstrap RNG.

    Returns
    -------
    MetricResult
        Result named ``"Brier"``. If ``decompose=True``, ``extra`` contains
        ``{"reliability", "resolution", "uncertainty"}``.

    Examples
    --------
    >>> import numpy as np
    >>> rng = np.random.default_rng(0)
    >>> y = rng.integers(0, 2, 300)
    >>> p = rng.uniform(0, 1, 300)
    >>> r = brier(y, p, ci=None)
    >>> 0.0 <= r.value <= 1.0
    True
    """
    labels = to_numpy(y_true, dtype=np.float64).astype(np.int64)
    probs = to_numpy(y_prob, dtype=np.float64)
    n = len(labels)
    is_binary = probs.ndim == 1

    if not is_binary:
        # Multiclass: squared distance to the one-hot target, per row.
        one_hot_targets = _one_hot(labels, probs.shape[1])
        losses = ((probs - one_hot_targets) ** 2).sum(axis=1)
    else:
        losses = (probs - labels.astype(np.float64)) ** 2

    point = float(losses.mean())

    extra: dict[str, float] | None = (
        _murphy_decomposition(labels, probs, n_bins=n_bins)
        if decompose and is_binary
        else None
    )

    if ci is None:
        return MetricResult(name="Brier", value=point, ci=None, n=n, extra=extra)

    # The score is a mean of per-sample losses, so the vectorized bootstrap
    # can resample the loss vector directly.
    interval = vectorized_bootstrap_ci(
        losses,
        point=point,
        n_boot=n_bootstrap,
        level=level,
        method=ci,
        seed=seed,
    )
    return MetricResult(name="Brier", value=point, ci=interval, n=n, extra=extra)

reliably.metrics.scoring.nll(y_true, y_prob, *, ci='bca', n_bootstrap=2000, level=0.95, seed=0)

Negative log-likelihood (log loss).

Parameters:

Name Type Description Default
y_true array-like

Integer labels, shape (N,).

required
y_prob array-like

Probability matrix (N, K) or binary scores (N,).

required
ci str | None

CI method.

'bca'
n_bootstrap int

Bootstrap resamples.

2000
level float

Nominal coverage.

0.95
seed int

RNG seed.

0

Returns:

Type Description
MetricResult

Named "NLL".

Examples:

>>> import numpy as np
>>> rng = np.random.default_rng(0)
>>> y = rng.integers(0, 2, 300)
>>> p = rng.uniform(0.1, 0.9, 300)
>>> r = nll(y, p, ci=None)
>>> r.value > 0.0
True
Source code in src/reliably/metrics/scoring.py
def nll(
    y_true: Any,
    y_prob: Any,
    *,
    ci: str | None = "bca",
    n_bootstrap: int = 2000,
    level: float = 0.95,
    seed: int = 0,
) -> MetricResult:
    """Mean negative log-likelihood (log loss) of the true labels.

    Probabilities are passed through ``clip_probs`` before the logarithm is
    taken.

    Parameters
    ----------
    y_true : array-like
        Integer labels, shape ``(N,)``.
    y_prob : array-like
        Probability matrix ``(N, K)`` or binary scores ``(N,)``.
    ci : str | None
        Confidence-interval method, or ``None`` for no interval.
    n_bootstrap : int
        Number of bootstrap resamples.
    level : float
        Nominal coverage of the interval.
    seed : int
        Seed for the bootstrap RNG.

    Returns
    -------
    MetricResult
        Result named ``"NLL"``.

    Examples
    --------
    >>> import numpy as np
    >>> rng = np.random.default_rng(0)
    >>> y = rng.integers(0, 2, 300)
    >>> p = rng.uniform(0.1, 0.9, 300)
    >>> r = nll(y, p, ci=None)
    >>> r.value > 0.0
    True
    """
    labels = to_numpy(y_true, dtype=np.float64).astype(np.int64)
    probs = to_numpy(y_prob, dtype=np.float64)
    n = len(labels)

    clipped = clip_probs(probs)
    if probs.ndim == 1:
        # Binary scores: label 1 uses p, every other label uses 1 - p.
        true_class_prob = np.where(labels == 1, clipped, 1.0 - clipped)
    else:
        # Matrix input: pick each row's probability for its true label.
        true_class_prob = clipped[np.arange(n), labels]

    losses = -np.log(true_class_prob)
    point = float(losses.mean())

    if ci is None:
        return MetricResult(name="NLL", value=point, ci=None, n=n)

    interval = vectorized_bootstrap_ci(
        losses,
        point=point,
        n_boot=n_bootstrap,
        level=level,
        method=ci,
        seed=seed,
    )
    return MetricResult(name="NLL", value=point, ci=interval, n=n)

Discrimination

reliably.metrics.discrimination.auroc(y_true, y_score, *, ci='bca', level=0.95, n_bootstrap=2000, seed=0)

Area under the ROC curve (AUROC) with DeLong or bootstrap CI.

For binary tasks the DeLong analytic CI is used by default (ci="bca" routes through the analytic DeLong variance). For the bootstrap path, use ci="percentile".

Parameters:

Name Type Description Default
y_true array-like

Binary labels {0, 1}, shape (N,).

required
y_score array-like

Predicted scores (higher = more likely positive), shape (N,).

required
ci str | None

"bca" uses DeLong analytic; "percentile" uses bootstrap; None skips CI.

'bca'
level float

Nominal CI coverage.

0.95
n_bootstrap int

Bootstrap resamples (only used when ci="percentile").

2000
seed int

RNG seed for bootstrap.

0

Returns:

Type Description
MetricResult

Named "AUROC".

Examples:

>>> import numpy as np
>>> rng = np.random.default_rng(0)
>>> y = rng.integers(0, 2, 200)
>>> s = rng.uniform(0, 1, 200)
>>> r = auroc(y, s, ci=None)
>>> 0.0 <= r.value <= 1.0
True
Source code in src/reliably/metrics/discrimination.py
def auroc(
    y_true: Any,
    y_score: Any,
    *,
    ci: str | None = "bca",
    level: float = 0.95,
    n_bootstrap: int = 2000,
    seed: int = 0,
) -> MetricResult:
    """Area under the ROC curve (AUROC) with a DeLong or bootstrap CI.

    The point estimate always comes from ``auroc_delong``. With
    ``ci="bca"`` (the default) or ``ci="analytic"`` the interval is the
    analytic DeLong one; any other non-``None`` value is handed to the
    bootstrap as its ``method`` (use ``ci="percentile"``).

    Parameters
    ----------
    y_true : array-like
        Binary labels ``{0, 1}``, shape ``(N,)``.
    y_score : array-like
        Predicted scores (higher = more likely positive), shape ``(N,)``.
        For two-dimensional input, column 1 is used as the score.
    ci : str | None
        ``"bca"`` uses DeLong analytic; ``"percentile"`` uses bootstrap;
        ``None`` skips CI.
    level : float
        Nominal coverage of the interval.
    n_bootstrap : int
        Number of bootstrap resamples (bootstrap path only).
    seed : int
        Seed for the bootstrap RNG.

    Returns
    -------
    MetricResult
        Result named ``"AUROC"``.

    Examples
    --------
    >>> import numpy as np
    >>> rng = np.random.default_rng(0)
    >>> y = rng.integers(0, 2, 200)
    >>> s = rng.uniform(0, 1, 200)
    >>> r = auroc(y, s, ci=None)
    >>> 0.0 <= r.value <= 1.0
    True
    """
    labels = to_numpy(y_true, dtype=np.float64).astype(np.int64)
    scores = to_numpy(y_score, dtype=np.float64)
    n = len(labels)

    if scores.ndim == 2:
        # Keep only the positive-class column (index 1).
        scores = scores[:, 1]

    # Every path needs the DeLong point estimate; only the analytic path
    # also uses the bounds.
    auc_val, lo, hi, _ = auroc_delong(scores, labels, level=level)
    value = float(auc_val)

    if ci is None:
        return MetricResult(name="AUROC", value=value, ci=None, n=n)

    if ci == "bca" or ci == "analytic":
        analytic_ci = CI(float(lo), float(hi), level, "analytic")
        return MetricResult(name="AUROC", value=value, ci=analytic_ci, n=n)

    # Bootstrap path
    from reliably.stats.bootstrap import bootstrap_ci
    from reliably.stats.delong import delong_var_components

    def _est(idx: NDArray[np.intp]) -> float:
        sample_scores = scores[idx]
        sample_labels = labels[idx]
        n_pos = sample_labels.sum()
        # A resample containing a single class is scored as chance (0.5).
        if n_pos == 0 or n_pos == len(sample_labels):
            return 0.5
        resampled_auc, _, _, _ = delong_var_components(sample_scores, sample_labels)
        return float(resampled_auc)

    interval = bootstrap_ci(
        _est,
        n,
        point=value,
        n_boot=n_bootstrap,
        level=level,
        method=ci,
        seed=seed,
    )
    return MetricResult(name="AUROC", value=value, ci=interval, n=n)