Skip to content

Stats

reliably.stats.bootstrap.bootstrap_ci(estimator, n, *, point, n_boot=2000, level=0.95, method='bca', seed=0)

Compute a bootstrap confidence interval for any scalar estimator.

The estimator is called once per bootstrap replicate with a resample index array; use vectorized implementations where possible.

Parameters:

Name Type Description Default
estimator Callable[[NDArray], float]

Function f(idx) -> float applied to each resample.

required
n int

Dataset size.

required
point float

Point estimate (from the full dataset, not resampled).

required
n_boot int

Number of resamples (default 2000).

2000
level float

Nominal coverage (default 0.95).

0.95
method str

"bca" (default) or "percentile".

'bca'
seed int | Generator

RNG seed.

0

Returns:

Type Description
CI

Confidence interval object.

Examples:

>>> import numpy as np
>>> data = np.random.default_rng(0).normal(0, 1, 200)
>>> ci = bootstrap_ci(lambda idx: data[idx].mean(), len(data),
...                   point=data.mean(), seed=0)
>>> ci.low < ci.high
True
Source code in src/reliably/stats/bootstrap.py
def bootstrap_ci(
    estimator: Callable[[NDArray[Any]], float],
    n: int,
    *,
    point: float,
    n_boot: int = 2000,
    level: float = 0.95,
    method: str = "bca",
    seed: int | np.random.Generator = 0,
) -> CI:
    """Compute a bootstrap confidence interval for any scalar estimator.

    The estimator is called once per bootstrap replicate with a resample
    index array; use vectorized implementations where possible.

    Parameters
    ----------
    estimator : Callable[[NDArray], float]
        Function ``f(idx) -> float`` applied to each resample.
    n : int
        Dataset size.
    point : float
        Point estimate (from the full dataset, not resampled).
    n_boot : int
        Number of resamples (default 2000).
    level : float
        Nominal coverage (default 0.95).
    method : str
        ``"bca"`` (default) or ``"percentile"``.
    seed : int | np.random.Generator
        RNG seed.

    Returns
    -------
    CI
        Confidence interval object.

    Examples
    --------
    >>> import numpy as np
    >>> data = np.random.default_rng(0).normal(0, 1, 200)
    >>> ci = bootstrap_ci(lambda idx: data[idx].mean(), len(data),
    ...                   point=data.mean(), seed=0)
    >>> ci.low < ci.high
    True
    """
    rng = make_rng(seed)
    idx_matrix = bootstrap_replicate_indices(n, n_boot, seed=rng)

    boot = np.empty(n_boot, dtype=np.float64)
    for b in range(n_boot):
        boot[b] = estimator(idx_matrix[b])

    alpha = 1.0 - level
    if method == "percentile":
        lo, hi = np.quantile(boot, [alpha / 2, 1.0 - alpha / 2])
        return CI(float(lo), float(hi), level, "percentile")

    # BCa: jackknife for acceleration
    full = np.arange(n)
    jack = np.empty(n, dtype=np.float64)
    for i in range(n):
        mask = np.concatenate([full[:i], full[i + 1 :]])
        jack[i] = estimator(mask)

    return _bca_ci(boot, point, jack, level)

reliably.stats.delong.delong_test(scores_a, scores_b, labels)

Compare two correlated AUROCs on the same test set (DeLong 1988).

Parameters:

Name Type Description Default
scores_a NDArray[float64]

Scores from model A.

required
scores_b NDArray[float64]

Scores from model B.

required
labels NDArray[int64]

Shared binary labels.

required

Returns:

Name Type Description
delta float

AUC_a - AUC_b.

p_value float

Two-sided p-value.

se float

Standard error of the difference.

Examples:

>>> import numpy as np
>>> rng = np.random.default_rng(1)
>>> y = rng.integers(0, 2, 200)
>>> sa = rng.uniform(0, 1, 200)
>>> sb = rng.uniform(0, 1, 200)
>>> delta, p, se = delong_test(sa, sb, y)
>>> 0.0 <= p <= 1.0
True
Source code in src/reliably/stats/delong.py
def delong_test(
    scores_a: NDArray[np.float64],
    scores_b: NDArray[np.float64],
    labels: NDArray[np.int64],
) -> tuple[float, float, float]:
    """Compare two correlated AUROCs on the same test set (DeLong 1988).

    Parameters
    ----------
    scores_a : NDArray[np.float64]
        Scores from model A.
    scores_b : NDArray[np.float64]
        Scores from model B.
    labels : NDArray[np.int64]
        Shared binary labels.

    Returns
    -------
    delta : float
        ``AUC_a - AUC_b``.
    p_value : float
        Two-sided p-value.
    se : float
        Standard error of the difference.

    Examples
    --------
    >>> import numpy as np
    >>> rng = np.random.default_rng(1)
    >>> y = rng.integers(0, 2, 200)
    >>> sa = rng.uniform(0, 1, 200)
    >>> sb = rng.uniform(0, 1, 200)
    >>> delta, p, se = delong_test(sa, sb, y)
    >>> 0.0 <= p <= 1.0
    True
    """
    auc_a, _var_a, v10a, v01a = delong_var_components(scores_a, labels)
    auc_b, _var_b, v10b, v01b = delong_var_components(scores_b, labels)

    # Correlated covariance (same test set)
    cov10 = float(np.cov(v10a, v10b)[0, 1]) / len(v10a) if len(v10a) > 1 else 0.0
    cov01 = float(np.cov(v01a, v01b)[0, 1]) / len(v01a) if len(v01a) > 1 else 0.0

    var_a = float(np.var(v10a, ddof=1) / len(v10a) + np.var(v01a, ddof=1) / len(v01a))
    var_b = float(np.var(v10b, ddof=1) / len(v10b) + np.var(v01b, ddof=1) / len(v01b))

    var_diff = var_a + var_b - 2.0 * (cov10 + cov01)
    se = float(np.sqrt(max(var_diff, 0.0)))
    delta = auc_a - auc_b
    z = delta / se if se > 0 else 0.0
    p_value = float(2.0 * (1.0 - norm.cdf(abs(z))))
    return float(delta), p_value, se

reliably.stats.tests.paired_bootstrap_test(estimator_a, estimator_b, n, *, point_a, point_b, n_boot=2000, level=0.95, seed=0)

Paired bootstrap test for any pair of scalar estimators.

Both estimators are applied to the same resample indices so the comparison is paired. This works for any metric, unlike DeLong.

Parameters:

Name Type Description Default
estimator_a Callable[[NDArray], float]

f(idx) -> float for model A.

required
estimator_b Callable[[NDArray], float]

f(idx) -> float for model B.

required
n int

Dataset size.

required
point_a float

Full-data point estimate for model A.

required
point_b float

Full-data point estimate for model B.

required
n_boot int

Number of resamples.

2000
level float

Nominal CI coverage.

0.95
seed int | Generator

RNG seed.

0

Returns:

Name Type Description
delta float

point_a - point_b.

ci CI

Bootstrap CI on the difference (percentile by default).

p_value float

Two-sided p-value via bootstrap hypothesis convention.

Examples:

>>> import numpy as np
>>> rng = np.random.default_rng(0)
>>> x = rng.normal(0, 1, 200)
>>> delta, ci, p = paired_bootstrap_test(
...     lambda idx: x[idx].mean(), lambda idx: (-x)[idx].mean(),
...     len(x), point_a=x.mean(), point_b=(-x).mean(), seed=0
... )
>>> p < 0.01
True
Source code in src/reliably/stats/tests.py
def paired_bootstrap_test(
    estimator_a: Callable[[NDArray[Any]], float],
    estimator_b: Callable[[NDArray[Any]], float],
    n: int,
    *,
    point_a: float,
    point_b: float,
    n_boot: int = 2000,
    level: float = 0.95,
    seed: int | np.random.Generator = 0,
) -> tuple[float, CI, float]:
    """Paired bootstrap test for any pair of scalar estimators.

    Both estimators are applied to the *same* resample indices so the
    comparison is paired.  This works for any metric, unlike DeLong.

    Parameters
    ----------
    estimator_a : Callable[[NDArray], float]
        ``f(idx) -> float`` for model A.
    estimator_b : Callable[[NDArray], float]
        ``f(idx) -> float`` for model B.
    n : int
        Dataset size.
    point_a : float
        Full-data point estimate for model A.
    point_b : float
        Full-data point estimate for model B.
    n_boot : int
        Number of resamples.
    level : float
        Nominal CI coverage.
    seed : int | np.random.Generator
        RNG seed.

    Returns
    -------
    delta : float
        ``point_a - point_b``.
    ci : CI
        Bootstrap CI on the difference (percentile by default).
    p_value : float
        Two-sided p-value via bootstrap hypothesis convention.

    Examples
    --------
    >>> import numpy as np
    >>> rng = np.random.default_rng(0)
    >>> x = rng.normal(0, 1, 200)
    >>> delta, ci, p = paired_bootstrap_test(
    ...     lambda idx: x[idx].mean(), lambda idx: (-x)[idx].mean(),
    ...     len(x), point_a=x.mean(), point_b=(-x).mean(), seed=0
    ... )
    >>> p < 0.01
    True
    """
    rng = make_rng(seed)
    idx_matrix = bootstrap_replicate_indices(n, n_boot, seed=rng)

    delta_boot = np.empty(n_boot, dtype=np.float64)
    for b in range(n_boot):
        idx = idx_matrix[b]
        delta_boot[b] = estimator_a(idx) - estimator_b(idx)

    delta = point_a - point_b
    alpha = 1.0 - level
    lo, hi = np.quantile(delta_boot, [alpha / 2, 1.0 - alpha / 2])
    ci = CI(float(lo), float(hi), level, "percentile")

    # Bootstrap hypothesis p-value
    n_le = int(np.sum(delta_boot <= 0))
    n_ge = int(np.sum(delta_boot >= 0))
    p_value = float(2.0 * min((n_le + 1) / (n_boot + 1), (n_ge + 1) / (n_boot + 1)))

    return delta, ci, p_value

reliably.stats.tests.apply_correction(p_values, correction, *, level=0.05)

Apply a multiple-comparison correction by name.

Parameters:

Name Type Description Default
p_values list[float]

Raw p-values.

required
correction str | None

"holm", "bh" (Benjamini–Hochberg), or None.

required
level float

Error rate.

0.05

Returns:

Type Description
list[bool]

Significance flags.

Examples:

>>> apply_correction([0.01, 0.2], "holm")
[True, False]
Source code in src/reliably/stats/tests.py
def apply_correction(
    p_values: list[float],
    correction: str | None,
    *,
    level: float = 0.05,
) -> list[bool]:
    """Apply a multiple-comparison correction by name.

    Parameters
    ----------
    p_values : list[float]
        Raw p-values.
    correction : str | None
        ``"holm"``, ``"bh"`` (Benjamini–Hochberg), or ``None``.
    level : float
        Error rate.

    Returns
    -------
    list[bool]
        Significance flags.

    Examples
    --------
    >>> apply_correction([0.01, 0.2], "holm")
    [True, False]
    """
    if correction is None:
        return [p < level for p in p_values]
    if correction == "holm":
        return holm_bonferroni(p_values, level)
    if correction in ("bh", "fdr"):
        return benjamini_hochberg(p_values, level)
    raise ValueError(f"Unknown correction {correction!r}. Use 'holm', 'bh', or None.")