Stats¶

`reliably.stats.bootstrap.bootstrap_ci(estimator, n, *, point, n_boot=2000, level=0.95, method='bca', seed=0)` ¶

Compute a bootstrap confidence interval for any scalar estimator.

The estimator is called once per bootstrap replicate with a resample index array; use vectorized implementations where possible.

Parameters:

Name	Type	Description	Default
`estimator`	`Callable[[NDArray], float]`	Function `f(idx) -> float` applied to each resample.	required
`n`	`int`	Dataset size.	required
`point`	`float`	Point estimate (from the full dataset, not resampled).	required
`n_boot`	`int`	Number of resamples (default 2000).	`2000`
`level`	`float`	Nominal coverage (default 0.95).	`0.95`
`method`	`str`	`"bca"` (default) or `"percentile"`.	`'bca'`
`seed`	`int \| Generator`	RNG seed.	`0`

Returns:

Type	Description
`CI`	Confidence interval object.

Examples:

>>> import numpy as np
>>> data = np.random.default_rng(0).normal(0, 1, 200)
>>> ci = bootstrap_ci(lambda idx: data[idx].mean(), len(data),
...                   point=data.mean(), seed=0)
>>> ci.low < ci.high
True

Source code in src/reliably/stats/bootstrap.py

def bootstrap_ci(
    estimator: Callable[[NDArray[Any]], float],
    n: int,
    *,
    point: float,
    n_boot: int = 2000,
    level: float = 0.95,
    method: str = "bca",
    seed: int | np.random.Generator = 0,
) -> CI:
    """Compute a bootstrap confidence interval for any scalar estimator.

    The estimator is called once per bootstrap replicate with a resample
    index array; use vectorized implementations where possible.

    Parameters
    ----------
    estimator : Callable[[NDArray], float]
        Function ``f(idx) -> float`` applied to each resample.
    n : int
        Dataset size.
    point : float
        Point estimate (from the full dataset, not resampled).
    n_boot : int
        Number of resamples (default 2000).
    level : float
        Nominal coverage (default 0.95).
    method : str
        ``"bca"`` (default) or ``"percentile"``.
    seed : int | np.random.Generator
        RNG seed.

    Returns
    -------
    CI
        Confidence interval object.

    Examples
    --------
    >>> import numpy as np
    >>> data = np.random.default_rng(0).normal(0, 1, 200)
    >>> ci = bootstrap_ci(lambda idx: data[idx].mean(), len(data),
    ...                   point=data.mean(), seed=0)
    >>> ci.low < ci.high
    True
    """
    rng = make_rng(seed)
    idx_matrix = bootstrap_replicate_indices(n, n_boot, seed=rng)

    boot = np.empty(n_boot, dtype=np.float64)
    for b in range(n_boot):
        boot[b] = estimator(idx_matrix[b])

    alpha = 1.0 - level
    if method == "percentile":
        lo, hi = np.quantile(boot, [alpha / 2, 1.0 - alpha / 2])
        return CI(float(lo), float(hi), level, "percentile")

    # BCa: jackknife for acceleration
    full = np.arange(n)
    jack = np.empty(n, dtype=np.float64)
    for i in range(n):
        mask = np.concatenate([full[:i], full[i + 1 :]])
        jack[i] = estimator(mask)

    return _bca_ci(boot, point, jack, level)

`reliably.stats.delong.delong_test(scores_a, scores_b, labels)` ¶

Compare two correlated AUROCs on the same test set (DeLong 1988).

Parameters:

Name	Type	Description	Default
`scores_a`	`NDArray[float64]`	Scores from model A.	required
`scores_b`	`NDArray[float64]`	Scores from model B.	required
`labels`	`NDArray[int64]`	Shared binary labels.	required

Returns:

Name	Type	Description
`delta`	`float`	`AUC_a - AUC_b`.
`p_value`	`float`	Two-sided p-value.
`se`	`float`	Standard error of the difference.

Examples:

>>> import numpy as np
>>> rng = np.random.default_rng(1)
>>> y = rng.integers(0, 2, 200)
>>> sa = rng.uniform(0, 1, 200)
>>> sb = rng.uniform(0, 1, 200)
>>> delta, p, se = delong_test(sa, sb, y)
>>> 0.0 <= p <= 1.0
True

Source code in src/reliably/stats/delong.py

def delong_test(
    scores_a: NDArray[np.float64],
    scores_b: NDArray[np.float64],
    labels: NDArray[np.int64],
) -> tuple[float, float, float]:
    """Compare two correlated AUROCs on the same test set (DeLong 1988).

    Parameters
    ----------
    scores_a : NDArray[np.float64]
        Scores from model A.
    scores_b : NDArray[np.float64]
        Scores from model B.
    labels : NDArray[np.int64]
        Shared binary labels.

    Returns
    -------
    delta : float
        ``AUC_a - AUC_b``.
    p_value : float
        Two-sided p-value.
    se : float
        Standard error of the difference.

    Examples
    --------
    >>> import numpy as np
    >>> rng = np.random.default_rng(1)
    >>> y = rng.integers(0, 2, 200)
    >>> sa = rng.uniform(0, 1, 200)
    >>> sb = rng.uniform(0, 1, 200)
    >>> delta, p, se = delong_test(sa, sb, y)
    >>> 0.0 <= p <= 1.0
    True
    """
    auc_a, _var_a, v10a, v01a = delong_var_components(scores_a, labels)
    auc_b, _var_b, v10b, v01b = delong_var_components(scores_b, labels)

    # Correlated covariance (same test set)
    cov10 = float(np.cov(v10a, v10b)[0, 1]) / len(v10a) if len(v10a) > 1 else 0.0
    cov01 = float(np.cov(v01a, v01b)[0, 1]) / len(v01a) if len(v01a) > 1 else 0.0

    var_a = float(np.var(v10a, ddof=1) / len(v10a) + np.var(v01a, ddof=1) / len(v01a))
    var_b = float(np.var(v10b, ddof=1) / len(v10b) + np.var(v01b, ddof=1) / len(v01b))

    var_diff = var_a + var_b - 2.0 * (cov10 + cov01)
    se = float(np.sqrt(max(var_diff, 0.0)))
    delta = auc_a - auc_b
    z = delta / se if se > 0 else 0.0
    p_value = float(2.0 * (1.0 - norm.cdf(abs(z))))
    return float(delta), p_value, se

`reliably.stats.tests.paired_bootstrap_test(estimator_a, estimator_b, n, *, point_a, point_b, n_boot=2000, level=0.95, seed=0)` ¶

Paired bootstrap test for any pair of scalar estimators.

Both estimators are applied to the same resample indices so the comparison is paired. This works for any metric, unlike DeLong.

Parameters:

Name	Type	Description	Default
`estimator_a`	`Callable[[NDArray], float]`	`f(idx) -> float` for model A.	required
`estimator_b`	`Callable[[NDArray], float]`	`f(idx) -> float` for model B.	required
`n`	`int`	Dataset size.	required
`point_a`	`float`	Full-data point estimate for model A.	required
`point_b`	`float`	Full-data point estimate for model B.	required
`n_boot`	`int`	Number of resamples.	`2000`
`level`	`float`	Nominal CI coverage.	`0.95`
`seed`	`int \| Generator`	RNG seed.	`0`

Returns:

Name	Type	Description
`delta`	`float`	`point_a - point_b`.
`ci`	`CI`	Bootstrap CI on the difference (percentile by default).
`p_value`	`float`	Two-sided p-value via bootstrap hypothesis convention.

Examples:

>>> import numpy as np
>>> rng = np.random.default_rng(0)
>>> x = rng.normal(0, 1, 200)
>>> delta, ci, p = paired_bootstrap_test(
...     lambda idx: x[idx].mean(), lambda idx: (-x)[idx].mean(),
...     len(x), point_a=x.mean(), point_b=(-x).mean(), seed=0
... )
>>> p < 0.01
True

Source code in src/reliably/stats/tests.py

def paired_bootstrap_test(
    estimator_a: Callable[[NDArray[Any]], float],
    estimator_b: Callable[[NDArray[Any]], float],
    n: int,
    *,
    point_a: float,
    point_b: float,
    n_boot: int = 2000,
    level: float = 0.95,
    seed: int | np.random.Generator = 0,
) -> tuple[float, CI, float]:
    """Paired bootstrap test for any pair of scalar estimators.

    Both estimators are applied to the *same* resample indices so the
    comparison is paired.  This works for any metric, unlike DeLong.

    Parameters
    ----------
    estimator_a : Callable[[NDArray], float]
        ``f(idx) -> float`` for model A.
    estimator_b : Callable[[NDArray], float]
        ``f(idx) -> float`` for model B.
    n : int
        Dataset size.
    point_a : float
        Full-data point estimate for model A.
    point_b : float
        Full-data point estimate for model B.
    n_boot : int
        Number of resamples.
    level : float
        Nominal CI coverage.
    seed : int | np.random.Generator
        RNG seed.

    Returns
    -------
    delta : float
        ``point_a - point_b``.
    ci : CI
        Bootstrap CI on the difference (percentile by default).
    p_value : float
        Two-sided p-value via bootstrap hypothesis convention.

    Examples
    --------
    >>> import numpy as np
    >>> rng = np.random.default_rng(0)
    >>> x = rng.normal(0, 1, 200)
    >>> delta, ci, p = paired_bootstrap_test(
    ...     lambda idx: x[idx].mean(), lambda idx: (-x)[idx].mean(),
    ...     len(x), point_a=x.mean(), point_b=(-x).mean(), seed=0
    ... )
    >>> p < 0.01
    True
    """
    rng = make_rng(seed)
    idx_matrix = bootstrap_replicate_indices(n, n_boot, seed=rng)

    delta_boot = np.empty(n_boot, dtype=np.float64)
    for b in range(n_boot):
        idx = idx_matrix[b]
        delta_boot[b] = estimator_a(idx) - estimator_b(idx)

    delta = point_a - point_b
    alpha = 1.0 - level
    lo, hi = np.quantile(delta_boot, [alpha / 2, 1.0 - alpha / 2])
    ci = CI(float(lo), float(hi), level, "percentile")

    # Bootstrap hypothesis p-value
    n_le = int(np.sum(delta_boot <= 0))
    n_ge = int(np.sum(delta_boot >= 0))
    p_value = float(2.0 * min((n_le + 1) / (n_boot + 1), (n_ge + 1) / (n_boot + 1)))

    return delta, ci, p_value

`reliably.stats.tests.apply_correction(p_values, correction, *, level=0.05)` ¶

Apply a multiple-comparison correction by name.

Parameters:

Name	Type	Description	Default
`p_values`	`list[float]`	Raw p-values.	required
`correction`	`str \| None`	`"holm"`, `"bh"` (Benjamini–Hochberg), or `None`.	required
`level`	`float`	Error rate.	`0.05`

Returns:

Type	Description
`list[bool]`	Significance flags.

Examples:

>>> apply_correction([0.01, 0.2], "holm")
[True, False]

Source code in src/reliably/stats/tests.py

def apply_correction(
    p_values: list[float],
    correction: str | None,
    *,
    level: float = 0.05,
) -> list[bool]:
    """Apply a multiple-comparison correction by name.

    Parameters
    ----------
    p_values : list[float]
        Raw p-values.
    correction : str | None
        ``"holm"``, ``"bh"`` (Benjamini–Hochberg), or ``None``.
    level : float
        Error rate.

    Returns
    -------
    list[bool]
        Significance flags.

    Examples
    --------
    >>> apply_correction([0.01, 0.2], "holm")
    [True, False]
    """
    if correction is None:
        return [p < level for p in p_values]
    if correction == "holm":
        return holm_bonferroni(p_values, level)
    if correction in ("bh", "fdr"):
        return benjamini_hochberg(p_values, level)
    raise ValueError(f"Unknown correction {correction!r}. Use 'holm', 'bh', or None.")

Stats¶

reliably.stats.bootstrap.bootstrap_ci(estimator, n, *, point, n_boot=2000, level=0.95, method='bca', seed=0) ¶

reliably.stats.delong.delong_test(scores_a, scores_b, labels) ¶

reliably.stats.tests.paired_bootstrap_test(estimator_a, estimator_b, n, *, point_a, point_b, n_boot=2000, level=0.95, seed=0) ¶

reliably.stats.tests.apply_correction(p_values, correction, *, level=0.05) ¶

`reliably.stats.bootstrap.bootstrap_ci(estimator, n, *, point, n_boot=2000, level=0.95, method='bca', seed=0)` ¶

`reliably.stats.delong.delong_test(scores_a, scores_b, labels)` ¶

`reliably.stats.tests.paired_bootstrap_test(estimator_a, estimator_b, n, *, point_a, point_b, n_boot=2000, level=0.95, seed=0)` ¶

`reliably.stats.tests.apply_correction(p_values, correction, *, level=0.05)` ¶