| |
| |
|
|
| """Non-parametric bootstrap for discrete-choice model inference.""" |
|
|
| from __future__ import annotations |
|
|
| from dataclasses import dataclass |
| from typing import Any, Callable |
|
|
| import numpy as np |
| import pandas as pd |
|
|
| from .config import ModelSpec |
| from .pipeline import estimate_dataframe |
|
|
|
|
@dataclass
class BootstrapResult:
    """Outputs of a bootstrap run, plus a tabular per-parameter summary."""

    # Number of bootstrap replications that were requested.
    n_replications: int
    # Number of replications whose estimates were kept.
    n_successful: int
    # Parameter names, in the order used for the columns of the matrix.
    param_names: list[str]
    # One row per kept replication, one column per parameter.
    estimates_matrix: np.ndarray
    # Bootstrap standard error, keyed by parameter name.
    bootstrap_se: dict[str, float]
    # (lower, upper) percentile interval, keyed by parameter name.
    percentile_ci: dict[str, tuple[float, float]]
    # Point estimates from the fit on the un-resampled data, keyed by name.
    original_estimates: dict[str, float]

    def summary_dataframe(self) -> pd.DataFrame:
        """Return one row per parameter with its estimate, SE and CI bounds.

        Any value that is not available for a parameter is reported as NaN.
        """
        missing = float("nan")
        no_interval = (missing, missing)

        records = []
        for param in self.param_names:
            interval = self.percentile_ci.get(param, no_interval)
            record = {
                "parameter": param,
                "original": self.original_estimates.get(param, missing),
                "bootstrap_se": self.bootstrap_se.get(param, missing),
                "ci_lower": interval[0],
                "ci_upper": interval[1],
            }
            records.append(record)
        return pd.DataFrame(records)
|
|
|
|
def _resample_individuals(df: pd.DataFrame, id_col: str, rng: np.random.Generator) -> pd.DataFrame:
    """Resample individuals with replacement, keeping all rows per individual.

    As many individuals are drawn as there are distinct non-missing ids in
    ``df[id_col]``. Each draw contributes every row of the chosen individual,
    in its original row order, and is relabelled with a fresh integer id
    (0, 1, 2, ...) so that an individual drawn twice appears as two distinct
    individuals in the output.

    Parameters
    ----------
    df : pd.DataFrame
        Input rows; not modified.
    id_col : str
        Column identifying the individual each row belongs to.
    rng : np.random.Generator
        Source of randomness for the draws.

    Returns
    -------
    pd.DataFrame
        New frame with a fresh 0..n-1 index and ``id_col`` replaced by the
        new integer ids. Rows whose id is missing are never included. If
        there are no individuals at all, an empty frame with the same
        columns is returned.
    """
    # Integer code per row (-1 for a missing id), in order of first appearance.
    codes, individuals = pd.factorize(df[id_col])
    n_individuals = len(individuals)
    if n_individuals == 0:
        # Nothing to draw from; an empty frame is the natural result.
        return df.iloc[0:0].reset_index(drop=True)

    # Bucket row positions by individual in a single pass instead of scanning
    # the whole frame once per draw. The stable sort keeps each individual's
    # rows in their original relative order.
    order = np.argsort(codes, kind="stable")
    bounds = np.searchsorted(codes[order], np.arange(n_individuals + 1))
    rows_of = [order[bounds[k]:bounds[k + 1]] for k in range(n_individuals)]

    draws = rng.choice(n_individuals, size=n_individuals, replace=True)
    take = np.concatenate([rows_of[k] for k in draws])
    block_sizes = [len(rows_of[k]) for k in draws]

    resampled = df.iloc[take].reset_index(drop=True)
    # Relabel so repeated draws of one individual are treated as different people.
    resampled[id_col] = np.repeat(np.arange(n_individuals, dtype=np.int64), block_sizes)
    return resampled
|
|
|
|
def _align_estimates(estimates: pd.DataFrame, param_names: list[str]) -> np.ndarray | None:
    """Return the "estimate" column ordered like ``param_names``, or None if the parameters differ."""
    names = estimates["parameter"].tolist()
    values = estimates["estimate"].to_numpy()
    if names == param_names:
        return values

    position = {name: i for i, name in enumerate(names)}
    same_parameters = (
        len(names) == len(param_names)
        and len(position) == len(names)  # no duplicate names to disambiguate
        and set(position) == set(param_names)
    )
    if not same_parameters:
        return None
    return values[[position[name] for name in param_names]]


def run_bootstrap(
    df: pd.DataFrame,
    spec: ModelSpec,
    model_type: str = "mixed",
    n_replications: int = 100,
    maxiter: int = 200,
    seed: int = 42,
    progress_callback: Callable[[int, int], None] | None = None,
    *,
    correlated: bool = False,
    correlation_groups: list[list[int]] | None = None,
    n_classes: int | None = None,
    n_starts: int = 10,
    membership_cols: list[str] | None = None,
    bws_worst_col: str | None = None,
    estimate_lambda_w: bool = True,
) -> BootstrapResult:
    """
    Run non-parametric bootstrap by resampling individuals with replacement.

    The model is first estimated once on ``df`` to obtain the parameter names
    and point estimates. Each replication then resamples individuals,
    re-estimates the model, and contributes one row of estimates. A
    replication is skipped (and logged) when estimation raises or when its
    parameters do not match those of the original fit.

    Parameters
    ----------
    df : pd.DataFrame
        Long-format choice data.
    spec : ModelSpec
        Model specification.
    model_type : str
        "mixed", "conditional", "gmnl", or "latent_class".
    n_replications : int
        Number of bootstrap replications.
    maxiter : int
        Max optimizer iterations per replication.
    seed : int
        Base seed for reproducibility.
    progress_callback : callable, optional
        Called with (current_replication, n_replications) after each
        replication, whether or not it succeeded.
    correlated : bool
        Enable full correlation (Cholesky) for random parameters.
    correlation_groups : list[list[int]], optional
        Selective correlation groups (block-diagonal Cholesky).
    n_classes : int, optional
        Number of latent classes (for latent_class model type).
    n_starts : int
        Number of random starts (for latent_class).
    membership_cols : list[str], optional
        Membership covariates (for latent_class).
    bws_worst_col : str, optional
        Column name for BWS worst choices.
    estimate_lambda_w : bool
        Whether to estimate lambda_w for BWS.

    Returns
    -------
    BootstrapResult
        Standard errors use the sample standard deviation (ddof=1) and the
        intervals are the 2.5th and 97.5th percentiles of the kept estimates.

    Raises
    ------
    RuntimeError
        If fewer than two replications produced usable estimates.
    """
    import logging

    logger = logging.getLogger(__name__)
    rng = np.random.default_rng(seed)

    # Optional estimator arguments are forwarded only when they were set.
    extra_kwargs: dict[str, Any] = {}
    if correlated:
        extra_kwargs["correlated"] = True
    if correlation_groups is not None:
        extra_kwargs["correlation_groups"] = correlation_groups
    if n_classes is not None:
        extra_kwargs["n_classes"] = n_classes
    # Forwarded only when it differs from this function's default of 10.
    if n_starts != 10:
        extra_kwargs["n_starts"] = n_starts
    if membership_cols:
        extra_kwargs["membership_cols"] = membership_cols
    if bws_worst_col:
        extra_kwargs["bws_worst_col"] = bws_worst_col
        extra_kwargs["estimate_lambda_w"] = estimate_lambda_w

    # Reference fit on the full sample; a failure here is not caught.
    original = estimate_dataframe(
        df, spec, model_type=model_type, maxiter=maxiter, seed=seed, **extra_kwargs,
    )
    original_table = original.estimation.estimates
    param_names = original_table["parameter"].tolist()
    original_values = dict(zip(param_names, original_table["estimate"]))

    all_estimates: list[np.ndarray] = []

    for b in range(n_replications):
        # The same per-replication seed drives both the resampling and the estimator.
        rep_seed = int(rng.integers(0, 2**31))
        resampled = _resample_individuals(df, spec.id_col, np.random.default_rng(rep_seed))

        try:
            result = estimate_dataframe(
                resampled, spec, model_type=model_type, maxiter=maxiter,
                seed=rep_seed, **extra_kwargs,
            )
            est_values = _align_estimates(result.estimation.estimates, param_names)
        except Exception as exc:
            # Best effort: one failed replication must not abort the whole run,
            # but it should be visible rather than silently dropped.
            logger.warning(
                "Bootstrap replication %d/%d failed: %r", b + 1, n_replications, exc,
            )
        else:
            if est_values is None:
                # Stacking this row positionally would mix up parameters.
                logger.warning(
                    "Bootstrap replication %d/%d returned parameters that do not "
                    "match the original fit; skipping it.",
                    b + 1, n_replications,
                )
            else:
                all_estimates.append(est_values)

        if progress_callback is not None:
            progress_callback(b + 1, n_replications)

    n_successful = len(all_estimates)
    if n_successful < 2:
        raise RuntimeError(
            f"Only {n_successful} of {n_replications} bootstrap replications succeeded. "
            "Cannot compute bootstrap statistics."
        )

    # Rows are replications; columns follow param_names.
    estimates_matrix = np.array(all_estimates)

    std_errors = np.std(estimates_matrix, axis=0, ddof=1)
    lower, upper = np.percentile(estimates_matrix, [2.5, 97.5], axis=0)

    bootstrap_se = {name: float(se) for name, se in zip(param_names, std_errors)}
    percentile_ci = {
        name: (float(lo), float(hi))
        for name, lo, hi in zip(param_names, lower, upper)
    }

    return BootstrapResult(
        n_replications=n_replications,
        n_successful=n_successful,
        param_names=param_names,
        estimates_matrix=estimates_matrix,
        bootstrap_se=bootstrap_se,
        percentile_ci=percentile_ci,
        original_estimates=original_values,
    )
|
|