Spaces:

Wil2200
/

prefero

Sleeping

App Files Files Community

prefero / src /dce_analyzer /bootstrap.py

Wil2200

Add dual license (AGPL-3.0 + Commercial) and copyright notices

247642a about 2 months ago

raw

history blame contribute delete

6.44 kB

	# Copyright (C) 2026 Hengzhe Zhao. All rights reserved.
	# Licensed under dual license: AGPL-3.0 (open-source) or commercial. See LICENSE.

	"""Non-parametric bootstrap for discrete-choice model inference."""

	from __future__ import annotations

	from dataclasses import dataclass
	from typing import Any, Callable

	import numpy as np
	import pandas as pd

	from .config import ModelSpec
	from .pipeline import estimate_dataframe


	@dataclass
	class BootstrapResult:
	    """Stores the results of a bootstrap procedure.

	    The per-parameter dictionaries are keyed by the names listed in
	    ``param_names``.
	    """

	    n_replications: int  # number of replications requested
	    n_successful: int  # number of replications that yielded estimates
	    param_names: list[str]
	    estimates_matrix: np.ndarray  # (n_successful, n_params) — each row is one replication
	    bootstrap_se: dict[str, float]  # bootstrap standard error per parameter
	    percentile_ci: dict[str, tuple[float, float]]  # 95% CI per parameter
	    original_estimates: dict[str, float]  # estimates from the fit on the original data

	    def summary_dataframe(self) -> pd.DataFrame:
	        """Return a table with one row per entry of ``param_names``.

	        The columns are ``parameter``, ``original``, ``bootstrap_se``,
	        ``ci_lower`` and ``ci_upper``. A value that is missing for a
	        parameter is reported as NaN.

	        Returns
	        -------
	        pd.DataFrame
	            The summary table. All five columns are present even when
	            ``param_names`` is empty.
	        """
	        columns = ["parameter", "original", "bootstrap_se", "ci_lower", "ci_upper"]
	        nan = float("nan")

	        rows = []
	        for name in self.param_names:
	            bounds = self.percentile_ci.get(name, (nan, nan))
	            rows.append(
	                {
	                    "parameter": name,
	                    "original": self.original_estimates.get(name, nan),
	                    "bootstrap_se": self.bootstrap_se.get(name, nan),
	                    "ci_lower": bounds[0],
	                    "ci_upper": bounds[1],
	                }
	            )

	        # Explicit columns keep the schema stable when there are no rows.
	        return pd.DataFrame(rows, columns=columns)


	def _resample_individuals(df: pd.DataFrame, id_col: str, rng: np.random.Generator) -> pd.DataFrame:
	"""Resample individuals with replacement, keeping all tasks per individual."""
	unique_ids = df[id_col].unique()
	sampled_ids = rng.choice(unique_ids, size=len(unique_ids), replace=True)

	parts = []
	for new_idx, orig_id in enumerate(sampled_ids):
	chunk = df[df[id_col] == orig_id].copy()
	chunk[id_col] = new_idx
	parts.append(chunk)

	return pd.concat(parts, ignore_index=True)


	def run_bootstrap(
	    df: pd.DataFrame,
	    spec: ModelSpec,
	    model_type: str = "mixed",
	    n_replications: int = 100,
	    maxiter: int = 200,
	    seed: int = 42,
	    progress_callback: Callable[[int, int], None] | None = None,
	    *,
	    correlated: bool = False,
	    correlation_groups: list[list[int]] | None = None,
	    n_classes: int | None = None,
	    n_starts: int = 10,
	    membership_cols: list[str] | None = None,
	    bws_worst_col: str | None = None,
	    estimate_lambda_w: bool = True,
	) -> BootstrapResult:
	    """
	    Run non-parametric bootstrap by resampling individuals with replacement.

	    The model is first estimated on ``df`` to obtain the reference estimates
	    and parameter names. Each replication then re-estimates the model on a
	    resampled data set. Replications that raise, or that return a different
	    number of estimates than the reference fit, are logged and skipped.

	    Parameters
	    ----------
	    df : pd.DataFrame
	        Long-format choice data.
	    spec : ModelSpec
	        Model specification.
	    model_type : str
	        "mixed", "conditional", "gmnl", or "latent_class".
	    n_replications : int
	        Number of bootstrap replications.
	    maxiter : int
	        Max optimizer iterations per replication.
	    seed : int
	        Base seed for reproducibility.
	    progress_callback : callable, optional
	        Called with (current_replication, n_replications) after each replication,
	        whether or not it succeeded.
	    correlated : bool
	        Enable full correlation (Cholesky) for random parameters.
	    correlation_groups : list[list[int]], optional
	        Selective correlation groups (block-diagonal Cholesky).
	    n_classes : int, optional
	        Number of latent classes (for latent_class model type).
	    n_starts : int
	        Number of random starts (for latent_class).
	    membership_cols : list[str], optional
	        Membership covariates (for latent_class).
	    bws_worst_col : str, optional
	        Column name for BWS worst choices.
	    estimate_lambda_w : bool
	        Whether to estimate lambda_w for BWS.

	    Returns
	    -------
	    BootstrapResult

	    Raises
	    ------
	    RuntimeError
	        If fewer than two replications succeed. The most recent replication
	        failure, if any, is attached as the cause.
	    """
	    # Local import: the module's import block does not provide ``logging``.
	    import logging

	    logger = logging.getLogger(__name__)
	    rng = np.random.default_rng(seed)

	    # Build extra kwargs for estimate_dataframe. Options left at their
	    # defaults are not forwarded.
	    extra_kwargs: dict[str, Any] = {}
	    if correlated:
	        extra_kwargs["correlated"] = True
	    if correlation_groups is not None:
	        extra_kwargs["correlation_groups"] = correlation_groups
	    if n_classes is not None:
	        extra_kwargs["n_classes"] = n_classes
	    if n_starts != 10:
	        extra_kwargs["n_starts"] = n_starts
	    if membership_cols:
	        extra_kwargs["membership_cols"] = membership_cols
	    if bws_worst_col:
	        extra_kwargs["bws_worst_col"] = bws_worst_col
	        extra_kwargs["estimate_lambda_w"] = estimate_lambda_w

	    # Run original estimation for reference.
	    original = estimate_dataframe(
	        df, spec, model_type=model_type, maxiter=maxiter, seed=seed, **extra_kwargs,
	    )
	    reference = original.estimation.estimates
	    param_names = reference["parameter"].tolist()
	    original_values = dict(zip(reference["parameter"], reference["estimate"]))
	    n_params = len(param_names)

	    all_estimates: list[np.ndarray] = []
	    last_error: Exception | None = None

	    for b in range(n_replications):
	        # One seed per replication drives both the resampling and the estimation.
	        rep_seed = int(rng.integers(0, 2**31))
	        resampled = _resample_individuals(df, spec.id_col, np.random.default_rng(rep_seed))

	        try:
	            result = estimate_dataframe(
	                resampled, spec, model_type=model_type, maxiter=maxiter,
	                seed=rep_seed, **extra_kwargs,
	            )
	            est_values = np.asarray(
	                result.estimation.estimates["estimate"].to_numpy(), dtype=float
	            )
	            # Columns are matched to param_names by position, so a replication
	            # with a different number of estimates cannot be used.
	            if est_values.shape != (n_params,):
	                raise ValueError(
	                    f"expected {n_params} estimates, got shape {est_values.shape}"
	                )
	        except Exception as exc:
	            # Deliberately broad: a failed replication is skipped, not fatal.
	            last_error = exc
	            logger.warning(
	                "Bootstrap replication %d of %d failed and was skipped: %r",
	                b + 1, n_replications, exc,
	            )
	        else:
	            all_estimates.append(est_values)

	        if progress_callback is not None:
	            progress_callback(b + 1, n_replications)

	    n_successful = len(all_estimates)
	    # The sample standard deviation (ddof=1) needs at least two rows.
	    if n_successful < 2:
	        raise RuntimeError(
	            f"Only {n_successful} of {n_replications} bootstrap replications succeeded. "
	            "Cannot compute bootstrap statistics."
	        ) from last_error

	    estimates_matrix = np.array(all_estimates)  # (n_successful, n_params)

	    bootstrap_se: dict[str, float] = {}
	    percentile_ci: dict[str, tuple[float, float]] = {}
	    for i, name in enumerate(param_names):
	        col = estimates_matrix[:, i]
	        bootstrap_se[name] = float(np.std(col, ddof=1))
	        percentile_ci[name] = (
	            float(np.percentile(col, 2.5)),
	            float(np.percentile(col, 97.5)),
	        )

	    return BootstrapResult(
	        n_replications=n_replications,
	        n_successful=n_successful,
	        param_names=param_names,
	        estimates_matrix=estimates_matrix,
	        bootstrap_se=bootstrap_se,
	        percentile_ci=percentile_ci,
	        original_estimates=original_values,
	    )