Add A/B testing framework for strategy comparison with statistical significance testing
Browse files- ab_testing.py +706 -0
ab_testing.py
ADDED
|
@@ -0,0 +1,706 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""A/B Testing Framework for Strategy Comparison
|
| 2 |
+
|
| 3 |
+
At Jane Street, Two Sigma, Citadel — EVERY change goes through A/B testing.
|
| 4 |
+
Not backtest-once-and-ship. Real randomized controlled trials.
|
| 5 |
+
|
| 6 |
+
Why A/B testing beats backtesting:
|
| 7 |
+
- Backtests: optimize on all data → overfit
|
| 8 |
+
- A/B tests: train on A, test on B → honest evaluation
|
| 9 |
+
- Statistical significance: p-values, not gut feeling
|
| 10 |
+
- Multiple comparison correction: Bonferroni, FDR
|
| 11 |
+
- Early stopping: peeking at results invalidates p-values
|
| 12 |
+
|
| 13 |
+
This module:
|
| 14 |
+
1. Randomized strategy assignment
|
| 15 |
+
2. Statistical tests (t-test, Mann-Whitney, permutation)
|
| 16 |
+
3. Power analysis (how long to run test)
|
| 17 |
+
4. Sequential testing (early stopping without p-hacking)
|
| 18 |
+
5. Multiple comparison correction
|
| 19 |
+
6. Counterfactual estimation (what would have happened with other strategy)
|
| 20 |
+
|
| 21 |
+
Based on:
|
| 22 |
+
- Kohavi et al. (2009): "Controlled experiments on the web"
|
| 23 |
+
- Johari et al. (2017): "Peeking at A/B Tests"
|
| 24 |
+
- Deng et al. (2013): "Trustworthy Online Controlled Experiments"
|
| 25 |
+
"""
|
| 26 |
+
import hashlib
import warnings
from collections import defaultdict
from dataclasses import dataclass
from typing import Dict, List, Tuple, Optional, Callable

import numpy as np
import pandas as pd
from scipy import stats
from scipy.special import erfinv

warnings.filterwarnings('ignore')
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
@dataclass
class ExperimentConfig:
    """Configuration for an A/B test"""
    strategy_a_name: str
    strategy_b_name: str
    alpha: float = 0.05  # Significance level
    power: float = 0.80  # Statistical power (1 - beta)
    min_detectable_effect: float = 0.01  # Sharpe difference to detect
    baseline_sharpe: float = 1.0
    trading_days_per_year: int = 252

    def required_samples(self) -> int:
        """
        Return the per-group sample size needed to detect the configured
        Sharpe difference at the configured alpha / power.

        Classic two-sample normal power analysis, two-tailed, applied to
        daily returns.
        """
        # Daily volatility implied by a typical 15% annual equity vol.
        daily_vol = 0.15 / np.sqrt(self.trading_days_per_year)

        # A Sharpe gap of `min_detectable_effect` maps to a gap in daily
        # mean returns of effect * vol (Sharpe = mean / vol).
        mean_diff = self.min_detectable_effect * daily_vol

        # Two independent samples with the same vol -> pooled sd = vol * sqrt(2).
        pooled_std = daily_vol * np.sqrt(2)

        # Standardized effect size (Cohen's d). Note the vol cancels, so
        # the answer depends only on min_detectable_effect.
        cohens_d = mean_diff / pooled_std

        # n per arm = 2 * ((z_{1-alpha/2} + z_{power}) / d)^2
        z_alpha = stats.norm.ppf(1 - self.alpha / 2)
        z_beta = stats.norm.ppf(self.power)
        n_per_group = 2 * ((z_alpha + z_beta) / cohens_d) ** 2
        return int(np.ceil(n_per_group))
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
class ABTest:
    """
    A/B test for trading strategy comparison.

    Critical design decisions:
    1. Random assignment: which days/assets get A vs B
    2. Stratification: ensure similar market conditions
    3. Unit of diversion: per day? per asset? per trade?
    4. Guardrail metrics: ensure B doesn't increase risk
    """

    def __init__(self,
                 config: ExperimentConfig,
                 diversion_unit: str = 'day',
                 stratify_by: Optional[List[str]] = None):
        """
        config: experiment parameters (alpha, power, min detectable effect).
        diversion_unit: informational label for the randomization unit.
        stratify_by: covariate names used to balance A/B within strata.
        """
        self.config = config
        self.diversion_unit = diversion_unit
        self.stratify_by = stratify_by or []

        # Results storage
        self.group_a_results = []
        self.group_b_results = []
        self.assignment_log = []

        # Sequential testing state
        self.n_observations = 0
        self.running_t_stat = 0
        self.sequential_bounds = None

    def _stratum_key(self, covariates: Dict) -> str:
        """Canonical stratum identifier built from the configured covariates."""
        return '_'.join(str(covariates.get(k, '')) for k in self.stratify_by)

    def assign(self,
               unit_id: str,
               covariates: Optional[Dict] = None) -> str:
        """
        Assign a unit to group 'A' or 'B'.

        Randomization is keyed on unit_id via a stable MD5 digest — NOT the
        builtin hash(), which is salted per process (PYTHONHASHSEED) and
        would make the documented "consistent" assignment irreproducible
        across runs. A local RandomState is used so the caller's global
        np.random stream is not perturbed.

        With stratification: alternate A/B within each stratum so the two
        groups stay balanced under similar market conditions.
        """
        # Stable, process-independent seed derived from the unit id.
        seed = int(hashlib.md5(unit_id.encode('utf-8')).hexdigest(), 16) % 2**32
        rng = np.random.RandomState(seed)

        stratum_key = None
        if covariates and self.stratify_by:
            # Stratified assignment: balance counts within this stratum.
            stratum_key = self._stratum_key(covariates)

            stratum_assignments = [
                log for log in self.assignment_log
                if log.get('stratum') == stratum_key
            ]

            n_a = sum(1 for log in stratum_assignments if log['group'] == 'A')
            n_b = sum(1 for log in stratum_assignments if log['group'] == 'B')

            # Alternate to maintain balance (ties go to A)
            group = 'A' if n_a <= n_b else 'B'
        else:
            # Simple random (but unit-reproducible) assignment
            group = 'A' if rng.rand() < 0.5 else 'B'

        log_entry = {
            'unit_id': unit_id,
            'group': group,
            'timestamp': pd.Timestamp.now(),
            'covariates': covariates or {}
        }
        if stratum_key is not None:
            log_entry['stratum'] = stratum_key

        self.assignment_log.append(log_entry)

        return group

    def record_result(self,
                      unit_id: str,
                      group: str,
                      primary_metric: float,
                      guardrail_metrics: Optional[Dict] = None):
        """
        Record outcome for an assigned unit.

        primary_metric: Usually P&L or Sharpe contribution
        guardrail_metrics: Risk metrics (drawdown, volatility, etc.)
        """
        result = {
            'unit_id': unit_id,
            'group': group,
            'primary': primary_metric,
            'guardrails': guardrail_metrics or {},
            'timestamp': pd.Timestamp.now()
        }

        if group == 'A':
            self.group_a_results.append(result)
        else:
            self.group_b_results.append(result)

        self.n_observations += 1

    def analyze(self,
                metric: str = 'primary',
                test_type: str = 't_test') -> Dict:
        """
        Statistical analysis of A vs B.

        test_type:
        - 't_test': Welch's t-test (does not assume equal variances)
        - 'mann_whitney': Non-parametric, robust to outliers
        - 'permutation': Distribution-free via resampling
        - 'bootstrap': Confidence intervals via resampling

        Returns a dict with descriptive stats, effect size, p-value and a
        recommendation ('ADOPT_B' / 'KEEP_A' / 'INCONCLUSIVE').
        """
        a_values = [r[metric] for r in self.group_a_results]
        b_values = [r[metric] for r in self.group_b_results]

        if len(a_values) < 3 or len(b_values) < 3:
            return {
                'status': 'insufficient_data',
                'n_a': len(a_values),
                'n_b': len(b_values),
                'required_n': self.config.required_samples()
            }

        a_arr = np.array(a_values)
        b_arr = np.array(b_values)

        # Descriptive stats
        results = {
            'n_a': len(a_arr),
            'n_b': len(b_arr),
            'mean_a': np.mean(a_arr),
            'mean_b': np.mean(b_arr),
            'std_a': np.std(a_arr, ddof=1),
            'std_b': np.std(b_arr, ddof=1),
            'median_a': np.median(a_arr),
            'median_b': np.median(b_arr),
        }

        # Effect size (Cohen's d); epsilon guards against zero variance
        pooled_std = np.sqrt((results['std_a']**2 + results['std_b']**2) / 2)
        cohens_d = (results['mean_b'] - results['mean_a']) / (pooled_std + 1e-10)
        results['cohens_d'] = cohens_d
        results['effect_size_interpretation'] = self._interpret_cohens_d(abs(cohens_d))

        # Statistical tests
        if test_type == 't_test':
            t_stat, p_value = stats.ttest_ind(a_arr, b_arr, equal_var=False)
            results['test'] = 'welch_t_test'
            results['t_statistic'] = t_stat
            results['p_value'] = p_value

        elif test_type == 'mann_whitney':
            u_stat, p_value = stats.mannwhitneyu(a_arr, b_arr, alternative='two-sided')
            results['test'] = 'mann_whitney_u'
            results['u_statistic'] = u_stat
            results['p_value'] = p_value

        elif test_type == 'permutation':
            observed_diff = np.mean(b_arr) - np.mean(a_arr)
            all_values = np.concatenate([a_arr, b_arr])
            n = len(a_arr)

            perm_diffs = []
            for _ in range(10000):
                np.random.shuffle(all_values)
                perm_a = all_values[:n]
                perm_b = all_values[n:]
                perm_diffs.append(np.mean(perm_b) - np.mean(perm_a))

            perm_diffs = np.array(perm_diffs)
            # Two-sided p: fraction of permuted diffs at least as extreme
            p_value = np.mean(np.abs(perm_diffs) >= np.abs(observed_diff))

            results['test'] = 'permutation'
            results['observed_difference'] = observed_diff
            results['p_value'] = p_value
            results['ci_95'] = (
                np.percentile(perm_diffs, 2.5),
                np.percentile(perm_diffs, 97.5)
            )

        elif test_type == 'bootstrap':
            boot_diffs = []
            for _ in range(10000):
                boot_a = np.random.choice(a_arr, size=len(a_arr), replace=True)
                boot_b = np.random.choice(b_arr, size=len(b_arr), replace=True)
                boot_diffs.append(np.mean(boot_b) - np.mean(boot_a))

            boot_diffs = np.array(boot_diffs)
            results['test'] = 'bootstrap'
            results['ci_95'] = (
                np.percentile(boot_diffs, 2.5),
                np.percentile(boot_diffs, 97.5)
            )
            results['ci_99'] = (
                np.percentile(boot_diffs, 0.5),
                np.percentile(boot_diffs, 99.5)
            )
            # Two-sided bootstrap p-value: double the smaller tail mass.
            # (The previous one-sided version understated p by up to 2x.)
            tail = min(np.mean(boot_diffs <= 0), np.mean(boot_diffs >= 0))
            results['p_value'] = min(2 * tail, 1.0)

        # Significance (an unknown test_type falls through with p defaulting to 1.0)
        results['significant'] = results.get('p_value', 1.0) < self.config.alpha
        results['alpha'] = self.config.alpha

        # Practical significance: standardized difference vs the minimum
        # effect the experiment was designed to detect.
        practical_threshold = self.config.min_detectable_effect
        mean_diff = results['mean_b'] - results['mean_a']
        standardized_diff = abs(mean_diff) / (pooled_std + 1e-10)

        results['practically_significant'] = standardized_diff > practical_threshold
        results['practical_threshold'] = practical_threshold

        # Recommendation: require BOTH statistical and practical significance
        if results['significant'] and results['practically_significant']:
            results['recommendation'] = 'ADOPT_B' if mean_diff > 0 else 'KEEP_A'
        else:
            results['recommendation'] = 'INCONCLUSIVE'

        return results

    def _interpret_cohens_d(self, d: float) -> str:
        """Interpret effect size magnitude (Cohen, 1988 conventions)."""
        if d < 0.2:
            return 'negligible'
        elif d < 0.5:
            return 'small'
        elif d < 0.8:
            return 'medium'
        else:
            return 'large'

    def guardrail_check(self) -> Dict:
        """
        Check if B violates guardrail metrics (risk limits).

        Compares group medians metric-by-metric and flags B when its median
        risk is materially worse than A's (drawdown: >1.5x; vol/VaR: >1.3x).
        """
        checks = {}

        # Collect guardrail metrics per group, keyed by metric name
        a_guardrails = defaultdict(list)
        b_guardrails = defaultdict(list)

        for r in self.group_a_results:
            for k, v in r['guardrails'].items():
                a_guardrails[k].append(v)

        for r in self.group_b_results:
            for k, v in r['guardrails'].items():
                b_guardrails[k].append(v)

        # Compare
        violations = []

        for metric in a_guardrails.keys():
            a_vals = np.array(a_guardrails[metric])
            b_vals = np.array(b_guardrails[metric])

            # Skip metrics B never reported — a median of [] is undefined
            if len(b_vals) == 0:
                continue

            median_a = np.median(a_vals)
            median_b = np.median(b_vals)

            # Metric-specific thresholds (lower is better for all of these).
            # Match 'vol' rather than 'volatility' so names like 'daily_vol'
            # are actually checked instead of silently skipped.
            if 'drawdown' in metric.lower():
                if median_b > median_a * 1.5:
                    violations.append({
                        'metric': metric,
                        'severity': 'high' if median_b > median_a * 2 else 'medium',
                        'a_median': median_a,
                        'b_median': median_b,
                        'direction': 'worse'
                    })
            elif 'vol' in metric.lower() or 'var' in metric.lower():
                if median_b > median_a * 1.3:
                    violations.append({
                        'metric': metric,
                        'severity': 'high' if median_b > median_a * 1.5 else 'medium',
                        'a_median': median_a,
                        'b_median': median_b,
                        'direction': 'worse'
                    })

        checks['violations'] = violations
        checks['is_safe'] = len(violations) == 0
        checks['n_metrics_checked'] = len(a_guardrails)

        return checks

    def get_counterfactual(self,
                           unit_id: str,
                           strategy_fn: Callable,
                           data: Dict) -> Dict:
        """
        Counterfactual: What would have happened with the OTHER strategy?

        strategy_fn(data, group) must return the simulated outcome for the
        given group.

        Useful for:
        - Causal inference: treatment effect estimation
        - Variance reduction: use both A and B predictions
        """
        # Look up this unit's recorded assignment
        assigned = [log for log in self.assignment_log if log['unit_id'] == unit_id]

        if not assigned:
            return {'error': 'Unit not found'}

        actual_group = assigned[0]['group']
        counterfactual_group = 'B' if actual_group == 'A' else 'A'

        # Compute counterfactual outcome
        counterfactual_outcome = strategy_fn(data, counterfactual_group)

        return {
            'unit_id': unit_id,
            'actual_group': actual_group,
            'counterfactual_group': counterfactual_group,
            'counterfactual_outcome': counterfactual_outcome,
            'note': 'Counterfactuals are hypothetical — cannot observe both'
        }

    def summary_report(self) -> str:
        """Generate human-readable summary report covering sample size,
        the primary-metric test, and guardrail status."""
        analysis = self.analyze()
        guardrails = self.guardrail_check()

        report = f"""
{'='*70}
A/B TEST REPORT: {self.config.strategy_a_name} vs {self.config.strategy_b_name}
{'='*70}

SAMPLE SIZE
  Group A: {analysis['n_a']} units
  Group B: {analysis['n_b']} units
  Required: {self.config.required_samples()} per group
  Status: {'✓ Sufficient' if analysis['n_a'] >= self.config.required_samples() else '⚠ Under-powered'}

PRIMARY METRIC: {analysis.get('test', 'N/A')}
  A mean: {analysis.get('mean_a', 0):.6f} (±{analysis.get('std_a', 0):.6f})
  B mean: {analysis.get('mean_b', 0):.6f} (±{analysis.get('std_b', 0):.6f})
  Difference: {analysis.get('mean_b', 0) - analysis.get('mean_a', 0):+.6f}
  Cohen's d: {analysis.get('cohens_d', 0):.3f} ({analysis.get('effect_size_interpretation', 'N/A')})

  P-value: {analysis.get('p_value', 'N/A')}
  Significant (α={self.config.alpha}): {'✓ YES' if analysis.get('significant') else '✗ NO'}
  Practically significant: {'✓ YES' if analysis.get('practically_significant') else '✗ NO'}

RECOMMENDATION: {analysis.get('recommendation', 'N/A')}

GUARDRAIL METRICS
  Status: {'✓ Safe' if guardrails['is_safe'] else '⚠ VIOLATIONS DETECTED'}
  Violations: {len(guardrails['violations'])}
"""

        if guardrails['violations']:
            for v in guardrails['violations']:
                report += f"  - {v['metric']}: {v['severity'].upper()} (B is {v['direction']})\n"

        report += f"""
{'='*70}
"""

        return report
|
| 442 |
+
|
| 443 |
+
|
| 444 |
+
class MultipleComparisonCorrection:
    """
    Correct for testing multiple hypotheses simultaneously.

    Running 20 A/B tests? Expect 1 false positive by chance (p=0.05).
    Without correction, you'll adopt 1 bad strategy per 20 tests.
    """

    @staticmethod
    def bonferroni(p_values: np.ndarray, alpha: float = 0.05) -> Tuple[float, np.ndarray]:
        """
        Bonferroni correction: α_corrected = α / n_tests

        Conservative: controls family-wise error rate (FWER).

        Returns (corrected_alpha, boolean significance mask).
        """
        n = len(p_values)
        corrected_alpha = alpha / n
        is_significant = p_values < corrected_alpha

        return corrected_alpha, is_significant

    @staticmethod
    def benjamini_hochberg(p_values: np.ndarray, alpha: float = 0.05) -> np.ndarray:
        """
        Benjamini-Hochberg: controls False Discovery Rate (FDR).

        Less conservative than Bonferroni.
        Accept that some fraction of "discoveries" are false.

        Step-UP rule: find the LARGEST k with p_(k) <= (k/m) * α and reject
        hypotheses 1..k. (Scanning from the smallest p and stopping at the
        first failure is wrong — a later p can still satisfy its threshold
        and pull earlier ones in with it.)
        """
        n = len(p_values)
        sorted_idx = np.argsort(p_values)
        sorted_p = p_values[sorted_idx]

        is_significant = np.zeros(n, dtype=bool)

        # Scan from the largest p downward to find the cutoff k.
        k_cut = 0
        for i in range(n - 1, -1, -1):
            if sorted_p[i] <= ((i + 1) / n) * alpha:
                k_cut = i + 1
                break

        # Reject the k_cut smallest p-values (in original positions).
        is_significant[sorted_idx[:k_cut]] = True

        return is_significant

    @staticmethod
    def holm(p_values: np.ndarray, alpha: float = 0.05) -> np.ndarray:
        """
        Holm's step-down procedure.

        Controls FWER, more powerful than Bonferroni. Unlike BH, stopping
        at the first failing p-value IS correct here (step-down rule).
        """
        n = len(p_values)
        sorted_idx = np.argsort(p_values)
        sorted_p = p_values[sorted_idx]

        is_significant = np.zeros(n, dtype=bool)

        for i in range(n):
            k = i + 1
            # Threshold shrinks as k grows: α/n, α/(n-1), ..., α
            threshold = alpha / (n - k + 1)
            if sorted_p[i] <= threshold:
                is_significant[sorted_idx[i]] = True
            else:
                break

        return is_significant
|
| 512 |
+
|
| 513 |
+
|
| 514 |
+
class SequentialABTest:
    """
    Sequential A/B testing with valid early stopping.

    Problem: Peeking at results and stopping when p<0.05 → inflates Type I error.
    Solution: Use sequential boundaries (always valid p-values).

    Based on: Always Valid P-values (Johari et al., 2017)
    """

    def __init__(self,
                 config: ExperimentConfig,
                 spending_function: str = 'obrien_fleming'):
        """Initialize with experiment config and an alpha-spending label."""
        self.config = config
        self.spending_function = spending_function

        self.observations = []
        # Running sums and sums-of-squares: means/variances are computed
        # incrementally, without re-scanning the observation list.
        self.cumsum_a = 0
        self.cumsum_b = 0
        self.cumsum_sq_a = 0
        self.cumsum_sq_b = 0
        self.n_a = 0
        self.n_b = 0

    def update(self, group: str, value: float):
        """Add one observation and test for significance"""
        if group == 'A':
            self.n_a += 1
            self.cumsum_a += value
            self.cumsum_sq_a += value ** 2
        else:
            self.n_b += 1
            self.cumsum_b += value
            self.cumsum_sq_b += value ** 2

        self.observations.append({'group': group, 'value': value})

        # Re-evaluate the always-valid p-value after every observation.
        return self._compute_always_valid_p()

    def _compute_always_valid_p(self) -> Dict:
        """Compute always-valid p-value for early stopping"""
        # Need at least two observations per arm for a sample variance.
        if self.n_a < 2 or self.n_b < 2:
            return {'n': len(self.observations), 'p_value': 1.0, 'can_stop': False}

        mean_a = self.cumsum_a / self.n_a
        mean_b = self.cumsum_b / self.n_b

        # Unbiased sample variances recovered from the running sums.
        var_a = (self.cumsum_sq_a - self.n_a * mean_a**2) / (self.n_a - 1)
        var_b = (self.cumsum_sq_b - self.n_b * mean_b**2) / (self.n_b - 1)

        # Welch-style standard error of the mean difference; epsilon
        # avoids division by zero when both arms are constant.
        se = np.sqrt(var_a / self.n_a + var_b / self.n_b)
        z = (mean_b - mean_a) / (se + 1e-10)

        # Always-valid adjustment: inflate the raw two-sided p by log(n)
        # (mixture-boundary approximation) so it remains valid under
        # continuous monitoring.
        n_eff = min(self.n_a, self.n_b)
        raw_p = 2 * (1 - stats.norm.cdf(abs(z)))
        adjusted_p = min(raw_p * np.log(max(n_eff, np.e)), 1.0)

        can_stop = adjusted_p < self.config.alpha

        return {
            'n': len(self.observations),
            'n_a': self.n_a,
            'n_b': self.n_b,
            'mean_a': mean_a,
            'mean_b': mean_b,
            'z_statistic': z,
            'raw_p_value': raw_p,
            'adjusted_p_value': adjusted_p,
            'can_stop': can_stop,
            'recommendation': 'STOP' if can_stop else 'CONTINUE'
        }
|
| 597 |
+
|
| 598 |
+
|
| 599 |
+
# Demo / smoke test: exercises power analysis, a simulated A/B test,
# guardrail checks, multiple-comparison correction, and sequential testing.
# Printed numbers depend on the exact np.random call sequence below.
if __name__ == '__main__':
    print("=" * 70)
    print(" A/B TESTING FRAMEWORK FOR STRATEGIES")
    print("=" * 70)

    # Fixed seed so the demo output is reproducible run-to-run.
    np.random.seed(42)

    # Configuration
    config = ExperimentConfig(
        strategy_a_name='Baseline_Momentum',
        strategy_b_name='ML_Alpha_v3',
        alpha=0.05,
        power=0.80,
        min_detectable_effect=0.05,  # Detect 0.05 Sharpe difference
        baseline_sharpe=1.0
    )

    # Power analysis: how many units per arm before the test is meaningful
    required_n = config.required_samples()
    print(f"\n1. POWER ANALYSIS")
    print(f" Required sample size per group: {required_n}")
    print(f" (Detect Sharpe diff of {config.min_detectable_effect} with {config.power*100:.0f}% power)")

    # Run A/B test, stratified on volatility regime so A and B see
    # similar market conditions
    print(f"\n2. SIMULATED A/B TEST")
    test = ABTest(config, diversion_unit='day', stratify_by=['volatility_regime'])

    # Simulate 400 days
    n_days = 400

    # Strategy A: Sharpe = 0.8
    # Strategy B: Sharpe = 1.2 (better by 0.4)
    daily_vol = 0.15 / np.sqrt(252)

    for day in range(n_days):
        # Volatility regime (for stratification); ~20% of days are 'high'
        regime = 'high' if np.random.rand() < 0.2 else 'normal'

        # Assign
        unit_id = f'day_{day:04d}'
        group = test.assign(unit_id, {'volatility_regime': regime})

        # Simulate returns: daily mean = Sharpe * daily_vol
        if group == 'A':
            # Baseline: mean = 0.8 * daily_vol
            ret = np.random.normal(0.8 * daily_vol, daily_vol)
        else:
            # Better: mean = 1.2 * daily_vol
            ret = np.random.normal(1.2 * daily_vol, daily_vol)

        # Guardrails: synthetic risk metrics recorded alongside P&L
        guardrails = {
            'max_drawdown': abs(np.random.exponential(0.02)),
            'daily_vol': abs(np.random.normal(daily_vol, daily_vol * 0.3))
        }

        test.record_result(unit_id, group, ret, guardrails)

    # Analysis (Welch's t-test on daily returns)
    analysis = test.analyze(test_type='t_test')

    print(f"\n3. STATISTICAL RESULTS")
    print(f" Group A (n={analysis['n_a']}): mean={analysis['mean_a']:.6f}")
    print(f" Group B (n={analysis['n_b']}): mean={analysis['mean_b']:.6f}")
    print(f" Difference: {analysis['mean_b'] - analysis['mean_a']:+.6f}")
    print(f" Cohen's d: {analysis['cohens_d']:.3f}")
    print(f" P-value: {analysis['p_value']:.4f}")
    print(f" Significant: {'✓ YES' if analysis['significant'] else '✗ NO'}")
    print(f" RECOMMENDATION: {analysis['recommendation']}")

    # Guardrails: did B take on materially more risk than A?
    guardrail_check = test.guardrail_check()
    print(f"\n4. GUARDRAIL CHECK")
    print(f" Safe: {'✓ YES' if guardrail_check['is_safe'] else '✗ VIOLATIONS'}")

    # Multiple comparison: mix the real p-value with synthetic ones to
    # show how FDR correction thins out "discoveries"
    print(f"\n5. MULTIPLE COMPARISON CORRECTION")
    p_values = np.array([analysis['p_value'], 0.03, 0.08, 0.001, 0.12, 0.04])

    bh_sig = MultipleComparisonCorrection.benjamini_hochberg(p_values)
    print(f" Raw significant: {np.sum(p_values < 0.05)}/{len(p_values)}")
    print(f" BH-FDR significant: {np.sum(bh_sig)}/{len(p_values)}")

    # Full report
    print(f"\n6. FULL REPORT")
    print(test.summary_report())

    # Sequential test: stream observations one at a time and stop as soon
    # as the always-valid p-value crosses alpha
    print(f"7. SEQUENTIAL TESTING")
    seq_test = SequentialABTest(config)

    for i in range(200):
        group = 'A' if np.random.rand() < 0.5 else 'B'
        value = np.random.normal(0.8 * daily_vol if group == 'A' else 1.2 * daily_vol, daily_vol)
        result = seq_test.update(group, value)

        if result['can_stop']:
            print(f" Sequential test STOPPED at n={result['n']}")
            print(f" Adjusted p-value: {result['adjusted_p_value']:.4f}")
            break

    print(f"\n KEY TAKEAWAYS:")
    print(f" - Always A/B test before deploying")
    print(f" - Multiple comparison correction prevents false discoveries")
    print(f" - Guardrail metrics prevent hidden risk increases")
    print(f" - Sequential testing enables early stopping (with valid p-values)")
    print(f" - Power analysis ensures tests aren't underpowered")
    print(f" - This is EXACTLY how Jane Street validates every strategy change")