Premchan369 committed · Commit a512ac0 · verified · 1 parent: 7a0ba11

Add A/B testing framework for strategy comparison with statistical significance testing

Files changed (1): ab_testing.py (added, +706 -0)

ab_testing.py:
"""A/B Testing Framework for Strategy Comparison

At quant firms such as Jane Street, Two Sigma, and Citadel, strategy changes
go through randomized controlled trials, not backtest-once-and-ship.

Why A/B testing beats backtesting:
- Backtests: optimize on all data → overfit
- A/B tests: train on A, test on B → honest evaluation
- Statistical significance: p-values, not gut feeling
- Multiple comparison correction: Bonferroni, FDR
- Early stopping: naive peeking invalidates p-values; sequential methods
  make stopping early valid

This module provides:
1. Randomized strategy assignment
2. Statistical tests (t-test, Mann-Whitney, permutation, bootstrap)
3. Power analysis (how long to run the test)
4. Sequential testing (early stopping without p-hacking)
5. Multiple comparison correction
6. Counterfactual estimation (what would have happened with the other strategy)

Based on:
- Kohavi et al. (2009): "Controlled Experiments on the Web: Survey and
  Practical Guide"
- Johari et al. (2017): "Peeking at A/B Tests: Why It Matters, and What to
  Do About It"
- Deng et al. (2013): "Improving the Sensitivity of Online Controlled
  Experiments by Utilizing Pre-Experiment Data"
"""
import hashlib
import warnings
from collections import defaultdict  # used by ABTest.guardrail_check
from dataclasses import dataclass
from typing import Callable, Dict, List, Optional, Tuple

import numpy as np
import pandas as pd
from scipy import stats

warnings.filterwarnings('ignore')

@dataclass
class ExperimentConfig:
    """Configuration for an A/B test."""
    strategy_a_name: str
    strategy_b_name: str
    alpha: float = 0.05                   # Significance level
    power: float = 0.80                   # Statistical power (1 - beta)
    min_detectable_effect: float = 0.01   # Daily Sharpe difference to detect
    baseline_sharpe: float = 1.0
    trading_days_per_year: int = 252

    def required_samples(self) -> int:
        """
        Calculate the required sample size per group using power analysis,
        for a Sharpe-ratio comparison on daily returns.
        """
        # Daily return volatility, assuming ~15% annual vol (typical equity):
        # daily_vol = annual_vol / sqrt(252)
        daily_vol = 0.15 / np.sqrt(self.trading_days_per_year)

        # Sharpe = (mean_return - r_f) / vol, so a daily Sharpe difference of
        # min_detectable_effect corresponds to a mean-return difference of
        # min_detectable_effect * daily_vol
        mean_diff = self.min_detectable_effect * daily_vol

        # Cohen's d (standardized effect size). With equal variances in both
        # groups, the pooled per-group SD is simply daily_vol, so d equals
        # the daily Sharpe difference itself.
        cohens_d = mean_diff / daily_vol

        # Sample size per group for a two-tailed, two-sample test:
        #   n = 2 * ((z_{1-alpha/2} + z_{power}) / d)^2
        z_alpha = stats.norm.ppf(1 - self.alpha / 2)
        z_beta = stats.norm.ppf(self.power)

        n_per_group = 2 * ((z_alpha + z_beta) / cohens_d) ** 2

        return int(np.ceil(n_per_group))

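# Power-analysis sketch (illustrative numbers, separate from the demo below):
# at alpha=0.05 (z=1.96) and 80% power (z=0.84), detecting a daily Sharpe
# difference of 0.05 needs n = 2 * ((1.96 + 0.84) / 0.05)^2 ≈ 6,280 daily
# observations per group; that is roughly 25 years of trading days per arm,
# which is why realistic tests target larger effects or finer-grained units.
#
#   cfg = ExperimentConfig('A', 'B', min_detectable_effect=0.05)
#   cfg.required_samples()  # ≈ 6280
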
class ABTest:
    """
    A/B test for trading strategy comparison.

    Critical design decisions:
    1. Random assignment: which days/assets get A vs B
    2. Stratification: ensure similar market conditions in both groups
    3. Unit of diversion: per day? per asset? per trade?
    4. Guardrail metrics: ensure B doesn't increase risk
    """

    def __init__(self,
                 config: ExperimentConfig,
                 diversion_unit: str = 'day',
                 stratify_by: Optional[List[str]] = None):
        self.config = config
        self.diversion_unit = diversion_unit
        self.stratify_by = stratify_by or []

        # Results storage
        self.group_a_results = []
        self.group_b_results = []
        self.assignment_log = []

        self.n_observations = 0

    def assign(self,
               unit_id: str,
               covariates: Optional[Dict] = None) -> str:
        """
        Randomly assign a unit to A or B.

        With stratification: balance A/B within strata.
        """
        # Stable hash-based assignment so the same unit always gets the same
        # group (Python's built-in hash() is salted per process, so we derive
        # the seed from hashlib instead). A local Generator avoids clobbering
        # global NumPy random state.
        seed = int(hashlib.md5(unit_id.encode()).hexdigest(), 16) % 2**32
        rng = np.random.default_rng(seed)

        stratum_key = None
        if covariates and self.stratify_by:
            # Stratified assignment: alternate within the stratum to keep
            # group sizes balanced
            stratum_key = '_'.join(str(covariates.get(k, '')) for k in self.stratify_by)

            stratum_assignments = [
                log for log in self.assignment_log
                if log.get('stratum') == stratum_key
            ]

            n_a = sum(1 for log in stratum_assignments if log['group'] == 'A')
            n_b = sum(1 for log in stratum_assignments if log['group'] == 'B')

            group = 'A' if n_a <= n_b else 'B'
        else:
            # Simple random assignment
            group = 'A' if rng.random() < 0.5 else 'B'

        log_entry = {
            'unit_id': unit_id,
            'group': group,
            'timestamp': pd.Timestamp.now(),
            'covariates': covariates or {}
        }

        if stratum_key is not None:
            log_entry['stratum'] = stratum_key

        self.assignment_log.append(log_entry)

        return group

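    # Determinism sketch (hypothetical unit id): because assignment is keyed
    # on a stable hash of unit_id, re-running the experiment reproduces the
    # same A/B split.
    #
    #   test.assign('day_0001')  # -> same group on every run
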
    def record_result(self,
                      unit_id: str,
                      group: str,
                      primary_metric: float,
                      guardrail_metrics: Optional[Dict] = None):
        """
        Record the outcome for an assigned unit.

        primary_metric: usually P&L or Sharpe contribution
        guardrail_metrics: risk metrics (drawdown, volatility, etc.)
        """
        result = {
            'unit_id': unit_id,
            'group': group,
            'primary': primary_metric,
            'guardrails': guardrail_metrics or {},
            'timestamp': pd.Timestamp.now()
        }

        if group == 'A':
            self.group_a_results.append(result)
        else:
            self.group_b_results.append(result)

        self.n_observations += 1

    def analyze(self,
                metric: str = 'primary',
                test_type: str = 't_test') -> Dict:
        """
        Statistical analysis of A vs B.

        test_type:
        - 't_test': Welch's t-test (assumes approximate normality)
        - 'mann_whitney': non-parametric, robust to outliers
        - 'permutation': distribution-free via resampling
        - 'bootstrap': confidence intervals via resampling
        """
        a_values = [r[metric] for r in self.group_a_results]
        b_values = [r[metric] for r in self.group_b_results]

        if len(a_values) < 3 or len(b_values) < 3:
            return {
                'status': 'insufficient_data',
                'n_a': len(a_values),
                'n_b': len(b_values),
                'required_n': self.config.required_samples()
            }

        a_arr = np.array(a_values)
        b_arr = np.array(b_values)

        # Descriptive stats
        results = {
            'n_a': len(a_arr),
            'n_b': len(b_arr),
            'mean_a': np.mean(a_arr),
            'mean_b': np.mean(b_arr),
            'std_a': np.std(a_arr, ddof=1),
            'std_b': np.std(b_arr, ddof=1),
            'median_a': np.median(a_arr),
            'median_b': np.median(b_arr),
        }

        # Effect size (Cohen's d)
        pooled_std = np.sqrt((results['std_a']**2 + results['std_b']**2) / 2)
        cohens_d = (results['mean_b'] - results['mean_a']) / (pooled_std + 1e-10)
        results['cohens_d'] = cohens_d
        results['effect_size_interpretation'] = self._interpret_cohens_d(abs(cohens_d))

        # Statistical tests
        if test_type == 't_test':
            # Welch's variant: does not assume equal variances
            t_stat, p_value = stats.ttest_ind(a_arr, b_arr, equal_var=False)
            results['test'] = 'welch_t_test'
            results['t_statistic'] = t_stat
            results['p_value'] = p_value

        elif test_type == 'mann_whitney':
            u_stat, p_value = stats.mannwhitneyu(a_arr, b_arr, alternative='two-sided')
            results['test'] = 'mann_whitney_u'
            results['u_statistic'] = u_stat
            results['p_value'] = p_value

        elif test_type == 'permutation':
            observed_diff = np.mean(b_arr) - np.mean(a_arr)
            all_values = np.concatenate([a_arr, b_arr])
            n = len(a_arr)

            perm_diffs = []
            for _ in range(10000):
                np.random.shuffle(all_values)
                perm_a = all_values[:n]
                perm_b = all_values[n:]
                perm_diffs.append(np.mean(perm_b) - np.mean(perm_a))

            perm_diffs = np.array(perm_diffs)
            # Two-sided p-value: fraction of permuted differences at least
            # as extreme as the observed one
            p_value = np.mean(np.abs(perm_diffs) >= np.abs(observed_diff))

            results['test'] = 'permutation'
            results['observed_difference'] = observed_diff
            results['p_value'] = p_value
            results['ci_95'] = (
                np.percentile(perm_diffs, 2.5),
                np.percentile(perm_diffs, 97.5)
            )

        elif test_type == 'bootstrap':
            boot_diffs = []
            for _ in range(10000):
                boot_a = np.random.choice(a_arr, size=len(a_arr), replace=True)
                boot_b = np.random.choice(b_arr, size=len(b_arr), replace=True)
                boot_diffs.append(np.mean(boot_b) - np.mean(boot_a))

            boot_diffs = np.array(boot_diffs)
            results['test'] = 'bootstrap'
            results['ci_95'] = (
                np.percentile(boot_diffs, 2.5),
                np.percentile(boot_diffs, 97.5)
            )
            results['ci_99'] = (
                np.percentile(boot_diffs, 0.5),
                np.percentile(boot_diffs, 99.5)
            )
            # Two-sided bootstrap p-value: twice the smaller tail probability
            tail = min(np.mean(boot_diffs <= 0), np.mean(boot_diffs >= 0))
            results['p_value'] = min(2 * tail, 1.0)

        # Statistical significance
        results['significant'] = results.get('p_value', 1.0) < self.config.alpha
        results['alpha'] = self.config.alpha

        # Practical significance: is the standardized difference larger than
        # the minimum effect we set out to detect?
        mean_diff = results['mean_b'] - results['mean_a']
        standardized_diff = abs(mean_diff) / (pooled_std + 1e-10)

        results['practically_significant'] = standardized_diff > self.config.min_detectable_effect
        results['practical_threshold'] = self.config.min_detectable_effect

        # Recommendation
        if results['significant'] and results['practically_significant']:
            results['recommendation'] = 'ADOPT_B' if mean_diff > 0 else 'KEEP_A'
        else:
            results['recommendation'] = 'INCONCLUSIVE'

        return results

    def _interpret_cohens_d(self, d: float) -> str:
        """Interpret effect size using Cohen's conventional thresholds."""
        if d < 0.2:
            return 'negligible'
        elif d < 0.5:
            return 'small'
        elif d < 0.8:
            return 'medium'
        else:
            return 'large'

    def guardrail_check(self) -> Dict:
        """Check whether B violates guardrail metrics (risk limits)."""
        checks = {}

        # Collect guardrail metrics per group
        a_guardrails = defaultdict(list)
        b_guardrails = defaultdict(list)

        for r in self.group_a_results:
            for k, v in r['guardrails'].items():
                a_guardrails[k].append(v)

        for r in self.group_b_results:
            for k, v in r['guardrails'].items():
                b_guardrails[k].append(v)

        # Compare medians and flag metrics where B is materially worse
        violations = []

        for metric in a_guardrails.keys():
            if not b_guardrails[metric]:
                continue  # no observations for this metric in group B

            median_a = np.median(a_guardrails[metric])
            median_b = np.median(b_guardrails[metric])

            # Metric-specific thresholds
            if 'drawdown' in metric.lower():
                # Lower drawdown is better
                if median_b > median_a * 1.5:
                    violations.append({
                        'metric': metric,
                        'severity': 'high' if median_b > median_a * 2 else 'medium',
                        'a_median': median_a,
                        'b_median': median_b,
                        'direction': 'worse'
                    })
            elif 'volatility' in metric.lower() or 'var' in metric.lower():
                # Lower volatility / VaR is better
                if median_b > median_a * 1.3:
                    violations.append({
                        'metric': metric,
                        'severity': 'high' if median_b > median_a * 1.5 else 'medium',
                        'a_median': median_a,
                        'b_median': median_b,
                        'direction': 'worse'
                    })

        checks['violations'] = violations
        checks['is_safe'] = len(violations) == 0
        checks['n_metrics_checked'] = len(a_guardrails)

        return checks

    def get_counterfactual(self,
                           unit_id: str,
                           strategy_fn: Callable,
                           data: Dict) -> Dict:
        """
        Counterfactual: what would have happened with the OTHER strategy?

        Useful for:
        - Causal inference: treatment effect estimation
        - Variance reduction: use both A and B predictions
        """
        # Look up the assigned group
        assigned = [log for log in self.assignment_log if log['unit_id'] == unit_id]

        if not assigned:
            return {'error': 'Unit not found'}

        actual_group = assigned[0]['group']
        counterfactual_group = 'B' if actual_group == 'A' else 'A'

        # Compute the counterfactual outcome
        counterfactual_outcome = strategy_fn(data, counterfactual_group)

        return {
            'unit_id': unit_id,
            'actual_group': actual_group,
            'counterfactual_group': counterfactual_group,
            'counterfactual_outcome': counterfactual_outcome,
            'note': 'Counterfactuals are hypothetical; both outcomes can never be observed for the same unit'
        }

    def summary_report(self) -> str:
        """Generate a human-readable summary report."""
        analysis = self.analyze()
        guardrails = self.guardrail_check()

        required = self.config.required_samples()
        powered = analysis['n_a'] >= required and analysis['n_b'] >= required

        report = f"""
{'='*70}
A/B TEST REPORT: {self.config.strategy_a_name} vs {self.config.strategy_b_name}
{'='*70}

SAMPLE SIZE
  Group A:  {analysis['n_a']} units
  Group B:  {analysis['n_b']} units
  Required: {required} per group
  Status:   {'✓ Sufficient' if powered else '⚠ Under-powered'}

PRIMARY METRIC: {analysis.get('test', 'N/A')}
  A mean: {analysis.get('mean_a', 0):.6f} (±{analysis.get('std_a', 0):.6f})
  B mean: {analysis.get('mean_b', 0):.6f} (±{analysis.get('std_b', 0):.6f})
  Difference: {analysis.get('mean_b', 0) - analysis.get('mean_a', 0):+.6f}
  Cohen's d:  {analysis.get('cohens_d', 0):.3f} ({analysis.get('effect_size_interpretation', 'N/A')})

  P-value: {analysis.get('p_value', 'N/A')}
  Significant (α={self.config.alpha}): {'✓ YES' if analysis.get('significant') else '✗ NO'}
  Practically significant: {'✓ YES' if analysis.get('practically_significant') else '✗ NO'}

RECOMMENDATION: {analysis.get('recommendation', 'N/A')}

GUARDRAIL METRICS
  Status: {'✓ Safe' if guardrails['is_safe'] else '⚠ VIOLATIONS DETECTED'}
  Violations: {len(guardrails['violations'])}
"""

        if guardrails['violations']:
            for v in guardrails['violations']:
                report += f"  - {v['metric']}: {v['severity'].upper()} (B is {v['direction']})\n"

        report += f"\n{'='*70}\n"

        return report

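# Minimal end-to-end sketch of the ABTest API (hypothetical names: ret_a,
# ret_b, daily_results; a full simulation lives in the __main__ demo below):
#
#   test = ABTest(config, diversion_unit='day', stratify_by=['volatility_regime'])
#   for day, (ret_a, ret_b, regime) in enumerate(daily_results):
#       group = test.assign(f'day_{day}', {'volatility_regime': regime})
#       test.record_result(f'day_{day}', group, ret_a if group == 'A' else ret_b)
#   print(test.analyze(test_type='permutation')['p_value'])
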
class MultipleComparisonCorrection:
    """
    Correct for testing multiple hypotheses simultaneously.

    Running 20 A/B tests at p=0.05? Expect ~1 false positive by chance alone.
    Without correction, you'll adopt roughly one bad strategy per 20 tests.
    """

    @staticmethod
    def bonferroni(p_values: np.ndarray, alpha: float = 0.05) -> Tuple[float, np.ndarray]:
        """
        Bonferroni correction: alpha_corrected = alpha / n_tests.

        Conservative: controls the family-wise error rate (FWER).
        """
        n = len(p_values)
        corrected_alpha = alpha / n
        is_significant = p_values < corrected_alpha

        return corrected_alpha, is_significant

    @staticmethod
    def benjamini_hochberg(p_values: np.ndarray, alpha: float = 0.05) -> np.ndarray:
        """
        Benjamini-Hochberg: controls the False Discovery Rate (FDR).

        Less conservative than Bonferroni: accepts that some fraction of
        "discoveries" will be false.
        """
        n = len(p_values)
        sorted_idx = np.argsort(p_values)
        sorted_p = p_values[sorted_idx]

        is_significant = np.zeros(n, dtype=bool)

        # Step-up procedure: find the LARGEST k with p_(k) <= (k/n) * alpha,
        # then reject ALL hypotheses with rank <= k. (Stopping at the first
        # threshold failure would be incorrect for a step-up procedure.)
        thresholds = (np.arange(1, n + 1) / n) * alpha
        passing = np.nonzero(sorted_p <= thresholds)[0]
        if passing.size > 0:
            k_max = passing[-1]
            is_significant[sorted_idx[:k_max + 1]] = True

        return is_significant

    @staticmethod
    def holm(p_values: np.ndarray, alpha: float = 0.05) -> np.ndarray:
        """
        Holm's step-down procedure.

        Controls FWER; uniformly more powerful than Bonferroni.
        """
        n = len(p_values)
        sorted_idx = np.argsort(p_values)
        sorted_p = p_values[sorted_idx]

        is_significant = np.zeros(n, dtype=bool)

        # Step-down: reject in order of increasing p-value and stop at the
        # first hypothesis that fails its threshold alpha / (n - rank + 1)
        for i in range(n):
            threshold = alpha / (n - i)
            if sorted_p[i] <= threshold:
                is_significant[sorted_idx[i]] = True
            else:
                break

        return is_significant

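# Sketch comparing the corrections on a toy p-value vector (made-up numbers):
# Bonferroni tests every p against 0.05/5 = 0.01, while BH admits more
# discoveries and Holm sits in between.
#
#   ps = np.array([0.001, 0.012, 0.030, 0.045, 0.200])
#   MultipleComparisonCorrection.bonferroni(ps)          # only 0.001 passes
#   MultipleComparisonCorrection.benjamini_hochberg(ps)  # 0.001, 0.012, 0.030 pass
#   MultipleComparisonCorrection.holm(ps)                # 0.001 and 0.012 pass
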
class SequentialABTest:
    """
    Sequential A/B testing with valid early stopping.

    Problem: peeking at results and stopping when p < 0.05 inflates the
    Type I error rate.
    Solution: use sequential boundaries (always-valid p-values).

    Based on: "Peeking at A/B Tests" (Johari et al., 2017).
    """

    def __init__(self,
                 config: ExperimentConfig,
                 spending_function: str = 'obrien_fleming'):
        self.config = config
        # Stored for future use; the adjustment below uses a simple log(n)
        # inflation rather than a full alpha-spending boundary.
        self.spending_function = spending_function

        self.observations = []
        self.cumsum_a = 0.0
        self.cumsum_b = 0.0
        self.cumsum_sq_a = 0.0
        self.cumsum_sq_b = 0.0
        self.n_a = 0
        self.n_b = 0

    def update(self, group: str, value: float) -> Dict:
        """Add one observation and re-test for significance."""
        if group == 'A':
            self.cumsum_a += value
            self.cumsum_sq_a += value ** 2
            self.n_a += 1
        else:
            self.cumsum_b += value
            self.cumsum_sq_b += value ** 2
            self.n_b += 1

        self.observations.append({'group': group, 'value': value})

        # Compute the always-valid p-value after each observation
        return self._compute_always_valid_p()

    def _compute_always_valid_p(self) -> Dict:
        """Compute an always-valid p-value for early stopping."""
        if self.n_a < 2 or self.n_b < 2:
            return {'n': len(self.observations), 'p_value': 1.0, 'can_stop': False}

        # Sample means
        mean_a = self.cumsum_a / self.n_a
        mean_b = self.cumsum_b / self.n_b

        # Sample variances (clipped at 0 to absorb floating-point error)
        var_a = max((self.cumsum_sq_a - self.n_a * mean_a**2) / (self.n_a - 1), 0.0)
        var_b = max((self.cumsum_sq_b - self.n_b * mean_b**2) / (self.n_b - 1), 0.0)

        # Standard error of the difference in means
        se = np.sqrt(var_a / self.n_a + var_b / self.n_b)

        # Z-statistic
        z = (mean_b - mean_a) / (se + 1e-10)

        # Always-valid adjustment: inflate the fixed-sample p-value so it
        # remains (approximately) valid under continuous monitoring.
        # Crude approximation to a mixture boundary: multiply by log(n_eff).
        n_eff = min(self.n_a, self.n_b)
        raw_p = 2 * (1 - stats.norm.cdf(abs(z)))
        adjusted_p = min(raw_p * np.log(max(n_eff, np.e)), 1.0)

        can_stop = adjusted_p < self.config.alpha

        return {
            'n': len(self.observations),
            'n_a': self.n_a,
            'n_b': self.n_b,
            'mean_a': mean_a,
            'mean_b': mean_b,
            'z_statistic': z,
            'raw_p_value': raw_p,
            'adjusted_p_value': adjusted_p,
            'can_stop': can_stop,
            'recommendation': 'STOP' if can_stop else 'CONTINUE'
        }

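# Sequential-monitoring sketch (hypothetical iterator live_stream): feed
# observations one at a time and stop as soon as the always-valid p-value
# clears alpha; the __main__ demo below runs a simulated version.
#
#   seq = SequentialABTest(config)
#   for group, pnl in live_stream:
#       state = seq.update(group, pnl)
#       if state['can_stop']:
#           print('stop at n =', state['n'], 'p =', state['adjusted_p_value'])
#           break
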
599
+ if __name__ == '__main__':
600
+ print("=" * 70)
601
+ print(" A/B TESTING FRAMEWORK FOR STRATEGIES")
602
+ print("=" * 70)
603
+
604
+ np.random.seed(42)
605
+
606
+ # Configuration
607
+ config = ExperimentConfig(
608
+ strategy_a_name='Baseline_Momentum',
609
+ strategy_b_name='ML_Alpha_v3',
610
+ alpha=0.05,
611
+ power=0.80,
612
+ min_detectable_effect=0.05, # Detect 0.05 Sharpe difference
613
+ baseline_sharpe=1.0
614
+ )
615
+
616
+ # Power analysis
617
+ required_n = config.required_samples()
618
+ print(f"\n1. POWER ANALYSIS")
619
+ print(f" Required sample size per group: {required_n}")
620
+ print(f" (Detect Sharpe diff of {config.min_detectable_effect} with {config.power*100:.0f}% power)")
621
+
622
+ # Run A/B test
623
+ print(f"\n2. SIMULATED A/B TEST")
624
+ test = ABTest(config, diversion_unit='day', stratify_by=['volatility_regime'])
625
+
626
+ # Simulate 400 days
627
+ n_days = 400
628
+
629
+ # Strategy A: Sharpe = 0.8
630
+ # Strategy B: Sharpe = 1.2 (better by 0.4)
631
+ daily_vol = 0.15 / np.sqrt(252)
632
+
633
+ for day in range(n_days):
634
+ # Volatility regime (for stratification)
635
+ regime = 'high' if np.random.rand() < 0.2 else 'normal'
636
+
637
+ # Assign
638
+ unit_id = f'day_{day:04d}'
639
+ group = test.assign(unit_id, {'volatility_regime': regime})
640
+
641
+ # Simulate returns
642
+ if group == 'A':
643
+ # Baseline: mean = 0.8 * daily_vol
644
+ ret = np.random.normal(0.8 * daily_vol, daily_vol)
645
+ else:
646
+ # Better: mean = 1.2 * daily_vol
647
+ ret = np.random.normal(1.2 * daily_vol, daily_vol)
648
+
649
+ # Guardrails
650
+ guardrails = {
651
+ 'max_drawdown': abs(np.random.exponential(0.02)),
652
+ 'daily_vol': abs(np.random.normal(daily_vol, daily_vol * 0.3))
653
+ }
654
+
655
+ test.record_result(unit_id, group, ret, guardrails)
656
+
657
+ # Analysis
658
+ analysis = test.analyze(test_type='t_test')
659
+
660
+ print(f"\n3. STATISTICAL RESULTS")
661
+ print(f" Group A (n={analysis['n_a']}): mean={analysis['mean_a']:.6f}")
662
+ print(f" Group B (n={analysis['n_b']}): mean={analysis['mean_b']:.6f}")
663
+ print(f" Difference: {analysis['mean_b'] - analysis['mean_a']:+.6f}")
664
+ print(f" Cohen's d: {analysis['cohens_d']:.3f}")
665
+ print(f" P-value: {analysis['p_value']:.4f}")
666
+ print(f" Significant: {'✓ YES' if analysis['significant'] else '✗ NO'}")
667
+ print(f" RECOMMENDATION: {analysis['recommendation']}")
668
+
669
+ # Guardrails
670
+ guardrail_check = test.guardrail_check()
671
+ print(f"\n4. GUARDRAIL CHECK")
672
+ print(f" Safe: {'✓ YES' if guardrail_check['is_safe'] else '✗ VIOLATIONS'}")
673
+
674
+ # Multiple comparison
675
+ print(f"\n5. MULTIPLE COMPARISON CORRECTION")
676
+ p_values = np.array([analysis['p_value'], 0.03, 0.08, 0.001, 0.12, 0.04])
677
+
678
+ bh_sig = MultipleComparisonCorrection.benjamini_hochberg(p_values)
679
+ print(f" Raw significant: {np.sum(p_values < 0.05)}/{len(p_values)}")
680
+ print(f" BH-FDR significant: {np.sum(bh_sig)}/{len(p_values)}")
681
+
682
+ # Full report
683
+ print(f"\n6. FULL REPORT")
684
+ print(test.summary_report())
685
+
686
+ # Sequential test
687
+ print(f"7. SEQUENTIAL TESTING")
688
+ seq_test = SequentialABTest(config)
689
+
690
+ for i in range(200):
691
+ group = 'A' if np.random.rand() < 0.5 else 'B'
692
+ value = np.random.normal(0.8 * daily_vol if group == 'A' else 1.2 * daily_vol, daily_vol)
693
+ result = seq_test.update(group, value)
694
+
695
+ if result['can_stop']:
696
+ print(f" Sequential test STOPPED at n={result['n']}")
697
+ print(f" Adjusted p-value: {result['adjusted_p_value']:.4f}")
698
+ break
699
+
700
+ print(f"\n KEY TAKEAWAYS:")
701
+ print(f" - Always A/B test before deploying")
702
+ print(f" - Multiple comparison correction prevents false discoveries")
703
+ print(f" - Guardrail metrics prevent hidden risk increases")
704
+ print(f" - Sequential testing enables early stopping (with valid p-values)")
705
+ print(f" - Power analysis ensures tests aren't underpowered")
706
+ print(f" - This is EXACTLY how Jane Street validates every strategy change")