ScottzillaSystems committed
Commit 354e067 · verified · 1 Parent(s): 880bd2d

Upload tests/stress_test_recovery.py

Files changed (1)
  1. tests/stress_test_recovery.py +269 -0
tests/stress_test_recovery.py ADDED
@@ -0,0 +1,269 @@
#!/usr/bin/env python3
"""
Stress-test: Catastrophic Failure Injection
===========================================
Intentionally triggers failures to verify self-healing recovery.

Failures injected:
1. NaN injection in loss → should trigger rollback + halve LR
2. Simulated OOM → should trigger batch halving + grad checkpointing
3. API error → should trigger exponential backoff

The NaN-recovery test requires a GPU (it is skipped otherwise); the other
tests run on CPU. Run with:
    python tests/stress_test_recovery.py
"""
import os, sys, json, time, math, gc
from unittest.mock import MagicMock  # used by test_postmortem_written

import torch
import torch.nn as nn
from transformers import (
    AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments,
    TrainerCallback, TrainerControl, TrainerState,
)
from datasets import Dataset

from self_healing import (
    SelfHealingTrainer, HealingConfig, SelfHealingCallback,
    HealingActions, FailureType, FAILURE_RECIPES,
)
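
# The exact contents of FAILURE_RECIPES are not exercised here; this suite only
# assumes it maps each FailureType to an ordered list of recovery actions that
# matches the module docstring, roughly (illustrative only; apart from
# "halve_learning_rate", which stress test 3 calls directly, the names below are
# assumptions):
#     nan_loss  : roll back to the last checkpoint, then halve the learning rate
#     oom       : halve the batch size, then enable gradient checkpointing
#     api_error : retry the call with exponential backoff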


class NaNInjectorCallback(TrainerCallback):
    """Intentionally inject NaN into the loss at a specific step."""

    def __init__(self, inject_at_step: int = 10):
        self.inject_at_step = inject_at_step
        self.original_forward = None
        self._injected = False

    def on_step_begin(self, args, state, control, **kwargs):
        if state.global_step == self.inject_at_step and not self._injected:
            self._injected = True
            print(f"\n [INJECT] Forcing NaN at step {state.global_step}\n")
            # Override the model's forward to return NaN
            model = kwargs.get("model")
            if model is not None:
                self.original_forward = model.forward

                def nan_forward(*a, **kw):
                    result = self.original_forward(*a, **kw)
                    # Multiply rather than replace so the loss keeps its grad_fn and
                    # the NaN propagates through backward() and into the logged loss.
                    result.loss = result.loss * float("nan")
                    return result

                model.forward = nan_forward


def test_nan_recovery():
    """
    Test: Inject NaN → verify SelfHealingTrainer detects and recovers.
    """
    print("\n" + "=" * 60)
    print(" STRESS TEST 1: NaN Recovery")
    print("=" * 60)

    # Tiny model
    model_id = "HuggingFaceTB/SmolLM2-135M"
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch.float32,  # float32 for NaN safety
        device_map="auto" if torch.cuda.is_available() else None,
    )
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Create a dummy dataset; labels mirror input_ids so the causal LM returns a loss
    texts = ["The quick brown fox jumps over the lazy dog."] * 100
    encodings = [tokenizer.encode(t, truncation=True, max_length=32) for t in texts]
    ds = Dataset.from_dict({
        "text": texts,
        "input_ids": encodings,
        "attention_mask": [[1] * len(ids) for ids in encodings],
        "labels": encodings,
    })

    training_args = TrainingArguments(
        output_dir="./stress-nan-output",
        per_device_train_batch_size=2,
        learning_rate=1e-4,
        max_steps=30,
        logging_steps=1,
        logging_strategy="steps",
        logging_first_step=True,
        save_steps=100,
        report_to="none",
        disable_tqdm=True,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=ds,
        tokenizer=tokenizer,
        callbacks=[NaNInjectorCallback(inject_at_step=10)],
    )

    healing_config = HealingConfig(
        nan_patience=1,  # React immediately
        max_recovery_attempts=3,
        max_lr_reductions=3,
        zclip_enabled=False,
        postmortem_path="./stress-nan-postmortem.json",
    )

    sh = SelfHealingTrainer(trainer, healing_config)
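
    # With nan_patience=1, the first NaN logged after step 10 should be classified
    # as a "nan_loss" failure and handled per the module docstring (rollback plus
    # halving the LR); the assertions below check both the failure type and that
    # at least one LR reduction was recorded.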

    print("Training with NaN injection at step 10...")
    result = sh.train()

    print("\nResults:")
    print(f" Converged: {sh.converged}")
    print(f" Attempts: {sh.attempt}")
    print(f" Recoveries: {len(sh.recovery_history)}")

    if sh.recovery_history:
        for rec in sh.recovery_history:
            print(f" → {rec['failure']}: {rec['actions']}")

    # Verify: should have at least one recovery for NaN
    assert len(sh.recovery_history) >= 1, "Expected NaN recovery!"
    assert any(r["failure"] == "nan_loss" for r in sh.recovery_history), \
        "Expected nan_loss failure type!"

    # Verify LR was reduced
    assert sh.healing_callback.lr_reductions >= 1, \
        "Expected LR to be reduced!"

    print(" ✓ NaN recovery test PASSED")

    if os.path.exists(healing_config.postmortem_path):
        with open(healing_config.postmortem_path) as f:
            pm = json.load(f)
        print(f" Postmortem: {pm.get('exit_reason')} at step {pm.get('last_step')}")


def test_zclip_spike_detection():
    """
    Test: Feed spike values to ZClip → verify clipping.
    """
    print("\n" + "=" * 60)
    print(" STRESS TEST 2: ZClip Spike Detection")
    print("=" * 60)

    from self_healing import ZClip

    zclip = ZClip(z_threshold=2.5, ema_decay=0.9)
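
    # ZClip's internals are not exercised directly here; the test only assumes it
    # keeps running (EMA) estimates of the gradient-norm mean and std and clips any
    # norm whose z-score exceeds z_threshold, roughly:
    #     z = (norm - ema_mean) / ema_std
    #     if z > z_threshold: clipped = ema_mean + z_threshold * ema_std
    # After ~100 updates at norm=10.0 the EMA sits tightly around 10, so a 500.0
    # spike is far beyond 2.5 sigma and must come back well below the raw value.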

    # Stabilize at norm=10.0
    for _ in range(100):
        zclip.update_and_clip(10.0)

    # Inject spike
    clipped = zclip.update_and_clip(500.0)

    print(f" Raw: 500.0, Clipped: {clipped:.1f}, Clips: {zclip.clip_count}")
    assert clipped < 500.0, "Expected spike to be clipped!"
    assert zclip.clip_count >= 1, "Expected clip counter to increment!"
    print(" ✓ ZClip spike detection PASSED")


def test_healing_config_limits():
    """
    Test: Verify that max reduction limits are enforced.
    """
    print("\n" + "=" * 60)
    print(" STRESS TEST 3: Recovery Limits")
    print("=" * 60)

    config = HealingConfig(
        max_lr_reductions=2,
        max_batch_reductions=2,
    )

    # Test LR limit
    args = TrainingArguments(
        output_dir="/tmp",
        learning_rate=1e-4,
        per_device_train_batch_size=4,
        gradient_accumulation_steps=1,
    )
    cb = SelfHealingCallback(config)
    actions = HealingActions(config, cb)
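
    # halve_learning_rate is assumed to mutate args.learning_rate in place (the
    # print further down reads it back), so two halvings of 1e-4 should leave it
    # at 2.5e-5 and the callback's lr_reductions counter at 2; a third attempt
    # should be refused once max_lr_reductions=2 is reached.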

    # Reduce twice
    actions._apply_single("halve_learning_rate", args, {})
    actions._apply_single("halve_learning_rate", args, {})
    assert cb.lr_reductions == 2

    # Third reduction should hit limit
    result = actions._apply_single("halve_learning_rate", args, {})
    assert "MAX" in result
    assert cb.lr_reductions == 2  # Should not increment

    print(f" LR after 2 reductions: {args.learning_rate:.2e}")
    print(f" Third attempt: {result}")
    print(" ✓ Recovery limits test PASSED")


def test_postmortem_written():
    """
    Test: Verify postmortem.json is written on crash.
    """
    print("\n" + "=" * 60)
    print(" STRESS TEST 4: Postmortem Generation")
    print("=" * 60)

    import tempfile

    with tempfile.TemporaryDirectory() as tmpdir:
        config = HealingConfig(
            postmortem_path=os.path.join(tmpdir, "postmortem.json"),
        )
        cb = SelfHealingCallback(config)

        # Simulate exception
        cb.on_exception(
            MagicMock(),  # args
            MagicMock(global_step=42, log_history=[{"loss": 1.5}]),  # state
            MagicMock(),  # control
            torch.cuda.OutOfMemoryError("CUDA out of memory. Tried to allocate 2.00 GiB"),  # exception
        )
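
        # The assertions below pin down the minimum assumed shape of the postmortem
        # JSON; other fields, such as the exit_reason read back in test 1, may also
        # be present. Roughly:
        #     {"exception_type": "OutOfMemoryError", "last_step": 42,
        #      "final_metrics": {"loss": 1.5}, ...}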

        # Check postmortem exists
        assert os.path.exists(config.postmortem_path)

        with open(config.postmortem_path) as f:
            pm = json.load(f)

        assert pm["exception_type"] == "OutOfMemoryError"
        assert pm["last_step"] == 42
        assert "loss" in pm["final_metrics"]
        assert pm["final_metrics"]["loss"] == 1.5

        print(f" Postmortem path: {config.postmortem_path}")
        print(f" Content: {json.dumps(pm, indent=2)}")
        print(" ✓ Postmortem generation PASSED")


if __name__ == "__main__":
    print("╔" + "═" * 58 + "╗")
    print("║ SELF-HEALING TRAINING SYSTEM — STRESS TEST SUITE ║")
    print("╚" + "═" * 58 + "╝")

    # Run tests (order matters: ZClip first, no GPU needed)
    test_zclip_spike_detection()
    test_healing_config_limits()
    test_postmortem_written()

    # NaN recovery test (needs model loading)
    if torch.cuda.is_available():
        test_nan_recovery()
    else:
        print("\n" + "=" * 60)
        print(" STRESS TEST 1: NaN Recovery")
        print("=" * 60)
        print(" ⚠ Skipped: No GPU available")

    print("\n" + "=" * 60)
    print(" ALL STRESS TESTS PASSED ✓")
    print("=" * 60)