ScottzillaSystems committed on
Commit
5c15f3b
·
verified ·
1 Parent(s): d8afe20

Upload healer.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. healer.py +479 -0
healer.py ADDED
@@ -0,0 +1,479 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Autonomous Self-Healing System for Hugging Face Spaces
4
+ Monitors, diagnoses, fixes errors, and minimizes costs automatically.
5
+
6
+ Usage:
7
+ python healer.py --daemon # Run continuously
8
+ python healer.py --once # Single check cycle
9
+ python healer.py --space <id> # Check specific space
10
+ python healer.py --report # Generate cost report
11
+ """
12
+
13
+ import os
14
+ import sys
15
+ import time
16
+ import json
17
+ import argparse
18
+ import traceback
19
+ from datetime import datetime, timedelta
20
+ from dataclasses import dataclass, asdict
21
+ from typing import List, Dict, Optional, Tuple
22
+ from collections import defaultdict
23
+
24
+ from huggingface_hub import HfApi, SpaceHardware, SpaceRuntime
25
+
26
+
27
+ # ─── Configuration ───────────────────────────────────────────────────────────
28
+
29
# Tunable settings read by SpaceHealer: polling cadence, restart cap,
# hourly price table and the cost-cutting switches.
HEALER_CONFIG = {
    # Seconds slept between two daemon cycles.
    "poll_interval_seconds": 60,
    # Restart cap compared against SpaceState.restart_count_1h.
    "max_restarts_per_hour": 5,
    "oom_downgrade_hw": "cpu-basic",
    # Sleep time (seconds) applied to paid hardware that has none configured.
    "default_sleep_seconds": 5 * 60,
    # Hourly price per hardware flavour, keyed by the lower-cased hardware name.
    # Flavours missing from this table are treated as free (rate 0.0).
    "cost_per_hour": {
        "cpu-basic": 0.0,
        "cpu-upgrade": 0.03,
        "t4-small": 0.4,
        "t4-medium": 0.6,
        "a10g-small": 1.0,
        "a10g-large": 1.5,
        "a10g-largex2": 3.0,
        "a100-large": 2.5,
        "l4x1": 0.8,
        "l40sx1": 1.8,
    },
    "auto_pause_on_error": True,
    "auto_sleep_idle": True,
    "idle_sleep_threshold_seconds": 5 * 60,
    # Switches consulted by SpaceHealer.optimize_costs.
    "cost_cutting": {
        "pause_broken_spaces": True,
        "downgrade_oom_to_cpu": True,
        "set_auto_sleep_on_paid_hw": True,
        "pause_during_night_hours": False,
        # Hours of the day, compared against datetime.utcnow().hour.
        "night_hours": {"start": 2, "end": 8},
    },
}

# Spaces to monitor; returned as the fallback when auto-discovery fails.
MONITORED_SPACES: List[str] = []
60
+
61
# Fix playbook: diagnosis label → remediation entry.
# SpaceHealer._heal looks an entry up by the label returned from _diagnose and
# reads its "action" (dispatch key) and "description" (copied into the
# HealAction). "cost_action" is not read by the code in this file; it only
# documents the intended cost measure.
FIX_PLAYBOOK = {
    "BUILD_ERROR": {
        "action": "restart",
        "description": "Build failed — restart to retry",
        "cost_action": "none",
    },
    "RUNTIME_ERROR": {
        "action": "restart_then_pause_if_repeated",
        "description": "App crashed — restart, pause if keeps failing",
        "cost_action": "pause_after_3_failures",
    },
    "OOM": {
        "action": "downgrade_and_restart",
        "description": "Out of memory — downgrade to CPU, restart",
        "cost_action": "downgrade_to_cpu",
    },
    "PAUSED": {
        "action": "restart_if_should_be_active",
        "description": "Space paused — restart if in active hours",
        "cost_action": "none",
    },
    "SLEEPING": {
        "action": "restart_on_demand_only",
        "description": "Space sleeping — let visitors wake it",
        "cost_action": "none",
    },
    "NO_APP_FILE": {
        "action": "alert_human",
        "description": "Missing app file — requires code fix",
        "cost_action": "pause",
    },
    "HARDWARE_PENDING": {
        "action": "wait",
        "description": "Hardware change pending — wait for provisioning",
        "cost_action": "none",
    },
}
99
+
100
+
101
+ # ─── Data Structures ─────────────────────────────────────────────────────────
102
+
103
@dataclass
class SpaceState:
    """Snapshot of one Space as recorded by a single SpaceHealer.check_space call."""

    # Space identifier as passed to check_space.
    repo_id: str
    # Runtime stage string; "UNKNOWN" when the runtime lookup itself failed.
    stage: str
    # Hardware currently in use and hardware requested; they differ while a
    # hardware change is pending.
    hardware: Optional[str]
    requested_hardware: Optional[str]
    # Configured sleep time; None when the runtime reports none.
    sleep_time: Optional[int]
    # ISO-8601 timestamp (from datetime.utcnow()) of the check.
    last_checked: str
    # Error text reported for the Space, or the lookup exception text.
    error_message: Optional[str] = None
    # Restart events recorded for this Space during the last hour.
    restart_count_1h: int = 0
    total_uptime_minutes: float = 0.0
    # Rough estimate: hourly rate of the current hardware times 24.
    estimated_cost_today: float = 0.0
115
+
116
@dataclass
class HealAction:
    """Record of one remediation decision taken for a Space."""

    # Playbook action name, e.g. "restart" or "wait".
    action: str
    # Human-readable explanation copied from the playbook entry.
    description: str
    # Whether the healer counted the action as carried out.
    executed: bool
    # Short outcome string, e.g. "restarted" or "left_sleeping".
    result: str
    # ISO-8601 timestamp of the decision.
    timestamp: str
123
+
124
+
125
+ # ─── Core Healer Class ───────────────────────────────────────────────────────
126
+
127
class SpaceHealer:
    """Monitor Hugging Face Spaces, apply playbook fixes and trim hardware costs.

    All bookkeeping (state cache, restart history, fix log, deliberate pauses)
    is held in memory only, so it starts empty in every new process.
    """

    # Lower-case fragments of an error message that are treated as an OOM crash.
    _OOM_MARKERS = ("killed", "oom", "out of memory", "cuda out of memory")

    # Stages for which no daily cost is estimated.
    # NOTE(review): assumes paused/sleeping/stopped Spaces are not billed —
    # confirm against the Hub billing documentation.
    _IDLE_STAGES = ("PAUSED", "SLEEPING", "STOPPED")

    # Stages the cost pass regards as broken.
    _BROKEN_STAGES = ("RUNTIME_ERROR", "BUILD_ERROR", "NO_APP_FILE")

    # Spaces whose repo id contains one of these fragments are never night-paused.
    _NIGHT_PAUSE_PROTECTED = (
        "Cydonia-24B-Chat",
        "Qwen3.5-27B-Claude-4.6-Opus-Reasoning-Distilled",
    )

    def __init__(self, token: Optional[str] = None):
        """Create the Hub client.

        Args:
            token: Access token; when omitted the HF_TOKEN environment
                variable is used.
        """
        self.api = HfApi(token=token or os.getenv("HF_TOKEN"))
        self.history: Dict[str, List[Dict]] = defaultdict(list)
        self.state_cache: Dict[str, SpaceState] = {}
        self.fix_log: List[Dict] = []
        # repo_id -> reason for Spaces this healer paused on purpose, so the
        # PAUSED playbook entry does not immediately restart them again.
        self._deliberate_pauses: Dict[str, str] = {}
        # repo_id -> ISO timestamp of the healer's last successful API action.
        self._last_touch: Dict[str, str] = {}

    # ── Small helpers ──────────────────────────────────────────────────────

    @staticmethod
    def _is_night_hour(hour: int, start: int, end: int) -> bool:
        """Return True when *hour* lies in the half-open window [start, end).

        Windows that wrap past midnight (start > end) are supported; an empty
        window (start == end) never matches.
        """
        if start == end:
            return False
        if start < end:
            return start <= hour < end
        return hour >= start or hour < end

    @classmethod
    def _looks_like_oom(cls, message: Optional[str]) -> bool:
        """Return True when *message* contains one of the OOM marker fragments."""
        if not message:
            return False
        lowered = message.lower()
        return any(marker in lowered for marker in cls._OOM_MARKERS)

    @staticmethod
    def _extract_error_message(runtime) -> Optional[str]:
        """Pull the error text out of a runtime object.

        Looks for an ``errorMessage`` attribute first and then for an
        ``errorMessage`` key in the ``raw`` payload dict.
        NOTE(review): assumes the Hub reports the error under that key of
        ``SpaceRuntime.raw`` — confirm against a real errored Space.
        """
        message = getattr(runtime, "errorMessage", None)
        if message:
            return message
        raw = getattr(runtime, "raw", None)
        if isinstance(raw, dict):
            return raw.get("errorMessage")
        return None

    def _mark_touched(self, repo_id: str) -> str:
        """Remember that the healer just changed *repo_id*; return the timestamp."""
        stamp = datetime.utcnow().isoformat()
        self._last_touch[repo_id] = stamp
        return stamp

    def _restart(self, repo_id: str, fail_prefix: str = "restart_failed",
                 limit: int = 80) -> Tuple[bool, str]:
        """Restart a Space and record the restart in the history.

        Returns:
            ``(True, "restarted")`` on success, otherwise
            ``(False, "<fail_prefix>: <truncated error>")``.
        """
        try:
            self.api.restart_space(repo_id)
        except Exception as e:
            return False, f"{fail_prefix}: {str(e)[:limit]}"
        stamp = self._mark_touched(repo_id)
        # Count restarts the healer issued itself: a crash-looping Space may
        # never be observed in RUNNING between two polls.
        self.history[repo_id].append({"event": "restart", "time": stamp})
        self._deliberate_pauses.pop(repo_id, None)
        return True, "restarted"

    def _pause(self, repo_id: str, reason: str, ok_result: str) -> Tuple[bool, str]:
        """Pause a Space and remember why, so it is not restarted right away.

        Returns:
            ``(True, ok_result)`` on success, otherwise
            ``(False, "pause_failed: <truncated error>")``.
        """
        try:
            self.api.pause_space(repo_id)
        except Exception as e:
            return False, f"pause_failed: {str(e)[:80]}"
        self._mark_touched(repo_id)
        self._deliberate_pauses[repo_id] = reason
        return True, ok_result

    def _count_recent_restarts(self, repo_id: str, now: datetime) -> int:
        """Count restart events of the last hour and drop older history entries."""
        cutoff = (now - timedelta(hours=1)).isoformat()
        recent = [event for event in self.history[repo_id] if event["time"] > cutoff]
        self.history[repo_id] = recent
        return sum(1 for event in recent if event["event"] == "restart")

    def _pause_hold_reason(self, repo_id: str) -> Optional[str]:
        """Return why a paused Space must stay paused, or None if it may restart."""
        reason = self._deliberate_pauses.get(repo_id)
        if reason != "night":
            return reason
        cc = HEALER_CONFIG["cost_cutting"]
        still_night = cc["pause_during_night_hours"] and self._is_night_hour(
            datetime.utcnow().hour, cc["night_hours"]["start"], cc["night_hours"]["end"]
        )
        return "night" if still_night else None

    # ── Discovery ──────────────────────────────────────────────────────────

    def discover_spaces(self, author: str = "ScottzillaSystems") -> List[str]:
        """Return the ids of all Spaces listed for *author*.

        Uses the Hub client (and therefore its token) instead of a hand-built
        HTTP request. On any failure the error is printed and a copy of
        MONITORED_SPACES is returned.
        """
        try:
            return [space.id for space in self.api.list_spaces(author=author)]
        except Exception as e:
            print(f"[Healer] Discovery failed: {e}")
            return list(MONITORED_SPACES)

    # ── Monitoring ─────────────────────────────────────────────────────────

    def check_space(self, repo_id: str) -> "Tuple[SpaceState, Optional[HealAction]]":
        """Check one Space, heal it if a problem is diagnosed.

        Returns:
            The fresh SpaceState plus the HealAction taken, or None when no
            problem was diagnosed. If the runtime lookup fails, a state with
            stage "UNKNOWN" is returned and nothing is cached or healed.
        """
        checked_at = datetime.utcnow()
        now = checked_at.isoformat()

        try:
            runtime = self.api.get_space_runtime(repo_id)
        except Exception as e:
            return SpaceState(
                repo_id=repo_id, stage="UNKNOWN", hardware=None,
                requested_hardware=None, sleep_time=None,
                last_checked=now, error_message=str(e),
            ), None

        state = SpaceState(
            repo_id=repo_id,
            stage=runtime.stage,
            hardware=runtime.hardware,
            requested_hardware=runtime.requested_hardware,
            sleep_time=runtime.sleep_time,
            last_checked=now,
            error_message=self._extract_error_message(runtime),
        )

        # Rough daily estimate: hourly rate x 24, zero for idle stages.
        hw = (runtime.hardware or "cpu-basic").lower()
        cost_rate = HEALER_CONFIG["cost_per_hour"].get(hw, 0.0)
        state.estimated_cost_today = (
            0.0 if state.stage in self._IDLE_STAGES else cost_rate * 24
        )

        state.restart_count_1h = self._count_recent_restarts(repo_id, checked_at)
        self.state_cache[repo_id] = state

        diagnosis = self._diagnose(state)
        if diagnosis:
            return state, self._heal(repo_id, state, diagnosis)
        return state, None

    def _diagnose(self, state: "SpaceState") -> Optional[str]:
        """Map a state to a playbook label, or None when nothing is wrong."""
        stage = state.stage

        if stage == "RUNTIME_ERROR":
            return "OOM" if self._looks_like_oom(state.error_message) else "RUNTIME_ERROR"
        if stage in ("BUILD_ERROR", "PAUSED", "SLEEPING", "NO_APP_FILE"):
            return stage
        if state.requested_hardware and state.requested_hardware != state.hardware:
            return "HARDWARE_PENDING"
        return None

    def _run_playbook_action(self, action_name: str, repo_id: str,
                             state: "SpaceState") -> Tuple[bool, str]:
        """Carry out one playbook action; return ``(executed, result)``.

        ``executed`` is True only when the action actually went through.
        NOTE(review): "restart" and "downgrade_and_restart" are not subject to
        max_restarts_per_hour, so a permanently failing build is restarted on
        every cycle — consider capping them as well.
        """
        if action_name == "restart":
            return self._restart(repo_id, fail_prefix="error", limit=100)

        if action_name == "restart_then_pause_if_repeated":
            if state.restart_count_1h >= HEALER_CONFIG["max_restarts_per_hour"]:
                return self._pause(
                    repo_id, "repeated_failures",
                    f"paused_after_{state.restart_count_1h}_restarts",
                )
            return self._restart(repo_id)

        if action_name == "downgrade_and_restart":
            try:
                self.api.request_space_hardware(repo_id, hardware=SpaceHardware.CPU_BASIC)
            except Exception as e:
                return False, f"downgrade_failed: {str(e)[:80]}"
            self._mark_touched(repo_id)
            time.sleep(3)  # short pause between the hardware request and the restart
            ok, result = self._restart(repo_id, fail_prefix="downgrade_failed")
            return ok, ("downgraded_to_cpu_and_restarted" if ok else result)

        if action_name == "restart_if_should_be_active":
            hold = self._pause_hold_reason(repo_id)
            if hold:
                # The healer paused this Space itself; restarting it would
                # only start a pause/restart loop.
                return False, f"left_paused_{hold}"
            return self._restart(repo_id)

        if action_name == "restart_on_demand_only":
            return False, "left_sleeping"

        if action_name == "alert_human":
            if HEALER_CONFIG["auto_pause_on_error"]:
                return self._pause(repo_id, "human_review", "paused_for_human_review")
            return True, "alerted_human"

        if action_name == "wait":
            return False, "waiting_for_provisioning"

        return False, "skipped"

    def _heal(self, repo_id: str, state: "SpaceState", diagnosis: str) -> "HealAction":
        """Execute the playbook entry for *diagnosis* and log the outcome.

        Never raises: unexpected errors are turned into an ``"error: ..."``
        result with ``executed=False``.
        """
        now = datetime.utcnow().isoformat()
        playbook = FIX_PLAYBOOK.get(
            diagnosis,
            {"action": "alert_human", "description": "Unknown issue", "cost_action": "none"},
        )
        action_name = playbook["action"]

        try:
            executed, result = self._run_playbook_action(action_name, repo_id, state)
        except Exception as e:
            executed, result = False, f"error: {str(e)[:100]}"

        action = HealAction(
            action=action_name,
            description=playbook["description"],
            executed=executed,
            result=result,
            timestamp=now,
        )
        self.fix_log.append({
            "repo_id": repo_id,
            "diagnosis": diagnosis,
            **asdict(action),
        })
        return action

    # ── Cost Optimization ──────────────────────────────────────────────────

    def optimize_costs(self, repo_id: str):
        """Apply cost-saving measures based on the cached state of *repo_id*.

        Does nothing when the Space has no cached state. The "broken Space"
        measures (1 and 2) are skipped when the healer already changed the
        Space since that state was taken, so a restart or downgrade issued by
        the heal step is not undone seconds later. Failures are printed
        instead of being swallowed.
        """
        state = self.state_cache.get(repo_id)
        if not state:
            return

        hw = (state.hardware or "cpu-basic").lower()
        cost_rate = HEALER_CONFIG["cost_per_hour"].get(hw, 0.0)
        cc = HEALER_CONFIG["cost_cutting"]
        saved = []
        failed = []
        already_handled = self._last_touch.get(repo_id, "") >= state.last_checked

        # 1. Pause broken spaces on ANY paid hardware
        if (cc["pause_broken_spaces"] and cost_rate > 0 and not already_handled
                and state.stage in self._BROKEN_STAGES):
            ok, detail = self._pause(repo_id, "broken_on_paid_hardware", "paused")
            if ok:
                saved.append(f"paused broken space (${cost_rate}/hr)")
            else:
                failed.append(detail)

        # 2. Downgrade OOM spaces to CPU
        if (cc["downgrade_oom_to_cpu"] and not already_handled
                and state.stage == "RUNTIME_ERROR"
                and self._looks_like_oom(state.error_message)):
            try:
                self.api.request_space_hardware(repo_id, hardware=SpaceHardware.CPU_BASIC)
                self._mark_touched(repo_id)
                saved.append("downgraded OOM to CPU")
            except Exception as e:
                failed.append(f"downgrade_failed: {str(e)[:80]}")

        # 3. Set auto-sleep on all paid hardware without it
        if cc["set_auto_sleep_on_paid_hw"] and cost_rate > 0 and state.sleep_time is None:
            try:
                self.api.set_space_sleep_time(
                    repo_id, sleep_time=HEALER_CONFIG["default_sleep_seconds"]
                )
                saved.append(f"auto-sleep {HEALER_CONFIG['default_sleep_seconds']}s")
            except Exception as e:
                failed.append(f"sleep_time_failed: {str(e)[:80]}")

        # 4. Night-hour pause for non-critical spaces
        if cc["pause_during_night_hours"]:
            is_night = self._is_night_hour(
                datetime.utcnow().hour, cc["night_hours"]["start"], cc["night_hours"]["end"]
            )
            is_protected = any(name in repo_id for name in self._NIGHT_PAUSE_PROTECTED)
            if is_night and cost_rate > 0 and state.stage == "RUNNING" and not is_protected:
                ok, detail = self._pause(repo_id, "night", "paused")
                if ok:
                    saved.append("night-pause")
                else:
                    failed.append(detail)

        if saved:
            print(f"[Healer] 💰 {repo_id}: {', '.join(saved)}")
        if failed:
            print(f"[Healer] ❌ {repo_id}: cost action failed: {', '.join(failed)}")

    # ── Reporting ──────────────────────────────────────────────────────────

    def generate_report(self) -> Dict:
        """Build the health and cost report from the cached states and fix log.

        ``actions_today`` counts fix-log entries stamped with today's UTC
        date; ``fix_log`` holds the 50 most recent entries.
        """
        today = datetime.utcnow().date().isoformat()
        states = list(self.state_cache.values())
        return {
            "generated_at": datetime.utcnow().isoformat(),
            "spaces": [asdict(state) for state in states],
            "total_estimated_daily_cost": sum(
                (state.estimated_cost_today for state in states), 0.0
            ),
            "actions_today": sum(
                1 for entry in self.fix_log if entry["timestamp"][:10] == today
            ),
            "fix_log": self.fix_log[-50:],  # last 50 actions
        }

    def print_report(self):
        """Print the report produced by generate_report to the console."""
        report = self.generate_report()
        print("\n" + "=" * 70)
        print(f"🩺 SPACE HEALER REPORT — {report['generated_at']}")
        print("=" * 70)
        print(f"\n💰 Total estimated daily cost: ${report['total_estimated_daily_cost']:.2f}")
        print(f"🔧 Auto-heal actions today: {report['actions_today']}\n")

        for s in report["spaces"]:
            if s['stage'] == "RUNNING":
                status_emoji = "🟢"
            elif s['stage'] in ("RUNTIME_ERROR", "BUILD_ERROR"):
                status_emoji = "🔴"
            else:
                status_emoji = "🟡"
            print(f"{status_emoji} {s['repo_id']}")
            print(f" Stage: {s['stage']} | HW: {s['hardware']} | Restarts/hr: {s['restart_count_1h']}")
            print(f" Est. daily cost: ${s['estimated_cost_today']:.2f}")
            if s['error_message']:
                message = s['error_message']
                # Only add the ellipsis when the text was actually cut.
                suffix = "..." if len(message) > 120 else ""
                print(f" Error: {message[:120]}{suffix}")
            print()

        if report["fix_log"]:
            print("Recent heal actions:")
            for a in report["fix_log"][-5:]:
                emoji = "✅" if a["executed"] else "⏸️"
                print(f" {emoji} [{a['timestamp'][:19]}] {a['repo_id']}: {a['action']} → {a['result']}")
        print("=" * 70 + "\n")

    # ── Main Loop ──────────────────────────────────────────────────────────

    def run_cycle(self, spaces: Optional[List[str]] = None):
        """Run one monitoring/healing cycle.

        Args:
            spaces: Space ids to check; discovered automatically when None.
        """
        if spaces is None:
            spaces = self.discover_spaces()

        print(f"[Healer] 🔍 Checking {len(spaces)} spaces at {datetime.utcnow().isoformat()}")

        for repo_id in spaces:
            state, action = self.check_space(repo_id)

            if action and action.executed:
                print(f"[Healer] 🔧 {repo_id}: {action.action} → {action.result}")
            elif action:
                print(f"[Healer] ⏸️ {repo_id}: {action.action} — {action.result}")
            else:
                print(f"[Healer] 🟢 {repo_id}: {state.stage}")

            # Cost optimization pass
            self.optimize_costs(repo_id)

    def run_daemon(self, spaces: Optional[List[str]] = None):
        """Run cycles forever, sleeping poll_interval_seconds between them.

        A failing cycle is reported with its traceback and the loop carries
        on; Ctrl-C ends the loop.
        """
        print("[Healer] 🤖 Autonomous self-healing daemon started")
        print(f"[Healer] Poll interval: {HEALER_CONFIG['poll_interval_seconds']}s")

        try:
            while True:
                try:
                    self.run_cycle(spaces)
                    self.print_report()
                except Exception as e:
                    print(f"[Healer] ❌ Cycle error: {e}")
                    traceback.print_exc()

                time.sleep(HEALER_CONFIG["poll_interval_seconds"])
        except KeyboardInterrupt:
            print("[Healer] Daemon stopped")
447
+
448
+
449
+ # ─── CLI ─────────────────────────────────────────────────────────────────────
450
+
451
def main():
    """Parse the command line and run the selected healer mode.

    Modes are checked in this order: --space, --report, --once, --daemon.
    With none of them given the help text is printed.
    """
    cli = argparse.ArgumentParser(description="Autonomous Space Self-Healer")
    cli.add_argument("--daemon", action="store_true", help="Run continuous monitoring")
    cli.add_argument("--once", action="store_true", help="Single check cycle")
    cli.add_argument("--space", help="Check specific space only")
    cli.add_argument("--report", action="store_true", help="Generate report")
    cli.add_argument("--discover", default="ScottzillaSystems", help="Namespace to discover")
    options = cli.parse_args()

    healer = SpaceHealer()

    # Single-space mode: check (and heal) just that Space, then report.
    if options.space:
        healer.check_space(options.space)
        healer.print_report()
        return

    # Report-only mode prints whatever the fresh healer has cached.
    if options.report:
        healer.print_report()
        return

    if options.once:
        targets = healer.discover_spaces(options.discover)
        healer.run_cycle(targets)
        healer.print_report()
        return

    if options.daemon:
        targets = healer.discover_spaces(options.discover)
        healer.run_daemon(targets)
        return

    cli.print_help()


if __name__ == "__main__":
    main()