meccatronis committed on
Commit
faedde1
·
verified ·
1 Parent(s): 653a0bd

Upload alert_system.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. alert_system.py +785 -0
alert_system.py ADDED
@@ -0,0 +1,785 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Alert and Notification System
4
+
5
+ Provides comprehensive alerting for GPU monitoring with multiple notification
6
+ channels, threshold management, and alert history tracking.
7
+ """
8
+
9
import json
import logging
import smtplib
import sqlite3
import subprocess
import threading
import time
from dataclasses import dataclass, asdict
from datetime import datetime, timedelta
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from pathlib import Path
from typing import Dict, List, Optional, Callable, Any

from gpu_monitoring import GPUStatus, GPUDataManager
23
+
24
+ logger = logging.getLogger(__name__)
25
+
26
+
27
+ @dataclass
28
+ class AlertThreshold:
29
+ """Alert threshold configuration."""
30
+ metric: str
31
+ threshold: float
32
+ operator: str # '>', '<', '>=', '<=', '=='
33
+ duration: int # seconds to maintain threshold before alerting
34
+ enabled: bool = True
35
+ cooldown: int = 300 # seconds between same alerts
36
+
37
+
38
+ @dataclass
39
+ class Alert:
40
+ """Alert instance."""
41
+ id: str
42
+ timestamp: float
43
+ gpu_name: str
44
+ metric: str
45
+ value: float
46
+ threshold: float
47
+ message: str
48
+ severity: str # 'info', 'warning', 'critical', 'emergency'
49
+ acknowledged: bool = False
50
+ resolved: bool = False
51
+ resolved_at: Optional[float] = None
52
+
53
+
54
+ class NotificationChannel:
55
+ """Base class for notification channels."""
56
+
57
+ def __init__(self, name: str, enabled: bool = True):
58
+ self.name = name
59
+ self.enabled = enabled
60
+
61
+ def send(self, alert: Alert) -> bool:
62
+ """Send notification for alert."""
63
+ raise NotImplementedError
64
+
65
+
66
+ class LogNotification(NotificationChannel):
67
+ """Log-based notification channel."""
68
+
69
+ def send(self, alert: Alert) -> bool:
70
+ """Send alert to log."""
71
+ severity_colors = {
72
+ 'info': '\033[94m', # Blue
73
+ 'warning': '\033[93m', # Yellow
74
+ 'critical': '\033[91m', # Red
75
+ 'emergency': '\033[41m' # Red background
76
+ }
77
+ reset_color = '\033[0m'
78
+
79
+ color = severity_colors.get(alert.severity, '')
80
+
81
+ message = (
82
+ f"{color}[{alert.severity.upper()}] GPU Alert: {alert.gpu_name} - "
83
+ f"{alert.metric}: {alert.value} (threshold: {alert.threshold}){reset_color}"
84
+ )
85
+
86
+ if alert.severity in ['critical', 'emergency']:
87
+ logger.critical(message)
88
+ elif alert.severity == 'warning':
89
+ logger.warning(message)
90
+ else:
91
+ logger.info(message)
92
+
93
+ return True
94
+
95
+
96
+ class DesktopNotification(NotificationChannel):
97
+ """Desktop notification channel using notify-send."""
98
+
99
+ def send(self, alert: Alert) -> bool:
100
+ """Send desktop notification."""
101
+ try:
102
+ urgency = 'normal'
103
+ if alert.severity == 'critical':
104
+ urgency = 'critical'
105
+ elif alert.severity == 'warning':
106
+ urgency = 'normal'
107
+ else:
108
+ urgency = 'low'
109
+
110
+ # Create notification command
111
+ cmd = [
112
+ 'notify-send',
113
+ '--app-name=GPU Monitor',
114
+ f'--urgency={urgency}',
115
+ f'--icon=video-display',
116
+ f'GPU Alert - {alert.gpu_name}',
117
+ f'{alert.message}'
118
+ ]
119
+
120
+ subprocess.run(cmd, check=True)
121
+ return True
122
+
123
+ except subprocess.CalledProcessError as e:
124
+ logger.error(f"Failed to send desktop notification: {e}")
125
+ return False
126
+ except FileNotFoundError:
127
+ logger.warning("notify-send not found, desktop notifications disabled")
128
+ return False
129
+
130
+
131
+ class EmailNotification(NotificationChannel):
132
+ """Email notification channel."""
133
+
134
+ def __init__(self, name: str, smtp_config: Dict[str, Any], recipients: List[str]):
135
+ super().__init__(name)
136
+ self.smtp_config = smtp_config
137
+ self.recipients = recipients
138
+
139
+ def send(self, alert: Alert) -> bool:
140
+ """Send email notification."""
141
+ try:
142
+ # Create message
143
+ msg = MIMEMultipart()
144
+ msg['From'] = self.smtp_config['sender']
145
+ msg['To'] = ', '.join(self.recipients)
146
+ msg['Subject'] = f'GPU Alert: {alert.gpu_name} - {alert.severity.upper()}'
147
+
148
+ # Create email body
149
+ body = f"""
150
+ GPU Monitoring Alert
151
+
152
+ GPU: {alert.gpu_name}
153
+ Metric: {alert.metric}
154
+ Current Value: {alert.value}
155
+ Threshold: {alert.threshold}
156
+ Severity: {alert.severity.upper()}
157
+ Time: {datetime.fromtimestamp(alert.timestamp).strftime('%Y-%m-%d %H:%M:%S')}
158
+
159
+ Message: {alert.message}
160
+
161
+ This is an automated message from the GPU Monitoring System.
162
+ """
163
+
164
+ msg.attach(MIMEText(body, 'plain'))
165
+
166
+ # Send email
167
+ with smtplib.SMTP(self.smtp_config['server'], self.smtp_config['port']) as server:
168
+ if self.smtp_config.get('use_tls', True):
169
+ server.starttls()
170
+
171
+ if 'username' in self.smtp_config and 'password' in self.smtp_config:
172
+ server.login(self.smtp_config['username'], self.smtp_config['password'])
173
+
174
+ server.send_message(msg)
175
+
176
+ return True
177
+
178
+ except Exception as e:
179
+ logger.error(f"Failed to send email notification: {e}")
180
+ return False
181
+
182
+
183
+ class WebhookNotification(NotificationChannel):
184
+ """Webhook notification channel."""
185
+
186
+ def __init__(self, name: str, webhook_url: str, headers: Optional[Dict[str, str]] = None):
187
+ super().__init__(name)
188
+ self.webhook_url = webhook_url
189
+ self.headers = headers or {}
190
+
191
+ def send(self, alert: Alert) -> bool:
192
+ """Send webhook notification."""
193
+ try:
194
+ import requests
195
+
196
+ payload = {
197
+ 'alert_id': alert.id,
198
+ 'timestamp': alert.timestamp,
199
+ 'gpu_name': alert.gpu_name,
200
+ 'metric': alert.metric,
201
+ 'value': alert.value,
202
+ 'threshold': alert.threshold,
203
+ 'message': alert.message,
204
+ 'severity': alert.severity,
205
+ 'acknowledged': alert.acknowledged
206
+ }
207
+
208
+ response = requests.post(
209
+ self.webhook_url,
210
+ json=payload,
211
+ headers=self.headers,
212
+ timeout=10
213
+ )
214
+
215
+ return response.status_code == 200
216
+
217
+ except Exception as e:
218
+ logger.error(f"Failed to send webhook notification: {e}")
219
+ return False
220
+
221
+
222
class AlertManager:
    """Main alert management system.

    Samples GPU status on a background thread, evaluates configured
    thresholds (with hold-duration and per-alert cooldown handling),
    persists triggered alerts via GPUDataManager and fans them out to
    every enabled notification channel.
    """

    def __init__(self, config_file: str = "config/alerts.json", db_path: str = "data/gpu_monitoring.db"):
        self.config_file = config_file
        self.db_manager = GPUDataManager(db_path)

        # Alert state tracking.
        # NOTE: the state/cooldown keys are 3-tuples (gpu, metric, threshold),
        # so two thresholds on the same metric are tracked independently.
        self.active_alerts = {}     # alert_id -> Alert
        self.threshold_states = {}  # key -> {'value': float, 'start_time': float}
        self.last_alert_times = {}  # key -> epoch seconds of last alert (cooldown)

        # Notification channels (populated by setup_channels).
        self.channels = []

        # Configuration.
        self.thresholds = []
        self.alert_settings = {}

        # Background checker thread.
        self.running = False
        self.thread = None

        # Load configuration, then build channels from it.
        self.load_config()
        self.setup_channels()

    def load_config(self):
        """Load thresholds and settings from the JSON config file.

        Falls back to creating (and activating) a default configuration
        when the file is missing or unreadable.
        """
        try:
            if Path(self.config_file).exists():
                with open(self.config_file, 'r') as f:
                    config = json.load(f)

                # Load thresholds.
                self.thresholds = []
                for threshold_data in config.get('thresholds', []):
                    self.thresholds.append(AlertThreshold(**threshold_data))

                # Load settings, with sane defaults if the key is absent.
                self.alert_settings = config.get('settings', {
                    'check_interval': 5.0,
                    'cleanup_interval': 3600.0,
                    'max_alert_age': 86400.0  # 24 hours
                })

                logger.info(f"Loaded {len(self.thresholds)} alert thresholds")
            else:
                self.create_default_config()

        except Exception as e:
            logger.error(f"Error loading alert config: {e}")
            self.create_default_config()

    def create_default_config(self):
        """Write a default alert configuration to disk and activate it."""
        default_config = {
            'thresholds': [
                {
                    'metric': 'temperature',
                    'threshold': 75.0,
                    'operator': '>=',
                    'duration': 10,
                    'enabled': True,
                    'cooldown': 300
                },
                {
                    'metric': 'temperature',
                    'threshold': 85.0,
                    'operator': '>=',
                    'duration': 5,
                    'enabled': True,
                    'cooldown': 600
                },
                {
                    'metric': 'load',
                    'threshold': 90.0,
                    'operator': '>=',
                    'duration': 30,
                    'enabled': True,
                    'cooldown': 600
                },
                {
                    'metric': 'power_draw',
                    'threshold': 200.0,
                    'operator': '>=',
                    'duration': 10,
                    'enabled': True,
                    'cooldown': 300
                },
                {
                    'metric': 'fan_speed',
                    'threshold': 95.0,
                    'operator': '>=',
                    'duration': 60,
                    'enabled': True,
                    'cooldown': 1800
                }
            ],
            'settings': {
                'check_interval': 5.0,
                'cleanup_interval': 3600.0,
                'max_alert_age': 86400.0
            },
            'notifications': {
                'log': {'enabled': True},
                'desktop': {'enabled': True},
                'email': {
                    'enabled': False,
                    'smtp': {
                        'server': 'smtp.gmail.com',
                        'port': 587,
                        'use_tls': True,
                        'sender': 'your-email@gmail.com',
                        'username': 'your-username',
                        'password': 'your-app-password'
                    },
                    'recipients': ['admin@example.com']
                },
                'webhook': {
                    'enabled': False,
                    'url': 'https://your-webhook-url.com/alerts',
                    'headers': {'Authorization': 'Bearer your-token'}
                }
            }
        }

        # Save default config.
        Path(self.config_file).parent.mkdir(parents=True, exist_ok=True)
        with open(self.config_file, 'w') as f:
            json.dump(default_config, f, indent=2)

        # BUGFIX: previously the defaults were only written to disk, leaving
        # self.thresholds empty (no alerting at all) until the next restart.
        # Activate them in memory as well.
        self.thresholds = [AlertThreshold(**t) for t in default_config['thresholds']]
        self.alert_settings = dict(default_config['settings'])

        logger.info("Created default alert configuration")

    def setup_channels(self):
        """Build notification channels from the 'notifications' config section.

        NOTE(review): calling this twice appends duplicate channels; it is
        only invoked once from __init__.
        """
        try:
            if Path(self.config_file).exists():
                with open(self.config_file, 'r') as f:
                    config = json.load(f)

                notifications = config.get('notifications', {})

                # Log channel (enabled unless explicitly turned off).
                if notifications.get('log', {}).get('enabled', True):
                    self.channels.append(LogNotification('log'))

                # Desktop channel.
                if notifications.get('desktop', {}).get('enabled', True):
                    self.channels.append(DesktopNotification('desktop'))

                # Email channel (requires SMTP config and recipients).
                email_config = notifications.get('email', {})
                if email_config.get('enabled', False):
                    smtp_config = email_config.get('smtp', {})
                    recipients = email_config.get('recipients', [])
                    if smtp_config and recipients:
                        self.channels.append(EmailNotification(
                            'email', smtp_config, recipients
                        ))

                # Webhook channel (requires a URL).
                webhook_config = notifications.get('webhook', {})
                if webhook_config.get('enabled', False):
                    url = webhook_config.get('url')
                    headers = webhook_config.get('headers', {})
                    if url:
                        self.channels.append(WebhookNotification('webhook', url, headers))

                logger.info(f"Setup {len(self.channels)} notification channels")

        except Exception as e:
            logger.error(f"Error setting up notification channels: {e}")

    def add_threshold(self, threshold: "AlertThreshold"):
        """Add a new alert threshold and persist the configuration."""
        self.thresholds.append(threshold)
        self.save_config()

    def remove_threshold(self, metric: str, threshold_value: float):
        """Remove every threshold matching (metric, threshold_value) and persist."""
        self.thresholds = [
            t for t in self.thresholds
            if not (t.metric == metric and t.threshold == threshold_value)
        ]
        self.save_config()

    def save_config(self):
        """Save current thresholds, settings and channel config to file."""
        try:
            config = {
                'thresholds': [asdict(t) for t in self.thresholds],
                'settings': self.alert_settings,
                'notifications': {
                    'log': {'enabled': any(isinstance(c, LogNotification) for c in self.channels)},
                    'desktop': {'enabled': any(isinstance(c, DesktopNotification) for c in self.channels)},
                }
            }

            # Add email and webhook configs if those channels exist.
            email_channel = next((c for c in self.channels if isinstance(c, EmailNotification)), None)
            if email_channel:
                config['notifications']['email'] = {
                    'enabled': True,
                    'smtp': email_channel.smtp_config,
                    'recipients': email_channel.recipients
                }

            webhook_channel = next((c for c in self.channels if isinstance(c, WebhookNotification)), None)
            if webhook_channel:
                config['notifications']['webhook'] = {
                    'enabled': True,
                    'url': webhook_channel.webhook_url,
                    'headers': webhook_channel.headers
                }

            with open(self.config_file, 'w') as f:
                json.dump(config, f, indent=2)

        except Exception as e:
            logger.error(f"Error saving alert config: {e}")

    def check_thresholds(self, gpu_name: str, status: "GPUStatus"):
        """Evaluate every enabled threshold against *status* for one GPU.

        Tracks how long each condition has held; fires an alert only when
        the hold duration is reached and the cooldown has elapsed. Resets
        tracking as soon as the condition clears.
        """
        current_time = time.time()

        for threshold in self.thresholds:
            if not threshold.enabled:
                continue

            # Get the metric value; skip metrics this status can't provide.
            metric_value = self.get_metric_value(status, threshold.metric)
            if metric_value is None:
                continue

            state_key = (gpu_name, threshold.metric, threshold.threshold)

            if self.check_operator(metric_value, threshold.operator, threshold.threshold):
                # Start or continue tracking this breached condition.
                if state_key not in self.threshold_states:
                    self.threshold_states[state_key] = {
                        'value': metric_value,
                        'start_time': current_time
                    }
                else:
                    self.threshold_states[state_key]['value'] = metric_value

                # Fire only once the condition has held long enough.
                state = self.threshold_states[state_key]
                held_for = current_time - state['start_time']

                if held_for >= threshold.duration:
                    # Respect the per-threshold cooldown.
                    if state_key in self.last_alert_times:
                        time_since_last = current_time - self.last_alert_times[state_key]
                        if time_since_last < threshold.cooldown:
                            continue

                    alert = self.create_alert(gpu_name, threshold, metric_value)
                    self.trigger_alert(alert)

                    # Restart the cooldown clock.
                    self.last_alert_times[state_key] = current_time

            else:
                # Condition cleared: forget the hold timer so the next breach
                # must hold for the full duration again.
                if state_key in self.threshold_states:
                    del self.threshold_states[state_key]

    def get_metric_value(self, status: "GPUStatus", metric: str) -> Optional[float]:
        """Extract the named metric from a GPU status, or None if unknown."""
        if metric == 'temperature':
            return status.temperature
        elif metric == 'load':
            return status.load
        elif metric == 'power_draw':
            return status.power_draw
        elif metric == 'fan_speed':
            # Fan is reported as a 0-255 PWM duty cycle; convert to percent.
            return (status.fan_pwm / 255) * 100
        elif metric == 'memory_usage':
            if status.memory_total > 0:
                return (status.memory_used / status.memory_total) * 100
            return 0
        elif metric == 'efficiency':
            return status.efficiency
        else:
            return None

    def check_operator(self, value: float, operator: str, threshold: float) -> bool:
        """Return True when *value* satisfies `value <operator> threshold`.

        Unknown operators evaluate to False (fail-safe: no alert).
        """
        if operator == '>':
            return value > threshold
        elif operator == '<':
            return value < threshold
        elif operator == '>=':
            return value >= threshold
        elif operator == '<=':
            return value <= threshold
        elif operator == '==':
            return value == threshold
        else:
            return False

    def create_alert(self, gpu_name: str, threshold: "AlertThreshold", value: float) -> "Alert":
        """Build an Alert for a breached threshold (does not send it)."""
        alert_id = f"{gpu_name}_{threshold.metric}_{threshold.threshold}_{int(time.time())}"

        # Map metric/threshold to a severity. Temperature escalates with the
        # configured threshold (>=85 emergency, >=75 critical); other metrics
        # have fixed severities.
        severity = 'info'
        if threshold.metric == 'temperature':
            if threshold.threshold >= 85:
                severity = 'emergency'
            elif threshold.threshold >= 75:
                severity = 'critical'
            else:
                severity = 'warning'
        elif threshold.metric == 'load':
            severity = 'warning'
        elif threshold.metric == 'power_draw':
            severity = 'critical'
        elif threshold.metric == 'fan_speed':
            severity = 'warning'

        message = f"{threshold.metric} ({value}) exceeded threshold ({threshold.threshold}) for {threshold.duration}s"

        return Alert(
            id=alert_id,
            timestamp=time.time(),
            gpu_name=gpu_name,
            metric=threshold.metric,
            value=value,
            threshold=threshold.threshold,
            message=message,
            severity=severity
        )

    def trigger_alert(self, alert: "Alert"):
        """Persist an alert, mark it active and send it on every enabled channel."""
        # Store alert in database first so it survives a crash mid-notify.
        self.db_manager.save_alert(alert)

        # Add to active alerts.
        self.active_alerts[alert.id] = alert

        # Send notifications; one failing channel must not block the others.
        for channel in self.channels:
            if channel.enabled:
                try:
                    if not channel.send(alert):
                        logger.warning(f"Failed to send alert via {channel.name}")
                except Exception as e:
                    logger.error(f"Error sending alert via {channel.name}: {e}")

        logger.info(f"Alert triggered: {alert.message}")

    def resolve_alert(self, alert_id: str):
        """Mark an active alert resolved and drop it from the active set.

        NOTE(review): resolution is only recorded in memory, not written
        back to the database row.
        """
        if alert_id in self.active_alerts:
            alert = self.active_alerts[alert_id]
            alert.resolved = True
            alert.resolved_at = time.time()

            del self.active_alerts[alert_id]

            logger.info(f"Alert resolved: {alert.message}")

    def get_active_alerts(self) -> List["Alert"]:
        """Return a snapshot list of the currently active alerts."""
        return list(self.active_alerts.values())

    def get_alert_history(self, hours: int = 24) -> List["Alert"]:
        """Read alerts from the last *hours* out of the database, newest first.

        Returns [] on any error.
        """
        try:
            cutoff_time = time.time() - (hours * 3600)

            # BUGFIX: sqlite3 was used here without ever being imported,
            # so every call raised NameError; it is now a module import.
            with sqlite3.connect(self.db_manager.db_path) as conn:
                cursor = conn.cursor()
                cursor.execute('''
                    SELECT id, timestamp, gpu_name, metric, value, threshold, message, severity,
                           acknowledged, resolved, resolved_at
                    FROM alerts
                    WHERE timestamp >= ?
                    ORDER BY timestamp DESC
                ''', (cutoff_time,))
                rows = cursor.fetchall()

            return [
                Alert(
                    id=row[0],
                    timestamp=row[1],
                    gpu_name=row[2],
                    metric=row[3],
                    value=row[4],
                    threshold=row[5],
                    message=row[6],
                    severity=row[7],
                    acknowledged=bool(row[8]),
                    resolved=bool(row[9]),
                    resolved_at=row[10]
                )
                for row in rows
            ]

        except Exception as e:
            logger.error(f"Error getting alert history: {e}")
            return []

    def cleanup_old_alerts(self):
        """Drop in-memory alerts, hold states and cooldowns older than max_alert_age."""
        current_time = time.time()
        max_age = self.alert_settings.get('max_alert_age', 86400.0)

        # Clean up old active alerts.
        stale_alerts = [
            alert_id for alert_id, alert in self.active_alerts.items()
            if current_time - alert.timestamp > max_age
        ]
        for alert_id in stale_alerts:
            del self.active_alerts[alert_id]

        # Clean up old threshold hold states.
        stale_states = [
            key for key, state in self.threshold_states.items()
            if current_time - state['start_time'] > max_age
        ]
        for key in stale_states:
            del self.threshold_states[key]

        # Clean up expired cooldown timestamps.
        stale_cooldowns = [
            key for key, ts in self.last_alert_times.items()
            if current_time - ts > max_age
        ]
        for key in stale_cooldowns:
            del self.last_alert_times[key]

    def start(self):
        """Start the background checker thread (no-op if already running)."""
        if self.running:
            return

        self.running = True
        self.thread = threading.Thread(target=self.run, daemon=True)
        self.thread.start()
        logger.info("Alert manager started")

    def stop(self):
        """Signal the loop to stop and wait for the thread to finish.

        join() may block up to one check interval while run() sleeps.
        """
        self.running = False
        if self.thread:
            self.thread.join()
        logger.info("Alert manager stopped")

    def run(self):
        """Main alert manager loop: poll GPU status, evaluate thresholds, clean up."""
        check_interval = self.alert_settings.get('check_interval', 5.0)
        cleanup_interval = self.alert_settings.get('cleanup_interval', 3600.0)
        last_cleanup = time.time()

        while self.running:
            try:
                # BUGFIX: GPUManager was referenced without being imported,
                # so every iteration raised NameError and the monitor never
                # checked anything. Imported lazily to avoid a hard
                # dependency at module load time.
                # NOTE(review): presumably GPUManager lives in
                # gpu_monitoring alongside GPUStatus — confirm.
                from gpu_monitoring import GPUManager

                # A fresh manager each cycle re-discovers GPUs; kept as in
                # the original rather than hoisted, in case re-init matters.
                gpu_manager = GPUManager()
                if gpu_manager.initialize():
                    status_dict = gpu_manager.get_status()

                    # Check thresholds for each GPU that reported a status.
                    for gpu_name, status in status_dict.items():
                        if status:
                            self.check_thresholds(gpu_name, status)

                # Periodically purge stale in-memory state.
                current_time = time.time()
                if current_time - last_cleanup >= cleanup_interval:
                    self.cleanup_old_alerts()
                    last_cleanup = current_time

                time.sleep(check_interval)

            except Exception as e:
                logger.error(f"Error in alert manager loop: {e}")
                time.sleep(5)  # Wait before retrying
724
+
725
+
726
class AlertAPI:
    """Thin serialization layer over AlertManager for API consumers.

    Converts Alert dataclasses to plain dicts and wraps mutating calls
    in boolean success results.
    """

    def __init__(self, alert_manager: AlertManager):
        self.alert_manager = alert_manager

    def get_active_alerts(self) -> List[Dict[str, Any]]:
        """Return the currently-active alerts as plain dicts."""
        return [asdict(a) for a in self.alert_manager.get_active_alerts()]

    def get_alert_history(self, hours: int = 24) -> List[Dict[str, Any]]:
        """Return alerts from the last *hours* as plain dicts."""
        return [asdict(a) for a in self.alert_manager.get_alert_history(hours)]

    def acknowledge_alert(self, alert_id: str) -> bool:
        """Mark an active alert acknowledged; False if it is not active."""
        active = self.alert_manager.active_alerts
        if alert_id not in active:
            return False
        active[alert_id].acknowledged = True
        return True

    def resolve_alert(self, alert_id: str) -> bool:
        """Resolve an alert; always reports success (resolution is idempotent)."""
        self.alert_manager.resolve_alert(alert_id)
        return True

    def add_threshold(self, threshold_data: Dict[str, Any]) -> bool:
        """Build a threshold from a plain dict and register it; False on bad data."""
        try:
            self.alert_manager.add_threshold(AlertThreshold(**threshold_data))
        except Exception as e:
            logger.error(f"Error adding threshold: {e}")
            return False
        return True

    def remove_threshold(self, metric: str, threshold_value: float) -> bool:
        """Delete matching thresholds; False only on unexpected errors."""
        try:
            self.alert_manager.remove_threshold(metric, threshold_value)
        except Exception as e:
            logger.error(f"Error removing threshold: {e}")
            return False
        return True
772
+
773
+
774
+ if __name__ == "__main__":
775
+ # Test alert system
776
+ logging.basicConfig(level=logging.INFO)
777
+
778
+ alert_manager = AlertManager()
779
+ alert_manager.start()
780
+
781
+ try:
782
+ while True:
783
+ time.sleep(1)
784
+ except KeyboardInterrupt:
785
+ alert_manager.stop()