meccatronis committed on
Commit
3838a08
·
verified ·
1 Parent(s): 85b135b

Upload gpu_monitoring.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. gpu_monitoring.py +600 -0
gpu_monitoring.py ADDED
@@ -0,0 +1,600 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ GPU Detection and Monitoring Module
4
+
5
+ Handles GPU detection, hardware monitoring, and data collection for AMD GPUs.
6
+ Supports multiple GPU models and provides a unified interface for monitoring.
7
+ """
8
+
9
+ import os
10
+ import glob
11
+ import time
12
+ import logging
13
+ import json
14
+ import sqlite3
15
+ from typing import Dict, List, Optional, Tuple, Any
16
+ from dataclasses import dataclass, asdict
17
+ from pathlib import Path
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+
22
@dataclass
class GPUInfo:
    """GPU information data structure.

    Static description of one detected GPU, including the sysfs paths
    the collector reads its sensors from.
    """
    name: str                # human-readable model name (see GPUHardwareDetector._get_gpu_name)
    vendor: str              # e.g. "AMD"
    device_id: str           # PCI device ID as a hex string, e.g. "0x73bf"
    bus_id: str              # DRM card identifier, e.g. "card0"
    hwmon_path: str          # sysfs hwmon directory for this GPU
    temp_sensor: str         # path to temp1_input (millidegrees Celsius)
    fan_control: str         # path to pwm1 (fan PWM duty, 0-255)
    power_sensor: str        # path to power1_input (microwatts)
    memory_total: int = 0    # total VRAM in MB (0 = not yet read)
    memory_used: int = 0     # used VRAM in MB (0 = not yet read)
35
+
36
+
37
@dataclass
class GPUStatus:
    """Current GPU status data structure (one sensor snapshot in time)."""
    timestamp: float         # epoch seconds when the snapshot was taken
    temperature: float       # degrees Celsius (0.0 when the sensor is unavailable)
    load: float              # GPU utilization percent (0.0 when unavailable)
    fan_speed: int           # fan speed in RPM (0 when unavailable)
    fan_pwm: int             # fan PWM duty, 0-255 (0 when unavailable)
    power_draw: float        # watts (0.0 when unavailable)
    memory_used: int         # VRAM used in MB (0 when unavailable)
    memory_total: int        # VRAM total in MB (0 when unavailable)
    core_clock: int          # core clock in MHz (0 when unavailable)
    memory_clock: int        # memory clock in MHz (0 when unavailable)
    voltage: float = 0.0     # volts; collector always writes 0.0 (not implemented yet)
    efficiency: float = 0.0  # load per watt (load / power_draw) when both readings exist
52
+
53
+
54
class GPUHardwareDetector:
    """Detects and identifies AMD GPU hardware through the Linux sysfs tree.

    Scans /sys/class/drm/card* entries, keeps only devices with the AMD
    PCI vendor ID (0x1002) and a usable hwmon directory, and records the
    sensor file paths in a GPUInfo record per GPU.
    """

    def __init__(self):
        # All known GPUs and the result of the most recent scan.
        self.gpus = []
        self.detected_gpus = []

    def detect_amd_gpus(self) -> List["GPUInfo"]:
        """Detect AMD GPUs in the system.

        Returns:
            A list of GPUInfo records, one per AMD GPU with a usable hwmon
            directory.  Non-AMD cards and unreadable entries are skipped.
        """
        logger.info("Detecting AMD GPUs...")

        gpus = []

        # /sys/class/drm also contains connector nodes (e.g. card0-DP-1);
        # those lack a device/vendor file and are skipped below.
        for card_path in glob.glob("/sys/class/drm/card*"):
            try:
                device_path = os.path.join(card_path, "device")
                vendor_file = os.path.join(device_path, "vendor")

                if not os.path.exists(vendor_file):
                    continue

                with open(vendor_file, 'r') as f:
                    vendor_id = f.read().strip()

                if vendor_id != "0x1002":  # AMD vendor ID
                    continue

                # PCI device ID identifies the exact GPU model.
                device_id_file = os.path.join(device_path, "device")
                with open(device_id_file, 'r') as f:
                    device_id = f.read().strip()

                gpu_name = self._get_gpu_name(device_id)

                # Without a hwmon directory the card cannot be monitored.
                hwmon_path = self._find_hwmon_path(device_path)

                if hwmon_path:
                    gpu_info = GPUInfo(
                        name=gpu_name,
                        vendor="AMD",
                        device_id=device_id,
                        bus_id=os.path.basename(card_path),
                        hwmon_path=hwmon_path,
                        temp_sensor=os.path.join(hwmon_path, "temp1_input"),
                        fan_control=os.path.join(hwmon_path, "pwm1"),
                        power_sensor=os.path.join(hwmon_path, "power1_input")
                    )
                    gpus.append(gpu_info)
                    # Lazy %-style args: no string formatting unless the
                    # record is actually emitted.
                    logger.info("Found AMD GPU: %s (%s)", gpu_name, device_id)

            except Exception as e:
                logger.warning("Error detecting GPU at %s: %s", card_path, e)
                continue

        self.detected_gpus = gpus
        return gpus

    def _get_gpu_name(self, device_id: str) -> str:
        """Map a PCI device ID to a human-readable GPU name.

        Unknown IDs fall back to a generic "AMD GPU <id>" label.
        NOTE(review): many distinct device IDs map to "Radeon Pro VII"
        here — verify this table against the intended hardware list.
        """
        gpu_names = {
            "0x73bf": "Radeon Pro VII",
            "0x73ff": "Radeon Pro VII",
            "0x7310": "Radeon Pro VII",
            "0x7340": "Radeon Pro VII",
            "0x73a0": "Radeon Pro VII",
            "0x73b0": "Radeon Pro VII",
            "0x73c0": "Radeon Pro VII",
            "0x73d0": "Radeon Pro VII",
            "0x73e0": "Radeon Pro VII",
            "0x73f0": "Radeon Pro VII",
            # Add more GPU mappings as needed
        }

        return gpu_names.get(device_id, f"AMD GPU {device_id}")

    def _find_hwmon_path(self, device_path: str) -> Optional[str]:
        """Return the first hwmon directory under *device_path*, or None."""
        hwmon_base = os.path.join(device_path, "hwmon")

        if not os.path.exists(hwmon_base):
            return None

        try:
            hwmons = os.listdir(hwmon_base)
            if hwmons:
                # Take the first entry under device/hwmon/.
                return os.path.join(hwmon_base, hwmons[0])
        except Exception as e:
            logger.warning("Error finding hwmon for %s: %s", device_path, e)

        return None
148
+
149
+
150
class GPUDataCollector:
    """Collects monitoring data for one GPU from its sysfs/hwmon files.

    Every read_* method returns None when the sensor file is missing or
    unreadable, so callers can distinguish "no data" from a zero reading.
    """

    def __init__(self, gpu_info: "GPUInfo"):
        self.gpu_info = gpu_info
        # Most recent successfully collected GPUStatus (None until first poll).
        self.last_status = None

    def _read_int_file(self, path: str) -> Optional[int]:
        """Read a sysfs file holding a single integer; None on any failure.

        Shared by the temperature/fan/PWM/power readers, which previously
        duplicated this open/strip/int/except pattern.  Sysfs entries come
        and go with driver state, so failures are logged at debug only.
        """
        try:
            if os.path.exists(path):
                with open(path, 'r') as f:
                    return int(f.read().strip())
        except Exception as e:
            logger.debug("Error reading %s: %s", path, e)
        return None

    def _device_path(self) -> str:
        """Return the GPU device directory (two levels above .../hwmon/hwmonX)."""
        return os.path.dirname(os.path.dirname(self.gpu_info.hwmon_path))

    def read_temperature(self) -> Optional[float]:
        """Read GPU temperature in Celsius (hwmon reports millidegrees)."""
        temp_millic = self._read_int_file(self.gpu_info.temp_sensor)
        return None if temp_millic is None else temp_millic / 1000.0

    def read_fan_speed(self) -> Optional[int]:
        """Read fan speed in RPM from the fan1_input sibling of the pwm1 node."""
        fan_speed_file = self.gpu_info.fan_control.replace("pwm1", "fan1_input")
        return self._read_int_file(fan_speed_file)

    def read_fan_pwm(self) -> Optional[int]:
        """Read fan PWM value (0-255)."""
        return self._read_int_file(self.gpu_info.fan_control)

    def read_power_draw(self) -> Optional[float]:
        """Read power draw in watts (hwmon reports microwatts)."""
        power_microw = self._read_int_file(self.gpu_info.power_sensor)
        return None if power_microw is None else power_microw / 1000000.0

    def read_memory_info(self) -> Tuple[Optional[int], Optional[int]]:
        """Read VRAM usage as (used_mb, total_mb); (None, None) on failure."""
        try:
            device_path = self._device_path()
            vram_used_file = os.path.join(device_path, "mem_info_vram_used")
            vram_total_file = os.path.join(device_path, "mem_info_vram_total")

            if os.path.exists(vram_used_file) and os.path.exists(vram_total_file):
                with open(vram_used_file, 'r') as f:
                    used = int(f.read().strip()) // (1024 * 1024)  # bytes -> MB
                with open(vram_total_file, 'r') as f:
                    total = int(f.read().strip()) // (1024 * 1024)
                return used, total
        except Exception as e:
            logger.debug("Error reading memory info: %s", e)
        return None, None

    def read_gpu_load(self) -> Optional[float]:
        """Read GPU load percentage from gpu_busy_percent."""
        try:
            load_file = os.path.join(self._device_path(), "gpu_busy_percent")

            if os.path.exists(load_file):
                with open(load_file, 'r') as f:
                    return float(f.read().strip())
        except Exception as e:
            logger.debug("Error reading GPU load: %s", e)
        return None

    def read_clocks(self) -> Tuple[Optional[int], Optional[int]]:
        """Read (core_mhz, memory_mhz) from the pp_dpm_sclk/mclk tables."""
        try:
            device_path = self._device_path()
            core_clock = self._parse_clock_file(os.path.join(device_path, "pp_dpm_sclk"))
            memory_clock = self._parse_clock_file(os.path.join(device_path, "pp_dpm_mclk"))
            return core_clock, memory_clock
        except Exception as e:
            logger.debug("Error reading clocks: %s", e)
            return None, None

    def _parse_clock_file(self, clock_file: str) -> Optional[int]:
        """Parse a pp_dpm_* table and return the active clock in MHz.

        Lines look like "1: 1267Mhz *"; the '*' marks the active level.
        Returns None when the file is missing or no active line parses.
        """
        try:
            if os.path.exists(clock_file):
                with open(clock_file, 'r') as f:
                    for line in f:
                        if '*' in line:  # Active clock
                            parts = line.strip().split(':')
                            if len(parts) >= 2:
                                clock_str = parts[1].strip().split()[0]
                                return int(clock_str.replace('Mhz', ''))
        except Exception as e:
            logger.debug("Error parsing clock file %s: %s", clock_file, e)
        return None

    def collect_status(self) -> Optional["GPUStatus"]:
        """Collect all sensors into one GPUStatus snapshot (None on failure).

        Individual missing sensors are recorded as 0 rather than failing
        the whole snapshot.
        """
        try:
            # Read all sensors
            temp = self.read_temperature()
            fan_speed = self.read_fan_speed()
            fan_pwm = self.read_fan_pwm()
            power = self.read_power_draw()
            mem_used, mem_total = self.read_memory_info()
            load = self.read_gpu_load()
            core_clock, mem_clock = self.read_clocks()

            # Efficiency = load per watt, only when both readings exist.
            efficiency = 0.0
            if power and load and power > 0:
                efficiency = load / power

            status = GPUStatus(
                timestamp=time.time(),
                temperature=temp or 0.0,
                load=load or 0.0,
                fan_speed=fan_speed or 0,
                fan_pwm=fan_pwm or 0,
                power_draw=power or 0.0,
                memory_used=mem_used or 0,
                memory_total=mem_total or 0,
                core_clock=core_clock or 0,
                memory_clock=mem_clock or 0,
                voltage=0.0,  # Not implemented yet
                efficiency=efficiency
            )

            self.last_status = status
            return status

        except Exception as e:
            logger.error("Error collecting GPU status: %s", e)
            return None
298
+
299
+
300
class GPUDataManager:
    """Manages GPU data storage and retrieval in a local SQLite database."""

    def __init__(self, db_path: str = "gpu_monitoring.db"):
        # Schema is created eagerly so later writes can assume it exists.
        self.db_path = db_path
        self.init_database()

    def init_database(self):
        """Initialize the SQLite database schema (idempotent)."""
        try:
            with sqlite3.connect(self.db_path) as conn:
                cursor = conn.cursor()

                # Time series of per-GPU sensor snapshots.
                cursor.execute('''
                    CREATE TABLE IF NOT EXISTS gpu_status (
                        id INTEGER PRIMARY KEY AUTOINCREMENT,
                        timestamp REAL,
                        gpu_name TEXT,
                        temperature REAL,
                        load REAL,
                        fan_speed INTEGER,
                        fan_pwm INTEGER,
                        power_draw REAL,
                        memory_used INTEGER,
                        memory_total INTEGER,
                        core_clock INTEGER,
                        memory_clock INTEGER,
                        voltage REAL,
                        efficiency REAL
                    )
                ''')

                # One row per detected GPU (replaced on re-detection).
                cursor.execute('''
                    CREATE TABLE IF NOT EXISTS gpu_info (
                        id INTEGER PRIMARY KEY AUTOINCREMENT,
                        gpu_name TEXT UNIQUE,
                        vendor TEXT,
                        device_id TEXT,
                        bus_id TEXT,
                        hwmon_path TEXT,
                        detected_at REAL
                    )
                ''')

                cursor.execute('''
                    CREATE TABLE IF NOT EXISTS alerts (
                        id INTEGER PRIMARY KEY AUTOINCREMENT,
                        timestamp REAL,
                        gpu_name TEXT,
                        alert_type TEXT,
                        message TEXT,
                        value REAL,
                        threshold REAL
                    )
                ''')

                # Readers filter on (gpu_name, timestamp) and cleanup deletes
                # by timestamp; index both so queries stay fast as data grows.
                cursor.execute('''
                    CREATE INDEX IF NOT EXISTS idx_gpu_status_name_time
                    ON gpu_status (gpu_name, timestamp)
                ''')
                cursor.execute('''
                    CREATE INDEX IF NOT EXISTS idx_alerts_time
                    ON alerts (timestamp)
                ''')

                conn.commit()
                logger.info("Database initialized successfully")

        except Exception as e:
            logger.error("Error initializing database: %s", e)

    def save_gpu_info(self, gpu_info: "GPUInfo"):
        """Insert or update the static description of one GPU."""
        try:
            with sqlite3.connect(self.db_path) as conn:
                cursor = conn.cursor()

                cursor.execute('''
                    INSERT OR REPLACE INTO gpu_info
                    (gpu_name, vendor, device_id, bus_id, hwmon_path, detected_at)
                    VALUES (?, ?, ?, ?, ?, ?)
                ''', (gpu_info.name, gpu_info.vendor, gpu_info.device_id,
                      gpu_info.bus_id, gpu_info.hwmon_path, time.time()))

                conn.commit()
                logger.debug("Saved GPU info: %s", gpu_info.name)

        except Exception as e:
            logger.error("Error saving GPU info: %s", e)

    def save_status(self, gpu_name: str, status: "GPUStatus"):
        """Append one status snapshot for *gpu_name*."""
        try:
            with sqlite3.connect(self.db_path) as conn:
                cursor = conn.cursor()

                cursor.execute('''
                    INSERT INTO gpu_status
                    (timestamp, gpu_name, temperature, load, fan_speed, fan_pwm,
                     power_draw, memory_used, memory_total, core_clock, memory_clock,
                     voltage, efficiency)
                    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                ''', (status.timestamp, gpu_name, status.temperature, status.load,
                      status.fan_speed, status.fan_pwm, status.power_draw,
                      status.memory_used, status.memory_total, status.core_clock,
                      status.memory_clock, status.voltage, status.efficiency))

                conn.commit()

        except Exception as e:
            logger.error("Error saving status: %s", e)

    def get_recent_status(self, gpu_name: str, limit: int = 100) -> List[Dict[str, Any]]:
        """Return up to *limit* most recent status rows, newest first, as dicts."""
        try:
            with sqlite3.connect(self.db_path) as conn:
                cursor = conn.cursor()

                cursor.execute('''
                    SELECT timestamp, temperature, load, fan_speed, fan_pwm,
                           power_draw, memory_used, memory_total, core_clock,
                           memory_clock, voltage, efficiency
                    FROM gpu_status
                    WHERE gpu_name = ?
                    ORDER BY timestamp DESC
                    LIMIT ?
                ''', (gpu_name, limit))

                rows = cursor.fetchall()

                # Column list must match the SELECT above.
                columns = ['timestamp', 'temperature', 'load', 'fan_speed', 'fan_pwm',
                           'power_draw', 'memory_used', 'memory_total', 'core_clock',
                           'memory_clock', 'voltage', 'efficiency']

                return [dict(zip(columns, row)) for row in rows]

        except Exception as e:
            logger.error("Error getting recent status: %s", e)
            return []

    def get_gpu_info(self, gpu_name: Optional[str] = None) -> List[Dict[str, Any]]:
        """Return stored GPU descriptions (all GPUs, or one by name)."""
        try:
            with sqlite3.connect(self.db_path) as conn:
                cursor = conn.cursor()

                if gpu_name:
                    cursor.execute('SELECT * FROM gpu_info WHERE gpu_name = ?', (gpu_name,))
                else:
                    cursor.execute('SELECT * FROM gpu_info')

                rows = cursor.fetchall()

                if not rows:
                    return []

                # Column names come from the cursor so SELECT * stays accurate.
                columns = [desc[0] for desc in cursor.description]
                return [dict(zip(columns, row)) for row in rows]

        except Exception as e:
            logger.error("Error getting GPU info: %s", e)
            return []

    def cleanup_old_data(self, days: int = 7):
        """Delete status rows and alerts older than *days* days."""
        try:
            cutoff_time = time.time() - (days * 24 * 3600)

            with sqlite3.connect(self.db_path) as conn:
                cursor = conn.cursor()

                cursor.execute('DELETE FROM gpu_status WHERE timestamp < ?', (cutoff_time,))
                cursor.execute('DELETE FROM alerts WHERE timestamp < ?', (cutoff_time,))

                conn.commit()
                logger.info("Cleaned up data older than %s days", days)

        except Exception as e:
            logger.error("Error cleaning up old data: %s", e)
472
+
473
+
474
class GPUManager:
    """Main GPU management class.

    Facade that ties together hardware detection, per-GPU data
    collection and database persistence.
    """

    def __init__(self):
        self.detector = GPUHardwareDetector()
        self.data_manager = GPUDataManager()
        # Maps GPU name -> GPUDataCollector for each detected card.
        self.collectors = {}

    def initialize(self):
        """Detect GPUs and set up one collector per card; True on success."""
        logger.info("Initializing GPU manager...")

        detected = self.detector.detect_amd_gpus()

        # Nothing to monitor without at least one AMD GPU.
        if not detected:
            logger.warning("No AMD GPUs detected")
            return False

        for gpu in detected:
            self.collectors[gpu.name] = GPUDataCollector(gpu)
            self.data_manager.save_gpu_info(gpu)
            logger.info(f"Initialized collector for {gpu.name}")

        logger.info(f"Successfully initialized {len(detected)} GPU(s)")
        return True

    def get_status(self, gpu_name: Optional[str] = None) -> Dict[str, Optional[GPUStatus]]:
        """Collect (and persist) the current status of one GPU or all GPUs.

        An unknown gpu_name is reported with a None status.
        """
        results = {}

        if not gpu_name:
            targets = list(self.collectors.items())
        elif gpu_name in self.collectors:
            targets = [(gpu_name, self.collectors[gpu_name])]
        else:
            results[gpu_name] = None
            targets = []

        for name, collector in targets:
            results[name] = collector.collect_status()

        # Persist every successful reading.
        for name, status in results.items():
            if status:
                self.data_manager.save_status(name, status)

        return results

    def get_gpu_list(self) -> List[str]:
        """Return the names of all detected GPUs."""
        return [*self.collectors]

    def get_gpu_info(self, gpu_name: Optional[str] = None) -> List[Dict[str, Any]]:
        """Return stored GPU descriptions via the data manager."""
        return self.data_manager.get_gpu_info(gpu_name)

    def get_historical_data(self, gpu_name: str, hours: int = 24) -> List[Dict[str, Any]]:
        """Return status rows for *gpu_name* from the last *hours* hours, oldest first."""
        since = time.time() - (hours * 3600)

        columns = ['timestamp', 'temperature', 'load', 'fan_speed', 'fan_pwm',
                   'power_draw', 'memory_used', 'memory_total', 'core_clock',
                   'memory_clock', 'voltage', 'efficiency']

        try:
            with sqlite3.connect(self.data_manager.db_path) as conn:
                rows = conn.execute('''
                    SELECT timestamp, temperature, load, fan_speed, fan_pwm,
                           power_draw, memory_used, memory_total, core_clock,
                           memory_clock, voltage, efficiency
                    FROM gpu_status
                    WHERE gpu_name = ? AND timestamp >= ?
                    ORDER BY timestamp ASC
                ''', (gpu_name, since)).fetchall()

            return [dict(zip(columns, row)) for row in rows]

        except Exception as e:
            logger.error(f"Error getting historical data: {e}")
            return []
563
+
564
+
565
# Example usage and testing
if __name__ == "__main__":
    import sys

    # Console logging for the manual test run.
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    )

    manager = GPUManager()

    # Bail out early when no GPU can be monitored.
    if not manager.initialize():
        print("GPU detection failed!")
        sys.exit(1)

    print("GPU detection successful!")

    detected = manager.get_gpu_list()
    print(f"Detected GPUs: {detected}")

    # Print one status snapshot per detected GPU.
    for name, snap in manager.get_status().items():
        if not snap:
            continue
        print(f"\n{name} Status:")
        print(f" Temperature: {snap.temperature}°C")
        print(f" Load: {snap.load}%")
        print(f" Fan Speed: {snap.fan_speed} RPM")
        print(f" Fan PWM: {snap.fan_pwm}")
        print(f" Power: {snap.power_draw}W")
        print(f" Memory: {snap.memory_used}/{snap.memory_total} MB")
        print(f" Core Clock: {snap.core_clock} MHz")
        print(f" Memory Clock: {snap.memory_clock} MHz")