Vedisasi's picture
Upload folder using huggingface_hub
54c5666 verified
"""System resource monitoring"""
import psutil
import time
import logging
from typing import Dict, Optional
from dataclasses import dataclass
logger = logging.getLogger(__name__)
@dataclass
class SystemMetrics:
    """One point-in-time snapshot of host resource usage.

    Field order is part of the interface: instances may be built
    positionally, so new fields must only ever be appended.
    """

    # CPU utilisation as a percentage.
    cpu_percent: float
    # RAM utilisation as a percentage.
    memory_percent: float
    # RAM still available, in gigabytes.
    memory_available_gb: float
    # RAM currently in use, in gigabytes.
    memory_used_gb: float
    # Disk utilisation as a percentage.
    disk_usage_percent: float
    # Time the snapshot was taken (seconds, as returned by ``time.time()``).
    timestamp: float
class SystemMonitor:
    """Monitor system resources during training.

    Samples CPU, RAM and disk usage through ``psutil``, logs a one-line
    report at most once every ``log_interval`` seconds (unless forced), and
    keeps every logged sample in ``metrics_history`` so that
    :meth:`get_summary` can aggregate them afterwards.
    """

    def __init__(
        self,
        log_interval: int = 60,
        *,
        disk_path: str = '/',
        memory_warn_percent: float = 90,
        cpu_warn_percent: float = 95,
        disk_warn_percent: float = 90,
    ):
        """
        Args:
            log_interval: Seconds between system metric logs
            disk_path: Path whose filesystem usage is reported
            memory_warn_percent: RAM usage (%) above which a warning is logged
            cpu_warn_percent: CPU usage (%) above which a warning is logged
            disk_warn_percent: Disk usage (%) above which a warning is logged
        """
        self.log_interval = log_interval
        self.disk_path = disk_path
        self.memory_warn_percent = memory_warn_percent
        self.cpu_warn_percent = cpu_warn_percent
        self.disk_warn_percent = disk_warn_percent
        # Starting at 0 means the very first check_and_log() call always logs,
        # because time.time() minus 0 exceeds any realistic interval.
        self.last_log_time = 0
        # Every sample that was logged, oldest first.
        self.metrics_history = []

    def get_current_metrics(self) -> "SystemMetrics":
        """Take a fresh sample of CPU, memory and disk usage.

        Returns:
            A ``SystemMetrics`` snapshot stamped with the current time.
        """
        memory = psutil.virtual_memory()
        disk = psutil.disk_usage(self.disk_path)

        return SystemMetrics(
            # A non-zero interval makes psutil measure CPU usage over that
            # window, so this call takes roughly 0.1 s.
            cpu_percent=psutil.cpu_percent(interval=0.1),
            memory_percent=memory.percent,
            # Byte counts are converted to decimal gigabytes (1 GB = 1e9 B).
            memory_available_gb=memory.available / 1e9,
            memory_used_gb=memory.used / 1e9,
            disk_usage_percent=disk.percent,
            timestamp=time.time(),
        )

    def check_and_log(self, force: bool = False) -> Optional["SystemMetrics"]:
        """Sample and log system metrics if the log interval has elapsed.

        Args:
            force: Sample and log even if the interval has not elapsed yet.

        Returns:
            The sample that was logged, or ``None`` when the call was skipped
            because the interval had not elapsed and ``force`` was false.
        """
        current_time = time.time()
        if not force and (current_time - self.last_log_time) < self.log_interval:
            return None

        metrics = self.get_current_metrics()
        self.metrics_history.append(metrics)
        self.last_log_time = current_time

        # NOTE(review): the RAM denominator is used + available, which may
        # differ from the total psutil reports - confirm this is acceptable.
        logger.info(
            f"System Metrics | "
            f"CPU: {metrics.cpu_percent:.1f}% | "
            f"RAM: {metrics.memory_used_gb:.1f}/{metrics.memory_used_gb + metrics.memory_available_gb:.1f}GB "
            f"({metrics.memory_percent:.1f}%) | "
            f"Disk: {metrics.disk_usage_percent:.1f}%"
        )

        # Warn if resources are high
        if metrics.memory_percent > self.memory_warn_percent:
            logger.warning(f"High memory usage: {metrics.memory_percent:.1f}%")
        if metrics.cpu_percent > self.cpu_warn_percent:
            logger.warning(f"High CPU usage: {metrics.cpu_percent:.1f}%")
        if metrics.disk_usage_percent > self.disk_warn_percent:
            logger.warning(f"High disk usage: {metrics.disk_usage_percent:.1f}%")

        return metrics

    def get_summary(self) -> Dict[str, float]:
        """Aggregate the logged samples into average and peak figures.

        Returns:
            A dict with average/maximum CPU and memory percentages and the
            maximum memory used in GB, or an empty dict when nothing has been
            logged yet.
        """
        history = self.metrics_history
        if not history:
            return {}

        cpu_values = [m.cpu_percent for m in history]
        mem_values = [m.memory_percent for m in history]

        return {
            'avg_cpu_percent': sum(cpu_values) / len(cpu_values),
            'max_cpu_percent': max(cpu_values),
            'avg_memory_percent': sum(mem_values) / len(mem_values),
            'max_memory_percent': max(mem_values),
            'max_memory_used_gb': max(m.memory_used_gb for m in history),
        }