"""
GPU monitoring tab for Video Model Studio UI.

Displays detailed GPU metrics and visualizations.
"""

import gradio as gr
import time
import logging
from pathlib import Path
import os
from typing import Dict, Any, List, Optional, Tuple
from datetime import datetime, timedelta

from vms.utils.base_tab import BaseTab
from vms.ui.monitoring.utils import human_readable_size

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)


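# The monitoring backend is reached through self.app.monitoring.gpu. Based on
# how it is used in this tab, it is assumed to expose:
#   has_nvidia_gpus (bool), gpu_count (int),
#   get_current_metrics() -> list of per-GPU metric dicts,
#   get_gpu_info() -> list of per-GPU static info dicts,
#   get_utilization_data(i) / get_memory_data(i) / get_power_data(i)
#       -> tabular data with a "time" column for the LinePlot components.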
class GPUTab(BaseTab):
    """Tab for GPU-specific monitoring and statistics"""

    def __init__(self, app_state):
        super().__init__(app_state)
        self.id = "GPU_tab"
        self.title = "GPU Stats"
        self.refresh_interval = 5
        self.selected_gpu = 0

    def create(self, parent=None) -> gr.TabItem:
        """Create the GPU tab UI components"""
        with gr.TabItem(self.title, id=self.id) as tab:
            with gr.Row():
                gr.Markdown("## 🖥️ GPU Monitoring")

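            # Fallback message, shown only when no NVIDIA GPUs are detected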
            with gr.Row(visible=not self.app.monitoring.gpu.has_nvidia_gpus):
                with gr.Column():
                    gr.Markdown("### No NVIDIA GPUs detected")
                    gr.Markdown("GPU monitoring is only available for NVIDIA GPUs. If you have NVIDIA GPUs installed, ensure the drivers are properly configured.")

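            # Main metrics area, visible only when NVIDIA GPUs are present;
            # the GPU selector dropdown is only created on multi-GPU systems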
            with gr.Row(visible=self.app.monitoring.gpu.has_nvidia_gpus):
                if self.app.monitoring.gpu.gpu_count > 1:
                    with gr.Column(scale=1):
                        gpu_options = [f"GPU {i}" for i in range(self.app.monitoring.gpu.gpu_count)]
                        self.components["gpu_selector"] = gr.Dropdown(
                            choices=gpu_options,
                            value=gpu_options[0] if gpu_options else None,
                            label="Select GPU",
                            interactive=True
                        )

                with gr.Column(scale=3):
                    self.components["current_metrics"] = gr.Markdown("Loading GPU metrics...")

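            # History plots; each is fed by the monitoring service and is
            # assumed to receive data keyed on a "time" column plus the named
            # value columns below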
            with gr.Tabs(visible=self.app.monitoring.gpu.has_nvidia_gpus) as metrics_tabs:
                with gr.Tab(label="Utilization") as util_tab:
                    self.components["utilization_plot"] = gr.LinePlot(
                        show_label=False,
                        height=400,
                        x="time",
                        y=["GPU Utilization (%)", "Temperature (°C)"],
                        y_title="Value",
                        title="GPU Utilization and Temperature"
                    )

                with gr.Tab(label="Memory") as memory_tab:
                    self.components["memory_plot"] = gr.LinePlot(
                        show_label=False,
                        height=400,
                        x="time",
                        y=["Memory Usage (%)", "Memory Used (GB)"],
                        y_title="Value",
                        title="GPU Memory Usage"
                    )

                with gr.Tab(label="Power") as power_tab:
                    self.components["power_plot"] = gr.LinePlot(
                        show_label=False,
                        height=400,
                        x="time",
                        y=["Power Usage (W)", "Power Limit (W)"],
                        y_title="Watts",
                        title="GPU Power Usage"
                    )

            with gr.Row(visible=self.app.monitoring.gpu.has_nvidia_gpus):
                with gr.Column():
                    gr.Markdown("### Active Processes")
                    self.components["process_info"] = gr.Markdown("Loading process information...")

            with gr.Row(visible=self.app.monitoring.gpu.has_nvidia_gpus):
                with gr.Column():
                    gr.Markdown("### GPU Information")
                    self.components["gpu_info"] = gr.Markdown("Loading GPU information...")

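            # Manual and automatic refresh controls; periodic refreshes are
            # gated by the checkbox in conditional_refresh()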
            with gr.Row():
                self.components["auto_refresh"] = gr.Checkbox(
                    label=f"Auto refresh (every {self.refresh_interval} seconds)",
                    value=True,
                    info="Automatically refresh GPU metrics"
                )
                self.components["refresh_btn"] = gr.Button("Refresh Now")

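            # Timer that fires every refresh_interval seconds to drive auto-refresh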
self.components["refresh_timer"] = gr.Timer( |
|
value=self.refresh_interval |
|
) |
|
|
|
return tab |
|
|
|
def connect_events(self) -> None: |
|
"""Connect event handlers to UI components""" |
|
|
|
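        # The GPU selector only exists on multi-GPU machines, so guard before wiring it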
        if self.app.monitoring.gpu.gpu_count > 1 and "gpu_selector" in self.components:
            self.components["gpu_selector"].change(
                fn=self.update_selected_gpu,
                inputs=[self.components["gpu_selector"]],
                outputs=[
                    self.components["current_metrics"],
                    self.components["utilization_plot"],
                    self.components["memory_plot"],
                    self.components["power_plot"],
                    self.components["process_info"],
                    self.components["gpu_info"]
                ]
            )

        self.components["refresh_btn"].click(
            fn=self.refresh_all,
            outputs=[
                self.components["current_metrics"],
                self.components["utilization_plot"],
                self.components["memory_plot"],
                self.components["power_plot"],
                self.components["process_info"],
                self.components["gpu_info"]
            ]
        )

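        # Periodic refresh driven by the timer; conditional_refresh() checks
        # the auto-refresh checkbox before doing any work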
self.components["refresh_timer"].tick( |
|
fn=self.conditional_refresh, |
|
inputs=[self.components["auto_refresh"]], |
|
outputs=[ |
|
self.components["current_metrics"], |
|
self.components["utilization_plot"], |
|
self.components["memory_plot"], |
|
self.components["power_plot"], |
|
self.components["process_info"], |
|
self.components["gpu_info"] |
|
] |
|
) |
|
|
|
def on_enter(self): |
|
"""Called when the tab is selected""" |
|
|
|
return self.refresh_all() |
|
|
|
def update_selected_gpu(self, gpu_selector: str) -> Tuple: |
|
"""Update the selected GPU and refresh data |
|
|
|
Args: |
|
gpu_selector: Selected GPU string ("GPU X") |
|
|
|
Returns: |
|
Updated components |
|
""" |
|
|
|
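        # Dropdown values look like "GPU 3"; fall back to GPU 0 if parsing fails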
        try:
            self.selected_gpu = int(gpu_selector.replace("GPU ", ""))
        except (ValueError, AttributeError):
            self.selected_gpu = 0

        return self.refresh_all()

    def conditional_refresh(self, auto_refresh: bool) -> Tuple:
        """Only refresh if auto-refresh is enabled

        Args:
            auto_refresh: Whether auto-refresh is enabled

        Returns:
            Updated components, or no-op updates that leave the current values in place
        """
        if auto_refresh:
            return self.refresh_all()

        # Auto-refresh is disabled: return no-op updates so the displayed values
        # stay untouched (returning the components' initial .value attributes
        # would reset the displays on every timer tick).
        return (
            gr.update(),
            gr.update(),
            gr.update(),
            gr.update(),
            gr.update(),
            gr.update()
        )

    def refresh_all(self) -> Tuple:
        """Refresh all GPU monitoring components

        Returns:
            Updated values for all components
        """
        try:
            if not self.app.monitoring.gpu.has_nvidia_gpus:
                return (
                    "No NVIDIA GPUs detected",
                    None,
                    None,
                    None,
                    "No process information available",
                    "No GPU information available"
                )

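            # get_current_metrics() is expected to return one metrics dict per GPU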
            all_metrics = self.app.monitoring.gpu.get_current_metrics()
            if not all_metrics or self.selected_gpu >= len(all_metrics):
                return (
                    "GPU metrics not available",
                    None,
                    None,
                    None,
                    "No process information available",
                    "No GPU information available"
                )

            gpu_metrics = all_metrics[self.selected_gpu]

            metrics_html = self.format_current_metrics(gpu_metrics)
            process_info_html = self.format_process_info(gpu_metrics)

            gpu_info = self.app.monitoring.gpu.get_gpu_info()
            gpu_info_html = self.format_gpu_info(gpu_info[self.selected_gpu] if self.selected_gpu < len(gpu_info) else {})

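            # Time-series data for the three history plots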
            utilization_plot_df = self.app.monitoring.gpu.get_utilization_data(self.selected_gpu)
            memory_plot_df = self.app.monitoring.gpu.get_memory_data(self.selected_gpu)
            power_plot_df = self.app.monitoring.gpu.get_power_data(self.selected_gpu)

            return (
                metrics_html,
                utilization_plot_df,
                memory_plot_df,
                power_plot_df,
                process_info_html,
                gpu_info_html
            )

        except Exception as e:
            logger.error(f"Error refreshing GPU data: {str(e)}", exc_info=True)
            error_msg = f"Error retrieving GPU data: {str(e)}"
            return (
                error_msg,
                None,
                None,
                None,
                error_msg,
                error_msg
            )

    def format_current_metrics(self, metrics: Dict[str, Any]) -> str:
        """Format current GPU metrics as HTML/Markdown

        Args:
            metrics: Current metrics dictionary

        Returns:
            Formatted HTML/Markdown string
        """
        if 'error' in metrics:
            return f"Error retrieving GPU metrics: {metrics['error']}"

        if isinstance(metrics.get('timestamp'), datetime):
            timestamp_str = metrics['timestamp'].strftime('%Y-%m-%d %H:%M:%S')
        else:
            timestamp_str = "Unknown"

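        # Traffic-light color coding: red above 90% utilization/memory or 85 °C,
        # orange above 70% or 75 °C, green otherwise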
        util_style = "color: green;"
        if metrics.get('utilization_gpu', 0) > 90:
            util_style = "color: red; font-weight: bold;"
        elif metrics.get('utilization_gpu', 0) > 70:
            util_style = "color: orange;"

        mem_style = "color: green;"
        if metrics.get('memory_percent', 0) > 90:
            mem_style = "color: red; font-weight: bold;"
        elif metrics.get('memory_percent', 0) > 70:
            mem_style = "color: orange;"

        temp_style = "color: green;"
        temp = metrics.get('temperature', 0)
        if temp > 85:
            temp_style = "color: red; font-weight: bold;"
        elif temp > 75:
            temp_style = "color: orange;"

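        # Memory values are assumed to be reported in bytes, hence the GB conversion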
        memory_used_gb = metrics.get('memory_used', 0) / (1024**3)
        memory_total_gb = metrics.get('memory_total', 0) / (1024**3)

        power_html = ""
        if metrics.get('power_usage') is not None:
            power_html = f"**Power Usage:** {metrics['power_usage']:.1f}W\n"

        html = f"""
### Current Status as of {timestamp_str}

**GPU Utilization:** <span style="{util_style}">{metrics.get('utilization_gpu', 0):.1f}%</span>
**Memory Usage:** <span style="{mem_style}">{metrics.get('memory_percent', 0):.1f}% ({memory_used_gb:.2f}/{memory_total_gb:.2f} GB)</span>
**Temperature:** <span style="{temp_style}">{metrics.get('temperature', 0)}°C</span>
{power_html}
"""
        return html

    def format_process_info(self, metrics: Dict[str, Any]) -> str:
        """Format GPU process information as HTML/Markdown

        Args:
            metrics: Current metrics dictionary with process information

        Returns:
            Formatted HTML/Markdown string
        """
        if 'error' in metrics:
            return "Process information not available"

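        # Each process entry is expected to look roughly like
        # {"pid": 1234, "name": "python", "memory_used": 2147483648}  (memory in bytes, assumed)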
        processes = metrics.get('processes', [])
        if not processes:
            return "No active processes using this GPU"

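        # List the heaviest memory consumers first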
        sorted_processes = sorted(processes, key=lambda p: p.get('memory_used', 0), reverse=True)

        html = "| PID | Process Name | Memory Usage |\n"
        html += "|-----|--------------|--------------|\n"

        for proc in sorted_processes:
            pid = proc.get('pid', 'Unknown')
            name = proc.get('name', 'Unknown')
            mem_mb = proc.get('memory_used', 0) / (1024**2)

            html += f"| {pid} | {name} | {mem_mb:.1f} MB |\n"

        return html

    def format_gpu_info(self, info: Dict[str, Any]) -> str:
        """Format GPU information as HTML/Markdown

        Args:
            info: GPU information dictionary

        Returns:
            Formatted HTML/Markdown string
        """
        if 'error' in info:
            return f"GPU information not available: {info.get('error', 'Unknown error')}"

        memory_total_gb = info.get('memory_total', 0) / (1024**3)

        html = f"""
**Name:** {info.get('name', 'Unknown')}
**Memory:** {memory_total_gb:.2f} GB
**UUID:** {info.get('uuid', 'N/A')}
**Compute Capability:** {info.get('compute_capability', 'N/A')}
"""

        if info.get('power_limit') is not None:
            html += f"**Power Limit:** {info['power_limit']:.1f}W\n"

        return html