# SPARKNET / src / tools / gpu_tools.py
# Commit a9dc537 (MHamdan): "Initial commit: SPARKNET framework"
"""
GPU Tools for SPARKNET
Tools for GPU monitoring and management
"""
from typing import Optional
from loguru import logger
from .base_tool import BaseTool, ToolResult
from ..utils.gpu_manager import get_gpu_manager
class GPUMonitorTool(BaseTool):
    """Tool for monitoring GPU status.

    Reports either a single GPU (when ``gpu_id`` is given) or every GPU the
    GPU manager returns, as human-readable text. The raw info dictionaries
    obtained from the manager are attached to the result metadata.
    """

    def __init__(self):
        super().__init__(
            name="gpu_monitor",
            description="Monitor GPU status, memory usage, and utilization",
        )
        self.add_parameter("gpu_id", "int", "Specific GPU ID to monitor (optional)", required=False, default=None)
        self.gpu_manager = get_gpu_manager()

    async def execute(self, gpu_id: Optional[int] = None, **kwargs) -> ToolResult:
        """
        Monitor GPU status.

        Args:
            gpu_id: Specific GPU ID or None for all GPUs
            **kwargs: Accepted and ignored.

        Returns:
            ToolResult with GPU information. The result is a failure when the
            requested GPU reports an error, when no GPU at all could be read,
            or when an unexpected exception is raised while querying.
        """
        try:
            if gpu_id is not None:
                # Get info for specific GPU
                info = self.gpu_manager.get_gpu_info(gpu_id)
                if "error" in info:
                    return ToolResult(
                        success=False,
                        output=None,
                        error=info["error"],
                    )
                return ToolResult(
                    success=True,
                    output=self._format_gpu_info(info),
                    metadata=info,
                )

            # Get info for all GPUs
            all_info = self.gpu_manager.get_all_gpu_info()
            readable = [info for info in all_info if "error" not in info]
            failures = [str(info["error"]) for info in all_info if "error" in info]

            if not readable:
                # Nothing could be read: report that explicitly instead of
                # returning a successful result with an empty output string.
                return ToolResult(
                    success=False,
                    output=None,
                    error="; ".join(failures) or "No GPUs available",
                    metadata={"gpus": all_info},
                )

            sections = [self._format_gpu_info(info) for info in readable]
            # Surface per-GPU failures in the text rather than dropping them.
            sections.extend(f"GPU unavailable: {message}" for message in failures)
            return ToolResult(
                success=True,
                output="\n\n".join(sections),
                metadata={"gpus": all_info},
            )
        except Exception as e:
            # Tool boundary: convert any failure into a failed ToolResult.
            logger.error(f"GPU monitoring error: {e}")
            return ToolResult(
                success=False,
                output=None,
                error=f"Monitoring error: {str(e)}",
            )

    def _format_gpu_info(self, info: dict) -> str:
        """Format GPU info for display.

        Args:
            info: GPU info dictionary with the keys ``gpu_id``, ``name``,
                ``memory_used``, ``memory_total``, ``memory_percent``,
                ``memory_free``, ``gpu_utilization`` and ``temperature``.

        Returns:
            Multi-line summary of the GPU.

        Raises:
            KeyError: If one of the keys listed above is missing.
        """
        # Memory values are divided by 1024**3 and labelled GB
        # (assumes the manager reports bytes -- TODO confirm).
        return (
            f"GPU {info['gpu_id']}: {info['name']}\n"
            f"  Memory: {info['memory_used'] / 1024**3:.2f} GB / {info['memory_total'] / 1024**3:.2f} GB "
            f"({info['memory_percent']:.1f}% used)\n"
            f"  Free Memory: {info['memory_free'] / 1024**3:.2f} GB\n"
            f"  GPU Utilization: {info['gpu_utilization']}%\n"
            f"  Temperature: {info['temperature']}°C"
        )
class GPUSelectTool(BaseTool):
    """Tool for selecting best available GPU.

    Delegates the choice to the GPU manager's ``select_best_gpu`` and then
    reports the name and free memory of the chosen device.
    """

    def __init__(self):
        super().__init__(
            name="gpu_select",
            description="Select the best available GPU based on free memory",
        )
        self.add_parameter("min_memory_gb", "float", "Minimum required memory in GB", required=False, default=8.0)
        self.gpu_manager = get_gpu_manager()

    async def execute(self, min_memory_gb: float = 8.0, **kwargs) -> ToolResult:
        """
        Select best GPU.

        Args:
            min_memory_gb: Minimum required memory
            **kwargs: Accepted and ignored.

        Returns:
            ToolResult with selected GPU ID. The result is a failure when the
            manager selects no GPU, when the selected GPU's info reports an
            error, or when an unexpected exception is raised.
        """
        try:
            gpu_id = self.gpu_manager.select_best_gpu(min_memory_gb)
            if gpu_id is None:
                return ToolResult(
                    success=False,
                    output=None,
                    error=f"No GPU found with {min_memory_gb} GB free memory",
                )

            info = self.gpu_manager.get_gpu_info(gpu_id)
            if "error" in info:
                # Same check GPUMonitorTool applies: return the manager's own
                # message instead of failing later with a KeyError on 'name'.
                return ToolResult(
                    success=False,
                    output=None,
                    error=info["error"],
                )

            # Free memory is divided by 1024**3 and labelled GB
            # (assumes the manager reports bytes -- TODO confirm).
            output = (
                f"Selected GPU {gpu_id}: {info['name']}\n"
                f"Free Memory: {info['memory_free'] / 1024**3:.2f} GB"
            )
            return ToolResult(
                success=True,
                output=output,
                metadata={
                    "gpu_id": gpu_id,
                    "gpu_info": info,
                },
            )
        except Exception as e:
            # Tool boundary: convert any failure into a failed ToolResult.
            logger.error(f"GPU selection error: {e}")
            return ToolResult(
                success=False,
                output=None,
                error=f"Selection error: {str(e)}",
            )