FlexBoard / cost_calculator.py
Daniel Altunay
first commit
bfbd179 unverified
"""
Cost calculation module for MLPerf configurations.
"""
import logging
import pandas as pd
logger = logging.getLogger(__name__)
# Hourly price applied to any device family without an entry in
# DEFAULT_DEVICE_COSTS.
DEFAULT_HOURLY_COST: float = 1.0

# Default hourly price per device, keyed by the canonical family names that
# normalize_gpu_name() returns.
DEFAULT_DEVICE_COSTS: dict[str, float] = {
    # NVIDIA data-center parts
    "NVIDIA H100": 3.00,
    "NVIDIA H200": 4.00,
    "NVIDIA GH200": 5.00,
    "NVIDIA B200/GB200": 7.00,
    # AMD data-center parts
    "AMD MI300X": 3.50,
    "AMD MI325X": 4.50,
    # NVIDIA workstation / edge parts
    "NVIDIA RTX 4090": 1.20,
    "NVIDIA L40S": 1.80,
    "NVIDIA Jetson AGX": 0.30,
}

# Mutable module state: the cost table currently in effect. It is rebuilt by
# initialize_device_costs() and patched by update_device_costs().
device_costs: dict[str, float] = {}
# Substring keyword -> canonical family name, scanned top to bottom.
# Order is significant: a keyword that contains another keyword must come
# first ("GH200" contains "H200", "GB200" contains "B200"), otherwise the
# shorter keyword claims the match and the longer one is unreachable.
_GPU_FAMILY_KEYWORDS = (
    ("GH200", "NVIDIA GH200"),
    ("GRACE HOPPER", "NVIDIA GH200"),
    ("H100", "NVIDIA H100"),
    ("H200", "NVIDIA H200"),
    ("GB200", "NVIDIA B200/GB200"),
    ("B200", "NVIDIA B200/GB200"),
    ("MI300X", "AMD MI300X"),
    ("MI325X", "AMD MI325X"),
    ("RTX 4090", "NVIDIA RTX 4090"),
    ("L40S", "NVIDIA L40S"),
)


def normalize_gpu_name(name: str) -> str:
    """Map a raw accelerator name onto its canonical device-family name.

    Matching is case-insensitive and substring-based, so SKU suffixes such as
    "-SXM-80GB" do not matter.

    Args:
        name: Raw accelerator name as it appears in the dataset.

    Returns:
        The canonical family name when a known keyword is found. Otherwise
        ``name`` is returned unchanged; this includes empty, ``None`` and
        non-string input such as a pandas NaN.
    """
    # Non-string values (e.g. NaN from a missing cell) cannot be upper-cased;
    # hand them back untouched instead of raising.
    if not name or not isinstance(name, str):
        return name

    name_upper = name.upper()

    # Jetson boards are recognised by product line plus generation rather
    # than by a single keyword.
    if "JETSON" in name_upper and ("ORIN" in name_upper or "THOR" in name_upper):
        return "NVIDIA Jetson AGX"

    for keyword, family in _GPU_FAMILY_KEYWORDS:
        if keyword in name_upper:
            return family

    return name
def initialize_device_costs(df: pd.DataFrame) -> None:
    """Rebuild the module-level cost table from the accelerators found in ``df``.

    Every non-null value of the "system.accelerator.name" column is reduced to
    its device family via normalize_gpu_name(). Each family is priced from
    DEFAULT_DEVICE_COSTS, falling back to DEFAULT_HOURLY_COST for families
    that have no default. The previous table is discarded.

    Args:
        df: Dataset to scan. ``None``, an empty frame, or a frame without the
            accelerator-name column yields an empty cost table.
    """
    global device_costs

    column = "system.accelerator.name"
    has_names = df is not None and not df.empty and column in df.columns
    raw_names = df[column].dropna().unique() if has_names else ()
    families = {normalize_gpu_name(raw_name) for raw_name in raw_names}

    device_costs = {
        family: DEFAULT_DEVICE_COSTS.get(family, DEFAULT_HOURLY_COST)
        for family in families
    }

    logger.info(f"Initialized costs for {len(device_costs)} unique device families")
def get_device_costs() -> dict[str, float]:
    """Return a snapshot of the cost table currently in effect.

    The result is a new dict, so callers may modify it without affecting the
    module-level table.
    """
    return dict(device_costs)
def update_device_costs(new_costs: dict[str, float]) -> None:
    """Merge ``new_costs`` into the module-level cost table.

    Existing entries with the same key are overwritten; all other entries are
    left as they are.

    Args:
        new_costs: Hourly costs keyed by device name.
    """
    # The table is mutated in place rather than rebound, so no ``global``
    # declaration is required here.
    device_costs.update(new_costs)
    logger.info(f"Updated costs for {len(new_costs)} devices")
def calculate_costs(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with per-row cost columns added.

    Two columns are written:

    * "hourly_cost": the value of estimate_hourly_cost() for the row, or
      ``None`` when no estimate is available.
    * "cost_per_million_tokens": the hourly cost divided by the hourly token
      count, scaled to one million tokens. "metrics.result" is treated as a
      per-second rate (it is multiplied by 3600). The value stays ``None``
      when the hourly cost or "metrics.result" is missing, falsy or not
      positive.

    Both columns have object dtype so the ``None`` placeholders survive.

    Args:
        df: Input frame. ``None`` or an empty frame is returned as-is.

    Returns:
        A new DataFrame; ``df`` itself is not modified.
    """
    if df is None or df.empty:
        return df

    result_df = df.copy()
    has_throughput = "metrics.result" in result_df.columns

    # Results are collected by position and assigned as whole columns.
    # Writing with .at[label, column] inside the loop is label-based, so on a
    # non-unique index every row sharing a label would be overwritten with
    # the value computed for the last of them.
    hourly_costs = []
    costs_per_million = []
    for _, row in result_df.iterrows():
        hourly_cost = estimate_hourly_cost(row)
        cost_per_million = None

        if hourly_cost and has_throughput and row["metrics.result"]:
            tokens_per_hour = row["metrics.result"] * 3600
            if tokens_per_hour > 0:
                cost_per_million = (hourly_cost / tokens_per_hour) * 1_000_000

        hourly_costs.append(hourly_cost)
        costs_per_million.append(cost_per_million)

    # Reusing the frame's own index keeps the assignment positional, and
    # dtype=object stops pandas from turning None into NaN.
    result_df["hourly_cost"] = pd.Series(
        hourly_costs, index=result_df.index, dtype=object
    )
    result_df["cost_per_million_tokens"] = pd.Series(
        costs_per_million, index=result_df.index, dtype=object
    )

    return result_df
# The return annotation is quoted so it stays valid on interpreters that
# cannot evaluate ``float | None`` at definition time.
def estimate_hourly_cost(row: pd.Series) -> "float | None":
    """Estimate the hourly cost of one benchmark configuration.

    The per-device price is taken from the module-level ``device_costs``
    table, trying in order:

    1. the accelerator name, reduced to its family by normalize_gpu_name();
    2. the accelerator vendor;
    3. DEFAULT_HOURLY_COST.

    That price is multiplied by the accelerator count.

    Args:
        row: A row exposing "system.accelerator.name",
            "system.accelerator.vendor" and "system.accelerator.total_count".

    Returns:
        The estimated hourly cost, or ``None`` when the accelerator count is
        missing, NaN or zero, or when the calculation fails.
    """
    try:
        acc_name = row.get("system.accelerator.name")
        acc_vendor = row.get("system.accelerator.vendor")
        acc_count = row.get("system.accelerator.total_count")

        # A missing count read from a DataFrame is NaN, which is truthy and
        # would otherwise propagate as a NaN cost instead of "no estimate".
        if pd.isna(acc_count) or not acc_count:
            return None

        # Only real strings are looked up; a NaN name or vendor is skipped so
        # a missing cell does not discard the whole row.
        lookup_keys = []
        if isinstance(acc_name, str) and acc_name:
            lookup_keys.append(normalize_gpu_name(acc_name))
        if isinstance(acc_vendor, str) and acc_vendor:
            lookup_keys.append(acc_vendor)

        base_cost = next(
            (device_costs[key] for key in lookup_keys if key in device_costs),
            DEFAULT_HOURLY_COST,
        )

        return base_cost * acc_count
    except Exception as e:
        # Best effort: one malformed row must not abort the whole cost pass.
        logger.warning(f"Error calculating cost: {e}")
        return None