| """ | |
| Cost calculation module for MLPerf configurations. | |
| """ | |
| import logging | |
| import pandas as pd | |
logger = logging.getLogger(__name__)

# Fallback per-accelerator hourly cost, used for any device family that has
# no entry in DEFAULT_DEVICE_COSTS. (The currency unit is not stated in this
# module.)
DEFAULT_HOURLY_COST: float = 1.0

# Default per-accelerator hourly cost, keyed by the normalized device-family
# names that normalize_gpu_name() returns.
DEFAULT_DEVICE_COSTS: dict[str, float] = {
    # NVIDIA data-center parts
    "NVIDIA H100": 3.00,
    "NVIDIA H200": 4.00,
    "NVIDIA GH200": 5.00,
    "NVIDIA B200/GB200": 7.00,
    # AMD Instinct parts
    "AMD MI300X": 3.50,
    "AMD MI325X": 4.50,
    # Workstation / edge parts
    "NVIDIA RTX 4090": 1.20,
    "NVIDIA L40S": 1.80,
    "NVIDIA Jetson AGX": 0.30,
}

# Mutable module state: the costs currently in effect. Rebuilt by
# initialize_device_costs() and modified by update_device_costs().
device_costs: dict[str, float] = {}
# (keyword, canonical family) pairs, tested in order against the upper-cased
# device name. A keyword that contains another keyword must come first
# ("GH200" contains "H200", "GB200" contains "B200"); otherwise the shorter
# keyword wins and, for example, every GH200 system is reported as an H200.
_GPU_FAMILY_KEYWORDS = (
    ("H100", "NVIDIA H100"),
    ("GH200", "NVIDIA GH200"),
    ("GRACE HOPPER", "NVIDIA GH200"),
    ("H200", "NVIDIA H200"),
    ("GB200", "NVIDIA B200/GB200"),
    ("B200", "NVIDIA B200/GB200"),
    ("MI300X", "AMD MI300X"),
    ("MI325X", "AMD MI325X"),
    ("RTX 4090", "NVIDIA RTX 4090"),
    ("L40S", "NVIDIA L40S"),
)


def normalize_gpu_name(name: str) -> str:
    """Map a raw accelerator name onto its canonical device-family name.

    Matching is a case-insensitive substring search, so decorated names such
    as ``"NVIDIA H100-SXM-80GB"`` collapse onto ``"NVIDIA H100"``.

    Args:
        name: Raw accelerator name.

    Returns:
        The canonical family name when a known family is recognised;
        otherwise ``name`` unchanged. Empty, ``None`` and non-string values
        (for example a NaN cell) are also returned unchanged.
    """
    # Non-string values have no .upper(); pass them through the same way
    # empty/None values are passed through.
    if not name or not isinstance(name, str):
        return name

    name_upper = name.upper()

    # Jetson boards are only grouped when they are an Orin or Thor variant.
    if "JETSON" in name_upper and ("ORIN" in name_upper or "THOR" in name_upper):
        return "NVIDIA Jetson AGX"

    for keyword, family in _GPU_FAMILY_KEYWORDS:
        if keyword in name_upper:
            return family

    return name
def initialize_device_costs(df: pd.DataFrame) -> None:
    """Rebuild the module-level ``device_costs`` table from a dataset.

    Every distinct, non-null value of the ``system.accelerator.name`` column
    is normalized to its device family. Each family is then given its entry
    from ``DEFAULT_DEVICE_COSTS``, or ``DEFAULT_HOURLY_COST`` when it has
    none. Any previously stored costs are discarded.

    Args:
        df: Dataset to scan. ``None``, an empty frame, or a frame without the
            accelerator-name column results in an empty cost table.
    """
    global device_costs

    column = "system.accelerator.name"
    usable = df is not None and not df.empty and column in df.columns
    raw_names = df[column].dropna().unique() if usable else ()
    families = {normalize_gpu_name(raw) for raw in raw_names}

    device_costs = {
        family: DEFAULT_DEVICE_COSTS.get(family, DEFAULT_HOURLY_COST)
        for family in families
    }

    logger.info(f"Initialized costs for {len(device_costs)} unique device families")
def get_device_costs() -> dict[str, float]:
    """Return a shallow copy of the current device-cost table.

    Changing the returned dict does not affect the module-level table.
    """
    snapshot = dict(device_costs)
    return snapshot
def update_device_costs(new_costs: dict[str, float]) -> None:
    """Merge ``new_costs`` into the module-level device-cost table.

    Existing entries with the same key are overwritten; all other entries
    are kept.

    Args:
        new_costs: Mapping of device name to hourly cost.
    """
    # The table is mutated in place (never rebound), so no ``global``
    # declaration is required here.
    device_costs.update(new_costs)
    logger.info(f"Updated costs for {len(new_costs)} devices")
def _cost_per_million_tokens(hourly_cost, tokens_per_second):
    """Return the cost of one million tokens, or None when it is not computable.

    ``tokens_per_second`` is scaled by 3600 to get tokens per hour. None is
    returned when either input is missing/zero, when the throughput is not
    positive, or when the throughput is not numeric.
    """
    try:
        if not hourly_cost or not tokens_per_second:
            return None
        tokens_per_hour = tokens_per_second * 3600
        if tokens_per_hour > 0:
            return (hourly_cost / tokens_per_hour) * 1000000
    except (TypeError, ValueError):
        # A non-numeric cell (e.g. "N/A" or pd.NA) must not abort the whole
        # frame; that row simply gets no per-token cost.
        return None
    return None


def calculate_costs(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with cost columns added.

    Two columns are added (or overwritten):

    * ``hourly_cost`` - the result of ``estimate_hourly_cost`` for the row.
    * ``cost_per_million_tokens`` - derived from ``hourly_cost`` and the
      row's ``metrics.result`` value, which is treated as tokens per second.

    Both columns have object dtype and hold ``None`` where no value could be
    computed.

    Args:
        df: Input frame. It is not modified.

    Returns:
        The augmented copy, or ``df`` itself when it is ``None`` or empty.
    """
    if df is None or df.empty:
        return df

    result_df = df.copy()

    hourly_costs = []
    costs_per_million = []
    for _, row in result_df.iterrows():
        hourly_cost = estimate_hourly_cost(row)
        hourly_costs.append(hourly_cost)
        costs_per_million.append(
            _cost_per_million_tokens(hourly_cost, row.get("metrics.result"))
        )

    # Assign by position, not by index label: label-based writes
    # (``.at[idx, col] = value``) hit every row sharing a duplicated label,
    # so such rows would all end up with the last computed value.
    # dtype=object keeps None (rather than NaN) for rows without a value.
    result_df["hourly_cost"] = pd.Series(hourly_costs, dtype=object).to_numpy()
    result_df["cost_per_million_tokens"] = pd.Series(
        costs_per_million, dtype=object
    ).to_numpy()

    return result_df
def _is_missing(value) -> bool:
    """Return True when ``value`` is None or a missing-value marker (NaN, NA, NaT)."""
    if value is None:
        return True
    try:
        return bool(pd.isna(value))
    except (TypeError, ValueError):
        # Non-scalar cell contents have no single truth value under
        # pd.isna(); they are not treated as missing.
        return False


def estimate_hourly_cost(row: pd.Series) -> "float | None":
    """Estimate the total hourly cost of one configuration.

    The per-accelerator cost is looked up in ``device_costs``, first by the
    normalized ``system.accelerator.name``, then by
    ``system.accelerator.vendor``; if neither is found,
    ``DEFAULT_HOURLY_COST`` is used. The result is that cost multiplied by
    ``system.accelerator.total_count``.

    Args:
        row: One configuration; fields are read with ``row.get``.

    Returns:
        The estimated hourly cost, or ``None`` when the accelerator count is
        missing (None/NaN) or zero, or when the calculation fails (the
        failure is logged as a warning).
    """
    try:
        acc_name = row.get("system.accelerator.name")
        acc_vendor = row.get("system.accelerator.vendor")
        acc_count = row.get("system.accelerator.total_count")

        # Missing cells in a DataFrame arrive as NaN, which is truthy, so a
        # plain ``not acc_count`` check would let them through.
        if _is_missing(acc_count) or not acc_count:
            return None

        base_cost = None
        if not _is_missing(acc_name) and acc_name:
            base_cost = device_costs.get(normalize_gpu_name(acc_name))
        if base_cost is None and not _is_missing(acc_vendor) and acc_vendor:
            base_cost = device_costs.get(acc_vendor)
        if base_cost is None:
            base_cost = DEFAULT_HOURLY_COST

        return base_cost * acc_count
    except Exception as e:
        # Deliberately broad: this runs once per row, and one malformed row
        # must not abort cost calculation for the whole dataset.
        logger.warning(f"Error calculating cost: {e}")
        return None