# /// script
# dependencies = [
#     "torch",
#     "numpy",
# ]
# ///
"""Simple utilities for running the models.""" | |
import torch | |
def to_dtype(dtype_str: str): | |
"""Convert string to torch dtype.""" | |
if dtype_str == "float16": | |
return torch.float16 | |
if dtype_str == "bfloat16": | |
return torch.bfloat16 | |
return torch.float32 | |
def tensor_stats(t: torch.Tensor) -> str: | |
"""Generate stats string for a tensor.""" | |
return (f"shape={tuple(t.shape)}, " | |
f"dtype={t.dtype}, " | |
f"device={t.device}, " | |
f"mean={t.mean().item():.6f}, " | |
f"std={t.std().item():.6f}") | |
def set_seed(seed: int): | |
"""Set seeds for reproducibility.""" | |
torch.manual_seed(seed) | |
if torch.cuda.is_available(): | |
torch.cuda.manual_seed(seed) | |
torch.cuda.manual_seed_all(seed) | |
torch.backends.cudnn.deterministic = True | |
torch.backends.cudnn.benchmark = False | |
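
# Usage sketch (illustrative only, not part of the module above; the tensor `x`
# is a hypothetical example, and bfloat16 here assumes a build of torch that
# supports it on the current device):
#
#     set_seed(0)
#     x = torch.randn(2, 8, dtype=to_dtype("bfloat16"))
#     print(tensor_stats(x))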
"""Reusable benchmarking utilities for performance testing.""" | |
import time | |
import numpy as np | |
from contextlib import contextmanager | |
from typing import Callable, Dict, Tuple, Any, Optional | |
import torch | |
import json | |
def precise_timing(func: Callable[[], Any], warmup: int = 5, iters: int = 20, | |
input_generator: Optional[Callable[[int], Any]] = None) -> Tuple[Any, float]: | |
"""High precision timing function with warmup and optional input generation per iteration.""" | |
# Warmup | |
for i in range(warmup): | |
if input_generator: | |
inputs = input_generator(i) | |
func(inputs) | |
else: | |
func() | |
if torch.cuda.is_available(): | |
torch.cuda.synchronize() | |
start = time.perf_counter() | |
result = None | |
for i in range(iters): | |
if input_generator: | |
inputs = input_generator(i + warmup) # Continue seed sequence after warmup | |
result = func(inputs) | |
else: | |
result = func() | |
if torch.cuda.is_available(): | |
torch.cuda.synchronize() | |
end = time.perf_counter() | |
avg_time = (end - start) / iters | |
return result, avg_time | |
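
# Example (a minimal sketch; `w` and `size` are hypothetical locals, not part of
# this module): time a matmul with a fresh deterministic input per iteration.
#
#     size = 1024
#     w = torch.randn(size, size)
#     _, avg = precise_timing(lambda x: x @ w, warmup=3, iters=10,
#                             input_generator=lambda i: torch.randn(size, size))
#     print(f"{avg * 1e3:.2f} ms/iter")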

def memory_usage() -> Dict[str, float]:
    """Get current CUDA memory usage in GB (zeros when CUDA is unavailable)."""
    if not torch.cuda.is_available():
        return {"allocated": 0.0, "cached": 0.0, "max_allocated": 0.0}
    return {
        "allocated": torch.cuda.memory_allocated() / 1024**3,
        "cached": torch.cuda.memory_reserved() / 1024**3,
        "max_allocated": torch.cuda.max_memory_allocated() / 1024**3,
    }

@contextmanager
def bench_context(warmup: int = 10, iters: int = 50, device=None, dtype=None,
                  tokens: Optional[int] = None, save_json: Optional[str] = None,
                  input_shape: Optional[Tuple] = None, input_seed_base: int = 42):
    """Context manager for benchmarking with comprehensive metrics and optional input generation."""
    def run_benchmark(model_func, *args, **kwargs):
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        mem_before = memory_usage()

        # Create an input generator if input_shape is provided
        input_generator = None
        if input_shape is not None:
            def create_input(iteration: int):
                # Use a deterministic but different seed for each iteration
                iteration_seed = input_seed_base + iteration * 123  # Spread out seeds
                torch.manual_seed(iteration_seed)
                if torch.cuda.is_available():
                    torch.cuda.manual_seed(iteration_seed)
                return torch.randn(*input_shape, device=device, dtype=dtype) * 0.1
            input_generator = create_input

        if input_generator:
            result, avg_time = precise_timing(lambda x: model_func(x), warmup, iters, input_generator)
        else:
            result, avg_time = precise_timing(lambda: model_func(*args, **kwargs), warmup, iters)
        mem_after = memory_usage()

        # Calculate metrics
        metrics = {
            "avg_time_ms": avg_time * 1000,
            "throughput_tokens_per_sec": tokens / avg_time if tokens else None,
            "memory_allocated_gb": mem_after["allocated"],
            "memory_cached_gb": mem_after["cached"],
            "memory_increase_gb": mem_after["allocated"] - mem_before["allocated"],
            "device": str(device) if device else "cpu",
            "dtype": str(dtype) if dtype else "float32",
            "tokens": tokens,
            "warmup_iters": warmup,
            "timing_iters": iters,
        }

        # Print results
        print(f"Average time: {metrics['avg_time_ms']:.3f} ms")
        if tokens:
            print(f"Throughput: {metrics['throughput_tokens_per_sec']:.0f} tokens/sec")
        print(f"Memory allocated: {metrics['memory_allocated_gb']:.3f} GB")
        print(f"Memory increase: {metrics['memory_increase_gb']:.3f} GB")

        # Save to JSON if requested
        if save_json:
            with open(save_json, "w") as f:
                json.dump(metrics, f, indent=2)
        return result

    yield run_benchmark
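
# Usage sketch (assumes a callable `model` and the shown sizes; both are
# hypothetical, not defined in this module):
#
#     with bench_context(warmup=10, iters=50, device="cuda", dtype=torch.float16,
#                        tokens=4096, input_shape=(1, 4096, 1024),
#                        save_json="bench.json") as bench:
#         output = bench(model)
#
# With `input_shape` set, `bench` builds a fresh seeded input per iteration and
# calls `model(input)`; without it, positional/keyword args are forwarded
# unchanged, e.g. `bench(model, static_input)`.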