# /// script
# dependencies = [
#     "torch",
#     "numpy",
# ]
# ///
"""Simple utilities for running the models."""
import torch
def to_dtype(dtype_str: str):
"""Convert string to torch dtype."""
if dtype_str == "float16":
return torch.float16
if dtype_str == "bfloat16":
return torch.bfloat16
return torch.float32
def tensor_stats(t: torch.Tensor) -> str:
"""Generate stats string for a tensor."""
return (f"shape={tuple(t.shape)}, "
f"dtype={t.dtype}, "
f"device={t.device}, "
f"mean={t.mean().item():.6f}, "
f"std={t.std().item():.6f}")
def set_seed(seed: int):
"""Set seeds for reproducibility."""
torch.manual_seed(seed)
if torch.cuda.is_available():
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
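

# Minimal usage sketch of the helpers above; the seed, shape, and dtype string
# are arbitrary example values chosen for illustration.
def _demo_simple_utils():
    set_seed(0)
    x = torch.randn(4, 16, dtype=to_dtype("float32"))
    print(tensor_stats(x))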
"""Reusable benchmarking utilities for performance testing."""
import time
import numpy as np
from contextlib import contextmanager
from typing import Callable, Dict, Tuple, Any, Optional
import torch
import json

def precise_timing(func: Callable[..., Any], warmup: int = 5, iters: int = 20,
                   input_generator: Optional[Callable[[int], Any]] = None) -> Tuple[Any, float]:
    """High-precision timing with warmup and optional per-iteration input generation."""
    # Warmup iterations are executed but not timed
    for i in range(warmup):
        if input_generator:
            inputs = input_generator(i)
            func(inputs)
        else:
            func()

    if torch.cuda.is_available():
        torch.cuda.synchronize()
    start = time.perf_counter()

    result = None
    for i in range(iters):
        if input_generator:
            inputs = input_generator(i + warmup)  # Continue the seed sequence after warmup
            result = func(inputs)
        else:
            result = func()

    if torch.cuda.is_available():
        torch.cuda.synchronize()
    end = time.perf_counter()

    avg_time = (end - start) / iters
    return result, avg_time
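

# Minimal usage sketch of `precise_timing`; the matrix sizes and iteration counts
# are arbitrary example values. With an `input_generator`, `func` would instead
# receive a freshly seeded input on each call.
def _demo_precise_timing():
    a = torch.randn(256, 256)
    b = torch.randn(256, 256)
    _, avg = precise_timing(lambda: a @ b, warmup=3, iters=10)
    print(f"matmul avg: {avg * 1000:.3f} ms")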


def memory_usage() -> Dict[str, float]:
    """Get current memory usage in GB."""
    if not torch.cuda.is_available():
        return {"allocated": 0.0, "cached": 0.0, "max_allocated": 0.0}
    return {
        "allocated": torch.cuda.memory_allocated() / 1024**3,
        "cached": torch.cuda.memory_reserved() / 1024**3,
        "max_allocated": torch.cuda.max_memory_allocated() / 1024**3,
    }
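

# Minimal usage sketch of `memory_usage`; on a CPU-only machine this prints all
# zeros, on CUDA it reports allocated/reserved/max-allocated memory in GB.
def _demo_memory_usage():
    stats = memory_usage()
    print(", ".join(f"{k}={v:.3f} GB" for k, v in stats.items()))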


@contextmanager
def bench_context(warmup: int = 10, iters: int = 50, device=None, dtype=None,
                  tokens: int = None, save_json: Optional[str] = None,
                  input_shape: Optional[Tuple] = None, input_seed_base: int = 42):
    """Context manager for benchmarking with comprehensive metrics and optional input generation."""
    def run_benchmark(model_func, *args, **kwargs):
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        mem_before = memory_usage()

        # Create an input generator if input_shape is provided
        input_generator = None
        if input_shape is not None:
            def create_input(iteration: int):
                # Use a deterministic but distinct seed for each iteration
                iteration_seed = input_seed_base + iteration * 123  # Spread out seeds
                torch.manual_seed(iteration_seed)
                if torch.cuda.is_available():
                    torch.cuda.manual_seed(iteration_seed)
                return torch.randn(*input_shape, device=device, dtype=dtype) * 0.1
            input_generator = create_input

        if input_generator:
            result, avg_time = precise_timing(lambda x: model_func(x), warmup, iters, input_generator)
        else:
            result, avg_time = precise_timing(lambda: model_func(*args, **kwargs), warmup, iters)

        mem_after = memory_usage()

        # Calculate metrics
        metrics = {
            "avg_time_ms": avg_time * 1000,
            "throughput_tokens_per_sec": tokens / avg_time if tokens else None,
            "memory_allocated_gb": mem_after["allocated"],
            "memory_cached_gb": mem_after["cached"],
            "memory_increase_gb": mem_after["allocated"] - mem_before["allocated"],
            "device": str(device) if device else "cpu",
            "dtype": str(dtype) if dtype else "float32",
            "tokens": tokens,
            "warmup_iters": warmup,
            "timing_iters": iters,
        }

        # Print results
        print(f"Average time: {metrics['avg_time_ms']:.3f} ms")
        if tokens:
            print(f"Throughput: {metrics['throughput_tokens_per_sec']:.0f} tokens/sec")
        print(f"Memory allocated: {metrics['memory_allocated_gb']:.3f} GB")
        print(f"Memory increase: {metrics['memory_increase_gb']:.3f} GB")

        # Save to JSON if requested
        if save_json:
            with open(save_json, "w") as f:
                json.dump(metrics, f, indent=2)

        return result

    yield run_benchmark
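

# Minimal usage sketch of `bench_context`; the Linear layer, shapes, token count,
# and iteration counts below are arbitrary example values. Passing `input_shape`
# makes each timed call receive a freshly seeded random input, and `tokens` only
# affects the reported throughput figure.
def _demo_bench_context():
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = torch.nn.Linear(512, 512).to(device)
    with bench_context(warmup=2, iters=5, device=device, dtype=torch.float32,
                       tokens=512, input_shape=(8, 512)) as bench:
        bench(model)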