# /// script
# dependencies = [
#     "torch",
#     "numpy",
# ]
# ///
"""Simple utilities for running the models.""" | |
import torch | |
def to_dtype(dtype_str: str): | |
"""Convert string to torch dtype.""" | |
if dtype_str == "float16": | |
return torch.float16 | |
if dtype_str == "bfloat16": | |
return torch.bfloat16 | |
return torch.float32 | |
def tensor_stats(t: torch.Tensor) -> str: | |
"""Generate stats string for a tensor.""" | |
return (f"shape={tuple(t.shape)}, " | |
f"dtype={t.dtype}, " | |
f"device={t.device}, " | |
f"mean={t.mean().item():.6f}, " | |
f"std={t.std().item():.6f}") | |
def set_seed(seed: int): | |
"""Set seeds for reproducibility.""" | |
torch.manual_seed(seed) | |
if torch.cuda.is_available(): | |
torch.cuda.manual_seed(seed) | |
torch.cuda.manual_seed_all(seed) | |
torch.backends.cudnn.deterministic = True | |
torch.backends.cudnn.benchmark = False | |
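
# Usage sketch (illustrative only, not part of the module above; the tensor `x`
# is a hypothetical example, and bfloat16 here assumes a build of torch that
# supports it on the current device):
#
#     set_seed(0)
#     x = torch.randn(2, 8, dtype=to_dtype("bfloat16"))
#     print(tensor_stats(x))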
"""Reusable benchmarking utilities for performance testing.""" | |
import time | |
import numpy as np | |
from contextlib import contextmanager | |
from typing import Callable, Dict, Tuple, Any, Optional | |
import torch | |
import json | |
def precise_timing(func: Callable[[], Any], warmup: int = 5, iters: int = 20, | |
input_generator: Optional[Callable[[int], Any]] = None) -> Tuple[Any, float]: | |
"""High precision timing function with warmup and optional input generation per iteration.""" | |
# Warmup | |
for i in range(warmup): | |
if input_generator: | |
inputs = input_generator(i) | |
func(inputs) | |
else: | |
func() | |
if torch.cuda.is_available(): | |
torch.cuda.synchronize() | |
start = time.perf_counter() | |
result = None | |
for i in range(iters): | |
if input_generator: | |
inputs = input_generator(i + warmup) # Continue seed sequence after warmup | |
result = func(inputs) | |
else: | |
result = func() | |
if torch.cuda.is_available(): | |
torch.cuda.synchronize() | |
end = time.perf_counter() | |
avg_time = (end - start) / iters | |
return result, avg_time | |
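
# Example (a minimal sketch; `w` and `size` are hypothetical locals, not part of
# this module): time a matmul with a fresh deterministic input per iteration.
#
#     size = 1024
#     w = torch.randn(size, size)
#     _, avg = precise_timing(lambda x: x @ w, warmup=3, iters=10,
#                             input_generator=lambda i: torch.randn(size, size))
#     print(f"{avg * 1e3:.2f} ms/iter")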

def memory_usage() -> Dict[str, float]:
    """Get current CUDA memory usage in GB (zeros when CUDA is unavailable)."""
    if not torch.cuda.is_available():
        return {"allocated": 0.0, "cached": 0.0, "max_allocated": 0.0}
    return {
        "allocated": torch.cuda.memory_allocated() / 1024**3,
        "cached": torch.cuda.memory_reserved() / 1024**3,
        "max_allocated": torch.cuda.max_memory_allocated() / 1024**3,
    }

@contextmanager
def bench_context(warmup: int = 10, iters: int = 50, device=None, dtype=None,
                  tokens: Optional[int] = None, save_json: Optional[str] = None,
                  input_shape: Optional[Tuple] = None, input_seed_base: int = 42):
    """Context manager for benchmarking with comprehensive metrics and optional input generation."""
    def run_benchmark(model_func, *args, **kwargs):
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        mem_before = memory_usage()

        # Create an input generator if input_shape is provided
        input_generator = None
        if input_shape is not None:
            def create_input(iteration: int):
                # Use a deterministic but different seed for each iteration
                iteration_seed = input_seed_base + iteration * 123  # Spread out seeds
                torch.manual_seed(iteration_seed)
                if torch.cuda.is_available():
                    torch.cuda.manual_seed(iteration_seed)
                return torch.randn(*input_shape, device=device, dtype=dtype) * 0.1
            input_generator = create_input

        if input_generator:
            result, avg_time = precise_timing(lambda x: model_func(x), warmup, iters, input_generator)
        else:
            result, avg_time = precise_timing(lambda: model_func(*args, **kwargs), warmup, iters)
        mem_after = memory_usage()

        # Calculate metrics
        metrics = {
            "avg_time_ms": avg_time * 1000,
            "throughput_tokens_per_sec": tokens / avg_time if tokens else None,
            "memory_allocated_gb": mem_after["allocated"],
            "memory_cached_gb": mem_after["cached"],
            "memory_increase_gb": mem_after["allocated"] - mem_before["allocated"],
            "device": str(device) if device else "cpu",
            "dtype": str(dtype) if dtype else "float32",
            "tokens": tokens,
            "warmup_iters": warmup,
            "timing_iters": iters,
        }

        # Print results
        print(f"Average time: {metrics['avg_time_ms']:.3f} ms")
        if tokens:
            print(f"Throughput: {metrics['throughput_tokens_per_sec']:.0f} tokens/sec")
        print(f"Memory allocated: {metrics['memory_allocated_gb']:.3f} GB")
        print(f"Memory increase: {metrics['memory_increase_gb']:.3f} GB")

        # Save to JSON if requested
        if save_json:
            with open(save_json, "w") as f:
                json.dump(metrics, f, indent=2)
        return result

    yield run_benchmark
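
# Usage sketch (assumes a callable `model` and the shown sizes; both are
# hypothetical, not defined in this module):
#
#     with bench_context(warmup=10, iters=50, device="cuda", dtype=torch.float16,
#                        tokens=4096, input_shape=(1, 4096, 1024),
#                        save_json="bench.json") as bench:
#         output = bench(model)
#
# With `input_shape` set, `bench` builds a fresh seeded input per iteration and
# calls `model(input)`; without it, positional/keyword args are forwarded
# unchanged, e.g. `bench(model, static_input)`.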