|
|
| """
|
| CUDA-optimized benchmark script for Ursa Minor Smashed model
|
| """
|
|
|
| import torch
|
| import time
|
| import argparse
|
| from inference_cuda import generate_direct, load_model_direct
|
|
|
def benchmark_generation(model, num_runs=5, prompt="The quick brown fox", max_tokens=100):
    """Benchmark text generation throughput on CUDA.

    Calls ``generate_direct`` ``num_runs`` times with fixed sampling settings
    (temperature 0.8, top_k 50, top_p 0.9), prints a tokens/second figure for
    each run, then prints a summary: average time, average token count,
    average and best tokens/second, and the current GPU memory counters.

    Args:
        model: Model object accepted by ``generate_direct``.
        num_runs: Number of timed generations. When it is less than 1 no
            generation is run and no summary is printed.
        prompt: Prompt text passed to every generation.
        max_tokens: Forwarded to ``generate_direct`` as ``max_new_tokens``.

    Returns:
        None. All results are written to stdout.
    """
    # Local import keeps tiktoken optional for callers that only import
    # this module; done once here rather than on every loop iteration.
    import tiktoken

    # NOTE(review): the emoji prefixes were reconstructed from mis-decoded
    # UTF-8 in the source; confirm them against the original file.
    print(f"🚀 Running {num_runs} generation benchmarks on CUDA...")
    print(f"📝 Prompt: '{prompt}'")
    print(f"🎯 Max tokens: {max_tokens}")
    print("-" * 50)

    if num_runs < 1:
        # Avoids dividing by an empty sample in the summary below.
        print("No benchmark runs requested.")
        return

    # Loop-invariant: the encoder and the prompt's token count never change.
    enc = tiktoken.get_encoding("gpt2")
    prompt_tokens = len(enc.encode(prompt))
    cuda_ready = torch.cuda.is_available()

    times = []
    token_counts = []
    rates = []

    for run in range(1, num_runs + 1):
        # flush so the progress text is visible while generation is running.
        print(f"Run {run}/{num_runs}...", end=" ", flush=True)

        # CUDA kernels are launched asynchronously; synchronise on both
        # sides of the timed region so the interval covers the actual work.
        if cuda_ready:
            torch.cuda.synchronize()
        start_time = time.perf_counter()

        result = generate_direct(
            model,
            prompt,
            max_new_tokens=max_tokens,
            temperature=0.8,
            top_k=50,
            top_p=0.9,
        )

        if cuda_ready:
            torch.cuda.synchronize()
        generation_time = time.perf_counter() - start_time

        # Assumes the returned text contains the prompt followed by the
        # continuation, so subtracting the prompt's token count leaves the
        # generated tokens -- TODO confirm against generate_direct.
        generated_tokens = len(enc.encode(result)) - prompt_tokens

        tokens_per_second = (
            generated_tokens / generation_time if generation_time > 0 else 0.0
        )

        times.append(generation_time)
        token_counts.append(generated_tokens)
        rates.append(tokens_per_second)

        print(
            f"⚡ {tokens_per_second:.1f} tokens/sec "
            f"({generation_time:.2f}s, {generated_tokens} tokens)"
        )

    avg_time = sum(times) / len(times)
    avg_tokens = sum(token_counts) / len(token_counts)
    avg_tokens_per_sec = avg_tokens / avg_time if avg_time > 0 else 0.0

    print("\n📊 CUDA Benchmark Results:")
    print("-" * 30)
    print(f"Average generation time: {avg_time:.2f} seconds")
    print(f"Average tokens generated: {avg_tokens:.1f}")
    print(f"Average tokens/second: {avg_tokens_per_sec:.1f}")
    print(f"Best tokens/second: {max(rates):.1f}")
    print(f"GPU Memory Usage: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
    print(f"GPU Memory Cached: {torch.cuda.memory_reserved() / 1024**3:.2f} GB")
|
|
|
def benchmark_memory_usage(model):
    """Report GPU memory growth while generating outputs of several lengths.

    Prints the allocated GPU memory after emptying the cache, then for each
    of 50, 100, 200 and 500 new tokens runs one generation and prints how far
    the peak allocation during that generation rose above the allocation
    measured just before it.

    Intended to be called only when a CUDA device is available (``main``
    checks this before running any benchmark).

    Args:
        model: Model object accepted by ``generate_direct``.

    Returns:
        None. All results are written to stdout.
    """
    # NOTE(review): the emoji prefix was reconstructed from mis-decoded
    # UTF-8 in the source; confirm it against the original file.
    print("\n🧠 CUDA Memory Usage Analysis:")
    print("-" * 30)

    torch.cuda.empty_cache()
    baseline_memory = torch.cuda.memory_allocated()
    print(f"Baseline GPU memory: {baseline_memory / 1024**3:.3f} GB")

    # The same prompt is used for every length, so build it once.
    prompt = "Test prompt for memory benchmark " * 5
    test_lengths = (50, 100, 200, 500)

    for length in test_lengths:
        torch.cuda.empty_cache()
        # Restart peak tracking so max_memory_allocated() below reflects
        # only this generation rather than any earlier, larger one.
        torch.cuda.reset_peak_memory_stats()
        start_memory = torch.cuda.memory_allocated()

        generate_direct(
            model,
            prompt,
            max_new_tokens=length,
            temperature=0.8,
        )

        # Wait for outstanding kernels before reading allocator statistics.
        torch.cuda.synchronize()
        # memory_allocated() here would only show what is still held after
        # generation; the allocator's high-water mark is the real peak.
        peak_memory = torch.cuda.max_memory_allocated()
        memory_increase = peak_memory - start_memory

        print(
            f"Tokens {length:3d}: +{memory_increase / 1024**2:.1f} MB "
            f"(Peak: {peak_memory / 1024**3:.3f} GB)"
        )
|
|
|
def benchmark_different_parameters(model):
    """Compare generation throughput across several sampling presets on CUDA.

    Runs one 100-token generation per preset (Conservative, Balanced,
    Creative, High Top-K), each with its own temperature / top_k / top_p,
    and prints the tokens/second achieved by each.

    Args:
        model: Model object accepted by ``generate_direct``.

    Returns:
        None. All results are written to stdout.
    """
    # Imported once here instead of inside the per-preset loop.
    import tiktoken

    # NOTE(review): the emoji prefixes were reconstructed from mis-decoded
    # UTF-8 in the source; confirm them against the original file.
    print("\n⚙️ CUDA Parameter Performance Comparison:")
    print("-" * 40)

    prompt = "Artificial intelligence is revolutionizing"
    base_params = {"max_new_tokens": 100}

    test_configs = [
        {"name": "Conservative", "temperature": 0.3, "top_k": 20, "top_p": 0.8},
        {"name": "Balanced", "temperature": 0.7, "top_k": 50, "top_p": 0.9},
        {"name": "Creative", "temperature": 1.0, "top_k": 100, "top_p": 0.95},
        {"name": "High Top-K", "temperature": 0.8, "top_k": 200, "top_p": 0.9},
    ]

    # Loop-invariant: the encoder and the prompt's token count never change.
    enc = tiktoken.get_encoding("gpt2")
    prompt_tokens = len(enc.encode(prompt))
    cuda_ready = torch.cuda.is_available()

    for config in test_configs:
        # Everything except the display name is a generate_direct keyword.
        sampling = {key: value for key, value in config.items() if key != "name"}
        params = {**base_params, **sampling}

        # flush so the label is visible while generation is running.
        print(f"\n{config['name']} settings:", end=" ", flush=True)

        # CUDA kernels are launched asynchronously; synchronise on both
        # sides of the timed region so the interval covers the actual work.
        if cuda_ready:
            torch.cuda.synchronize()
        start_time = time.perf_counter()
        result = generate_direct(model, prompt, **params)
        if cuda_ready:
            torch.cuda.synchronize()
        elapsed = time.perf_counter() - start_time

        # Assumes the returned text contains the prompt followed by the
        # continuation -- TODO confirm against generate_direct.
        generated_tokens = len(enc.encode(result)) - prompt_tokens

        tokens_per_sec = generated_tokens / elapsed if elapsed > 0 else 0.0
        print(f"⚡ {tokens_per_sec:.1f} tokens/sec")
|
|
|
def main():
    """Parse command-line options and run the selected CUDA benchmarks.

    Options:
        --model        Path to the model checkpoint (default
                       ``model_optimized.pt``).
        --runs         Number of generation benchmark runs (default 5).
        --max-tokens   Maximum tokens to generate per run (default 100).
        --prompt       Prompt used for the generation benchmark.
        --memory-test  Also run the memory usage benchmark.
        --param-test   Also run the sampling-parameter comparison.

    Prints an error and returns without benchmarking when CUDA is not
    available.

    Returns:
        None.
    """
    parser = argparse.ArgumentParser(
        description="Benchmark Ursa Minor Smashed model on CUDA"
    )
    parser.add_argument("--model", type=str, default="model_optimized.pt",
                        help="Path to model checkpoint")
    parser.add_argument("--runs", type=int, default=5,
                        help="Number of benchmark runs")
    parser.add_argument("--max-tokens", type=int, default=100,
                        help="Maximum tokens to generate")
    parser.add_argument("--prompt", type=str,
                        default="The future of artificial intelligence",
                        help="Prompt for benchmarking")
    parser.add_argument("--memory-test", action="store_true",
                        help="Run memory usage tests")
    parser.add_argument("--param-test", action="store_true",
                        help="Test different parameters")

    args = parser.parse_args()

    # NOTE(review): the emoji prefixes in this function were reconstructed
    # from mis-decoded UTF-8 in the source; confirm them against the
    # original file.
    if not torch.cuda.is_available():
        print("❌ ERROR: CUDA is not available. Use benchmark_cpu.py for CPU benchmarking.")
        return

    print("🔥 CUDA Benchmark for Ursa Minor Smashed")
    print("=" * 50)
    print(f"GPU: {torch.cuda.get_device_name()}")
    print(f"CUDA Version: {torch.version.cuda}")
    print(f"PyTorch Version: {torch.__version__}")
    print()

    print("Loading model on CUDA...")
    model = load_model_direct(args.model)
    # This literal was split across two lines in the source (a stray line
    # break inside the mis-decoded emoji), which is a syntax error.
    print("✅ Model loaded!")

    # The throughput benchmark always runs; the other two are opt-in.
    benchmark_generation(model, args.runs, args.prompt, args.max_tokens)

    if args.memory_test:
        benchmark_memory_usage(model)

    if args.param_test:
        benchmark_different_parameters(model)

    print("\n🎉 CUDA Benchmarking complete!")
|
|
|
# Script entry point: run the benchmark CLI only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()