# facedet/evaluation/speed_benchmark.py
"""
Speed Benchmark: measure inference speed across hardware and configurations.
Reports:
- Latency (ms per frame) at p50/p95/p99
- Throughput (FPS)
- GPU memory usage
- Comparison across input resolutions and model variants
"""
import time
import numpy as np
from typing import Dict, Optional, List
from dataclasses import dataclass
import torch
@dataclass
class BenchmarkResult:
"""Single benchmark measurement."""
model_name: str
input_size: int
device: str
batch_size: int
latency_p50_ms: float
latency_p95_ms: float
latency_p99_ms: float
fps: float
gpu_mem_mb: float
num_params_m: float
gflops: float
class SpeedBenchmark:
"""
Inference speed benchmark for face detection models.
Usage:
bench = SpeedBenchmark(device='cuda')
result = bench.benchmark_model(model, 'scrfd_34g', input_size=640)
bench.print_results()
"""
def __init__(self, device: str = 'cuda', warmup_iters: int = 50,
benchmark_iters: int = 200):
self.device = device
self.warmup_iters = warmup_iters
self.benchmark_iters = benchmark_iters
self.results: List[BenchmarkResult] = []
@torch.no_grad()
def benchmark_model(self, model: torch.nn.Module, model_name: str,
input_size: int = 640, batch_size: int = 1) -> BenchmarkResult:
"""
Benchmark a model's inference speed.
Args:
model: PyTorch model in eval mode
model_name: Name for reporting
input_size: Input image resolution
batch_size: Batch size for benchmarking
Returns:
BenchmarkResult with timing statistics
"""
model = model.to(self.device).eval()
dummy_input = torch.randn(batch_size, 3, input_size, input_size,
device=self.device)
# Count parameters
num_params = sum(p.numel() for p in model.parameters()) / 1e6
        # Estimate GFLOPs (via torch.utils.flop_counter, if available)
gflops = self._estimate_flops(model, dummy_input)
        # Reset peak-memory stats so the reading reflects only this run
        # (startswith also covers 'cuda:0'-style device strings)
        if self.device.startswith('cuda'):
            torch.cuda.reset_peak_memory_stats()
            torch.cuda.synchronize()
# Warmup
for _ in range(self.warmup_iters):
_ = model(dummy_input)
        if self.device.startswith('cuda'):
            torch.cuda.synchronize()  # let warmup kernels finish before timing
# Benchmark
        latencies = []
        for _ in range(self.benchmark_iters):
            if self.device.startswith('cuda'):
                torch.cuda.synchronize()  # drain queued work before starting the clock
            t0 = time.perf_counter()
            _ = model(dummy_input)
            if self.device.startswith('cuda'):
                torch.cuda.synchronize()  # wait for the forward pass to complete
            latencies.append((time.perf_counter() - t0) * 1000)  # ms
latencies = np.array(latencies)
        gpu_mem = 0.0
        if self.device.startswith('cuda'):
            gpu_mem = torch.cuda.max_memory_allocated() / 1e6  # bytes -> MB
result = BenchmarkResult(
model_name=model_name,
input_size=input_size,
device=self.device,
batch_size=batch_size,
latency_p50_ms=np.percentile(latencies, 50),
latency_p95_ms=np.percentile(latencies, 95),
latency_p99_ms=np.percentile(latencies, 99),
fps=1000 / np.mean(latencies) * batch_size,
gpu_mem_mb=gpu_mem,
num_params_m=num_params,
gflops=gflops,
)
self.results.append(result)
return result
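
    def benchmark_resolutions(self, model: torch.nn.Module, model_name: str,
                              input_sizes: Optional[List[int]] = None,
                              batch_size: int = 1) -> List[BenchmarkResult]:
        """Sweep a model across input resolutions.

        A convenience sketch for the resolution comparison promised in the
        module docstring; the method name and default size list are
        assumptions, not part of the original API.
        """
        if input_sizes is None:
            input_sizes = [320, 480, 640, 1280]
        return [self.benchmark_model(model, model_name, input_size=size,
                                     batch_size=batch_size)
                for size in input_sizes]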
def _estimate_flops(self, model: torch.nn.Module,
dummy_input: torch.Tensor) -> float:
"""Estimate GFLOPs (approximate)."""
try:
from torch.utils.flop_counter import FlopCounterMode
flop_counter = FlopCounterMode(display=False)
with flop_counter:
model(dummy_input)
return flop_counter.get_total_flops() / 1e9
        except Exception:  # also covers ImportError on older PyTorch
            return 0.0
def print_results(self):
"""Print formatted benchmark results table."""
if not self.results:
print("No benchmark results yet.")
return
header = (f"{'Model':<15} {'Size':<6} {'Device':<8} {'BS':<4} "
f"{'P50(ms)':<9} {'P95(ms)':<9} {'P99(ms)':<9} "
f"{'FPS':<8} {'Mem(MB)':<10} {'Params(M)':<10} {'GFLOPs':<8}")
print("=" * len(header))
print("Speed Benchmark Results")
print("=" * len(header))
print(header)
print("-" * len(header))
for r in self.results:
print(f"{r.model_name:<15} {r.input_size:<6} {r.device:<8} {r.batch_size:<4} "
f"{r.latency_p50_ms:<9.2f} {r.latency_p95_ms:<9.2f} {r.latency_p99_ms:<9.2f} "
f"{r.fps:<8.1f} {r.gpu_mem_mb:<10.1f} {r.num_params_m:<10.2f} {r.gflops:<8.2f}")
print("=" * len(header))
def to_markdown(self) -> str:
"""Generate Markdown benchmark table."""
lines = [
"| Model | Input | Device | BS | P50 (ms) | P95 (ms) | FPS | GPU Mem (MB) | Params (M) | GFLOPs |",
"|-------|-------|--------|----|---------:|---------:|----:|-------------:|-----------:|-------:|",
]
for r in self.results:
lines.append(
f"| {r.model_name} | {r.input_size} | {r.device} | {r.batch_size} | "
f"{r.latency_p50_ms:.2f} | {r.latency_p95_ms:.2f} | {r.fps:.1f} | "
f"{r.gpu_mem_mb:.1f} | {r.num_params_m:.2f} | {r.gflops:.2f} |"
)
return '\n'.join(lines)
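

# Minimal smoke-test sketch (not part of the original file): exercises the
# harness end to end with a tiny stand-in CNN. `TinyConvNet` is a
# hypothetical placeholder, not a real face detector.
if __name__ == '__main__':
    class TinyConvNet(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.body = torch.nn.Sequential(
                torch.nn.Conv2d(3, 16, 3, stride=2, padding=1),
                torch.nn.ReLU(),
                torch.nn.Conv2d(16, 32, 3, stride=2, padding=1),
                torch.nn.ReLU(),
            )

        def forward(self, x):
            return self.body(x)

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    bench = SpeedBenchmark(device=device, warmup_iters=10, benchmark_iters=50)
    bench.benchmark_model(TinyConvNet(), 'tiny_convnet', input_size=320)
    bench.print_results()
    print()
    print(bench.to_markdown())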