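"""Benchmark the inference efficiency of an EBC model.

Measures average forward-pass latency and FPS on a dummy input. Supports
CPU, CUDA, and MPS, with optional half precision, autocast (AMP),
channels_last memory format, and torch.compile.
"""
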
from argparse import ArgumentParser
from contextlib import nullcontext
import json
import os
import time

import torch
import torchvision.transforms as transforms

from models import get_model

parser = ArgumentParser(description="Benchmark an EBC model.")
parser.add_argument("--model_info_path", type=str, required=True, help="Path to the model information file.")
parser.add_argument("--batch_size", type=int, default=1, help="Batch size for the model.")
parser.add_argument("--height", type=int, default=768, help="Height of the input image.")
parser.add_argument("--width", type=int, default=1024, help="Width of the input image.")
parser.add_argument("--num_iterations", type=int, default=200, help="Number of iterations to run the model.")
parser.add_argument("--num_warmup", type=int, default=20, help="Dispose of the first N iterations.")
parser.add_argument("--device", type=str, choices=["cpu", "cuda", "mps"], help="Device to run the model on. Options are 'cpu', 'cuda', or 'mps'.")
parser.add_argument("--amp", action="store_true", help="Enable autocast mixed precision (fp16/bf16).")
parser.add_argument("--half", action="store_true", help="Use half precision for the model.")
parser.add_argument("--channels_last", action="store_true", help="Use NHWC memory format (recommended for CUDA).")
parser.add_argument("--compile", action="store_true", help="Enable torch.compile if available.")
parser.add_argument("--threads", type=int, default=None, help="torch.set_num_threads(threads) for CPU")
parser.add_argument("--sleep_time", type=float, default=0.0, help="Seconds to sleep after *each* iteration (cool-down).")
_normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])


def _dummy_input(bs, h, w, device, half, channels_last):
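    """Create a random, ImageNet-normalized batch in the requested precision/layout."""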
    x = torch.rand(bs, 3, h, w, device=device)
    x = _normalize(x)
    if half:
        x = x.half()
    if channels_last:
        x = x.to(memory_format=torch.channels_last)
    return x


def _maybe_sync(dev):
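    """Block until all queued work on an asynchronous device (CUDA/MPS) has finished."""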
    if dev.type == "cuda":
        torch.cuda.synchronize()
    # MPS is also asynchronous; torch.mps.synchronize() exists in recent PyTorch builds.
    elif dev.type == "mps" and hasattr(torch, "mps"):
        torch.mps.synchronize()


@torch.inference_mode()
def benchmark(
    model: torch.nn.Module,
    inp: torch.Tensor,
    warmup: int,
    steps: int,
    amp: bool,
    sleep_time: float = 0.0,
):
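    """Run warm-up then timed forward passes; return (FPS, mean latency in seconds).

    FPS counts forward passes per second; multiply by the batch size for images/s.
    """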
    # Reuse one autocast context if AMP is requested; nullcontext is a no-op.
    cm = torch.autocast(device_type=inp.device.type) if amp else nullcontext()

    # --- warm-up ---
    for _ in range(warmup):
        with cm:
            _ = model(inp)
    _maybe_sync(inp.device)

    # --- timed loop ---
    total_time = 0.0
    for _ in range(steps):
        tic = time.perf_counter()
        with cm:
            _ = model(inp)
        # Synchronize before stopping the clock so asynchronous devices
        # (CUDA/MPS) report real execution time, not just launch overhead.
        _maybe_sync(inp.device)
        toc = time.perf_counter()
        total_time += toc - tic
        if sleep_time > 0:
            time.sleep(sleep_time)

    fps = steps / total_time
    return fps, total_time / steps


def main(args):
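    """Build the model from its info file and benchmark it on a dummy input."""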
    assert os.path.isfile(args.model_info_path), \
        f"{args.model_info_path} not found"
    model = get_model(model_info_path=args.model_info_path)
    model.eval()

    if args.channels_last:
        model = model.to(memory_format=torch.channels_last)
    if args.half:
        model = model.half()

    device = torch.device(args.device)
    model = model.to(device)

    if args.compile and hasattr(torch, "compile"):
        model = torch.compile(model, mode="reduce-overhead")

    if args.threads:
        # Pin intra-op threads; a single inter-op thread avoids oversubscription on CPU.
        torch.set_num_threads(args.threads)
        torch.set_num_interop_threads(1)

    inp = _dummy_input(
        args.batch_size,
        args.height,
        args.width,
        device,
        args.half,
        args.channels_last,
    )

    fps, t_avg = benchmark(
        model,
        inp,
        warmup=args.num_warmup,
        steps=args.num_iterations,
        amp=args.amp,
        sleep_time=args.sleep_time,
    )

    # Echo the benchmark configuration (minus the checkpoint path), then the results.
    cfg = vars(args)
    cfg.pop("model_info_path")
    print(json.dumps(cfg, indent=2))
    print(f"\nAverage latency: {t_avg*1000:6.2f} ms | FPS: {fps:,.2f}")


if __name__ == "__main__":
    main(parser.parse_args())


# ---------------------------------------------------------------------------
# Example invocations
# ---------------------------------------------------------------------------

# CUDA @FP16 + channels_last + torch.compile
# python efficiency.py \
#     --model_info_path checkpoints/shb/ebc_p/best_mae.pth \
#     --device cuda --half --amp --channels_last --compile

# CUDA @AMP + channels_last + torch.compile
# python efficiency.py \
#     --model_info_path checkpoints/shb/ebc_p/best_mae.pth \
#     --device cuda --amp --channels_last --compile

# CUDA @FP32 + channels_last + torch.compile
# python efficiency.py \
#     --model_info_path checkpoints/shb/ebc_p/best_mae.pth \
#     --device cuda --channels_last --compile

# AMD 5900X (12 cores) + channels_last + torch.compile
# export OMP_NUM_THREADS=12; export MKL_NUM_THREADS=12
# python efficiency.py \
#     --model_info_path checkpoints/shb/ebc_p/best_mae.pth \
#     --device cpu --threads 12 --channels_last --compile

# Apple M1 Pro (6 performance cores). torch.compile makes it slower here.
# export OMP_NUM_THREADS=6; export VECLIB_MAXIMUM_THREADS=6
# python efficiency.py \
#     --model_info_path checkpoints/shb/ebc_p/best_mae.pth \
#     --device cpu --threads 6

# Apple M1 Pro MPS @FP32 + torch.compile
# python efficiency.py \
#     --model_info_path checkpoints/shb/ebc_p/best_mae.pth \
#     --device mps --channels_last --compile