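"""Benchmark the inference efficiency (latency / FPS) of an EBC model.

Feeds a random, normalized dummy batch through the model for a number of
timed iterations and reports the average latency and throughput.
"""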
from argparse import ArgumentParser
import time
import os
import torch
import torchvision.transforms as transforms
from contextlib import nullcontext
import json
from models import get_model


parser = ArgumentParser(description="Benchmark the inference efficiency of an EBC model.")
parser.add_argument("--model_info_path", type=str, required=True, help="Path to the model information file.")

parser.add_argument("--batch_size", type=int, default=1, help="Batch size for the model.")
parser.add_argument("--height", type=int, default=768, help="Height of the input image.")
parser.add_argument("--width", type=int, default=1024, help="Width of the input image.")

parser.add_argument("--num_iterations", type=int, default=200, help="Number of iterations to run the model.")
parser.add_argument("--num_warmup", type=int, default=20, help="Number of warm-up iterations discarded before timing.")

parser.add_argument("--device", type=str, required=True, choices=["cpu", "cuda", "mps"], help="Device to run the model on: 'cpu', 'cuda', or 'mps'.")
parser.add_argument("--amp", action="store_true", help="Enable autocast mixed precision (fp16/bf16).")
parser.add_argument("--half", action="store_true", help="Use half precision for the model.")
parser.add_argument("--channels_last", action="store_true", help="Use NHWC memory format (recommended for CUDA).")
parser.add_argument("--compile", action="store_true", help="Enable torch.compile if available.")
parser.add_argument("--threads", type=int, default=None, help="torch.set_num_threads(threads) for CPU")
parser.add_argument("--sleep_time", type=float, default=0.0, help="Seconds to sleep after *each* iteration (cool-down).")

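# Standard ImageNet normalization statistics, applied to the random dummy input.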
_normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])


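# Build a random, normalized dummy batch in the requested precision and layout.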
def _dummy_input(bs, h, w, device, half, channels_last):
    x = torch.rand(bs, 3, h, w, device=device)
    x = _normalize(x)
    if half:
        x = x.half()
    if channels_last:
        x = x.to(memory_format=torch.channels_last)
    return x


def _maybe_sync(dev):
    # Block until all queued kernels finish so wall-clock timings are meaningful.
    if dev.type == "cuda":
        torch.cuda.synchronize()
    elif dev.type == "mps":
        torch.mps.synchronize()


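# inference_mode() disables autograd tracking entirely, which is cheaper than
# no_grad() and appropriate here since we never backpropagate.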
@torch.inference_mode()
def benchmark(
    model: torch.nn.Module,
    inp: torch.Tensor,
    warmup: int,
    steps: int,
    amp: bool,
    sleep_time: float = 0.0
):
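    # A single autocast context is reused for every forward pass; nullcontext()
    # keeps the same code path as a no-op when mixed precision is disabled.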
    cm = torch.autocast(device_type=inp.device.type) if amp else nullcontext()

    # --- warm-up ---
    for _ in range(warmup):
        with cm:
            _ = model(inp)
    _maybe_sync(inp.device)

    # --- timed loop ---
    total_time = 0.0
    for _ in range(steps):
        tic = time.perf_counter()
        with cm:
            _ = model(inp)
        _maybe_sync(inp.device)  # wait for queued kernels before stopping the clock
        toc = time.perf_counter()
        total_time += toc - tic

        if sleep_time > 0:
            time.sleep(sleep_time)

    # Throughput in frames per second (accounts for the batch size of the input).
    fps = steps * inp.shape[0] / total_time
    return fps, total_time / steps


def main(args):
    assert os.path.isfile(args.model_info_path), \
        f"{args.model_info_path} not found"

    # Thread counts must be configured before any parallel work runs, otherwise
    # torch.set_num_interop_threads() raises a RuntimeError.
    if args.threads:
        torch.set_num_threads(args.threads)
        torch.set_num_interop_threads(1)

    model = get_model(model_info_path=args.model_info_path)
    model.eval()

    if args.channels_last:
        model = model.to(memory_format=torch.channels_last)
    if args.half:
        model = model.half()

    device = torch.device(args.device)
    model = model.to(device)

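    # "reduce-overhead" mode suits this benchmark's repeated, fixed-shape
    # forward pass (on CUDA it can capture the model into CUDA graphs).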
    if args.compile and hasattr(torch, "compile"):
        model = torch.compile(model, mode="reduce-overhead")

    inp = _dummy_input(
        args.batch_size,
        args.height,
        args.width,
        device,
        args.half,
        args.channels_last
    )

    fps, t_avg = benchmark(
        model,
        inp,
        warmup=args.num_warmup,
        steps=args.num_iterations,
        amp=args.amp,
        sleep_time=args.sleep_time
    )

    cfg = vars(args)
    cfg.pop("model_info_path")
    print(json.dumps(cfg, indent=2))
    print(f"\nAverage latency: {t_avg*1000:6.2f} ms  |  FPS: {fps:,.2f}")


if __name__ == "__main__":
    main(parser.parse_args())


# CUDA @FP16 + channels_last + torch.compile
# python efficiency.py \
#   --model_info_path checkpoints/shb/ebc_p/best_mae.pth \
#   --device cuda --half --amp --channels_last --compile

# CUDA @AMP + channels_last + torch.compile
# python efficiency.py \
#   --model_info_path checkpoints/shb/ebc_p/best_mae.pth \
#   --device cuda --amp --channels_last --compile

# CUDA @FP32 + channels_last + torch.compile
# python efficiency.py \
#   --model_info_path checkpoints/shb/ebc_p/best_mae.pth \
#   --device cuda --channels_last --compile

# AMD Ryzen 9 5900X (12 cores) + channels_last + torch.compile
# export OMP_NUM_THREADS=12; export MKL_NUM_THREADS=12
# python efficiency.py \
#   --model_info_path checkpoints/shb/ebc_p/best_mae.pth \
#   --device cpu --threads 12 --channels_last --compile

# Apple M1 Pro CPU (6 performance cores); torch.compile makes it slower here.
# export OMP_NUM_THREADS=6; export VECLIB_MAXIMUM_THREADS=6
# python efficiency.py \
#   --model_info_path checkpoints/shb/ebc_p/best_mae.pth \
#   --device cpu --threads 6

# Apple M1 Pro MPS @FP32 + torch.compile
# python efficiency.py \
#   --model_info_path checkpoints/shb/ebc_p/best_mae.pth \
#   --device mps --channels_last --compile
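
# The script prints the run configuration as JSON, followed by a summary line
# formatted like this (numbers are placeholders, not measured results):
#   Average latency:  12.34 ms  |  FPS: 81.04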