"""Benchmark an IOPaint inpainting model.

Measures per-run latency, process memory (RSS), and GPU memory usage.
"""
import argparse
import os
import time

import numpy as np
import nvidia_smi  # provided by the nvidia-ml-py3 package
import psutil
import torch

from iopaint.model_manager import ModelManager
from iopaint.schema import InpaintRequest, HDStrategy, SDSampler

# Disable torch's JIT kernel fusers; wrapped in try/except because these
# private hooks are not present in every torch build.
try:
    torch._C._jit_override_can_fuse_on_cpu(False)
    torch._C._jit_override_can_fuse_on_gpu(False)
    torch._C._jit_set_texpr_fuser_enabled(False)
    torch._C._jit_set_nvfuser_enabled(False)
except Exception:
    pass

# Cap the thread pools used by BLAS/OpenMP/NumExpr backends.
NUM_THREADS = str(4)

os.environ["OMP_NUM_THREADS"] = NUM_THREADS
os.environ["OPENBLAS_NUM_THREADS"] = NUM_THREADS
os.environ["MKL_NUM_THREADS"] = NUM_THREADS
os.environ["VECLIB_MAXIMUM_THREADS"] = NUM_THREADS
os.environ["NUMEXPR_NUM_THREADS"] = NUM_THREADS
if os.environ.get("CACHE_DIR"):
    os.environ["TORCH_HOME"] = os.environ["CACHE_DIR"]


def run_model(model, size):
    # Synthetic inputs: a random RGB image plus a random single-channel mask.
    image = np.random.randint(0, 256, (size[0], size[1], 3)).astype(np.uint8)
    mask = np.random.randint(0, 255, size).astype(np.uint8)

    config = InpaintRequest(
        ldm_steps=2,
        hd_strategy=HDStrategy.ORIGINAL,
        hd_strategy_crop_margin=128,
        hd_strategy_crop_trigger_size=128,
        hd_strategy_resize_limit=128,
        prompt="a fox is sitting on a bench",
        sd_steps=5,
        sd_sampler=SDSampler.ddim,
    )
    model(image, mask, config)


def benchmark(model, times: int, empty_cache: bool):
    sizes = [(512, 512)]

    # NVML handle for reading GPU memory usage on device 0.
    nvidia_smi.nvmlInit()
    device_id = 0
    handle = nvidia_smi.nvmlDeviceGetHandleByIndex(device_id)

    def fmt(metrics):
        # mean ± standard deviation across runs
        return f"{np.mean(metrics):.2f} ± {np.std(metrics):.2f}"

    process = psutil.Process(os.getpid())

    for size in sizes:
        # Start each size from a clean CUDA allocator state.
        torch.cuda.empty_cache()
        time_metrics = []
        memory_metrics = []
        gpu_memory_metrics = []
        for _ in range(times):
            if empty_cache:
                # Honor --empty-cache: release cached CUDA blocks before each run.
                torch.cuda.empty_cache()
            start = time.time()
            run_model(model, size)
            if torch.cuda.is_available():
                # Block until queued kernels finish so wall-clock time is accurate.
                torch.cuda.synchronize()

            time_metrics.append((time.time() - start) * 1000)
            memory_metrics.append(process.memory_info().rss / 1024 / 1024)
            gpu_memory_metrics.append(
                nvidia_smi.nvmlDeviceGetMemoryInfo(handle).used / 1024 / 1024
            )

        print(f"size: {size}".center(80, "-"))
        print(f"latency: {fmt(time_metrics)} ms")
        print(f"memory: {fmt(memory_metrics)} MB")
        print(f"gpu memory: {fmt(gpu_memory_metrics)} MB")

    nvidia_smi.nvmlShutdown()


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--name")
    parser.add_argument("--device", default="cuda", type=str)
    parser.add_argument("--times", default=10, type=int)
    parser.add_argument("--empty-cache", action="store_true")
    return parser.parse_args()


if __name__ == "__main__":
    args = parse_args()
    device = torch.device(args.device)
    model = ModelManager(
        name=args.name,
        device=device,
        disable_nsfw=True,
        sd_cpu_textencoder=True,
    )
    benchmark(model, args.times, args.empty_cache)
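
# Example invocation (the script and model names are illustrative; pass any
# model that ModelManager can load):
#
#   python benchmark.py --name lama --device cuda --times 20 --empty-cache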