#!/usr/bin/env python3
"""Benchmark a ModelManager model: run inpainting on random 512x512 inputs and
report latency, process RAM, and GPU memory usage (mean ± std over N runs)."""
import argparse
import os
import time

import numpy as np
import nvidia_smi
import psutil
import torch

from model_manager import ModelManager
from schema import Config, HDStrategy, SDSampler

try:
    # Best effort: disable the TorchScript JIT fusers. Some torch builds do not
    # expose these internals, in which case this step is simply skipped.
    torch._C._jit_override_can_fuse_on_cpu(False)
    torch._C._jit_override_can_fuse_on_gpu(False)
    torch._C._jit_set_texpr_fuser_enabled(False)
    torch._C._jit_set_nvfuser_enabled(False)
except:
    pass

NUM_THREADS = str(4)

# Cap the thread pools of the common math backends so CPU usage stays predictable.
os.environ["OMP_NUM_THREADS"] = NUM_THREADS
os.environ["OPENBLAS_NUM_THREADS"] = NUM_THREADS
os.environ["MKL_NUM_THREADS"] = NUM_THREADS
os.environ["VECLIB_MAXIMUM_THREADS"] = NUM_THREADS
os.environ["NUMEXPR_NUM_THREADS"] = NUM_THREADS
if os.environ.get("CACHE_DIR"):
    os.environ["TORCH_HOME"] = os.environ["CACHE_DIR"]


def run_model(model, size):
    # Random RGB image and mask of the requested size.
    image = np.random.randint(0, 256, (size[0], size[1], 3)).astype(np.uint8)
    mask = np.random.randint(0, 255, size).astype(np.uint8)

    config = Config(
        ldm_steps=2,
        hd_strategy=HDStrategy.ORIGINAL,
        hd_strategy_crop_margin=128,
        hd_strategy_crop_trigger_size=128,
        hd_strategy_resize_limit=128,
        prompt="a fox is sitting on a bench",
        sd_steps=5,
        sd_sampler=SDSampler.ddim,
    )
    model(image, mask, config)


def benchmark(model, times: int, empty_cache: bool):
    sizes = [(512, 512)]

    nvidia_smi.nvmlInit()
    device_id = 0
    handle = nvidia_smi.nvmlDeviceGetHandleByIndex(device_id)

    def format(metrics):
        return f"{np.mean(metrics):.2f} ± {np.std(metrics):.2f}"

    process = psutil.Process(os.getpid())
    # Report GPU memory and RAM usage metrics for each size.
    for size in sizes:
        torch.cuda.empty_cache()
        time_metrics = []
        cpu_metrics = []
        memory_metrics = []
        gpu_memory_metrics = []
        for _ in range(times):
            start = time.time()
            run_model(model, size)
            torch.cuda.synchronize()

            # cpu_metrics.append(process.cpu_percent())
            time_metrics.append((time.time() - start) * 1000)
            memory_metrics.append(process.memory_info().rss / 1024 / 1024)
            gpu_memory_metrics.append(
                nvidia_smi.nvmlDeviceGetMemoryInfo(handle).used / 1024 / 1024
            )

        print(f"size: {size}".center(80, "-"))
        # print(f"cpu: {format(cpu_metrics)}")
        print(f"latency: {format(time_metrics)}ms")
        print(f"memory: {format(memory_metrics)} MB")
        print(f"gpu memory: {format(gpu_memory_metrics)} MB")

    nvidia_smi.nvmlShutdown()


def get_args_parser():
    parser = argparse.ArgumentParser()
    parser.add_argument("--name")
    parser.add_argument("--device", default="cuda", type=str)
    parser.add_argument("--times", default=10, type=int)
    parser.add_argument("--empty-cache", action="store_true")
    return parser.parse_args()


if __name__ == "__main__":
    args = get_args_parser()
    device = torch.device(args.device)
    model = ModelManager(
        name=args.name,
        device=device,
        sd_run_local=True,
        disable_nsfw=True,
        sd_cpu_textencoder=True,
        hf_access_token="123",
    )
    benchmark(model, args.times, args.empty_cache)
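
# Example invocation (a minimal sketch, not part of the original source: it assumes
# this file is saved as benchmark.py inside a checkout where model_manager and schema
# are importable, a CUDA GPU is visible to NVML, and that "lama" is a model name
# ModelManager accepts):
#
#   python3 benchmark.py --name lama --device cuda --times 10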