"""
Export ONNX model and TensorRT engine for deployment.
"""

import argparse
import ast
import os
import random
import re
import subprocess
import sys
import time
from collections import defaultdict
from pathlib import Path

import numpy as np
import onnx
import onnxsim
import torch
import torch.nn as nn
from PIL import Image

import rfdetr.datasets.transforms as T
import rfdetr.util.misc as utils
from rfdetr.deploy._onnx import OnnxOptimizer
from rfdetr.models import build_model


def run_command_shell(command: str, dry_run: bool = False):
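    """Run a shell command, or just echo it without executing when dry_run is set."""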
    if dry_run:
        print("")
        print(f"CUDA_VISIBLE_DEVICES={os.environ.get('CUDA_VISIBLE_DEVICES', '')} {command}")
        print("")
        # A dry run only prints the command; there is no process output to return.
        return None
    try:
        # check=True is required for CalledProcessError to actually be raised on failure.
        result = subprocess.run(command, shell=True, capture_output=True, text=True, check=True)
        return result
    except subprocess.CalledProcessError as e:
        print(f"Command failed with exit code {e.returncode}")
        # With text=True, stderr is already a str, so no .decode() is needed.
        print(f"Error output:\n{e.stderr}")
        raise


def make_infer_image(infer_dir, shape, batch_size, device="cuda"):
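    """Build a batched, preprocessed input tensor from an image file, or from random noise if no path is given."""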
    if infer_dir is None:
        # No sample image provided: fall back to random noise of the requested shape.
        dummy = np.random.randint(0, 256, (shape[0], shape[1], 3), dtype=np.uint8)
        image = Image.fromarray(dummy, mode="RGB")
    else:
        image = Image.open(infer_dir).convert("RGB")

    # Standard ImageNet normalization statistics.
    transforms = T.Compose([
        T.SquareResize([shape[0]]),
        T.ToTensor(),
        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])

    inps, _ = transforms(image, None)
    inps = inps.to(device)

    # Replicate the single image along the batch dimension.
    inps = torch.stack([inps for _ in range(batch_size)])
    return inps


def export_onnx(output_dir, model, input_names, input_tensors, output_names, dynamic_axes, backbone_only=False, verbose=True, opset_version=17):
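    """Export the model to ONNX and return the path of the written file."""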
    export_name = "backbone_model" if backbone_only else "inference_model"
    output_file = os.path.join(output_dir, f"{export_name}.onnx")

    # Switch the model into its export-friendly mode when it provides one.
    if hasattr(model, "export"):
        model.export()

    torch.onnx.export(
        model,
        input_tensors,
        output_file,
        input_names=input_names,
        output_names=output_names,
        export_params=True,
        keep_initializers_as_inputs=False,
        do_constant_folding=True,
        verbose=verbose,
        opset_version=opset_version,
        dynamic_axes=dynamic_axes)

    print(f'\nSuccessfully exported ONNX model: {output_file}')
    return output_file


def onnx_simplify(onnx_dir: str, input_names, input_tensors, force=False):
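    """Optimize and simplify an exported ONNX model, returning the path of the .sim.onnx file."""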
    sim_onnx_dir = onnx_dir.replace(".onnx", ".sim.onnx")
    # Reuse an existing simplified model unless a rebuild is forced.
    if os.path.isfile(sim_onnx_dir) and not force:
        return sim_onnx_dir

    if isinstance(input_tensors, torch.Tensor):
        input_tensors = [input_tensors]

    print(f'Start simplifying ONNX model: {onnx_dir}')
    opt = OnnxOptimizer(onnx_dir)
    opt.info('Model: original')
    opt.common_opt()
    opt.info('Model: optimized')
    opt.save_onnx(sim_onnx_dir)
    # onnxsim re-reads the original model and, if its checks pass, overwrites the
    # file written by OnnxOptimizer above.
    input_dict = {name: tensor.detach().cpu().numpy() for name, tensor in zip(input_names, input_tensors)}
    model_opt, check_ok = onnxsim.simplify(
        onnx_dir,
        check_n=3,
        input_data=input_dict,
        dynamic_input_shape=False)
    if check_ok:
        onnx.save(model_opt, sim_onnx_dir)
    else:
        raise RuntimeError("Failed to simplify ONNX model.")
    print(f'Successfully simplified ONNX model: {sim_onnx_dir}')
    return sim_onnx_dir


def trtexec(onnx_dir: str, args) -> str:
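    """Build a TensorRT engine from an ONNX model via trtexec and return the engine path."""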
    engine_dir = onnx_dir.replace(".onnx", ".engine")

    trt_command = " ".join([
        "trtexec",
        f"--onnx={onnx_dir}",
        f"--saveEngine={engine_dir}",
        "--memPoolSize=workspace:4096 --fp16",
        "--useCudaGraph --useSpinWait --warmUp=500 --avgRuns=1000 --duration=10",
        "--verbose" if args.verbose else ""])

    if args.profile:
        profile_dir = onnx_dir.replace(".onnx", ".nsys-rep")

        # Wrap the trtexec invocation in an Nsight Systems profiling session.
        command = " ".join([
            "nsys profile",
            f"--output={profile_dir}",
            "--trace=cuda,nvtx",
            "--force-overwrite true",
            trt_command
        ])
        print(f'Profile data will be saved to: {profile_dir}')
    else:
        command = trt_command

    output = run_command_shell(command, args.dry_run)
    # run_command_shell returns None on a dry run; only parse real output.
    if output is not None:
        stats = parse_trtexec_output(output.stdout)
    return engine_dir


def parse_trtexec_output(output_text):
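    """Extract timing and throughput statistics from trtexec's console output."""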
    print(output_text)

    gpu_compute_pattern = r"GPU Compute Time: min = (\d+\.\d+) ms, max = (\d+\.\d+) ms, mean = (\d+\.\d+) ms, median = (\d+\.\d+) ms"
    h2d_pattern = r"Host to Device Transfer Time: min = (\d+\.\d+) ms, max = (\d+\.\d+) ms, mean = (\d+\.\d+) ms"
    d2h_pattern = r"Device to Host Transfer Time: min = (\d+\.\d+) ms, max = (\d+\.\d+) ms, mean = (\d+\.\d+) ms"
    latency_pattern = r"Latency: min = (\d+\.\d+) ms, max = (\d+\.\d+) ms, mean = (\d+\.\d+) ms"
    throughput_pattern = r"Throughput: (\d+\.\d+) qps"
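
    # Illustrative trtexec summary lines these patterns are meant to match
    # (exact wording can vary across TensorRT versions):
    #   GPU Compute Time: min = 1.2345 ms, max = 2.3456 ms, mean = 1.5678 ms, median = 1.5432 ms
    #   Throughput: 123.456 qps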

    stats = {}

    if match := re.search(gpu_compute_pattern, output_text):
        stats.update({
            'compute_min_ms': float(match.group(1)),
            'compute_max_ms': float(match.group(2)),
            'compute_mean_ms': float(match.group(3)),
            'compute_median_ms': float(match.group(4))
        })

    if match := re.search(h2d_pattern, output_text):
        stats.update({
            'h2d_min_ms': float(match.group(1)),
            'h2d_max_ms': float(match.group(2)),
            'h2d_mean_ms': float(match.group(3))
        })

    if match := re.search(d2h_pattern, output_text):
        stats.update({
            'd2h_min_ms': float(match.group(1)),
            'd2h_max_ms': float(match.group(2)),
            'd2h_mean_ms': float(match.group(3))
        })

    if match := re.search(latency_pattern, output_text):
        stats.update({
            'latency_min_ms': float(match.group(1)),
            'latency_max_ms': float(match.group(2)),
            'latency_mean_ms': float(match.group(3))
        })

    if match := re.search(throughput_pattern, output_text):
        stats['throughput_qps'] = float(match.group(1))

    return stats


def no_batch_norm(model):
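    """Raise if the model still contains BatchNorm2d layers."""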
    for module in model.modules():
        if isinstance(module, nn.BatchNorm2d):
            raise ValueError("BatchNorm2d found in the model. Please remove it.")


def main(args):
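    """Export the model to ONNX and, optionally, simplify it and build a TensorRT engine."""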
    print("git:\n {}\n".format(utils.get_sha()))
    print(args)

    if args.device == 'cuda':
        device_id = "0"
    elif args.device == 'cpu':
        device_id = ""
    else:
        device_id = str(int(args.device))
    if device_id:
        # Only rewrite args.device for GPU targets; 'cpu' stays as-is.
        args.device = f"cuda:{device_id}"

    # Keep the model on CPU for the export itself; CUDA_VISIBLE_DEVICES selects
    # the GPU used by the test forward pass and by trtexec.
    device = torch.device("cpu")
    os.environ["CUDA_VISIBLE_DEVICES"] = device_id

    # Fix seeds for reproducibility.
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    model, criterion, postprocessors = build_model(args)
    n_parameters = sum(p.numel() for p in model.parameters())
    print(f"number of parameters: {n_parameters}")
    n_backbone_parameters = sum(p.numel() for p in model.backbone.parameters())
    print(f"number of backbone parameters: {n_backbone_parameters}")
    n_projector_parameters = sum(p.numel() for p in model.backbone[0].projector.parameters())
    print(f"number of projector parameters: {n_projector_parameters}")
    n_backbone_encoder_parameters = sum(p.numel() for p in model.backbone[0].encoder.parameters())
    print(f"number of backbone encoder parameters: {n_backbone_encoder_parameters}")
    n_transformer_parameters = sum(p.numel() for p in model.transformer.parameters())
    print(f"number of transformer parameters: {n_transformer_parameters}")

    if args.resume:
        checkpoint = torch.load(args.resume, map_location='cpu')
        model.load_state_dict(checkpoint['model'], strict=True)
        print(f"loaded checkpoint {args.resume}")

    if args.layer_norm:
        no_batch_norm(model)

    model.to(device)

    # NOTE: the original call passed `args` itself as the first argument, which does
    # not match make_infer_image's signature; the field names used here are assumed
    # to be provided by the argument parser.
    input_tensors = make_infer_image(args.infer_dir, args.shape, args.batch_size, device)
    input_names = ['input']
    output_names = ['features'] if args.backbone_only else ['dets', 'labels']
    dynamic_axes = None

    # Sanity-check the model with a forward pass on GPU before exporting.
    model.eval().to("cuda")
    input_tensors = input_tensors.to("cuda")
    with torch.no_grad():
        if args.backbone_only:
            features = model(input_tensors)
            print(f"PyTorch inference output shape: {features.shape}")
        else:
            outputs = model(input_tensors)
            dets = outputs['pred_boxes']
            labels = outputs['pred_logits']
            print(f"PyTorch inference output shapes - Boxes: {dets.shape}, Labels: {labels.shape}")
    model.cpu()
    input_tensors = input_tensors.cpu()

    # Reordered to match export_onnx's signature (output_dir first); args.output_dir
    # is assumed to be provided by the argument parser.
    output_file = export_onnx(
        args.output_dir, model, input_names, input_tensors, output_names, dynamic_axes,
        backbone_only=args.backbone_only, verbose=args.verbose)

    if args.simplify:
        # The original call passed `args` as the `force` flag; use an explicit boolean.
        output_file = onnx_simplify(output_file, input_names, input_tensors,
                                    force=getattr(args, "force", False))

    if args.tensorrt:
        output_file = trtexec(output_file, args)
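

# Hypothetical entry point, for orientation only; the argparse setup that builds
# `args` (device, resume, shape, simplify, tensorrt, ...) lives outside this section:
#   if __name__ == "__main__":
#       main(get_args_parser().parse_args())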