import os
import sys
import os.path as osp
import numpy as np
import torchvision.transforms as transforms
import torch.backends.cudnn as cudnn
import torch

CUR_DIR = osp.dirname(os.path.abspath(__file__))
sys.path.insert(0, osp.join(CUR_DIR, '..', 'main'))
sys.path.insert(0, osp.join(CUR_DIR, '..', 'common'))

from config import cfg
import cv2
from mmdet.apis import init_detector, inference_detector
from utils.inference_utils import process_mmdet_results, non_max_suppression
from postometro_utils.smpl import SMPL
import data.config as smpl_cfg
from postometro import get_model


class Inferer:

    def __init__(self, pretrained_model, num_gpus, output_folder):
        self.output_folder = output_folder
        self.device = torch.device('cuda') if num_gpus > 0 else torch.device('cpu')
        print("Infer using device: ", self.device)

        # load model config
        config_path = osp.join(CUR_DIR, './config', f'config_{pretrained_model}.py')
        # ckpt_path = osp.join(CUR_DIR, '../pretrained_models', f'{pretrained_model}.pth.tar')
        ckpt_path = None  # for config
        cfg.get_config_fromfile(config_path)
        # update config
        cfg.update_config(num_gpus, ckpt_path, output_folder, self.device)
        self.cfg = cfg
        cudnn.benchmark = True

        # load SMPL (mesh faces are needed later for rendering)
        self.smpl = SMPL().to(self.device)
        self.faces = self.smpl.faces.cpu().numpy()

        # load the mesh-recovery model
        hmr_model_checkpoint_file = osp.join(CUR_DIR, '../pretrained_models/postometro/resnet_state_dict.bin')
        self.hmr_model = get_model(backbone_str='resnet50', device=self.device,
                                   checkpoint_file=hmr_model_checkpoint_file)

        # load Faster R-CNN as the human detector
        checkpoint_file = osp.join(CUR_DIR, '../pretrained_models/mmdet/faster_rcnn_r50_fpn_1x_coco_20200130-047c8118.pth')
        config_file = osp.join(CUR_DIR, '../pretrained_models/mmdet/mmdet_faster_rcnn_r50_fpn_coco.py')
        self.model = init_detector(config_file, checkpoint_file, device=self.device)  # or device='cuda:0'

    def infer(self, original_img, iou_thr, multi_person=False, mesh_as_vertices=False):
        from utils.preprocessing import process_bbox, generate_patch_image
        from utils.vis import render_mesh
        # from utils.human_models import smpl_x

        # prepare input image (ImageNet mean/std normalization)
        transform = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        vis_img = original_img.copy()
        original_img_height, original_img_width = original_img.shape[:2]
        # load renderer
        # self.renderer = PyRender_Renderer(resolution=(original_img_width, original_img_height), faces=self.faces)

        ## mmdet inference
        mmdet_results = inference_detector(self.model, original_img)
        mmdet_box = process_mmdet_results(mmdet_results, cat_id=0, multi_person=True)

        # early return: keep the original image if no bbox was detected
        if len(mmdet_box[0]) < 1:
            return original_img, 0, []

        if not multi_person:
            # only select the largest bbox
            num_bbox = 1
            mmdet_box = mmdet_box[0]
        else:
            # keep bboxes by NMS with iou_thr
            mmdet_box = non_max_suppression(mmdet_box[0], iou_thr)
            num_bbox = len(mmdet_box)

        ## loop over all detected bboxes
        ok_bboxes = []
        for bbox_id in range(num_bbox):
            # xyxy -> xywh
            mmdet_box_xywh = np.zeros(4)
            mmdet_box_xywh[0] = mmdet_box[bbox_id][0]
            mmdet_box_xywh[1] = mmdet_box[bbox_id][1]
            mmdet_box_xywh[2] = abs(mmdet_box[bbox_id][2] - mmdet_box[bbox_id][0])
            mmdet_box_xywh[3] = abs(mmdet_box[bbox_id][3] - mmdet_box[bbox_id][1])

            # skip small bboxes (thresholds in pixels)
            if mmdet_box_xywh[2] < 50 or mmdet_box_xywh[3] < 150:
                continue

            # align the bbox with the pre-processing used at training time;
            # process_bbox may return None for degenerate boxes, so skip those
            bbox = process_bbox(mmdet_box_xywh, original_img_width, original_img_height)
            if bbox is None:
                continue
            ok_bboxes.append(bbox)

            # [DEBUG] draw the detection to test the mmdet pipeline
            top_left = (int(bbox[0]), int(bbox[1]))
            bottom_right = (int(bbox[0] + bbox[2]), int(bbox[1] + bbox[3]))
            cv2.rectangle(vis_img, top_left, bottom_right, (0, 0, 255), 2)

            # human model inference: crop the detected patch
            img, img2bb_trans, bb2img_trans = generate_patch_image(original_img, bbox, 1.0, 0.0, False,
                                                                   self.cfg.input_img_shape)
            vis_patched_images = img.copy()  # keep the un-normalized patch for visualization
            # pre-process the patch
            img = img.transpose((2, 0, 1))  # h,w,c -> c,h,w
            img = torch.from_numpy(img).float() / 255.0
            img = transform(img)
            img = img.to(self.cfg.device)[None, :, :, :]

            # mesh recovery
            with torch.no_grad():
                out = self.hmr_model(img)
            pred_cam, pred_3d_vertices_fine = out['pred_cam'], out['pred_3d_vertices_fine']
            pred_3d_joints_from_smpl = self.smpl.get_h36m_joints(pred_3d_vertices_fine)  # batch_size X 17 X 3
            pred_3d_joints_from_smpl_pelvis = pred_3d_joints_from_smpl[:, smpl_cfg.H36M_J17_NAME.index('Pelvis'), :]
            pred_3d_joints_from_smpl = pred_3d_joints_from_smpl[:, smpl_cfg.H36M_J17_TO_J14, :]  # batch_size X 14 X 3
            # pelvis-align the predicted vertices
            pred_3d_vertices_fine = pred_3d_vertices_fine - pred_3d_joints_from_smpl_pelvis[:, None, :]  # batch_size X 6890 X 3
            pred_3d_vertices_fine = pred_3d_vertices_fine.detach().cpu().numpy()[0]  # 6890 X 3
            pred_cam = pred_cam.detach().cpu().numpy()[0]

            # convert the bbox-space weak-perspective camera to full-image coordinates:
            # rescale the camera scale by the bbox/image size ratio and shift the
            # translation by the bbox-center offset
            bbox_cx, bbox_cy = bbox[0] + bbox[2] / 2, bbox[1] + bbox[3] / 2
            img_cx, img_cy = original_img_width / 2, original_img_height / 2
            cx_delta, cy_delta = bbox_cx / img_cx - 1, bbox_cy / img_cy - 1

            # render single person mesh
            vis_img = render_mesh(vis_img, pred_3d_vertices_fine, self.faces,
                                  [pred_cam[0] / (original_img_width / bbox[2]),
                                   pred_cam[0] / (original_img_height / bbox[3]),
                                   pred_cam[1] + cx_delta / (pred_cam[0] / (original_img_width / bbox[2])),
                                   pred_cam[2] + cy_delta / (pred_cam[0] / (original_img_height / bbox[3]))],
                                  mesh_as_vertices=mesh_as_vertices)
            vis_img = vis_img.astype('uint8')

        return vis_img, len(ok_bboxes), ok_bboxes


if __name__ == '__main__':
    from PIL import Image

    inferer = Inferer('postometro', 1, './out_folder')  # num_gpus=1 -> run on GPU
    image_path = '../assets/07.jpg'
    image = Image.open(image_path)
    # convert the PIL image to a NumPy array
    image_np = np.array(image)
    vis_img, _, _ = inferer.infer(image_np, 0.2, multi_person=True, mesh_as_vertices=True)
    save_path = './saved_vis_07.jpg'
    # ensure the image is in the correct format (PIL expects uint8)
    if vis_img.dtype != np.uint8:
        vis_img = vis_img.astype('uint8')
    # convert the NumPy array back to a PIL image and save
    image = Image.fromarray(vis_img)
    image.save(save_path)