import argparse
import os
from pathlib import Path
import tempfile
import sys

import cv2
import gradio as gr
import numpy as np
import torch
from PIL import Image

# print file path
print(os.path.abspath(__file__))

# Environment variables read by PyOpenGL / Mesa; they are set before the
# renderer-related imports below.
os.environ["PYOPENGL_PLATFORM"] = "egl"
os.environ["MESA_GL_VERSION_OVERRIDE"] = "4.1"

# Install the pyrender copy shipped with the app and make it importable.
os.system('pip install /home/user/app/pyrender')
sys.path.append('/home/user/app/pyrender')

from hamer.configs import get_config
from hamer.datasets.vitdet_dataset import (DEFAULT_MEAN, DEFAULT_STD,
                                           ViTDetDataset)
from hamer.models import HAMER
from hamer.utils import recursive_to
from hamer.utils.renderer import Renderer, cam_crop_to_full

# Best-effort install of detectron2 when it is not importable. Only
# ImportError triggers the install; anything else (e.g. KeyboardInterrupt)
# propagates instead of being swallowed.
try:
    import detectron2
except ImportError:
    os.system('pip install --upgrade pip')
    # NOTE(review): the install target after "git+" looks truncated -- confirm
    # the intended repository URL.
    os.system('pip install git+')

#try:
#    from vitpose_model import ViTPoseModel
#except:
#    os.system('pip install -v -e /home/user/app/vendor/ViTPose')
#    from vitpose_model import ViTPoseModel
from vitpose_model import ViTPoseModel

# Folder that receives the exported .obj meshes.
OUT_FOLDER = 'demo_out'
os.makedirs(OUT_FOLDER, exist_ok=True)

# Setup HaMeR model
LIGHT_BLUE=(0.65098039, 0.74117647, 0.85882353)
DEFAULT_CHECKPOINT='_DATA/hamer_ckpts/checkpoints/hamer.ckpt'
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# The model config is looked up two directories above the checkpoint file.
model_cfg = str(Path(DEFAULT_CHECKPOINT).parent.parent / 'model_config.yaml')
model_cfg = get_config(model_cfg)

# Override some config values, to crop bbox correctly
if (model_cfg.MODEL.BACKBONE.TYPE == 'vit') and ('BBOX_SHAPE' not in model_cfg.MODEL):
    model_cfg.defrost()
    assert model_cfg.MODEL.IMAGE_SIZE == 256, f"MODEL.IMAGE_SIZE ({model_cfg.MODEL.IMAGE_SIZE}) should be 256 for ViT backbone"
    model_cfg.MODEL.BBOX_SHAPE = [192,256]
    model_cfg.freeze()

model = HAMER.load_from_checkpoint(DEFAULT_CHECKPOINT, strict=False, cfg=model_cfg).to(device)
model.eval()

# Load detector
from detectron2.config import LazyConfig
from hamer.utils.utils_detectron2 import DefaultPredictor_Lazy

# NOTE(review): this path ends at the COCO/ directory and looks truncated --
# confirm the intended config file name.
detectron2_cfg = LazyConfig.load(f"vendor/detectron2/projects/ViTDet/configs/COCO/")
detectron2_cfg.train.init_checkpoint = "" for i in range(3): detectron2_cfg.model.roi_heads.box_predictors[i].test_score_thresh = 0.25 detector = DefaultPredictor_Lazy(detectron2_cfg) # Setup the renderer renderer = Renderer(model_cfg, faces=model.mano.faces) # keypoint detector cpm = ViTPoseModel(device) import numpy as np def infer(in_pil_img, in_threshold=0.8, out_pil_img=None): open_cv_image = np.array(in_pil_img) # Convert RGB to BGR open_cv_image = open_cv_image[:, :, ::-1].copy() print("EEEEE", open_cv_image.shape) det_out = detector(open_cv_image) det_instances = det_out['instances'] valid_idx = (det_instances.pred_classes==0) & (det_instances.scores > in_threshold) pred_bboxes=det_instances.pred_boxes.tensor[valid_idx].cpu().numpy() pred_scores=det_instances.scores[valid_idx].cpu().numpy() # Detect human keypoints for each person vitposes_out = cpm.predict_pose( open_cv_image, [np.concatenate([pred_bboxes, pred_scores[:, None]], axis=1)], ) bboxes = [] is_right = [] # Use hands based on hand keypoint detections for vitposes in vitposes_out: left_hand_keyp = vitposes['keypoints'][-42:-21] right_hand_keyp = vitposes['keypoints'][-21:] # Rejecting not confident detections (this could be improved) keyp = left_hand_keyp valid = keyp[:,2] > 0.5 if sum(valid) > 3: bbox = [keyp[valid,0].min(), keyp[valid,1].min(), keyp[valid,0].max(), keyp[valid,1].max()] bboxes.append(bbox) is_right.append(0) keyp = right_hand_keyp valid = keyp[:,2] > 0.5 if sum(valid) > 3: bbox = [keyp[valid,0].min(), keyp[valid,1].min(), keyp[valid,0].max(), keyp[valid,1].max()] bboxes.append(bbox) is_right.append(1) if len(bboxes) == 0: return None, [] boxes = np.stack(bboxes) right = np.stack(is_right) # Run HaMeR on all detected humans dataset = ViTDetDataset(model_cfg, open_cv_image, boxes, right) dataloader =, batch_size=8, shuffle=False, num_workers=0) all_verts = [] all_cam_t = [] all_right = [] all_mesh_paths = [] temp_name = next(tempfile._get_candidate_names()) for batch in 
dataloader: batch = recursive_to(batch, device) with torch.no_grad(): out = model(batch) multiplier = (2*batch['right']-1) pred_cam = out['pred_cam'] pred_cam[:,1] = multiplier*pred_cam[:,1] box_center = batch["box_center"].float() box_size = batch["box_size"].float() img_size = batch["img_size"].float() multiplier = (2*batch['right']-1) render_size = img_size scaled_focal_length = model_cfg.EXTRA.FOCAL_LENGTH / model_cfg.MODEL.IMAGE_SIZE * img_size.max() pred_cam_t = cam_crop_to_full(pred_cam, box_center, box_size, render_size, scaled_focal_length).detach().cpu().numpy() # Render the result batch_size = batch['img'].shape[0] for n in range(batch_size): # Get filename from path img_path # img_fn, _ = os.path.splitext(os.path.basename(img_path)) person_id = int(batch['personid'][n]) white_img = (torch.ones_like(batch['img'][n]).cpu() - DEFAULT_MEAN[:,None,None]/255) / (DEFAULT_STD[:,None,None]/255) input_patch = batch['img'][n].cpu() * (DEFAULT_STD[:,None,None]/255) + (DEFAULT_MEAN[:,None,None]/255) input_patch = input_patch.permute(1,2,0).numpy() verts = out['pred_vertices'][n].detach().cpu().numpy() is_right = batch['right'][n].cpu().numpy() verts[:,0] = (2*is_right-1)*verts[:,0] cam_t = pred_cam_t[n] all_verts.append(verts) all_cam_t.append(cam_t) all_right.append(is_right) # Save all meshes to disk # if args.save_mesh: if True: camera_translation = cam_t.copy() tmesh = renderer.vertices_to_trimesh(verts, camera_translation, LIGHT_BLUE, is_right=is_right) temp_path = os.path.join(f'{OUT_FOLDER}/{temp_name}_{person_id}.obj') tmesh.export(temp_path) all_mesh_paths.append(temp_path) # Render front view if len(all_verts) > 0: misc_args = dict( mesh_base_color=LIGHT_BLUE, scene_bg_color=(1, 1, 1), focal_length=scaled_focal_length, ) cam_view = renderer.render_rgba_multiple(all_verts, cam_t=all_cam_t, render_res=render_size[n], is_right=all_right, **misc_args) # Overlay image input_img = open_cv_image.astype(np.float32)[:,:,::-1]/255.0 input_img = 
np.concatenate([input_img, np.ones_like(input_img[:,:,:1])], axis=2) # Add alpha channel input_img_overlay = input_img[:,:,:3] * (1-cam_view[:,:,3:]) + cam_view[:,:,:3] * cam_view[:,:,3:] # convert to PIL image out_pil_img = Image.fromarray((input_img_overlay*255).astype(np.uint8)) return out_pil_img, all_mesh_paths else: return None, [] with gr.Blocks(title="HaMeR", css=".gradio-container") as demo: gr.HTML("""