| | import sys |
| | import os |
| | from pathlib import Path |
| |
|
| | import numpy as np |
| | from tqdm import tqdm |
| | import torch |
| | from ultralytics import YOLO |
| |
|
| | |
# Put <this file's dir>/../../thirdparty/HaWoR at the front of sys.path unless
# it is already listed there.
# NOTE(review): presumably needed so HaWoR's own absolute imports resolve
# when it is used as a vendored checkout — confirm.
current_file_dir = os.path.dirname(os.path.abspath(__file__))
hawor_path = os.path.normpath(
    os.path.join(current_file_dir, os.pardir, os.pardir, 'thirdparty', 'HaWoR')
)
if hawor_path not in sys.path:
    sys.path.insert(0, hawor_path)
| |
|
| | from thirdparty.HaWoR.lib.models.hawor import HAWOR |
| | from thirdparty.HaWoR.lib.pipeline.tools import parse_chunks |
| | from thirdparty.HaWoR.lib.eval_utils.custom_utils import interpolate_bboxes |
| | from thirdparty.HaWoR.hawor.utils.rotation import angle_axis_to_rotation_matrix, rotation_matrix_to_angle_axis |
| | from thirdparty.HaWoR.hawor.configs import get_config |
| |
|
| |
|
def load_hawor(checkpoint_path: str):
    """
    Loads the HAWOR model and its configuration from a checkpoint.

    A value that contains '/' and does not exist on disk is treated as a
    HuggingFace repo ID: 'checkpoints/hawor.ckpt' and 'config.yaml' are
    downloaded from that repo. Any other value is used as a local checkpoint
    file, and its config is read from 'config.yaml' in the parent of the
    directory that contains the checkpoint.

    Note that a mistyped local path containing '/' therefore ends up in the
    download branch and fails there.

    Args:
        checkpoint_path (str): Path to the model checkpoint file or HuggingFace repo ID (e.g., 'username/model-name').

    Returns:
        tuple: (HAWOR model instance, model configuration object). The
        checkpoint is loaded with map_location='cpu' and strict=False.

    Raises:
        AssertionError: If a 'vit' backbone config lacks BBOX_SHAPE and its
            MODEL.IMAGE_SIZE is not 256.
    """
    if '/' in checkpoint_path and not os.path.exists(checkpoint_path):
        # Imported only on the download path so that loading a local
        # checkpoint does not require huggingface_hub to be installed.
        from huggingface_hub import hf_hub_download

        print(f"Downloading model from HuggingFace: {checkpoint_path}")
        checkpoint_file = hf_hub_download(repo_id=checkpoint_path, filename="checkpoints/hawor.ckpt")
        config_file = hf_hub_download(repo_id=checkpoint_path, filename="config.yaml")
        print(f"Downloaded checkpoint to: {checkpoint_file}")
        print(f"Downloaded config to: {config_file}")
        print(f"Checkpoint exists: {os.path.exists(checkpoint_file)}")
        model_cfg_path = Path(config_file)
    else:
        checkpoint_file = checkpoint_path
        model_cfg_path = Path(checkpoint_path).parent.parent / 'config.yaml'
        print(f"Using local checkpoint: {checkpoint_file}")
        print(f"Using local config: {model_cfg_path}")

    print(f"Loading config from: {model_cfg_path}")
    model_cfg = get_config(str(model_cfg_path), update_cachedir=True)

    # ViT configs that do not define BBOX_SHAPE get [192, 256] filled in;
    # this is only done for the 256-pixel image size.
    if (model_cfg.MODEL.BACKBONE.TYPE == 'vit') and ('BBOX_SHAPE' not in model_cfg.MODEL):
        model_cfg.defrost()
        assert model_cfg.MODEL.IMAGE_SIZE == 256, \
            f"MODEL.IMAGE_SIZE ({model_cfg.MODEL.IMAGE_SIZE}) should be 256 for ViT backbone"
        model_cfg.MODEL.BBOX_SHAPE = [192, 256]
        model_cfg.freeze()

    print(f"Loading HAWOR model from checkpoint: {checkpoint_file}")

    # Weights are mapped to CPU here; moving to another device is left to the caller.
    model = HAWOR.load_from_checkpoint(
        checkpoint_file,
        strict=False,
        cfg=model_cfg,
        map_location='cpu'
    )

    return model, model_cfg
| |
|
class HaworPipeline:
    """
    Hand detection + tracking followed by HAWOR motion estimation.

    The HAWOR model is loaded once at construction time; the YOLO hand
    detector is instantiated anew inside every `recon` call.
    """

    def __init__(
        self,
        model_path: str = '',
        detector_path: str = '',
        device: torch.device = torch.device("cuda")
    ):
        """
        Loads the HAWOR model onto `device` and remembers the detector path.

        Args:
            model_path (str): HAWOR checkpoint location, forwarded to `load_hawor`.
            detector_path (str): Location of the YOLO hand detector weights.
            device (torch.device): Device the HAWOR model is moved to.
        """
        # Plain bookkeeping first; nothing is loaded until load_hawor below.
        self._checkpoint_path = model_path
        self._original_device = device
        self.detector_path = detector_path
        self.device = device

        hawor_model, hawor_cfg = load_hawor(model_path)
        hawor_model = hawor_model.to(device)
        hawor_model.eval()

        self.model = hawor_model
        self.model_cfg = hawor_cfg

    def recon(
        self,
        images: list,
        img_focal: float,
        thresh: float = 0.2,
        single_image: bool = False
    ) -> dict:
        """
        Detects and tracks hands in `images`, then reconstructs them with HAWOR.

        Args:
            images (list): Consecutive input frames (cv2/numpy format).
            img_focal (float): Camera focal length in pixels.
            thresh (float): Confidence threshold passed to the hand detector.
            single_image (bool): Single-image mode flag, forwarded to
                `hawor_motion_estimation`.

        Returns:
            dict: Reconstruction results keyed by 'left' and 'right'.
        """
        # A new detector object is built for each call from the stored path.
        detector = YOLO(self.detector_path)
        tracks = detect_track(images, detector, thresh=thresh)[1]

        reconstruction = hawor_motion_estimation(
            images, tracks, self.model, img_focal, single_image=single_image
        )

        # Drop the local reference to the detector before returning.
        del detector

        return reconstruction
| |
|
| | |
def detect_track(imgfiles: list, hand_det_model: YOLO, thresh: float = 0.5) -> tuple:
    """
    Runs the YOLO tracker on every frame and groups kept detections by track id.

    Per frame, at most one detection with class > 0 (treated as a right hand)
    and one with class == 0 (treated as a left hand) is stored: the first of
    each kind in the detector's output order. Detections whose track id is -1
    (used when the tracker reports no ids) are filed under the fixed ids
    10000 (class > 0) or 5000 (otherwise).

    Args:
        imgfiles (list): Image frames, passed one by one to the tracker.
        hand_det_model (YOLO): Initialized YOLO hand detection model.
        thresh (float): Detection confidence threshold.

    Returns:
        tuple: (boxes_, tracks). `boxes_` is never filled and is always an
        empty list. `tracks` maps a track id to a list of dicts with keys
        'frame' (frame index), 'det' (always True), 'det_box' (one row:
        xyxy box followed by its confidence) and 'det_handedness'
        (one-element class array).
    """
    untracked_right_id = int(10000)
    untracked_left_id = int(5000)

    boxes_ = []
    tracks = {}

    for frame_idx, frame_img in enumerate(tqdm(imgfiles)):
        with torch.no_grad(), torch.amp.autocast('cuda'):
            results = hand_det_model.track(frame_img, conf=thresh, persist=True, verbose=False)
            detections = results[0].boxes

            xyxy = detections.xyxy.cpu().numpy()
            confs = detections.conf.cpu().numpy()
            handedness = detections.cls.cpu().numpy()

            tracker_ids = detections.id
            if tracker_ids is not None:
                track_ids = tracker_ids.cpu().numpy()
            else:
                track_ids = [-1] * len(xyxy)

            # Append the confidence as an extra column after the box corners.
            boxes_conf = np.hstack([xyxy, confs[:, None]])

            right_taken = False
            left_taken = False

            for det_idx, raw_id in enumerate(track_ids):
                hand_cls = handedness[[det_idx]]
                is_right = bool(hand_cls > 0)
                is_left = bool(hand_cls == 0)

                if raw_id == -1:
                    track_key = untracked_right_id if is_right else untracked_left_id
                else:
                    track_key = raw_id

                entry = {
                    'frame': frame_idx,
                    'det': True,
                    'det_box': boxes_conf[[det_idx]],
                    'det_handedness': hand_cls,
                }

                # Only the first right and the first left detection of a frame are kept.
                if (is_right and not right_taken) or (is_left and not left_taken):
                    if track_key not in tracks:
                        tracks[track_key] = []
                    tracks[track_key].append(entry)

                    if is_right:
                        right_taken = True
                    elif is_left:
                        left_taken = True

    return boxes_, tracks
| |
|
| | |
def _merge_tracks_by_hand(tracks: dict) -> tuple:
    """Merge all tracklets into one left and one right list, each sorted by frame."""
    left_trk = []
    right_trk = []
    for trk in tracks.values():
        valid = np.array([t['det'] for t in trk])
        is_right = np.concatenate([t['det_handedness'] for t in trk])[valid]

        # Majority vote over the tracklet's detections decides its side.
        if is_right.sum() / len(is_right) < 0.5:
            left_trk.extend(trk)
        else:
            right_trk.extend(trk)

    left_trk = sorted(left_trk, key=lambda x: x['frame'])
    right_trk = sorted(right_trk, key=lambda x: x['frame'])
    return left_trk, right_trk


def hawor_motion_estimation(
    imgfiles: list,
    tracks: dict,
    model: HAWOR,
    img_focal: float,
    single_image: bool = False
) -> dict:
    """
    Performs HAWOR 3D hand reconstruction on detected and tracked hand regions.

    All tracklets are first merged into a single left-hand and a single
    right-hand track by majority vote on their detected handedness. Each
    merged track is then split into chunks by `parse_chunks`, every chunk is
    run through `model.inference`, and the per-frame outputs are collected.

    Args:
        imgfiles (list): List of image frames; the first frame's shape defines
            the image center passed to the model.
        tracks (dict): Dictionary mapping track ID to a list of detection
            dicts with keys 'frame', 'det', 'det_box' and 'det_handedness'.
        model (HAWOR): The initialized HAWOR model.
        img_focal (float): Camera focal length.
        single_image (bool): If True a hand needs at least 1 detection to be
            processed, otherwise at least 2.

    Returns:
        dict: {'left': {...}, 'right': {...}}; each inner dict maps a frame
        index to a dict with numpy entries 'beta', 'hand_pose',
        'global_orient' and 'transl'. Both inner dicts are empty when
        `imgfiles` is empty or no hand has enough detections.
    """
    left_results = {}
    right_results = {}

    # Without frames there is nothing to reconstruct and no image to take the
    # center from; return empty results instead of failing on imgfiles[0].
    if len(imgfiles) == 0:
        return {'left': left_results, 'right': right_results}

    left_trk, right_trk = _merge_tracks_by_hand(tracks)
    final_tracks = {
        0: left_trk,
        1: right_trk
    }

    img = imgfiles[0]
    img_center = [img.shape[1] / 2, img.shape[0] / 2]

    min_detections = 1 if single_image else 2

    for idx, trk in final_tracks.items():
        print(f"tracklet {idx}:")

        valid = np.array([t['det'] for t in trk])
        if valid.sum() < min_detections:
            continue

        boxes = np.concatenate([t['det_box'] for t in trk])
        non_zero_indices = np.where(np.any(boxes != 0, axis=1))[0]
        first_non_zero = non_zero_indices[0]
        last_non_zero = non_zero_indices[-1]
        # Rewrite the span between the first and last non-zero box with
        # interpolate_bboxes' output and mark the whole span as valid.
        boxes[first_non_zero:last_non_zero+1] = interpolate_bboxes(boxes[first_non_zero:last_non_zero+1])
        valid[first_non_zero:last_non_zero+1] = True

        boxes = boxes[first_non_zero:last_non_zero+1]
        is_right = np.concatenate([t['det_handedness'] for t in trk])[valid]
        frame = np.array([t['frame'] for t in trk])[valid]

        # Majority vote on handedness; a left-hand track is run with do_flip=True.
        do_flip = bool(is_right.sum() / len(is_right) < 0.5)

        frame_chunks, boxes_chunks = parse_chunks(frame, boxes, min_len=1)

        if len(frame_chunks) == 0:
            continue

        target_results = left_results if idx == 0 else right_results

        for frame_ck, boxes_ck in zip(frame_chunks, boxes_chunks):
            print(f"inference from frame {frame_ck[0]} to {frame_ck[-1]}")
            img_ck = [imgfiles[i] for i in frame_ck]

            results = model.inference(img_ck, boxes_ck, img_focal=img_focal, img_center=img_center, do_flip=do_flip)

            data_out = {
                "init_root_orient": results["pred_rotmat"][None, :, 0],
                "init_hand_pose": results["pred_rotmat"][None, :, 1:],
                "init_trans": results["pred_trans"][None, :, 0],
                "init_betas": results["pred_shape"][None, :]
            }

            # Round-trip through axis-angle so the root orientation can be
            # mirrored (y and z components negated) for flipped hands.
            # NOTE(review): only the root orientation is mirrored; the hand
            # pose is converted back unchanged — confirm this is intended.
            init_root = rotation_matrix_to_angle_axis(data_out["init_root_orient"])
            init_hand_pose = rotation_matrix_to_angle_axis(data_out["init_hand_pose"])
            if do_flip:
                init_root[..., 1] *= -1
                init_root[..., 2] *= -1
            data_out["init_root_orient"] = angle_axis_to_rotation_matrix(init_root)
            data_out["init_hand_pose"] = angle_axis_to_rotation_matrix(init_hand_pose)

            # One host copy per output for the whole chunk rather than one per frame.
            betas_np = data_out['init_betas'][0].cpu().numpy()
            hand_pose_np = data_out['init_hand_pose'][0].cpu().numpy()
            root_orient_np = data_out['init_root_orient'][0].cpu().numpy()
            trans_np = data_out['init_trans'][0].cpu().numpy()

            s_frame = frame_ck[0]
            e_frame = frame_ck[-1]

            # Outputs are indexed by offset from the chunk's first frame.
            # NOTE(review): assumes the frames within a chunk are consecutive — confirm against parse_chunks.
            for frame_id in range(s_frame, e_frame+1):
                offset = frame_id - s_frame
                target_results[frame_id] = {
                    'beta': betas_np[offset],
                    'hand_pose': hand_pose_np[offset],
                    'global_orient': root_orient_np[offset],
                    'transl': trans_np[offset],
                }

    reformat_results = {'left': left_results, 'right': right_results}

    return reformat_results
| |
|
| |
|