Spaces: Running on Zero

alex committed · Commit fd0980c · 1 Parent(s): c433e00

bounding box fix

Browse files
- app.py +5 -4
- wan/modules/animate/preprocess/process_pipepline.py +178 -1
app.py
CHANGED

@@ -549,10 +549,11 @@ with gr.Blocks(css=css, title="Wan 2.2 Animate --replace", theme=gr.themes.Ocean
     action_button = gr.Button("Wan Animate 🦆", variant='primary', elem_classes="button-gradient")

     with gr.Accordion("Preprocessed Data", open=False, visible=True):
-
-
-
-
+        with gr.Row():
+            pose_video = gr.Video(label="Pose Video")
+            bg_video = gr.Video(label="Background Video")
+            face_video = gr.Video(label="Face Video")
+            mask_video = gr.Video(label="Mask Video")

     with gr.Row():
         with gr.Column(elem_id="col-showcase"):
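The added block surfaces the four intermediate videos that preprocessing produces. For orientation, a minimal sketch of how such outputs are typically wired up in Gradio; the run_preprocess function, its input, and its return order are illustrative assumptions, not part of this commit:

import gradio as gr

def run_preprocess(video_path):
    # hypothetical hook standing in for the Space's preprocess step;
    # returns paths to the four derived videos in display order
    return "pose.mp4", "bg.mp4", "face.mp4", "mask.mp4"

with gr.Blocks() as demo:
    input_video = gr.Video(label="Input Video")
    action_button = gr.Button("Wan Animate 🦆", variant='primary')
    with gr.Accordion("Preprocessed Data", open=False, visible=True):
        with gr.Row():
            pose_video = gr.Video(label="Pose Video")
            bg_video = gr.Video(label="Background Video")
            face_video = gr.Video(label="Face Video")
            mask_video = gr.Video(label="Mask Video")
    action_button.click(
        run_preprocess,
        inputs=input_video,
        outputs=[pose_video, bg_video, face_video, mask_video],
    )

demo.launch()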
wan/modules/animate/preprocess/process_pipepline.py
CHANGED

@@ -94,7 +94,7 @@ class ProcessPipeline():
         canvas = np.zeros_like(refer_img)
         conditioning_image = draw_aapose_by_meta_new(canvas, meta)
         cond_images.append(conditioning_image)
-        masks = self.
+        masks = self.get_mask_from_face_bbox(frames, 400, tpl_pose_metas)

         bg_images = []
         aug_masks = []
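The helper called here, get_mask_from_face_bbox, is added in the hunk below. One detail worth flagging, and a plausible reading of what the "bounding box fix" in the commit title refers to: get_face_bboxes returns coordinates as (x1, x2, y1, y2), while SAM2-style box prompts expect XYXY order, so the new helper reorders before clipping. The relevant lines from the helper below:

# get_face_bboxes (this repo's helper) yields (x1, x2, y1, y2);
# box prompts expect XYXY = (x1, y1, x2, y2), hence the reorder:
x1, x2, y1, y2 = get_face_bboxes(meta['keypoints_face'][:, :2],
                                 scale=1.3, image_shape=(H, W))
box_xyxy = np.array([x1, y1, x2, y2], dtype=np.float32)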
@@ -352,3 +352,180 @@ class ProcessPipeline():
         metas_list.append(meta)
         return metas_list

+    def get_mask_from_face_bbox(self, frames, th_step, kp2ds_all):
+        """
+        Build masks using a face bounding box per key frame (derived from keypoints_face),
+        then propagate with SAM2 across each chunk of frames.
+        """
+        H, W = frames[0].shape[:2]
+
+        def _clip_box(x1, y1, x2, y2, W, H):
+            x1 = max(0, min(int(x1), W - 1))
+            x2 = max(0, min(int(x2), W - 1))
+            y1 = max(0, min(int(y1), H - 1))
+            y2 = max(0, min(int(y2), H - 1))
+            if x2 <= x1: x2 = min(W - 1, x1 + 1)
+            if y2 <= y1: y2 = min(H - 1, y1 + 1)
+            return x1, y1, x2, y2
+
+        frame_num = len(frames)
+        if frame_num < th_step:
+            num_step = 1
+        else:
+            num_step = (frame_num + th_step) // th_step
+
+        all_mask = []
+
+        for step_idx in range(num_step):
+            each_frames = frames[step_idx * th_step:(step_idx + 1) * th_step]
+            kp2ds = kp2ds_all[step_idx * th_step:(step_idx + 1) * th_step]
+            if len(each_frames) == 0:
+                continue
+
+            # pick a few key frames in this chunk
+            key_frame_num = 4 if len(each_frames) > 4 else 1
+            key_frame_step = max(1, len(kp2ds) // key_frame_num)
+            key_frame_index_list = list(range(0, len(kp2ds), key_frame_step))[:key_frame_num]
+
+            # compute face boxes on the selected key frames
+            key_frame_boxes = []
+            for kfi in key_frame_index_list:
+                meta = kp2ds[kfi]
+                # get_face_bboxes returns (x1, x2, y1, y2)
+                x1, x2, y1, y2 = get_face_bboxes(
+                    meta['keypoints_face'][:, :2],
+                    scale=1.3,
+                    image_shape=(H, W)
+                )
+                x1, y1, x2, y2 = _clip_box(x1, y1, x2, y2, W, H)
+                key_frame_boxes.append(np.array([x1, y1, x2, y2], dtype=np.float32))
+
+            # init SAM2 for this chunk
+            inference_state = self.predictor.init_state_v2(frames=each_frames)
+            self.predictor.reset_state(inference_state)
+            ann_obj_id = 1
+
+            # seed with box prompts (preferred), else fall back to points
+            for ann_frame_idx, box_xyxy in zip(key_frame_index_list, key_frame_boxes):
+                used_box = False
+                try:
+                    # ideal path, if the predictor exposes a box API
+                    _ = self.predictor.add_new_box(
+                        inference_state=inference_state,
+                        frame_idx=ann_frame_idx,
+                        obj_id=ann_obj_id,
+                        box=box_xyxy[None, :]  # shape (1, 4)
+                    )
+                    used_box = True
+                except Exception:
+                    used_box = False
+
+                if not used_box:
+                    # Fallback: sample a few positive points inside the box
+                    x1, y1, x2, y2 = box_xyxy.astype(int)
+                    cx, cy = (x1 + x2) // 2, (y1 + y2) // 2
+                    pts = np.array([
+                        [cx, cy],
+                        [x1 + (x2 - x1) // 4, cy],
+                        [x2 - (x2 - x1) // 4, cy],
+                        [cx, y1 + (y2 - y1) // 4],
+                        [cx, y2 - (y2 - y1) // 4],
+                    ], dtype=np.int32)
+                    labels = np.ones(len(pts), dtype=np.int32)  # 1 = positive
+                    _ = self.predictor.add_new_points(
+                        inference_state=inference_state,
+                        frame_idx=ann_frame_idx,
+                        obj_id=ann_obj_id,
+                        points=pts,
+                        labels=labels,
+                    )
+
+            # propagate across the chunk
+            video_segments = {}
+            for out_frame_idx, out_obj_ids, out_mask_logits in self.predictor.propagate_in_video(inference_state):
+                video_segments[out_frame_idx] = {
+                    out_obj_id: (out_mask_logits[i] > 0.0).cpu().numpy()
+                    for i, out_obj_id in enumerate(out_obj_ids)
+                }
+
+            # collect masks (single object id)
+            for out_frame_idx in range(len(video_segments)):
+                # (H, W) boolean/uint8
+                mask = next(iter(video_segments[out_frame_idx].values()))
+                mask = mask[0].astype(np.uint8)
+                all_mask.append(mask)
+
+        return all_mask
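A note on the try/except above: upstream SAM2 (facebookresearch/sam2) exposes box prompts through the combined add_new_points_or_box method rather than a separate add_new_box, which is presumably why the code probes for a box API and falls back to points. Whether this Space's wrapped predictor forwards that method is an assumption, but a sketch of the seeding call against the upstream signature, using the same in-scope names as the helper above, would be:

# hedged sketch against upstream SAM2's video-predictor API;
# assumes self.predictor forwards add_new_points_or_box
_, out_obj_ids, out_mask_logits = self.predictor.add_new_points_or_box(
    inference_state=inference_state,
    frame_idx=ann_frame_idx,
    obj_id=ann_obj_id,
    box=box_xyxy,  # XYXY, shape (4,)
)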
+    def get_mask_from_face_point(self, frames, th_step, kp2ds_all):
+        """
+        Build masks using a single face *center point* per key frame,
+        then propagate with SAM2 across each chunk of frames.
+        """
+        H, W = frames[0].shape[:2]
+
+        frame_num = len(frames)
+        num_step = 1 if frame_num < th_step else (frame_num + th_step) // th_step
+
+        all_mask = []
+
+        for step_idx in range(num_step):
+            each_frames = frames[step_idx * th_step:(step_idx + 1) * th_step]
+            kp2ds = kp2ds_all[step_idx * th_step:(step_idx + 1) * th_step]
+            if len(each_frames) == 0:
+                continue
+
+            # choose a few key frames to seed the object
+            key_frame_num = 4 if len(each_frames) > 4 else 1
+            key_frame_step = max(1, len(kp2ds) // key_frame_num)
+            key_frame_index_list = list(range(0, len(kp2ds), key_frame_step))[:key_frame_num]
+
+            # compute center point from face bbox for each selected key frame
+            center_pts = []
+            for kfi in key_frame_index_list:
+                meta = kp2ds[kfi]
+                # get_face_bboxes returns (x1, x2, y1, y2)
+                x1, x2, y1, y2 = get_face_bboxes(
+                    meta['keypoints_face'][:, :2],
+                    scale=1.3,
+                    image_shape=(H, W)
+                )
+                cx = (x1 + x2) // 2
+                cy = (y1 + y2) // 2
+                # clip just in case
+                cx = int(max(0, min(cx, W - 1)))
+                cy = int(max(0, min(cy, H - 1)))
+                center_pts.append(np.array([cx, cy], dtype=np.int32))
+
+            # init SAM2 for this chunk
+            inference_state = self.predictor.init_state_v2(frames=each_frames)
+            self.predictor.reset_state(inference_state)
+            ann_obj_id = 1
+
+            # seed each key frame with a single positive point at the face center
+            for ann_frame_idx, pt in zip(key_frame_index_list, center_pts):
+                pts = pt[None, :]  # shape (1, 2)
+                labels = np.ones(1, dtype=np.int32)  # 1 = positive
+                _ = self.predictor.add_new_points(
+                    inference_state=inference_state,
+                    frame_idx=ann_frame_idx,
+                    obj_id=ann_obj_id,
+                    points=pts,
+                    labels=labels,
+                )
+
+            # propagate across the chunk
+            video_segments = {}
+            for out_frame_idx, out_obj_ids, out_mask_logits in self.predictor.propagate_in_video(inference_state):
+                video_segments[out_frame_idx] = {
+                    out_obj_id: (out_mask_logits[i] > 0.0).cpu().numpy()
+                    for i, out_obj_id in enumerate(out_obj_ids)
+                }
+
+            # collect masks (single object id)
+            for out_frame_idx in range(len(video_segments)):
+                mask = next(iter(video_segments[out_frame_idx].values()))
+                mask = mask[0].astype(np.uint8)
+                all_mask.append(mask)
+
+        return all_mask
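Both helpers share the same chunking rule: frames are processed in windows of th_step (400 at the call site in this commit), with num_step = (frame_num + th_step) // th_step once frame_num >= th_step. That formula allocates one extra, empty step whenever frame_num is an exact multiple of th_step, which is what the len(each_frames) == 0 guard absorbs. A quick standalone check of the arithmetic:

# standalone check of the chunking arithmetic used by both helpers
def num_chunks(frame_num, th_step=400):
    return 1 if frame_num < th_step else (frame_num + th_step) // th_step

for n in (120, 400, 799, 800):
    sizes = [min(n, (i + 1) * 400) - i * 400
             for i in range(num_chunks(n)) if i * 400 < n]  # mirrors the empty-chunk guard
    print(n, num_chunks(n), sizes)
# 120 -> 1 step,  [120]
# 400 -> 2 steps, [400]      (second step empty, skipped)
# 799 -> 2 steps, [400, 399]
# 800 -> 3 steps, [400, 400] (third step empty, skipped)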