chongzhou committed
Commit cf4b18a · 1 Parent(s): 238c545

move segment_with_points to CPU

Files changed (2)
  1. app.py +56 -77
  2. sam2/sam2_video_predictor.py +1 -1
app.py CHANGED
@@ -246,7 +246,6 @@ def preprocess_video_in(
     ]
 
 
-@spaces.GPU(duration=5)
 def segment_with_points(
     point_type,
     first_frame,
@@ -256,68 +255,64 @@ def segment_with_points(
     inference_state,
     evt: gr.SelectData,
 ):
-    predictor = build_sam2_video_predictor(model_cfg, sam2_checkpoint, device="cuda")
-    if torch.cuda.get_device_properties(0).major >= 8:
-        torch.backends.cuda.matmul.allow_tf32 = True
-        torch.backends.cudnn.allow_tf32 = True
-    with torch.autocast(device_type="cuda", dtype=torch.bfloat16):
-        input_points.append(evt.index)
-        print(f"TRACKING INPUT POINT: {input_points}")
-
-        if point_type == "include":
-            input_labels.append(1)
-        elif point_type == "exclude":
-            input_labels.append(0)
-        print(f"TRACKING INPUT LABEL: {input_labels}")
-
-        # Open the image and get its dimensions
-        transparent_background = Image.fromarray(first_frame).convert("RGBA")
-        w, h = transparent_background.size
-
-        # Define the circle radius as a fraction of the smaller dimension
-        fraction = 0.01  # You can adjust this value as needed
-        radius = int(fraction * min(w, h))
-
-        # Create a transparent layer to draw on
-        transparent_layer = np.zeros((h, w, 4), dtype=np.uint8)
-
-        for index, track in enumerate(input_points):
-            if input_labels[index] == 1:
-                cv2.circle(transparent_layer, track, radius, (0, 255, 0, 255), -1)
-            else:
-                cv2.circle(transparent_layer, track, radius, (255, 0, 0, 255), -1)
-
-        # Convert the transparent layer back to an image
-        transparent_layer = Image.fromarray(transparent_layer, "RGBA")
-        selected_point_map = Image.alpha_composite(
-            transparent_background, transparent_layer
-        )
+    predictor = build_sam2_video_predictor(model_cfg, sam2_checkpoint, device="cpu")
+    input_points.append(evt.index)
+    print(f"TRACKING INPUT POINT: {input_points}")
+
+    if point_type == "include":
+        input_labels.append(1)
+    elif point_type == "exclude":
+        input_labels.append(0)
+    print(f"TRACKING INPUT LABEL: {input_labels}")
+
+    # Open the image and get its dimensions
+    transparent_background = Image.fromarray(first_frame).convert("RGBA")
+    w, h = transparent_background.size
+
+    # Define the circle radius as a fraction of the smaller dimension
+    fraction = 0.01  # You can adjust this value as needed
+    radius = int(fraction * min(w, h))
+
+    # Create a transparent layer to draw on
+    transparent_layer = np.zeros((h, w, 4), dtype=np.uint8)
+
+    for index, track in enumerate(input_points):
+        if input_labels[index] == 1:
+            cv2.circle(transparent_layer, track, radius, (0, 255, 0, 255), -1)
+        else:
+            cv2.circle(transparent_layer, track, radius, (255, 0, 0, 255), -1)
+
+    # Convert the transparent layer back to an image
+    transparent_layer = Image.fromarray(transparent_layer, "RGBA")
+    selected_point_map = Image.alpha_composite(
+        transparent_background, transparent_layer
+    )
 
-        # Let's add a positive click at (x, y) = (210, 350) to get started
-        points = np.array(input_points, dtype=np.float32)
-        # for labels, `1` means positive click and `0` means negative click
-        labels = np.array(input_labels, dtype=np.int32)
-        _, _, out_mask_logits = predictor.add_new_points(
-            inference_state=inference_state,
-            frame_idx=0,
-            obj_id=OBJ_ID,
-            points=points,
-            labels=labels,
-        )
+    # Let's add a positive click at (x, y) = (210, 350) to get started
+    points = np.array(input_points, dtype=np.float32)
+    # for labels, `1` means positive click and `0` means negative click
+    labels = np.array(input_labels, dtype=np.int32)
+    _, _, out_mask_logits = predictor.add_new_points(
+        inference_state=inference_state,
+        frame_idx=0,
+        obj_id=OBJ_ID,
+        points=points,
+        labels=labels,
+    )
 
-        mask_image = show_mask((out_mask_logits[0] > 0.0).cpu().numpy())
-        first_frame_output = Image.alpha_composite(transparent_background, mask_image)
+    mask_image = show_mask((out_mask_logits[0] > 0.0).cpu().numpy())
+    first_frame_output = Image.alpha_composite(transparent_background, mask_image)
 
-        torch.cuda.empty_cache()
-        return (
-            selected_point_map,
-            first_frame_output,
-            first_frame,
-            all_frames,
-            input_points,
-            input_labels,
-            inference_state,
-        )
+    torch.cuda.empty_cache()
+    return (
+        selected_point_map,
+        first_frame_output,
+        first_frame,
+        all_frames,
+        input_points,
+        input_labels,
+        inference_state,
+    )
 
 
 def show_mask(mask, obj_id=None, random_color=False, convert_to_image=True):
@@ -338,10 +333,8 @@ def show_mask(mask, obj_id=None, random_color=False, convert_to_image=True):
 @spaces.GPU(duration=30)
 def propagate_to_all(
     video_in,
-    first_frame,
     all_frames,
     input_points,
-    input_labels,
     inference_state,
 ):
     predictor = build_sam2_video_predictor(model_cfg, sam2_checkpoint, device="cuda")
@@ -394,14 +387,7 @@ def propagate_to_all(
     # Write the result to a file
     clip.write_videofile(final_vid_output_path, codec="libx264")
 
-    return (
-        gr.update(value=final_vid_output_path),
-        first_frame,
-        all_frames,
-        input_points,
-        input_labels,
-        inference_state,
-    )
+    return gr.update(value=final_vid_output_path)
 
 
 def update_ui():
@@ -586,19 +572,12 @@ with gr.Blocks() as demo:
         fn=propagate_to_all,
         inputs=[
            video_in,
-            first_frame,
             all_frames,
             input_points,
-            input_labels,
             inference_state,
         ],
        outputs=[
             output_video,
-            first_frame,
-            all_frames,
-            input_points,
-            input_labels,
-            inference_state,
        ],
        concurrency_limit=10,
        queue=False,
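
Note on the deleted mixed-precision block: the TF32 flags and the bfloat16 autocast only apply to CUDA, so moving the click handler to CPU means they are simply dropped. A minimal sketch of an alternative that keeps one code path for both devices (the helper name autocast_if_cuda is hypothetical and not part of this commit):

import contextlib

import torch


def autocast_if_cuda(device: str):
    """Return a bfloat16 autocast context on CUDA, or a no-op context on CPU."""
    if device == "cuda":
        # TF32 matmuls only help on Ampere (compute capability 8.x) and newer GPUs.
        if torch.cuda.get_device_properties(0).major >= 8:
            torch.backends.cuda.matmul.allow_tf32 = True
            torch.backends.cudnn.allow_tf32 = True
        return torch.autocast(device_type="cuda", dtype=torch.bfloat16)
    return contextlib.nullcontext()

With such a helper, segment_with_points could wrap its predictor calls in `with autocast_if_cuda(device):` and run unchanged on either device; the commit instead takes the simpler route of a CPU-only path for the interactive clicks while propagate_to_all keeps the GPU.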
sam2/sam2_video_predictor.py CHANGED
@@ -107,7 +107,7 @@ class SAM2VideoPredictor(SAM2Base):
         inference_state["tracking_has_started"] = False
         inference_state["frames_already_tracked"] = {}
         # Warm up the visual backbone and cache the image feature on frame 0
-        # self._get_image_feature(inference_state, frame_idx=0, batch_size=1)
+        self._get_image_feature(inference_state, frame_idx=0, batch_size=1)
         return inference_state
 
     @classmethod
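
The one-line change above re-enables the warm-up described by the adjacent comment: init_state computes and caches the image features for frame 0 up front, so the first click does not pay for a backbone forward pass. A toy illustration of that warm-up pattern, with made-up names (extract_features, init_state_demo) standing in for the real SAM 2 internals:

from functools import lru_cache


@lru_cache(maxsize=None)
def extract_features(frame_idx: int) -> tuple[float, ...]:
    # Stand-in for an expensive visual-backbone forward pass over one frame.
    return tuple(float(frame_idx + i) for i in range(4))


def init_state_demo() -> dict:
    state = {"tracking_has_started": False, "frames_already_tracked": {}}
    # Warm up: populate the feature cache for frame 0 at init time, so the
    # first user interaction reads from the cache instead of recomputing.
    extract_features(0)
    return state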