Spaces:

facebook
/

EdgeTAM

Running on Zero

chongzhou committed 14 days ago

Commit

45c2c68

1 Parent(s): 7209747

move cached feature to CUDA

Files changed (1) hide show

sam2/sam2_video_predictor.py CHANGED Viewed

@@ -882,9 +882,9 @@ class SAM2VideoPredictor(SAM2Base):
         image, backbone_out = inference_state["cached_features"].get(
             frame_idx, (None, None)
         )
         if backbone_out is None:
             # Cache miss -- we will run inference on a single image
-            device = inference_state["device"]
             image = inference_state["images"][frame_idx].to(device).float().unsqueeze(0)
             backbone_out = self.forward_image(image)
             # Cache the most recent frame's feature (for repeated interactions with
@@ -900,10 +900,10 @@ class SAM2VideoPredictor(SAM2Base):
         for i, feat in enumerate(expanded_backbone_out["backbone_fpn"]):
             expanded_backbone_out["backbone_fpn"][i] = feat.expand(
                 batch_size, -1, -1, -1
-            )
         for i, pos in enumerate(expanded_backbone_out["vision_pos_enc"]):
             pos = pos.expand(batch_size, -1, -1, -1)
-            expanded_backbone_out["vision_pos_enc"][i] = pos
         features = self._prepare_backbone_features(expanded_backbone_out)
         features = (expanded_image,) + features

         image, backbone_out = inference_state["cached_features"].get(
             frame_idx, (None, None)
         )
+        device = inference_state["device"]
         if backbone_out is None:
             # Cache miss -- we will run inference on a single image
             image = inference_state["images"][frame_idx].to(device).float().unsqueeze(0)
             backbone_out = self.forward_image(image)
             # Cache the most recent frame's feature (for repeated interactions with
         for i, feat in enumerate(expanded_backbone_out["backbone_fpn"]):
             expanded_backbone_out["backbone_fpn"][i] = feat.expand(
                 batch_size, -1, -1, -1
+            ).to(device)
         for i, pos in enumerate(expanded_backbone_out["vision_pos_enc"]):
             pos = pos.expand(batch_size, -1, -1, -1)
+            expanded_backbone_out["vision_pos_enc"][i] = pos.to(device)
         features = self._prepare_backbone_features(expanded_backbone_out)
         features = (expanded_image,) + features