myniu committed on
Commit 2935911
1 Parent(s): 69f142b
Files changed (1)
  1. app.py +28 -30
app.py CHANGED
@@ -216,20 +216,18 @@ def visualize_drag_v2(background_image_path, splited_tracks, width, height):
 
 class Drag:
     @spaces.GPU(duration=200)
-    def __init__(self, device, height, width):
-        self.device = device
+    def __init__(self, height, width):
 
         svd_ckpt = "ckpts/stable-video-diffusion-img2vid-xt-1-1"
         mofa_ckpt = "ckpts/controlnet"
 
-        self.device = 'cuda'
         self.weight_dtype = torch.float16
 
         self.pipeline, self.cmp = init_models(
             svd_ckpt,
             mofa_ckpt,
             weight_dtype=self.weight_dtype,
-            device=self.device
+            device='cuda'
         )
 
         self.height = height
@@ -304,12 +302,12 @@ class Drag:
 
         print('start diffusion process...')
 
-        input_drag_384_inmask = input_drag_384_inmask.to(self.device, dtype=self.weight_dtype)
-        mask_384_inmask = mask_384_inmask.to(self.device, dtype=self.weight_dtype)
-        input_drag_384_outmask = input_drag_384_outmask.to(self.device, dtype=self.weight_dtype)
-        mask_384_outmask = mask_384_outmask.to(self.device, dtype=self.weight_dtype)
+        input_drag_384_inmask = input_drag_384_inmask.to('cuda', dtype=self.weight_dtype)
+        mask_384_inmask = mask_384_inmask.to('cuda', dtype=self.weight_dtype)
+        input_drag_384_outmask = input_drag_384_outmask.to('cuda', dtype=self.weight_dtype)
+        mask_384_outmask = mask_384_outmask.to('cuda', dtype=self.weight_dtype)
 
-        input_first_frame_384 = input_first_frame_384.to(self.device, dtype=self.weight_dtype)
+        input_first_frame_384 = input_first_frame_384.to('cuda', dtype=self.weight_dtype)
 
         if in_mask_flag:
             flow_inmask = self.get_flow(
@@ -318,7 +316,7 @@ class Drag:
             )
         else:
             fb, fl = mask_384_inmask.shape[:2]
-            flow_inmask = torch.zeros(fb, fl, 2, self.height, self.width).to(self.device, dtype=self.weight_dtype)
+            flow_inmask = torch.zeros(fb, fl, 2, self.height, self.width).to('cuda', dtype=self.weight_dtype)
 
         if out_mask_flag:
             flow_outmask = self.get_flow(
@@ -327,7 +325,7 @@ class Drag:
             )
         else:
             fb, fl = mask_384_outmask.shape[:2]
-            flow_outmask = torch.zeros(fb, fl, 2, self.height, self.width).to(self.device, dtype=self.weight_dtype)
+            flow_outmask = torch.zeros(fb, fl, 2, self.height, self.width).to('cuda', dtype=self.weight_dtype)
 
         inmask_no_zero = (flow_inmask != 0).all(dim=2)
         inmask_no_zero = inmask_no_zero.unsqueeze(2).expand_as(flow_inmask)
@@ -426,10 +424,10 @@ class Drag:
                 np.zeros((25 - 1, 384, 384, 2)), \
                 np.zeros((25 - 1, 384, 384))
 
-        input_drag_384_inmask = torch.from_numpy(input_drag_384_inmask).unsqueeze(0).to(self.device) # [1, 13, h, w, 2]
-        input_mask_384_inmask = torch.from_numpy(input_mask_384_inmask).unsqueeze(0).to(self.device) # [1, 13, h, w]
-        input_drag_384_outmask = torch.from_numpy(input_drag_384_outmask).unsqueeze(0).to(self.device) # [1, 13, h, w, 2]
-        input_mask_384_outmask = torch.from_numpy(input_mask_384_outmask).unsqueeze(0).to(self.device) # [1, 13, h, w]
+        input_drag_384_inmask = torch.from_numpy(input_drag_384_inmask).unsqueeze(0).to('cuda') # [1, 13, h, w, 2]
+        input_mask_384_inmask = torch.from_numpy(input_mask_384_inmask).unsqueeze(0).to('cuda') # [1, 13, h, w]
+        input_drag_384_outmask = torch.from_numpy(input_drag_384_outmask).unsqueeze(0).to('cuda') # [1, 13, h, w, 2]
+        input_mask_384_outmask = torch.from_numpy(input_mask_384_outmask).unsqueeze(0).to('cuda') # [1, 13, h, w]
 
         first_frames_transform = transforms.Compose([
             lambda x: Image.fromarray(x),
@@ -437,7 +435,7 @@ class Drag:
         ])
 
         input_first_frame = image2arr(first_frame_path)
-        input_first_frame = repeat(first_frames_transform(input_first_frame), 'c h w -> b c h w', b=1).to(self.device)
+        input_first_frame = repeat(first_frames_transform(input_first_frame), 'c h w -> b c h w', b=1).to('cuda')
 
         seed = 42
         num_frames = 25
@@ -452,12 +450,12 @@ class Drag:
         input_drag_384_outmask = input_drag_384_outmask.permute(0, 1, 4, 2, 3) # [1, 13, 2, 384, 384]
         mask_384_outmask = input_mask_384_outmask.unsqueeze(2).repeat(1, 1, 2, 1, 1) # [1, 13, 2, 384, 384]
 
-        input_drag_384_inmask = input_drag_384_inmask.to(self.device, dtype=self.weight_dtype)
-        mask_384_inmask = mask_384_inmask.to(self.device, dtype=self.weight_dtype)
-        input_drag_384_outmask = input_drag_384_outmask.to(self.device, dtype=self.weight_dtype)
-        mask_384_outmask = mask_384_outmask.to(self.device, dtype=self.weight_dtype)
+        input_drag_384_inmask = input_drag_384_inmask.to('cuda', dtype=self.weight_dtype)
+        mask_384_inmask = mask_384_inmask.to('cuda', dtype=self.weight_dtype)
+        input_drag_384_outmask = input_drag_384_outmask.to('cuda', dtype=self.weight_dtype)
+        mask_384_outmask = mask_384_outmask.to('cuda', dtype=self.weight_dtype)
 
-        input_first_frame_384 = input_first_frame_384.to(self.device, dtype=self.weight_dtype)
+        input_first_frame_384 = input_first_frame_384.to('cuda', dtype=self.weight_dtype)
 
         if in_mask_flag:
             flow_inmask = self.get_flow(
@@ -466,7 +464,7 @@ class Drag:
             )
         else:
             fb, fl = mask_384_inmask.shape[:2]
-            flow_inmask = torch.zeros(fb, fl, 2, self.height, self.width).to(self.device, dtype=self.weight_dtype)
+            flow_inmask = torch.zeros(fb, fl, 2, self.height, self.width).to('cuda', dtype=self.weight_dtype)
 
         if out_mask_flag:
             flow_outmask = self.get_flow(
@@ -475,7 +473,7 @@ class Drag:
             )
         else:
             fb, fl = mask_384_outmask.shape[:2]
-            flow_outmask = torch.zeros(fb, fl, 2, self.height, self.width).to(self.device, dtype=self.weight_dtype)
+            flow_outmask = torch.zeros(fb, fl, 2, self.height, self.width).to('cuda', dtype=self.weight_dtype)
 
         inmask_no_zero = (flow_inmask != 0).all(dim=2)
         inmask_no_zero = inmask_no_zero.unsqueeze(2).expand_as(flow_inmask)
@@ -566,17 +564,17 @@ class Drag:
         for i in tqdm(range(num_inference)):
             if not outputs:
                 first_frames = image2arr(first_frame_path)
-                first_frames = repeat(first_frames_transform(first_frames), 'c h w -> b c h w', b=inference_batch_size).to(self.device)
+                first_frames = repeat(first_frames_transform(first_frames), 'c h w -> b c h w', b=inference_batch_size).to('cuda')
             else:
                 first_frames = outputs['logits_imgs'][:, -1]
 
 
             outputs = self.forward_sample(
-                input_drag_384_inmask.to(self.device),
-                input_drag_384_outmask.to(self.device),
-                first_frames.to(self.device),
-                input_mask_384_inmask.to(self.device),
-                input_mask_384_outmask.to(self.device),
+                input_drag_384_inmask.to('cuda'),
+                input_drag_384_outmask.to('cuda'),
+                first_frames.to('cuda'),
+                input_mask_384_inmask.to('cuda'),
+                input_mask_384_outmask.to('cuda'),
                 in_mask_flag,
                 out_mask_flag,
                 motion_brush_mask_384,
@@ -656,7 +654,7 @@ with gr.Blocks() as demo:
     )
 
     target_size = 512
-    DragNUWA_net = Drag("cuda:0", target_size, target_size)
+    DragNUWA_net = Drag(target_size, target_size)
     first_frame_path = gr.State()
     tracking_points = gr.State([])
     motion_brush_points = gr.State([])
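
Net effect of the commit: Drag.__init__ loses its device parameter and every self.device reference becomes a literal 'cuda'. A plausible motivation, not stated in the commit itself, is the ZeroGPU model of Hugging Face Spaces, where the @spaces.GPU decorator attaches a GPU only while the decorated call runs, so a device chosen at construction time is not meaningful. A minimal sketch of that pattern under this assumption follows; Runner and run are hypothetical stand-ins, not the app's real API.

# Minimal sketch of the device-handling pattern the commit adopts
# (assumes a Hugging Face ZeroGPU Space with the `spaces` package;
# `Runner` and `run` are hypothetical names, not the app's real API).
import spaces
import torch

class Runner:
    def __init__(self, height, width):
        # No `device` parameter and no stored `self.device`:
        # the literal 'cuda' is written at each point of use instead.
        self.weight_dtype = torch.float16
        self.height = height
        self.width = width

    @spaces.GPU(duration=200)  # a GPU is attached only while this call runs
    def run(self, x: torch.Tensor) -> torch.Tensor:
        # Move tensors to the GPU inside the decorated call,
        # where CUDA is guaranteed to be available.
        return x.to('cuda', dtype=self.weight_dtype)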