myniu committed on
Commit
43ba5db
1 Parent(s): e9f1b91
Files changed (2)
  1. app.py +144 -132
  2. oldapp.py → modifiedapp.py +135 -147
app.py CHANGED
@@ -89,6 +89,79 @@ def get_sparseflow_and_mask_forward(
89
  return s_flow, mask
90
 
91
 
92
  def interpolate_trajectory(points, n_points):
93
  x = [point[0] for point in points]
94
  y = [point[1] for point in points]
@@ -142,110 +215,22 @@ def visualize_drag_v2(background_image_path, splited_tracks, width, height):
142
  return trajectory_maps, transparent_layer
143
 
144
 
145
- with gr.Blocks() as demo:
146
- gr.Markdown("""<h1 align="center">MOFA-Video</h1><br>""")
147
-
148
- gr.Markdown("""Official Gradio Demo for <a href='https://myniuuu.github.io/MOFA_Video'><b>MOFA-Video: Controllable Image Animation via Generative Motion Field Adaptions in Frozen Image-to-Video Diffusion Model</b></a>.<br>""")
149
-
150
- gr.Markdown(
151
- """
152
- During the inference, kindly follow these instructions:
153
- <br>
154
- 1. Use the "Upload Image" button to upload an image. Avoid dragging the image directly into the window. <br>
155
- 2. Proceed to draw trajectories: <br>
156
- 2.1. Click "Add Trajectory" first, then select points on the "Add Trajectory Here" image. The first click sets the starting point. Click multiple points to create a non-linear trajectory. To add a new trajectory, click "Add Trajectory" again and select points on the image. Avoid clicking the "Add Trajectory" button multiple times without clicking points in the image to add the trajectory, as this can lead to errors. <br>
157
- 2.2. After adding each trajectory, an optical flow image will be displayed automatically. Use it as a reference to adjust the trajectory for desired effects (e.g., area, intensity). <br>
158
- 2.3. To delete the latest trajectory, click "Delete Last Trajectory." <br>
159
- 2.4. Choose the Control Scale in the bar. This determines the control intensity. Setting it to 0 means no control (pure generation result of SVD itself), while setting it to 1 results in the strongest control (which will not lead to good results in most cases because of twisting artifacts). A preset value of 0.6 is recommended for most cases. <br>
160
- 2.5. To use the motion brush for restraining the control area of the trajectory, click to add masks on the "Add Motion Brush Here" image. The motion brush restricts the optical flow area derived from the trajectory whose starting point is within the motion brush. The displayed optical flow image will change correspondingly. Adjust the motion brush radius using the "Motion Brush Radius" bar. <br>
161
- 3. Click the "Run" button to animate the image according to the path. <br>
162
- """
163
- )
164
 
165
- height, width = 512, 512
166
 
167
- pipeline, cmp = None, None
168
 
169
- first_frame_path = gr.State()
170
- tracking_points = gr.State([])
171
- motion_brush_points = gr.State([])
172
- motion_brush_mask = gr.State()
173
- motion_brush_viz = gr.State()
174
- inference_batch_size = gr.State(1)
175
 
176
- @spaces.GPU(duration=100)
177
- def init_models(pretrained_model_name_or_path="ckpts/stable-video-diffusion-img2vid-xt-1-1", resume_from_checkpoint="ckpts/controlnet", weight_dtype=torch.float16, device='cuda', enable_xformers_memory_efficient_attention=False, allow_tf32=False):
178
 
179
- from models.unet_spatio_temporal_condition_controlnet import UNetSpatioTemporalConditionControlNetModel
180
- from pipeline.pipeline import FlowControlNetPipeline
181
- from models.svdxt_featureflow_forward_controlnet_s2d_fixcmp_norefine import FlowControlNet, CMP_demo
182
-
183
- print('start loading models...')
184
- # Load scheduler, tokenizer and models.
185
- image_encoder = CLIPVisionModelWithProjection.from_pretrained(
186
- pretrained_model_name_or_path, subfolder="image_encoder", revision=None, variant="fp16"
187
- )
188
- vae = AutoencoderKLTemporalDecoder.from_pretrained(
189
- pretrained_model_name_or_path, subfolder="vae", revision=None, variant="fp16")
190
- unet = UNetSpatioTemporalConditionControlNetModel.from_pretrained(
191
- pretrained_model_name_or_path,
192
- subfolder="unet",
193
- low_cpu_mem_usage=True,
194
- variant="fp16",
195
- )
196
-
197
- controlnet = FlowControlNet.from_pretrained(resume_from_checkpoint)
198
-
199
- cmp = CMP_demo(
200
- './models/cmp/experiments/semiauto_annot/resnet50_vip+mpii_liteflow/config.yaml',
201
- 42000
202
- ).to(device)
203
- cmp.requires_grad_(False)
204
-
205
- # Freeze vae and image_encoder
206
- vae.requires_grad_(False)
207
- image_encoder.requires_grad_(False)
208
- unet.requires_grad_(False)
209
- controlnet.requires_grad_(False)
210
-
211
- # Move image_encoder and vae to gpu and cast to weight_dtype
212
- image_encoder.to(device, dtype=weight_dtype)
213
- vae.to(device, dtype=weight_dtype)
214
- unet.to(device, dtype=weight_dtype)
215
- controlnet.to(device, dtype=weight_dtype)
216
-
217
- if enable_xformers_memory_efficient_attention:
218
- if is_xformers_available():
219
- import xformers
220
-
221
- xformers_version = version.parse(xformers.__version__)
222
- if xformers_version == version.parse("0.0.16"):
223
- print(
224
- "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
225
- )
226
- unet.enable_xformers_memory_efficient_attention()
227
- else:
228
- raise ValueError(
229
- "xformers is not available. Make sure it is installed correctly")
230
-
231
- if allow_tf32:
232
- torch.backends.cuda.matmul.allow_tf32 = True
233
-
234
- pipeline = FlowControlNetPipeline.from_pretrained(
235
- pretrained_model_name_or_path,
236
- unet=unet,
237
- controlnet=controlnet,
238
- image_encoder=image_encoder,
239
- vae=vae,
240
- torch_dtype=weight_dtype,
241
- )
242
- pipeline = pipeline.to(device)
243
-
244
- print('models loaded.')
245
-
246
- return pipeline, cmp
247
-
248
- def get_cmp_flow(frames, sparse_optical_flow, mask, brush_mask=None):
249
 
250
  '''
251
  frames: [b, 13, 3, 384, 384] (0, 1) tensor
@@ -270,19 +255,19 @@ with gr.Blocks() as demo:
270
  return cmp_flow
271
 
272
 
273
- def get_flow(pixel_values_384, sparse_optical_flow_384, mask_384, motion_brush_mask=None):
274
 
275
  fb, fl, fc, _, _ = pixel_values_384.shape
276
 
277
- controlnet_flow = get_cmp_flow(
278
  pixel_values_384[:, 0:1, :, :, :].repeat(1, fl, 1, 1, 1),
279
  sparse_optical_flow_384,
280
  mask_384, motion_brush_mask
281
  )
282
 
283
- if height != 384 or width != 384:
284
- scales = [height / 384, width / 384]
285
- controlnet_flow = F.interpolate(controlnet_flow.flatten(0, 1), (height, width), mode='nearest').reshape(fb, fl, 2, height, width)
286
  controlnet_flow[:, :, 0] *= scales[1]
287
  controlnet_flow[:, :, 1] *= scales[0]
288
 
@@ -290,7 +275,7 @@ with gr.Blocks() as demo:
290
 
291
 
292
  @torch.no_grad()
293
- def forward_sample(input_drag_384_inmask, input_drag_384_outmask, input_first_frame, input_mask_384_inmask, input_mask_384_outmask, in_mask_flag, out_mask_flag, motion_brush_mask=None, ctrl_scale=1., outputs=dict()):
294
  '''
295
  input_drag: [1, 13, 320, 576, 2]
296
  input_drag_384: [1, 13, 384, 384, 2]
@@ -322,22 +307,22 @@ with gr.Blocks() as demo:
322
  input_first_frame_384 = input_first_frame_384.to('cuda', dtype=torch.float16)
323
 
324
  if in_mask_flag:
325
- flow_inmask = get_flow(
326
  input_first_frame_384,
327
  input_drag_384_inmask, mask_384_inmask, motion_brush_mask
328
  )
329
  else:
330
  fb, fl = mask_384_inmask.shape[:2]
331
- flow_inmask = torch.zeros(fb, fl, 2, height, width).to('cuda', dtype=torch.float16)
332
 
333
  if out_mask_flag:
334
- flow_outmask = get_flow(
335
  input_first_frame_384,
336
  input_drag_384_outmask, mask_384_outmask
337
  )
338
  else:
339
  fb, fl = mask_384_outmask.shape[:2]
340
- flow_outmask = torch.zeros(fb, fl, 2, height, width).to('cuda', dtype=torch.float16)
341
 
342
  inmask_no_zero = (flow_inmask != 0).all(dim=2)
343
  inmask_no_zero = inmask_no_zero.unsqueeze(2).expand_as(flow_inmask)
@@ -383,16 +368,16 @@ with gr.Blocks() as demo:
383
 
384
  @spaces.GPU
385
  @torch.no_grad()
386
- def get_cmp_flow_from_tracking_points(tracking_points, motion_brush_mask, first_frame_path):
387
 
388
- original_width, original_height = width, height
389
 
390
  input_all_points = tracking_points.constructor_args['value']
391
 
392
  if len(input_all_points) == 0 or len(input_all_points[-1]) == 1:
393
  return np.uint8(np.ones((original_width, original_height, 3))*255)
394
 
395
- resized_all_points = [tuple([tuple([int(e1[0]*width/original_width), int(e1[1]*height/original_height)]) for e1 in e]) for e in input_all_points]
396
  resized_all_points_384 = [tuple([tuple([int(e1[0]*384/original_width), int(e1[1]*384/original_height)]) for e1 in e]) for e in input_all_points]
397
 
398
  new_resized_all_points = []
@@ -470,22 +455,22 @@ with gr.Blocks() as demo:
470
  input_first_frame_384 = input_first_frame_384.to('cuda', dtype=torch.float16)
471
 
472
  if in_mask_flag:
473
- flow_inmask = get_flow(
474
  input_first_frame_384,
475
  input_drag_384_inmask, mask_384_inmask, motion_brush_mask_384
476
  )
477
  else:
478
  fb, fl = mask_384_inmask.shape[:2]
479
- flow_inmask = torch.zeros(fb, fl, 2, height, width).to('cuda', dtype=torch.float16)
480
 
481
  if out_mask_flag:
482
- flow_outmask = get_flow(
483
  input_first_frame_384,
484
  input_drag_384_outmask, mask_384_outmask
485
  )
486
  else:
487
  fb, fl = mask_384_outmask.shape[:2]
488
- flow_outmask = torch.zeros(fb, fl, 2, height, width).to('cuda', dtype=torch.float16)
489
 
490
  inmask_no_zero = (flow_inmask != 0).all(dim=2)
491
  inmask_no_zero = inmask_no_zero.unsqueeze(2).expand_as(flow_inmask)
@@ -498,12 +483,12 @@ with gr.Blocks() as demo:
498
  return viz_esti_flows
499
 
500
  @spaces.GPU(duration=200)
501
- def run(first_frame_path, tracking_points, inference_batch_size, motion_brush_mask, motion_brush_viz, ctrl_scale):
502
 
503
- original_width, original_height = width, height
504
 
505
  input_all_points = tracking_points.constructor_args['value']
506
- resized_all_points = [tuple([tuple([int(e1[0]*width/original_width), int(e1[1]*height/original_height)]) for e1 in e]) for e in input_all_points]
507
  resized_all_points_384 = [tuple([tuple([int(e1[0]*384/original_width), int(e1[1]*384/original_height)]) for e1 in e]) for e in input_all_points]
508
 
509
  new_resized_all_points = []
@@ -556,9 +541,9 @@ with gr.Blocks() as demo:
556
  id = base.split('_')[0]
557
 
558
  image_pil = image2pil(first_frame_path)
559
- image_pil = image_pil.resize((width, height), Image.BILINEAR).convert('RGB')
560
 
561
- visualized_drag, _ = visualize_drag_v2(first_frame_path, resized_all_points, width, height)
562
 
563
  motion_brush_viz_pil = Image.fromarray(motion_brush_viz.astype(np.uint8)).convert('RGBA')
564
  visualized_drag = visualized_drag[0].convert('RGBA')
@@ -581,7 +566,7 @@ with gr.Blocks() as demo:
581
  first_frames = outputs['logits_imgs'][:, -1]
582
 
583
 
584
- outputs = forward_sample(
585
  input_drag_384_inmask.to('cuda'),
586
  input_drag_384_outmask.to('cuda'),
587
  first_frames.to('cuda'),
@@ -644,16 +629,43 @@ with gr.Blocks() as demo:
644
 
645
  return hint_path, outputs_path, flows_path, outputs_mp4_path, flows_mp4_path
646
 
647
- @spaces.GPU(duration=100)
648
- def preprocess_image(image):
649
 
650
- pipeline, cmp = init_models()
651
 
652
  image_pil = image2pil(image.name)
653
  raw_w, raw_h = image_pil.size
654
 
655
  max_edge = min(raw_w, raw_h)
656
- resize_ratio = width / max_edge
657
 
658
  image_pil = image_pil.resize((round(raw_w * resize_ratio), round(raw_h * resize_ratio)), Image.BILINEAR)
659
 
@@ -663,8 +675,8 @@ with gr.Blocks() as demo:
663
 
664
  image_pil = transforms.CenterCrop((crop_h, crop_w))(image_pil.convert('RGB'))
665
 
666
- width = crop_w
667
- height = crop_h
668
 
669
  id = str(time.time()).split('.')[0]
670
  os.makedirs(os.path.join(output_dir_video, str(id)), exist_ok=True)
@@ -709,7 +721,7 @@ with gr.Blocks() as demo:
709
  transparent_layer = Image.fromarray(transparent_layer.astype(np.uint8))
710
  trajectory_map = Image.alpha_composite(transparent_background, transparent_layer)
711
 
712
- viz_flow = get_cmp_flow_from_tracking_points(tracking_points, motion_brush_mask, first_frame_path)
713
 
714
  return tracking_points, trajectory_map, viz_flow
715
 
@@ -729,7 +741,7 @@ with gr.Blocks() as demo:
729
  transparent_layer_pil = Image.fromarray(transparent_layer.astype(np.uint8))
730
  motion_map = Image.alpha_composite(transparent_background, transparent_layer_pil)
731
 
732
- viz_flow = get_cmp_flow_from_tracking_points(tracking_points, motion_brush_mask, first_frame_path)
733
 
734
  return motion_brush_mask, transparent_layer, motion_map, viz_flow
735
 
@@ -765,7 +777,7 @@ with gr.Blocks() as demo:
765
  transparent_layer = Image.fromarray(transparent_layer.astype(np.uint8))
766
  trajectory_map = Image.alpha_composite(transparent_background, transparent_layer)
767
 
768
- viz_flow = get_cmp_flow_from_tracking_points(tracking_points, motion_brush_mask, first_frame_path)
769
 
770
  return tracking_points, trajectory_map, viz_flow
771
 
@@ -820,6 +832,6 @@ with gr.Blocks() as demo:
820
 
821
  input_image_mask.select(add_motion_brushes, [motion_brush_points, motion_brush_mask, motion_brush_viz, first_frame_path, brush_radius, tracking_points], [motion_brush_mask, motion_brush_viz, input_image_mask, viz_flow])
822
 
823
- run_button.click(run, [first_frame_path, tracking_points, inference_batch_size, motion_brush_mask, motion_brush_viz, ctrl_scale], [hint_image, output_video, output_flow, output_video_mp4, output_flow_mp4])
824
 
825
  demo.launch()
 
89
  return s_flow, mask
90
 
91
 
92
+ @spaces.GPU(duration=100)
93
+ def init_models(pretrained_model_name_or_path, resume_from_checkpoint, weight_dtype, device='cuda', enable_xformers_memory_efficient_attention=False, allow_tf32=False):
94
+
95
+ from models.unet_spatio_temporal_condition_controlnet import UNetSpatioTemporalConditionControlNetModel
96
+ from pipeline.pipeline import FlowControlNetPipeline
97
+ from models.svdxt_featureflow_forward_controlnet_s2d_fixcmp_norefine import FlowControlNet, CMP_demo
98
+
99
+ print('start loading models...')
100
+ # Load scheduler, tokenizer and models.
101
+ image_encoder = CLIPVisionModelWithProjection.from_pretrained(
102
+ pretrained_model_name_or_path, subfolder="image_encoder", revision=None, variant="fp16"
103
+ )
104
+ vae = AutoencoderKLTemporalDecoder.from_pretrained(
105
+ pretrained_model_name_or_path, subfolder="vae", revision=None, variant="fp16")
106
+ unet = UNetSpatioTemporalConditionControlNetModel.from_pretrained(
107
+ pretrained_model_name_or_path,
108
+ subfolder="unet",
109
+ low_cpu_mem_usage=True,
110
+ variant="fp16",
111
+ )
112
+
113
+ controlnet = FlowControlNet.from_pretrained(resume_from_checkpoint)
114
+
115
+ cmp = CMP_demo(
116
+ './models/cmp/experiments/semiauto_annot/resnet50_vip+mpii_liteflow/config.yaml',
117
+ 42000
118
+ ).to(device)
119
+ cmp.requires_grad_(False)
120
+
121
+ # Freeze vae and image_encoder
122
+ vae.requires_grad_(False)
123
+ image_encoder.requires_grad_(False)
124
+ unet.requires_grad_(False)
125
+ controlnet.requires_grad_(False)
126
+
127
+ # Move image_encoder and vae to gpu and cast to weight_dtype
128
+ image_encoder.to(device, dtype=weight_dtype)
129
+ vae.to(device, dtype=weight_dtype)
130
+ unet.to(device, dtype=weight_dtype)
131
+ controlnet.to(device, dtype=weight_dtype)
132
+
133
+ if enable_xformers_memory_efficient_attention:
134
+ if is_xformers_available():
135
+ import xformers
136
+
137
+ xformers_version = version.parse(xformers.__version__)
138
+ if xformers_version == version.parse("0.0.16"):
139
+ print(
140
+ "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
141
+ )
142
+ unet.enable_xformers_memory_efficient_attention()
143
+ else:
144
+ raise ValueError(
145
+ "xformers is not available. Make sure it is installed correctly")
146
+
147
+ if allow_tf32:
148
+ torch.backends.cuda.matmul.allow_tf32 = True
149
+
150
+ pipeline = FlowControlNetPipeline.from_pretrained(
151
+ pretrained_model_name_or_path,
152
+ unet=unet,
153
+ controlnet=controlnet,
154
+ image_encoder=image_encoder,
155
+ vae=vae,
156
+ torch_dtype=weight_dtype,
157
+ )
158
+ pipeline = pipeline.to(device)
159
+
160
+ print('models loaded.')
161
+
162
+ return pipeline, cmp
163
+
164
+
165
  def interpolate_trajectory(points, n_points):
166
  x = [point[0] for point in points]
167
  y = [point[1] for point in points]
 
215
  return trajectory_maps, transparent_layer
216
 
217
 
218
 
219
+ pipeline, cmp = init_models(
220
+ "ckpts/stable-video-diffusion-img2vid-xt-1-1",
221
+ "ckpts/controlnet",
222
+ weight_dtype=torch.float16,
223
+ device='cuda'
224
+ )
225
 
 
226
 
227
+ class Drag:
228
+ def __init__(self, height, width):
229
 
230
+ self.height = height
231
+ self.width = width
232
 
233
+ def get_cmp_flow(self, frames, sparse_optical_flow, mask, brush_mask=None):
234
 
235
  '''
236
  frames: [b, 13, 3, 384, 384] (0, 1) tensor
 
255
  return cmp_flow
256
 
257
 
258
+ def get_flow(self, pixel_values_384, sparse_optical_flow_384, mask_384, motion_brush_mask=None):
259
 
260
  fb, fl, fc, _, _ = pixel_values_384.shape
261
 
262
+ controlnet_flow = self.get_cmp_flow(
263
  pixel_values_384[:, 0:1, :, :, :].repeat(1, fl, 1, 1, 1),
264
  sparse_optical_flow_384,
265
  mask_384, motion_brush_mask
266
  )
267
 
268
+ if self.height != 384 or self.width != 384:
269
+ scales = [self.height / 384, self.width / 384]
270
+ controlnet_flow = F.interpolate(controlnet_flow.flatten(0, 1), (self.height, self.width), mode='nearest').reshape(fb, fl, 2, self.height, self.width)
271
  controlnet_flow[:, :, 0] *= scales[1]
272
  controlnet_flow[:, :, 1] *= scales[0]
273
 
 
275
 
276
 
277
  @torch.no_grad()
278
+ def forward_sample(self, input_drag_384_inmask, input_drag_384_outmask, input_first_frame, input_mask_384_inmask, input_mask_384_outmask, in_mask_flag, out_mask_flag, motion_brush_mask=None, ctrl_scale=1., outputs=dict()):
279
  '''
280
  input_drag: [1, 13, 320, 576, 2]
281
  input_drag_384: [1, 13, 384, 384, 2]
 
307
  input_first_frame_384 = input_first_frame_384.to('cuda', dtype=torch.float16)
308
 
309
  if in_mask_flag:
310
+ flow_inmask = self.get_flow(
311
  input_first_frame_384,
312
  input_drag_384_inmask, mask_384_inmask, motion_brush_mask
313
  )
314
  else:
315
  fb, fl = mask_384_inmask.shape[:2]
316
+ flow_inmask = torch.zeros(fb, fl, 2, self.height, self.width).to('cuda', dtype=torch.float16)
317
 
318
  if out_mask_flag:
319
+ flow_outmask = self.get_flow(
320
  input_first_frame_384,
321
  input_drag_384_outmask, mask_384_outmask
322
  )
323
  else:
324
  fb, fl = mask_384_outmask.shape[:2]
325
+ flow_outmask = torch.zeros(fb, fl, 2, self.height, self.width).to('cuda', dtype=torch.float16)
326
 
327
  inmask_no_zero = (flow_inmask != 0).all(dim=2)
328
  inmask_no_zero = inmask_no_zero.unsqueeze(2).expand_as(flow_inmask)
 
368
 
369
  @spaces.GPU
370
  @torch.no_grad()
371
+ def get_cmp_flow_from_tracking_points(self, tracking_points, motion_brush_mask, first_frame_path):
372
 
373
+ original_width, original_height = self.width, self.height
374
 
375
  input_all_points = tracking_points.constructor_args['value']
376
 
377
  if len(input_all_points) == 0 or len(input_all_points[-1]) == 1:
378
  return np.uint8(np.ones((original_width, original_height, 3))*255)
379
 
380
+ resized_all_points = [tuple([tuple([int(e1[0]*self.width/original_width), int(e1[1]*self.height/original_height)]) for e1 in e]) for e in input_all_points]
381
  resized_all_points_384 = [tuple([tuple([int(e1[0]*384/original_width), int(e1[1]*384/original_height)]) for e1 in e]) for e in input_all_points]
382
 
383
  new_resized_all_points = []
 
455
  input_first_frame_384 = input_first_frame_384.to('cuda', dtype=torch.float16)
456
 
457
  if in_mask_flag:
458
+ flow_inmask = self.get_flow(
459
  input_first_frame_384,
460
  input_drag_384_inmask, mask_384_inmask, motion_brush_mask_384
461
  )
462
  else:
463
  fb, fl = mask_384_inmask.shape[:2]
464
+ flow_inmask = torch.zeros(fb, fl, 2, self.height, self.width).to('cuda', dtype=torch.float16)
465
 
466
  if out_mask_flag:
467
+ flow_outmask = self.get_flow(
468
  input_first_frame_384,
469
  input_drag_384_outmask, mask_384_outmask
470
  )
471
  else:
472
  fb, fl = mask_384_outmask.shape[:2]
473
+ flow_outmask = torch.zeros(fb, fl, 2, self.height, self.width).to('cuda', dtype=torch.float16)
474
 
475
  inmask_no_zero = (flow_inmask != 0).all(dim=2)
476
  inmask_no_zero = inmask_no_zero.unsqueeze(2).expand_as(flow_inmask)
 
483
  return viz_esti_flows
484
 
485
  @spaces.GPU(duration=200)
486
+ def run(self, first_frame_path, tracking_points, inference_batch_size, motion_brush_mask, motion_brush_viz, ctrl_scale):
487
 
488
+ original_width, original_height = self.width, self.height
489
 
490
  input_all_points = tracking_points.constructor_args['value']
491
+ resized_all_points = [tuple([tuple([int(e1[0]*self.width/original_width), int(e1[1]*self.height/original_height)]) for e1 in e]) for e in input_all_points]
492
  resized_all_points_384 = [tuple([tuple([int(e1[0]*384/original_width), int(e1[1]*384/original_height)]) for e1 in e]) for e in input_all_points]
493
 
494
  new_resized_all_points = []
 
541
  id = base.split('_')[0]
542
 
543
  image_pil = image2pil(first_frame_path)
544
+ image_pil = image_pil.resize((self.width, self.height), Image.BILINEAR).convert('RGB')
545
 
546
+ visualized_drag, _ = visualize_drag_v2(first_frame_path, resized_all_points, self.width, self.height)
547
 
548
  motion_brush_viz_pil = Image.fromarray(motion_brush_viz.astype(np.uint8)).convert('RGBA')
549
  visualized_drag = visualized_drag[0].convert('RGBA')
 
566
  first_frames = outputs['logits_imgs'][:, -1]
567
 
568
 
569
+ outputs = self.forward_sample(
570
  input_drag_384_inmask.to('cuda'),
571
  input_drag_384_outmask.to('cuda'),
572
  first_frames.to('cuda'),
 
629
 
630
  return hint_path, outputs_path, flows_path, outputs_mp4_path, flows_mp4_path
631
 
632
 
633
+ with gr.Blocks() as demo:
634
+ gr.Markdown("""<h1 align="center">MOFA-Video</h1><br>""")
635
+
636
+ gr.Markdown("""Official Gradio Demo for <a href='https://myniuuu.github.io/MOFA_Video'><b>MOFA-Video: Controllable Image Animation via Generative Motion Field Adaptions in Frozen Image-to-Video Diffusion Model</b></a>.<br>""")
637
+
638
+ gr.Markdown(
639
+ """
640
+ During the inference, kindly follow these instructions:
641
+ <br>
642
+ 1. Use the "Upload Image" button to upload an image. Avoid dragging the image directly into the window. <br>
643
+ 2. Proceed to draw trajectories: <br>
644
+ 2.1. Click "Add Trajectory" first, then select points on the "Add Trajectory Here" image. The first click sets the starting point. Click multiple points to create a non-linear trajectory. To add a new trajectory, click "Add Trajectory" again and select points on the image. Avoid clicking the "Add Trajectory" button multiple times without clicking points in the image to add the trajectory, as this can lead to errors. <br>
645
+ 2.2. After adding each trajectory, an optical flow image will be displayed automatically. Use it as a reference to adjust the trajectory for desired effects (e.g., area, intensity). <br>
646
+ 2.3. To delete the latest trajectory, click "Delete Last Trajectory." <br>
647
+ 2.4. Choose the Control Scale in the bar. This determines the control intensity. Setting it to 0 means no control (pure generation result of SVD itself), while setting it to 1 results in the strongest control (which will not lead to good results in most cases because of twisting artifacts). A preset value of 0.6 is recommended for most cases. <br>
648
+ 2.5. To use the motion brush for restraining the control area of the trajectory, click to add masks on the "Add Motion Brush Here" image. The motion brush restricts the optical flow area derived from the trajectory whose starting point is within the motion brush. The displayed optical flow image will change correspondingly. Adjust the motion brush radius using the "Motion Brush Radius" bar. <br>
649
+ 3. Click the "Run" button to animate the image according to the path. <br>
650
+ """
651
+ )
652
+
653
+ target_size = 512
654
+ DragNUWA_net = Drag(target_size, target_size)
655
+ first_frame_path = gr.State()
656
+ tracking_points = gr.State([])
657
+ motion_brush_points = gr.State([])
658
+ motion_brush_mask = gr.State()
659
+ motion_brush_viz = gr.State()
660
+ inference_batch_size = gr.State(1)
661
+
662
+ def preprocess_image(image):
663
 
664
  image_pil = image2pil(image.name)
665
  raw_w, raw_h = image_pil.size
666
 
667
  max_edge = min(raw_w, raw_h)
668
+ resize_ratio = target_size / max_edge
669
 
670
  image_pil = image_pil.resize((round(raw_w * resize_ratio), round(raw_h * resize_ratio)), Image.BILINEAR)
671
 
 
675
 
676
  image_pil = transforms.CenterCrop((crop_h, crop_w))(image_pil.convert('RGB'))
677
 
678
+ DragNUWA_net.width = crop_w
679
+ DragNUWA_net.height = crop_h
680
 
681
  id = str(time.time()).split('.')[0]
682
  os.makedirs(os.path.join(output_dir_video, str(id)), exist_ok=True)
 
721
  transparent_layer = Image.fromarray(transparent_layer.astype(np.uint8))
722
  trajectory_map = Image.alpha_composite(transparent_background, transparent_layer)
723
 
724
+ viz_flow = DragNUWA_net.get_cmp_flow_from_tracking_points(tracking_points, motion_brush_mask, first_frame_path)
725
 
726
  return tracking_points, trajectory_map, viz_flow
727
 
 
741
  transparent_layer_pil = Image.fromarray(transparent_layer.astype(np.uint8))
742
  motion_map = Image.alpha_composite(transparent_background, transparent_layer_pil)
743
 
744
+ viz_flow = DragNUWA_net.get_cmp_flow_from_tracking_points(tracking_points, motion_brush_mask, first_frame_path)
745
 
746
  return motion_brush_mask, transparent_layer, motion_map, viz_flow
747
 
 
777
  transparent_layer = Image.fromarray(transparent_layer.astype(np.uint8))
778
  trajectory_map = Image.alpha_composite(transparent_background, transparent_layer)
779
 
780
+ viz_flow = DragNUWA_net.get_cmp_flow_from_tracking_points(tracking_points, motion_brush_mask, first_frame_path)
781
 
782
  return tracking_points, trajectory_map, viz_flow
783
 
 
832
 
833
  input_image_mask.select(add_motion_brushes, [motion_brush_points, motion_brush_mask, motion_brush_viz, first_frame_path, brush_radius, tracking_points], [motion_brush_mask, motion_brush_viz, input_image_mask, viz_flow])
834
 
835
+ run_button.click(DragNUWA_net.run, [first_frame_path, tracking_points, inference_batch_size, motion_brush_mask, motion_brush_viz, ctrl_scale], [hint_image, output_video, output_flow, output_video_mp4, output_flow_mp4])
836
 
837
  demo.launch()
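One step shared by both versions of get_flow in the diff above deserves a note: the dense flow that CMP predicts at 384x384 is upsampled to the working resolution, and the x/y flow components are then multiplied by the corresponding resize ratios, because flow values are pixel displacements. A self-contained sketch of that rescaling step (plain PyTorch; the function name is illustrative, not from the repo):

# Sketch of the flow-rescaling step in get_flow(): after upsampling a dense flow
# field from 384x384 to (height, width), the flow values themselves must be scaled
# by the same ratios, since they are displacements measured in pixels.
import torch
import torch.nn.functional as F

def rescale_flow(flow_384, height, width):
    # flow_384: [b, l, 2, 384, 384]; channel 0 = x displacement, channel 1 = y
    b, l = flow_384.shape[:2]
    flow = F.interpolate(flow_384.flatten(0, 1), (height, width), mode='nearest')
    flow = flow.reshape(b, l, 2, height, width)
    flow[:, :, 0] *= width / 384   # x displacements grow with the width ratio
    flow[:, :, 1] *= height / 384  # y displacements grow with the height ratio
    return flow

print(rescale_flow(torch.ones(1, 13, 2, 384, 384), 512, 512).shape)  # [1, 13, 2, 512, 512]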
oldapp.py → modifiedapp.py RENAMED
@@ -89,78 +89,6 @@ def get_sparseflow_and_mask_forward(
89
  return s_flow, mask
90
 
91
 
92
- def init_models(pretrained_model_name_or_path, resume_from_checkpoint, weight_dtype, device='cuda', enable_xformers_memory_efficient_attention=False, allow_tf32=False):
93
-
94
- from models.unet_spatio_temporal_condition_controlnet import UNetSpatioTemporalConditionControlNetModel
95
- from pipeline.pipeline import FlowControlNetPipeline
96
- from models.svdxt_featureflow_forward_controlnet_s2d_fixcmp_norefine import FlowControlNet, CMP_demo
97
-
98
- print('start loading models...')
99
- # Load scheduler, tokenizer and models.
100
- image_encoder = CLIPVisionModelWithProjection.from_pretrained(
101
- pretrained_model_name_or_path, subfolder="image_encoder", revision=None, variant="fp16"
102
- )
103
- vae = AutoencoderKLTemporalDecoder.from_pretrained(
104
- pretrained_model_name_or_path, subfolder="vae", revision=None, variant="fp16")
105
- unet = UNetSpatioTemporalConditionControlNetModel.from_pretrained(
106
- pretrained_model_name_or_path,
107
- subfolder="unet",
108
- low_cpu_mem_usage=True,
109
- variant="fp16",
110
- )
111
-
112
- controlnet = FlowControlNet.from_pretrained(resume_from_checkpoint)
113
-
114
- cmp = CMP_demo(
115
- './models/cmp/experiments/semiauto_annot/resnet50_vip+mpii_liteflow/config.yaml',
116
- 42000
117
- ).to(device)
118
- cmp.requires_grad_(False)
119
-
120
- # Freeze vae and image_encoder
121
- vae.requires_grad_(False)
122
- image_encoder.requires_grad_(False)
123
- unet.requires_grad_(False)
124
- controlnet.requires_grad_(False)
125
-
126
- # Move image_encoder and vae to gpu and cast to weight_dtype
127
- image_encoder.to(device, dtype=weight_dtype)
128
- vae.to(device, dtype=weight_dtype)
129
- unet.to(device, dtype=weight_dtype)
130
- controlnet.to(device, dtype=weight_dtype)
131
-
132
- if enable_xformers_memory_efficient_attention:
133
- if is_xformers_available():
134
- import xformers
135
-
136
- xformers_version = version.parse(xformers.__version__)
137
- if xformers_version == version.parse("0.0.16"):
138
- print(
139
- "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
140
- )
141
- unet.enable_xformers_memory_efficient_attention()
142
- else:
143
- raise ValueError(
144
- "xformers is not available. Make sure it is installed correctly")
145
-
146
- if allow_tf32:
147
- torch.backends.cuda.matmul.allow_tf32 = True
148
-
149
- pipeline = FlowControlNetPipeline.from_pretrained(
150
- pretrained_model_name_or_path,
151
- unet=unet,
152
- controlnet=controlnet,
153
- image_encoder=image_encoder,
154
- vae=vae,
155
- torch_dtype=weight_dtype,
156
- )
157
- pipeline = pipeline.to(device)
158
-
159
- print('models loaded.')
160
-
161
- return pipeline, cmp
162
-
163
-
164
  def interpolate_trajectory(points, n_points):
165
  x = [point[0] for point in points]
166
  y = [point[1] for point in points]
@@ -214,24 +142,110 @@ def visualize_drag_v2(background_image_path, splited_tracks, width, height):
214
  return trajectory_maps, transparent_layer
215
 
216
 
217
- class Drag:
218
- @spaces.GPU(duration=200)
219
- def __init__(self, height, width):
220
 
221
- svd_ckpt = "ckpts/stable-video-diffusion-img2vid-xt-1-1"
222
- mofa_ckpt = "ckpts/controlnet"
223
 
224
- self.pipeline, self.cmp = init_models(
225
- svd_ckpt,
226
- mofa_ckpt,
227
- weight_dtype=torch.float16,
228
- device='cuda'
229
  )
230
 
231
- self.height = height
232
- self.width = width
233
 
234
- def get_cmp_flow(self, frames, sparse_optical_flow, mask, brush_mask=None):
235
 
236
  '''
237
  frames: [b, 13, 3, 384, 384] (0, 1) tensor
@@ -244,7 +258,7 @@ class Drag:
244
  frames = frames.flatten(0, 1) # [b*13, 3, 256, 256]
245
  sparse_optical_flow = sparse_optical_flow.flatten(0, 1) # [b*13, 2, 256, 256]
246
  mask = mask.flatten(0, 1) # [b*13, 2, 256, 256]
247
- cmp_flow = self.cmp.run(frames, sparse_optical_flow, mask) # [b*13, 2, 256, 256]
248
 
249
  if brush_mask is not None:
250
  brush_mask = torch.from_numpy(brush_mask) / 255.
@@ -256,19 +270,19 @@ class Drag:
256
  return cmp_flow
257
 
258
 
259
- def get_flow(self, pixel_values_384, sparse_optical_flow_384, mask_384, motion_brush_mask=None):
260
 
261
  fb, fl, fc, _, _ = pixel_values_384.shape
262
 
263
- controlnet_flow = self.get_cmp_flow(
264
  pixel_values_384[:, 0:1, :, :, :].repeat(1, fl, 1, 1, 1),
265
  sparse_optical_flow_384,
266
  mask_384, motion_brush_mask
267
  )
268
 
269
- if self.height != 384 or self.width != 384:
270
- scales = [self.height / 384, self.width / 384]
271
- controlnet_flow = F.interpolate(controlnet_flow.flatten(0, 1), (self.height, self.width), mode='nearest').reshape(fb, fl, 2, self.height, self.width)
272
  controlnet_flow[:, :, 0] *= scales[1]
273
  controlnet_flow[:, :, 1] *= scales[0]
274
 
@@ -276,7 +290,7 @@ class Drag:
276
 
277
 
278
  @torch.no_grad()
279
- def forward_sample(self, input_drag_384_inmask, input_drag_384_outmask, input_first_frame, input_mask_384_inmask, input_mask_384_outmask, in_mask_flag, out_mask_flag, motion_brush_mask=None, ctrl_scale=1., outputs=dict()):
280
  '''
281
  input_drag: [1, 13, 320, 576, 2]
282
  input_drag_384: [1, 13, 384, 384, 2]
@@ -308,29 +322,29 @@ class Drag:
308
  input_first_frame_384 = input_first_frame_384.to('cuda', dtype=torch.float16)
309
 
310
  if in_mask_flag:
311
- flow_inmask = self.get_flow(
312
  input_first_frame_384,
313
  input_drag_384_inmask, mask_384_inmask, motion_brush_mask
314
  )
315
  else:
316
  fb, fl = mask_384_inmask.shape[:2]
317
- flow_inmask = torch.zeros(fb, fl, 2, self.height, self.width).to('cuda', dtype=torch.float16)
318
 
319
  if out_mask_flag:
320
- flow_outmask = self.get_flow(
321
  input_first_frame_384,
322
  input_drag_384_outmask, mask_384_outmask
323
  )
324
  else:
325
  fb, fl = mask_384_outmask.shape[:2]
326
- flow_outmask = torch.zeros(fb, fl, 2, self.height, self.width).to('cuda', dtype=torch.float16)
327
 
328
  inmask_no_zero = (flow_inmask != 0).all(dim=2)
329
  inmask_no_zero = inmask_no_zero.unsqueeze(2).expand_as(flow_inmask)
330
 
331
  controlnet_flow = torch.where(inmask_no_zero, flow_inmask, flow_outmask)
332
 
333
- val_output = self.pipeline(
334
  input_first_frame_pil,
335
  input_first_frame_pil,
336
  controlnet_flow,
@@ -369,16 +383,16 @@ class Drag:
369
 
370
  @spaces.GPU
371
  @torch.no_grad()
372
- def get_cmp_flow_from_tracking_points(self, tracking_points, motion_brush_mask, first_frame_path):
373
 
374
- original_width, original_height = self.width, self.height
375
 
376
  input_all_points = tracking_points.constructor_args['value']
377
 
378
  if len(input_all_points) == 0 or len(input_all_points[-1]) == 1:
379
  return np.uint8(np.ones((original_width, original_height, 3))*255)
380
 
381
- resized_all_points = [tuple([tuple([int(e1[0]*self.width/original_width), int(e1[1]*self.height/original_height)]) for e1 in e]) for e in input_all_points]
382
  resized_all_points_384 = [tuple([tuple([int(e1[0]*384/original_width), int(e1[1]*384/original_height)]) for e1 in e]) for e in input_all_points]
383
 
384
  new_resized_all_points = []
@@ -456,22 +470,22 @@ class Drag:
456
  input_first_frame_384 = input_first_frame_384.to('cuda', dtype=torch.float16)
457
 
458
  if in_mask_flag:
459
- flow_inmask = self.get_flow(
460
  input_first_frame_384,
461
  input_drag_384_inmask, mask_384_inmask, motion_brush_mask_384
462
  )
463
  else:
464
  fb, fl = mask_384_inmask.shape[:2]
465
- flow_inmask = torch.zeros(fb, fl, 2, self.height, self.width).to('cuda', dtype=torch.float16)
466
 
467
  if out_mask_flag:
468
- flow_outmask = self.get_flow(
469
  input_first_frame_384,
470
  input_drag_384_outmask, mask_384_outmask
471
  )
472
  else:
473
  fb, fl = mask_384_outmask.shape[:2]
474
- flow_outmask = torch.zeros(fb, fl, 2, self.height, self.width).to('cuda', dtype=torch.float16)
475
 
476
  inmask_no_zero = (flow_inmask != 0).all(dim=2)
477
  inmask_no_zero = inmask_no_zero.unsqueeze(2).expand_as(flow_inmask)
@@ -484,12 +498,12 @@ class Drag:
484
  return viz_esti_flows
485
 
486
  @spaces.GPU(duration=200)
487
- def run(self, first_frame_path, tracking_points, inference_batch_size, motion_brush_mask, motion_brush_viz, ctrl_scale):
488
 
489
- original_width, original_height = self.width, self.height
490
 
491
  input_all_points = tracking_points.constructor_args['value']
492
- resized_all_points = [tuple([tuple([int(e1[0]*self.width/original_width), int(e1[1]*self.height/original_height)]) for e1 in e]) for e in input_all_points]
493
  resized_all_points_384 = [tuple([tuple([int(e1[0]*384/original_width), int(e1[1]*384/original_height)]) for e1 in e]) for e in input_all_points]
494
 
495
  new_resized_all_points = []
@@ -542,9 +556,9 @@ class Drag:
542
  id = base.split('_')[0]
543
 
544
  image_pil = image2pil(first_frame_path)
545
- image_pil = image_pil.resize((self.width, self.height), Image.BILINEAR).convert('RGB')
546
 
547
- visualized_drag, _ = visualize_drag_v2(first_frame_path, resized_all_points, self.width, self.height)
548
 
549
  motion_brush_viz_pil = Image.fromarray(motion_brush_viz.astype(np.uint8)).convert('RGBA')
550
  visualized_drag = visualized_drag[0].convert('RGBA')
@@ -567,7 +581,7 @@ class Drag:
567
  first_frames = outputs['logits_imgs'][:, -1]
568
 
569
 
570
- outputs = self.forward_sample(
571
  input_drag_384_inmask.to('cuda'),
572
  input_drag_384_outmask.to('cuda'),
573
  first_frames.to('cuda'),
@@ -630,43 +644,17 @@ class Drag:
630
 
631
  return hint_path, outputs_path, flows_path, outputs_mp4_path, flows_mp4_path
632
 
633
-
634
- with gr.Blocks() as demo:
635
- gr.Markdown("""<h1 align="center">MOFA-Video</h1><br>""")
636
-
637
- gr.Markdown("""Official Gradio Demo for <a href='https://myniuuu.github.io/MOFA_Video'><b>MOFA-Video: Controllable Image Animation via Generative Motion Field Adaptions in Frozen Image-to-Video Diffusion Model</b></a>.<br>""")
638
-
639
- gr.Markdown(
640
- """
641
- During the inference, kindly follow these instructions:
642
- <br>
643
- 1. Use the "Upload Image" button to upload an image. Avoid dragging the image directly into the window. <br>
644
- 2. Proceed to draw trajectories: <br>
645
- 2.1. Click "Add Trajectory" first, then select points on the "Add Trajectory Here" image. The first click sets the starting point. Click multiple points to create a non-linear trajectory. To add a new trajectory, click "Add Trajectory" again and select points on the image. Avoid clicking the "Add Trajectory" button multiple times without clicking points in the image to add the trajectory, as this can lead to errors. <br>
646
- 2.2. After adding each trajectory, an optical flow image will be displayed automatically. Use it as a reference to adjust the trajectory for desired effects (e.g., area, intensity). <br>
647
- 2.3. To delete the latest trajectory, click "Delete Last Trajectory." <br>
648
- 2.4. Choose the Control Scale in the bar. This determines the control intensity. Setting it to 0 means no control (pure generation result of SVD itself), while setting it to 1 results in the strongest control (which will not lead to good results in most cases because of twisting artifacts). A preset value of 0.6 is recommended for most cases. <br>
649
- 2.5. To use the motion brush for restraining the control area of the trajectory, click to add masks on the "Add Motion Brush Here" image. The motion brush restricts the optical flow area derived from the trajectory whose starting point is within the motion brush. The displayed optical flow image will change correspondingly. Adjust the motion brush radius using the "Motion Brush Radius" bar. <br>
650
- 3. Click the "Run" button to animate the image according to the path. <br>
651
- """
652
- )
653
-
654
- target_size = 512
655
- DragNUWA_net = Drag(target_size, target_size)
656
- first_frame_path = gr.State()
657
- tracking_points = gr.State([])
658
- motion_brush_points = gr.State([])
659
- motion_brush_mask = gr.State()
660
- motion_brush_viz = gr.State()
661
- inference_batch_size = gr.State(1)
662
-
663
  def preprocess_image(image):
664
 
665
  image_pil = image2pil(image.name)
666
  raw_w, raw_h = image_pil.size
667
 
668
  max_edge = min(raw_w, raw_h)
669
- resize_ratio = target_size / max_edge
670
 
671
  image_pil = image_pil.resize((round(raw_w * resize_ratio), round(raw_h * resize_ratio)), Image.BILINEAR)
672
 
@@ -676,8 +664,8 @@ with gr.Blocks() as demo:
676
 
677
  image_pil = transforms.CenterCrop((crop_h, crop_w))(image_pil.convert('RGB'))
678
 
679
- DragNUWA_net.width = crop_w
680
- DragNUWA_net.height = crop_h
681
 
682
  id = str(time.time()).split('.')[0]
683
  os.makedirs(os.path.join(output_dir_video, str(id)), exist_ok=True)
@@ -722,7 +710,7 @@ with gr.Blocks() as demo:
722
  transparent_layer = Image.fromarray(transparent_layer.astype(np.uint8))
723
  trajectory_map = Image.alpha_composite(transparent_background, transparent_layer)
724
 
725
- viz_flow = DragNUWA_net.get_cmp_flow_from_tracking_points(tracking_points, motion_brush_mask, first_frame_path)
726
 
727
  return tracking_points, trajectory_map, viz_flow
728
 
@@ -742,7 +730,7 @@ with gr.Blocks() as demo:
742
  transparent_layer_pil = Image.fromarray(transparent_layer.astype(np.uint8))
743
  motion_map = Image.alpha_composite(transparent_background, transparent_layer_pil)
744
 
745
- viz_flow = DragNUWA_net.get_cmp_flow_from_tracking_points(tracking_points, motion_brush_mask, first_frame_path)
746
 
747
  return motion_brush_mask, transparent_layer, motion_map, viz_flow
748
 
@@ -778,7 +766,7 @@ with gr.Blocks() as demo:
778
  transparent_layer = Image.fromarray(transparent_layer.astype(np.uint8))
779
  trajectory_map = Image.alpha_composite(transparent_background, transparent_layer)
780
 
781
- viz_flow = DragNUWA_net.get_cmp_flow_from_tracking_points(tracking_points, motion_brush_mask, first_frame_path)
782
 
783
  return tracking_points, trajectory_map, viz_flow
784
 
@@ -833,6 +821,6 @@ with gr.Blocks() as demo:
833
 
834
  input_image_mask.select(add_motion_brushes, [motion_brush_points, motion_brush_mask, motion_brush_viz, first_frame_path, brush_radius, tracking_points], [motion_brush_mask, motion_brush_viz, input_image_mask, viz_flow])
835
 
836
- run_button.click(DragNUWA_net.run, [first_frame_path, tracking_points, inference_batch_size, motion_brush_mask, motion_brush_viz, ctrl_scale], [hint_image, output_video, output_flow, output_video_mp4, output_flow_mp4])
837
 
838
  demo.launch()
 
89
  return s_flow, mask
90
 
91
 
92
  def interpolate_trajectory(points, n_points):
93
  x = [point[0] for point in points]
94
  y = [point[1] for point in points]
 
142
  return trajectory_maps, transparent_layer
143
 
144
 
145
+ with gr.Blocks() as demo:
146
+ gr.Markdown("""<h1 align="center">MOFA-Video</h1><br>""")
147
+
148
+ gr.Markdown("""Official Gradio Demo for <a href='https://myniuuu.github.io/MOFA_Video'><b>MOFA-Video: Controllable Image Animation via Generative Motion Field Adaptions in Frozen Image-to-Video Diffusion Model</b></a>.<br>""")
149
+
150
+ gr.Markdown(
151
+ """
152
+ During the inference, kindly follow these instructions:
153
+ <br>
154
+ 1. Use the "Upload Image" button to upload an image. Avoid dragging the image directly into the window. <br>
155
+ 2. Proceed to draw trajectories: <br>
156
+ 2.1. Click "Add Trajectory" first, then select points on the "Add Trajectory Here" image. The first click sets the starting point. Click multiple points to create a non-linear trajectory. To add a new trajectory, click "Add Trajectory" again and select points on the image. Avoid clicking the "Add Trajectory" button multiple times without clicking points in the image to add the trajectory, as this can lead to errors. <br>
157
+ 2.2. After adding each trajectory, an optical flow image will be displayed automatically. Use it as a reference to adjust the trajectory for desired effects (e.g., area, intensity). <br>
158
+ 2.3. To delete the latest trajectory, click "Delete Last Trajectory." <br>
159
+ 2.4. Choose the Control Scale in the bar. This determines the control intensity. Setting it to 0 means no control (pure generation result of SVD itself), while setting it to 1 results in the strongest control (which will not lead to good results in most cases because of twisting artifacts). A preset value of 0.6 is recommended for most cases. <br>
160
+ 2.5. To use the motion brush for restraining the control area of the trajectory, click to add masks on the "Add Motion Brush Here" image. The motion brush restricts the optical flow area derived from the trajectory whose starting point is within the motion brush. The displayed optical flow image will change correspondingly. Adjust the motion brush radius using the "Motion Brush Radius" bar. <br>
161
+ 3. Click the "Run" button to animate the image according to the path. <br>
162
+ """
163
+ )
164
+
165
+ height, width = 512, 512
166
+
167
+ pipeline, cmp = None, None
168
+
169
+ first_frame_path = gr.State()
170
+ tracking_points = gr.State([])
171
+ motion_brush_points = gr.State([])
172
+ motion_brush_mask = gr.State()
173
+ motion_brush_viz = gr.State()
174
+ inference_batch_size = gr.State(1)
175
 
176
+ @spaces.GPU(duration=100)
177
+ def init_models(pretrained_model_name_or_path="ckpts/stable-video-diffusion-img2vid-xt-1-1", resume_from_checkpoint="ckpts/controlnet", weight_dtype=torch.float16, device='cuda', enable_xformers_memory_efficient_attention=False, allow_tf32=False):
178
 
179
+ from models.unet_spatio_temporal_condition_controlnet import UNetSpatioTemporalConditionControlNetModel
180
+ from pipeline.pipeline import FlowControlNetPipeline
181
+ from models.svdxt_featureflow_forward_controlnet_s2d_fixcmp_norefine import FlowControlNet, CMP_demo
182
+
183
+ print('start loading models...')
184
+ # Load scheduler, tokenizer and models.
185
+ image_encoder = CLIPVisionModelWithProjection.from_pretrained(
186
+ pretrained_model_name_or_path, subfolder="image_encoder", revision=None, variant="fp16"
187
+ )
188
+ vae = AutoencoderKLTemporalDecoder.from_pretrained(
189
+ pretrained_model_name_or_path, subfolder="vae", revision=None, variant="fp16")
190
+ unet = UNetSpatioTemporalConditionControlNetModel.from_pretrained(
191
+ pretrained_model_name_or_path,
192
+ subfolder="unet",
193
+ low_cpu_mem_usage=True,
194
+ variant="fp16",
195
  )
196
 
197
+ controlnet = FlowControlNet.from_pretrained(resume_from_checkpoint)
198
+
199
+ cmp = CMP_demo(
200
+ './models/cmp/experiments/semiauto_annot/resnet50_vip+mpii_liteflow/config.yaml',
201
+ 42000
202
+ ).to(device)
203
+ cmp.requires_grad_(False)
204
+
205
+ # Freeze vae and image_encoder
206
+ vae.requires_grad_(False)
207
+ image_encoder.requires_grad_(False)
208
+ unet.requires_grad_(False)
209
+ controlnet.requires_grad_(False)
210
+
211
+ # Move image_encoder and vae to gpu and cast to weight_dtype
212
+ image_encoder.to(device, dtype=weight_dtype)
213
+ vae.to(device, dtype=weight_dtype)
214
+ unet.to(device, dtype=weight_dtype)
215
+ controlnet.to(device, dtype=weight_dtype)
216
+
217
+ if enable_xformers_memory_efficient_attention:
218
+ if is_xformers_available():
219
+ import xformers
220
+
221
+ xformers_version = version.parse(xformers.__version__)
222
+ if xformers_version == version.parse("0.0.16"):
223
+ print(
224
+ "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
225
+ )
226
+ unet.enable_xformers_memory_efficient_attention()
227
+ else:
228
+ raise ValueError(
229
+ "xformers is not available. Make sure it is installed correctly")
230
+
231
+ if allow_tf32:
232
+ torch.backends.cuda.matmul.allow_tf32 = True
233
+
234
+ pipeline = FlowControlNetPipeline.from_pretrained(
235
+ pretrained_model_name_or_path,
236
+ unet=unet,
237
+ controlnet=controlnet,
238
+ image_encoder=image_encoder,
239
+ vae=vae,
240
+ torch_dtype=weight_dtype,
241
+ )
242
+ pipeline = pipeline.to(device)
243
 
244
+ print('models loaded.')
245
+
246
+ return pipeline, cmp
247
+
248
+ def get_cmp_flow(frames, sparse_optical_flow, mask, brush_mask=None):
249
 
250
  '''
251
  frames: [b, 13, 3, 384, 384] (0, 1) tensor
 
258
  frames = frames.flatten(0, 1) # [b*13, 3, 256, 256]
259
  sparse_optical_flow = sparse_optical_flow.flatten(0, 1) # [b*13, 2, 256, 256]
260
  mask = mask.flatten(0, 1) # [b*13, 2, 256, 256]
261
+ cmp_flow = cmp.run(frames, sparse_optical_flow, mask) # [b*13, 2, 256, 256]
262
 
263
  if brush_mask is not None:
264
  brush_mask = torch.from_numpy(brush_mask) / 255.
 
270
  return cmp_flow
271
 
272
 
273
+ def get_flow(pixel_values_384, sparse_optical_flow_384, mask_384, motion_brush_mask=None):
274
 
275
  fb, fl, fc, _, _ = pixel_values_384.shape
276
 
277
+ controlnet_flow = get_cmp_flow(
278
  pixel_values_384[:, 0:1, :, :, :].repeat(1, fl, 1, 1, 1),
279
  sparse_optical_flow_384,
280
  mask_384, motion_brush_mask
281
  )
282
 
283
+ if height != 384 or width != 384:
284
+ scales = [height / 384, width / 384]
285
+ controlnet_flow = F.interpolate(controlnet_flow.flatten(0, 1), (height, width), mode='nearest').reshape(fb, fl, 2, height, width)
286
  controlnet_flow[:, :, 0] *= scales[1]
287
  controlnet_flow[:, :, 1] *= scales[0]
288
 
 
290
 
291
 
292
  @torch.no_grad()
293
+ def forward_sample(input_drag_384_inmask, input_drag_384_outmask, input_first_frame, input_mask_384_inmask, input_mask_384_outmask, in_mask_flag, out_mask_flag, motion_brush_mask=None, ctrl_scale=1., outputs=dict()):
294
  '''
295
  input_drag: [1, 13, 320, 576, 2]
296
  input_drag_384: [1, 13, 384, 384, 2]
 
322
  input_first_frame_384 = input_first_frame_384.to('cuda', dtype=torch.float16)
323
 
324
  if in_mask_flag:
325
+ flow_inmask = get_flow(
326
  input_first_frame_384,
327
  input_drag_384_inmask, mask_384_inmask, motion_brush_mask
328
  )
329
  else:
330
  fb, fl = mask_384_inmask.shape[:2]
331
+ flow_inmask = torch.zeros(fb, fl, 2, height, width).to('cuda', dtype=torch.float16)
332
 
333
  if out_mask_flag:
334
+ flow_outmask = get_flow(
335
  input_first_frame_384,
336
  input_drag_384_outmask, mask_384_outmask
337
  )
338
  else:
339
  fb, fl = mask_384_outmask.shape[:2]
340
+ flow_outmask = torch.zeros(fb, fl, 2, height, width).to('cuda', dtype=torch.float16)
341
 
342
  inmask_no_zero = (flow_inmask != 0).all(dim=2)
343
  inmask_no_zero = inmask_no_zero.unsqueeze(2).expand_as(flow_inmask)
344
 
345
  controlnet_flow = torch.where(inmask_no_zero, flow_inmask, flow_outmask)
346
 
347
+ val_output = pipeline(
348
  input_first_frame_pil,
349
  input_first_frame_pil,
350
  controlnet_flow,
 
383
 
384
  @spaces.GPU
385
  @torch.no_grad()
386
+ def get_cmp_flow_from_tracking_points(tracking_points, motion_brush_mask, first_frame_path):
387
 
388
+ original_width, original_height = width, height
389
 
390
  input_all_points = tracking_points.constructor_args['value']
391
 
392
  if len(input_all_points) == 0 or len(input_all_points[-1]) == 1:
393
  return np.uint8(np.ones((original_width, original_height, 3))*255)
394
 
395
+ resized_all_points = [tuple([tuple([int(e1[0]*width/original_width), int(e1[1]*height/original_height)]) for e1 in e]) for e in input_all_points]
396
  resized_all_points_384 = [tuple([tuple([int(e1[0]*384/original_width), int(e1[1]*384/original_height)]) for e1 in e]) for e in input_all_points]
397
 
398
  new_resized_all_points = []
 
470
  input_first_frame_384 = input_first_frame_384.to('cuda', dtype=torch.float16)
471
 
472
  if in_mask_flag:
473
+ flow_inmask = get_flow(
474
  input_first_frame_384,
475
  input_drag_384_inmask, mask_384_inmask, motion_brush_mask_384
476
  )
477
  else:
478
  fb, fl = mask_384_inmask.shape[:2]
479
+ flow_inmask = torch.zeros(fb, fl, 2, height, width).to('cuda', dtype=torch.float16)
480
 
481
  if out_mask_flag:
482
+ flow_outmask = get_flow(
483
  input_first_frame_384,
484
  input_drag_384_outmask, mask_384_outmask
485
  )
486
  else:
487
  fb, fl = mask_384_outmask.shape[:2]
488
+ flow_outmask = torch.zeros(fb, fl, 2, height, width).to('cuda', dtype=torch.float16)
489
 
490
  inmask_no_zero = (flow_inmask != 0).all(dim=2)
491
  inmask_no_zero = inmask_no_zero.unsqueeze(2).expand_as(flow_inmask)
 
498
  return viz_esti_flows
499
 
500
  @spaces.GPU(duration=200)
501
+ def run(first_frame_path, tracking_points, inference_batch_size, motion_brush_mask, motion_brush_viz, ctrl_scale):
502
 
503
+ original_width, original_height = width, height
504
 
505
  input_all_points = tracking_points.constructor_args['value']
506
+ resized_all_points = [tuple([tuple([int(e1[0]*width/original_width), int(e1[1]*height/original_height)]) for e1 in e]) for e in input_all_points]
507
  resized_all_points_384 = [tuple([tuple([int(e1[0]*384/original_width), int(e1[1]*384/original_height)]) for e1 in e]) for e in input_all_points]
508
 
509
  new_resized_all_points = []
 
556
  id = base.split('_')[0]
557
 
558
  image_pil = image2pil(first_frame_path)
559
+ image_pil = image_pil.resize((width, height), Image.BILINEAR).convert('RGB')
560
 
561
+ visualized_drag, _ = visualize_drag_v2(first_frame_path, resized_all_points, width, height)
562
 
563
  motion_brush_viz_pil = Image.fromarray(motion_brush_viz.astype(np.uint8)).convert('RGBA')
564
  visualized_drag = visualized_drag[0].convert('RGBA')
 
581
  first_frames = outputs['logits_imgs'][:, -1]
582
 
583
 
584
+ outputs = forward_sample(
585
  input_drag_384_inmask.to('cuda'),
586
  input_drag_384_outmask.to('cuda'),
587
  first_frames.to('cuda'),
 
644
 
645
  return hint_path, outputs_path, flows_path, outputs_mp4_path, flows_mp4_path
646
 
647
+ @spaces.GPU(duration=100)
648
  def preprocess_image(image):
649
+
650
+ if pipeline is None or cmp is None:
651
+ pipeline, cmp = init_models()
652
 
653
  image_pil = image2pil(image.name)
654
  raw_w, raw_h = image_pil.size
655
 
656
  max_edge = min(raw_w, raw_h)
657
+ resize_ratio = width / max_edge
658
 
659
  image_pil = image_pil.resize((round(raw_w * resize_ratio), round(raw_h * resize_ratio)), Image.BILINEAR)
660
 
 
664
 
665
  image_pil = transforms.CenterCrop((crop_h, crop_w))(image_pil.convert('RGB'))
666
 
667
+ width = crop_w
668
+ height = crop_h
669
 
670
  id = str(time.time()).split('.')[0]
671
  os.makedirs(os.path.join(output_dir_video, str(id)), exist_ok=True)
 
710
  transparent_layer = Image.fromarray(transparent_layer.astype(np.uint8))
711
  trajectory_map = Image.alpha_composite(transparent_background, transparent_layer)
712
 
713
+ viz_flow = get_cmp_flow_from_tracking_points(tracking_points, motion_brush_mask, first_frame_path)
714
 
715
  return tracking_points, trajectory_map, viz_flow
716
 
 
730
  transparent_layer_pil = Image.fromarray(transparent_layer.astype(np.uint8))
731
  motion_map = Image.alpha_composite(transparent_background, transparent_layer_pil)
732
 
733
+ viz_flow = get_cmp_flow_from_tracking_points(tracking_points, motion_brush_mask, first_frame_path)
734
 
735
  return motion_brush_mask, transparent_layer, motion_map, viz_flow
736
 
 
766
  transparent_layer = Image.fromarray(transparent_layer.astype(np.uint8))
767
  trajectory_map = Image.alpha_composite(transparent_background, transparent_layer)
768
 
769
+ viz_flow = get_cmp_flow_from_tracking_points(tracking_points, motion_brush_mask, first_frame_path)
770
 
771
  return tracking_points, trajectory_map, viz_flow
772
 
 
821
 
822
  input_image_mask.select(add_motion_brushes, [motion_brush_points, motion_brush_mask, motion_brush_viz, first_frame_path, brush_radius, tracking_points], [motion_brush_mask, motion_brush_viz, input_image_mask, viz_flow])
823
 
824
+ run_button.click(run, [first_frame_path, tracking_points, inference_batch_size, motion_brush_mask, motion_brush_viz, ctrl_scale], [hint_image, output_video, output_flow, output_video_mp4, output_flow_mp4])
825
 
826
  demo.launch()
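A scoping note on the lazy-loading branch added to preprocess_image in modifiedapp.py: the function both reads and assigns the module-level pipeline / cmp names, so Python treats them as locals, and the "if pipeline is None or cmp is None" check raises UnboundLocalError before init_models ever runs; a "global pipeline, cmp" declaration (or an explicit cache object) is needed, and the same applies to the width / height reassignment in that function. A minimal sketch of the intended pattern, with the loader stubbed out rather than performing the real FlowControlNetPipeline / CMP_demo loading:

# Minimal sketch of lazy one-time model initialisation; init_models is a stub.
pipeline, cmp = None, None

def init_models():
    return "pipeline", "cmp"   # placeholders for the real pipeline and CMP model

def preprocess_image(image_path):
    global pipeline, cmp       # required: this function assigns these names
    if pipeline is None or cmp is None:
        pipeline, cmp = init_models()
    # ... resizing / cropping of the uploaded image would continue here ...
    return image_path

preprocess_image("example.png")  # first call loads the models; later calls reuse them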