myniu committed on
Commit
826d651
1 Parent(s): 1c2b635
This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50)
  1. README.md +51 -9
  2. app.py +838 -0
  3. ckpts/controlnet/config.json +45 -0
  4. ckpts/controlnet/diffusion_pytorch_model.safetensors +3 -0
  5. models/cmp/experiments/rep_learning/alexnet_yfcc+youtube_voc_16gpu_140k/config.yaml +59 -0
  6. models/cmp/experiments/rep_learning/alexnet_yfcc+youtube_voc_16gpu_140k/resume.sh +8 -0
  7. models/cmp/experiments/rep_learning/alexnet_yfcc+youtube_voc_16gpu_140k/resume_slurm.sh +9 -0
  8. models/cmp/experiments/rep_learning/alexnet_yfcc+youtube_voc_16gpu_140k/train.sh +6 -0
  9. models/cmp/experiments/rep_learning/alexnet_yfcc+youtube_voc_16gpu_140k/train_slurm.sh +7 -0
  10. models/cmp/experiments/rep_learning/alexnet_yfcc+youtube_voc_16gpu_140k/validate.sh +6 -0
  11. models/cmp/experiments/rep_learning/alexnet_yfcc+youtube_voc_16gpu_140k/validate_slurm.sh +8 -0
  12. models/cmp/experiments/rep_learning/alexnet_yfcc_voc_16gpu_70k/config.yaml +58 -0
  13. models/cmp/experiments/rep_learning/alexnet_yfcc_voc_16gpu_70k/resume.sh +8 -0
  14. models/cmp/experiments/rep_learning/alexnet_yfcc_voc_16gpu_70k/resume_slurm.sh +9 -0
  15. models/cmp/experiments/rep_learning/alexnet_yfcc_voc_16gpu_70k/train.sh +6 -0
  16. models/cmp/experiments/rep_learning/alexnet_yfcc_voc_16gpu_70k/train_slurm.sh +7 -0
  17. models/cmp/experiments/rep_learning/alexnet_yfcc_voc_16gpu_70k/validate.sh +6 -0
  18. models/cmp/experiments/rep_learning/alexnet_yfcc_voc_16gpu_70k/validate_slurm.sh +8 -0
  19. models/cmp/experiments/rep_learning/alexnet_yfcc_voc_8gpu_140k/config.yaml +58 -0
  20. models/cmp/experiments/rep_learning/alexnet_yfcc_voc_8gpu_140k/resume.sh +6 -0
  21. models/cmp/experiments/rep_learning/alexnet_yfcc_voc_8gpu_140k/resume_slurm.sh +9 -0
  22. models/cmp/experiments/rep_learning/alexnet_yfcc_voc_8gpu_140k/train.sh +4 -0
  23. models/cmp/experiments/rep_learning/alexnet_yfcc_voc_8gpu_140k/train_slurm.sh +7 -0
  24. models/cmp/experiments/rep_learning/alexnet_yfcc_voc_8gpu_140k/validate.sh +6 -0
  25. models/cmp/experiments/rep_learning/alexnet_yfcc_voc_8gpu_140k/validate_slurm.sh +8 -0
  26. models/cmp/experiments/rep_learning/resnet50_yfcc+youtube+vip+mpii_lip_16gpu_70k/config.yaml +61 -0
  27. models/cmp/experiments/rep_learning/resnet50_yfcc+youtube+vip+mpii_lip_16gpu_70k/resume.sh +8 -0
  28. models/cmp/experiments/rep_learning/resnet50_yfcc+youtube+vip+mpii_lip_16gpu_70k/resume_slurm.sh +9 -0
  29. models/cmp/experiments/rep_learning/resnet50_yfcc+youtube+vip+mpii_lip_16gpu_70k/train.sh +6 -0
  30. models/cmp/experiments/rep_learning/resnet50_yfcc+youtube+vip+mpii_lip_16gpu_70k/train_slurm.sh +7 -0
  31. models/cmp/experiments/rep_learning/resnet50_yfcc+youtube+vip+mpii_lip_16gpu_70k/validate.sh +6 -0
  32. models/cmp/experiments/rep_learning/resnet50_yfcc+youtube+vip+mpii_lip_16gpu_70k/validate_slurm.sh +8 -0
  33. models/cmp/experiments/rep_learning/resnet50_yfcc_coco_16gpu_42k/config.yaml +58 -0
  34. models/cmp/experiments/rep_learning/resnet50_yfcc_coco_16gpu_42k/resume.sh +8 -0
  35. models/cmp/experiments/rep_learning/resnet50_yfcc_coco_16gpu_42k/resume_slurm.sh +9 -0
  36. models/cmp/experiments/rep_learning/resnet50_yfcc_coco_16gpu_42k/train.sh +6 -0
  37. models/cmp/experiments/rep_learning/resnet50_yfcc_coco_16gpu_42k/train_slurm.sh +7 -0
  38. models/cmp/experiments/rep_learning/resnet50_yfcc_coco_16gpu_42k/validate.sh +6 -0
  39. models/cmp/experiments/rep_learning/resnet50_yfcc_coco_16gpu_42k/validate_slurm.sh +8 -0
  40. models/cmp/experiments/rep_learning/resnet50_yfcc_voc_16gpu_42k/config.yaml +58 -0
  41. models/cmp/experiments/rep_learning/resnet50_yfcc_voc_16gpu_42k/resume.sh +8 -0
  42. models/cmp/experiments/rep_learning/resnet50_yfcc_voc_16gpu_42k/resume_slurm.sh +9 -0
  43. models/cmp/experiments/rep_learning/resnet50_yfcc_voc_16gpu_42k/train.sh +6 -0
  44. models/cmp/experiments/rep_learning/resnet50_yfcc_voc_16gpu_42k/train_slurm.sh +7 -0
  45. models/cmp/experiments/rep_learning/resnet50_yfcc_voc_16gpu_42k/validate.sh +6 -0
  46. models/cmp/experiments/rep_learning/resnet50_yfcc_voc_16gpu_42k/validate_slurm.sh +8 -0
  47. models/cmp/experiments/semiauto_annot/resnet50_vip+mpii_liteflow/checkpoints/ckpt_iter_42000.pth.tar +3 -0
  48. models/cmp/experiments/semiauto_annot/resnet50_vip+mpii_liteflow/config.yaml +59 -0
  49. models/cmp/experiments/semiauto_annot/resnet50_vip+mpii_liteflow/resume.sh +6 -0
  50. models/cmp/experiments/semiauto_annot/resnet50_vip+mpii_liteflow/resume_slurm.sh +9 -0
README.md CHANGED
@@ -1,13 +1,55 @@
  ---
- title: MOFA-Video Traj
- emoji: 📚
- colorFrom: red
- colorTo: gray
- sdk: gradio
- sdk_version: 4.36.1
- app_file: app.py
- pinned: false
  license: apache-2.0
+ sdk_version: 4.5.0
  ---
+ ## Updates 🔥🔥🔥

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ We have released the Gradio demo for **Hybrid (Trajectory + Landmark)** Controls [HERE](https://huggingface.co/MyNiuuu/MOFA-Video-Hybrid)!
+
+
+ ## Introduction
+
+ This repo provides the inference Gradio demo for Trajectory Control of MOFA-Video.
+
+ ## Environment Setup
+
+ `pip install -r requirements.txt`
+
+ ## Download checkpoints
+
+ 1. Download the pretrained checkpoints of [SVD_xt](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid-xt-1-1) from huggingface to `./ckpts`.
+
+ 2. Download the checkpoint of [MOFA-Adapter](https://huggingface.co/MyNiuuu/MOFA-Video-Traj) from huggingface to `./ckpts`.
+
+ The final structure of checkpoints should be:
+
+
+ ```text
+ ./ckpts/
+ |-- controlnet
+ |   |-- config.json
+ |   `-- diffusion_pytorch_model.safetensors
+ |-- stable-video-diffusion-img2vid-xt-1-1
+ |   |-- feature_extractor
+ |   |   |-- ...
+ |   |-- image_encoder
+ |   |   |-- ...
+ |   |-- scheduler
+ |   |   |-- ...
+ |   |-- unet
+ |   |   |-- ...
+ |   |-- vae
+ |   |   |-- ...
+ |   |-- svd_xt_1_1.safetensors
+ |   `-- model_index.json
+ ```
+
+ ## Run Gradio Demo
+
+ `python run_gradio.py`
+
+ Please refer to the instructions on the gradio interface during the inference process.
+
+ ## Paper
+
+ arxiv.org/abs/2405.20222
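
The checkpoint layout described above can also be fetched programmatically. Below is a minimal sketch using `huggingface_hub` (an assumption, not part of this commit; it also presumes you have access to the gated SVD_xt 1.1 weights on the Hub):

```python
# Sketch: populate ./ckpts as described in the README using huggingface_hub.
# Assumes `pip install huggingface_hub` and access to the gated SVD_xt 1.1 repo.
from huggingface_hub import snapshot_download

# SVD_xt 1.1 base model -> ./ckpts/stable-video-diffusion-img2vid-xt-1-1
snapshot_download(
    repo_id="stabilityai/stable-video-diffusion-img2vid-xt-1-1",
    local_dir="./ckpts/stable-video-diffusion-img2vid-xt-1-1",
)

# MOFA-Adapter ControlNet -> ./ckpts/controlnet (files live under ckpts/ in the repo)
snapshot_download(
    repo_id="MyNiuuu/MOFA-Video-Traj",
    allow_patterns=["ckpts/controlnet/*"],
    local_dir=".",
)
```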
app.py ADDED
@@ -0,0 +1,838 @@
1
+ import gradio as gr
2
+ import numpy as np
3
+ import cv2
4
+ import os
5
+ from PIL import Image, ImageFilter
6
+ import uuid
7
+ from scipy.interpolate import interp1d, PchipInterpolator
8
+ import torchvision
9
+ # from utils import *
10
+ import time
11
+ from tqdm import tqdm
12
+ import imageio
13
+
14
+ import torch
15
+ import torch.nn.functional as F
16
+ import torchvision
17
+ import torchvision.transforms as transforms
18
+ from einops import rearrange, repeat
19
+
20
+ from packaging import version
21
+
22
+ from accelerate.utils import set_seed
23
+ from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection
24
+
25
+ from diffusers import AutoencoderKLTemporalDecoder, EulerDiscreteScheduler
26
+ from diffusers.utils import check_min_version
27
+ from diffusers.utils.import_utils import is_xformers_available
28
+
29
+ from utils.flow_viz import flow_to_image
30
+ from utils.utils import split_filename, image2arr, image2pil, ensure_dirname
31
+
32
+
33
+ output_dir_video = "./outputs/videos"
34
+ output_dir_frame = "./outputs/frames"
35
+
36
+
37
+ ensure_dirname(output_dir_video)
38
+ ensure_dirname(output_dir_frame)
39
+
40
+
41
+ def divide_points_afterinterpolate(resized_all_points, motion_brush_mask):
42
+ k = resized_all_points.shape[0]
43
+ starts = resized_all_points[:, 0] # [K, 2]
44
+
45
+ in_masks = []
46
+ out_masks = []
47
+
48
+ for i in range(k):
49
+ x, y = int(starts[i][1]), int(starts[i][0])
50
+ if motion_brush_mask[x][y] == 255:
51
+ in_masks.append(resized_all_points[i])
52
+ else:
53
+ out_masks.append(resized_all_points[i])
54
+
55
+ in_masks = np.array(in_masks)
56
+ out_masks = np.array(out_masks)
57
+
58
+ return in_masks, out_masks
59
+
60
+
61
+ def get_sparseflow_and_mask_forward(
62
+ resized_all_points,
63
+ n_steps, H, W,
64
+ is_backward_flow=False
65
+ ):
66
+
67
+ K = resized_all_points.shape[0]
68
+
69
+ starts = resized_all_points[:, 0] # [K, 2]
70
+
71
+ interpolated_ends = resized_all_points[:, 1:]
72
+
73
+ s_flow = np.zeros((K, n_steps, H, W, 2))
74
+ mask = np.zeros((K, n_steps, H, W))
75
+
76
+ for k in range(K):
77
+ for i in range(n_steps):
78
+ start, end = starts[k], interpolated_ends[k][i]
79
+ flow = np.int64(end - start) * (-1 if is_backward_flow is True else 1)
80
+ s_flow[k][i][int(start[1]), int(start[0])] = flow
81
+ mask[k][i][int(start[1]), int(start[0])] = 1
82
+
83
+ s_flow = np.sum(s_flow, axis=0)
84
+ mask = np.sum(mask, axis=0)
85
+
86
+ return s_flow, mask
87
+
88
+
89
+
90
+ def init_models(pretrained_model_name_or_path, resume_from_checkpoint, weight_dtype, device='cuda', enable_xformers_memory_efficient_attention=False, allow_tf32=False):
91
+
92
+ from models.unet_spatio_temporal_condition_controlnet import UNetSpatioTemporalConditionControlNetModel
93
+ from pipeline.pipeline import FlowControlNetPipeline
94
+ from models.svdxt_featureflow_forward_controlnet_s2d_fixcmp_norefine import FlowControlNet, CMP_demo
95
+
96
+ print('start loading models...')
97
+ # Load scheduler, tokenizer and models.
98
+ image_encoder = CLIPVisionModelWithProjection.from_pretrained(
99
+ pretrained_model_name_or_path, subfolder="image_encoder", revision=None, variant="fp16"
100
+ )
101
+ vae = AutoencoderKLTemporalDecoder.from_pretrained(
102
+ pretrained_model_name_or_path, subfolder="vae", revision=None, variant="fp16")
103
+ unet = UNetSpatioTemporalConditionControlNetModel.from_pretrained(
104
+ pretrained_model_name_or_path,
105
+ subfolder="unet",
106
+ low_cpu_mem_usage=True,
107
+ variant="fp16",
108
+ )
109
+
110
+ controlnet = FlowControlNet.from_pretrained(resume_from_checkpoint)
111
+
112
+ cmp = CMP_demo(
113
+ './models/cmp/experiments/semiauto_annot/resnet50_vip+mpii_liteflow/config.yaml',
114
+ 42000
115
+ ).to(device)
116
+ cmp.requires_grad_(False)
117
+
118
+ # Freeze vae and image_encoder
119
+ vae.requires_grad_(False)
120
+ image_encoder.requires_grad_(False)
121
+ unet.requires_grad_(False)
122
+ controlnet.requires_grad_(False)
123
+
124
+ # Move image_encoder and vae to gpu and cast to weight_dtype
125
+ image_encoder.to(device, dtype=weight_dtype)
126
+ vae.to(device, dtype=weight_dtype)
127
+ unet.to(device, dtype=weight_dtype)
128
+ controlnet.to(device, dtype=weight_dtype)
129
+
130
+ if enable_xformers_memory_efficient_attention:
131
+ if is_xformers_available():
132
+ import xformers
133
+
134
+ xformers_version = version.parse(xformers.__version__)
135
+ if xformers_version == version.parse("0.0.16"):
136
+ print(
137
+ "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
138
+ )
139
+ unet.enable_xformers_memory_efficient_attention()
140
+ else:
141
+ raise ValueError(
142
+ "xformers is not available. Make sure it is installed correctly")
143
+
144
+ if allow_tf32:
145
+ torch.backends.cuda.matmul.allow_tf32 = True
146
+
147
+ pipeline = FlowControlNetPipeline.from_pretrained(
148
+ pretrained_model_name_or_path,
149
+ unet=unet,
150
+ controlnet=controlnet,
151
+ image_encoder=image_encoder,
152
+ vae=vae,
153
+ torch_dtype=weight_dtype,
154
+ )
155
+ pipeline = pipeline.to(device)
156
+
157
+ print('models loaded.')
158
+
159
+ return pipeline, cmp
160
+
161
+
162
+ def interpolate_trajectory(points, n_points):
163
+ x = [point[0] for point in points]
164
+ y = [point[1] for point in points]
165
+
166
+ t = np.linspace(0, 1, len(points))
167
+
168
+ fx = PchipInterpolator(t, x)
169
+ fy = PchipInterpolator(t, y)
170
+
171
+ new_t = np.linspace(0, 1, n_points)
172
+
173
+ new_x = fx(new_t)
174
+ new_y = fy(new_t)
175
+ new_points = list(zip(new_x, new_y))
176
+
177
+ return new_points
178
+
179
+
180
+ def visualize_drag_v2(background_image_path, splited_tracks, width, height):
181
+ trajectory_maps = []
182
+
183
+ background_image = Image.open(background_image_path).convert('RGBA')
184
+ background_image = background_image.resize((width, height))
185
+ w, h = background_image.size
186
+ transparent_background = np.array(background_image)
187
+ transparent_background[:, :, -1] = 128
188
+ transparent_background = Image.fromarray(transparent_background)
189
+
190
+ # Create a transparent layer with the same size as the background image
191
+ transparent_layer = np.zeros((h, w, 4))
192
+ for splited_track in splited_tracks:
193
+ if len(splited_track) > 1:
194
+ splited_track = interpolate_trajectory(splited_track, 16)
195
+ splited_track = splited_track[:16]
196
+ for i in range(len(splited_track)-1):
197
+ start_point = (int(splited_track[i][0]), int(splited_track[i][1]))
198
+ end_point = (int(splited_track[i+1][0]), int(splited_track[i+1][1]))
199
+ vx = end_point[0] - start_point[0]
200
+ vy = end_point[1] - start_point[1]
201
+ arrow_length = np.sqrt(vx**2 + vy**2)
202
+ if i == len(splited_track)-2:
203
+ cv2.arrowedLine(transparent_layer, start_point, end_point, (255, 0, 0, 192), 2, tipLength=8 / arrow_length)
204
+ else:
205
+ cv2.line(transparent_layer, start_point, end_point, (255, 0, 0, 192), 2)
206
+ else:
207
+ cv2.circle(transparent_layer, (int(splited_track[0][0]), int(splited_track[0][1])), 2, (255, 0, 0, 192), -1)
208
+
209
+ transparent_layer = Image.fromarray(transparent_layer.astype(np.uint8))
210
+ trajectory_map = Image.alpha_composite(transparent_background, transparent_layer)
211
+ trajectory_maps.append(trajectory_map)
212
+ return trajectory_maps, transparent_layer
213
+
214
+
215
+ class Drag:
216
+ def __init__(self, device, height, width, model_length):
217
+ self.device = device
218
+
219
+ svd_ckpt = "ckpts/stable-video-diffusion-img2vid-xt-1-1"
220
+ mofa_ckpt = "ckpts/controlnet"
221
+
222
+ self.device = 'cuda'
223
+ self.weight_dtype = torch.float16
224
+
225
+ self.pipeline, self.cmp = init_models(
226
+ svd_ckpt,
227
+ mofa_ckpt,
228
+ weight_dtype=self.weight_dtype,
229
+ device=self.device
230
+ )
231
+
232
+ self.height = height
233
+ self.width = width
234
+ self.model_length = model_length
235
+
236
+ def get_cmp_flow(self, frames, sparse_optical_flow, mask, brush_mask=None):
237
+
238
+ '''
239
+ frames: [b, 13, 3, 384, 384] (0, 1) tensor
240
+ sparse_optical_flow: [b, 13, 2, 384, 384] (-384, 384) tensor
241
+ mask: [b, 13, 2, 384, 384] {0, 1} tensor
242
+ '''
243
+
244
+ b, t, c, h, w = frames.shape
245
+ assert h == 384 and w == 384
246
+ frames = frames.flatten(0, 1) # [b*13, 3, 256, 256]
247
+ sparse_optical_flow = sparse_optical_flow.flatten(0, 1) # [b*13, 2, 256, 256]
248
+ mask = mask.flatten(0, 1) # [b*13, 2, 256, 256]
249
+ cmp_flow = self.cmp.run(frames, sparse_optical_flow, mask) # [b*13, 2, 256, 256]
250
+
251
+ if brush_mask is not None:
252
+ brush_mask = torch.from_numpy(brush_mask) / 255.
253
+ brush_mask = brush_mask.to(cmp_flow.device, dtype=cmp_flow.dtype)
254
+ brush_mask = brush_mask.unsqueeze(0).unsqueeze(0)
255
+ cmp_flow = cmp_flow * brush_mask
256
+
257
+ cmp_flow = cmp_flow.reshape(b, t, 2, h, w)
258
+ return cmp_flow
259
+
260
+
261
+ def get_flow(self, pixel_values_384, sparse_optical_flow_384, mask_384, motion_brush_mask=None):
262
+
263
+ fb, fl, fc, _, _ = pixel_values_384.shape
264
+
265
+ controlnet_flow = self.get_cmp_flow(
266
+ pixel_values_384[:, 0:1, :, :, :].repeat(1, fl, 1, 1, 1),
267
+ sparse_optical_flow_384,
268
+ mask_384, motion_brush_mask
269
+ )
270
+
271
+ if self.height != 384 or self.width != 384:
272
+ scales = [self.height / 384, self.width / 384]
273
+ controlnet_flow = F.interpolate(controlnet_flow.flatten(0, 1), (self.height, self.width), mode='nearest').reshape(fb, fl, 2, self.height, self.width)
274
+ controlnet_flow[:, :, 0] *= scales[1]
275
+ controlnet_flow[:, :, 1] *= scales[0]
276
+
277
+ return controlnet_flow
278
+
279
+
280
+ @torch.no_grad()
281
+ def forward_sample(self, input_drag_384_inmask, input_drag_384_outmask, input_first_frame, input_mask_384_inmask, input_mask_384_outmask, in_mask_flag, out_mask_flag, motion_brush_mask=None, ctrl_scale=1., outputs=dict()):
282
+ '''
283
+ input_drag: [1, 13, 320, 576, 2]
284
+ input_drag_384: [1, 13, 384, 384, 2]
285
+ input_first_frame: [1, 3, 320, 576]
286
+ '''
287
+
288
+ seed = 42
289
+ num_frames = self.model_length
290
+
291
+ set_seed(seed)
292
+
293
+ input_first_frame_384 = F.interpolate(input_first_frame, (384, 384))
294
+ input_first_frame_384 = input_first_frame_384.repeat(num_frames - 1, 1, 1, 1).unsqueeze(0)
295
+ input_first_frame_pil = Image.fromarray(np.uint8(input_first_frame[0].cpu().permute(1, 2, 0)*255))
296
+ height, width = input_first_frame.shape[-2:]
297
+
298
+ input_drag_384_inmask = input_drag_384_inmask.permute(0, 1, 4, 2, 3) # [1, 13, 2, 384, 384]
299
+ mask_384_inmask = input_mask_384_inmask.unsqueeze(2).repeat(1, 1, 2, 1, 1) # [1, 13, 2, 384, 384]
300
+ input_drag_384_outmask = input_drag_384_outmask.permute(0, 1, 4, 2, 3) # [1, 13, 2, 384, 384]
301
+ mask_384_outmask = input_mask_384_outmask.unsqueeze(2).repeat(1, 1, 2, 1, 1) # [1, 13, 2, 384, 384]
302
+
303
+ print('start diffusion process...')
304
+
305
+ input_drag_384_inmask = input_drag_384_inmask.to(self.device, dtype=self.weight_dtype)
306
+ mask_384_inmask = mask_384_inmask.to(self.device, dtype=self.weight_dtype)
307
+ input_drag_384_outmask = input_drag_384_outmask.to(self.device, dtype=self.weight_dtype)
308
+ mask_384_outmask = mask_384_outmask.to(self.device, dtype=self.weight_dtype)
309
+
310
+ input_first_frame_384 = input_first_frame_384.to(self.device, dtype=self.weight_dtype)
311
+
312
+ if in_mask_flag:
313
+ flow_inmask = self.get_flow(
314
+ input_first_frame_384,
315
+ input_drag_384_inmask, mask_384_inmask, motion_brush_mask
316
+ )
317
+ else:
318
+ fb, fl = mask_384_inmask.shape[:2]
319
+ flow_inmask = torch.zeros(fb, fl, 2, self.height, self.width).to(self.device, dtype=self.weight_dtype)
320
+
321
+ if out_mask_flag:
322
+ flow_outmask = self.get_flow(
323
+ input_first_frame_384,
324
+ input_drag_384_outmask, mask_384_outmask
325
+ )
326
+ else:
327
+ fb, fl = mask_384_outmask.shape[:2]
328
+ flow_outmask = torch.zeros(fb, fl, 2, self.height, self.width).to(self.device, dtype=self.weight_dtype)
329
+
330
+ inmask_no_zero = (flow_inmask != 0).all(dim=2)
331
+ inmask_no_zero = inmask_no_zero.unsqueeze(2).expand_as(flow_inmask)
332
+
333
+ controlnet_flow = torch.where(inmask_no_zero, flow_inmask, flow_outmask)
334
+
335
+ val_output = self.pipeline(
336
+ input_first_frame_pil,
337
+ input_first_frame_pil,
338
+ controlnet_flow,
339
+ height=height,
340
+ width=width,
341
+ num_frames=num_frames,
342
+ decode_chunk_size=8,
343
+ motion_bucket_id=127,
344
+ fps=7,
345
+ noise_aug_strength=0.02,
346
+ controlnet_cond_scale=ctrl_scale,
347
+ )
348
+
349
+ video_frames, estimated_flow = val_output.frames[0], val_output.controlnet_flow
350
+
351
+ for i in range(num_frames):
352
+ img = video_frames[i]
353
+ video_frames[i] = np.array(img)
354
+ video_frames = torch.from_numpy(np.array(video_frames)).cuda().permute(0, 3, 1, 2).unsqueeze(0) / 255.
355
+
356
+ print(video_frames.shape)
357
+
358
+ viz_esti_flows = []
359
+ for i in range(estimated_flow.shape[1]):
360
+ temp_flow = estimated_flow[0][i].permute(1, 2, 0)
361
+ viz_esti_flows.append(flow_to_image(temp_flow))
362
+ viz_esti_flows = [np.uint8(np.ones_like(viz_esti_flows[-1]) * 255)] + viz_esti_flows
363
+ viz_esti_flows = np.stack(viz_esti_flows) # [t-1, h, w, c]
364
+
365
+ total_nps = viz_esti_flows
366
+
367
+ outputs['logits_imgs'] = video_frames
368
+ outputs['flows'] = torch.from_numpy(total_nps).cuda().permute(0, 3, 1, 2).unsqueeze(0) / 255.
369
+
370
+ return outputs
371
+
372
+ @torch.no_grad()
373
+ def get_cmp_flow_from_tracking_points(self, tracking_points, motion_brush_mask, first_frame_path):
374
+
375
+ original_width, original_height = self.width, self.height
376
+
377
+ input_all_points = tracking_points.constructor_args['value']
378
+
379
+ if len(input_all_points) == 0 or len(input_all_points[-1]) == 1:
380
+ return np.uint8(np.ones((original_width, original_height, 3))*255)
381
+
382
+ resized_all_points = [tuple([tuple([int(e1[0]*self.width/original_width), int(e1[1]*self.height/original_height)]) for e1 in e]) for e in input_all_points]
383
+ resized_all_points_384 = [tuple([tuple([int(e1[0]*384/original_width), int(e1[1]*384/original_height)]) for e1 in e]) for e in input_all_points]
384
+
385
+ new_resized_all_points = []
386
+ new_resized_all_points_384 = []
387
+ for tnum in range(len(resized_all_points)):
388
+ new_resized_all_points.append(interpolate_trajectory(input_all_points[tnum], self.model_length))
389
+ new_resized_all_points_384.append(interpolate_trajectory(resized_all_points_384[tnum], self.model_length))
390
+
391
+ resized_all_points = np.array(new_resized_all_points)
392
+ resized_all_points_384 = np.array(new_resized_all_points_384)
393
+
394
+ motion_brush_mask_384 = cv2.resize(motion_brush_mask, (384, 384), cv2.INTER_NEAREST)
395
+
396
+ resized_all_points_384_inmask, resized_all_points_384_outmask = \
397
+ divide_points_afterinterpolate(resized_all_points_384, motion_brush_mask_384)
398
+
399
+ in_mask_flag = False
400
+ out_mask_flag = False
401
+
402
+ if resized_all_points_384_inmask.shape[0] != 0:
403
+ in_mask_flag = True
404
+ input_drag_384_inmask, input_mask_384_inmask = \
405
+ get_sparseflow_and_mask_forward(
406
+ resized_all_points_384_inmask,
407
+ self.model_length - 1, 384, 384
408
+ )
409
+ else:
410
+ input_drag_384_inmask, input_mask_384_inmask = \
411
+ np.zeros((self.model_length - 1, 384, 384, 2)), \
412
+ np.zeros((self.model_length - 1, 384, 384))
413
+
414
+ if resized_all_points_384_outmask.shape[0] != 0:
415
+ out_mask_flag = True
416
+ input_drag_384_outmask, input_mask_384_outmask = \
417
+ get_sparseflow_and_mask_forward(
418
+ resized_all_points_384_outmask,
419
+ self.model_length - 1, 384, 384
420
+ )
421
+ else:
422
+ input_drag_384_outmask, input_mask_384_outmask = \
423
+ np.zeros((self.model_length - 1, 384, 384, 2)), \
424
+ np.zeros((self.model_length - 1, 384, 384))
425
+
426
+ input_drag_384_inmask = torch.from_numpy(input_drag_384_inmask).unsqueeze(0).to(self.device) # [1, 13, h, w, 2]
427
+ input_mask_384_inmask = torch.from_numpy(input_mask_384_inmask).unsqueeze(0).to(self.device) # [1, 13, h, w]
428
+ input_drag_384_outmask = torch.from_numpy(input_drag_384_outmask).unsqueeze(0).to(self.device) # [1, 13, h, w, 2]
429
+ input_mask_384_outmask = torch.from_numpy(input_mask_384_outmask).unsqueeze(0).to(self.device) # [1, 13, h, w]
430
+
431
+ first_frames_transform = transforms.Compose([
432
+ lambda x: Image.fromarray(x),
433
+ transforms.ToTensor(),
434
+ ])
435
+
436
+ input_first_frame = image2arr(first_frame_path)
437
+ input_first_frame = repeat(first_frames_transform(input_first_frame), 'c h w -> b c h w', b=1).to(self.device)
438
+
439
+ seed = 42
440
+ num_frames = self.model_length
441
+
442
+ set_seed(seed)
443
+
444
+ input_first_frame_384 = F.interpolate(input_first_frame, (384, 384))
445
+ input_first_frame_384 = input_first_frame_384.repeat(num_frames - 1, 1, 1, 1).unsqueeze(0)
446
+
447
+ input_drag_384_inmask = input_drag_384_inmask.permute(0, 1, 4, 2, 3) # [1, 13, 2, 384, 384]
448
+ mask_384_inmask = input_mask_384_inmask.unsqueeze(2).repeat(1, 1, 2, 1, 1) # [1, 13, 2, 384, 384]
449
+ input_drag_384_outmask = input_drag_384_outmask.permute(0, 1, 4, 2, 3) # [1, 13, 2, 384, 384]
450
+ mask_384_outmask = input_mask_384_outmask.unsqueeze(2).repeat(1, 1, 2, 1, 1) # [1, 13, 2, 384, 384]
451
+
452
+ input_drag_384_inmask = input_drag_384_inmask.to(self.device, dtype=self.weight_dtype)
453
+ mask_384_inmask = mask_384_inmask.to(self.device, dtype=self.weight_dtype)
454
+ input_drag_384_outmask = input_drag_384_outmask.to(self.device, dtype=self.weight_dtype)
455
+ mask_384_outmask = mask_384_outmask.to(self.device, dtype=self.weight_dtype)
456
+
457
+ input_first_frame_384 = input_first_frame_384.to(self.device, dtype=self.weight_dtype)
458
+
459
+ if in_mask_flag:
460
+ flow_inmask = self.get_flow(
461
+ input_first_frame_384,
462
+ input_drag_384_inmask, mask_384_inmask, motion_brush_mask_384
463
+ )
464
+ else:
465
+ fb, fl = mask_384_inmask.shape[:2]
466
+ flow_inmask = torch.zeros(fb, fl, 2, self.height, self.width).to(self.device, dtype=self.weight_dtype)
467
+
468
+ if out_mask_flag:
469
+ flow_outmask = self.get_flow(
470
+ input_first_frame_384,
471
+ input_drag_384_outmask, mask_384_outmask
472
+ )
473
+ else:
474
+ fb, fl = mask_384_outmask.shape[:2]
475
+ flow_outmask = torch.zeros(fb, fl, 2, self.height, self.width).to(self.device, dtype=self.weight_dtype)
476
+
477
+ inmask_no_zero = (flow_inmask != 0).all(dim=2)
478
+ inmask_no_zero = inmask_no_zero.unsqueeze(2).expand_as(flow_inmask)
479
+
480
+ controlnet_flow = torch.where(inmask_no_zero, flow_inmask, flow_outmask)
481
+
482
+ controlnet_flow = controlnet_flow[0, -1].permute(1, 2, 0)
483
+ viz_esti_flows = flow_to_image(controlnet_flow) # [h, w, c]
484
+
485
+ return viz_esti_flows
486
+
487
+ def run(self, first_frame_path, tracking_points, inference_batch_size, motion_brush_mask, motion_brush_viz, ctrl_scale):
488
+
489
+ original_width, original_height = self.width, self.height
490
+
491
+ input_all_points = tracking_points.constructor_args['value']
492
+ resized_all_points = [tuple([tuple([int(e1[0]*self.width/original_width), int(e1[1]*self.height/original_height)]) for e1 in e]) for e in input_all_points]
493
+ resized_all_points_384 = [tuple([tuple([int(e1[0]*384/original_width), int(e1[1]*384/original_height)]) for e1 in e]) for e in input_all_points]
494
+
495
+ new_resized_all_points = []
496
+ new_resized_all_points_384 = []
497
+ for tnum in range(len(resized_all_points)):
498
+ new_resized_all_points.append(interpolate_trajectory(input_all_points[tnum], self.model_length))
499
+ new_resized_all_points_384.append(interpolate_trajectory(resized_all_points_384[tnum], self.model_length))
500
+
501
+ resized_all_points = np.array(new_resized_all_points)
502
+ resized_all_points_384 = np.array(new_resized_all_points_384)
503
+
504
+ motion_brush_mask_384 = cv2.resize(motion_brush_mask, (384, 384), cv2.INTER_NEAREST)
505
+
506
+ resized_all_points_384_inmask, resized_all_points_384_outmask = \
507
+ divide_points_afterinterpolate(resized_all_points_384, motion_brush_mask_384)
508
+
509
+ in_mask_flag = False
510
+ out_mask_flag = False
511
+
512
+ if resized_all_points_384_inmask.shape[0] != 0:
513
+ in_mask_flag = True
514
+ input_drag_384_inmask, input_mask_384_inmask = \
515
+ get_sparseflow_and_mask_forward(
516
+ resized_all_points_384_inmask,
517
+ self.model_length - 1, 384, 384
518
+ )
519
+ else:
520
+ input_drag_384_inmask, input_mask_384_inmask = \
521
+ np.zeros((self.model_length - 1, 384, 384, 2)), \
522
+ np.zeros((self.model_length - 1, 384, 384))
523
+
524
+ if resized_all_points_384_outmask.shape[0] != 0:
525
+ out_mask_flag = True
526
+ input_drag_384_outmask, input_mask_384_outmask = \
527
+ get_sparseflow_and_mask_forward(
528
+ resized_all_points_384_outmask,
529
+ self.model_length - 1, 384, 384
530
+ )
531
+ else:
532
+ input_drag_384_outmask, input_mask_384_outmask = \
533
+ np.zeros((self.model_length - 1, 384, 384, 2)), \
534
+ np.zeros((self.model_length - 1, 384, 384))
535
+
536
+ input_drag_384_inmask = torch.from_numpy(input_drag_384_inmask).unsqueeze(0) # [1, 13, h, w, 2]
537
+ input_mask_384_inmask = torch.from_numpy(input_mask_384_inmask).unsqueeze(0) # [1, 13, h, w]
538
+ input_drag_384_outmask = torch.from_numpy(input_drag_384_outmask).unsqueeze(0) # [1, 13, h, w, 2]
539
+ input_mask_384_outmask = torch.from_numpy(input_mask_384_outmask).unsqueeze(0) # [1, 13, h, w]
540
+
541
+ dir, base, ext = split_filename(first_frame_path)
542
+ id = base.split('_')[0]
543
+
544
+ image_pil = image2pil(first_frame_path)
545
+ image_pil = image_pil.resize((self.width, self.height), Image.BILINEAR).convert('RGB')
546
+
547
+ visualized_drag, _ = visualize_drag_v2(first_frame_path, resized_all_points, self.width, self.height)
548
+
549
+ motion_brush_viz_pil = Image.fromarray(motion_brush_viz.astype(np.uint8)).convert('RGBA')
550
+ visualized_drag = visualized_drag[0].convert('RGBA')
551
+ visualized_drag_brush = Image.alpha_composite(motion_brush_viz_pil, visualized_drag)
552
+
553
+ first_frames_transform = transforms.Compose([
554
+ lambda x: Image.fromarray(x),
555
+ transforms.ToTensor(),
556
+ ])
557
+
558
+ outputs = None
559
+ ouput_video_list = []
560
+ ouput_flow_list = []
561
+ num_inference = 1
562
+ for i in tqdm(range(num_inference)):
563
+ if not outputs:
564
+ first_frames = image2arr(first_frame_path)
565
+ first_frames = repeat(first_frames_transform(first_frames), 'c h w -> b c h w', b=inference_batch_size).to(self.device)
566
+ else:
567
+ first_frames = outputs['logits_imgs'][:, -1]
568
+
569
+
570
+ outputs = self.forward_sample(
571
+ input_drag_384_inmask.to(self.device),
572
+ input_drag_384_outmask.to(self.device),
573
+ first_frames.to(self.device),
574
+ input_mask_384_inmask.to(self.device),
575
+ input_mask_384_outmask.to(self.device),
576
+ in_mask_flag,
577
+ out_mask_flag,
578
+ motion_brush_mask_384,
579
+ ctrl_scale)
580
+
581
+ ouput_video_list.append(outputs['logits_imgs'])
582
+ ouput_flow_list.append(outputs['flows'])
583
+
584
+ hint_path = os.path.join(output_dir_video, str(id), f'{id}_hint.png')
585
+ visualized_drag_brush.save(hint_path)
586
+
587
+ for i in range(inference_batch_size):
588
+ output_tensor = [ouput_video_list[0][i]]
589
+ flow_tensor = [ouput_flow_list[0][i]]
590
+ output_tensor = torch.cat(output_tensor, dim=0)
591
+ flow_tensor = torch.cat(flow_tensor, dim=0)
592
+
593
+ outputs_path = os.path.join(output_dir_video, str(id), f's{ctrl_scale}', f'{id}_output.gif')
594
+ flows_path = os.path.join(output_dir_video, str(id), f's{ctrl_scale}', f'{id}_flow.gif')
595
+
596
+ outputs_mp4_path = os.path.join(output_dir_video, str(id), f's{ctrl_scale}', f'{id}_output.mp4')
597
+ flows_mp4_path = os.path.join(output_dir_video, str(id), f's{ctrl_scale}', f'{id}_flow.mp4')
598
+
599
+ outputs_frames_path = os.path.join(output_dir_frame, str(id), f's{ctrl_scale}', f'{id}_output')
600
+ flows_frames_path = os.path.join(output_dir_frame, str(id), f's{ctrl_scale}', f'{id}_flow')
601
+
602
+ os.makedirs(os.path.join(output_dir_video, str(id), f's{ctrl_scale}'), exist_ok=True)
603
+ os.makedirs(os.path.join(outputs_frames_path), exist_ok=True)
604
+ os.makedirs(os.path.join(flows_frames_path), exist_ok=True)
605
+
606
+ print(output_tensor.shape)
607
+
608
+ output_RGB = output_tensor.permute(0, 2, 3, 1).mul(255).cpu().numpy()
609
+ flow_RGB = flow_tensor.permute(0, 2, 3, 1).mul(255).cpu().numpy()
610
+
611
+ torchvision.io.write_video(
612
+ outputs_mp4_path,
613
+ output_RGB,
614
+ fps=20, video_codec='h264', options={'crf': '10'}
615
+ )
616
+
617
+ torchvision.io.write_video(
618
+ flows_mp4_path,
619
+ flow_RGB,
620
+ fps=20, video_codec='h264', options={'crf': '10'}
621
+ )
622
+
623
+ imageio.mimsave(outputs_path, np.uint8(output_RGB), fps=20, loop=0)
624
+
625
+ imageio.mimsave(flows_path, np.uint8(flow_RGB), fps=20, loop=0)
626
+
627
+ for f in range(output_RGB.shape[0]):
628
+ Image.fromarray(np.uint8(output_RGB[f])).save(os.path.join(outputs_frames_path, f'{str(f).zfill(3)}.png'))
629
+ Image.fromarray(np.uint8(flow_RGB[f])).save(os.path.join(flows_frames_path, f'{str(f).zfill(3)}.png'))
630
+
631
+ return hint_path, outputs_path, flows_path, outputs_mp4_path, flows_mp4_path
632
+
633
+
634
+ with gr.Blocks() as demo:
635
+ gr.Markdown("""<h1 align="center">MOFA-Video</h1><br>""")
636
+
637
+ gr.Markdown("""Official Gradio Demo for <a href='https://myniuuu.github.io/MOFA_Video'><b>MOFA-Video: Controllable Image Animation via Generative Motion Field Adaptions in Frozen Image-to-Video Diffusion Model</b></a>.<br>""")
638
+
639
+ gr.Markdown(
640
+ """
641
+ During the inference, kindly follow these instructions:
642
+ <br>
643
+ 1. Use the "Upload Image" button to upload an image. Avoid dragging the image directly into the window. <br>
644
+ 2. Proceed to draw trajectories: <br>
645
+ 2.1. Click "Add Trajectory" first, then select points on the "Add Trajectory Here" image. The first click sets the starting point. Click multiple points to create a non-linear trajectory. To add a new trajectory, click "Add Trajectory" again and select points on the image. Avoid clicking the "Add Trajectory" button multiple times without clicking points in the image to add the trajectory, as this can lead to errors. <br>
646
+ 2.2. After adding each trajectory, an optical flow image will be displayed automatically. Use it as a reference to adjust the trajectory for desired effects (e.g., area, intensity). <br>
647
+ 2.3. To delete the latest trajectory, click "Delete Last Trajectory." <br>
648
+ 2.4. Choose the Control Scale in the bar. This determines the control intensity. Setting it to 0 means no control (pure generation result of SVD itself), while setting it to 1 results in the strongest control (which will not lead to good results in most cases because of twisting artifacts). A preset value of 0.6 is recommended for most cases. <br>
649
+ 2.5. To use the motion brush for restraining the control area of the trajectory, click to add masks on the "Add Motion Brush Here" image. The motion brush restricts the optical flow area derived from the trajectory whose starting point is within the motion brush. The displayed optical flow image will change correspondingly. Adjust the motion brush radius using the "Motion Brush Radius" bar. <br>
650
+ 3. Click the "Run" button to animate the image according to the path. <br>
651
+ """
652
+ )
653
+
654
+ target_size = 512
655
+ DragNUWA_net = Drag("cuda:0", target_size, target_size, 25)
656
+ first_frame_path = gr.State()
657
+ tracking_points = gr.State([])
658
+ motion_brush_points = gr.State([])
659
+ motion_brush_mask = gr.State()
660
+ motion_brush_viz = gr.State()
661
+ inference_batch_size = gr.State(1)
662
+
663
+ def preprocess_image(image):
664
+
665
+ image_pil = image2pil(image.name)
666
+ raw_w, raw_h = image_pil.size
667
+
668
+ max_edge = min(raw_w, raw_h)
669
+ resize_ratio = target_size / max_edge
670
+
671
+ image_pil = image_pil.resize((round(raw_w * resize_ratio), round(raw_h * resize_ratio)), Image.BILINEAR)
672
+
673
+ new_w, new_h = image_pil.size
674
+ crop_w = new_w - (new_w % 64)
675
+ crop_h = new_h - (new_h % 64)
676
+
677
+ image_pil = transforms.CenterCrop((crop_h, crop_w))(image_pil.convert('RGB'))
678
+
679
+ DragNUWA_net.width = crop_w
680
+ DragNUWA_net.height = crop_h
681
+
682
+ id = str(time.time()).split('.')[0]
683
+ os.makedirs(os.path.join(output_dir_video, str(id)), exist_ok=True)
684
+ os.makedirs(os.path.join(output_dir_frame, str(id)), exist_ok=True)
685
+
686
+ first_frame_path = os.path.join(output_dir_video, str(id), f"{id}_input.png")
687
+ image_pil.save(first_frame_path)
688
+
689
+ return first_frame_path, first_frame_path, first_frame_path, gr.State([]), gr.State([]), np.zeros((crop_h, crop_w)), np.zeros((crop_h, crop_w, 4))
690
+
691
+ def add_drag(tracking_points):
692
+ if len(tracking_points.constructor_args['value']) != 0 and tracking_points.constructor_args['value'][-1] == []:
693
+ return tracking_points
694
+ tracking_points.constructor_args['value'].append([])
695
+ return tracking_points
696
+
697
+ def add_mask(motion_brush_points):
698
+ motion_brush_points.constructor_args['value'].append([])
699
+ return motion_brush_points
700
+
701
+ def delete_last_drag(tracking_points, first_frame_path, motion_brush_mask):
702
+ if len(tracking_points.constructor_args['value']) > 0:
703
+ tracking_points.constructor_args['value'].pop()
704
+ transparent_background = Image.open(first_frame_path).convert('RGBA')
705
+ w, h = transparent_background.size
706
+ transparent_layer = np.zeros((h, w, 4))
707
+ for track in tracking_points.constructor_args['value']:
708
+ if len(track) > 1:
709
+ for i in range(len(track)-1):
710
+ start_point = track[i]
711
+ end_point = track[i+1]
712
+ vx = end_point[0] - start_point[0]
713
+ vy = end_point[1] - start_point[1]
714
+ arrow_length = np.sqrt(vx**2 + vy**2)
715
+ if i == len(track)-2:
716
+ cv2.arrowedLine(transparent_layer, tuple(start_point), tuple(end_point), (255, 0, 0, 255), 2, tipLength=8 / arrow_length)
717
+ else:
718
+ cv2.line(transparent_layer, tuple(start_point), tuple(end_point), (255, 0, 0, 255), 2,)
719
+ else:
720
+ cv2.circle(transparent_layer, tuple(track[0]), 5, (255, 0, 0, 255), -1)
721
+
722
+ transparent_layer = Image.fromarray(transparent_layer.astype(np.uint8))
723
+ trajectory_map = Image.alpha_composite(transparent_background, transparent_layer)
724
+
725
+ viz_flow = DragNUWA_net.get_cmp_flow_from_tracking_points(tracking_points, motion_brush_mask, first_frame_path)
726
+
727
+ return tracking_points, trajectory_map, viz_flow
728
+
729
+ def add_motion_brushes(motion_brush_points, motion_brush_mask, transparent_layer, first_frame_path, radius, tracking_points, evt: gr.SelectData):
730
+
731
+ transparent_background = Image.open(first_frame_path).convert('RGBA')
732
+ w, h = transparent_background.size
733
+
734
+ motion_points = motion_brush_points.constructor_args['value']
735
+ motion_points.append(evt.index)
736
+
737
+ x, y = evt.index
738
+
739
+ cv2.circle(motion_brush_mask, (x, y), radius, 255, -1)
740
+ cv2.circle(transparent_layer, (x, y), radius, (0, 0, 255, 255), -1)
741
+
742
+ transparent_layer_pil = Image.fromarray(transparent_layer.astype(np.uint8))
743
+ motion_map = Image.alpha_composite(transparent_background, transparent_layer_pil)
744
+
745
+ viz_flow = DragNUWA_net.get_cmp_flow_from_tracking_points(tracking_points, motion_brush_mask, first_frame_path)
746
+
747
+ return motion_brush_mask, transparent_layer, motion_map, viz_flow
748
+
749
+ def add_tracking_points(tracking_points, first_frame_path, motion_brush_mask, evt: gr.SelectData):
750
+
751
+ print(f"You selected {evt.value} at {evt.index} from {evt.target}")
752
+
753
+ if len(tracking_points.constructor_args['value']) == 0:
754
+ tracking_points.constructor_args['value'].append([])
755
+
756
+ tracking_points.constructor_args['value'][-1].append(evt.index)
757
+
758
+ # print(tracking_points.constructor_args['value'])
759
+
760
+ transparent_background = Image.open(first_frame_path).convert('RGBA')
761
+ w, h = transparent_background.size
762
+ transparent_layer = np.zeros((h, w, 4))
763
+ for track in tracking_points.constructor_args['value']:
764
+ if len(track) > 1:
765
+ for i in range(len(track)-1):
766
+ start_point = track[i]
767
+ end_point = track[i+1]
768
+ vx = end_point[0] - start_point[0]
769
+ vy = end_point[1] - start_point[1]
770
+ arrow_length = np.sqrt(vx**2 + vy**2)
771
+ if i == len(track)-2:
772
+ cv2.arrowedLine(transparent_layer, tuple(start_point), tuple(end_point), (255, 0, 0, 255), 2, tipLength=8 / arrow_length)
773
+ else:
774
+ cv2.line(transparent_layer, tuple(start_point), tuple(end_point), (255, 0, 0, 255), 2,)
775
+ else:
776
+ cv2.circle(transparent_layer, tuple(track[0]), 3, (255, 0, 0, 255), -1)
777
+
778
+ transparent_layer = Image.fromarray(transparent_layer.astype(np.uint8))
779
+ trajectory_map = Image.alpha_composite(transparent_background, transparent_layer)
780
+
781
+ viz_flow = DragNUWA_net.get_cmp_flow_from_tracking_points(tracking_points, motion_brush_mask, first_frame_path)
782
+
783
+ return tracking_points, trajectory_map, viz_flow
784
+
785
+ with gr.Row():
786
+ with gr.Column(scale=2):
787
+ image_upload_button = gr.UploadButton(label="Upload Image",file_types=["image"])
788
+ add_drag_button = gr.Button(value="Add Trajectory")
789
+ run_button = gr.Button(value="Run")
790
+ delete_last_drag_button = gr.Button(value="Delete Last Trajectory")
791
+ brush_radius = gr.Slider(label='Motion Brush Radius',
792
+ minimum=1,
793
+ maximum=100,
794
+ step=1,
795
+ value=10)
796
+ ctrl_scale = gr.Slider(label='Control Scale',
797
+ minimum=0,
798
+ maximum=1.,
799
+ step=0.01,
800
+ value=0.6)
801
+
802
+ with gr.Column(scale=5):
803
+ input_image = gr.Image(label="Add Trajectory Here",
804
+ interactive=True)
805
+ with gr.Column(scale=5):
806
+ input_image_mask = gr.Image(label="Add Motion Brush Here",
807
+ interactive=True)
808
+
809
+ with gr.Row():
810
+ with gr.Column(scale=6):
811
+ viz_flow = gr.Image(label="Visualized Flow")
812
+ with gr.Column(scale=6):
813
+ hint_image = gr.Image(label="Visualized Hint Image")
814
+ with gr.Row():
815
+ with gr.Column(scale=6):
816
+ output_video = gr.Image(label="Output Video")
817
+ with gr.Column(scale=6):
818
+ output_flow = gr.Image(label="Output Flow")
819
+
820
+ with gr.Row():
821
+ with gr.Column(scale=6):
822
+ output_video_mp4 = gr.Video(label="Output Video mp4")
823
+ with gr.Column(scale=6):
824
+ output_flow_mp4 = gr.Video(label="Output Flow mp4")
825
+
826
+ image_upload_button.upload(preprocess_image, image_upload_button, [input_image, input_image_mask, first_frame_path, tracking_points, motion_brush_points, motion_brush_mask, motion_brush_viz])
827
+
828
+ add_drag_button.click(add_drag, tracking_points, tracking_points)
829
+
830
+ delete_last_drag_button.click(delete_last_drag, [tracking_points, first_frame_path, motion_brush_mask], [tracking_points, input_image, viz_flow])
831
+
832
+ input_image.select(add_tracking_points, [tracking_points, first_frame_path, motion_brush_mask], [tracking_points, input_image, viz_flow])
833
+
834
+ input_image_mask.select(add_motion_brushes, [motion_brush_points, motion_brush_mask, motion_brush_viz, first_frame_path, brush_radius, tracking_points], [motion_brush_mask, motion_brush_viz, input_image_mask, viz_flow])
835
+
836
+ run_button.click(DragNUWA_net.run, [first_frame_path, tracking_points, inference_batch_size, motion_brush_mask, motion_brush_viz, ctrl_scale], [hint_image, output_video, output_flow, output_video_mp4, output_flow_mp4])
837
+
838
+ demo.launch(server_name="127.0.0.1", debug=True, server_port=9080)
ckpts/controlnet/config.json ADDED
@@ -0,0 +1,45 @@
1
+ {
2
+ "_class_name": "FlowControlNet",
3
+ "_diffusers_version": "0.25.1",
4
+ "_name_or_path": "/apdcephfs_cq10/share_1290939/myniu/svd_controlnet/svdxt11_featureflow_forward_avg_256256_stride4/unimatch_512384/checkpoint-100000/controlnet",
5
+ "addition_time_embed_dim": 256,
6
+ "block_out_channels": [
7
+ 320,
8
+ 640,
9
+ 1280,
10
+ 1280
11
+ ],
12
+ "conditioning_channels": 3,
13
+ "conditioning_embedding_out_channels": [
14
+ 16,
15
+ 32,
16
+ 96,
17
+ 256
18
+ ],
19
+ "cross_attention_dim": 1024,
20
+ "down_block_types": [
21
+ "CrossAttnDownBlockSpatioTemporal",
22
+ "CrossAttnDownBlockSpatioTemporal",
23
+ "CrossAttnDownBlockSpatioTemporal",
24
+ "DownBlockSpatioTemporal"
25
+ ],
26
+ "in_channels": 8,
27
+ "layers_per_block": 2,
28
+ "num_attention_heads": [
29
+ 5,
30
+ 10,
31
+ 10,
32
+ 20
33
+ ],
34
+ "num_frames": 25,
35
+ "out_channels": 4,
36
+ "projection_class_embeddings_input_dim": 768,
37
+ "sample_size": null,
38
+ "transformer_layers_per_block": 1,
39
+ "up_block_types": [
40
+ "UpBlockSpatioTemporal",
41
+ "CrossAttnUpBlockSpatioTemporal",
42
+ "CrossAttnUpBlockSpatioTemporal",
43
+ "CrossAttnUpBlockSpatioTemporal"
44
+ ]
45
+ }
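
This config, together with the `diffusion_pytorch_model.safetensors` weights below, is what `init_models` in app.py loads through `FlowControlNet.from_pretrained`. A minimal standalone sketch (assuming the `FlowControlNet` class shipped with this repo):

```python
# Sketch: load the MOFA-Adapter ControlNet from ./ckpts/controlnet,
# mirroring what init_models() in app.py does.
import torch
from models.svdxt_featureflow_forward_controlnet_s2d_fixcmp_norefine import FlowControlNet

controlnet = FlowControlNet.from_pretrained("ckpts/controlnet")
controlnet.requires_grad_(False)             # inference only
controlnet.to("cuda", dtype=torch.float16)   # matches weight_dtype in app.py
```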
ckpts/controlnet/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1303192a1e72d071e15e7eb37fd1ea15f6424aaf2cd6b6b1e1bb3b1e9e75d37e
3
+ size 2777345452
models/cmp/experiments/rep_learning/alexnet_yfcc+youtube_voc_16gpu_140k/config.yaml ADDED
@@ -0,0 +1,59 @@
1
+ model:
2
+ arch: CMP
3
+ total_iter: 140000
4
+ lr_steps: [80000, 120000]
5
+ lr_mults: [0.1, 0.1]
6
+ lr: 0.1
7
+ optim: SGD
8
+ warmup_lr: []
9
+ warmup_steps: []
10
+ module:
11
+ arch: CMP
12
+ image_encoder: alexnet_fcn_32x
13
+ sparse_encoder: shallownet32x
14
+ flow_decoder: MotionDecoderPlain
15
+ skip_layer: False
16
+ img_enc_dim: 256
17
+ sparse_enc_dim: 16
18
+ output_dim: 198
19
+ decoder_combo: [1,2,4]
20
+ pretrained_image_encoder: False
21
+ flow_criterion: "DiscreteLoss"
22
+ nbins: 99
23
+ fmax: 50
24
+ data:
25
+ workers: 2
26
+ batch_size: 12
27
+ batch_size_test: 1
28
+ data_mean: [123.675, 116.28, 103.53] # RGB
29
+ data_div: [58.395, 57.12, 57.375]
30
+ short_size: 416
31
+ crop_size: [384, 384]
32
+ sample_strategy: ['grid', 'watershed']
33
+ sample_bg_ratio: 0.000025
34
+ nms_ks: 81
35
+ max_num_guide: 150
36
+
37
+ flow_file_type: "jpg"
38
+ image_flow_aug:
39
+ flip: False
40
+ flow_aug:
41
+ reverse: False
42
+ scale: False
43
+ rotate: False
44
+ train_source:
45
+ - data/yfcc/lists/train.txt
46
+ - data/youtube9000/lists/train.txt
47
+ val_source:
48
+ - data/yfcc/lists/val.txt
49
+ memcached: False
50
+ trainer:
51
+ initial_val: True
52
+ print_freq: 100
53
+ val_freq: 10000
54
+ save_freq: 10000
55
+ val_iter: -1
56
+ val_disp_start_iter: 0
57
+ val_disp_end_iter: 16
58
+ loss_record: ['loss_flow']
59
+ tensorboard: False
models/cmp/experiments/rep_learning/alexnet_yfcc+youtube_voc_16gpu_140k/resume.sh ADDED
@@ -0,0 +1,8 @@
1
+ #!/bin/bash
2
+ work_path=$(dirname $0)
3
+ python -m torch.distributed.launch --nproc_per_node=8 \
4
+ --nnodes=2 --node_rank=$1 \
5
+ --master_addr="192.168.1.1" main.py \
6
+ --config $work_path/config.yaml --launcher pytorch \
7
+ --load-iter 10000 \
8
+ --resume
models/cmp/experiments/rep_learning/alexnet_yfcc+youtube_voc_16gpu_140k/resume_slurm.sh ADDED
@@ -0,0 +1,9 @@
1
+ #!/bin/bash
2
+ work_path=$(dirname $0)
3
+ partition=$1
4
+ GLOG_vmodule=MemcachedClient=-1 srun --mpi=pmi2 -p $partition -n16 \
5
+ --gres=gpu:8 --ntasks-per-node=8 \
6
+ python -u main.py \
7
+ --config $work_path/config.yaml --launcher slurm \
8
+ --load-iter 10000 \
9
+ --resume
models/cmp/experiments/rep_learning/alexnet_yfcc+youtube_voc_16gpu_140k/train.sh ADDED
@@ -0,0 +1,6 @@
1
+ #!/bin/bash
2
+ work_path=$(dirname $0)
3
+ python -m torch.distributed.launch --nproc_per_node=8 \
4
+ --nnodes=2 --node_rank=$1 \
5
+ --master_addr="192.168.1.1" main.py \
6
+ --config $work_path/config.yaml --launcher pytorch
models/cmp/experiments/rep_learning/alexnet_yfcc+youtube_voc_16gpu_140k/train_slurm.sh ADDED
@@ -0,0 +1,7 @@
1
+ #!/bin/bash
2
+ work_path=$(dirname $0)
3
+ partition=$1
4
+ GLOG_vmodule=MemcachedClient=-1 srun --mpi=pmi2 -p $partition -n16 \
5
+ --gres=gpu:8 --ntasks-per-node=8 \
6
+ python -u main.py \
7
+ --config $work_path/config.yaml --launcher slurm
models/cmp/experiments/rep_learning/alexnet_yfcc+youtube_voc_16gpu_140k/validate.sh ADDED
@@ -0,0 +1,6 @@
1
+ #!/bin/bash
2
+ work_path=$(dirname $0)
3
+ python -m torch.distributed.launch --nproc_per_node=8 main.py \
4
+ --config $work_path/config.yaml --launcher pytorch \
5
+ --load-iter 70000 \
6
+ --validate
models/cmp/experiments/rep_learning/alexnet_yfcc+youtube_voc_16gpu_140k/validate_slurm.sh ADDED
@@ -0,0 +1,8 @@
1
+ #!/bin/bash
2
+ work_path=$(dirname $0)
3
+ partition=$1
4
+ GLOG_vmodule=MemcachedClient=-1 srun --mpi=pmi2 -p $partition -n8 \
5
+ --gres=gpu:8 --ntasks-per-node=8 \
6
+ python -u main.py --config $work_path/config.yaml --launcher slurm \
7
+ --load-iter 70000 \
8
+ --validate
models/cmp/experiments/rep_learning/alexnet_yfcc_voc_16gpu_70k/config.yaml ADDED
@@ -0,0 +1,58 @@
1
+ model:
2
+ arch: CMP
3
+ total_iter: 70000
4
+ lr_steps: [40000, 60000]
5
+ lr_mults: [0.1, 0.1]
6
+ lr: 0.1
7
+ optim: SGD
8
+ warmup_lr: []
9
+ warmup_steps: []
10
+ module:
11
+ arch: CMP
12
+ image_encoder: alexnet_fcn_32x
13
+ sparse_encoder: shallownet32x
14
+ flow_decoder: MotionDecoderPlain
15
+ skip_layer: False
16
+ img_enc_dim: 256
17
+ sparse_enc_dim: 16
18
+ output_dim: 198
19
+ decoder_combo: [1,2,4]
20
+ pretrained_image_encoder: False
21
+ flow_criterion: "DiscreteLoss"
22
+ nbins: 99
23
+ fmax: 50
24
+ data:
25
+ workers: 2
26
+ batch_size: 12
27
+ batch_size_test: 1
28
+ data_mean: [123.675, 116.28, 103.53] # RGB
29
+ data_div: [58.395, 57.12, 57.375]
30
+ short_size: 416
31
+ crop_size: [384, 384]
32
+ sample_strategy: ['grid', 'watershed']
33
+ sample_bg_ratio: 0.00015625
34
+ nms_ks: 41
35
+ max_num_guide: 150
36
+
37
+ flow_file_type: "jpg"
38
+ image_flow_aug:
39
+ flip: False
40
+ flow_aug:
41
+ reverse: False
42
+ scale: False
43
+ rotate: False
44
+ train_source:
45
+ - data/yfcc/lists/train.txt
46
+ val_source:
47
+ - data/yfcc/lists/val.txt
48
+ memcached: False
49
+ trainer:
50
+ initial_val: True
51
+ print_freq: 100
52
+ val_freq: 10000
53
+ save_freq: 10000
54
+ val_iter: -1
55
+ val_disp_start_iter: 0
56
+ val_disp_end_iter: 16
57
+ loss_record: ['loss_flow']
58
+ tensorboard: False
models/cmp/experiments/rep_learning/alexnet_yfcc_voc_16gpu_70k/resume.sh ADDED
@@ -0,0 +1,8 @@
1
+ #!/bin/bash
2
+ work_path=$(dirname $0)
3
+ python -m torch.distributed.launch --nproc_per_node=8 \
4
+ --nnodes=2 --node_rank=$1 \
5
+ --master_addr="192.168.1.1" main.py \
6
+ --config $work_path/config.yaml --launcher pytorch \
7
+ --load-iter 10000 \
8
+ --resume
models/cmp/experiments/rep_learning/alexnet_yfcc_voc_16gpu_70k/resume_slurm.sh ADDED
@@ -0,0 +1,9 @@
1
+ #!/bin/bash
2
+ work_path=$(dirname $0)
3
+ partition=$1
4
+ GLOG_vmodule=MemcachedClient=-1 srun --mpi=pmi2 -p $partition -n16 \
5
+ --gres=gpu:8 --ntasks-per-node=8 \
6
+ python -u main.py \
7
+ --config $work_path/config.yaml --launcher slurm \
8
+ --load-iter 10000 \
9
+ --resume
models/cmp/experiments/rep_learning/alexnet_yfcc_voc_16gpu_70k/train.sh ADDED
@@ -0,0 +1,6 @@
1
+ #!/bin/bash
2
+ work_path=$(dirname $0)
3
+ python -m torch.distributed.launch --nproc_per_node=8 \
4
+ --nnodes=2 --node_rank=$1 \
5
+ --master_addr="192.168.1.1" main.py \
6
+ --config $work_path/config.yaml --launcher pytorch
models/cmp/experiments/rep_learning/alexnet_yfcc_voc_16gpu_70k/train_slurm.sh ADDED
@@ -0,0 +1,7 @@
1
+ #!/bin/bash
2
+ work_path=$(dirname $0)
3
+ partition=$1
4
+ GLOG_vmodule=MemcachedClient=-1 srun --mpi=pmi2 -p $partition -n16 \
5
+ --gres=gpu:8 --ntasks-per-node=8 \
6
+ python -u main.py \
7
+ --config $work_path/config.yaml --launcher slurm
models/cmp/experiments/rep_learning/alexnet_yfcc_voc_16gpu_70k/validate.sh ADDED
@@ -0,0 +1,6 @@
1
+ #!/bin/bash
2
+ work_path=$(dirname $0)
3
+ python -m torch.distributed.launch --nproc_per_node=8 main.py \
4
+ --config $work_path/config.yaml --launcher pytorch \
5
+ --load-iter 70000 \
6
+ --validate
models/cmp/experiments/rep_learning/alexnet_yfcc_voc_16gpu_70k/validate_slurm.sh ADDED
@@ -0,0 +1,8 @@
1
+ #!/bin/bash
2
+ work_path=$(dirname $0)
3
+ partition=$1
4
+ GLOG_vmodule=MemcachedClient=-1 srun --mpi=pmi2 -p $partition -n8 \
5
+ --gres=gpu:8 --ntasks-per-node=8 \
6
+ python -u main.py --config $work_path/config.yaml --launcher slurm \
7
+ --load-iter 70000 \
8
+ --validate
models/cmp/experiments/rep_learning/alexnet_yfcc_voc_8gpu_140k/config.yaml ADDED
@@ -0,0 +1,58 @@
1
+ model:
2
+ arch: CMP
3
+ total_iter: 140000
4
+ lr_steps: [80000, 120000]
5
+ lr_mults: [0.1, 0.1]
6
+ lr: 0.1
7
+ optim: SGD
8
+ warmup_lr: []
9
+ warmup_steps: []
10
+ module:
11
+ arch: CMP
12
+ image_encoder: alexnet_fcn_32x
13
+ sparse_encoder: shallownet32x
14
+ flow_decoder: MotionDecoderPlain
15
+ skip_layer: False
16
+ img_enc_dim: 256
17
+ sparse_enc_dim: 16
18
+ output_dim: 198
19
+ decoder_combo: [1,2,4]
20
+ pretrained_image_encoder: False
21
+ flow_criterion: "DiscreteLoss"
22
+ nbins: 99
23
+ fmax: 50
24
+ data:
25
+ workers: 2
26
+ batch_size: 12
27
+ batch_size_test: 1
28
+ data_mean: [123.675, 116.28, 103.53] # RGB
29
+ data_div: [58.395, 57.12, 57.375]
30
+ short_size: 416
31
+ crop_size: [384, 384]
32
+ sample_strategy: ['grid', 'watershed']
33
+ sample_bg_ratio: 0.00015625
34
+ nms_ks: 41
35
+ max_num_guide: 150
36
+
37
+ flow_file_type: "jpg"
38
+ image_flow_aug:
39
+ flip: False
40
+ flow_aug:
41
+ reverse: False
42
+ scale: False
43
+ rotate: False
44
+ train_source:
45
+ - data/yfcc/lists/train.txt
46
+ val_source:
47
+ - data/yfcc/lists/val.txt
48
+ memcached: False
49
+ trainer:
50
+ initial_val: True
51
+ print_freq: 100
52
+ val_freq: 10000
53
+ save_freq: 10000
54
+ val_iter: -1
55
+ val_disp_start_iter: 0
56
+ val_disp_end_iter: 16
57
+ loss_record: ['loss_flow']
58
+ tensorboard: False
models/cmp/experiments/rep_learning/alexnet_yfcc_voc_8gpu_140k/resume.sh ADDED
@@ -0,0 +1,6 @@
1
+ #!/bin/bash
2
+ work_path=$(dirname $0)
3
+ python -m torch.distributed.launch --nproc_per_node=8 main.py \
4
+ --config $work_path/config.yaml --launcher pytorch \
5
+ --load-iter 10000 \
6
+ --resume
models/cmp/experiments/rep_learning/alexnet_yfcc_voc_8gpu_140k/resume_slurm.sh ADDED
@@ -0,0 +1,9 @@
1
+ #!/bin/bash
2
+ work_path=$(dirname $0)
3
+ partition=$1
4
+ GLOG_vmodule=MemcachedClient=-1 srun --mpi=pmi2 -p $partition -n8 \
5
+ --gres=gpu:8 --ntasks-per-node=8 \
6
+ python -u main.py \
7
+ --config $work_path/config.yaml --launcher slurm \
8
+ --load-iter 10000 \
9
+ --resume
models/cmp/experiments/rep_learning/alexnet_yfcc_voc_8gpu_140k/train.sh ADDED
@@ -0,0 +1,4 @@
1
+ #!/bin/bash
2
+ work_path=$(dirname $0)
3
+ python -m torch.distributed.launch --nproc_per_node=8 main.py \
4
+ --config $work_path/config.yaml --launcher pytorch
models/cmp/experiments/rep_learning/alexnet_yfcc_voc_8gpu_140k/train_slurm.sh ADDED
@@ -0,0 +1,7 @@
1
+ #!/bin/bash
2
+ work_path=$(dirname $0)
3
+ partition=$1
4
+ GLOG_vmodule=MemcachedClient=-1 srun --mpi=pmi2 -p $partition -n8 \
5
+ --gres=gpu:8 --ntasks-per-node=8 \
6
+ python -u main.py \
7
+ --config $work_path/config.yaml --launcher slurm
models/cmp/experiments/rep_learning/alexnet_yfcc_voc_8gpu_140k/validate.sh ADDED
@@ -0,0 +1,6 @@
1
+ #!/bin/bash
2
+ work_path=$(dirname $0)
3
+ python -m torch.distributed.launch --nproc_per_node=8 main.py \
4
+ --config $work_path/config.yaml --launcher pytorch \
5
+ --load-iter 70000 \
6
+ --validate
models/cmp/experiments/rep_learning/alexnet_yfcc_voc_8gpu_140k/validate_slurm.sh ADDED
@@ -0,0 +1,8 @@
1
+ #!/bin/bash
2
+ work_path=$(dirname $0)
3
+ partition=$1
4
+ GLOG_vmodule=MemcachedClient=-1 srun --mpi=pmi2 -p $partition -n8 \
5
+ --gres=gpu:8 --ntasks-per-node=8 \
6
+ python -u main.py --config $work_path/config.yaml --launcher slurm \
7
+ --load-iter 70000 \
8
+ --validate
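Each *_slurm.sh launcher in these experiment folders takes the target Slurm partition as its only positional argument ($1). A hypothetical invocation (the partition name is a placeholder):

    bash models/cmp/experiments/rep_learning/alexnet_yfcc_voc_8gpu_140k/train_slurm.sh your_partition
    bash models/cmp/experiments/rep_learning/alexnet_yfcc_voc_8gpu_140k/validate_slurm.sh your_partition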
models/cmp/experiments/rep_learning/resnet50_yfcc+youtube+vip+mpii_lip_16gpu_70k/config.yaml ADDED
@@ -0,0 +1,61 @@
+ model:
+     arch: CMP
+     total_iter: 70000
+     lr_steps: [40000, 60000]
+     lr_mults: [0.1, 0.1]
+     lr: 0.1
+     optim: SGD
+     warmup_lr: []
+     warmup_steps: []
+     module:
+         arch: CMP
+         image_encoder: resnet50
+         sparse_encoder: shallownet8x
+         flow_decoder: MotionDecoderPlain
+         skip_layer: False
+         img_enc_dim: 256
+         sparse_enc_dim: 16
+         output_dim: 198
+         decoder_combo: [1,2,4]
+         pretrained_image_encoder: False
+         flow_criterion: "DiscreteLoss"
+         nbins: 99
+         fmax: 50
+ data:
+     workers: 2
+     batch_size: 10
+     batch_size_test: 1
+     data_mean: [123.675, 116.28, 103.53] # RGB
+     data_div: [58.395, 57.12, 57.375]
+     short_size: 416
+     crop_size: [320, 320]
+     sample_strategy: ['grid', 'watershed']
+     sample_bg_ratio: 0.00015625
+     nms_ks: 15
+     max_num_guide: -1
+
+     flow_file_type: "jpg"
+     image_flow_aug:
+         flip: False
+     flow_aug:
+         reverse: False
+         scale: False
+         rotate: False
+     train_source:
+         - data/yfcc/lists/train.txt
+         - data/youtube9000/lists/train.txt
+         - data/VIP/lists/train.txt
+         - data/MPII/lists/train.txt
+     val_source:
+         - data/yfcc/lists/val.txt
+     memcached: False
+ trainer:
+     initial_val: True
+     print_freq: 100
+     val_freq: 10000
+     save_freq: 10000
+     val_iter: -1
+     val_disp_start_iter: 0
+     val_disp_end_iter: 16
+     loss_record: ['loss_flow']
+     tensorboard: False
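This config draws training pairs from four list files (YFCC, YouTube, VIP, MPII), which the data loader presumably concatenates. Before a long run it is worth checking that all of them exist relative to the repository root; a small sketch:

    for f in data/yfcc/lists/train.txt data/youtube9000/lists/train.txt \
             data/VIP/lists/train.txt data/MPII/lists/train.txt; do
        [ -f "$f" ] || echo "missing list file: $f"
    done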
models/cmp/experiments/rep_learning/resnet50_yfcc+youtube+vip+mpii_lip_16gpu_70k/resume.sh ADDED
@@ -0,0 +1,8 @@
+ #!/bin/bash
+ work_path=$(dirname $0)
+ python -m torch.distributed.launch --nproc_per_node=8 \
+ --nnodes=2 --node_rank=$1 \
+ --master_addr="192.168.1.1" main.py \
+ --config $work_path/config.yaml --launcher pytorch \
+ --load-iter 10000 \
+ --resume
models/cmp/experiments/rep_learning/resnet50_yfcc+youtube+vip+mpii_lip_16gpu_70k/resume_slurm.sh ADDED
@@ -0,0 +1,9 @@
+ #!/bin/bash
+ work_path=$(dirname $0)
+ partition=$1
+ GLOG_vmodule=MemcachedClient=-1 srun --mpi=pmi2 -p $partition -n16 \
+ --gres=gpu:8 --ntasks-per-node=8 \
+ python -u main.py \
+ --config $work_path/config.yaml --launcher slurm \
+ --load-iter 10000 \
+ --resume
models/cmp/experiments/rep_learning/resnet50_yfcc+youtube+vip+mpii_lip_16gpu_70k/train.sh ADDED
@@ -0,0 +1,6 @@
+ #!/bin/bash
+ work_path=$(dirname $0)
+ python -m torch.distributed.launch --nproc_per_node=8 \
+ --nnodes=2 --node_rank=$1 \
+ --master_addr="192.168.1.1" main.py \
+ --config $work_path/config.yaml --launcher pytorch
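The 16-GPU pytorch launchers use a two-node setup: $1 is the node rank and 192.168.1.1 is the hard-coded address of the rank-0 machine; since no --master_port is given, torch.distributed.launch falls back to its default (29500). A hypothetical invocation, one command per machine:

    # on the master node (the machine reachable as 192.168.1.1)
    bash models/cmp/experiments/rep_learning/resnet50_yfcc+youtube+vip+mpii_lip_16gpu_70k/train.sh 0
    # on the second node
    bash models/cmp/experiments/rep_learning/resnet50_yfcc+youtube+vip+mpii_lip_16gpu_70k/train.sh 1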
models/cmp/experiments/rep_learning/resnet50_yfcc+youtube+vip+mpii_lip_16gpu_70k/train_slurm.sh ADDED
@@ -0,0 +1,7 @@
+ #!/bin/bash
+ work_path=$(dirname $0)
+ partition=$1
+ GLOG_vmodule=MemcachedClient=-1 srun --mpi=pmi2 -p $partition -n16 \
+ --gres=gpu:8 --ntasks-per-node=8 \
+ python -u main.py \
+ --config $work_path/config.yaml --launcher slurm
models/cmp/experiments/rep_learning/resnet50_yfcc+youtube+vip+mpii_lip_16gpu_70k/validate.sh ADDED
@@ -0,0 +1,6 @@
+ #!/bin/bash
+ work_path=$(dirname $0)
+ python -m torch.distributed.launch --nproc_per_node=8 main.py \
+ --config $work_path/config.yaml --launcher pytorch \
+ --load-iter 70000 \
+ --validate
models/cmp/experiments/rep_learning/resnet50_yfcc+youtube+vip+mpii_lip_16gpu_70k/validate_slurm.sh ADDED
@@ -0,0 +1,8 @@
+ #!/bin/bash
+ work_path=$(dirname $0)
+ partition=$1
+ GLOG_vmodule=MemcachedClient=-1 srun --mpi=pmi2 -p $partition -n8 \
+ --gres=gpu:8 --ntasks-per-node=8 \
+ python -u main.py --config $work_path/config.yaml --launcher slurm \
+ --load-iter 70000 \
+ --validate
models/cmp/experiments/rep_learning/resnet50_yfcc_coco_16gpu_42k/config.yaml ADDED
@@ -0,0 +1,58 @@
+ model:
+     arch: CMP
+     total_iter: 42000
+     lr_steps: [24000, 36000]
+     lr_mults: [0.1, 0.1]
+     lr: 0.1
+     optim: SGD
+     warmup_lr: []
+     warmup_steps: []
+     module:
+         arch: CMP
+         image_encoder: resnet50
+         sparse_encoder: shallownet8x
+         flow_decoder: MotionDecoderPlain
+         skip_layer: False
+         img_enc_dim: 256
+         sparse_enc_dim: 16
+         output_dim: 198
+         decoder_combo: [1,2,4]
+         pretrained_image_encoder: False
+         flow_criterion: "DiscreteLoss"
+         nbins: 99
+         fmax: 50
+ data:
+     workers: 2
+     batch_size: 16
+     batch_size_test: 1
+     data_mean: [123.675, 116.28, 103.53] # RGB
+     data_div: [58.395, 57.12, 57.375]
+     short_size: 333
+     crop_size: [256, 256]
+     sample_strategy: ['grid', 'watershed']
+     sample_bg_ratio: 0.00005632
+     nms_ks: 49
+     max_num_guide: -1
+
+     flow_file_type: "jpg"
+     image_flow_aug:
+         flip: False
+     flow_aug:
+         reverse: False
+         scale: False
+         rotate: False
+     train_source:
+         - data/yfcc/lists/train.txt
+     val_source:
+         - data/yfcc/lists/val.txt
+     memcached: False
+ trainer:
+     initial_val: True
+     print_freq: 100
+     val_freq: 10000
+     save_freq: 10000
+     val_iter: -1
+     val_disp_start_iter: 0
+     val_disp_end_iter: 16
+     loss_record: ['loss_flow']
+     tensorboard: False
models/cmp/experiments/rep_learning/resnet50_yfcc_coco_16gpu_42k/resume.sh ADDED
@@ -0,0 +1,8 @@
+ #!/bin/bash
+ work_path=$(dirname $0)
+ python -m torch.distributed.launch --nproc_per_node=8 \
+ --nnodes=2 --node_rank=$1 \
+ --master_addr="192.168.1.1" main.py \
+ --config $work_path/config.yaml --launcher pytorch \
+ --load-iter 10000 \
+ --resume
models/cmp/experiments/rep_learning/resnet50_yfcc_coco_16gpu_42k/resume_slurm.sh ADDED
@@ -0,0 +1,9 @@
+ #!/bin/bash
+ work_path=$(dirname $0)
+ partition=$1
+ GLOG_vmodule=MemcachedClient=-1 srun --mpi=pmi2 -p $partition -n16 \
+ --gres=gpu:8 --ntasks-per-node=8 \
+ python -u main.py \
+ --config $work_path/config.yaml --launcher slurm \
+ --load-iter 10000 \
+ --resume
models/cmp/experiments/rep_learning/resnet50_yfcc_coco_16gpu_42k/train.sh ADDED
@@ -0,0 +1,6 @@
+ #!/bin/bash
+ work_path=$(dirname $0)
+ python -m torch.distributed.launch --nproc_per_node=8 \
+ --nnodes=2 --node_rank=$1 \
+ --master_addr="192.168.1.1" main.py \
+ --config $work_path/config.yaml --launcher pytorch
models/cmp/experiments/rep_learning/resnet50_yfcc_coco_16gpu_42k/train_slurm.sh ADDED
@@ -0,0 +1,7 @@
+ #!/bin/bash
+ work_path=$(dirname $0)
+ partition=$1
+ GLOG_vmodule=MemcachedClient=-1 srun --mpi=pmi2 -p $partition -n16 \
+ --gres=gpu:8 --ntasks-per-node=8 \
+ python -u main.py \
+ --config $work_path/config.yaml --launcher slurm
models/cmp/experiments/rep_learning/resnet50_yfcc_coco_16gpu_42k/validate.sh ADDED
@@ -0,0 +1,6 @@
+ #!/bin/bash
+ work_path=$(dirname $0)
+ python -m torch.distributed.launch --nproc_per_node=8 main.py \
+ --config $work_path/config.yaml --launcher pytorch \
+ --load-iter 42000 \
+ --validate
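For this 42 000-iteration schedule with save_freq 10000, checkpoints would be expected at iterations 10000 through 40000, and the trainer apparently also writes one at the final iteration, judging by the ckpt_iter_42000.pth.tar shipped with the semiauto_annot experiment further below. To see which iterations are actually available before validating (the checkpoints/ directory name is assumed from that experiment's layout):

    ls models/cmp/experiments/rep_learning/resnet50_yfcc_coco_16gpu_42k/checkpoints/ 2>/dev/null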
models/cmp/experiments/rep_learning/resnet50_yfcc_coco_16gpu_42k/validate_slurm.sh ADDED
@@ -0,0 +1,8 @@
+ #!/bin/bash
+ work_path=$(dirname $0)
+ partition=$1
+ GLOG_vmodule=MemcachedClient=-1 srun --mpi=pmi2 -p $partition -n8 \
+ --gres=gpu:8 --ntasks-per-node=8 \
+ python -u main.py --config $work_path/config.yaml --launcher slurm \
+ --load-iter 42000 \
+ --validate
models/cmp/experiments/rep_learning/resnet50_yfcc_voc_16gpu_42k/config.yaml ADDED
@@ -0,0 +1,58 @@
+ model:
+     arch: CMP
+     total_iter: 42000
+     lr_steps: [24000, 36000]
+     lr_mults: [0.1, 0.1]
+     lr: 0.1
+     optim: SGD
+     warmup_lr: []
+     warmup_steps: []
+     module:
+         arch: CMP
+         image_encoder: resnet50
+         sparse_encoder: shallownet8x
+         flow_decoder: MotionDecoderPlain
+         skip_layer: False
+         img_enc_dim: 256
+         sparse_enc_dim: 16
+         output_dim: 198
+         decoder_combo: [1,2,4]
+         pretrained_image_encoder: False
+         flow_criterion: "DiscreteLoss"
+         nbins: 99
+         fmax: 50
+ data:
+     workers: 2
+     batch_size: 10
+     batch_size_test: 1
+     data_mean: [123.675, 116.28, 103.53] # RGB
+     data_div: [58.395, 57.12, 57.375]
+     short_size: 416
+     crop_size: [320, 320]
+     sample_strategy: ['grid', 'watershed']
+     sample_bg_ratio: 0.00003629
+     nms_ks: 67
+     max_num_guide: -1
+
+     flow_file_type: "jpg"
+     image_flow_aug:
+         flip: False
+     flow_aug:
+         reverse: False
+         scale: False
+         rotate: False
+     train_source:
+         - data/yfcc/lists/train.txt
+     val_source:
+         - data/yfcc/lists/val.txt
+     memcached: False
+ trainer:
+     initial_val: True
+     print_freq: 100
+     val_freq: 10000
+     save_freq: 10000
+     val_iter: -1
+     val_disp_start_iter: 0
+     val_disp_end_iter: 16
+     loss_record: ['loss_flow']
+     tensorboard: False
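For scale, with batch_size 10 per process and the 16 processes implied by the directory name (the process count is not stated in the config itself), the training roughly works out to:

    effective batch: 10 samples/process x 16 processes = 160 samples per SGD step
    learning rate:   0.1 -> 0.01 at iteration 24000 -> 0.001 at iteration 36000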
models/cmp/experiments/rep_learning/resnet50_yfcc_voc_16gpu_42k/resume.sh ADDED
@@ -0,0 +1,8 @@
+ #!/bin/bash
+ work_path=$(dirname $0)
+ python -m torch.distributed.launch --nproc_per_node=8 \
+ --nnodes=2 --node_rank=$1 \
+ --master_addr="192.168.1.1" main.py \
+ --config $work_path/config.yaml --launcher pytorch \
+ --load-iter 10000 \
+ --resume
models/cmp/experiments/rep_learning/resnet50_yfcc_voc_16gpu_42k/resume_slurm.sh ADDED
@@ -0,0 +1,9 @@
+ #!/bin/bash
+ work_path=$(dirname $0)
+ partition=$1
+ GLOG_vmodule=MemcachedClient=-1 srun --mpi=pmi2 -p $partition -n16 \
+ --gres=gpu:8 --ntasks-per-node=8 \
+ python -u main.py \
+ --config $work_path/config.yaml --launcher slurm \
+ --load-iter 10000 \
+ --resume
models/cmp/experiments/rep_learning/resnet50_yfcc_voc_16gpu_42k/train.sh ADDED
@@ -0,0 +1,6 @@
+ #!/bin/bash
+ work_path=$(dirname $0)
+ python -m torch.distributed.launch --nproc_per_node=8 \
+ --nnodes=2 --node_rank=$1 \
+ --master_addr="192.168.1.1" main.py \
+ --config $work_path/config.yaml --launcher pytorch
models/cmp/experiments/rep_learning/resnet50_yfcc_voc_16gpu_42k/train_slurm.sh ADDED
@@ -0,0 +1,7 @@
+ #!/bin/bash
+ work_path=$(dirname $0)
+ partition=$1
+ GLOG_vmodule=MemcachedClient=-1 srun --mpi=pmi2 -p $partition -n16 \
+ --gres=gpu:8 --ntasks-per-node=8 \
+ python -u main.py \
+ --config $work_path/config.yaml --launcher slurm
models/cmp/experiments/rep_learning/resnet50_yfcc_voc_16gpu_42k/validate.sh ADDED
@@ -0,0 +1,6 @@
+ #!/bin/bash
+ work_path=$(dirname $0)
+ python -m torch.distributed.launch --nproc_per_node=8 main.py \
+ --config $work_path/config.yaml --launcher pytorch \
+ --load-iter 42000 \
+ --validate
models/cmp/experiments/rep_learning/resnet50_yfcc_voc_16gpu_42k/validate_slurm.sh ADDED
@@ -0,0 +1,8 @@
+ #!/bin/bash
+ work_path=$(dirname $0)
+ partition=$1
+ GLOG_vmodule=MemcachedClient=-1 srun --mpi=pmi2 -p $partition -n8 \
+ --gres=gpu:8 --ntasks-per-node=8 \
+ python -u main.py --config $work_path/config.yaml --launcher slurm \
+ --load-iter 42000 \
+ --validate
models/cmp/experiments/semiauto_annot/resnet50_vip+mpii_liteflow/checkpoints/ckpt_iter_42000.pth.tar ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cd3a385e227c29f89b5c7c6f4c89d356f6022fa7fcfc71ab1bd40e9833048dd6
+ size 228465722
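This checkpoint is stored as a Git LFS pointer (about 228 MB once resolved); after cloning, the actual weights can be fetched with git-lfs, e.g.:

    git lfs install
    git lfs pull --include="models/cmp/experiments/semiauto_annot/resnet50_vip+mpii_liteflow/checkpoints/ckpt_iter_42000.pth.tar"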
models/cmp/experiments/semiauto_annot/resnet50_vip+mpii_liteflow/config.yaml ADDED
@@ -0,0 +1,59 @@
+ model:
+     arch: CMP
+     total_iter: 42000
+     lr_steps: [24000, 36000]
+     lr_mults: [0.1, 0.1]
+     lr: 0.1
+     optim: SGD
+     warmup_lr: []
+     warmup_steps: []
+     module:
+         arch: CMP
+         image_encoder: resnet50
+         sparse_encoder: shallownet8x
+         flow_decoder: MotionDecoderSkipLayer
+         skip_layer: True
+         img_enc_dim: 256
+         sparse_enc_dim: 16
+         output_dim: 198
+         decoder_combo: [1,2,4]
+         pretrained_image_encoder: False
+         flow_criterion: "DiscreteLoss"
+         nbins: 99
+         fmax: 50
+ data:
+     workers: 2
+     batch_size: 8
+     batch_size_test: 1
+     data_mean: [123.675, 116.28, 103.53] # RGB
+     data_div: [58.395, 57.12, 57.375]
+     short_size: 416
+     crop_size: [384, 384]
+     sample_strategy: ['grid', 'watershed']
+     sample_bg_ratio: 5.74e-5
+     nms_ks: 41
+     max_num_guide: -1
+
+     flow_file_type: "jpg"
+     image_flow_aug:
+         flip: False
+     flow_aug:
+         reverse: False
+         scale: False
+         rotate: False
+     train_source:
+         - data/VIP/lists/train.txt
+         - data/MPII/lists/train.txt
+     val_source:
+         - data/VIP/lists/randval.txt
+     memcached: False
+ trainer:
+     initial_val: True
+     print_freq: 100
+     val_freq: 5000
+     save_freq: 5000
+     val_iter: -1
+     val_disp_start_iter: 0
+     val_disp_end_iter: 16
+     loss_record: ['loss_flow']
+     tensorboard: True
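A minimal sketch for inspecting the shipped checkpoint alongside this config, assuming it is a regular torch-serialized dict (the exact key layout is not documented here):

    python -c "
    import torch
    ckpt = torch.load('models/cmp/experiments/semiauto_annot/resnet50_vip+mpii_liteflow/checkpoints/ckpt_iter_42000.pth.tar', map_location='cpu')
    print(type(ckpt))
    if isinstance(ckpt, dict):
        print(sorted(ckpt.keys()))
    "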
models/cmp/experiments/semiauto_annot/resnet50_vip+mpii_liteflow/resume.sh ADDED
@@ -0,0 +1,6 @@
+ #!/bin/bash
+ work_path=$(dirname $0)
+ python -m torch.distributed.launch --nproc_per_node=8 main.py \
+ --config $work_path/config.yaml --launcher pytorch \
+ --load-iter 10000 \
+ --resume
models/cmp/experiments/semiauto_annot/resnet50_vip+mpii_liteflow/resume_slurm.sh ADDED
@@ -0,0 +1,9 @@
+ #!/bin/bash
+ work_path=$(dirname $0)
+ partition=$1
+ GLOG_vmodule=MemcachedClient=-1 srun --mpi=pmi2 -p $partition -n8 \
+ --gres=gpu:8 --ntasks-per-node=8 \
+ python -u main.py \
+ --config $work_path/config.yaml --launcher slurm \
+ --load-iter 10000 \
+ --resume