myniu committed
Commit e9f1b91
1 Parent(s): bfb52d0
Files changed (1)
  1. app.py +77 -82
app.py CHANGED
@@ -89,79 +89,6 @@ def get_sparseflow_and_mask_forward(
     return s_flow, mask
 
 
-@spaces.GPU(duration=200)
-def init_models(pretrained_model_name_or_path, resume_from_checkpoint, weight_dtype, device='cuda', enable_xformers_memory_efficient_attention=False, allow_tf32=False):
-
-    from models.unet_spatio_temporal_condition_controlnet import UNetSpatioTemporalConditionControlNetModel
-    from pipeline.pipeline import FlowControlNetPipeline
-    from models.svdxt_featureflow_forward_controlnet_s2d_fixcmp_norefine import FlowControlNet, CMP_demo
-
-    print('start loading models...')
-    # Load scheduler, tokenizer and models.
-    image_encoder = CLIPVisionModelWithProjection.from_pretrained(
-        pretrained_model_name_or_path, subfolder="image_encoder", revision=None, variant="fp16"
-    )
-    vae = AutoencoderKLTemporalDecoder.from_pretrained(
-        pretrained_model_name_or_path, subfolder="vae", revision=None, variant="fp16")
-    unet = UNetSpatioTemporalConditionControlNetModel.from_pretrained(
-        pretrained_model_name_or_path,
-        subfolder="unet",
-        low_cpu_mem_usage=True,
-        variant="fp16",
-    )
-
-    controlnet = FlowControlNet.from_pretrained(resume_from_checkpoint)
-
-    cmp = CMP_demo(
-        './models/cmp/experiments/semiauto_annot/resnet50_vip+mpii_liteflow/config.yaml',
-        42000
-    ).to(device)
-    cmp.requires_grad_(False)
-
-    # Freeze vae and image_encoder
-    vae.requires_grad_(False)
-    image_encoder.requires_grad_(False)
-    unet.requires_grad_(False)
-    controlnet.requires_grad_(False)
-
-    # Move image_encoder and vae to gpu and cast to weight_dtype
-    image_encoder.to(device, dtype=weight_dtype)
-    vae.to(device, dtype=weight_dtype)
-    unet.to(device, dtype=weight_dtype)
-    controlnet.to(device, dtype=weight_dtype)
-
-    if enable_xformers_memory_efficient_attention:
-        if is_xformers_available():
-            import xformers
-
-            xformers_version = version.parse(xformers.__version__)
-            if xformers_version == version.parse("0.0.16"):
-                print(
-                    "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
-                )
-            unet.enable_xformers_memory_efficient_attention()
-        else:
-            raise ValueError(
-                "xformers is not available. Make sure it is installed correctly")
-
-    if allow_tf32:
-        torch.backends.cuda.matmul.allow_tf32 = True
-
-    pipeline = FlowControlNetPipeline.from_pretrained(
-        pretrained_model_name_or_path,
-        unet=unet,
-        controlnet=controlnet,
-        image_encoder=image_encoder,
-        vae=vae,
-        torch_dtype=weight_dtype,
-    )
-    pipeline = pipeline.to(device)
-
-    print('models loaded.')
-
-    return pipeline, cmp
-
-
 def interpolate_trajectory(points, n_points):
     x = [point[0] for point in points]
     y = [point[1] for point in points]
@@ -236,15 +163,8 @@ with gr.Blocks() as demo:
     )
 
     height, width = 512, 512
-    svd_ckpt = "ckpts/stable-video-diffusion-img2vid-xt-1-1"
-    mofa_ckpt = "ckpts/controlnet"
-
-    pipeline, cmp = init_models(
-        svd_ckpt,
-        mofa_ckpt,
-        weight_dtype=torch.float16,
-        device='cuda'
-    )
+
+    pipeline, cmp = None, None
 
     first_frame_path = gr.State()
     tracking_points = gr.State([])
@@ -253,6 +173,78 @@ with gr.Blocks() as demo:
     motion_brush_viz = gr.State()
     inference_batch_size = gr.State(1)
 
+    @spaces.GPU(duration=100)
+    def init_models(pretrained_model_name_or_path="ckpts/stable-video-diffusion-img2vid-xt-1-1", resume_from_checkpoint="ckpts/controlnet", weight_dtype=torch.float16, device='cuda', enable_xformers_memory_efficient_attention=False, allow_tf32=False):
+
+        from models.unet_spatio_temporal_condition_controlnet import UNetSpatioTemporalConditionControlNetModel
+        from pipeline.pipeline import FlowControlNetPipeline
+        from models.svdxt_featureflow_forward_controlnet_s2d_fixcmp_norefine import FlowControlNet, CMP_demo
+
+        print('start loading models...')
+        # Load scheduler, tokenizer and models.
+        image_encoder = CLIPVisionModelWithProjection.from_pretrained(
+            pretrained_model_name_or_path, subfolder="image_encoder", revision=None, variant="fp16"
+        )
+        vae = AutoencoderKLTemporalDecoder.from_pretrained(
+            pretrained_model_name_or_path, subfolder="vae", revision=None, variant="fp16")
+        unet = UNetSpatioTemporalConditionControlNetModel.from_pretrained(
+            pretrained_model_name_or_path,
+            subfolder="unet",
+            low_cpu_mem_usage=True,
+            variant="fp16",
+        )
+
+        controlnet = FlowControlNet.from_pretrained(resume_from_checkpoint)
+
+        cmp = CMP_demo(
+            './models/cmp/experiments/semiauto_annot/resnet50_vip+mpii_liteflow/config.yaml',
+            42000
+        ).to(device)
+        cmp.requires_grad_(False)
+
+        # Freeze vae and image_encoder
+        vae.requires_grad_(False)
+        image_encoder.requires_grad_(False)
+        unet.requires_grad_(False)
+        controlnet.requires_grad_(False)
+
+        # Move image_encoder and vae to gpu and cast to weight_dtype
+        image_encoder.to(device, dtype=weight_dtype)
+        vae.to(device, dtype=weight_dtype)
+        unet.to(device, dtype=weight_dtype)
+        controlnet.to(device, dtype=weight_dtype)
+
+        if enable_xformers_memory_efficient_attention:
+            if is_xformers_available():
+                import xformers
+
+                xformers_version = version.parse(xformers.__version__)
+                if xformers_version == version.parse("0.0.16"):
+                    print(
+                        "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
+                    )
+                unet.enable_xformers_memory_efficient_attention()
+            else:
+                raise ValueError(
+                    "xformers is not available. Make sure it is installed correctly")
+
+        if allow_tf32:
+            torch.backends.cuda.matmul.allow_tf32 = True
+
+        pipeline = FlowControlNetPipeline.from_pretrained(
+            pretrained_model_name_or_path,
+            unet=unet,
+            controlnet=controlnet,
+            image_encoder=image_encoder,
+            vae=vae,
+            torch_dtype=weight_dtype,
+        )
+        pipeline = pipeline.to(device)
+
+        print('models loaded.')
+
+        return pipeline, cmp
+
     def get_cmp_flow(frames, sparse_optical_flow, mask, brush_mask=None):
 
         '''
@@ -652,8 +644,11 @@ with gr.Blocks() as demo:
 
         return hint_path, outputs_path, flows_path, outputs_mp4_path, flows_mp4_path
 
+    @spaces.GPU(duration=100)
     def preprocess_image(image):
 
+        pipeline, cmp = init_models()
+
         image_pil = image2pil(image.name)
         raw_w, raw_h = image_pil.size
 
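Note on the pattern used in this commit: model construction is deferred to an `@spaces.GPU`-decorated `init_models()` that is invoked from inside the GPU-allocated `preprocess_image` handler rather than at import time. Below is a minimal sketch of that ZeroGPU lazy-initialization pattern, assuming the Hugging Face `spaces` package; `build_pipeline` and `handler` are illustrative names, not part of this repo.

import spaces
import torch

def build_pipeline(device='cuda', dtype=torch.float16):
    # Hypothetical stand-in for the heavy loading done by init_models():
    # the model is only built when the call actually runs on a GPU worker.
    model = torch.nn.Linear(8, 8).to(device, dtype=dtype)
    model.requires_grad_(False)
    return model

@spaces.GPU(duration=100)
def handler(x):
    # Models are constructed inside the GPU-allocated call, not at module import.
    pipeline = build_pipeline()
    inp = torch.tensor(x, device='cuda', dtype=torch.float16)
    return pipeline(inp).tolist()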