Spaces: Running on Zero

myniu committed · Commit e9f1b91 · Parent(s): bfb52d0

init
app.py CHANGED
@@ -89,79 +89,6 @@ def get_sparseflow_and_mask_forward(
     return s_flow, mask


-@spaces.GPU(duration=200)
-def init_models(pretrained_model_name_or_path, resume_from_checkpoint, weight_dtype, device='cuda', enable_xformers_memory_efficient_attention=False, allow_tf32=False):
-
-    from models.unet_spatio_temporal_condition_controlnet import UNetSpatioTemporalConditionControlNetModel
-    from pipeline.pipeline import FlowControlNetPipeline
-    from models.svdxt_featureflow_forward_controlnet_s2d_fixcmp_norefine import FlowControlNet, CMP_demo
-
-    print('start loading models...')
-    # Load scheduler, tokenizer and models.
-    image_encoder = CLIPVisionModelWithProjection.from_pretrained(
-        pretrained_model_name_or_path, subfolder="image_encoder", revision=None, variant="fp16"
-    )
-    vae = AutoencoderKLTemporalDecoder.from_pretrained(
-        pretrained_model_name_or_path, subfolder="vae", revision=None, variant="fp16")
-    unet = UNetSpatioTemporalConditionControlNetModel.from_pretrained(
-        pretrained_model_name_or_path,
-        subfolder="unet",
-        low_cpu_mem_usage=True,
-        variant="fp16",
-    )
-
-    controlnet = FlowControlNet.from_pretrained(resume_from_checkpoint)
-
-    cmp = CMP_demo(
-        './models/cmp/experiments/semiauto_annot/resnet50_vip+mpii_liteflow/config.yaml',
-        42000
-    ).to(device)
-    cmp.requires_grad_(False)
-
-    # Freeze vae and image_encoder
-    vae.requires_grad_(False)
-    image_encoder.requires_grad_(False)
-    unet.requires_grad_(False)
-    controlnet.requires_grad_(False)
-
-    # Move image_encoder and vae to gpu and cast to weight_dtype
-    image_encoder.to(device, dtype=weight_dtype)
-    vae.to(device, dtype=weight_dtype)
-    unet.to(device, dtype=weight_dtype)
-    controlnet.to(device, dtype=weight_dtype)
-
-    if enable_xformers_memory_efficient_attention:
-        if is_xformers_available():
-            import xformers
-
-            xformers_version = version.parse(xformers.__version__)
-            if xformers_version == version.parse("0.0.16"):
-                print(
-                    "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
-                )
-            unet.enable_xformers_memory_efficient_attention()
-        else:
-            raise ValueError(
-                "xformers is not available. Make sure it is installed correctly")
-
-    if allow_tf32:
-        torch.backends.cuda.matmul.allow_tf32 = True
-
-    pipeline = FlowControlNetPipeline.from_pretrained(
-        pretrained_model_name_or_path,
-        unet=unet,
-        controlnet=controlnet,
-        image_encoder=image_encoder,
-        vae=vae,
-        torch_dtype=weight_dtype,
-    )
-    pipeline = pipeline.to(device)
-
-    print('models loaded.')
-
-    return pipeline, cmp
-
-
 def interpolate_trajectory(points, n_points):
     x = [point[0] for point in points]
     y = [point[1] for point in points]
@@ -236,15 +163,8 @@ with gr.Blocks() as demo:
     )

     height, width = 512, 512
-
-
-
-    pipeline, cmp = init_models(
-        svd_ckpt,
-        mofa_ckpt,
-        weight_dtype=torch.float16,
-        device='cuda'
-    )
+
+    pipeline, cmp = None, None

     first_frame_path = gr.State()
     tracking_points = gr.State([])
@@ -253,6 +173,78 @@ with gr.Blocks() as demo:
     motion_brush_viz = gr.State()
     inference_batch_size = gr.State(1)

+    @spaces.GPU(duration=100)
+    def init_models(pretrained_model_name_or_path="ckpts/stable-video-diffusion-img2vid-xt-1-1", resume_from_checkpoint="ckpts/controlnet", weight_dtype=torch.float16, device='cuda', enable_xformers_memory_efficient_attention=False, allow_tf32=False):
+
+        from models.unet_spatio_temporal_condition_controlnet import UNetSpatioTemporalConditionControlNetModel
+        from pipeline.pipeline import FlowControlNetPipeline
+        from models.svdxt_featureflow_forward_controlnet_s2d_fixcmp_norefine import FlowControlNet, CMP_demo
+
+        print('start loading models...')
+        # Load scheduler, tokenizer and models.
+        image_encoder = CLIPVisionModelWithProjection.from_pretrained(
+            pretrained_model_name_or_path, subfolder="image_encoder", revision=None, variant="fp16"
+        )
+        vae = AutoencoderKLTemporalDecoder.from_pretrained(
+            pretrained_model_name_or_path, subfolder="vae", revision=None, variant="fp16")
+        unet = UNetSpatioTemporalConditionControlNetModel.from_pretrained(
+            pretrained_model_name_or_path,
+            subfolder="unet",
+            low_cpu_mem_usage=True,
+            variant="fp16",
+        )
+
+        controlnet = FlowControlNet.from_pretrained(resume_from_checkpoint)
+
+        cmp = CMP_demo(
+            './models/cmp/experiments/semiauto_annot/resnet50_vip+mpii_liteflow/config.yaml',
+            42000
+        ).to(device)
+        cmp.requires_grad_(False)
+
+        # Freeze vae and image_encoder
+        vae.requires_grad_(False)
+        image_encoder.requires_grad_(False)
+        unet.requires_grad_(False)
+        controlnet.requires_grad_(False)
+
+        # Move image_encoder and vae to gpu and cast to weight_dtype
+        image_encoder.to(device, dtype=weight_dtype)
+        vae.to(device, dtype=weight_dtype)
+        unet.to(device, dtype=weight_dtype)
+        controlnet.to(device, dtype=weight_dtype)
+
+        if enable_xformers_memory_efficient_attention:
+            if is_xformers_available():
+                import xformers
+
+                xformers_version = version.parse(xformers.__version__)
+                if xformers_version == version.parse("0.0.16"):
+                    print(
+                        "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
+                    )
+                unet.enable_xformers_memory_efficient_attention()
+            else:
+                raise ValueError(
+                    "xformers is not available. Make sure it is installed correctly")
+
+        if allow_tf32:
+            torch.backends.cuda.matmul.allow_tf32 = True
+
+        pipeline = FlowControlNetPipeline.from_pretrained(
+            pretrained_model_name_or_path,
+            unet=unet,
+            controlnet=controlnet,
+            image_encoder=image_encoder,
+            vae=vae,
+            torch_dtype=weight_dtype,
+        )
+        pipeline = pipeline.to(device)
+
+        print('models loaded.')
+
+        return pipeline, cmp
+
     def get_cmp_flow(frames, sparse_optical_flow, mask, brush_mask=None):

         '''
@@ -652,8 +644,11 @@ with gr.Blocks() as demo:

         return hint_path, outputs_path, flows_path, outputs_mp4_path, flows_mp4_path

+    @spaces.GPU(duration=100)
     def preprocess_image(image):

+        pipeline, cmp = init_models()
+
         image_pil = image2pil(image.name)
         raw_w, raw_h = image_pil.size

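Context for the change above: on a ZeroGPU Space ("Running on Zero"), no GPU is attached while the Gradio app is built, so CUDA work at import or Blocks-construction time fails; a GPU is leased only while a function decorated with @spaces.GPU runs. That is why the commit replaces the eager pipeline, cmp = init_models(...) at build time with pipeline, cmp = None, None and moves model loading into the decorated init_models(), called from preprocess_image(). A minimal, self-contained sketch of that pattern (the gpu_matmul demo is illustrative only, not code from this repo):

import gradio as gr
import spaces  # ZeroGPU helper package available on Hugging Face Spaces
import torch

@spaces.GPU(duration=100)  # lease a GPU for up to 100 s per call
def gpu_matmul(n):
    # On ZeroGPU, device='cuda' is valid only inside a @spaces.GPU function,
    # which mirrors why init_models() now runs inside the request path.
    a = torch.randn(int(n), int(n), device='cuda')
    b = torch.randn(int(n), int(n), device='cuda')
    return f"sum = {(a @ b).sum().item():.3f}"

with gr.Blocks() as demo:
    size = gr.Number(value=1024, label="matrix size")
    result = gr.Textbox(label="result")
    gr.Button("run on GPU").click(gpu_matmul, inputs=size, outputs=result)

demo.launch()

Note the trade-off: as diffed, preprocess_image() calls init_models() on every invocation, so the models are rebuilt per request unless cached elsewhere; the duration=100 lease (down from the removed duration=200) budgets each call accordingly.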