heheyas committed
Commit b1531dc
1 Parent(s): 3667a5a

update app.py

Files changed (2)
  1. app.py +107 -139
  2. app_bkp.py +294 -0
app.py CHANGED
@@ -25,6 +25,7 @@ from glob import glob
 from mediapy import write_video
 from pathlib import Path
 import spaces
+from huggingface_hub import hf_hub_download
 
 
 @spaces.GPU
@@ -142,153 +143,120 @@ def do_sample(
     return video_path
 
 
+@spaces.GPU
 def change_model_params(model, min_cfg, max_cfg):
     model.sampler.guider.max_scale = max_cfg
     model.sampler.guider.min_scale = min_cfg
 
 
-@spaces.GPU
-def launch(device="cuda", share=False):
-    model_config = "./scripts/pub/configs/V3D_512.yaml"
-    num_frames = OmegaConf.load(
-        model_config
-    ).model.params.sampler_config.params.guider_config.params.num_frames
-    print("Detected num_frames:", num_frames)
-    # num_steps = default(num_steps, 25)
-    num_steps = 25
-    output_folder = "outputs/V3D_512"
-
-    sd = load_safetensors("./ckpts/svd_xt.safetensors")
-    clip_model_config = OmegaConf.load("./configs/embedder/clip_image.yaml")
-    clip_model = instantiate_from_config(clip_model_config).eval()
-    clip_sd = dict()
-    for k, v in sd.items():
-        if "conditioner.embedders.0" in k:
-            clip_sd[k.replace("conditioner.embedders.0.", "")] = v
-    clip_model.load_state_dict(clip_sd)
-    clip_model = clip_model.to(device)
-
-    ae_model_config = OmegaConf.load("./configs/ae/video.yaml")
-    ae_model = instantiate_from_config(ae_model_config).eval()
-    encoder_sd = dict()
-    for k, v in sd.items():
-        if "first_stage_model" in k:
-            encoder_sd[k.replace("first_stage_model.", "")] = v
-    ae_model.load_state_dict(encoder_sd)
-    ae_model = ae_model.to(device)
-    rembg_session = rembg.new_session()
-
-    model, _ = load_model(
-        model_config, device, num_frames, num_steps, min_cfg=3.5, max_cfg=3.5
-    )
-
-    with gr.Blocks(title="V3D", theme=gr.themes.Monochrome()) as demo:
-        with gr.Row(equal_height=True):
-            with gr.Column():
-                input_image = gr.Image(value=None, label="Input Image")
-
-                border_ratio_slider = gr.Slider(
-                    value=0.3,
-                    label="Border Ratio",
-                    minimum=0.05,
-                    maximum=0.5,
-                    step=0.05,
-                )
-                decoding_t_slider = gr.Slider(
-                    value=1,
-                    label="Number of Decoding frames",
-                    minimum=1,
-                    maximum=num_frames,
-                    step=1,
-                )
-                min_guidance_slider = gr.Slider(
-                    value=3.5,
-                    label="Min CFG Value",
-                    minimum=0.05,
-                    maximum=0.5,
-                    step=0.05,
-                )
-                max_guidance_slider = gr.Slider(
-                    value=3.5,
-                    label="Max CFG Value",
-                    minimum=0.05,
-                    maximum=0.5,
-                    step=0.05,
-                )
-                run_button = gr.Button(value="Run V3D")
-
-            with gr.Column():
-                output_video = gr.Video(value=None, label="Output Orbit Video")
-
-        @run_button.click(
-            inputs=[
-                input_image,
-                border_ratio_slider,
-                min_guidance_slider,
-                max_guidance_slider,
-                decoding_t_slider,
-            ],
-            outputs=[output_video],
-        )
-        def _(image, border_ratio, min_guidance, max_guidance, decoding_t):
-            change_model_params(model, min_guidance, max_guidance)
-            return do_sample(
-                image,
-                model,
-                clip_model,
-                ae_model,
-                device,
-                num_frames,
-                num_steps,
-                int(decoding_t),
-                border_ratio,
-                False,
-                rembg_session,
-                output_folder,
-            )
-
-        # do_sample(
-        #     np.asarray(Image.open("assets/baby_yoda.png")),
-        #     model,
-        #     clip_model,
-        #     ae_model,
-        #     device,
-        #     num_frames,
-        #     num_steps,
-        #     1,
-        #     0.3,
-        #     False,
-        #     rembg_session,
-        #     output_folder,
-        # )
-    demo.launch(inbrowser=True, inline=False, share=share, show_error=True)
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--device", type=str, default="cuda")
-    parser.add_argument("--share", action="store_true")
-
-    opt = parser.parse_args()
-
-    test = OmegaConf.load("./scripts/pub/configs/V3D_512.yaml")
-    print(test)
-
-    def download_if_need(path, url):
-        if Path(path).exists():
-            return
-        import wget
-
-        path = Path(path)
-        path.parent.mkdir(parents=True, exist_ok=True)
-        wget.download(url, out=str(path))
-
-    # download_if_need(
-    #     "ckpts/svd_xt.safetensors",
-    #     "https://huggingface.co/stabilityai/stable-video-diffusion-img2vid-xt/resolve/main/svd_xt.safetensors",
-    # )
-    # download_if_need(
-    #     "ckpts/V3D_512.ckpt", "https://huggingface.co/heheyas/V3D/resolve/main/V3D.ckpt"
-    # )
-
-    launch(opt.device, opt.share)
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+# download
+V3D_ckpt_path = hf_hub_download(repo_id="heheyas/V3D", filename="V3D.ckpt")
+svd_xt_ckpt_path = hf_hub_download(
+    repo_id="stabilityai/stable-video-diffusion-img2vid-xt",
+    filename="svd_xt.safetensors",
+)
+
+model_config = "./scripts/pub/configs/V3D_512.yaml"
+num_frames = OmegaConf.load(
+    model_config
+).model.params.sampler_config.params.guider_config.params.num_frames
+print("Detected num_frames:", num_frames)
+# num_steps = default(num_steps, 25)
+num_steps = 25
+output_folder = "outputs/V3D_512"
+
+sd = load_safetensors(svd_xt_ckpt_path)
+clip_model_config = OmegaConf.load("./configs/embedder/clip_image.yaml")
+clip_model = instantiate_from_config(clip_model_config).eval()
+clip_sd = dict()
+for k, v in sd.items():
+    if "conditioner.embedders.0" in k:
+        clip_sd[k.replace("conditioner.embedders.0.", "")] = v
+clip_model.load_state_dict(clip_sd)
+clip_model = clip_model.to(device)
+
+ae_model_config = OmegaConf.load("./configs/ae/video.yaml")
+ae_model = instantiate_from_config(ae_model_config).eval()
+encoder_sd = dict()
+for k, v in sd.items():
+    if "first_stage_model" in k:
+        encoder_sd[k.replace("first_stage_model.", "")] = v
+ae_model.load_state_dict(encoder_sd)
+ae_model = ae_model.to(device)
+rembg_session = rembg.new_session()
+
+model_config.model.params.ckpt_path = V3D_ckpt_path
+model, _ = load_model(
+    model_config, device, num_frames, num_steps, min_cfg=3.5, max_cfg=3.5
+)
+model = model.to(device)
+
+with gr.Blocks(title="V3D", theme=gr.themes.Monochrome()) as demo:
+    with gr.Row(equal_height=True):
+        with gr.Column():
+            input_image = gr.Image(value=None, label="Input Image")
+
+            border_ratio_slider = gr.Slider(
+                value=0.3,
+                label="Border Ratio",
+                minimum=0.05,
+                maximum=0.5,
+                step=0.05,
+            )
+            decoding_t_slider = gr.Slider(
+                value=1,
+                label="Number of Decoding frames",
+                minimum=1,
+                maximum=num_frames,
+                step=1,
+            )
+            min_guidance_slider = gr.Slider(
+                value=3.5,
+                label="Min CFG Value",
+                minimum=0.05,
+                maximum=0.5,
+                step=0.05,
+            )
+            max_guidance_slider = gr.Slider(
+                value=3.5,
+                label="Max CFG Value",
+                minimum=0.05,
+                maximum=0.5,
+                step=0.05,
+            )
+            run_button = gr.Button(value="Run V3D")
+
+        with gr.Column():
+            output_video = gr.Video(value=None, label="Output Orbit Video")
+
+    @run_button.click(
+        inputs=[
+            input_image,
+            border_ratio_slider,
+            min_guidance_slider,
+            max_guidance_slider,
+            decoding_t_slider,
+        ],
+        outputs=[output_video],
+    )
+    def _(image, border_ratio, min_guidance, max_guidance, decoding_t):
+        change_model_params(model, min_guidance, max_guidance)
+        return do_sample(
+            image,
+            model,
+            clip_model,
+            ae_model,
+            device,
+            num_frames,
+            num_steps,
+            int(decoding_t),
+            border_ratio,
+            False,
+            rembg_session,
+            output_folder,
+        )
+
+
+demo.launch()
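
The new app.py follows the usual ZeroGPU layout for Hugging Face Spaces: checkpoints are pulled from the Hub with hf_hub_download (which returns a cached local path), every model is constructed once at import time, and only the GPU-touching functions (do_sample, change_model_params) carry the @spaces.GPU decorator, so a GPU is attached just while they run. A minimal sketch of that pattern, where build_model is a hypothetical stand-in for the config/checkpoint loading shown above:

# Sketch only; build_model is a hypothetical placeholder, not part of the V3D code.
import gradio as gr
import spaces
import torch
from huggingface_hub import hf_hub_download

ckpt_path = hf_hub_download(repo_id="heheyas/V3D", filename="V3D.ckpt")  # cached local path
model = build_model(ckpt_path)  # built at startup, before any GPU is available

@spaces.GPU  # a GPU is attached only while this handler runs
def infer(image):
    with torch.no_grad():
        return model(image)

with gr.Blocks() as demo:
    inp = gr.Image(label="Input Image")
    out = gr.Video(label="Output Orbit Video")
    gr.Button("Run").click(infer, inputs=[inp], outputs=[out])

demo.launch()
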
app_bkp.py ADDED
@@ -0,0 +1,294 @@
+# TODO
+import numpy as np
+import argparse
+import torch
+from torchvision.utils import make_grid
+import tempfile
+import gradio as gr
+from omegaconf import OmegaConf
+from einops import rearrange
+from scripts.pub.V3D_512 import (
+    sample_one,
+    get_batch,
+    get_unique_embedder_keys_from_conditioner,
+    load_model,
+)
+from sgm.util import default, instantiate_from_config
+from safetensors.torch import load_file as load_safetensors
+from PIL import Image
+from kiui.op import recenter
+from torchvision.transforms import ToTensor
+from einops import rearrange, repeat
+import rembg
+import os
+from glob import glob
+from mediapy import write_video
+from pathlib import Path
+import spaces
+
+
+@spaces.GPU
+def do_sample(
+    image,
+    model,
+    clip_model,
+    ae_model,
+    device,
+    num_frames,
+    num_steps,
+    decoding_t,
+    border_ratio,
+    ignore_alpha,
+    rembg_session,
+    output_folder,
+):
+    # if image.mode == "RGBA":
+    #     image = image.convert("RGB")
+    image = Image.fromarray(image)
+    w, h = image.size
+
+    if border_ratio > 0:
+        if image.mode != "RGBA" or ignore_alpha:
+            image = image.convert("RGB")
+            image = np.asarray(image)
+            carved_image = rembg.remove(image, session=rembg_session)  # [H, W, 4]
+        else:
+            image = np.asarray(image)
+            carved_image = image
+        mask = carved_image[..., -1] > 0
+        image = recenter(carved_image, mask, border_ratio=border_ratio)
+        image = image.astype(np.float32) / 255.0
+        if image.shape[-1] == 4:
+            image = image[..., :3] * image[..., 3:4] + (1 - image[..., 3:4])
+        image = Image.fromarray((image * 255).astype(np.uint8))
+    else:
+        print("Ignore border ratio")
+    image = image.resize((512, 512))
+
+    image = ToTensor()(image)
+    image = image * 2.0 - 1.0
+
+    image = image.unsqueeze(0).to(device)
+    H, W = image.shape[2:]
+    assert image.shape[1] == 3
+    F = 8
+    C = 4
+    shape = (num_frames, C, H // F, W // F)
+
+    value_dict = {}
+    value_dict["motion_bucket_id"] = 0
+    value_dict["fps_id"] = 0
+    value_dict["cond_aug"] = 0.05
+    value_dict["cond_frames_without_noise"] = clip_model(image)
+    value_dict["cond_frames"] = ae_model.encode(image)
+    value_dict["cond_frames"] += 0.05 * torch.randn_like(value_dict["cond_frames"])
+    value_dict["cond_aug"] = 0.05
+
+    with torch.no_grad():
+        with torch.autocast(device):
+            batch, batch_uc = get_batch(
+                get_unique_embedder_keys_from_conditioner(model.conditioner),
+                value_dict,
+                [1, num_frames],
+                T=num_frames,
+                device=device,
+            )
+            c, uc = model.conditioner.get_unconditional_conditioning(
+                batch,
+                batch_uc=batch_uc,
+                force_uc_zero_embeddings=[
+                    "cond_frames",
+                    "cond_frames_without_noise",
+                ],
+            )
+
+            for k in ["crossattn", "concat"]:
+                uc[k] = repeat(uc[k], "b ... -> b t ...", t=num_frames)
+                uc[k] = rearrange(uc[k], "b t ... -> (b t) ...", t=num_frames)
+                c[k] = repeat(c[k], "b ... -> b t ...", t=num_frames)
+                c[k] = rearrange(c[k], "b t ... -> (b t) ...", t=num_frames)
+
+            randn = torch.randn(shape, device=device)
+            randn = randn.to(device)
+
+            additional_model_inputs = {}
+            additional_model_inputs["image_only_indicator"] = torch.zeros(
+                2, num_frames
+            ).to(device)
+            additional_model_inputs["num_video_frames"] = batch["num_video_frames"]
+
+            def denoiser(input, sigma, c):
+                return model.denoiser(
+                    model.model, input, sigma, c, **additional_model_inputs
+                )
+
+            samples_z = model.sampler(denoiser, randn, cond=c, uc=uc)
+            model.en_and_decode_n_samples_a_time = decoding_t
+            samples_x = model.decode_first_stage(samples_z)
+            samples = torch.clamp((samples_x + 1.0) / 2.0, min=0.0, max=1.0)
+
+            os.makedirs(output_folder, exist_ok=True)
+            base_count = len(glob(os.path.join(output_folder, "*.mp4")))
+            video_path = os.path.join(output_folder, f"{base_count:06d}.mp4")
+
+            frames = (
+                (rearrange(samples, "t c h w -> t h w c") * 255)
+                .cpu()
+                .numpy()
+                .astype(np.uint8)
+            )
+            write_video(video_path, frames, fps=6)
+
+    return video_path
+
+
+def change_model_params(model, min_cfg, max_cfg):
+    model.sampler.guider.max_scale = max_cfg
+    model.sampler.guider.min_scale = min_cfg
+
+
+@spaces.GPU
+def launch(device="cuda", share=False):
+    model_config = "./scripts/pub/configs/V3D_512.yaml"
+    num_frames = OmegaConf.load(
+        model_config
+    ).model.params.sampler_config.params.guider_config.params.num_frames
+    print("Detected num_frames:", num_frames)
+    # num_steps = default(num_steps, 25)
+    num_steps = 25
+    output_folder = "outputs/V3D_512"
+
+    sd = load_safetensors("./ckpts/svd_xt.safetensors")
+    clip_model_config = OmegaConf.load("./configs/embedder/clip_image.yaml")
+    clip_model = instantiate_from_config(clip_model_config).eval()
+    clip_sd = dict()
+    for k, v in sd.items():
+        if "conditioner.embedders.0" in k:
+            clip_sd[k.replace("conditioner.embedders.0.", "")] = v
+    clip_model.load_state_dict(clip_sd)
+    clip_model = clip_model.to(device)
+
+    ae_model_config = OmegaConf.load("./configs/ae/video.yaml")
+    ae_model = instantiate_from_config(ae_model_config).eval()
+    encoder_sd = dict()
+    for k, v in sd.items():
+        if "first_stage_model" in k:
+            encoder_sd[k.replace("first_stage_model.", "")] = v
+    ae_model.load_state_dict(encoder_sd)
+    ae_model = ae_model.to(device)
+    rembg_session = rembg.new_session()
+
+    model, _ = load_model(
+        model_config, device, num_frames, num_steps, min_cfg=3.5, max_cfg=3.5
+    )
+
+    with gr.Blocks(title="V3D", theme=gr.themes.Monochrome()) as demo:
+        with gr.Row(equal_height=True):
+            with gr.Column():
+                input_image = gr.Image(value=None, label="Input Image")
+
+                border_ratio_slider = gr.Slider(
+                    value=0.3,
+                    label="Border Ratio",
+                    minimum=0.05,
+                    maximum=0.5,
+                    step=0.05,
+                )
+                decoding_t_slider = gr.Slider(
+                    value=1,
+                    label="Number of Decoding frames",
+                    minimum=1,
+                    maximum=num_frames,
+                    step=1,
+                )
+                min_guidance_slider = gr.Slider(
+                    value=3.5,
+                    label="Min CFG Value",
+                    minimum=0.05,
+                    maximum=0.5,
+                    step=0.05,
+                )
+                max_guidance_slider = gr.Slider(
+                    value=3.5,
+                    label="Max CFG Value",
+                    minimum=0.05,
+                    maximum=0.5,
+                    step=0.05,
+                )
+                run_button = gr.Button(value="Run V3D")
+
+            with gr.Column():
+                output_video = gr.Video(value=None, label="Output Orbit Video")
+
+        @run_button.click(
+            inputs=[
+                input_image,
+                border_ratio_slider,
+                min_guidance_slider,
+                max_guidance_slider,
+                decoding_t_slider,
+            ],
+            outputs=[output_video],
+        )
+        def _(image, border_ratio, min_guidance, max_guidance, decoding_t):
+            change_model_params(model, min_guidance, max_guidance)
+            return do_sample(
+                image,
+                model,
+                clip_model,
+                ae_model,
+                device,
+                num_frames,
+                num_steps,
+                int(decoding_t),
+                border_ratio,
+                False,
+                rembg_session,
+                output_folder,
+            )
+
+        # do_sample(
+        #     np.asarray(Image.open("assets/baby_yoda.png")),
+        #     model,
+        #     clip_model,
+        #     ae_model,
+        #     device,
+        #     num_frames,
+        #     num_steps,
+        #     1,
+        #     0.3,
+        #     False,
+        #     rembg_session,
+        #     output_folder,
+        # )
+    demo.launch(inbrowser=True, inline=False, share=share, show_error=True)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--device", type=str, default="cuda")
+    parser.add_argument("--share", action="store_true")
+
+    opt = parser.parse_args()
+
+    test = OmegaConf.load("./scripts/pub/configs/V3D_512.yaml")
+    print(test)
+
+    def download_if_need(path, url):
+        if Path(path).exists():
+            return
+        import wget
+
+        path = Path(path)
+        path.parent.mkdir(parents=True, exist_ok=True)
+        wget.download(url, out=str(path))
+
+    # download_if_need(
+    #     "ckpts/svd_xt.safetensors",
+    #     "https://huggingface.co/stabilityai/stable-video-diffusion-img2vid-xt/resolve/main/svd_xt.safetensors",
+    # )
+    # download_if_need(
+    #     "ckpts/V3D_512.ckpt", "https://huggingface.co/heheyas/V3D/resolve/main/V3D.ckpt"
+    # )
+
+    launch(opt.device, opt.share)