multimodalart (HF staff) committed
Commit 9ca6c30
1 Parent(s): a7729a1

Update app.py

Files changed (1): app.py (+22, -208)
app.py CHANGED
@@ -1,73 +1,27 @@
-import math
+import gradio as gr
+import torch
 import os
 from glob import glob
 from pathlib import Path
 from typing import Optional
 
-import cv2
-import numpy as np
-import torch
-from einops import rearrange, repeat
-from fire import Fire
-from omegaconf import OmegaConf
+from diffusers import StableVideoDiffusionPipeline
+from diffusers.utils import load_image, export_to_video
 from PIL import Image
-from torchvision.transforms import ToTensor
 
-from scripts.util.detection.nsfw_and_watermark_dectection import \
-    DeepFloydDataFiltering
-from sgm.inference.helpers import embed_watermark
-from sgm.util import default, instantiate_from_config
 
-import gradio as gr
 import uuid
 import random
 from huggingface_hub import hf_hub_download
 
-hf_hub_download(repo_id="stabilityai/stable-video-diffusion-img2vid-xt", filename="svd_xt.safetensors", local_dir="checkpoints")
-
-version = "svd_xt"
-device = "cuda"
+pipe = StableVideoDiffusionPipeline.from_pretrained(
+    "stabilityai/stable-video-diffusion-img2vid-xt", torch_dtype=torch.float16, variant="fp16"
+)
+pipe.to("cuda")
+pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
+
 max_64_bit_int = 2**63 - 1
 
-def load_model(
-    config: str,
-    device: str,
-    num_frames: int,
-    num_steps: int,
-):
-    config = OmegaConf.load(config)
-    if device == "cuda":
-        config.model.params.conditioner_config.params.emb_models[
-            0
-        ].params.open_clip_embedding_config.params.init_device = device
-
-    config.model.params.sampler_config.params.num_steps = num_steps
-    config.model.params.sampler_config.params.guider_config.params.num_frames = (
-        num_frames
-    )
-    if device == "cuda":
-        with torch.device(device):
-            model = instantiate_from_config(config.model).to(device).eval()
-    else:
-        model = instantiate_from_config(config.model).to(device).eval()
-
-    filter = DeepFloydDataFiltering(verbose=False, device=device)
-    return model, filter
-
-if version == "svd_xt":
-    num_frames = 25
-    num_steps = 30
-    model_config = "scripts/sampling/configs/svd_xt.yaml"
-else:
-    raise ValueError(f"Version {version} does not exist.")
-
-model, filter = load_model(
-    model_config,
-    device,
-    num_frames,
-    num_steps,
-)
-
 def sample(
     image: Image,
     seed: Optional[int] = None,
@@ -76,168 +30,28 @@ def sample(
     fps_id: int = 6,
     version: str = "svd_xt",
     cond_aug: float = 0.02,
-    decoding_t: int = 5,  # Number of frames decoded at a time! This eats most VRAM. Reduce if necessary.
+    decoding_t: int = 4,  # Number of frames decoded at a time! This eats most VRAM. Reduce if necessary.
     device: str = "cuda",
     output_folder: str = "outputs",
-    progress=gr.Progress(track_tqdm=True)
+    #progress=gr.Progress(track_tqdm=True)
 ):
+    if image.mode == "RGBA":
+        image = image.convert("RGB")
+
     if(randomize_seed):
         seed = random.randint(0, max_64_bit_int)
-
-    torch.manual_seed(seed)
+    generator = torch.manual_seed(seed)
 
-    if image.mode == "RGBA":
-        image = image.convert("RGB")
-    w, h = image.size
-
-    if h % 64 != 0 or w % 64 != 0:
-        width, height = map(lambda x: x - x % 64, (w, h))
-        image = image.resize((width, height))
-        print(
-            f"WARNING: Your image is of size {h}x{w} which is not divisible by 64. We are resizing to {height}x{width}!"
-        )
-
-    image = ToTensor()(image)
-    image = image * 2.0 - 1.0
-    image = image.unsqueeze(0).to(device)
-    H, W = image.shape[2:]
-    assert image.shape[1] == 3
-    F = 8
-    C = 4
-    shape = (num_frames, C, H // F, W // F)
-    if (H, W) != (576, 1024):
-        print(
-            "WARNING: The conditioning frame you provided is not 576x1024. This leads to suboptimal performance as model was only trained on 576x1024. Consider increasing `cond_aug`."
-        )
-    if motion_bucket_id > 255:
-        print(
-            "WARNING: High motion bucket! This may lead to suboptimal performance."
-        )
-
-    if fps_id < 5:
-        print("WARNING: Small fps value! This may lead to suboptimal performance.")
-
-    if fps_id > 30:
-        print("WARNING: Large fps value! This may lead to suboptimal performance.")
-
-    value_dict = {}
-    value_dict["motion_bucket_id"] = motion_bucket_id
-    value_dict["fps_id"] = fps_id
-    value_dict["cond_aug"] = cond_aug
-    value_dict["cond_frames_without_noise"] = image
-    value_dict["cond_frames"] = image + cond_aug * torch.randn_like(image)
-    value_dict["cond_aug"] = cond_aug
-
-    with torch.no_grad():
-        with torch.autocast(device):
-            batch, batch_uc = get_batch(
-                get_unique_embedder_keys_from_conditioner(model.conditioner),
-                value_dict,
-                [1, num_frames],
-                T=num_frames,
-                device=device,
-            )
-            c, uc = model.conditioner.get_unconditional_conditioning(
-                batch,
-                batch_uc=batch_uc,
-                force_uc_zero_embeddings=[
-                    "cond_frames",
-                    "cond_frames_without_noise",
-                ],
-            )
-
-            for k in ["crossattn", "concat"]:
-                uc[k] = repeat(uc[k], "b ... -> b t ...", t=num_frames)
-                uc[k] = rearrange(uc[k], "b t ... -> (b t) ...", t=num_frames)
-                c[k] = repeat(c[k], "b ... -> b t ...", t=num_frames)
-                c[k] = rearrange(c[k], "b t ... -> (b t) ...", t=num_frames)
-
-            randn = torch.randn(shape, device=device)
-
-            additional_model_inputs = {}
-            additional_model_inputs["image_only_indicator"] = torch.zeros(
-                2, num_frames
-            ).to(device)
-            additional_model_inputs["num_video_frames"] = batch["num_video_frames"]
-
-            def denoiser(input, sigma, c):
-                return model.denoiser(
-                    model.model, input, sigma, c, **additional_model_inputs
-                )
-
-            samples_z = model.sampler(denoiser, randn, cond=c, uc=uc)
-            model.en_and_decode_n_samples_a_time = decoding_t
-            samples_x = model.decode_first_stage(samples_z)
-            samples = torch.clamp((samples_x + 1.0) / 2.0, min=0.0, max=1.0)
-
-    os.makedirs(output_folder, exist_ok=True)
-    base_count = len(glob(os.path.join(output_folder, "*.mp4")))
-    video_path = os.path.join(output_folder, f"{base_count:06d}.mp4")
-    writer = cv2.VideoWriter(
-        video_path,
-        cv2.VideoWriter_fourcc(*"mp4v"),
-        fps_id + 1,
-        (samples.shape[-1], samples.shape[-2]),
-    )
-
-    samples = embed_watermark(samples)
-    samples = filter(samples)
-    vid = (
-        (rearrange(samples, "t c h w -> t h w c") * 255)
-        .cpu()
-        .numpy()
-        .astype(np.uint8)
-    )
-    for frame in vid:
-        frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
-        writer.write(frame)
-    writer.release()
+    os.makedirs(output_folder, exist_ok=True)
+    base_count = len(glob(os.path.join(output_folder, "*.mp4")))
+    video_path = os.path.join(output_folder, f"{base_count:06d}.mp4")
+
+    frames = pipe(image, decode_chunk_size=decoding_t, generator=generator, motion_bucket_id=motion_bucket_id, noise_aug_strength=0.1).frames[0]
+    export_to_video(frames, video_path, fps=fps_id)
+    torch.manual_seed(seed)
+
     return video_path, seed
 
-def get_unique_embedder_keys_from_conditioner(conditioner):
-    return list(set([x.input_key for x in conditioner.embedders]))
-
-
-def get_batch(keys, value_dict, N, T, device):
-    batch = {}
-    batch_uc = {}
-
-    for key in keys:
-        if key == "fps_id":
-            batch[key] = (
-                torch.tensor([value_dict["fps_id"]])
-                .to(device)
-                .repeat(int(math.prod(N)))
-            )
-        elif key == "motion_bucket_id":
-            batch[key] = (
-                torch.tensor([value_dict["motion_bucket_id"]])
-                .to(device)
-                .repeat(int(math.prod(N)))
-            )
-        elif key == "cond_aug":
-            batch[key] = repeat(
-                torch.tensor([value_dict["cond_aug"]]).to(device),
-                "1 -> b",
-                b=math.prod(N),
-            )
-        elif key == "cond_frames":
-            batch[key] = repeat(value_dict["cond_frames"], "1 ... -> b ...", b=N[0])
-        elif key == "cond_frames_without_noise":
-            batch[key] = repeat(
-                value_dict["cond_frames_without_noise"], "1 ... -> b ...", b=N[0]
-            )
-        else:
-            batch[key] = value_dict[key]
-
-    if T is not None:
-        batch["num_video_frames"] = T
-
-    for key in batch.keys():
-        if key not in batch_uc and isinstance(batch[key], torch.Tensor):
-            batch_uc[key] = torch.clone(batch[key])
-    return batch, batch_uc
-
 def resize_image(image, output_size=(1024, 576)):
     # Calculate aspect ratios
     target_aspect = output_size[0] / output_size[1]  # Aspect ratio of the desired size
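
In short, the commit replaces roughly 200 lines of hand-rolled sgm conditioning, sampling, and OpenCV video writing with a single diffusers pipeline call. For reference, a minimal standalone sketch of the StableVideoDiffusionPipeline API the new app.py relies on, using the model id and parameter values from the diff above; the input image path, output filename, and the seed and motion-bucket values are illustrative placeholders, not part of the commit:

import torch
from diffusers import StableVideoDiffusionPipeline
from diffusers.utils import load_image, export_to_video

# Load SVD-XT in half precision, as the updated app.py does.
pipe = StableVideoDiffusionPipeline.from_pretrained(
    "stabilityai/stable-video-diffusion-img2vid-xt",
    torch_dtype=torch.float16,
    variant="fp16",
)
pipe.to("cuda")

# Placeholder conditioning frame; SVD-XT was trained on 1024x576 inputs.
image = load_image("conditioning_frame.png").resize((1024, 576))

# Seeding the default generator mirrors `generator = torch.manual_seed(seed)` above.
generator = torch.manual_seed(42)

frames = pipe(
    image,
    decode_chunk_size=4,      # frames decoded per VAE pass; lower values save VRAM
    generator=generator,
    motion_bucket_id=127,     # illustrative value; higher means more motion
    noise_aug_strength=0.1,   # noise added to the conditioning frame, as in the diff
).frames[0]

export_to_video(frames, "generated.mp4", fps=6)  # fps_id defaults to 6 in sample()

The single pipe() call subsumes the conditioning, denoising loop, and chunked latent decoding that the deleted code performed by hand; note that the old DeepFloydDataFiltering and embed_watermark steps have no counterpart in the new version.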