Spaces:
Runtime error
Runtime error
Commit
·
88a2ed3
1
Parent(s):
a767e49
revert to working state
Browse files
app.py
CHANGED
@@ -6,7 +6,6 @@ from utils import video_to_frames, add_dict_to_yaml_file, save_video, seed_every
|
|
6 |
from tokenflow_pnp import TokenFlow
|
7 |
from preprocess_utils import *
|
8 |
from tokenflow_utils import *
|
9 |
-
import math
|
10 |
# load sd model
|
11 |
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
12 |
model_id = "stabilityai/stable-diffusion-2-1-base"
|
@@ -52,11 +51,6 @@ def get_example():
|
|
52 |
]
|
53 |
return case
|
54 |
|
55 |
-
def largest_divisor(n):
|
56 |
-
for i in range(2, int(math.sqrt(n)) + 1):
|
57 |
-
if n % i == 0:
|
58 |
-
return n // i
|
59 |
-
return n
|
60 |
|
61 |
def prep(config):
|
62 |
# timesteps to save
|
@@ -108,26 +102,7 @@ def prep(config):
|
|
108 |
|
109 |
|
110 |
return frames, latents, total_inverted_latents, rgb_reconstruction
|
111 |
-
|
112 |
-
|
113 |
-
def calculate_fps(input_video, batch_size):
|
114 |
-
frames, frames_per_second = video_to_frames(input_video)
|
115 |
-
#total_vid_frames = len(frames)
|
116 |
-
#total_vid_duration = total_vid_frames/frames_per_second
|
117 |
-
|
118 |
-
#if(total_vid_duration < 1):
|
119 |
-
# frames_to_process = total_vid_frames
|
120 |
-
#else:
|
121 |
-
# frames_to_process = int(frames_per_second/n_seconds)
|
122 |
-
#
|
123 |
-
#if frames_to_process % batch_size != 0:
|
124 |
-
# batch_size = largest_divisor(batch_size)
|
125 |
-
#print("total vid duration", total_vid_duration)
|
126 |
-
#print("frames to process", frames_to_process)
|
127 |
-
#print("batch size", batch_size)
|
128 |
-
print("fps", frames_per_second)
|
129 |
-
return frames, frames_per_second
|
130 |
-
|
131 |
def preprocess_and_invert(input_video,
|
132 |
frames,
|
133 |
latents,
|
@@ -140,8 +115,6 @@ def preprocess_and_invert(input_video,
|
|
140 |
n_timesteps = 50,
|
141 |
batch_size: int = 8,
|
142 |
n_frames: int = 40,
|
143 |
-
n_seconds: int = 1,
|
144 |
-
n_fps_input: int = 40,
|
145 |
inversion_prompt:str = '',
|
146 |
|
147 |
):
|
@@ -161,31 +134,10 @@ def preprocess_and_invert(input_video,
|
|
161 |
preprocess_config['n_frames'] = n_frames
|
162 |
preprocess_config['seed'] = seed
|
163 |
preprocess_config['inversion_prompt'] = inversion_prompt
|
164 |
-
|
165 |
-
if(not frames):
|
166 |
-
preprocess_config['frames'],n_fps_input = video_to_frames(input_video)
|
167 |
-
not_processed = True
|
168 |
-
else:
|
169 |
-
preprocess_config['frames'] = frames
|
170 |
-
|
171 |
-
print("pre-process fps ", n_fps_input)
|
172 |
preprocess_config['data_path'] = input_video.split(".")[0]
|
173 |
|
174 |
-
total_vid_frames = len(preprocess_config['frames'])
|
175 |
-
print("total frames", total_vid_frames)
|
176 |
-
total_vid_duration = total_vid_frames/n_fps_input
|
177 |
-
|
178 |
-
if(total_vid_duration < 1):
|
179 |
-
preprocess_config['n_frames'] = total_vid_frames
|
180 |
-
else:
|
181 |
-
preprocess_config['n_frames'] = int(n_fps_input/n_seconds)
|
182 |
-
|
183 |
-
if preprocess_config['n_frames'] % batch_size != 0:
|
184 |
-
preprocess_config['batch_size'] = largest_divisor(batch_size)
|
185 |
|
186 |
-
print("Running with batch size of ", preprocess_config['batch_size'])
|
187 |
-
print("Total vid frames", preprocess_config['n_frames'])
|
188 |
-
|
189 |
if randomize_seed:
|
190 |
seed = randomize_seed_fn()
|
191 |
seed_everything(seed)
|
@@ -198,7 +150,7 @@ def preprocess_and_invert(input_video,
|
|
198 |
inverted_latents = gr.State(value=total_inverted_latents)
|
199 |
do_inversion = False
|
200 |
|
201 |
-
return frames, latents, inverted_latents, do_inversion
|
202 |
|
203 |
|
204 |
def edit_with_pnp(input_video,
|
@@ -215,8 +167,6 @@ def edit_with_pnp(input_video,
|
|
215 |
pnp_f_t: float = 0.8,
|
216 |
batch_size: int = 8, #needs to be the same as for preprocess
|
217 |
n_frames: int = 40,#needs to be the same as for preprocess
|
218 |
-
n_seconds: int = 1,
|
219 |
-
n_fps_input: int = 40,
|
220 |
n_timesteps: int = 50,
|
221 |
gudiance_scale: float = 7.5,
|
222 |
inversion_prompt: str = "", #needs to be the same as for preprocess
|
@@ -236,12 +186,10 @@ def edit_with_pnp(input_video,
|
|
236 |
config["pnp_attn_t"] = pnp_attn_t
|
237 |
config["pnp_f_t"] = pnp_f_t
|
238 |
config["pnp_inversion_prompt"] = inversion_prompt
|
239 |
-
|
240 |
-
print("Running with batch size of ", config['batch_size'])
|
241 |
-
print("Total vid frames", config['n_frames'])
|
242 |
|
243 |
if do_inversion:
|
244 |
-
frames, latents, inverted_latents, do_inversion
|
245 |
input_video,
|
246 |
frames,
|
247 |
latents,
|
@@ -253,11 +201,7 @@ def edit_with_pnp(input_video,
|
|
253 |
n_timesteps,
|
254 |
batch_size,
|
255 |
n_frames,
|
256 |
-
n_seconds,
|
257 |
-
n_fps_input,
|
258 |
inversion_prompt)
|
259 |
-
config["batch_size"] = batch_size
|
260 |
-
config["n_frames"] = n_frames
|
261 |
do_inversion = False
|
262 |
|
263 |
|
@@ -277,6 +221,7 @@ def edit_with_pnp(input_video,
|
|
277 |
# demo #
|
278 |
########
|
279 |
|
|
|
280 |
intro = """
|
281 |
<div style="text-align:center">
|
282 |
<h1 style="font-weight: 1400; text-align: center; margin-bottom: 7px;">
|
@@ -288,6 +233,8 @@ intro = """
|
|
288 |
</div>
|
289 |
"""
|
290 |
|
|
|
|
|
291 |
with gr.Blocks(css="style.css") as demo:
|
292 |
|
293 |
gr.HTML(intro)
|
@@ -295,8 +242,7 @@ with gr.Blocks(css="style.css") as demo:
|
|
295 |
inverted_latents = gr.State()
|
296 |
latents = gr.State()
|
297 |
do_inversion = gr.State(value=True)
|
298 |
-
|
299 |
-
|
300 |
with gr.Row():
|
301 |
input_video = gr.Video(label="Input Video", interactive=True, elem_id="input_video")
|
302 |
output_video = gr.Video(label="Edited Video", interactive=False, elem_id="output_video")
|
@@ -336,19 +282,15 @@ with gr.Blocks(css="style.css") as demo:
|
|
336 |
|
337 |
with gr.Column(min_width=100):
|
338 |
inversion_prompt = gr.Textbox(lines=1, label="Inversion prompt", interactive=True, placeholder="")
|
339 |
-
batch_size = gr.Slider(label='Batch size', minimum=1, maximum=
|
340 |
-
value=8, step=1, interactive=True
|
341 |
n_frames = gr.Slider(label='Num frames', minimum=2, maximum=200,
|
342 |
-
value=24, step=1, interactive=True
|
343 |
-
n_seconds = gr.Slider(label='Num seconds', info="How many seconds of your video to process",
|
344 |
-
minimum=1, maximum=2, step=1)
|
345 |
n_timesteps = gr.Slider(label='Diffusion steps', minimum=25, maximum=100,
|
346 |
value=50, step=25, interactive=True)
|
347 |
-
|
348 |
-
n_fps = gr.Slider(label='Output frames per second', minimum=1, maximum=60,
|
349 |
value=10, step=1, interactive=True)
|
350 |
|
351 |
-
|
352 |
with gr.TabItem('Plug-and-Play Parameters'):
|
353 |
with gr.Column(min_width=100):
|
354 |
pnp_attn_t = gr.Slider(label='pnp attention threshold', minimum=0, maximum=1,
|
@@ -382,7 +324,7 @@ with gr.Blocks(css="style.css") as demo:
|
|
382 |
input_video.upload(
|
383 |
fn = reset_do_inversion,
|
384 |
outputs = [do_inversion],
|
385 |
-
queue = False).then(fn =
|
386 |
inputs = [input_video,
|
387 |
frames,
|
388 |
latents,
|
@@ -394,20 +336,15 @@ with gr.Blocks(css="style.css") as demo:
|
|
394 |
n_timesteps,
|
395 |
batch_size,
|
396 |
n_frames,
|
397 |
-
n_seconds,
|
398 |
-
n_fps_input,
|
399 |
inversion_prompt
|
400 |
],
|
401 |
outputs = [frames,
|
402 |
latents,
|
403 |
inverted_latents,
|
404 |
-
do_inversion
|
405 |
-
|
406 |
-
n_frames
|
407 |
])
|
408 |
|
409 |
-
input_video.change(fn = calculate_fps, inputs=[input_video], outputs=[frames, n_fps_input], queue=False)
|
410 |
-
|
411 |
run_button.click(fn = edit_with_pnp,
|
412 |
inputs = [input_video,
|
413 |
frames,
|
@@ -422,8 +359,6 @@ with gr.Blocks(css="style.css") as demo:
|
|
422 |
pnp_f_t,
|
423 |
batch_size,
|
424 |
n_frames,
|
425 |
-
n_seconds,
|
426 |
-
n_fps_input,
|
427 |
n_timesteps,
|
428 |
gudiance_scale,
|
429 |
inversion_prompt,
|
|
|
6 |
from tokenflow_pnp import TokenFlow
|
7 |
from preprocess_utils import *
|
8 |
from tokenflow_utils import *
|
|
|
9 |
# load sd model
|
10 |
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
11 |
model_id = "stabilityai/stable-diffusion-2-1-base"
|
|
|
51 |
]
|
52 |
return case
|
53 |
|
|
|
|
|
|
|
|
|
|
|
54 |
|
55 |
def prep(config):
|
56 |
# timesteps to save
|
|
|
102 |
|
103 |
|
104 |
return frames, latents, total_inverted_latents, rgb_reconstruction
|
105 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
106 |
def preprocess_and_invert(input_video,
|
107 |
frames,
|
108 |
latents,
|
|
|
115 |
n_timesteps = 50,
|
116 |
batch_size: int = 8,
|
117 |
n_frames: int = 40,
|
|
|
|
|
118 |
inversion_prompt:str = '',
|
119 |
|
120 |
):
|
|
|
134 |
preprocess_config['n_frames'] = n_frames
|
135 |
preprocess_config['seed'] = seed
|
136 |
preprocess_config['inversion_prompt'] = inversion_prompt
|
137 |
+
preprocess_config['frames'] = video_to_frames(input_video)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
138 |
preprocess_config['data_path'] = input_video.split(".")[0]
|
139 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
140 |
|
|
|
|
|
|
|
141 |
if randomize_seed:
|
142 |
seed = randomize_seed_fn()
|
143 |
seed_everything(seed)
|
|
|
150 |
inverted_latents = gr.State(value=total_inverted_latents)
|
151 |
do_inversion = False
|
152 |
|
153 |
+
return frames, latents, inverted_latents, do_inversion
|
154 |
|
155 |
|
156 |
def edit_with_pnp(input_video,
|
|
|
167 |
pnp_f_t: float = 0.8,
|
168 |
batch_size: int = 8, #needs to be the same as for preprocess
|
169 |
n_frames: int = 40,#needs to be the same as for preprocess
|
|
|
|
|
170 |
n_timesteps: int = 50,
|
171 |
gudiance_scale: float = 7.5,
|
172 |
inversion_prompt: str = "", #needs to be the same as for preprocess
|
|
|
186 |
config["pnp_attn_t"] = pnp_attn_t
|
187 |
config["pnp_f_t"] = pnp_f_t
|
188 |
config["pnp_inversion_prompt"] = inversion_prompt
|
189 |
+
|
|
|
|
|
190 |
|
191 |
if do_inversion:
|
192 |
+
frames, latents, inverted_latents, do_inversion = preprocess_and_invert(
|
193 |
input_video,
|
194 |
frames,
|
195 |
latents,
|
|
|
201 |
n_timesteps,
|
202 |
batch_size,
|
203 |
n_frames,
|
|
|
|
|
204 |
inversion_prompt)
|
|
|
|
|
205 |
do_inversion = False
|
206 |
|
207 |
|
|
|
221 |
# demo #
|
222 |
########
|
223 |
|
224 |
+
|
225 |
intro = """
|
226 |
<div style="text-align:center">
|
227 |
<h1 style="font-weight: 1400; text-align: center; margin-bottom: 7px;">
|
|
|
233 |
</div>
|
234 |
"""
|
235 |
|
236 |
+
|
237 |
+
|
238 |
with gr.Blocks(css="style.css") as demo:
|
239 |
|
240 |
gr.HTML(intro)
|
|
|
242 |
inverted_latents = gr.State()
|
243 |
latents = gr.State()
|
244 |
do_inversion = gr.State(value=True)
|
245 |
+
|
|
|
246 |
with gr.Row():
|
247 |
input_video = gr.Video(label="Input Video", interactive=True, elem_id="input_video")
|
248 |
output_video = gr.Video(label="Edited Video", interactive=False, elem_id="output_video")
|
|
|
282 |
|
283 |
with gr.Column(min_width=100):
|
284 |
inversion_prompt = gr.Textbox(lines=1, label="Inversion prompt", interactive=True, placeholder="")
|
285 |
+
batch_size = gr.Slider(label='Batch size', minimum=1, maximum=10,
|
286 |
+
value=8, step=1, interactive=True)
|
287 |
n_frames = gr.Slider(label='Num frames', minimum=2, maximum=200,
|
288 |
+
value=24, step=1, interactive=True)
|
|
|
|
|
289 |
n_timesteps = gr.Slider(label='Diffusion steps', minimum=25, maximum=100,
|
290 |
value=50, step=25, interactive=True)
|
291 |
+
n_fps = gr.Slider(label='Frames per second', minimum=1, maximum=60,
|
|
|
292 |
value=10, step=1, interactive=True)
|
293 |
|
|
|
294 |
with gr.TabItem('Plug-and-Play Parameters'):
|
295 |
with gr.Column(min_width=100):
|
296 |
pnp_attn_t = gr.Slider(label='pnp attention threshold', minimum=0, maximum=1,
|
|
|
324 |
input_video.upload(
|
325 |
fn = reset_do_inversion,
|
326 |
outputs = [do_inversion],
|
327 |
+
queue = False).then(fn = preprocess_and_invert,
|
328 |
inputs = [input_video,
|
329 |
frames,
|
330 |
latents,
|
|
|
336 |
n_timesteps,
|
337 |
batch_size,
|
338 |
n_frames,
|
|
|
|
|
339 |
inversion_prompt
|
340 |
],
|
341 |
outputs = [frames,
|
342 |
latents,
|
343 |
inverted_latents,
|
344 |
+
do_inversion
|
345 |
+
|
|
|
346 |
])
|
347 |
|
|
|
|
|
348 |
run_button.click(fn = edit_with_pnp,
|
349 |
inputs = [input_video,
|
350 |
frames,
|
|
|
359 |
pnp_f_t,
|
360 |
batch_size,
|
361 |
n_frames,
|
|
|
|
|
362 |
n_timesteps,
|
363 |
gudiance_scale,
|
364 |
inversion_prompt,
|
utils.py
CHANGED
@@ -16,7 +16,7 @@ from kornia.utils.grid import create_meshgrid
|
|
16 |
import cv2
|
17 |
|
18 |
def save_video_frames(video_path, img_size=(512,512)):
|
19 |
-
video, _,
|
20 |
# rotate video -90 degree if video is .mov format. this is a weird bug in torchvision
|
21 |
if video_path.endswith('.mov'):
|
22 |
video = T.functional.rotate(video, -90)
|
@@ -29,8 +29,7 @@ def save_video_frames(video_path, img_size=(512,512)):
|
|
29 |
image_resized.save(f'data/{video_name}/{ind}.png')
|
30 |
|
31 |
def video_to_frames(video_path, img_size=(512,512)):
|
32 |
-
video, _,
|
33 |
-
|
34 |
# rotate video -90 degree if video is .mov format. this is a weird bug in torchvision
|
35 |
if video_path.endswith('.mov'):
|
36 |
video = T.functional.rotate(video, -90)
|
@@ -40,19 +39,10 @@ def video_to_frames(video_path, img_size=(512,512)):
|
|
40 |
for i in range(len(video)):
|
41 |
ind = str(i).zfill(5)
|
42 |
image = T.ToPILImage()(video[i])
|
43 |
-
|
44 |
-
# get new height and width to maintain aspect ratio
|
45 |
-
height, width = image.size
|
46 |
-
new_height = math.floor(img_size[0] * height / width)
|
47 |
-
new_width = math.floor(img_size[1] * width / height)
|
48 |
-
|
49 |
-
# pad
|
50 |
-
image = Image.new(image.mode, (new_width, new_height), (0, 0, 0))
|
51 |
-
|
52 |
image_resized = image.resize((img_size), resample=Image.Resampling.LANCZOS)
|
53 |
# image_resized.save(f'data/{video_name}/{ind}.png')
|
54 |
frames.append(image_resized)
|
55 |
-
return frames
|
56 |
|
57 |
def add_dict_to_yaml_file(file_path, key, value):
|
58 |
data = {}
|
|
|
16 |
import cv2
|
17 |
|
18 |
def save_video_frames(video_path, img_size=(512,512)):
|
19 |
+
video, _, _ = read_video(video_path, output_format="TCHW")
|
20 |
# rotate video -90 degree if video is .mov format. this is a weird bug in torchvision
|
21 |
if video_path.endswith('.mov'):
|
22 |
video = T.functional.rotate(video, -90)
|
|
|
29 |
image_resized.save(f'data/{video_name}/{ind}.png')
|
30 |
|
31 |
def video_to_frames(video_path, img_size=(512,512)):
|
32 |
+
video, _, _ = read_video(video_path, output_format="TCHW")
|
|
|
33 |
# rotate video -90 degree if video is .mov format. this is a weird bug in torchvision
|
34 |
if video_path.endswith('.mov'):
|
35 |
video = T.functional.rotate(video, -90)
|
|
|
39 |
for i in range(len(video)):
|
40 |
ind = str(i).zfill(5)
|
41 |
image = T.ToPILImage()(video[i])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
42 |
image_resized = image.resize((img_size), resample=Image.Resampling.LANCZOS)
|
43 |
# image_resized.save(f'data/{video_name}/{ind}.png')
|
44 |
frames.append(image_resized)
|
45 |
+
return frames
|
46 |
|
47 |
def add_dict_to_yaml_file(file_path, key, value):
|
48 |
data = {}
|