Commit afd7574 · chenyangqi committed · 1 parent: 0cbd26d

rearrange the spatial layout; add crop to input video

Files changed:
- FateZero/video_diffusion/data/dataset.py  +21 -2
- app_fatezero.py  +94 -46
- inference_fatezero.py  +30 -1
FateZero/video_diffusion/data/dataset.py  CHANGED
@@ -4,6 +4,8 @@ import numpy as np
 from PIL import Image
 from einops import rearrange
 from pathlib import Path
+import imageio
+import cv2
 
 import torch
 from torch.utils.data import Dataset
@@ -149,10 +151,27 @@ class ImageSequenceDataset(Dataset):
         frame_start = index
         return (frame_start + i for i in range(self.n_sample_frame))
 
-    @staticmethod
-    def get_image_list(path):
+    # @staticmethod
+    def get_image_list(self, path):
         images = []
+        if path[-4:] == '.mp4':
+            path = self.mp4_to_png(path)
+            self.path = path
+
         for file in sorted(os.listdir(path)):
             if file.endswith(IMAGE_EXTENSION):
                 images.append(file)
         return images
+
+    # @staticmethod
+    def mp4_to_png(self, video_source=None):
+        reader = imageio.get_reader(video_source)
+        os.makedirs(video_source[:-4], exist_ok=True)
+
+        for i, im in enumerate(reader):
+            # use :05d to add zero, no space before the 05d
+            # if (i+1)%10 == 0:
+            path = os.path.join(video_source[:-4], f"{i:05d}.png")
+            # print(path)
+            cv2.imwrite(path, im[:, :, ::-1])
+        return video_source[:-4]
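The new mp4_to_png helper lets get_image_list accept either a directory of frames or a raw .mp4 path: when the path ends in .mp4, the video is first unpacked into a sibling folder of zero-padded PNGs and the dataset then reads that folder. Below is a minimal standalone sketch of that conversion; the function name extract_frames and the example path are illustrative, not part of the commit.

import os

import cv2
import imageio


def extract_frames(video_source: str) -> str:
    """Unpack an .mp4 into a folder of zero-padded PNG frames, mirroring mp4_to_png."""
    out_dir = video_source[:-4]                  # strip the ".mp4" suffix
    os.makedirs(out_dir, exist_ok=True)
    reader = imageio.get_reader(video_source)
    for i, frame in enumerate(reader):           # imageio yields RGB ndarrays
        path = os.path.join(out_dir, f"{i:05d}.png")
        cv2.imwrite(path, frame[:, :, ::-1])     # cv2 expects BGR, so flip channels
    return out_dir


# frames_dir = extract_frames("FateZero/data/teaser_car-turn.mp4")  # illustrative path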
app_fatezero.py  CHANGED
@@ -36,8 +36,59 @@ with gr.Blocks(css='style.css') as demo:
 
     with gr.Row():
         with gr.Column():
-            [two removed lines, beginning "with gr.", are truncated in this view]
+            with gr.Accordion('Input Video', open=True):
+                user_input_video = gr.File(label='Input Source Video')
+                with gr.Accordion('Temporal Crop offset and Sampling Stride', open=False):
+                    n_sample_frame = gr.Slider(label='Number of Frames in Video',
+                                               # info='We test 8 frames in our paper',
+                                               minimum=0,
+                                               maximum=32,
+                                               step=1,
+                                               value=8)
+                    stride = gr.Slider(label='Temporal sampling stride in Video',
+                                       minimum=0,
+                                       maximum=20,
+                                       step=1,
+                                       value=1)
+                    start_sample_frame = gr.Number(label='Start frame in the video',
+                                                   value=0,
+                                                   precision=0)
+
+                with gr.Accordion('Spatial Crop offset', open=False):
+                    left_crop = gr.Number(label='Left crop',
+                                          value=0,
+                                          precision=0)
+                    right_crop = gr.Number(label='Right crop',
+                                           value=0,
+                                           precision=0)
+                    top_crop = gr.Number(label='Top crop',
+                                         value=0,
+                                         precision=0)
+                    bottom_crop = gr.Number(label='Bottom crop',
+                                            value=0,
+                                            precision=0)
+                    offset_list = [
+                        left_crop,
+                        right_crop,
+                        top_crop,
+                        bottom_crop,
+                    ]
+
+                ImageSequenceDataset_list = [
+                    start_sample_frame,
+                    n_sample_frame,
+                    stride
+                ] + offset_list
+
+
+            data_path = gr.Dropdown(
+                label='provided data path',
+                choices=[
+                    'FateZero/data/teaser_car-turn',
+                    'FateZero/data/style/sunflower',
+                    # add shape editing ckpt here
+                ],
+                value='FateZero/data/teaser_car-turn')
             model_id = gr.Dropdown(
                 label='Model ID',
                 choices=[
@@ -55,54 +106,21 @@ with gr.Blocks(css='style.css') as demo:
             # prompt_used_for_training = gr.Text(
             #     label='Training prompt', interactive=False)
 
-            data_path = gr.Dropdown(
-                label='data path',
-                choices=[
-                    'FateZero/data/teaser_car-turn',
-                    'FateZero/data/style/sunflower',
-                    # add shape editing ckpt here
-                ],
-                value='FateZero/data/teaser_car-turn')
 
 
 
-            [removed lines 69-79 are not rendered in this view]
-            cross_replace_steps = gr.Slider(label='cross-attention replace steps',
-                                            info='More steps, replace more cross attention to preserve semantic layout.',
-                                            minimum=0.0,
-                                            maximum=1.0,
-                                            step=0.1,
-                                            value=0.7)
-
-            self_replace_steps = gr.Slider(label='self-attention replace steps',
-                                           info='More steps, replace more spatial-temporal self-attention to preserve geometry and motion.',
-                                           minimum=0.0,
-                                           maximum=1.0,
-                                           step=0.1,
-                                           value=0.7)
-
-            enhance_words = gr.Textbox(label='words to be enhanced',
-                                       info='Amplify the target-words cross attention',
-                                       max_lines=1,
-                                       placeholder='Example: "watercolor "',
-                                       value='watercolor')
 
-            enhance_words_value = gr.Slider(label='Amplify the target cross-attention',
-                                            info='larger value, more elements of target words',
-                                            minimum=0.0,
-                                            maximum=20.0,
-                                            step=1,
-                                            value=10)
+            with gr.Accordion('Text Prompt', open=True):
+
+                source_prompt = gr.Textbox(label='Source Prompt',
+                                           info='A good prompt describes each frame and most objects in video. Especially, it has the object or attribute that we want to edit or preserve.',
+                                           max_lines=1,
+                                           placeholder='Example: "a silver jeep driving down a curvy road in the countryside"',
+                                           value='a silver jeep driving down a curvy road in the countryside')
+                target_prompt = gr.Textbox(label='Target Prompt',
+                                           info='A reasonable composition of video may achieve better results(e.g., "sunflower" video with "Van Gogh" prompt is better than "sunflower" with "Monet")',
+                                           max_lines=1,
+                                           placeholder='Example: "watercolor painting of a silver jeep driving down a curvy road in the countryside"',
+                                           value='watercolor painting of a silver jeep driving down a curvy road in the countryside')
 
 
             with gr.Accordion('DDIM Parameters', open=True):
@@ -129,6 +147,34 @@ with gr.Blocks(css='style.css') as demo:
                 ''')
         with gr.Column():
             result = gr.Video(label='Result')
+            result.style(height=512, width=512)
+            with gr.Accordion('FateZero Parameters for attention fusing', open=True):
+                cross_replace_steps = gr.Slider(label='cross-attention replace steps',
+                                                info='More steps, replace more cross attention to preserve semantic layout.',
+                                                minimum=0.0,
+                                                maximum=1.0,
+                                                step=0.1,
+                                                value=0.7)
+
+                self_replace_steps = gr.Slider(label='self-attention replace steps',
+                                               info='More steps, replace more spatial-temporal self-attention to preserve geometry and motion.',
+                                               minimum=0.0,
+                                               maximum=1.0,
+                                               step=0.1,
+                                               value=0.7)
+
+                enhance_words = gr.Textbox(label='words to be enhanced',
+                                           info='Amplify the target-words cross attention',
+                                           max_lines=1,
+                                           placeholder='Example: "watercolor "',
+                                           value='watercolor')
+
+                enhance_words_value = gr.Slider(label='Amplify the target cross-attention',
+                                                info='larger value, more elements of target words',
+                                                minimum=0.0,
+                                                maximum=20.0,
+                                                step=1,
+                                                value=10)
     with gr.Row():
         examples = [
             [
@@ -190,6 +236,8 @@ with gr.Blocks(css='style.css') as demo:
         enhance_words_value,
         num_steps,
         guidance_scale,
+        user_input_video,
+        *ImageSequenceDataset_list
     ]
     # prompt.submit(fn=pipe.run, inputs=inputs, outputs=result)
     target_prompt.submit(fn=merge_config_then_run, inputs=inputs, outputs=result)
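The UI changes rely on Gradio passing component values to the callback positionally: ImageSequenceDataset_list is unpacked into the inputs list, so its order has to mirror the new parameters appended to merge_config_then_run. Below is a minimal self-contained sketch of that wiring pattern, with a stub callback standing in for merge_config_then_run; it illustrates the pattern under those assumptions and is not the app itself.

import gradio as gr


def run(target_prompt, start_sample_frame, n_sample_frame, stride,
        left_crop, right_crop, top_crop, bottom_crop):
    # Stub callback: echoes the sampling and crop settings it received.
    return (f"{target_prompt}: start={start_sample_frame}, frames={n_sample_frame}, "
            f"stride={stride}, crop=L{left_crop}/R{right_crop}/T{top_crop}/B{bottom_crop}")


with gr.Blocks() as demo:
    target_prompt = gr.Textbox(label='Target Prompt')
    start_sample_frame = gr.Number(label='Start frame in the video', value=0, precision=0)
    n_sample_frame = gr.Slider(label='Number of Frames in Video', minimum=0, maximum=32, step=1, value=8)
    stride = gr.Slider(label='Temporal sampling stride in Video', minimum=0, maximum=20, step=1, value=1)
    offset_list = [gr.Number(label=f'{side} crop', value=0, precision=0)
                   for side in ('Left', 'Right', 'Top', 'Bottom')]
    # The list order must match the parameter order in the callback signature.
    ImageSequenceDataset_list = [start_sample_frame, n_sample_frame, stride] + offset_list
    result = gr.Textbox(label='Result')
    target_prompt.submit(fn=run, inputs=[target_prompt, *ImageSequenceDataset_list], outputs=result)

# demo.launch()  # uncomment to try the sketch locally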
inference_fatezero.py  CHANGED
@@ -2,6 +2,7 @@
 from FateZero.test_fatezero import *
 
 import copy
+import gradio as gr
 
 
 def merge_config_then_run(
@@ -14,7 +15,17 @@ def merge_config_then_run(
     enhance_words,
     enhance_words_value,
     num_steps,
-    guidance_scale
+    guidance_scale,
+    user_input_video,
+
+    # Temporal and spatial crop of the video
+    start_sample_frame,
+    n_sample_frame,
+    stride,
+    left_crop,
+    right_crop,
+    top_crop,
+    bottom_crop,
 ):
     # , ] = inputs
     default_edit_config='FateZero/config/low_resource_teaser/jeep_watercolor_ddim_10_steps.yaml'
@@ -26,6 +37,24 @@ def merge_config_then_run(
     # config_now['pretrained_model_path'] = model_id
     config_now['train_dataset']['prompt'] = source_prompt
     config_now['train_dataset']['path'] = data_path
+    # ImageSequenceDataset_dict = { }
+    offset_dict = {
+        "left": left_crop,
+        "right": right_crop,
+        "top": top_crop,
+        "bottom": bottom_crop,
+    }
+    ImageSequenceDataset_dict = {
+        "start_sample_frame" : start_sample_frame,
+        "n_sample_frame" : n_sample_frame,
+        "stride" : stride,
+        "offset": offset_dict,
+    }
+    config_now['train_dataset'].update(ImageSequenceDataset_dict)
+    if user_input_video and data_path is None:
+        raise gr.Error('You need to upload a video or choose a provided video')
+    if user_input_video is not None and user_input_video.name is not None:
+        config_now['train_dataset']['path'] = user_input_video.name
     config_now['validation_sample_logger_config']['prompts'] = [target_prompt]
 
 
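On the inference side, the crop and sampling values are folded into the train_dataset section of the loaded config before the edit runs. The following is a minimal sketch of that merge, with a plain dict standing in for the YAML config and example values standing in for the Gradio inputs; the merged keys are presumably consumed by ImageSequenceDataset as extra keyword arguments.

# Plain dict standing in for the config loaded from
# FateZero/config/low_resource_teaser/jeep_watercolor_ddim_10_steps.yaml.
config_now = {
    'train_dataset': {
        'path': 'FateZero/data/teaser_car-turn',
        'prompt': 'a silver jeep driving down a curvy road in the countryside',
    }
}

# Example crop/sampling values as they would arrive from the Gradio components.
offset_dict = {"left": 0, "right": 0, "top": 0, "bottom": 0}
ImageSequenceDataset_dict = {
    "start_sample_frame": 0,
    "n_sample_frame": 8,
    "stride": 1,
    "offset": offset_dict,
}
config_now['train_dataset'].update(ImageSequenceDataset_dict)

print(config_now['train_dataset'])
# {'path': ..., 'prompt': ..., 'start_sample_frame': 0, 'n_sample_frame': 8,
#  'stride': 1, 'offset': {'left': 0, 'right': 0, 'top': 0, 'bottom': 0}}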