chenyangqi committed
Commit a8b3fe4
Parent(s): 556db34

add preview for input video

FateZero/data/.gitignore CHANGED
@@ -2,4 +2,6 @@
 !teaser_car-turn
 !teaser_car-turn/*
 !.gitignore
-!download.sh
+!download.sh
+!*.mp4
+!*/*.mp4
FateZero/data/teaser_car-turn.mp4 ADDED
Binary file (147 kB).
FateZero/script/png_to_mp4.py ADDED
@@ -0,0 +1,21 @@
+import imageio
+from glob import glob
+
+
+def png_to_mp4(example_input_path, out_path=None):
+    # Default: write the video next to the input frame folder.
+    if out_path is None:
+        out_path = example_input_path + '.mp4'
+    # Collect the frame images (*.png, *.jpg, ...) in sorted order.
+    png_list = sorted(glob(example_input_path + '/*.*g'))
+    frames = [imageio.imread(png) for png in png_list]
+    # Encode the frames as an MP4 video at 10 fps.
+    imageio.mimsave(out_path, frames, fps=10)
+
+
+video_all_folder = '/home/cqiaa/diffusion/hugging_face/FateZero/FateZero/data/style'
+video_list = glob(video_all_folder + '/*')
+for example_input_path in video_list:
+    print(example_input_path)
+    out_path = example_input_path + '.mp4'
+    png_to_mp4(example_input_path, out_path)
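The script above batch-converts every frame folder under FateZero/data/style into a sibling .mp4, which is what the updated examples point at. For a one-off conversion the helper can also be called directly; a minimal sketch, assuming png_to_mp4 is in scope (the blackswan path is illustrative):

# Convert a single frame folder to a sibling MP4 file.
example_input_path = 'data/style/blackswan'
out_path = example_input_path + '.mp4'  # -> data/style/blackswan.mp4
png_to_mp4(example_input_path, out_path)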
app_fatezero.py CHANGED
@@ -6,24 +6,8 @@ import os
 
 import gradio as gr
 
-# from inference import InferencePipeline
-# from FateZero import test_fatezero
 from inference_fatezero import merge_config_then_run
 
-# class InferenceUtil:
-#     def __init__(self, hf_token: str | None):
-#         self.hf_token = hf_token
-
-#     def load_model_info(self, model_id: str) -> tuple[str, str]:
-#         # todo FIXME
-#         try:
-#             card = InferencePipeline.get_model_card(model_id, self.hf_token)
-#         except Exception:
-#             return '', ''
-#         base_model = getattr(card.data, 'base_model', '')
-#         training_prompt = getattr(card.data, 'training_prompt', '')
-#         return base_model, training_prompt
-
 
 # TITLE = '# [FateZero](http://fate-zero-edit.github.io/)'
 HF_TOKEN = os.getenv('HF_TOKEN')
@@ -93,32 +77,33 @@ with gr.Blocks(css='style.css') as demo:
         </span>
         </h2>
         <h2 style="font-weight: 450; font-size: 1rem; margin-top: 0.5rem; margin-bottom: 0.5rem">
-        FateZero is a first zero-shot framework for text-driven video editing via pretrained diffusion models without training.
+        FateZero is the first zero-shot framework for text-driven video editing via pretrained diffusion models without training.
         </h2>
         </div>
         """)
 
 
     gr.HTML("""
-        <p>Note that due to limite of memory and computing resource on hugging face, the results here are only toy examples and takes longer time to edit.
-        <p>For better performance and faster inference without waiting in queue, you may duplicate the space and upgrade to GPU in settings.
+        <p>Note that due to the limits of memory and computing resources on Hugging Face, the results here are only toy examples and take longer to edit.
+        <p>You may duplicate the space and upgrade to a GPU in settings for better performance and faster inference without waiting in the queue.
         <br/>
         <a href="https://huggingface.co/spaces/chenyangqi/FateZero?duplicate=true">
         <img style="margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>
-        <p>Or try our github <a href=https://github.com/ChenyangQiQi/FateZero> code </a> on your own GPU.
+        <p>Alternatively, try our GitHub <a href=https://github.com/ChenyangQiQi/FateZero> code </a> on your GPU.
         </p>""")
 
     with gr.Row():
         with gr.Column():
             with gr.Accordion('Input Video', open=True):
-                user_input_video = gr.File(label='Input Source Video')
+                # user_input_video = gr.File(label='Input Source Video')
+                user_input_video = gr.Video(label='Input Source Video', source='upload', type='numpy', format="mp4", visible=True).style(height="auto")
             with gr.Accordion('Temporal Crop offset and Sampling Stride', open=False):
-                n_sample_frame = gr.Slider(label='Number of Frames in Video',
+                n_sample_frame = gr.Slider(label='Number of Frames',
                                            minimum=0,
                                            maximum=32,
                                            step=1,
                                            value=8)
-                stride = gr.Slider(label='Temporal sampling stride in Video',
+                stride = gr.Slider(label='Temporal stride',
                                    minimum=0,
                                    maximum=20,
                                    step=1,
@@ -153,16 +138,6 @@ with gr.Blocks(css='style.css') as demo:
                 stride
             ] + offset_list
 
-
-            data_path = gr.Dropdown(
-                label='Or use provided data in our paper',
-                choices=[
-                    'FateZero/data/teaser_car-turn',
-                    'FateZero/data/style/sunflower',
-                    'FateZero/data/attribute/swan_swarov',
-                    # add shape editing ckpt here
-                ],
-                value='FateZero/data/teaser_car-turn')
             model_id = gr.Dropdown(
                 label='Model ID',
                 choices=[
@@ -170,16 +145,6 @@ with gr.Blocks(css='style.css') as demo:
                     # add shape editing ckpt here
                 ],
                 value='CompVis/stable-diffusion-v1-4')
-            # with gr.Accordion(
-            #         label=
-            #         'Model info (Base model and prompt used for training)',
-            #         open=False):
-            #     with gr.Row():
-            #         base_model_used_for_training = gr.Text(
-            #             label='Base model', interactive=False)
-            #         prompt_used_for_training = gr.Text(
-            #             label='Training prompt', interactive=False)
-
 
 
             with gr.Accordion('Text Prompt', open=True):
@@ -197,91 +162,60 @@ with gr.Blocks(css='style.css') as demo:
 
 
 
-            with gr.Accordion('DDIM Parameters', open=True):
-                num_steps = gr.Slider(label='Number of Steps',
-                                      info='larger value has better editing capacity, but takes more time and memory',
-                                      minimum=0,
-                                      maximum=50,
-                                      step=1,
-                                      value=10)
-                guidance_scale = gr.Slider(label='CFG Scale',
-                                           minimum=0,
-                                           maximum=50,
-                                           step=0.1,
-                                           value=7.5)
+
 
             run_button = gr.Button('Generate')
 
-            # gr.Markdown('''
-            # - It takes a few minutes to download model first.
-            # - Expected time to generate an 8-frame video: 70 seconds with T4, 24 seconds with A10G, (10 seconds with A100)
-            # ''')
-            # gr.Markdown('''
-            # todo
-            # ''')
         with gr.Column():
             result = gr.Video(label='Result')
-            result.style(height=512, width=512)
+            # result.style(height=512, width=512)
            with gr.Accordion('FateZero Parameters for attention fusing', open=True):
-                cross_replace_steps = gr.Slider(label='cross-attention replace steps',
+                cross_replace_steps = gr.Slider(label='Cross-att replace steps',
                                                 info='More steps, replace more cross attention to preserve semantic layout.',
                                                 minimum=0.0,
                                                 maximum=1.0,
                                                 step=0.1,
                                                 value=0.7)
 
-                self_replace_steps = gr.Slider(label='self-attention replace steps',
+                self_replace_steps = gr.Slider(label='Self-att replace steps',
                                                info='More steps, replace more spatial-temporal self-attention to preserve geometry and motion.',
                                                minimum=0.0,
                                                maximum=1.0,
                                                step=0.1,
                                                value=0.7)
 
-                enhance_words = gr.Textbox(label='words to be enhanced',
+                enhance_words = gr.Textbox(label='Enhanced words',
                                            info='Amplify the target-words cross attention',
                                            max_lines=1,
                                            placeholder='Example: "watercolor "',
                                            value='watercolor')
 
-                enhance_words_value = gr.Slider(label='Amplify the target cross-attention',
+                enhance_words_value = gr.Slider(label='Target cross-att amplification',
                                                 info='larger value, more elements of target words',
                                                 minimum=0.0,
                                                 maximum=20.0,
                                                 step=1,
                                                 value=10)
+            with gr.Accordion('DDIM Parameters', open=True):
+                num_steps = gr.Slider(label='Number of Steps',
+                                      info='larger value has better editing capacity, but takes more time and memory. (50 steps may produce memory errors)',
+                                      minimum=0,
+                                      maximum=50,
+                                      step=1,
+                                      value=10)
+                guidance_scale = gr.Slider(label='CFG Scale',
+                                           minimum=0,
+                                           maximum=50,
+                                           step=0.1,
+                                           value=7.5)
     with gr.Row():
         from example import style_example
         examples = style_example
-        # examples = [
-        #     [
-        #         'CompVis/stable-diffusion-v1-4',
-        #         'FateZero/data/teaser_car-turn',
-        #         'a silver jeep driving down a curvy road in the countryside',
-        #         'watercolor painting of a silver jeep driving down a curvy road in the countryside',
-        #         0.8,
-        #         0.8,
-        #         "watercolor",
-        #         10,
-        #         10,
-        #         7.5,
-        #     ],
-        #     [
-        #         'CompVis/stable-diffusion-v1-4',
-        #         'FateZero/data/style/sunflower',
-        #         'a yellow sunflower',
-        #         'van gogh style painting of a yellow sunflower',
-        #         0.5,
-        #         0.5,
-        #         'van gogh',
-        #         10,
-        #         10,
-        #         7.5,
-        #     ],
-        # ]
+
         gr.Examples(examples=examples,
                     inputs=[
                         model_id,
-                        data_path,
+                        user_input_video,
                         source_prompt,
                         target_prompt,
                         cross_replace_steps,
@@ -299,15 +233,9 @@ with gr.Blocks(css='style.css') as demo:
                     # cache_examples=os.getenv('SYSTEM') == 'spaces'
                     )
 
-    # model_id.change(fn=app.load_model_info,
-    #                 inputs=model_id,
-    #                 outputs=[
-    #                     base_model_used_for_training,
-    #                     prompt_used_for_training,
-    #                 ])
     inputs = [
         model_id,
-        data_path,
+        user_input_video,
         source_prompt,
         target_prompt,
         cross_replace_steps,
@@ -319,9 +247,7 @@ with gr.Blocks(css='style.css') as demo:
         user_input_video,
         *ImageSequenceDataset_list
     ]
-    # prompt.submit(fn=pipe.run, inputs=inputs, outputs=result)
     target_prompt.submit(fn=pipe.run, inputs=inputs, outputs=result)
-    # run_button.click(fn=pipe.run, inputs=inputs, outputs=result)
     run_button.click(fn=pipe.run, inputs=inputs, outputs=result)
 
 demo.queue().launch()
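The component swap above is what adds the preview: gr.File only offered a download widget, while gr.Video renders an inline player for the uploaded clip and passes it to the handler as a temporary file path. A minimal sketch of that pattern in isolation, assuming a Gradio 3.x environment as used by this Space (the echo handler and variable names are illustrative, not from the commit):

import gradio as gr

def echo_path(video):
    # gr.Video delivers the uploaded clip to the handler as a file-path string.
    return video

with gr.Blocks() as demo:
    inp = gr.Video(label='Input Source Video', source='upload', format='mp4')
    out = gr.Video(label='Result')
    run = gr.Button('Generate')
    run.click(fn=echo_path, inputs=inp, outputs=out)
    # Example rows can reference .mp4 paths directly, as example.py now does.
    gr.Examples(examples=[['FateZero/data/teaser_car-turn.mp4']], inputs=[inp])

demo.queue().launch()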
example.py CHANGED
@@ -2,7 +2,7 @@ num_steps = 10
 style_example = [
     [
         'CompVis/stable-diffusion-v1-4',
-        'FateZero/data/teaser_car-turn',
+        'FateZero/data/teaser_car-turn.mp4',
         'a silver jeep driving down a curvy road in the countryside',
         'watercolor painting of a silver jeep driving down a curvy road in the countryside',
         0.8,
@@ -17,7 +17,7 @@ style_example = [
     ],
     [
         'CompVis/stable-diffusion-v1-4',
-        'FateZero/data/style/sunflower',
+        'FateZero/data/style/sunflower.mp4',
         'a yellow sunflower',
         'van gogh style painting of a yellow sunflower',
         0.5,
@@ -30,7 +30,7 @@ style_example = [
     ],
     [
         'CompVis/stable-diffusion-v1-4',
-        'FateZero/data/style/surf',
+        'FateZero/data/style/surf.mp4',
         'a man with round helmet surfing on a white wave in blue ocean with a rope',
         'The Ukiyo-e style painting of a man with round helmet surfing on a white wave in blue ocean with a rope',
         0.9,
@@ -43,7 +43,7 @@ style_example = [
     ],
     [
         'CompVis/stable-diffusion-v1-4',
-        'FateZero/data/style/train',
+        'FateZero/data/style/train.mp4',
         'a train traveling down tracks next to a forest filled with trees and flowers and a man on the side of the track',
         'a train traveling down tracks next to a forest filled with trees and flowers and a man on the side of the track Makoto Shinkai style',
         0.9,
@@ -57,7 +57,7 @@ style_example = [
 
     [
         'CompVis/stable-diffusion-v1-4',
-        'FateZero/data/attribute/swan_swarov',
+        'FateZero/data/attribute/swan_swarov.mp4',
         'a black swan with a red beak swimming in a river near a wall and bushes',
         'a Swarovski crystal swan with a red beak swimming in a river near a wall and bushes',
         0.8,
inference_fatezero.py CHANGED
@@ -92,8 +92,11 @@ class merge_config_then_run():
         config_now['train_dataset'].update(ImageSequenceDataset_dict)
         if user_input_video and data_path is None:
             raise gr.Error('You need to upload a video or choose a provided video')
-        if user_input_video is not None and user_input_video.name is not None:
-            config_now['train_dataset']['path'] = user_input_video.name
+        if user_input_video is not None:
+            if isinstance(user_input_video, str):
+                config_now['train_dataset']['path'] = user_input_video
+            elif hasattr(user_input_video, 'name') and user_input_video.name is not None:
+                config_now['train_dataset']['path'] = user_input_video.name
         config_now['validation_sample_logger_config']['prompts'] = [target_prompt]
 
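The widened branch keeps the run function compatible with both front-ends: the old gr.File component passed a tempfile wrapper exposing .name, while the new gr.Video passes a plain path string. Factored into a standalone helper, the logic reads as below; resolve_video_path is a hypothetical name, not part of the commit:

def resolve_video_path(user_input_video):
    # Hypothetical helper mirroring the branch in merge_config_then_run.
    if user_input_video is None:
        return None
    if isinstance(user_input_video, str):
        # gr.Video: the uploaded clip arrives as a temporary file path.
        return user_input_video
    if getattr(user_input_video, 'name', None) is not None:
        # gr.File: a tempfile-like object exposing the path via .name.
        return user_input_video.name
    return None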