chenyangqi committed
Commit afd7574
1 Parent(s): 0cbd26d

rearrange the spatial layout; add crop to input video
FateZero/video_diffusion/data/dataset.py CHANGED
@@ -4,6 +4,8 @@ import numpy as np
 from PIL import Image
 from einops import rearrange
 from pathlib import Path
+import imageio
+import cv2
 
 import torch
 from torch.utils.data import Dataset
@@ -149,10 +151,27 @@ class ImageSequenceDataset(Dataset):
         frame_start = index
         return (frame_start + i for i in range(self.n_sample_frame))
 
-    @staticmethod
-    def get_image_list(path):
+    # @staticmethod
+    def get_image_list(self, path):
         images = []
+        if path[-4:] == '.mp4':
+            path = self.mp4_to_png(path)
+            self.path = path
+
         for file in sorted(os.listdir(path)):
             if file.endswith(IMAGE_EXTENSION):
                 images.append(file)
         return images
+
+    # @staticmethod
+    def mp4_to_png(self, video_source=None):
+        reader = imageio.get_reader(video_source)
+        os.makedirs(video_source[:-4], exist_ok=True)
+
+        for i, im in enumerate(reader):
+            # use :05d to zero-pad the frame index (no space before the 05d)
+            # if (i+1)%10 == 0:
+            path = os.path.join(video_source[:-4], f"{i:05d}.png")
+            # print(path)
+            cv2.imwrite(path, im[:, :, ::-1])
+        return video_source[:-4]
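
With this change, `get_image_list` accepts either a frame directory or a path ending in `.mp4`; in the latter case `mp4_to_png` extracts the frames once into a sibling directory (the extension-stripped path), and the existing PNG listing then runs on that directory. The `@staticmethod` decorator is commented out because the methods now store the converted path on `self.path`. A minimal standalone sketch of the conversion path, using a hypothetical clip path:

    import os
    import imageio
    import cv2

    video = 'FateZero/data/example_clip.mp4'   # hypothetical input clip
    frame_dir = video[:-4]                     # extension stripped, as in the commit
    os.makedirs(frame_dir, exist_ok=True)

    for i, im in enumerate(imageio.get_reader(video)):
        # imageio yields RGB frames; cv2.imwrite expects BGR, hence the channel flip
        cv2.imwrite(os.path.join(frame_dir, f"{i:05d}.png"), im[:, :, ::-1])

    print(sorted(os.listdir(frame_dir))[:3])   # ['00000.png', '00001.png', '00002.png']

Zero-padding the index keeps the lexicographic sort in `get_image_list` consistent with frame order.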
app_fatezero.py CHANGED
@@ -36,8 +36,59 @@ with gr.Blocks(css='style.css') as demo:
 
     with gr.Row():
         with gr.Column():
-            with gr.Box():
-
+            with gr.Accordion('Input Video', open=True):
+                user_input_video = gr.File(label='Input Source Video')
+                with gr.Accordion('Temporal Crop offset and Sampling Stride', open=False):
+                    n_sample_frame = gr.Slider(label='Number of Frames in Video',
+                                               # info='We test 8 frames in our paper',
+                                               minimum=0,
+                                               maximum=32,
+                                               step=1,
+                                               value=8)
+                    stride = gr.Slider(label='Temporal sampling stride in Video',
+                                       minimum=0,
+                                       maximum=20,
+                                       step=1,
+                                       value=1)
+                    start_sample_frame = gr.Number(label='Start frame in the video',
+                                                   value=0,
+                                                   precision=0)
+
+                with gr.Accordion('Spatial Crop offset', open=False):
+                    left_crop = gr.Number(label='Left crop',
+                                          value=0,
+                                          precision=0)
+                    right_crop = gr.Number(label='Right crop',
+                                           value=0,
+                                           precision=0)
+                    top_crop = gr.Number(label='Top crop',
+                                         value=0,
+                                         precision=0)
+                    bottom_crop = gr.Number(label='Bottom crop',
+                                            value=0,
+                                            precision=0)
+                    offset_list = [
+                        left_crop,
+                        right_crop,
+                        top_crop,
+                        bottom_crop,
+                    ]
+
+                ImageSequenceDataset_list = [
+                    start_sample_frame,
+                    n_sample_frame,
+                    stride
+                ] + offset_list
+
+
+            data_path = gr.Dropdown(
+                label='provided data path',
+                choices=[
+                    'FateZero/data/teaser_car-turn',
+                    'FateZero/data/style/sunflower',
+                    # add shape editing ckpt here
+                ],
+                value='FateZero/data/teaser_car-turn')
             model_id = gr.Dropdown(
                 label='Model ID',
                 choices=[
@@ -55,54 +106,21 @@ with gr.Blocks(css='style.css') as demo:
             # prompt_used_for_training = gr.Text(
             #     label='Training prompt', interactive=False)
 
-            data_path = gr.Dropdown(
-                label='data path',
-                choices=[
-                    'FateZero/data/teaser_car-turn',
-                    'FateZero/data/style/sunflower',
-                    # add shape editing ckpt here
-                ],
-                value='FateZero/data/teaser_car-turn')
 
 
-            source_prompt = gr.Textbox(label='Source Prompt',
-                                       info='A good prompt describes each frame and most objects in video. Especially, it has the object or attribute that we want to edit or preserve.',
-                                       max_lines=1,
-                                       placeholder='Example: "a silver jeep driving down a curvy road in the countryside"',
-                                       value='a silver jeep driving down a curvy road in the countryside')
-            target_prompt = gr.Textbox(label='Target Prompt',
-                                       info='A reasonable composition of video may achieve better results (e.g., a "sunflower" video with a "Van Gogh" prompt works better than with a "Monet" prompt)',
-                                       max_lines=1,
-                                       placeholder='Example: "watercolor painting of a silver jeep driving down a curvy road in the countryside"',
-                                       value='watercolor painting of a silver jeep driving down a curvy road in the countryside')
-
-            cross_replace_steps = gr.Slider(label='cross-attention replace steps',
-                                            info='More steps, replace more cross attention to preserve semantic layout.',
-                                            minimum=0.0,
-                                            maximum=1.0,
-                                            step=0.1,
-                                            value=0.7)
-
-            self_replace_steps = gr.Slider(label='self-attention replace steps',
-                                           info='More steps, replace more spatial-temporal self-attention to preserve geometry and motion.',
-                                           minimum=0.0,
-                                           maximum=1.0,
-                                           step=0.1,
-                                           value=0.7)
-
-            enhance_words = gr.Textbox(label='words to be enhanced',
-                                       info='Amplify the target-words cross attention',
-                                       max_lines=1,
-                                       placeholder='Example: "watercolor "',
-                                       value='watercolor')
-
-            enhance_words_value = gr.Slider(label='Amplify the target cross-attention',
-                                            info='larger value, more elements of target words',
-                                            minimum=0.0,
-                                            maximum=20.0,
-                                            step=1,
-                                            value=10)
+            with gr.Accordion('Text Prompt', open=True):
+
+                source_prompt = gr.Textbox(label='Source Prompt',
+                                           info='A good prompt describes each frame and most objects in video. Especially, it has the object or attribute that we want to edit or preserve.',
+                                           max_lines=1,
+                                           placeholder='Example: "a silver jeep driving down a curvy road in the countryside"',
+                                           value='a silver jeep driving down a curvy road in the countryside')
+                target_prompt = gr.Textbox(label='Target Prompt',
+                                           info='A reasonable composition of video may achieve better results (e.g., a "sunflower" video with a "Van Gogh" prompt works better than with a "Monet" prompt)',
+                                           max_lines=1,
+                                           placeholder='Example: "watercolor painting of a silver jeep driving down a curvy road in the countryside"',
+                                           value='watercolor painting of a silver jeep driving down a curvy road in the countryside')
 
 
             with gr.Accordion('DDIM Parameters', open=True):
@@ -129,6 +147,34 @@ with gr.Blocks(css='style.css') as demo:
             ''')
         with gr.Column():
             result = gr.Video(label='Result')
+            result.style(height=512, width=512)
+            with gr.Accordion('FateZero Parameters for attention fusing', open=True):
+                cross_replace_steps = gr.Slider(label='cross-attention replace steps',
+                                                info='More steps, replace more cross attention to preserve semantic layout.',
+                                                minimum=0.0,
+                                                maximum=1.0,
+                                                step=0.1,
+                                                value=0.7)
+
+                self_replace_steps = gr.Slider(label='self-attention replace steps',
+                                               info='More steps, replace more spatial-temporal self-attention to preserve geometry and motion.',
+                                               minimum=0.0,
+                                               maximum=1.0,
+                                               step=0.1,
+                                               value=0.7)
+
+                enhance_words = gr.Textbox(label='words to be enhanced',
+                                           info='Amplify the target-words cross attention',
+                                           max_lines=1,
+                                           placeholder='Example: "watercolor "',
+                                           value='watercolor')
+
+                enhance_words_value = gr.Slider(label='Amplify the target cross-attention',
+                                                info='larger value, more elements of target words',
+                                                minimum=0.0,
+                                                maximum=20.0,
+                                                step=1,
+                                                value=10)
     with gr.Row():
         examples = [
             [
@@ -190,6 +236,8 @@ with gr.Blocks(css='style.css') as demo:
             enhance_words_value,
             num_steps,
             guidance_scale,
+            user_input_video,
+            *ImageSequenceDataset_list
         ]
         # prompt.submit(fn=pipe.run, inputs=inputs, outputs=result)
         target_prompt.submit(fn=merge_config_then_run, inputs=inputs, outputs=result)
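
The new crop and stride widgets are collected into `ImageSequenceDataset_list` and spread into `inputs` via `*ImageSequenceDataset_list`, so their order (start frame, frame count, stride, then left/right/top/bottom crop) must match the positional parameters of `merge_config_then_run`. A reduced sketch of that wiring pattern, with hypothetical names and a stub callback:

    import gradio as gr

    # Stub callback: parameter order must mirror the component list below.
    def run(prompt, start_sample_frame, n_sample_frame, stride):
        last = start_sample_frame + (n_sample_frame - 1) * stride
        return f'{prompt}: frames {start_sample_frame}..{last}'

    with gr.Blocks() as demo:
        prompt = gr.Textbox(label='Prompt')
        start_sample_frame = gr.Number(label='Start frame', value=0, precision=0)
        n_sample_frame = gr.Slider(label='Number of frames', minimum=0, maximum=32, step=1, value=8)
        stride = gr.Slider(label='Stride', minimum=0, maximum=20, step=1, value=1)
        result = gr.Textbox(label='Result')

        dataset_inputs = [start_sample_frame, n_sample_frame, stride]
        # Unpacking keeps the UI grouping and the callback signature in sync.
        prompt.submit(fn=run, inputs=[prompt, *dataset_inputs], outputs=result)

    # demo.launch()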
inference_fatezero.py CHANGED
@@ -2,6 +2,7 @@
 from FateZero.test_fatezero import *
 
 import copy
+import gradio as gr
 
 
 def merge_config_then_run(
@@ -14,7 +15,17 @@ def merge_config_then_run(
     enhance_words,
     enhance_words_value,
     num_steps,
-    guidance_scale
+    guidance_scale,
+    user_input_video,
+
+    # Temporal and spatial crop of the input video
+    start_sample_frame,
+    n_sample_frame,
+    stride,
+    left_crop,
+    right_crop,
+    top_crop,
+    bottom_crop,
 ):
     # , ] = inputs
     default_edit_config='FateZero/config/low_resource_teaser/jeep_watercolor_ddim_10_steps.yaml'
@@ -26,6 +37,24 @@ def merge_config_then_run(
     # config_now['pretrained_model_path'] = model_id
     config_now['train_dataset']['prompt'] = source_prompt
     config_now['train_dataset']['path'] = data_path
+    # Pack the temporal and spatial crop settings expected by ImageSequenceDataset
+    offset_dict = {
+        "left": left_crop,
+        "right": right_crop,
+        "top": top_crop,
+        "bottom": bottom_crop,
+    }
+    ImageSequenceDataset_dict = {
+        "start_sample_frame": start_sample_frame,
+        "n_sample_frame": n_sample_frame,
+        "stride": stride,
+        "offset": offset_dict,
+    }
+    config_now['train_dataset'].update(ImageSequenceDataset_dict)
+    if user_input_video is None and data_path is None:
+        raise gr.Error('You need to upload a video or choose a provided video')
+    if user_input_video is not None and user_input_video.name is not None:
+        config_now['train_dataset']['path'] = user_input_video.name
     config_now['validation_sample_logger_config']['prompts'] = [target_prompt]
 
 
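
After the `update`, the `train_dataset` section of the merged config carries the crop and stride settings alongside the original prompt and path, and an uploaded video's temporary file path overrides the chosen `data_path`. A sketch of the resulting section, shown as a plain dict with hypothetical values:

    config_now['train_dataset'] = {
        'prompt': 'a silver jeep driving down a curvy road in the countryside',
        'path': '/tmp/gradio/uploaded_clip.mp4',   # hypothetical upload path
        'start_sample_frame': 0,
        'n_sample_frame': 8,
        'stride': 1,
        'offset': {'left': 0, 'right': 0, 'top': 0, 'bottom': 0},
        # any other keys from the default YAML config remain unchanged
    }

Since the path may now end in `.mp4`, this is the value that `get_image_list` in dataset.py detects and converts via `mp4_to_png`.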