vinesmsuic committed
Commit 3350655
Parent: cc6a3a0
Files changed (2)
  1. app.py +203 -192
  2. gradio_demo.py +203 -192
app.py CHANGED
@@ -14,7 +14,6 @@ from PIL import Image
 import torch
 import numpy as np
 
-
 from black_box_image_edit.instructpix2pix import InstructPix2Pix
 from prepare_video import crop_and_resize_video
 from edit_image import infer_video
@@ -28,198 +27,210 @@ from diffusers import DDIMInverseScheduler, DDIMScheduler
 from diffusers.utils import load_image
 import imageio
 
+DEBUG_MODE = False
 
 demo_examples = [
-    ["./demo/A kitten turning its head on a wooden floor.mp4", "./demo/A kitten turning its head on a wooden floor/edited_first_frame/A dog turning its head on a wooden floor.png", "Dog turning its head"],
-    ["./demo/An Old Man Doing Exercises For The Body And Mind.mp4", "./demo/An Old Man Doing Exercises For The Body And Mind/edited_first_frame/jack ma.png", "A Man Doing Exercises For The Body And Mind"],
-    ["./demo/Ballet.mp4", "./demo/Ballet/edited_first_frame/van gogh style.png", "Girl dancing ballet"],
+    ["./demo/Man Walking.mp4", "./demo/Man Walking/edited_first_frame/turn the man into darth vader.png", "darth vader walking", 0.1, 0.1, 1.0],
+    ["./demo/A kitten turning its head on a wooden floor.mp4", "./demo/A kitten turning its head on a wooden floor/edited_first_frame/A dog turning its head on a wooden floor.png", "A dog turning its head on a wooden floor", 0.2, 0.2, 0.5],
+    ["./demo/An Old Man Doing Exercises For The Body And Mind.mp4", "./demo/An Old Man Doing Exercises For The Body And Mind/edited_first_frame/jack ma.png", "a man doing exercises for the body and mind", 0.8, 0.8, 1.0],
+    ["./demo/Ballet.mp4", "./demo/Ballet/edited_first_frame/van gogh style.png", "girl dancing ballet, in the style of van gogh", 1.0, 1.0, 1.0],
+    ["./demo/A Couple In A Public Display Of Affection.mp4", "./demo/A Couple In A Public Display Of Affection/edited_first_frame/Snowing.png", "A couple in a public display of affection, snowing", 0.3, 0.3, 1.0]
 ]
 
 TEMP_DIR = "_demo_temp"
 
-#================================================================================================
-image_edit_model = InstructPix2Pix()
-
-@torch.no_grad()
-@spaces.GPU(duration=30)
-def perform_edit(video_path, prompt, force_512=False, seed=42, negative_prompt=""):
-    edited_image_path = infer_video(image_edit_model,
-                                    video_path,
-                                    output_dir=TEMP_DIR,
-                                    prompt=prompt,
-                                    prompt_type="instruct",
-                                    force_512=force_512,
-                                    seed=seed,
-                                    negative_prompt=negative_prompt,
-                                    overwrite=True)
-    return edited_image_path
-#================================================================================================
-
-config = {
-    # DDIM inversion
-    "inverse_config": {
-        "image_size": [512, 512],
-        "n_frames": 16,
-        "cfg": 1.0,
-        "target_fps": 8,
-        "ddim_inv_prompt": "",
-        "prompt": "",
-        "negative_prompt": "",
-    },
-    "pnp_config": {
-        "random_ratio": 0.0,
-        "target_fps": 8,
-    },
-}
-config = OmegaConf.create(config)
-
-# Initialize the I2VGenXL pipeline
-pipe = I2VGenXLPipeline.from_pretrained(
-    "ali-vilab/i2vgen-xl",
-    torch_dtype=torch.float16,
-    variant="fp16",
-).to("cuda:0")
-
-# Initialize the DDIM inverse scheduler
-inverse_scheduler = DDIMInverseScheduler.from_pretrained(
-    "ali-vilab/i2vgen-xl",
-    subfolder="scheduler",
-)
-# Initialize the DDIM scheduler
-ddim_scheduler = DDIMScheduler.from_pretrained(
-    "ali-vilab/i2vgen-xl",
-    subfolder="scheduler",
-)
-
-@torch.no_grad()
-@spaces.GPU(duration=150)
-def perform_anyv2v(
-    video_path,
-    video_prompt,
-    video_negative_prompt,
-    edited_first_frame_path,
-    conv_inj,
-    spatial_inj,
-    temp_inj,
-    num_inference_steps,
-    guidance_scale,
-    ddim_init_latents_t_idx,
-    ddim_inversion_steps,
-    seed,
-):
-
-    tmp_dir = os.path.join(TEMP_DIR, "AnyV2V")
-    if os.path.exists(tmp_dir):
-        shutil.rmtree(tmp_dir)
-    os.makedirs(tmp_dir)
-
-    ddim_latents_path = os.path.join(tmp_dir, "ddim_latents")
-
-    def read_frames(video_path):
-        frames = []
-        with imageio.get_reader(video_path) as reader:
-            for i, frame in enumerate(reader):
-                pil_image = Image.fromarray(frame)
-                frames.append(pil_image)
-        return frames
-    frame_list = read_frames(str(video_path))
-
-    config.inverse_config.image_size = list(frame_list[0].size)
-    config.inverse_config.n_steps = ddim_inversion_steps
-    config.inverse_config.n_frames = len(frame_list)
-    config.inverse_config.output_dir = ddim_latents_path
-    ddim_init_latents_t_idx = min(ddim_init_latents_t_idx, num_inference_steps - 1)
-
-    # Step 1. DDIM Inversion
-    first_frame = frame_list[0]
-
-    generator = torch.Generator(device="cuda:0")
-    generator = generator.manual_seed(seed)
-    _ddim_latents = ddim_inversion(
-        config.inverse_config,
-        first_frame,
-        frame_list,
-        pipe,
-        inverse_scheduler,
-        generator,
-    )
-
-    # Step 2. DDIM Sampling + PnP feature and attention injection
-    # Load the edited first frame
-    edited_1st_frame = load_image(edited_first_frame_path).resize(
-        config.inverse_config.image_size, resample=Image.Resampling.LANCZOS
-    )
-    # Load the initial latents at t
-    ddim_scheduler.set_timesteps(num_inference_steps)
-    print(f"ddim_scheduler.timesteps: {ddim_scheduler.timesteps}")
-    ddim_latents_at_t = load_ddim_latents_at_t(
-        ddim_scheduler.timesteps[ddim_init_latents_t_idx],
-        ddim_latents_path=ddim_latents_path,
-    )
-    print(
-        f"ddim_scheduler.timesteps[t_idx]: {ddim_scheduler.timesteps[ddim_init_latents_t_idx]}"
-    )
-    print(f"ddim_latents_at_t.shape: {ddim_latents_at_t.shape}")
-
-    # Blend the latents
-    random_latents = torch.randn_like(ddim_latents_at_t)
-    print(
-        f"Blending random_ratio (1 means random latent): {config.pnp_config.random_ratio}"
-    )
-    mixed_latents = (
-        random_latents * config.pnp_config.random_ratio
-        + ddim_latents_at_t * (1 - config.pnp_config.random_ratio)
-    )
-
-    # Init Pnp
-    config.pnp_config.n_steps = num_inference_steps
-    config.pnp_config.pnp_f_t = conv_inj
-    config.pnp_config.pnp_spatial_attn_t = spatial_inj
-    config.pnp_config.pnp_temp_attn_t = temp_inj
-    config.pnp_config.ddim_init_latents_t_idx = ddim_init_latents_t_idx
-    init_pnp(pipe, ddim_scheduler, config.pnp_config)
-    # Edit video
-    pipe.register_modules(scheduler=ddim_scheduler)
-
-    edited_video = pipe.sample_with_pnp(
-        prompt=video_prompt,
-        image=edited_1st_frame,
-        height=config.inverse_config.image_size[1],
-        width=config.inverse_config.image_size[0],
-        num_frames=config.inverse_config.n_frames,
-        num_inference_steps=config.pnp_config.n_steps,
-        guidance_scale=guidance_scale,
-        negative_prompt=video_negative_prompt,
-        target_fps=config.pnp_config.target_fps,
-        latents=mixed_latents,
-        generator=generator,
-        return_dict=True,
-        ddim_init_latents_t_idx=ddim_init_latents_t_idx,
-        ddim_inv_latents_path=ddim_latents_path,
-        ddim_inv_prompt=config.inverse_config.ddim_inv_prompt,
-        ddim_inv_1st_frame=first_frame,
-    ).frames[0]
-
-    edited_video = [
-        frame.resize(config.inverse_config.image_size, resample=Image.LANCZOS)
-        for frame in edited_video
-    ]
-
-    def images_to_video(images, output_path, fps=24):
-        writer = imageio.get_writer(output_path, fps=fps)
-
-        for img in images:
-            img_np = np.array(img)
-            writer.append_data(img_np)
-
-        writer.close()
-    output_path = os.path.join(tmp_dir, "edited_video.mp4")
-    images_to_video(
-        edited_video, output_path, fps=config.pnp_config.target_fps
-    )
-    return output_path
-#================================================================================================
+class ImageEditor:
+    def __init__(self) -> None:
+        self.image_edit_model = InstructPix2Pix()
+
+    @torch.no_grad()
+    @spaces.GPU(duration=60)
+    def perform_edit(self, video_path, prompt, force_512=False, seed=42, negative_prompt=""):
+        edited_image_path = infer_video(self.image_edit_model,
+                                        video_path,
+                                        output_dir=TEMP_DIR,
+                                        prompt=prompt,
+                                        prompt_type="instruct",
+                                        force_512=force_512,
+                                        seed=seed,
+                                        negative_prompt=negative_prompt,
+                                        overwrite=True)
+        return edited_image_path
+
+class AnyV2V_I2VGenXL:
+    def __init__(self) -> None:
+        # Set up default inversion config file
+        config = {
+            # DDIM inversion
+            "inverse_config": {
+                "image_size": [512, 512],
+                "n_frames": 16,
+                "cfg": 1.0,
+                "target_fps": 8,
+                "ddim_inv_prompt": "",
+                "prompt": "",
+                "negative_prompt": "",
+            },
+            "pnp_config": {
+                "random_ratio": 0.0,
+                "target_fps": 8,
+            },
+        }
+        self.config = OmegaConf.create(config)
+
+    @torch.no_grad()
+    @spaces.GPU(duration=150)
+    def perform_anyv2v(self,
+                       video_path,
+                       video_prompt,
+                       video_negative_prompt,
+                       edited_first_frame_path,
+                       conv_inj,
+                       spatial_inj,
+                       temp_inj,
+                       num_inference_steps,
+                       guidance_scale,
+                       ddim_init_latents_t_idx,
+                       ddim_inversion_steps,
+                       seed,
+                       ):
+
+        # Initialize the I2VGenXL pipeline
+        self.pipe = I2VGenXLPipeline.from_pretrained(
+            "ali-vilab/i2vgen-xl",
+            torch_dtype=torch.float16,
+            variant="fp16",
+        ).to("cuda:0")
+
+        # Initialize the DDIM inverse scheduler
+        self.inverse_scheduler = DDIMInverseScheduler.from_pretrained(
+            "ali-vilab/i2vgen-xl",
+            subfolder="scheduler",
+        )
+        # Initialize the DDIM scheduler
+        self.ddim_scheduler = DDIMScheduler.from_pretrained(
+            "ali-vilab/i2vgen-xl",
+            subfolder="scheduler",
+        )
+
+        tmp_dir = os.path.join(TEMP_DIR, "AnyV2V")
+        if os.path.exists(tmp_dir):
+            shutil.rmtree(tmp_dir)
+        os.makedirs(tmp_dir)
+
+        ddim_latents_path = os.path.join(tmp_dir, "ddim_latents")
+
+        def read_frames(video_path):
+            frames = []
+            with imageio.get_reader(video_path) as reader:
+                for i, frame in enumerate(reader):
+                    pil_image = Image.fromarray(frame)
+                    frames.append(pil_image)
+            return frames
+        frame_list = read_frames(str(video_path))
+
+        self.config.inverse_config.image_size = list(frame_list[0].size)
+        self.config.inverse_config.n_steps = ddim_inversion_steps
+        self.config.inverse_config.n_frames = len(frame_list)
+        self.config.inverse_config.output_dir = ddim_latents_path
+        ddim_init_latents_t_idx = min(ddim_init_latents_t_idx, num_inference_steps - 1)
+
+        # Step 1. DDIM Inversion
+        first_frame = frame_list[0]
+
+        generator = torch.Generator(device="cuda:0")
+        generator = generator.manual_seed(seed)
+        _ddim_latents = ddim_inversion(
+            self.config.inverse_config,
+            first_frame,
+            frame_list,
+            self.pipe,
+            self.inverse_scheduler,
+            generator,
+        )
+
+        # Step 2. DDIM Sampling + PnP feature and attention injection
+        # Load the edited first frame
+        edited_1st_frame = load_image(edited_first_frame_path).resize(
+            self.config.inverse_config.image_size, resample=Image.Resampling.LANCZOS
+        )
+        # Load the initial latents at t
+        self.ddim_scheduler.set_timesteps(num_inference_steps)
+        print(f"ddim_scheduler.timesteps: {self.ddim_scheduler.timesteps}")
+        ddim_latents_at_t = load_ddim_latents_at_t(
+            self.ddim_scheduler.timesteps[ddim_init_latents_t_idx],
+            ddim_latents_path=ddim_latents_path,
+        )
+        print(
+            f"ddim_scheduler.timesteps[t_idx]: {self.ddim_scheduler.timesteps[ddim_init_latents_t_idx]}"
+        )
+        print(f"ddim_latents_at_t.shape: {ddim_latents_at_t.shape}")
+
+        # Blend the latents
+        random_latents = torch.randn_like(ddim_latents_at_t)
+        print(
+            f"Blending random_ratio (1 means random latent): {self.config.pnp_config.random_ratio}"
+        )
+        mixed_latents = (
+            random_latents * self.config.pnp_config.random_ratio
+            + ddim_latents_at_t * (1 - self.config.pnp_config.random_ratio)
+        )
+
+        # Init Pnp
+        self.config.pnp_config.n_steps = num_inference_steps
+        self.config.pnp_config.pnp_f_t = conv_inj
+        self.config.pnp_config.pnp_spatial_attn_t = spatial_inj
+        self.config.pnp_config.pnp_temp_attn_t = temp_inj
+        self.config.pnp_config.ddim_init_latents_t_idx = ddim_init_latents_t_idx
+        init_pnp(self.pipe, self.ddim_scheduler, self.config.pnp_config)
+        # Edit video
+        self.pipe.register_modules(scheduler=self.ddim_scheduler)
+
+        edited_video = self.pipe.sample_with_pnp(
+            prompt=video_prompt,
+            image=edited_1st_frame,
+            height=self.config.inverse_config.image_size[1],
+            width=self.config.inverse_config.image_size[0],
+            num_frames=self.config.inverse_config.n_frames,
+            num_inference_steps=self.config.pnp_config.n_steps,
+            guidance_scale=guidance_scale,
+            negative_prompt=video_negative_prompt,
+            target_fps=self.config.pnp_config.target_fps,
+            latents=mixed_latents,
+            generator=generator,
+            return_dict=True,
+            ddim_init_latents_t_idx=ddim_init_latents_t_idx,
+            ddim_inv_latents_path=ddim_latents_path,
+            ddim_inv_prompt=self.config.inverse_config.ddim_inv_prompt,
+            ddim_inv_1st_frame=first_frame,
+        ).frames[0]
+
+        edited_video = [
+            frame.resize(self.config.inverse_config.image_size, resample=Image.LANCZOS)
+            for frame in edited_video
+        ]
+
+        def images_to_video(images, output_path, fps=24):
+            writer = imageio.get_writer(output_path, fps=fps)
+
+            for img in images:
+                img_np = np.array(img)
+                writer.append_data(img_np)
+
+            writer.close()
+        output_path = os.path.join(tmp_dir, "edited_video.mp4")
+        images_to_video(
+            edited_video, output_path, fps=self.config.pnp_config.target_fps
+        )
+        return output_path
+
+
+# Init the class
+#=====================================
+if not DEBUG_MODE:
+    Image_Editor = ImageEditor()
+    AnyV2V_Editor = AnyV2V_I2VGenXL()
+#=====================================
 
 def btn_preprocess_video_fn(video_path, width, height, start_time, end_time, center_crop, x_offset, y_offset, longest_to_width):
     def check_video(video_path):
@@ -256,7 +267,7 @@ def btn_image_edit_fn(video_path, instruct_prompt, ie_force_512, ie_seed, ie_neg
         ie_seed = int.from_bytes(os.urandom(2), "big")
         print(f"Using seed: {ie_seed}")
 
-    edited_image_path = perform_edit(video_path=video_path,
+    edited_image_path = Image_Editor.perform_edit(video_path=video_path,
                                      prompt=instruct_prompt,
                                      force_512=ie_force_512,
                                      seed=ie_seed,
@@ -281,7 +292,7 @@ def btn_infer_fn(video_path,
         seed = int.from_bytes(os.urandom(2), "big")
         print(f"Using seed: {seed}")
 
-    result_video_path = perform_anyv2v(video_path=video_path,
+    result_video_path = AnyV2V_Editor.perform_anyv2v(video_path=video_path,
                                        video_prompt=video_prompt,
                                        video_negative_prompt=video_negative_prompt,
                                        edited_first_frame_path=edited_first_frame_path,
@@ -334,11 +345,11 @@ with gr.Blocks() as demo:
            ie_force_512 = gr.Checkbox(label="Force resize to 512x512 before feeding into the image editing model")
 
        with gr.Column():
-            gr.Markdown("# AnyV2V Stage")
+            gr.Markdown("# Video Editing Stage")
            gr.Markdown("Enjoy the full control of the video editing process using the edited image and the preprocessed video! Click on the Run AnyV2V button after inputting the video description prompt. Try tweak with the setting if the output does not satisfy you!")
            video_output = gr.Video(label="Video Output")
            video_prompt = gr.Textbox(label="Video description prompt")
-            btn_infer = gr.Button("Run AnyV2V")
+            btn_infer = gr.Button("Run Video Editing")
            settings_anyv2v = gr.Accordion("Settings for AnyV2V")
            with settings_anyv2v:
                with gr.Column():
@@ -357,8 +368,8 @@ with gr.Blocks() as demo:
 
 
    examples = gr.Examples(examples=demo_examples,
-                           label="Examples (Just click on AnyV2V button after loading them into the UI)",
-                           inputs=[video_input, image_input_output, video_prompt])
+                           label="Examples (Just click on Video Editing button after loading them into the UI)",
+                           inputs=[video_input, image_input_output, video_prompt, av_pnp_f_t, av_pnp_spatial_attn_t, av_pnp_temp_attn_t])
 
    btn_pv.click(
        btn_preprocess_video_fn,
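
The net effect of the diff above is that model construction moves out of module scope: the InstructPix2Pix editor and the I2VGen-XL pipeline now live inside the `ImageEditor` and `AnyV2V_I2VGenXL` classes, the `@spaces.GPU`-decorated methods are the only entry points that touch CUDA, and instantiation is gated behind a `DEBUG_MODE` flag. A minimal sketch of that pattern, with a dummy stand-in for the real models (`EditorSketch` and `DummyModel` are illustrative names, not part of this commit):

```python
import spaces  # Hugging Face ZeroGPU helper, already imported by app.py
import torch


class DummyModel:
    """Stand-in for InstructPix2Pix / I2VGenXLPipeline; illustrative only."""

    def __call__(self, prompt: str) -> str:
        return f"edited({prompt})"


class EditorSketch:
    def __init__(self) -> None:
        # Cheap construction at import time; nothing touches CUDA here.
        self.model = DummyModel()

    @torch.no_grad()
    @spaces.GPU(duration=60)  # a GPU is attached only for the duration of this call
    def perform(self, prompt: str) -> str:
        # In app.py this is where the heavy work (and, for AnyV2V_I2VGenXL,
        # the pipeline loading) happens, inside the GPU context.
        return self.model(prompt)


DEBUG_MODE = False
if not DEBUG_MODE:  # same gate the commit adds around Image_Editor / AnyV2V_Editor
    editor = EditorSketch()
    print(editor.perform("a dog turning its head on a wooden floor"))
```

With `DEBUG_MODE = True`, the module can presumably be imported and the Gradio layout exercised without building any models, although the callbacks that reference the editors would not work in that mode.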
gradio_demo.py CHANGED
The diff for gradio_demo.py is identical, hunk for hunk, to the app.py diff above (+203 -192).
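
For reference, a sketch of how the two refactored entry points chain together outside the Gradio callbacks. The method names and keyword arguments come from the diff above; the concrete values (prompts, steps, guidance scale, latent index) are assumptions for illustration, not values taken from this commit.

```python
# Illustrative two-stage call, mirroring btn_image_edit_fn and btn_infer_fn.
# Assumes DEBUG_MODE is False so app.py exposes Image_Editor and AnyV2V_Editor.
from app import Image_Editor, AnyV2V_Editor

# Stage 1: edit the first frame of the source clip with InstructPix2Pix.
edited_frame_path = Image_Editor.perform_edit(
    video_path="./demo/Ballet.mp4",
    prompt="make it van gogh style",  # assumed instruction prompt
    force_512=True,
    seed=42,
    negative_prompt="",
)

# Stage 2: propagate the edit through the video with I2VGen-XL + PnP injection.
result_video_path = AnyV2V_Editor.perform_anyv2v(
    video_path="./demo/Ballet.mp4",
    video_prompt="girl dancing ballet, in the style of van gogh",
    video_negative_prompt="",
    edited_first_frame_path=edited_frame_path,
    conv_inj=1.0,               # injection ratios taken from the Ballet example row
    spatial_inj=1.0,
    temp_inj=1.0,
    num_inference_steps=50,     # assumed; slider defaults are not shown in this diff
    guidance_scale=9.0,         # assumed
    ddim_init_latents_t_idx=0,  # assumed
    ddim_inversion_steps=100,   # assumed
    seed=42,
)
print(result_video_path)
```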