vinesmsuic committed on
Commit 26378e3
1 Parent(s): 15186bb
Files changed (2)
  1. app.py +182 -191
  2. gradio_demo.py +182 -191
app.py CHANGED
@@ -30,7 +30,7 @@ import imageio
DEBUG_MODE = False

demo_examples = [
-    ["./demo/Man Walking.mp4", "./demo/Man Walking/edited_first_frame/turn the man into darth vader.png", "darth vader walking", 0.1, 0.1, 1.0],
+    ["./demo/Man Walking.mp4", "./demo/Man Walking/edited_first_frame/turn the man into darth vader.png", "man walking", 0.1, 0.1, 1.0],
    ["./demo/A kitten turning its head on a wooden floor.mp4", "./demo/A kitten turning its head on a wooden floor/edited_first_frame/A dog turning its head on a wooden floor.png", "A dog turning its head on a wooden floor", 0.2, 0.2, 0.5],
    ["./demo/An Old Man Doing Exercises For The Body And Mind.mp4", "./demo/An Old Man Doing Exercises For The Body And Mind/edited_first_frame/jack ma.png", "a man doing exercises for the body and mind", 0.8, 0.8, 1.0],
    ["./demo/Ballet.mp4", "./demo/Ballet/edited_first_frame/van gogh style.png", "girl dancing ballet, in the style of van gogh", 1.0, 1.0, 1.0],
@@ -39,198 +39,189 @@ demo_examples = [

TEMP_DIR = "_demo_temp"

-class ImageEditor:
-    def __init__(self) -> None:
-        self.image_edit_model = InstructPix2Pix()
-
-    @torch.no_grad()
-    @spaces.GPU(duration=30)
-    def perform_edit(self, video_path, prompt, force_512=False, seed=42, negative_prompt=""):
-        edited_image_path = infer_video(self.image_edit_model,
-                                        video_path,
-                                        output_dir=TEMP_DIR,
-                                        prompt=prompt,
-                                        prompt_type="instruct",
-                                        force_512=force_512,
-                                        seed=seed,
-                                        negative_prompt=negative_prompt,
-                                        overwrite=True)
-        return edited_image_path
-
-class AnyV2V_I2VGenXL:
-    def __init__(self) -> None:
-        # Set up default inversion config file
-        config = {
-            # DDIM inversion
-            "inverse_config": {
-                "image_size": [512, 512],
-                "n_frames": 16,
-                "cfg": 1.0,
-                "target_fps": 8,
-                "ddim_inv_prompt": "",
-                "prompt": "",
-                "negative_prompt": "",
-            },
-            "pnp_config": {
-                "random_ratio": 0.0,
-                "target_fps": 8,
-            },
-        }
-        self.config = OmegaConf.create(config)
-
-    @torch.no_grad()
-    @spaces.GPU(duration=150)
-    def perform_anyv2v(self,
-                       video_path,
-                       video_prompt,
-                       video_negative_prompt,
-                       edited_first_frame_path,
-                       conv_inj,
-                       spatial_inj,
-                       temp_inj,
-                       num_inference_steps,
-                       guidance_scale,
-                       ddim_init_latents_t_idx,
-                       ddim_inversion_steps,
-                       seed,
-                       ):
-
-        # Initialize the I2VGenXL pipeline
-        self.pipe = I2VGenXLPipeline.from_pretrained(
-            "ali-vilab/i2vgen-xl",
-            torch_dtype=torch.float16,
-            variant="fp16",
-        ).to("cuda:0")
-
-        # Initialize the DDIM inverse scheduler
-        self.inverse_scheduler = DDIMInverseScheduler.from_pretrained(
-            "ali-vilab/i2vgen-xl",
-            subfolder="scheduler",
-        )
-        # Initialize the DDIM scheduler
-        self.ddim_scheduler = DDIMScheduler.from_pretrained(
-            "ali-vilab/i2vgen-xl",
-            subfolder="scheduler",
-        )
-
-        tmp_dir = os.path.join(TEMP_DIR, "AnyV2V")
-        if os.path.exists(tmp_dir):
-            shutil.rmtree(tmp_dir)
-        os.makedirs(tmp_dir)
-
-        ddim_latents_path = os.path.join(tmp_dir, "ddim_latents")
-
-        def read_frames(video_path):
-            frames = []
-            with imageio.get_reader(video_path) as reader:
-                for i, frame in enumerate(reader):
-                    pil_image = Image.fromarray(frame)
-                    frames.append(pil_image)
-            return frames
-        frame_list = read_frames(str(video_path))
-
-        self.config.inverse_config.image_size = list(frame_list[0].size)
-        self.config.inverse_config.n_steps = ddim_inversion_steps
-        self.config.inverse_config.n_frames = len(frame_list)
-        self.config.inverse_config.output_dir = ddim_latents_path
-        ddim_init_latents_t_idx = min(ddim_init_latents_t_idx, num_inference_steps - 1)
-
-        # Step 1. DDIM Inversion
-        first_frame = frame_list[0]
-
-        generator = torch.Generator(device="cuda:0")
-        generator = generator.manual_seed(seed)
-        _ddim_latents = ddim_inversion(
-            self.config.inverse_config,
-            first_frame,
-            frame_list,
-            self.pipe,
-            self.inverse_scheduler,
-            generator,
-        )
-
-        # Step 2. DDIM Sampling + PnP feature and attention injection
-        # Load the edited first frame
-        edited_1st_frame = load_image(edited_first_frame_path).resize(
-            self.config.inverse_config.image_size, resample=Image.Resampling.LANCZOS
-        )
-        # Load the initial latents at t
-        self.ddim_scheduler.set_timesteps(num_inference_steps)
-        print(f"ddim_scheduler.timesteps: {self.ddim_scheduler.timesteps}")
-        ddim_latents_at_t = load_ddim_latents_at_t(
-            self.ddim_scheduler.timesteps[ddim_init_latents_t_idx],
-            ddim_latents_path=ddim_latents_path,
-        )
-        print(
-            f"ddim_scheduler.timesteps[t_idx]: {self.ddim_scheduler.timesteps[ddim_init_latents_t_idx]}"
-        )
-        print(f"ddim_latents_at_t.shape: {ddim_latents_at_t.shape}")
-
-        # Blend the latents
-        random_latents = torch.randn_like(ddim_latents_at_t)
-        print(
-            f"Blending random_ratio (1 means random latent): {self.config.pnp_config.random_ratio}"
-        )
-        mixed_latents = (
-            random_latents * self.config.pnp_config.random_ratio
-            + ddim_latents_at_t * (1 - self.config.pnp_config.random_ratio)
-        )
-
-        # Init Pnp
-        self.config.pnp_config.n_steps = num_inference_steps
-        self.config.pnp_config.pnp_f_t = conv_inj
-        self.config.pnp_config.pnp_spatial_attn_t = spatial_inj
-        self.config.pnp_config.pnp_temp_attn_t = temp_inj
-        self.config.pnp_config.ddim_init_latents_t_idx = ddim_init_latents_t_idx
-        init_pnp(self.pipe, self.ddim_scheduler, self.config.pnp_config)
-        # Edit video
-        self.pipe.register_modules(scheduler=self.ddim_scheduler)
-
-        edited_video = self.pipe.sample_with_pnp(
-            prompt=video_prompt,
-            image=edited_1st_frame,
-            height=self.config.inverse_config.image_size[1],
-            width=self.config.inverse_config.image_size[0],
-            num_frames=self.config.inverse_config.n_frames,
-            num_inference_steps=self.config.pnp_config.n_steps,
-            guidance_scale=guidance_scale,
-            negative_prompt=video_negative_prompt,
-            target_fps=self.config.pnp_config.target_fps,
-            latents=mixed_latents,
-            generator=generator,
-            return_dict=True,
-            ddim_init_latents_t_idx=ddim_init_latents_t_idx,
-            ddim_inv_latents_path=ddim_latents_path,
-            ddim_inv_prompt=self.config.inverse_config.ddim_inv_prompt,
-            ddim_inv_1st_frame=first_frame,
-        ).frames[0]
-
-        edited_video = [
-            frame.resize(self.config.inverse_config.image_size, resample=Image.LANCZOS)
-            for frame in edited_video
-        ]
-
-        def images_to_video(images, output_path, fps=24):
-            writer = imageio.get_writer(output_path, fps=fps)
-
-            for img in images:
-                img_np = np.array(img)
-                writer.append_data(img_np)
-
-            writer.close()
-        output_path = os.path.join(tmp_dir, "edited_video.mp4")
-        images_to_video(
-            edited_video, output_path, fps=self.config.pnp_config.target_fps
-        )
-        return output_path
-
-
-# Init the class
-#=====================================
-if not DEBUG_MODE:
-    Image_Editor = ImageEditor()
-    AnyV2V_Editor = AnyV2V_I2VGenXL()
-#=====================================
+image_edit_model = InstructPix2Pix()
+
+@torch.no_grad()
+@spaces.GPU(duration=30)
+def perform_edit(video_path, prompt, force_512=False, seed=42, negative_prompt=""):
+    edited_image_path = infer_video(image_edit_model,
+                                    video_path,
+                                    output_dir=TEMP_DIR,
+                                    prompt=prompt,
+                                    prompt_type="instruct",
+                                    force_512=force_512,
+                                    seed=seed,
+                                    negative_prompt=negative_prompt,
+                                    overwrite=True)
+    return edited_image_path
+
+
+# Set up default inversion config file
+config = {
+    # DDIM inversion
+    "inverse_config": {
+        "image_size": [512, 512],
+        "n_frames": 16,
+        "cfg": 1.0,
+        "target_fps": 8,
+        "ddim_inv_prompt": "",
+        "prompt": "",
+        "negative_prompt": "",
+    },
+    "pnp_config": {
+        "random_ratio": 0.0,
+        "target_fps": 8,
+    },
+}
+config = OmegaConf.create(config)
+
+# Initialize the I2VGenXL pipeline
+pipe = I2VGenXLPipeline.from_pretrained(
+    "ali-vilab/i2vgen-xl",
+    torch_dtype=torch.float16,
+    variant="fp16",
+).to("cuda:0")
+
+# Initialize the DDIM inverse scheduler
+inverse_scheduler = DDIMInverseScheduler.from_pretrained(
+    "ali-vilab/i2vgen-xl",
+    subfolder="scheduler",
+)
+# Initialize the DDIM scheduler
+ddim_scheduler = DDIMScheduler.from_pretrained(
+    "ali-vilab/i2vgen-xl",
+    subfolder="scheduler",
+)
+
+@torch.no_grad()
+@spaces.GPU(duration=150)
+def perform_anyv2v(
+    video_path,
+    video_prompt,
+    video_negative_prompt,
+    edited_first_frame_path,
+    conv_inj,
+    spatial_inj,
+    temp_inj,
+    num_inference_steps,
+    guidance_scale,
+    ddim_init_latents_t_idx,
+    ddim_inversion_steps,
+    seed,
+):
+
+    tmp_dir = os.path.join(TEMP_DIR, "AnyV2V")
+    if os.path.exists(tmp_dir):
+        shutil.rmtree(tmp_dir)
+    os.makedirs(tmp_dir)
+
+    ddim_latents_path = os.path.join(tmp_dir, "ddim_latents")
+
+    def read_frames(video_path):
+        frames = []
+        with imageio.get_reader(video_path) as reader:
+            for i, frame in enumerate(reader):
+                pil_image = Image.fromarray(frame)
+                frames.append(pil_image)
+        return frames
+    frame_list = read_frames(str(video_path))
+
+    config.inverse_config.image_size = list(frame_list[0].size)
+    config.inverse_config.n_steps = ddim_inversion_steps
+    config.inverse_config.n_frames = len(frame_list)
+    config.inverse_config.output_dir = ddim_latents_path
+    ddim_init_latents_t_idx = min(ddim_init_latents_t_idx, num_inference_steps - 1)
+
+    # Step 1. DDIM Inversion
+    first_frame = frame_list[0]
+
+    generator = torch.Generator(device="cuda:0")
+    generator = generator.manual_seed(seed)
+    _ddim_latents = ddim_inversion(
+        config.inverse_config,
+        first_frame,
+        frame_list,
+        pipe,
+        inverse_scheduler,
+        generator,
+    )
+
+    # Step 2. DDIM Sampling + PnP feature and attention injection
+    # Load the edited first frame
+    edited_1st_frame = load_image(edited_first_frame_path).resize(
+        config.inverse_config.image_size, resample=Image.Resampling.LANCZOS
+    )
+    # Load the initial latents at t
+    ddim_scheduler.set_timesteps(num_inference_steps)
+    print(f"ddim_scheduler.timesteps: {ddim_scheduler.timesteps}")
+    ddim_latents_at_t = load_ddim_latents_at_t(
+        ddim_scheduler.timesteps[ddim_init_latents_t_idx],
+        ddim_latents_path=ddim_latents_path,
+    )
+    print(
+        f"ddim_scheduler.timesteps[t_idx]: {ddim_scheduler.timesteps[ddim_init_latents_t_idx]}"
+    )
+    print(f"ddim_latents_at_t.shape: {ddim_latents_at_t.shape}")
+
+    # Blend the latents
+    random_latents = torch.randn_like(ddim_latents_at_t)
+    print(
+        f"Blending random_ratio (1 means random latent): {config.pnp_config.random_ratio}"
+    )
+    mixed_latents = (
+        random_latents * config.pnp_config.random_ratio
+        + ddim_latents_at_t * (1 - config.pnp_config.random_ratio)
+    )
+
+    # Init Pnp
+    config.pnp_config.n_steps = num_inference_steps
+    config.pnp_config.pnp_f_t = conv_inj
+    config.pnp_config.pnp_spatial_attn_t = spatial_inj
+    config.pnp_config.pnp_temp_attn_t = temp_inj
+    config.pnp_config.ddim_init_latents_t_idx = ddim_init_latents_t_idx
+    init_pnp(pipe, ddim_scheduler, config.pnp_config)
+    # Edit video
+    pipe.register_modules(scheduler=ddim_scheduler)
+
+    edited_video = pipe.sample_with_pnp(
+        prompt=video_prompt,
+        image=edited_1st_frame,
+        height=config.inverse_config.image_size[1],
+        width=config.inverse_config.image_size[0],
+        num_frames=config.inverse_config.n_frames,
+        num_inference_steps=config.pnp_config.n_steps,
+        guidance_scale=guidance_scale,
+        negative_prompt=video_negative_prompt,
+        target_fps=config.pnp_config.target_fps,
+        latents=mixed_latents,
+        generator=generator,
+        return_dict=True,
+        ddim_init_latents_t_idx=ddim_init_latents_t_idx,
+        ddim_inv_latents_path=ddim_latents_path,
+        ddim_inv_prompt=config.inverse_config.ddim_inv_prompt,
+        ddim_inv_1st_frame=first_frame,
+    ).frames[0]
+
+    edited_video = [
+        frame.resize(config.inverse_config.image_size, resample=Image.LANCZOS)
+        for frame in edited_video
+    ]
+
+    def images_to_video(images, output_path, fps=24):
+        writer = imageio.get_writer(output_path, fps=fps)
+
+        for img in images:
+            img_np = np.array(img)
+            writer.append_data(img_np)
+
+        writer.close()
+    output_path = os.path.join(tmp_dir, "edited_video.mp4")
+    images_to_video(
+        edited_video, output_path, fps=config.pnp_config.target_fps
+    )
+    return output_path
+

def btn_preprocess_video_fn(video_path, width, height, start_time, end_time, center_crop, x_offset, y_offset, longest_to_width):
    def check_video(video_path):
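
The commit above replaces the class-based ImageEditor and AnyV2V_I2VGenXL wrappers with module-level models and plain functions decorated with @spaces.GPU, presumably so that the ZeroGPU decorator allocates a GPU only for the duration of each decorated call. For orientation, a minimal sketch of how such a module-level handler is typically wired into a Gradio interface is shown below; the Blocks layout, component names, and the import of perform_edit from app.py are illustrative assumptions, not part of this commit.

import gradio as gr

# Hypothetical wiring; assumes app.py is importable and perform_edit behaves as in the diff.
from app import perform_edit

with gr.Blocks() as demo:
    video_in = gr.Video(label="Source video")
    prompt_in = gr.Textbox(label="First-frame editing instruction")
    frame_out = gr.Image(label="Edited first frame", type="filepath")
    edit_btn = gr.Button("Edit first frame")
    # perform_edit is a plain function decorated with @spaces.GPU(duration=30),
    # so Gradio calls it directly and the GPU is held only for that call.
    edit_btn.click(fn=perform_edit, inputs=[video_in, prompt_in], outputs=frame_out)

if __name__ == "__main__":
    demo.queue().launch()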
gradio_demo.py CHANGED
(identical to the app.py diff above: the same two hunks, +182 -191, apply verbatim)
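
For reference, an end-to-end call of the two refactored entry points might look like the sketch below. The demo clip, video prompt, and the 1.0/1.0/1.0 injection ratios are taken from the Ballet entry in demo_examples; the instruction prompt, step counts, guidance scale, and latent index are illustrative values that this commit does not specify.

# Hypothetical driver; assumes perform_edit and perform_anyv2v are importable from app.py
# and that the ./demo assets exist locally.
from app import perform_edit, perform_anyv2v

src_video = "./demo/Ballet.mp4"

# Step 1: edit the first frame with InstructPix2Pix (instruction prompt is illustrative).
edited_frame = perform_edit(src_video, "make it look like a van gogh painting", seed=42)

# Step 2: propagate the edit through the clip via DDIM inversion + PnP injection.
result_path = perform_anyv2v(
    video_path=src_video,
    video_prompt="girl dancing ballet, in the style of van gogh",
    video_negative_prompt="",
    edited_first_frame_path=edited_frame,
    conv_inj=1.0,               # convolution feature injection ratio (pnp_f_t)
    spatial_inj=1.0,            # spatial attention injection ratio
    temp_inj=1.0,               # temporal attention injection ratio
    num_inference_steps=50,     # illustrative value
    guidance_scale=9.0,         # illustrative value
    ddim_init_latents_t_idx=0,  # illustrative value
    ddim_inversion_steps=100,   # illustrative value
    seed=42,
)
print("Edited video written to:", result_path)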