alex committed on
Commit
b95275b
·
1 Parent(s): 490cabd

allow prompt only

Browse files
Files changed (2) hide show
  1. app.py +98 -59
  2. ovi/utils/fm_solvers_unipc.py +18 -1
app.py CHANGED
@@ -47,7 +47,7 @@ os.environ["PROCESSED_RESULTS"] = f"{os.getcwd()}/processed_results"
47
  import gradio as gr
48
  import argparse
49
  from ovi.ovi_fusion_engine import OviFusionEngine, DEFAULT_CONFIG
50
- from diffusers import FluxPipeline
51
  import tempfile
52
  from ovi.utils.io_utils import save_video
53
  from ovi.utils.processing_utils import clean_text, scale_hw_to_area_divisible
@@ -104,6 +104,7 @@ print(f"loading model...")
104
  DEFAULT_CONFIG['cpu_offload'] = enable_cpu_offload # always use cpu offload if image generation is enabled
105
  DEFAULT_CONFIG['mode'] = "t2v" # hardcoded since it is always cpu offloaded
106
  ovi_engine = OviFusionEngine()
 
107
  print("loaded model")
108
 
109
 
@@ -133,10 +134,39 @@ def resize_for_model(image_path):
133
  )
134
  return new_img, target_size
135
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
136
  def generate_scene(
137
  text_prompt,
138
- image,
139
  sample_steps = 50,
 
140
  session_id = None,
141
  video_seed = 100,
142
  solver_name = "unipc",
@@ -150,17 +180,15 @@ def generate_scene(
150
  ):
151
  text_prompt_processed = (text_prompt or "").strip()
152
 
153
- if not image:
154
- raise gr.Error("Please provide an image")
155
-
156
-
157
  if not text_prompt_processed:
158
  raise gr.Error("Please enter a prompt.")
159
 
 
 
160
 
161
  return generate_video(text_prompt,
162
- image,
163
  sample_steps,
 
164
  session_id,
165
  video_seed,
166
  solver_name,
@@ -174,8 +202,8 @@ def generate_scene(
174
 
175
  def get_duration(
176
  text_prompt,
177
- image,
178
  sample_steps,
 
179
  session_id,
180
  video_seed,
181
  solver_name,
@@ -187,16 +215,21 @@ def get_duration(
187
  audio_negative_prompt,
188
  progress,
189
  ):
 
 
 
 
 
190
  warmup = 20
191
 
192
- return int(sample_steps * 3 + warmup)
193
 
194
 
195
  @spaces.GPU(duration=get_duration)
196
  def generate_video(
197
  text_prompt,
198
- image,
199
  sample_steps = 50,
 
200
  session_id = None,
201
  video_seed = 100,
202
  solver_name = "unipc",
@@ -208,47 +241,48 @@ def generate_video(
208
  audio_negative_prompt = "",
209
  progress=gr.Progress(track_tqdm=True)
210
  ):
211
- try:
212
- image_path = None
 
 
 
 
 
 
 
 
 
 
213
 
214
- if image is not None:
215
- image_path = image
216
-
217
- if session_id is None:
218
- session_id = uuid.uuid4().hex
219
-
220
-
221
- output_dir = os.path.join(os.environ["PROCESSED_RESULTS"], session_id)
222
- os.makedirs(output_dir, exist_ok=True)
223
- output_path = os.path.join(output_dir, f"generated_video.mp4")
224
-
225
-
226
- _, target_size = resize_for_model(image_path)
227
-
228
- video_frame_width = target_size[0]
229
- video_frame_height = target_size[1]
230
-
231
- generated_video, generated_audio, _ = ovi_engine.generate(
232
- text_prompt=text_prompt,
233
- image_path=image_path,
234
- video_frame_height_width=[video_frame_height, video_frame_width],
235
- seed=video_seed,
236
- solver_name=solver_name,
237
- sample_steps=sample_steps,
238
- shift=shift,
239
- video_guidance_scale=video_guidance_scale,
240
- audio_guidance_scale=audio_guidance_scale,
241
- slg_layer=slg_layer,
242
- video_negative_prompt=video_negative_prompt,
243
- audio_negative_prompt=audio_negative_prompt,
244
- )
245
 
246
- save_video(output_path, generated_video, generated_audio, fps=24, sample_rate=16000)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
247
 
248
- return output_path
249
- except Exception as e:
250
- print(f"Error during video generation: {e}")
251
- return None
252
 
253
 
254
  def cleanup(request: gr.Request):
@@ -268,7 +302,6 @@ css = """
268
  max-width: 1024px;
269
  }
270
  """
271
-
272
  theme = gr.themes.Ocean()
273
 
274
  with gr.Blocks(css=css, theme=theme) as demo:
@@ -298,21 +331,21 @@ with gr.Blocks(css=css, theme=theme) as demo:
298
  with gr.Row():
299
  with gr.Column():
300
  # Image section
301
- image = gr.Image(type="filepath", label="Image", height=360)
302
 
303
  video_text_prompt = gr.Textbox(label="Scene Prompt",
304
  lines=5,
305
- value="A person in a scene that their mouth is slightly open as they speak, <S>Enjoy this moment<E> and as they roll their eyes. <AUDCAP>Clear voice, faint ambient outdoor sounds.<ENDAUDCAP>",
306
- placeholder="Describe your video...")
307
  sample_steps = gr.Slider(
308
  value=50,
309
- label="Generation Steps",
310
  minimum=20,
311
  maximum=100,
312
  step=1.0
313
  )
314
  run_btn = gr.Button("Action 🎬", variant="primary")
315
 
 
 
316
  gr.Markdown(
317
  """
318
  💡 **Prompt Guidelines**
@@ -350,35 +383,41 @@ with gr.Blocks(css=css, theme=theme) as demo:
350
 
351
  [
352
  "The video opens with a close-up of a woman with vibrant reddish-orange, shoulder-length hair and heavy dark eye makeup. She is wearing a dark brown leather jacket over a grey hooded top. She looks intently to her right, her mouth slightly agape, and her expression is serious and focused. The background shows a room with light green walls and dark wooden cabinets on the left, and a green plant on the right. She speaks, her voice clear and direct, saying, <S>doing<E>. She then pauses briefly, her gaze unwavering, and continues, <S>And I need you to trust them.<E>. Her mouth remains slightly open, indicating she is either about to speak more or has just finished a sentence, with a look of intense sincerity.. <AUDCAP>Tense, dramatic background music, clear female voice.<ENDAUDCAP>",
353
- "example_prompts/pngs/8.png",
354
  50,
 
355
  ],
356
 
357
  [
358
  "A young woman with long, wavy blonde hair and light-colored eyes is shown in a medium shot against a blurred backdrop of lush green foliage. She wears a denim jacket over a striped top. Initially, her eyes are closed and her mouth is slightly open as she speaks, <S>Enjoy this moment<E>. Her eyes then slowly open, looking slightly upwards and to the right, as her expression shifts to one of thoughtful contemplation. She continues to speak, <S>No matter where it's taking<E>, her gaze then settling with a serious and focused look towards someone off-screen to her right.. <AUDCAP>Clear female voice, faint ambient outdoor sounds.<ENDAUDCAP>",
359
- "example_prompts/pngs/2.png",
360
  50,
 
361
  ],
362
 
363
  [
364
  "A bearded man wearing large dark sunglasses and a blue patterned cardigan sits in a studio, actively speaking into a large, suspended microphone. He has headphones on and gestures with his hands, displaying rings on his fingers. Behind him, a wall is covered with red, textured sound-dampening foam on the left, and a white banner on the right features the ""CHOICE FM"" logo and various social media handles like ""@ilovechoicefm"" with ""RALEIGH"" below it. The man intently addresses the microphone, articulating, <S>is talent. It's all about authenticity. You gotta be who you really are, especially if you're working<E>. He leans forward slightly as he speaks, maintaining a serious expression behind his sunglasses.. <AUDCAP>Clear male voice speaking into a microphone, a low background hum.<ENDAUDCAP>",
365
- "example_prompts/pngs/5.png",
366
  50,
 
367
  ],
368
 
369
  [
370
  "The video opens with a close-up on an older man with long, grey hair and a short, grey beard, wearing dark sunglasses. He is clad in a dark coat, possibly with fur trim, and black gloves. His face is angled slightly upwards and to the right, as he begins to speak, his mouth slightly open. In the immediate foreground, out of focus, is the dark-clad shoulder and the back of the head of another person. The man articulates, <S>labbra. Ti ci vorrebbe...<E> His expression remains contemplative, and he continues, seemingly completing his thought, <S>Un ego solare.<E> The background behind him is a textured, grey stone wall, suggesting an outdoor setting. The man's gaze remains fixed upwards, his expression thoughtful.. <AUDCAP>A clear, slightly low-pitched male voice speaking Italian. The overall soundscape is quiet, with no prominent background noises or music.<ENDAUDCAP>",
371
- "example_prompts/pngs/7.png",
372
  50,
 
373
  ],
374
 
375
  [
376
  "The scene is set outdoors with a blurry, bright green background, suggesting grass and a sunny environment. On the left, a woman with long, dark hair, wearing a red top and a necklace with a white pendant, faces towards the right. Her expression is serious and slightly perturbed as she speaks, with her lips slightly pursed. She says, <S>UFO, UFC thing.<E> On the right, the back of a man's head and his right ear are visible, indicating he is facing away from the camera, listening to the woman. He has short, dark hair. The woman continues speaking, her expression remaining serious, <S>And if you're not watching that, it's one of those ancient movies from an era that's<E> as the frame holds steady on the two figures.. <AUDCAP>Clear female speech, distant low-frequency hum.<ENDAUDCAP>",
 
377
  "example_prompts/pngs/9.png",
 
 
 
 
378
  50,
 
379
  ],
380
  ],
381
- inputs=[video_text_prompt, image, sample_steps],
382
  outputs=[output_path],
383
  fn=generate_video,
384
  cache_examples=True,
@@ -386,7 +425,7 @@ with gr.Blocks(css=css, theme=theme) as demo:
386
 
387
  run_btn.click(
388
  fn=generate_scene,
389
- inputs=[video_text_prompt, image, sample_steps, session_state],
390
  outputs=[output_path],
391
  )
392
 
 
47
  import gradio as gr
48
  import argparse
49
  from ovi.ovi_fusion_engine import OviFusionEngine, DEFAULT_CONFIG
50
+ from diffusers import DiffusionPipeline
51
  import tempfile
52
  from ovi.utils.io_utils import save_video
53
  from ovi.utils.processing_utils import clean_text, scale_hw_to_area_divisible
 
104
  DEFAULT_CONFIG['cpu_offload'] = enable_cpu_offload # always use cpu offload if image generation is enabled
105
  DEFAULT_CONFIG['mode'] = "t2v" # hardcoded since it is always cpu offloaded
106
  ovi_engine = OviFusionEngine()
107
+ flux_model = DiffusionPipeline.from_pretrained("black-forest-labs/FLUX.1-Krea-dev", torch_dtype=torch.bfloat16)
108
  print("loaded model")
109
 
110
 
 
134
  )
135
  return new_img, target_size
136
 
137
+ @spaces.GPU()
138
+ def generate_image(text_prompt, session_id, image_height = 1024, image_width = 1024):
139
+
140
+ if flux_model is None:
141
+ return None
142
+ text_prompt = clean_text(text_prompt)
143
+
144
+ image_h, image_w = scale_hw_to_area_divisible(image_height, image_width, area=1024 * 1024)
145
+
146
+ flux_model.to("cuda")
147
+
148
+ image = flux_model(
149
+ text_prompt,
150
+ height=image_h,
151
+ width=image_w,
152
+ num_inference_steps = 28,
153
+ guidance_scale=4.5,
154
+ generator=torch.Generator().manual_seed(int(1234))
155
+ ).images[0]
156
+
157
+ flux_model.to("cpu")
158
+
159
+ output_dir = os.path.join(os.environ["PROCESSED_RESULTS"], session_id)
160
+ os.makedirs(output_dir, exist_ok=True)
161
+ output_path = os.path.join(output_dir, f"generate_image.png")
162
+
163
+ image.save(output_path)
164
+ return output_path
165
+
166
  def generate_scene(
167
  text_prompt,
 
168
  sample_steps = 50,
169
+ image = None,
170
  session_id = None,
171
  video_seed = 100,
172
  solver_name = "unipc",
 
180
  ):
181
  text_prompt_processed = (text_prompt or "").strip()
182
 
 
 
 
 
183
  if not text_prompt_processed:
184
  raise gr.Error("Please enter a prompt.")
185
 
186
+ if session_id is None:
187
+ session_id = uuid.uuid4().hex
188
 
189
  return generate_video(text_prompt,
 
190
  sample_steps,
191
+ image,
192
  session_id,
193
  video_seed,
194
  solver_name,
 
202
 
203
  def get_duration(
204
  text_prompt,
 
205
  sample_steps,
206
+ image,
207
  session_id,
208
  video_seed,
209
  solver_name,
 
215
  audio_negative_prompt,
216
  progress,
217
  ):
218
+ image_generation_s = 0
219
+
220
+ if not image:
221
+ image_generation_s = 30
222
+
223
  warmup = 20
224
 
225
+ return int(sample_steps * 3 + warmup + image_generation_s)
226
 
227
 
228
  @spaces.GPU(duration=get_duration)
229
  def generate_video(
230
  text_prompt,
 
231
  sample_steps = 50,
232
+ image = None,
233
  session_id = None,
234
  video_seed = 100,
235
  solver_name = "unipc",
 
241
  audio_negative_prompt = "",
242
  progress=gr.Progress(track_tqdm=True)
243
  ):
244
+
245
+ if session_id is None:
246
+ session_id = uuid.uuid4().hex
247
+
248
+ image_path = None
249
+
250
+ if not image:
251
+ image = generate_image(text_prompt, session_id)
252
+
253
+ if image is not None:
254
+ image_path = image
255
+
256
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
257
 
258
+ output_dir = os.path.join(os.environ["PROCESSED_RESULTS"], session_id)
259
+ os.makedirs(output_dir, exist_ok=True)
260
+ output_path = os.path.join(output_dir, f"generated_video.mp4")
261
+
262
+
263
+ _, target_size = resize_for_model(image_path)
264
+
265
+ video_frame_width = target_size[0]
266
+ video_frame_height = target_size[1]
267
+
268
+ generated_video, generated_audio, _ = ovi_engine.generate(
269
+ text_prompt=text_prompt,
270
+ image_path=image_path,
271
+ video_frame_height_width=[video_frame_height, video_frame_width],
272
+ seed=video_seed,
273
+ solver_name=solver_name,
274
+ sample_steps=sample_steps,
275
+ shift=shift,
276
+ video_guidance_scale=video_guidance_scale,
277
+ audio_guidance_scale=audio_guidance_scale,
278
+ slg_layer=slg_layer,
279
+ video_negative_prompt=video_negative_prompt,
280
+ audio_negative_prompt=audio_negative_prompt,
281
+ )
282
 
283
+ save_video(output_path, generated_video, generated_audio, fps=24, sample_rate=16000)
284
+
285
+ return output_path
 
286
 
287
 
288
  def cleanup(request: gr.Request):
 
302
  max-width: 1024px;
303
  }
304
  """
 
305
  theme = gr.themes.Ocean()
306
 
307
  with gr.Blocks(css=css, theme=theme) as demo:
 
331
  with gr.Row():
332
  with gr.Column():
333
  # Image section
 
334
 
335
  video_text_prompt = gr.Textbox(label="Scene Prompt",
336
  lines=5,
337
+ placeholder="Describe your scene...")
 
338
  sample_steps = gr.Slider(
339
  value=50,
340
+ label="Sample Steps",
341
  minimum=20,
342
  maximum=100,
343
  step=1.0
344
  )
345
  run_btn = gr.Button("Action 🎬", variant="primary")
346
 
347
+ image = gr.Image(type="filepath", label="Image Ref", height=360)
348
+
349
  gr.Markdown(
350
  """
351
  💡 **Prompt Guidelines**
 
383
 
384
  [
385
  "The video opens with a close-up of a woman with vibrant reddish-orange, shoulder-length hair and heavy dark eye makeup. She is wearing a dark brown leather jacket over a grey hooded top. She looks intently to her right, her mouth slightly agape, and her expression is serious and focused. The background shows a room with light green walls and dark wooden cabinets on the left, and a green plant on the right. She speaks, her voice clear and direct, saying, <S>doing<E>. She then pauses briefly, her gaze unwavering, and continues, <S>And I need you to trust them.<E>. Her mouth remains slightly open, indicating she is either about to speak more or has just finished a sentence, with a look of intense sincerity.. <AUDCAP>Tense, dramatic background music, clear female voice.<ENDAUDCAP>",
 
386
  50,
387
+ None,
388
  ],
389
 
390
  [
391
  "A young woman with long, wavy blonde hair and light-colored eyes is shown in a medium shot against a blurred backdrop of lush green foliage. She wears a denim jacket over a striped top. Initially, her eyes are closed and her mouth is slightly open as she speaks, <S>Enjoy this moment<E>. Her eyes then slowly open, looking slightly upwards and to the right, as her expression shifts to one of thoughtful contemplation. She continues to speak, <S>No matter where it's taking<E>, her gaze then settling with a serious and focused look towards someone off-screen to her right.. <AUDCAP>Clear female voice, faint ambient outdoor sounds.<ENDAUDCAP>",
 
392
  50,
393
+ "example_prompts/pngs/2.png",
394
  ],
395
 
396
  [
397
  "A bearded man wearing large dark sunglasses and a blue patterned cardigan sits in a studio, actively speaking into a large, suspended microphone. He has headphones on and gestures with his hands, displaying rings on his fingers. Behind him, a wall is covered with red, textured sound-dampening foam on the left, and a white banner on the right features the ""CHOICE FM"" logo and various social media handles like ""@ilovechoicefm"" with ""RALEIGH"" below it. The man intently addresses the microphone, articulating, <S>is talent. It's all about authenticity. You gotta be who you really are, especially if you're working<E>. He leans forward slightly as he speaks, maintaining a serious expression behind his sunglasses.. <AUDCAP>Clear male voice speaking into a microphone, a low background hum.<ENDAUDCAP>",
 
398
  50,
399
+ "example_prompts/pngs/5.png",
400
  ],
401
 
402
  [
403
  "The video opens with a close-up on an older man with long, grey hair and a short, grey beard, wearing dark sunglasses. He is clad in a dark coat, possibly with fur trim, and black gloves. His face is angled slightly upwards and to the right, as he begins to speak, his mouth slightly open. In the immediate foreground, out of focus, is the dark-clad shoulder and the back of the head of another person. The man articulates, <S>labbra. Ti ci vorrebbe...<E> His expression remains contemplative, and he continues, seemingly completing his thought, <S>Un ego solare.<E> The background behind him is a textured, grey stone wall, suggesting an outdoor setting. The man's gaze remains fixed upwards, his expression thoughtful.. <AUDCAP>A clear, slightly low-pitched male voice speaking Italian. The overall soundscape is quiet, with no prominent background noises or music.<ENDAUDCAP>",
 
404
  50,
405
+ "example_prompts/pngs/7.png",
406
  ],
407
 
408
  [
409
  "The scene is set outdoors with a blurry, bright green background, suggesting grass and a sunny environment. On the left, a woman with long, dark hair, wearing a red top and a necklace with a white pendant, faces towards the right. Her expression is serious and slightly perturbed as she speaks, with her lips slightly pursed. She says, <S>UFO, UFC thing.<E> On the right, the back of a man's head and his right ear are visible, indicating he is facing away from the camera, listening to the woman. He has short, dark hair. The woman continues speaking, her expression remaining serious, <S>And if you're not watching that, it's one of those ancient movies from an era that's<E> as the frame holds steady on the two figures.. <AUDCAP>Clear female speech, distant low-frequency hum.<ENDAUDCAP>",
410
+ 50,
411
  "example_prompts/pngs/9.png",
412
+ ],
413
+
414
+ [
415
+ "The scene is set in a dimly lit, hazy room, creating a somber atmosphere. An woman with light, slightly disheveled hair is visible in the foreground, her face mostly obscured by deep shadows, but her mouth is visible as she speaks. She wears a work-style shirt, and her hands are clasped together. In the background, to the right and slightly out of focus, a man with a mustache and beard is seated, facing forward, also largely in shadow, appearing to listen intently. The woman looks directly forward as she slowly enunciates, <S>Only through death will the third door be<E>. The scene ends abruptly.. <AUDCAP>Clear, deliberate female voice speaking, low ambient hum and subtle atmospheric sounds creating a tense mood.<ENDAUDCAP>",
416
  50,
417
+ None,
418
  ],
419
  ],
420
+ inputs=[video_text_prompt, sample_steps, image],
421
  outputs=[output_path],
422
  fn=generate_video,
423
  cache_examples=True,
 
425
 
426
  run_btn.click(
427
  fn=generate_scene,
428
+ inputs=[video_text_prompt, sample_steps, image, session_state],
429
  outputs=[output_path],
430
  )
431
 
ovi/utils/fm_solvers_unipc.py CHANGED
@@ -16,6 +16,21 @@ from diffusers.utils import deprecate, is_scipy_available
16
  if is_scipy_available():
17
  import scipy.stats
18
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
 
20
  class FlowUniPCMultistepScheduler(SchedulerMixin, ConfigMixin):
21
  """
@@ -604,7 +619,9 @@ class FlowUniPCMultistepScheduler(SchedulerMixin, ConfigMixin):
604
  if order == 1:
605
  rhos_c = torch.tensor([0.5], dtype=x.dtype, device=device)
606
  else:
607
- rhos_c = torch.linalg.solve(R, b).to(device).to(x.dtype)
 
 
608
 
609
  if self.predict_x0:
610
  x_t_ = sigma_t / sigma_s0 * x - alpha_t * h_phi_1 * m0
 
16
  if is_scipy_available():
17
  import scipy.stats
18
 
19
+ def _safe_solve(R, b, device, out_dtype):
20
+ """
21
+ Try GPU solve (MAGMA/cuSOLVER) in float32; on failure, fall back to CPU solve.
22
+ """
23
+ R32 = R.float().contiguous()
24
+ b32 = b.float().contiguous()
25
+ try:
26
+ return torch.linalg.solve(R32, b32).to(device=device, dtype=out_dtype)
27
+ except RuntimeError as e:
28
+ # cuSOLVER creation / internal errors are common here on certain GPUs
29
+ if "cusolver" in str(e).lower() or "CUSOLVER" in str(e):
30
+ R_cpu, b_cpu = R32.cpu(), b32.cpu()
31
+ sol = torch.linalg.solve(R_cpu, b_cpu)
32
+ return sol.to(device=device, dtype=out_dtype)
33
+ raise
34
 
35
  class FlowUniPCMultistepScheduler(SchedulerMixin, ConfigMixin):
36
  """
 
619
  if order == 1:
620
  rhos_c = torch.tensor([0.5], dtype=x.dtype, device=device)
621
  else:
622
+ rhos_c = _safe_solve(R, b, device, x.dtype)
623
+
624
+
625
 
626
  if self.predict_x0:
627
  x_t_ = sigma_t / sigma_s0 * x - alpha_t * h_phi_1 * m0