dagloop5 committed on
Commit
733dd76
·
verified ·
1 Parent(s): 683bc4f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +40 -7
app.py CHANGED
@@ -110,6 +110,7 @@ class LTX23DistilledA2VPipeline(DistilledPipeline):
110
  frame_rate: float,
111
  images: list[ImageConditioningInput],
112
  audio_path: str | None = None,
 
113
  tiling_config: TilingConfig | None = None,
114
  enhance_prompt: bool = False,
115
  ):
@@ -147,6 +148,19 @@ class LTX23DistilledA2VPipeline(DistilledPipeline):
147
  raise ValueError(f"Could not extract audio stream from {audio_path}")
148
 
149
  encoded_audio_latent = vae_encode_audio(decoded_audio, self.model_ledger.audio_encoder())
 
 
 
 
 
 
 
 
 
 
 
 
 
150
  audio_shape = AudioLatentShape.from_duration(batch=1, duration=video_duration, channels=8, mel_bins=16)
151
  expected_frames = audio_shape.frames
152
  actual_frames = encoded_audio_latent.shape[2]
@@ -253,11 +267,18 @@ class LTX23DistilledA2VPipeline(DistilledPipeline):
253
  tiling_config,
254
  generator,
255
  )
256
- original_audio = Audio(
257
- waveform=decoded_audio.waveform.squeeze(0),
258
- sampling_rate=decoded_audio.sampling_rate,
259
- )
260
- return decoded_video, original_audio
 
 
 
 
 
 
 
261
 
262
 
263
  # Model repos
@@ -573,6 +594,7 @@ def get_gpu_duration(
573
  first_image,
574
  last_image,
575
  input_audio,
 
576
  prompt: str,
577
  duration: float,
578
  gpu_duration: float,
@@ -600,6 +622,7 @@ def generate_video(
600
  first_image,
601
  last_image,
602
  input_audio,
 
603
  prompt: str,
604
  duration: float,
605
  gpu_duration: float,
@@ -667,6 +690,7 @@ def generate_video(
667
  frame_rate=frame_rate,
668
  images=images,
669
  audio_path=input_audio,
 
670
  tiling_config=tiling_config,
671
  enhance_prompt=enhance_prompt,
672
  )
@@ -702,6 +726,14 @@ with gr.Blocks(title="LTX-2.3 Distilled") as demo:
702
  first_image = gr.Image(label="First Frame (Optional)", type="pil")
703
  last_image = gr.Image(label="Last Frame (Optional)", type="pil")
704
  input_audio = gr.Audio(label="Audio Input (Optional)", type="filepath")
 
 
 
 
 
 
 
 
705
  prompt = gr.Textbox(
706
  label="Prompt",
707
  info="for best results - make it as elaborate as possible",
@@ -784,6 +816,7 @@ with gr.Blocks(title="LTX-2.3 Distilled") as demo:
784
  None,
785
  "pinkknit.jpg",
786
  None,
 
787
  "The camera falls downward through darkness as if dropped into a tunnel. "
788
  "As it slows, five friends wearing pink knitted hats and sunglasses lean "
789
  "over and look down toward the camera with curious expressions. The lens "
@@ -809,7 +842,7 @@ with gr.Blocks(title="LTX-2.3 Distilled") as demo:
809
  ],
810
  ],
811
  inputs=[
812
- first_image, last_image, input_audio, prompt, duration, gpu_duration,
813
  enhance_prompt, seed, randomize_seed, height, width,
814
  pose_strength, general_strength, motion_strength, dreamlay_strength, mself_strength, dramatic_strength, fluid_strength, liquid_strength, demopose_strength,
815
  ],
@@ -842,7 +875,7 @@ with gr.Blocks(title="LTX-2.3 Distilled") as demo:
842
  generate_btn.click(
843
  fn=generate_video,
844
  inputs=[
845
- first_image, last_image, input_audio, prompt, duration, gpu_duration, enhance_prompt,
846
  seed, randomize_seed, height, width,
847
  pose_strength, general_strength, motion_strength, dreamlay_strength, mself_strength, dramatic_strength, fluid_strength, liquid_strength, demopose_strength,
848
  ],
 
110
  frame_rate: float,
111
  images: list[ImageConditioningInput],
112
  audio_path: str | None = None,
113
+ audio_mix_ratio: float = 0.35,
114
  tiling_config: TilingConfig | None = None,
115
  enhance_prompt: bool = False,
116
  ):
 
148
  raise ValueError(f"Could not extract audio stream from {audio_path}")
149
 
150
  encoded_audio_latent = vae_encode_audio(decoded_audio, self.model_ledger.audio_encoder())
151
+ # Keep the uploaded audio as a soft conditioning signal, not a hard copy.
152
+ audio_mix_ratio = float(max(0.0, min(1.0, audio_mix_ratio)))
153
+ if audio_mix_ratio < 1.0:
154
+ noise = torch.randn(
155
+ encoded_audio_latent.shape,
156
+ device=encoded_audio_latent.device,
157
+ dtype=encoded_audio_latent.dtype,
158
+ generator=generator,
159
+ )
160
+ encoded_audio_latent = (
161
+ audio_mix_ratio * encoded_audio_latent
162
+ + (1.0 - audio_mix_ratio) * noise
163
+ )
164
  audio_shape = AudioLatentShape.from_duration(batch=1, duration=video_duration, channels=8, mel_bins=16)
165
  expected_frames = audio_shape.frames
166
  actual_frames = encoded_audio_latent.shape[2]
 
267
  tiling_config,
268
  generator,
269
  )
270
+
271
+ generated_audio_latent = getattr(video_state, "audio_latent", None)
272
+ if generated_audio_latent is None:
273
+ raise RuntimeError(
274
+ "No generated audio latent was returned. "
275
+ "Patch denoise_video_only() to expose the audio latent, "
276
+ "or switch this block to the upstream stage API that returns "
277
+ "video_state, audio_state."
278
+ )
279
+
280
+ decoded_audio = self.model_ledger.audio_decoder()(generated_audio_latent)
281
+ return decoded_video, decoded_audio
282
 
283
 
284
  # Model repos
 
594
  first_image,
595
  last_image,
596
  input_audio,
597
+ audio_mix_ratio,
598
  prompt: str,
599
  duration: float,
600
  gpu_duration: float,
 
622
  first_image,
623
  last_image,
624
  input_audio,
625
+ audio_mix_ratio,
626
  prompt: str,
627
  duration: float,
628
  gpu_duration: float,
 
690
  frame_rate=frame_rate,
691
  images=images,
692
  audio_path=input_audio,
693
+ audio_mix_ratio=audio_mix_ratio,
694
  tiling_config=tiling_config,
695
  enhance_prompt=enhance_prompt,
696
  )
 
726
  first_image = gr.Image(label="First Frame (Optional)", type="pil")
727
  last_image = gr.Image(label="Last Frame (Optional)", type="pil")
728
  input_audio = gr.Audio(label="Audio Input (Optional)", type="filepath")
729
+ audio_mix_ratio = gr.Slider(
730
+ label="Audio Conditioning Strength",
731
+ minimum=0.0,
732
+ maximum=1.0,
733
+ value=0.35,
734
+ step=0.01,
735
+ info="0 = mostly ignore input audio, 1 = strongly follow input audio",
736
+ )
737
  prompt = gr.Textbox(
738
  label="Prompt",
739
  info="for best results - make it as elaborate as possible",
 
816
  None,
817
  "pinkknit.jpg",
818
  None,
819
+ 0.0,
820
  "The camera falls downward through darkness as if dropped into a tunnel. "
821
  "As it slows, five friends wearing pink knitted hats and sunglasses lean "
822
  "over and look down toward the camera with curious expressions. The lens "
 
842
  ],
843
  ],
844
  inputs=[
845
+ first_image, last_image, input_audio, audio_mix_ratio, prompt, duration, gpu_duration,
846
  enhance_prompt, seed, randomize_seed, height, width,
847
  pose_strength, general_strength, motion_strength, dreamlay_strength, mself_strength, dramatic_strength, fluid_strength, liquid_strength, demopose_strength,
848
  ],
 
875
  generate_btn.click(
876
  fn=generate_video,
877
  inputs=[
878
+ first_image, last_image, input_audio, audio_mix_ratio, prompt, duration, gpu_duration, enhance_prompt,
879
  seed, randomize_seed, height, width,
880
  pose_strength, general_strength, motion_strength, dreamlay_strength, mself_strength, dramatic_strength, fluid_strength, liquid_strength, demopose_strength,
881
  ],