Update app.py
Browse files
app.py
CHANGED
|
@@ -110,6 +110,7 @@ class LTX23DistilledA2VPipeline(DistilledPipeline):
|
|
| 110 |
frame_rate: float,
|
| 111 |
images: list[ImageConditioningInput],
|
| 112 |
audio_path: str | None = None,
|
|
|
|
| 113 |
tiling_config: TilingConfig | None = None,
|
| 114 |
enhance_prompt: bool = False,
|
| 115 |
):
|
|
@@ -147,6 +148,19 @@ class LTX23DistilledA2VPipeline(DistilledPipeline):
|
|
| 147 |
raise ValueError(f"Could not extract audio stream from {audio_path}")
|
| 148 |
|
| 149 |
encoded_audio_latent = vae_encode_audio(decoded_audio, self.model_ledger.audio_encoder())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 150 |
audio_shape = AudioLatentShape.from_duration(batch=1, duration=video_duration, channels=8, mel_bins=16)
|
| 151 |
expected_frames = audio_shape.frames
|
| 152 |
actual_frames = encoded_audio_latent.shape[2]
|
|
@@ -253,11 +267,18 @@ class LTX23DistilledA2VPipeline(DistilledPipeline):
|
|
| 253 |
tiling_config,
|
| 254 |
generator,
|
| 255 |
)
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
|
| 260 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 261 |
|
| 262 |
|
| 263 |
# Model repos
|
|
@@ -573,6 +594,7 @@ def get_gpu_duration(
|
|
| 573 |
first_image,
|
| 574 |
last_image,
|
| 575 |
input_audio,
|
|
|
|
| 576 |
prompt: str,
|
| 577 |
duration: float,
|
| 578 |
gpu_duration: float,
|
|
@@ -600,6 +622,7 @@ def generate_video(
|
|
| 600 |
first_image,
|
| 601 |
last_image,
|
| 602 |
input_audio,
|
|
|
|
| 603 |
prompt: str,
|
| 604 |
duration: float,
|
| 605 |
gpu_duration: float,
|
|
@@ -667,6 +690,7 @@ def generate_video(
|
|
| 667 |
frame_rate=frame_rate,
|
| 668 |
images=images,
|
| 669 |
audio_path=input_audio,
|
|
|
|
| 670 |
tiling_config=tiling_config,
|
| 671 |
enhance_prompt=enhance_prompt,
|
| 672 |
)
|
|
@@ -702,6 +726,14 @@ with gr.Blocks(title="LTX-2.3 Distilled") as demo:
|
|
| 702 |
first_image = gr.Image(label="First Frame (Optional)", type="pil")
|
| 703 |
last_image = gr.Image(label="Last Frame (Optional)", type="pil")
|
| 704 |
input_audio = gr.Audio(label="Audio Input (Optional)", type="filepath")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 705 |
prompt = gr.Textbox(
|
| 706 |
label="Prompt",
|
| 707 |
info="for best results - make it as elaborate as possible",
|
|
@@ -784,6 +816,7 @@ with gr.Blocks(title="LTX-2.3 Distilled") as demo:
|
|
| 784 |
None,
|
| 785 |
"pinkknit.jpg",
|
| 786 |
None,
|
|
|
|
| 787 |
"The camera falls downward through darkness as if dropped into a tunnel. "
|
| 788 |
"As it slows, five friends wearing pink knitted hats and sunglasses lean "
|
| 789 |
"over and look down toward the camera with curious expressions. The lens "
|
|
@@ -809,7 +842,7 @@ with gr.Blocks(title="LTX-2.3 Distilled") as demo:
|
|
| 809 |
],
|
| 810 |
],
|
| 811 |
inputs=[
|
| 812 |
-
first_image, last_image, input_audio, prompt, duration, gpu_duration,
|
| 813 |
enhance_prompt, seed, randomize_seed, height, width,
|
| 814 |
pose_strength, general_strength, motion_strength, dreamlay_strength, mself_strength, dramatic_strength, fluid_strength, liquid_strength, demopose_strength,
|
| 815 |
],
|
|
@@ -842,7 +875,7 @@ with gr.Blocks(title="LTX-2.3 Distilled") as demo:
|
|
| 842 |
generate_btn.click(
|
| 843 |
fn=generate_video,
|
| 844 |
inputs=[
|
| 845 |
-
first_image, last_image, input_audio, prompt, duration, gpu_duration, enhance_prompt,
|
| 846 |
seed, randomize_seed, height, width,
|
| 847 |
pose_strength, general_strength, motion_strength, dreamlay_strength, mself_strength, dramatic_strength, fluid_strength, liquid_strength, demopose_strength,
|
| 848 |
],
|
|
|
|
| 110 |
frame_rate: float,
|
| 111 |
images: list[ImageConditioningInput],
|
| 112 |
audio_path: str | None = None,
|
| 113 |
+
audio_mix_ratio: float = 0.35,
|
| 114 |
tiling_config: TilingConfig | None = None,
|
| 115 |
enhance_prompt: bool = False,
|
| 116 |
):
|
|
|
|
| 148 |
raise ValueError(f"Could not extract audio stream from {audio_path}")
|
| 149 |
|
| 150 |
encoded_audio_latent = vae_encode_audio(decoded_audio, self.model_ledger.audio_encoder())
|
| 151 |
+
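# Blend the encoded audio latent with Gaussian noise so the uploaded audio acts as a soft conditioning signal rather than a hard copy; a ratio of 1.0 leaves the latent untouched.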
|
| 152 |
+
audio_mix_ratio = float(max(0.0, min(1.0, audio_mix_ratio)))
|
| 153 |
+
if audio_mix_ratio < 1.0:
|
| 154 |
+
noise = torch.randn(
|
| 155 |
+
encoded_audio_latent.shape,
|
| 156 |
+
device=encoded_audio_latent.device,
|
| 157 |
+
dtype=encoded_audio_latent.dtype,
|
| 158 |
+
generator=generator,
|
| 159 |
+
)
|
| 160 |
+
encoded_audio_latent = (
|
| 161 |
+
audio_mix_ratio * encoded_audio_latent
|
| 162 |
+
+ (1.0 - audio_mix_ratio) * noise
|
| 163 |
+
)
|
| 164 |
audio_shape = AudioLatentShape.from_duration(batch=1, duration=video_duration, channels=8, mel_bins=16)
|
| 165 |
expected_frames = audio_shape.frames
|
| 166 |
actual_frames = encoded_audio_latent.shape[2]
|
|
|
|
| 267 |
tiling_config,
|
| 268 |
generator,
|
| 269 |
)
|
| 270 |
+
|
| 271 |
+
generated_audio_latent = getattr(video_state, "audio_latent", None)
|
| 272 |
+
if generated_audio_latent is None:
|
| 273 |
+
raise RuntimeError(
|
| 274 |
+
"No generated audio latent was returned. "
|
| 275 |
+
"Patch denoise_video_only() to expose the audio latent, "
|
| 276 |
+
"or switch this block to the upstream stage API that returns "
|
| 277 |
+
"video_state, audio_state."
|
| 278 |
+
)
|
| 279 |
+
|
| 280 |
+
decoded_audio = self.model_ledger.audio_decoder()(generated_audio_latent)
|
| 281 |
+
return decoded_video, decoded_audio
|
| 282 |
|
| 283 |
|
| 284 |
# Model repos
|
|
|
|
| 594 |
first_image,
|
| 595 |
last_image,
|
| 596 |
input_audio,
|
| 597 |
+
audio_mix_ratio,
|
| 598 |
prompt: str,
|
| 599 |
duration: float,
|
| 600 |
gpu_duration: float,
|
|
|
|
| 622 |
first_image,
|
| 623 |
last_image,
|
| 624 |
input_audio,
|
| 625 |
+
audio_mix_ratio,
|
| 626 |
prompt: str,
|
| 627 |
duration: float,
|
| 628 |
gpu_duration: float,
|
|
|
|
| 690 |
frame_rate=frame_rate,
|
| 691 |
images=images,
|
| 692 |
audio_path=input_audio,
|
| 693 |
+
audio_mix_ratio=audio_mix_ratio,
|
| 694 |
tiling_config=tiling_config,
|
| 695 |
enhance_prompt=enhance_prompt,
|
| 696 |
)
|
|
|
|
| 726 |
first_image = gr.Image(label="First Frame (Optional)", type="pil")
|
| 727 |
last_image = gr.Image(label="Last Frame (Optional)", type="pil")
|
| 728 |
input_audio = gr.Audio(label="Audio Input (Optional)", type="filepath")
|
| 729 |
+
audio_mix_ratio = gr.Slider(
|
| 730 |
+
label="Audio Conditioning Strength",
|
| 731 |
+
minimum=0.0,
|
| 732 |
+
maximum=1.0,
|
| 733 |
+
value=0.35,
|
| 734 |
+
step=0.01,
|
| 735 |
+
info="0 = mostly ignore input audio, 1 = strongly follow input audio",
|
| 736 |
+
)
|
| 737 |
prompt = gr.Textbox(
|
| 738 |
label="Prompt",
|
| 739 |
info="for best results - make it as elaborate as possible",
|
|
|
|
| 816 |
None,
|
| 817 |
"pinkknit.jpg",
|
| 818 |
None,
|
| 819 |
+
0.0,
|
| 820 |
"The camera falls downward through darkness as if dropped into a tunnel. "
|
| 821 |
"As it slows, five friends wearing pink knitted hats and sunglasses lean "
|
| 822 |
"over and look down toward the camera with curious expressions. The lens "
|
|
|
|
| 842 |
],
|
| 843 |
],
|
| 844 |
inputs=[
|
| 845 |
+
first_image, last_image, input_audio, audio_mix_ratio, prompt, duration, gpu_duration,
|
| 846 |
enhance_prompt, seed, randomize_seed, height, width,
|
| 847 |
pose_strength, general_strength, motion_strength, dreamlay_strength, mself_strength, dramatic_strength, fluid_strength, liquid_strength, demopose_strength,
|
| 848 |
],
|
|
|
|
| 875 |
generate_btn.click(
|
| 876 |
fn=generate_video,
|
| 877 |
inputs=[
|
| 878 |
+
first_image, last_image, input_audio, audio_mix_ratio, prompt, duration, gpu_duration, enhance_prompt,
|
| 879 |
seed, randomize_seed, height, width,
|
| 880 |
pose_strength, general_strength, motion_strength, dreamlay_strength, mself_strength, dramatic_strength, fluid_strength, liquid_strength, demopose_strength,
|
| 881 |
],
|