Testing3

Runtime error

App Files Community

dagloop5 committed 26 days ago

Commit

733dd76

verified ·

1 Parent(s): 683bc4f

Update app.py

Browse files

Files changed (1) hide show

app.py +40 -7

app.py CHANGED Viewed

@@ -110,6 +110,7 @@ class LTX23DistilledA2VPipeline(DistilledPipeline):
         frame_rate: float,
         images: list[ImageConditioningInput],
         audio_path: str | None = None,
         tiling_config: TilingConfig | None = None,
         enhance_prompt: bool = False,
     ):
@@ -147,6 +148,19 @@ class LTX23DistilledA2VPipeline(DistilledPipeline):
             raise ValueError(f"Could not extract audio stream from {audio_path}")
         encoded_audio_latent = vae_encode_audio(decoded_audio, self.model_ledger.audio_encoder())
         audio_shape = AudioLatentShape.from_duration(batch=1, duration=video_duration, channels=8, mel_bins=16)
         expected_frames = audio_shape.frames
         actual_frames = encoded_audio_latent.shape[2]
@@ -253,11 +267,18 @@ class LTX23DistilledA2VPipeline(DistilledPipeline):
             tiling_config,
             generator,
         )
-        original_audio = Audio(
-            waveform=decoded_audio.waveform.squeeze(0),
-            sampling_rate=decoded_audio.sampling_rate,
-        )
-        return decoded_video, original_audio
 # Model repos
@@ -573,6 +594,7 @@ def get_gpu_duration(
     first_image,
     last_image,
     input_audio,
     prompt: str,
     duration: float,
     gpu_duration: float,
@@ -600,6 +622,7 @@ def generate_video(
     first_image,
     last_image,
     input_audio,
     prompt: str,
     duration: float,
     gpu_duration: float,
@@ -667,6 +690,7 @@ def generate_video(
             frame_rate=frame_rate,
             images=images,
             audio_path=input_audio,
             tiling_config=tiling_config,
             enhance_prompt=enhance_prompt,
         )
@@ -702,6 +726,14 @@ with gr.Blocks(title="LTX-2.3 Distilled") as demo:
                 first_image = gr.Image(label="First Frame (Optional)", type="pil")
                 last_image = gr.Image(label="Last Frame (Optional)", type="pil")
             input_audio = gr.Audio(label="Audio Input (Optional)", type="filepath")
             prompt = gr.Textbox(
                 label="Prompt",
                 info="for best results - make it as elaborate as possible",
@@ -784,6 +816,7 @@ with gr.Blocks(title="LTX-2.3 Distilled") as demo:
                 None,
                 "pinkknit.jpg",
                 None,
                 "The camera falls downward through darkness as if dropped into a tunnel. "
                 "As it slows, five friends wearing pink knitted hats and sunglasses lean "
                 "over and look down toward the camera with curious expressions. The lens "
@@ -809,7 +842,7 @@ with gr.Blocks(title="LTX-2.3 Distilled") as demo:
             ],
         ],
         inputs=[
-            first_image, last_image, input_audio, prompt, duration, gpu_duration,
             enhance_prompt, seed, randomize_seed, height, width,
             pose_strength, general_strength, motion_strength, dreamlay_strength, mself_strength, dramatic_strength, fluid_strength, liquid_strength, demopose_strength,
         ],
@@ -842,7 +875,7 @@ with gr.Blocks(title="LTX-2.3 Distilled") as demo:
     generate_btn.click(
         fn=generate_video,
         inputs=[
-            first_image, last_image, input_audio, prompt, duration, gpu_duration, enhance_prompt,
             seed, randomize_seed, height, width,
             pose_strength, general_strength, motion_strength, dreamlay_strength, mself_strength, dramatic_strength, fluid_strength, liquid_strength, demopose_strength,
         ],

         frame_rate: float,
         images: list[ImageConditioningInput],
         audio_path: str | None = None,
+        audio_mix_ratio: float = 0.35,
         tiling_config: TilingConfig | None = None,
         enhance_prompt: bool = False,
     ):
             raise ValueError(f"Could not extract audio stream from {audio_path}")
         encoded_audio_latent = vae_encode_audio(decoded_audio, self.model_ledger.audio_encoder())
+        # Keep the uploaded audio as a soft conditioning signal, not a hard copy.
+        audio_mix_ratio = float(max(0.0, min(1.0, audio_mix_ratio)))
+        if audio_mix_ratio < 1.0:
+            noise = torch.randn(
+                encoded_audio_latent.shape,
+                device=encoded_audio_latent.device,
+                dtype=encoded_audio_latent.dtype,
+                generator=generator,
+            )
+            encoded_audio_latent = (
+                audio_mix_ratio * encoded_audio_latent
+                + (1.0 - audio_mix_ratio) * noise
+            )
         audio_shape = AudioLatentShape.from_duration(batch=1, duration=video_duration, channels=8, mel_bins=16)
         expected_frames = audio_shape.frames
         actual_frames = encoded_audio_latent.shape[2]
             tiling_config,
             generator,
         )
+        generated_audio_latent = getattr(video_state, "audio_latent", None)
+        if generated_audio_latent is None:
+            raise RuntimeError(
+                "No generated audio latent was returned. "
+                "Patch denoise_video_only() to expose the audio latent, "
+                "or switch this block to the upstream stage API that returns "
+                "video_state, audio_state."
+            )
+        decoded_audio = self.model_ledger.audio_decoder()(generated_audio_latent)
+        return decoded_video, decoded_audio
 # Model repos
     first_image,
     last_image,
     input_audio,
+    audio_mix_ratio,
     prompt: str,
     duration: float,
     gpu_duration: float,
     first_image,
     last_image,
     input_audio,
+    audio_mix_ratio,
     prompt: str,
     duration: float,
     gpu_duration: float,
             frame_rate=frame_rate,
             images=images,
             audio_path=input_audio,
+            audio_mix_ratio=audio_mix_ratio,
             tiling_config=tiling_config,
             enhance_prompt=enhance_prompt,
         )
                 first_image = gr.Image(label="First Frame (Optional)", type="pil")
                 last_image = gr.Image(label="Last Frame (Optional)", type="pil")
             input_audio = gr.Audio(label="Audio Input (Optional)", type="filepath")
+            audio_mix_ratio = gr.Slider(
+                label="Audio Conditioning Strength",
+                minimum=0.0,
+                maximum=1.0,
+                value=0.35,
+                step=0.01,
+                info="0 = mostly ignore input audio, 1 = strongly follow input audio",
+            )
             prompt = gr.Textbox(
                 label="Prompt",
                 info="for best results - make it as elaborate as possible",
                 None,
                 "pinkknit.jpg",
                 None,
+                0.0,
                 "The camera falls downward through darkness as if dropped into a tunnel. "
                 "As it slows, five friends wearing pink knitted hats and sunglasses lean "
                 "over and look down toward the camera with curious expressions. The lens "
             ],
         ],
         inputs=[
+            first_image, last_image, input_audio, audio_mix_ratio, prompt, duration, gpu_duration,
             enhance_prompt, seed, randomize_seed, height, width,
             pose_strength, general_strength, motion_strength, dreamlay_strength, mself_strength, dramatic_strength, fluid_strength, liquid_strength, demopose_strength,
         ],
     generate_btn.click(
         fn=generate_video,
         inputs=[
+            first_image, last_image, input_audio, audio_mix_ratio, prompt, duration, gpu_duration, enhance_prompt,
             seed, randomize_seed, height, width,
             pose_strength, general_strength, motion_strength, dreamlay_strength, mself_strength, dramatic_strength, fluid_strength, liquid_strength, demopose_strength,
         ],