Update app.py
app.py (changed)
@@ -66,6 +66,8 @@ from ltx_pipelines.utils.helpers import (
     simple_denoising_func,
 )
 from ltx_pipelines.utils.media_io import decode_audio_from_file, encode_video
+from ltx_pipelines.utils.denoisers import SimpleDenoiser
+from ltx_pipelines.utils.types import ModalitySpec
 from ltx_core.loader.primitives import LoraPathStrengthAndSDOps
 from ltx_core.loader.sd_ops import LTXV_LORA_COMFY_RENAMING_MAP
 
@@ -131,7 +133,6 @@ class LTX23DistilledA2VPipeline(DistilledPipeline):
 
         generator = torch.Generator(device=self.device).manual_seed(seed)
         noiser = GaussianNoiser(generator=generator)
-        stepper = EulerDiffusionStep()
         dtype = torch.bfloat16
 
         (ctx_p,) = encode_prompts(
@@ -148,7 +149,8 @@ class LTX23DistilledA2VPipeline(DistilledPipeline):
             raise ValueError(f"Could not extract audio stream from {audio_path}")
 
         encoded_audio_latent = vae_encode_audio(decoded_audio, self.model_ledger.audio_encoder())
-
+
+        # Keep the uploaded audio as a soft prior instead of a hard target.
         audio_mix_ratio = float(max(0.0, min(1.0, audio_mix_ratio)))
         if audio_mix_ratio < 1.0:
             noise = torch.randn(
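The new soft-prior handling clamps `audio_mix_ratio` to [0, 1] and, when it is below 1.0, blends the encoded audio latent with Gaussian noise so the uploaded audio steers generation without pinning it. A minimal torch-only sketch of the same step; `soften_audio_prior` is a hypothetical helper name, not something in the repo:

```python
import torch

def soften_audio_prior(
    latent: torch.Tensor,
    mix_ratio: float,
    generator: torch.Generator | None = None,
) -> torch.Tensor:
    # Clamp to [0, 1]: 1.0 keeps the uploaded audio exactly,
    # 0.0 replaces it with pure Gaussian noise.
    mix_ratio = float(max(0.0, min(1.0, mix_ratio)))
    if mix_ratio >= 1.0:
        return latent
    noise = torch.randn(
        latent.shape, generator=generator,
        device=latent.device, dtype=latent.dtype,
    )
    # Convex combination: the audio prior fades out as mix_ratio drops.
    return mix_ratio * latent + (1.0 - mix_ratio) * noise
```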
@@ -161,7 +163,13 @@ class LTX23DistilledA2VPipeline(DistilledPipeline):
             audio_mix_ratio * encoded_audio_latent
             + (1.0 - audio_mix_ratio) * noise
         )
-
+
+        audio_shape = AudioLatentShape.from_duration(
+            batch=1,
+            duration=video_duration,
+            channels=8,
+            mel_bins=16,
+        )
         expected_frames = audio_shape.frames
         actual_frames = encoded_audio_latent.shape[2]
 
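The surrounding frame bookkeeping (expected_frames from `AudioLatentShape.from_duration` vs. actual_frames from the encoded latent, with the pad and `torch.cat` that follow) reconciles the audio latent's temporal length with the requested video duration. A torch-only sketch of that reconciliation; `match_frame_count` is a hypothetical name, and the (batch, channels, frames, ...) layout is inferred from the dim=2 indexing in the diff:

```python
import torch

def match_frame_count(latent: torch.Tensor, expected_frames: int) -> torch.Tensor:
    # Latent layout assumed (batch, channels, frames, ...), so frames sit on
    # dim=2, matching encoded_audio_latent.shape[2] in the diff.
    actual_frames = latent.shape[2]
    if actual_frames > expected_frames:
        # Too long: trim trailing frames.
        return latent[:, :, :expected_frames]
    if actual_frames < expected_frames:
        # Too short: zero-pad, mirroring torch.cat([latent, pad], dim=2).
        pad = latent.new_zeros(
            latent.shape[0], latent.shape[1],
            expected_frames - actual_frames, *latent.shape[3:],
        )
        return torch.cat([latent, pad], dim=2)
    return latent
```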
@@ -178,22 +186,8 @@ class LTX23DistilledA2VPipeline(DistilledPipeline):
         )
         encoded_audio_latent = torch.cat([encoded_audio_latent, pad], dim=2)
 
-        video_encoder = self.model_ledger.video_encoder()
-        transformer = self.model_ledger.transformer()
         stage_1_sigmas = torch.tensor(DISTILLED_SIGMA_VALUES, device=self.device)
-
-        def denoising_loop(sigmas, video_state, audio_state, stepper):
-            return euler_denoising_loop(
-                sigmas=sigmas,
-                video_state=video_state,
-                audio_state=audio_state,
-                stepper=stepper,
-                denoise_fn=simple_denoising_func(
-                    video_context=video_context,
-                    audio_context=audio_context,
-                    transformer=transformer,
-                ),
-            )
+        stage_2_sigmas = torch.tensor(STAGE_2_DISTILLED_SIGMA_VALUES, device=self.device)
 
         stage_1_output_shape = VideoPixelShape(
             batch=1,
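Stage 2 now gets its own schedule (STAGE_2_DISTILLED_SIGMA_VALUES) instead of reusing the stage-1 sigmas. Since stage 2 refines an already-denoised, upscaled latent, its schedule can start at a lower sigma with fewer steps. The values below are invented placeholders for illustration only; the real constants come from the repo:

```python
import torch

# Placeholder schedules (NOT the repo's real constants): descending sigmas,
# with stage 2 starting lower because it refines an already-denoised upscale.
DISTILLED_SIGMA_VALUES = [1.0, 0.9375, 0.75, 0.4375, 0.0]
STAGE_2_DISTILLED_SIGMA_VALUES = [0.9, 0.75, 0.0]

stage_1_sigmas = torch.tensor(DISTILLED_SIGMA_VALUES)
stage_2_sigmas = torch.tensor(STAGE_2_DISTILLED_SIGMA_VALUES)

# The first sigma of each schedule doubles as the noise_scale used to
# re-noise that stage's initial latent (see the stage calls below).
assert stage_2_sigmas[0] <= stage_1_sigmas[0]
```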
@@ -206,21 +200,28 @@ class LTX23DistilledA2VPipeline(DistilledPipeline):
             images=images,
             height=stage_1_output_shape.height,
             width=stage_1_output_shape.width,
-            video_encoder=video_encoder,
+            video_encoder=self.model_ledger.video_encoder(),
             dtype=dtype,
             device=self.device,
         )
-
-
-
-            noiser=noiser,
+
+        video_state, audio_state = self.stage(
+            denoiser=SimpleDenoiser(video_context, audio_context),
             sigmas=stage_1_sigmas,
-
-
-
-
-
-
+            noiser=noiser,
+            width=stage_1_output_shape.width,
+            height=stage_1_output_shape.height,
+            frames=num_frames,
+            fps=frame_rate,
+            video=ModalitySpec(
+                context=video_context,
+                conditionings=stage_1_conditionings,
+            ),
+            audio=ModalitySpec(
+                context=audio_context,
+                noise_scale=stage_1_sigmas[0].item(),
+                initial_latent=encoded_audio_latent,
+            ),
         )
 
         torch.cuda.synchronize()
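Passing `noise_scale=stage_1_sigmas[0].item()` together with an `initial_latent` suggests that `self.stage` re-noises each starting latent up to the schedule's first sigma before denoising. A sketch of that step under a rectified-flow interpolation; this is an assumption about the stage internals, not code from the repo:

```python
import torch

def renoise_to_sigma(
    latent: torch.Tensor,
    sigma: float,
    generator: torch.Generator | None = None,
) -> torch.Tensor:
    # Rectified-flow style interpolation between clean latent and noise:
    # sigma=1.0 yields pure noise, sigma=0.0 returns the latent untouched.
    noise = torch.randn(
        latent.shape, generator=generator,
        device=latent.device, dtype=latent.dtype,
    )
    return (1.0 - sigma) * latent + sigma * noise
```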
@@ -228,56 +229,56 @@ class LTX23DistilledA2VPipeline(DistilledPipeline):
 
         upscaled_video_latent = upsample_video(
             latent=video_state.latent[:1],
-            video_encoder=video_encoder,
+            video_encoder=self.model_ledger.video_encoder(),
             upsampler=self.model_ledger.spatial_upsampler(),
         )
-
-        stage_2_output_shape = VideoPixelShape(
+
+        stage_2_output_shape = VideoPixelShape(
+            batch=1,
+            frames=num_frames,
+            width=width,
+            height=height,
+            fps=frame_rate,
+        )
         stage_2_conditionings = combined_image_conditionings(
             images=images,
             height=stage_2_output_shape.height,
             width=stage_2_output_shape.width,
-            video_encoder=video_encoder,
+            video_encoder=self.model_ledger.video_encoder(),
             dtype=dtype,
             device=self.device,
         )
-
-
-
-            noiser=noiser,
+
+        video_state, audio_state = self.stage(
+            denoiser=SimpleDenoiser(video_context, audio_context),
             sigmas=stage_2_sigmas,
-
-
-
-
-
-
-
-
+            noiser=noiser,
+            width=stage_2_output_shape.width,
+            height=stage_2_output_shape.height,
+            frames=num_frames,
+            fps=frame_rate,
+            video=ModalitySpec(
+                context=video_context,
+                conditionings=stage_2_conditionings,
+                noise_scale=stage_2_sigmas[0].item(),
+                initial_latent=upscaled_video_latent,
+            ),
+            audio=ModalitySpec(
+                context=audio_context,
+                noise_scale=stage_2_sigmas[0].item(),
+                initial_latent=audio_state.latent,
+            ),
         )
 
         torch.cuda.synchronize()
-        del transformer
-        del video_encoder
         cleanup_memory()
 
-        decoded_video =
+        decoded_video = self.model_ledger.video_decoder()(
             video_state.latent,
-            self.model_ledger.video_decoder(),
             tiling_config,
             generator,
         )
-
-        generated_audio_latent = getattr(video_state, "audio_latent", None)
-        if generated_audio_latent is None:
-            raise RuntimeError(
-                "No generated audio latent was returned. "
-                "Patch denoise_video_only() to expose the audio latent, "
-                "or switch this block to the upstream stage API that returns "
-                "video_state, audio_state."
-            )
-
-        decoded_audio = self.model_ledger.audio_decoder()(generated_audio_latent)
+        decoded_audio = self.model_ledger.audio_decoder()(audio_state.latent)
         return decoded_video, decoded_audio
 
 
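With both stage calls returning (video_state, audio_state), the old getattr/RuntimeError fallback for the audio latent is no longer needed, and the audio decoder consumes audio_state.latent directly. The `cleanup_memory()` call between the stages and the decoders is a repo helper; a typical implementation of such a helper looks like the sketch below, though the repo's actual body may differ:

```python
import gc
import torch

def cleanup_memory() -> None:
    # Drop Python-level references, then release cached CUDA blocks so the
    # decoders can allocate without fragmenting against stage buffers.
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
```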