dagloop5 committed on
Commit
4e8337c
·
verified ·
1 Parent(s): 9a24168

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +42 -82
app.py CHANGED
@@ -102,7 +102,7 @@ RESOLUTIONS = {
102
 
103
 
104
  class LTX23DistilledA2VPipeline(DistilledPipeline):
105
- """DistilledPipeline with optional audio conditioning."""
106
 
107
  def __call__(
108
  self,
@@ -117,20 +117,7 @@ class LTX23DistilledA2VPipeline(DistilledPipeline):
117
  tiling_config: TilingConfig | None = None,
118
  enhance_prompt: bool = False,
119
  ):
120
- # Standard path when no audio input is provided.
121
  print(prompt)
122
- if audio_path is None:
123
- return super().__call__(
124
- prompt=prompt,
125
- seed=seed,
126
- height=height,
127
- width=width,
128
- num_frames=num_frames,
129
- frame_rate=frame_rate,
130
- images=images,
131
- tiling_config=tiling_config,
132
- enhance_prompt=enhance_prompt,
133
- )
134
 
135
  generator = torch.Generator(device=self.device).manual_seed(seed)
136
  noiser = GaussianNoiser(generator=generator)
@@ -145,32 +132,41 @@ class LTX23DistilledA2VPipeline(DistilledPipeline):
145
  )
146
  video_context, audio_context = ctx_p.video_encoding, ctx_p.audio_encoding
147
 
148
- video_duration = num_frames / frame_rate
149
- decoded_audio = decode_audio_from_file(audio_path, self.device, 0.0, video_duration)
150
- if decoded_audio is None:
151
- raise ValueError(f"Could not extract audio stream from {audio_path}")
152
-
153
- encoded_audio_latent = vae_encode_audio(decoded_audio, self.model_ledger.audio_encoder())
154
- audio_shape = AudioLatentShape.from_duration(batch=1, duration=video_duration, channels=8, mel_bins=16)
155
- expected_frames = audio_shape.frames
156
- actual_frames = encoded_audio_latent.shape[2]
157
-
158
- if actual_frames > expected_frames:
159
- encoded_audio_latent = encoded_audio_latent[:, :, :expected_frames, :]
160
- elif actual_frames < expected_frames:
161
- pad = torch.zeros(
162
- encoded_audio_latent.shape[0],
163
- encoded_audio_latent.shape[1],
164
- expected_frames - actual_frames,
165
- encoded_audio_latent.shape[3],
166
- device=encoded_audio_latent.device,
167
- dtype=encoded_audio_latent.dtype,
 
 
 
 
 
 
 
 
 
 
168
  )
169
- encoded_audio_latent = torch.cat([encoded_audio_latent, pad], dim=2)
170
 
171
  video_encoder = self.model_ledger.video_encoder()
172
  transformer = self.model_ledger.transformer()
173
- stage_1_sigmas = torch.tensor(DISTILLED_SIGMA_VALUES, device=self.device)
174
 
175
  def denoising_loop(sigmas, video_state, audio_state, stepper):
176
  return euler_denoising_loop(
@@ -185,26 +181,26 @@ class LTX23DistilledA2VPipeline(DistilledPipeline):
185
  ),
186
  )
187
 
188
- stage_1_output_shape = VideoPixelShape(
189
  batch=1,
190
  frames=num_frames,
191
- width=width // 2,
192
- height=height // 2,
193
  fps=frame_rate,
194
  )
195
- stage_1_conditionings = combined_image_conditionings(
196
  images=images,
197
- height=stage_1_output_shape.height,
198
- width=stage_1_output_shape.width,
199
  video_encoder=video_encoder,
200
  dtype=dtype,
201
  device=self.device,
202
  )
203
  video_state = denoise_video_only(
204
- output_shape=stage_1_output_shape,
205
- conditionings=stage_1_conditionings,
206
  noiser=noiser,
207
- sigmas=stage_1_sigmas,
208
  stepper=stepper,
209
  denoising_loop_fn=denoising_loop,
210
  components=self.pipeline_components,
@@ -213,39 +209,6 @@ class LTX23DistilledA2VPipeline(DistilledPipeline):
213
  initial_audio_latent=encoded_audio_latent,
214
  )
215
 
216
- torch.cuda.synchronize()
217
- cleanup_memory()
218
-
219
- upscaled_video_latent = upsample_video(
220
- latent=video_state.latent[:1],
221
- video_encoder=video_encoder,
222
- upsampler=self.model_ledger.spatial_upsampler(),
223
- )
224
- stage_2_sigmas = torch.tensor(STAGE_2_DISTILLED_SIGMA_VALUES, device=self.device)
225
- stage_2_output_shape = VideoPixelShape(batch=1, frames=num_frames, width=width, height=height, fps=frame_rate)
226
- stage_2_conditionings = combined_image_conditionings(
227
- images=images,
228
- height=stage_2_output_shape.height,
229
- width=stage_2_output_shape.width,
230
- video_encoder=video_encoder,
231
- dtype=dtype,
232
- device=self.device,
233
- )
234
- video_state = denoise_video_only(
235
- output_shape=stage_2_output_shape,
236
- conditionings=stage_2_conditionings,
237
- noiser=noiser,
238
- sigmas=stage_2_sigmas,
239
- stepper=stepper,
240
- denoising_loop_fn=denoising_loop,
241
- components=self.pipeline_components,
242
- dtype=dtype,
243
- device=self.device,
244
- noise_scale=stage_2_sigmas[0],
245
- initial_video_latent=upscaled_video_latent,
246
- initial_audio_latent=encoded_audio_latent,
247
- )
248
-
249
  torch.cuda.synchronize()
250
  del transformer
251
  del video_encoder
@@ -257,10 +220,7 @@ class LTX23DistilledA2VPipeline(DistilledPipeline):
257
  tiling_config,
258
  generator,
259
  )
260
- original_audio = Audio(
261
- waveform=decoded_audio.waveform.squeeze(0),
262
- sampling_rate=decoded_audio.sampling_rate,
263
- )
264
  return decoded_video, original_audio
265
 
266
 
 
102
 
103
 
104
  class LTX23DistilledA2VPipeline(DistilledPipeline):
105
+ """DistilledPipeline: single stage, full resolution, 8 steps, with optional audio."""
106
 
107
  def __call__(
108
  self,
 
117
  tiling_config: TilingConfig | None = None,
118
  enhance_prompt: bool = False,
119
  ):
 
120
  print(prompt)
 
 
 
 
 
 
 
 
 
 
 
 
121
 
122
  generator = torch.Generator(device=self.device).manual_seed(seed)
123
  noiser = GaussianNoiser(generator=generator)
 
132
  )
133
  video_context, audio_context = ctx_p.video_encoding, ctx_p.audio_encoding
134
 
135
+ # Audio encoding only runs if audio is provided
136
+ encoded_audio_latent = None
137
+ original_audio = None
138
+ if audio_path is not None:
139
+ video_duration = num_frames / frame_rate
140
+ decoded_audio = decode_audio_from_file(audio_path, self.device, 0.0, video_duration)
141
+ if decoded_audio is None:
142
+ raise ValueError(f"Could not extract audio stream from {audio_path}")
143
+
144
+ encoded_audio_latent = vae_encode_audio(decoded_audio, self.model_ledger.audio_encoder())
145
+ audio_shape = AudioLatentShape.from_duration(batch=1, duration=video_duration, channels=8, mel_bins=16)
146
+ expected_frames = audio_shape.frames
147
+ actual_frames = encoded_audio_latent.shape[2]
148
+
149
+ if actual_frames > expected_frames:
150
+ encoded_audio_latent = encoded_audio_latent[:, :, :expected_frames, :]
151
+ elif actual_frames < expected_frames:
152
+ pad = torch.zeros(
153
+ encoded_audio_latent.shape[0],
154
+ encoded_audio_latent.shape[1],
155
+ expected_frames - actual_frames,
156
+ encoded_audio_latent.shape[3],
157
+ device=encoded_audio_latent.device,
158
+ dtype=encoded_audio_latent.dtype,
159
+ )
160
+ encoded_audio_latent = torch.cat([encoded_audio_latent, pad], dim=2)
161
+
162
+ original_audio = Audio(
163
+ waveform=decoded_audio.waveform.squeeze(0),
164
+ sampling_rate=decoded_audio.sampling_rate,
165
  )
 
166
 
167
  video_encoder = self.model_ledger.video_encoder()
168
  transformer = self.model_ledger.transformer()
169
+ sigmas = torch.tensor(DISTILLED_SIGMA_VALUES, device=self.device)
170
 
171
  def denoising_loop(sigmas, video_state, audio_state, stepper):
172
  return euler_denoising_loop(
 
181
  ),
182
  )
183
 
184
+ output_shape = VideoPixelShape(
185
  batch=1,
186
  frames=num_frames,
187
+ width=width,
188
+ height=height,
189
  fps=frame_rate,
190
  )
191
+ conditionings = combined_image_conditionings(
192
  images=images,
193
+ height=output_shape.height,
194
+ width=output_shape.width,
195
  video_encoder=video_encoder,
196
  dtype=dtype,
197
  device=self.device,
198
  )
199
  video_state = denoise_video_only(
200
+ output_shape=output_shape,
201
+ conditionings=conditionings,
202
  noiser=noiser,
203
+ sigmas=sigmas,
204
  stepper=stepper,
205
  denoising_loop_fn=denoising_loop,
206
  components=self.pipeline_components,
 
209
  initial_audio_latent=encoded_audio_latent,
210
  )
211
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
212
  torch.cuda.synchronize()
213
  del transformer
214
  del video_encoder
 
220
  tiling_config,
221
  generator,
222
  )
223
+
 
 
 
224
  return decoded_video, original_audio
225
 
226