TestingwithNeg

Running on Zero

App Files Files Community

dagloop5 committed 4 days ago

Commit

ed1c038

verified ·

1 Parent(s): ec187f4

Update app.py

Browse files

Files changed (1) hide show

app.py +33 -9

app.py CHANGED Viewed

@@ -51,9 +51,10 @@ from safetensors import safe_open
 import json
 import requests
-from ltx_core.components.diffusion_steps import EulerDiffusionStep
 from ltx_core.components.guiders import MultiModalGuider, MultiModalGuiderParams
 from ltx_core.components.noisers import GaussianNoiser
 from ltx_core.model.audio_vae import encode_audio as vae_encode_audio
 from ltx_core.model.audio_vae import decode_audio as vae_decode_audio
 from ltx_core.model.upsampler import upsample_video
@@ -71,6 +72,7 @@ from ltx_pipelines.utils.helpers import (
     encode_prompts,
     simple_denoising_func,
     multi_modal_guider_denoising_func,
 )
 from ltx_pipelines.utils.media_io import decode_audio_from_file, encode_video
 from ltx_core.loader.primitives import LoraPathStrengthAndSDOps
@@ -152,6 +154,7 @@ class LTX23DistilledA2VPipeline:
         video_guider_params: MultiModalGuiderParams,
         audio_guider_params: MultiModalGuiderParams,
         images: list[ImageConditioningInput],
         audio_path: str | None = None,
         tiling_config: TilingConfig | None = None,
         enhance_prompt: bool = False,
@@ -160,7 +163,7 @@ class LTX23DistilledA2VPipeline:
         generator = torch.Generator(device=self.device).manual_seed(seed)
         noiser = GaussianNoiser(generator=generator)
-        stepper = EulerDiffusionStep()
         dtype = torch.bfloat16
         ctx_p, ctx_n = encode_prompts(
@@ -201,10 +204,19 @@ class LTX23DistilledA2VPipeline:
         video_encoder = self.model_ledger.video_encoder()
         transformer = self.model_ledger.transformer()
-        stage_1_sigmas = torch.tensor(DISTILLED_SIGMA_VALUES, device=self.device)
-        def stage1_denoising_loop(sigmas, video_state, audio_state, stepper):
-            return euler_denoising_loop(
                 sigmas=sigmas,
                 video_state=video_state,
                 audio_state=audio_state,
@@ -224,8 +236,8 @@ class LTX23DistilledA2VPipeline:
                 ),
             )
-        def stage2_denoising_loop(sigmas, video_state, audio_state, stepper):
-            return euler_denoising_loop(
                 sigmas=sigmas,
                 video_state=video_state,
                 audio_state=audio_state,
@@ -674,9 +686,12 @@ def get_gpu_duration(
     voice_strength: float = 0.0,
     realism_strength: float = 0.0,
     transition_strength: float = 0.0,
     progress=None,
 ):
-    return int(gpu_duration)
 @spaces.GPU(duration=get_gpu_duration)
 @torch.inference_mode()
@@ -713,6 +728,7 @@ def generate_video(
     voice_strength: float = 0.0,
     realism_strength: float = 0.0,
     transition_strength: float = 0.0,
     progress=gr.Progress(track_tqdm=True),
 ):
     try:
@@ -783,6 +799,7 @@ def generate_video(
             video_guider_params=video_guider_params,
             audio_guider_params=audio_guider_params,
             images=images,
             audio_path=input_audio,
             tiling_config=tiling_config,
             enhance_prompt=enhance_prompt,
@@ -860,6 +877,13 @@ with gr.Blocks(title="LTX-2.3 Distilled with LoRAs, Negative Prompting, and Adva
             with gr.Row():
                 width = gr.Number(label="Width", value=1536, precision=0)
                 height = gr.Number(label="Height", value=1024, precision=0)
             generate_btn = gr.Button("Generate Video", variant="primary", size="lg")
@@ -973,7 +997,7 @@ with gr.Blocks(title="LTX-2.3 Distilled with LoRAs, Negative Prompting, and Adva
             pose_strength, general_strength, motion_strength,
             dreamlay_strength, mself_strength, dramatic_strength, fluid_strength,
             liquid_strength, demopose_strength, voice_strength, realism_strength,
-            transition_strength,
         ],
         outputs=[output_video, seed],
     )

 import json
 import requests
+from ltx_core.components.diffusion_steps import Res2sDiffusionStep
 from ltx_core.components.guiders import MultiModalGuider, MultiModalGuiderParams
 from ltx_core.components.noisers import GaussianNoiser
+from ltx_core.components.schedulers import LTX2Scheduler
 from ltx_core.model.audio_vae import encode_audio as vae_encode_audio
 from ltx_core.model.audio_vae import decode_audio as vae_decode_audio
 from ltx_core.model.upsampler import upsample_video
     encode_prompts,
     simple_denoising_func,
     multi_modal_guider_denoising_func,
+    res2s_audio_video_denoising_loop,
 )
 from ltx_pipelines.utils.media_io import decode_audio_from_file, encode_video
 from ltx_core.loader.primitives import LoraPathStrengthAndSDOps
         video_guider_params: MultiModalGuiderParams,
         audio_guider_params: MultiModalGuiderParams,
         images: list[ImageConditioningInput],
+        num_inference_steps: int = 8,
         audio_path: str | None = None,
         tiling_config: TilingConfig | None = None,
         enhance_prompt: bool = False,
         generator = torch.Generator(device=self.device).manual_seed(seed)
         noiser = GaussianNoiser(generator=generator)
+        stepper = Res2sDiffusionStep()
         dtype = torch.bfloat16
         ctx_p, ctx_n = encode_prompts(
         video_encoder = self.model_ledger.video_encoder()
         transformer = self.model_ledger.transformer()
+        # Stage 1: Generate sigmas using LTX2Scheduler with user-specified steps
+        empty_latent = torch.empty(VideoLatentShape.from_pixel_shape(
+            VideoPixelShape(batch=1, frames=num_frames, width=width // 2, height=height // 2, fps=frame_rate)
+        ).to_torch_shape())
+        stage_1_sigmas = (
+            LTX2Scheduler()
+            .execute(latent=empty_latent, steps=num_inference_steps)
+            .to(dtype=torch.float32, device=self.device)
+        )
+        def stage1_denoising_loop(sigmas: torch.Tensor, video_state, audio_state, stepper: DiffusionStepProtocol):
+            return res2s_audio_video_denoising_loop(
                 sigmas=sigmas,
                 video_state=video_state,
                 audio_state=audio_state,
                 ),
             )
+        def stage2_denoising_loop(sigmas: torch.Tensor, video_state, audio_state, stepper: DiffusionStepProtocol):
+            return res2s_audio_video_denoising_loop(
                 sigmas=sigmas,
                 video_state=video_state,
                 audio_state=audio_state,
     voice_strength: float = 0.0,
     realism_strength: float = 0.0,
     transition_strength: float = 0.0,
+    num_inference_steps: int = 8,
     progress=None,
 ):
+    base_duration = int(gpu_duration)
+    step_ratio = num_inference_steps / 8  # Normalize to 8 steps as baseline
+    return int(base_duration * step_ratio)
 @spaces.GPU(duration=get_gpu_duration)
 @torch.inference_mode()
     voice_strength: float = 0.0,
     realism_strength: float = 0.0,
     transition_strength: float = 0.0,
+    num_inference_steps: int = 8,
     progress=gr.Progress(track_tqdm=True),
 ):
     try:
             video_guider_params=video_guider_params,
             audio_guider_params=audio_guider_params,
             images=images,
+            num_inference_steps=num_inference_steps,
             audio_path=input_audio,
             tiling_config=tiling_config,
             enhance_prompt=enhance_prompt,
             with gr.Row():
                 width = gr.Number(label="Width", value=1536, precision=0)
                 height = gr.Number(label="Height", value=1024, precision=0)
+            with gr.Row():
+                num_inference_steps = gr.Slider(
+                    label="Stage 1 Inference Steps",
+                    minimum=2, maximum=16, value=8, step=1,
+                     info="Higher = more quality but slower (Stage 2 uses fixed 3 steps)"
+                )
             generate_btn = gr.Button("Generate Video", variant="primary", size="lg")
             pose_strength, general_strength, motion_strength,
             dreamlay_strength, mself_strength, dramatic_strength, fluid_strength,
             liquid_strength, demopose_strength, voice_strength, realism_strength,
+            transition_strength, num_inference_steps,
         ],
         outputs=[output_video, seed],
     )