ltx-video-distilled

Paused

App Files Files Community

ford442 committed on Sep 4

Commit

85f9298

verified ·

1 Parent(s): 50da42d

Update app.py

Browse files

Files changed (1) hide show

app.py +45 -28

app.py CHANGED Viewed

@@ -14,10 +14,6 @@ os.putenv('HF_HUB_ENABLE_HF_TRANSFER','1')
 import torch
 # --- NEW ---
 # Import the OpenCV library
-import cv2
-import gc
-from image_gen_aux import UpscaleWithModel
 torch.backends.cuda.matmul.allow_tf32 = False
 torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = False
@@ -29,6 +25,11 @@ torch.backends.cuda.preferred_blas_library="cublas"
 torch.backends.cuda.preferred_linalg_library="cusolver"
 torch.set_float32_matmul_precision("highest")
 import gradio as gr
 import numpy as np
 import random
@@ -40,14 +41,30 @@ from PIL import Image
 from huggingface_hub import hf_hub_download
 import shutil
 MAX_SEED = np.iinfo(np.int32).max
 upscaler = UpscaleWithModel.from_pretrained("Kim2091/ClearRealityV1").to(torch.device("cuda:0"))
-#import diffusers
-from diffusers import StableDiffusionXLImg2ImgPipeline, AutoencoderKL
 print("Loading SDXL Image-to-Image pipeline...")
 #vaeX = AutoencoderKL.from_pretrained('stabilityai/stable-diffusion-xl-refiner-1.0',subfolder='vae')
 enhancer_pipeline = StableDiffusionXLImg2ImgPipeline.from_pretrained(
     #"stabilityai/stable-diffusion-xl-base-1.0",
     "ford442/stable-diffusion-xl-refiner-1.0-bf16",
@@ -57,24 +74,13 @@ enhancer_pipeline = StableDiffusionXLImg2ImgPipeline.from_pretrained(
     requires_aesthetics_score=True,
     #vae=None
 )
 #enhancer_pipeline.vae=vaeX
 enhancer_pipeline.vae.set_default_attn_processor()
 enhancer_pipeline.to("cpu")
 print("SDXL Image-to-Image pipeline loaded successfully.")
-from inference import (
-    create_ltx_video_pipeline,
-    create_latent_upsampler,
-    load_image_to_tensor_with_resize_and_crop,
-    seed_everething,
-    get_device,
-    calculate_padding,
-    load_media_file
-)
-from ltx_video.pipelines.pipeline_ltx_video import ConditioningItem, LTXMultiScalePipeline
-from ltx_video.utils.skip_layer_strategy import SkipLayerStrategy
 config_file_path = "configs/ltxv-13b-0.9.8-distilled.yaml"
 with open(config_file_path, "r") as file:
     PIPELINE_CONFIG_YAML = yaml.safe_load(file)
@@ -83,7 +89,6 @@ LTX_REPO = "Lightricks/LTX-Video"
 MAX_IMAGE_SIZE = PIPELINE_CONFIG_YAML.get("max_resolution", 1280)
 MAX_NUM_FRAMES = 900
-# ... (model loading code is unchanged) ...
 pipeline_instance = None
 latent_upsampler_instance = None
 models_dir = "downloaded_models_gradio_cpu_init"
@@ -120,21 +125,24 @@ def get_duration(*args, **kwargs):
     if duration_ui > 5.0: return 100
     return 90
 @spaces.GPU(duration=45)
 def enhance_frame(prompt, image_to_enhance: Image.Image):
     try:
         print("Moving enhancer pipeline to GPU...")
         seed = random.randint(0, MAX_SEED)
-        generator = torch.Generator(device='cpu').manual_seed(seed)
         enhancer_pipeline.to("cuda",torch.bfloat16)
         refine_prompt = prompt +" high detail, sharp focus, 8k, professional"
-        enhanced_image = enhancer_pipeline(prompt=refine_prompt, image=image_to_enhance, strength=0.1, generator=generator, num_inference_steps=220).images[0]
         print("Frame enhancement successful.")
-        print("Doing super-resolution.")
-        with torch.no_grad():
-            upscale = upscaler(enhanced_image, tiling=True, tile_width=1024, tile_height=1024)
-            enhanced_image = upscale.resize((upscale.width // 4, upscale.height // 4), Image.LANCZOS)
     except Exception as e:
         print(f"Error during frame enhancement: {e}")
         gr.Warning("Frame enhancement failed. Using original frame.")
@@ -147,7 +155,7 @@ def enhance_frame(prompt, image_to_enhance: Image.Image):
     return enhanced_image
-def use_last_frame_as_input(prompt, video_filepath, do_enhance):
     if not video_filepath or not os.path.exists(video_filepath):
         gr.Warning("No video clip available.")
         return None, gr.update()
@@ -162,12 +170,18 @@ def use_last_frame_as_input(prompt, video_filepath, do_enhance):
                 # 1. Immediately yield the original frame to the UI
         print("Displaying original last frame...")
         yield pil_image, gr.update()
         if do_enhance:
             enhanced_image = enhance_frame(prompt, pil_image)
             # 2. Yield the enhanced frame and switch the tab
             print("Displaying enhanced frame and switching tab...")
             yield enhanced_image, gr.update(selected="i2v_tab")
         else:
             # If not enhancing, just switch the tab
             yield pil_image, gr.update(selected="i2v_tab")
     except Exception as e:
@@ -210,6 +224,8 @@ def generate(prompt, negative_prompt, clips_list, input_image_filepath, input_vi
         raise gr.Error("input_video_filepath is required for video-to-video mode")
     if randomize_seed: seed_ui = random.randint(0, 2**32 - 1)
     seed_everething(int(seed_ui))
     actual_num_frames = max(9, min(MAX_NUM_FRAMES, int(round((max(1, round(duration_ui * fps)) - 1.0) / 8.0) * 8 + 1)))
     actual_height, actual_width = int(height_ui), int(width_ui)
     height_padded, width_padded = ((actual_height - 1) // 32 + 1) * 32, ((actual_width - 1) // 32 + 1) * 32
@@ -279,6 +295,7 @@ with gr.Blocks(css=css) as demo:
             duration_input = gr.Slider(label="Clip Duration (seconds)", minimum=1.0, maximum=10.0, value=2.0, step=0.1)
             improve_texture = gr.Checkbox(label="Improve Texture (multi-scale)", value=True)
             enhance_checkbox = gr.Checkbox(label="Improve Frame (SDXL Refiner)", value=True)
         with gr.Column():
             output_video = gr.Video(label="Last Generated Clip", interactive=False)
             use_last_frame_button = gr.Button("Use Last Frame as Input Image", visible=False)
@@ -320,7 +337,7 @@ with gr.Blocks(css=css) as demo:
     t2v_button.click(hide_btn, outputs=[use_last_frame_button], queue=False).then(fn=generate, inputs=t2v_inputs, outputs=gen_outputs, api_name="text_to_video")
     i2v_button.click(hide_btn, outputs=[use_last_frame_button], queue=False).then(fn=generate, inputs=i2v_inputs, outputs=gen_outputs, api_name="image_to_video")
     v2v_button.click(hide_btn, outputs=[use_last_frame_button], queue=False).then(fn=generate, inputs=v2v_inputs, outputs=gen_outputs, api_name="video_to_video")
-    use_last_frame_button.click(fn=use_last_frame_as_input, inputs=[i2v_prompt,output_video,enhance_checkbox], outputs=[image_i2v, tabs])
     stitch_button.click(fn=stitch_videos, inputs=[clips_state], outputs=[final_video_output])
     clear_button.click(fn=clear_clips, outputs=[clips_state, clip_counter_display, output_video, final_video_output])
 if __name__ == "__main__":

 import torch
 # --- NEW ---
 # Import the OpenCV library
 torch.backends.cuda.matmul.allow_tf32 = False
 torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = False
 torch.backends.cuda.preferred_linalg_library="cusolver"
 torch.set_float32_matmul_precision("highest")
+import cv2
+import gc
+from image_gen_aux import UpscaleWithModel
 import gradio as gr
 import numpy as np
 import random
 from huggingface_hub import hf_hub_download
 import shutil
+from diffusers import StableDiffusionXLImg2ImgPipeline, AutoencoderKL
+from inference import (
+    create_ltx_video_pipeline,
+    create_latent_upsampler,
+    load_image_to_tensor_with_resize_and_crop,
+    seed_everething,
+    get_device,
+    calculate_padding,
+    load_media_file
+)
+from ltx_video.pipelines.pipeline_ltx_video import ConditioningItem, LTXMultiScalePipeline
+from ltx_video.utils.skip_layer_strategy import SkipLayerStrategy
 MAX_SEED = np.iinfo(np.int32).max
+generator = torch.Generator(device='cuda').manual_seed(seed)
 upscaler = UpscaleWithModel.from_pretrained("Kim2091/ClearRealityV1").to(torch.device("cuda:0"))
 print("Loading SDXL Image-to-Image pipeline...")
 #vaeX = AutoencoderKL.from_pretrained('stabilityai/stable-diffusion-xl-refiner-1.0',subfolder='vae')
 enhancer_pipeline = StableDiffusionXLImg2ImgPipeline.from_pretrained(
     #"stabilityai/stable-diffusion-xl-base-1.0",
     "ford442/stable-diffusion-xl-refiner-1.0-bf16",
     requires_aesthetics_score=True,
     #vae=None
 )
 #enhancer_pipeline.vae=vaeX
 enhancer_pipeline.vae.set_default_attn_processor()
 enhancer_pipeline.to("cpu")
 print("SDXL Image-to-Image pipeline loaded successfully.")
 config_file_path = "configs/ltxv-13b-0.9.8-distilled.yaml"
 with open(config_file_path, "r") as file:
     PIPELINE_CONFIG_YAML = yaml.safe_load(file)
 MAX_IMAGE_SIZE = PIPELINE_CONFIG_YAML.get("max_resolution", 1280)
 MAX_NUM_FRAMES = 900
 pipeline_instance = None
 latent_upsampler_instance = None
 models_dir = "downloaded_models_gradio_cpu_init"
     if duration_ui > 5.0: return 100
     return 90
+@spaces.GPU(duration=30)
+def superres_image(image_to_enhance: Image.Image)
+    print("Doing super-resolution.")
+    with torch.no_grad():
+        upscale = upscaler(enhanced_image, tiling=True, tile_width=1024, tile_height=1024)
+        enhanced_image = upscale.resize((upscale.width // 2, upscale.height // 2), Image.LANCZOS)
+    return enhanced_image
 @spaces.GPU(duration=45)
 def enhance_frame(prompt, image_to_enhance: Image.Image):
     try:
         print("Moving enhancer pipeline to GPU...")
         seed = random.randint(0, MAX_SEED)
+        generator.manual_seed(seed)
         enhancer_pipeline.to("cuda",torch.bfloat16)
         refine_prompt = prompt +" high detail, sharp focus, 8k, professional"
+        enhanced_image = enhancer_pipeline(prompt=refine_prompt, image=image_to_enhance, strength=0.1, generator=generator, num_inference_steps=160).images[0]
         print("Frame enhancement successful.")
     except Exception as e:
         print(f"Error during frame enhancement: {e}")
         gr.Warning("Frame enhancement failed. Using original frame.")
     return enhanced_image
+def use_last_frame_as_input(prompt, video_filepath, do_enhance, do_superres):
     if not video_filepath or not os.path.exists(video_filepath):
         gr.Warning("No video clip available.")
         return None, gr.update()
                 # 1. Immediately yield the original frame to the UI
         print("Displaying original last frame...")
         yield pil_image, gr.update()
+        if do_superres:
+            pil_image = superres_image(pil_image)
         if do_enhance:
             enhanced_image = enhance_frame(prompt, pil_image)
+            if do_superres:
+                enhanced_image = superres_image(enhanced_image)
             # 2. Yield the enhanced frame and switch the tab
             print("Displaying enhanced frame and switching tab...")
             yield enhanced_image, gr.update(selected="i2v_tab")
         else:
+            if do_superres:
+                pil_image = superres_image(pil_image)
             # If not enhancing, just switch the tab
             yield pil_image, gr.update(selected="i2v_tab")
     except Exception as e:
         raise gr.Error("input_video_filepath is required for video-to-video mode")
     if randomize_seed: seed_ui = random.randint(0, 2**32 - 1)
     seed_everething(int(seed_ui))
+    seed = random.randint(0, MAX_SEED)
+    generator.manual_seed(seed)
     actual_num_frames = max(9, min(MAX_NUM_FRAMES, int(round((max(1, round(duration_ui * fps)) - 1.0) / 8.0) * 8 + 1)))
     actual_height, actual_width = int(height_ui), int(width_ui)
     height_padded, width_padded = ((actual_height - 1) // 32 + 1) * 32, ((actual_width - 1) // 32 + 1) * 32
             duration_input = gr.Slider(label="Clip Duration (seconds)", minimum=1.0, maximum=10.0, value=2.0, step=0.1)
             improve_texture = gr.Checkbox(label="Improve Texture (multi-scale)", value=True)
             enhance_checkbox = gr.Checkbox(label="Improve Frame (SDXL Refiner)", value=True)
+            superres_checkbox = gr.Checkbox(label="Upscale Frame (ClearRealityV1)", value=True)
         with gr.Column():
             output_video = gr.Video(label="Last Generated Clip", interactive=False)
             use_last_frame_button = gr.Button("Use Last Frame as Input Image", visible=False)
     t2v_button.click(hide_btn, outputs=[use_last_frame_button], queue=False).then(fn=generate, inputs=t2v_inputs, outputs=gen_outputs, api_name="text_to_video")
     i2v_button.click(hide_btn, outputs=[use_last_frame_button], queue=False).then(fn=generate, inputs=i2v_inputs, outputs=gen_outputs, api_name="image_to_video")
     v2v_button.click(hide_btn, outputs=[use_last_frame_button], queue=False).then(fn=generate, inputs=v2v_inputs, outputs=gen_outputs, api_name="video_to_video")
+    use_last_frame_button.click(fn=use_last_frame_as_input, inputs=[i2v_prompt,output_video,enhance_checkbox, superres_checkbox], outputs=[image_i2v, tabs])
     stitch_button.click(fn=stitch_videos, inputs=[clips_state], outputs=[final_video_output])
     clear_button.click(fn=clear_clips, outputs=[clips_state, clip_counter_display, output_video, final_video_output])
 if __name__ == "__main__":