import torch
from diffusers import LTXConditionPipeline, LTXLatentUpsamplePipeline
from diffusers.utils import export_to_video

pipe = LTXConditionPipeline.from_pretrained("Lightricks/LTX-Video-0.9.7-dev", torch_dtype=torch.bfloat16)
pipe_upsample = LTXLatentUpsamplePipeline.from_pretrained("Lightricks/ltxv-spatial-upscaler-0.9.7", vae=pipe.vae, torch_dtype=torch.bfloat16)
pipe.to("cuda")
pipe_upsample.to("cuda")
pipe.vae.enable_tiling()
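
# Optional, not part of the original example: if the pipelines do not fit in
# VRAM, diffusers also supports CPU offload in place of the .to("cuda") calls
# above, trading speed for memory:
#   pipe.enable_model_cpu_offload()
#   pipe_upsample.enable_model_cpu_offload()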

prompt = "The video depicts a winding mountain road covered in snow, with a single vehicle traveling along it. The road is flanked by steep, rocky cliffs and sparse vegetation. The landscape is characterized by rugged terrain and a river visible in the distance. The scene captures the solitude and beauty of a winter drive through a mountainous region."
negative_prompt = "worst quality, inconsistent motion, blurry, jittery, distorted"
expected_height, expected_width = 704, 512
downscale_factor = 2 / 3
num_frames = 121

# Part 1. Generate video at smaller resolution
# LTX requires height/width divisible by the VAE's spatial compression ratio
# (32), so round the downscaled size down to the nearest acceptable resolution.
def round_to_nearest_resolution_acceptable_by_vae(height, width):
    height = height - (height % pipe.vae_spatial_compression_ratio)
    width = width - (width % pipe.vae_spatial_compression_ratio)
    return height, width

downscaled_height, downscaled_width = int(expected_height * downscale_factor), int(expected_width * downscale_factor)
downscaled_height, downscaled_width = round_to_nearest_resolution_acceptable_by_vae(downscaled_height, downscaled_width)
latents = pipe(
    conditions=None,
    prompt=prompt,
    negative_prompt=negative_prompt,
    width=downscaled_width,
    height=downscaled_height,
    num_frames=num_frames,
    num_inference_steps=30,
    generator=torch.Generator().manual_seed(0),
    output_type="latent",
).frames

# Part 2. Upscale generated video using latent upsampler with fewer inference steps
# The available latent upsampler upscales the height/width by 2x
upscaled_height, upscaled_width = downscaled_height * 2, downscaled_width * 2
upscaled_latents = pipe_upsample(
    latents=latents,
    output_type="latent"
).frames

# Part 3. Denoise the upscaled video with few steps to improve texture (optional, but recommended)
video = pipe(
    prompt=prompt,
    negative_prompt=negative_prompt,
    width=upscaled_width,
    height=upscaled_height,
    num_frames=num_frames,
    denoise_strength=0.4,  # Effectively, 4 inference steps out of 10
    num_inference_steps=10,
    latents=upscaled_latents,
    decode_timestep=0.05,
    image_cond_noise_scale=0.025,
    generator=torch.Generator().manual_seed(0),
    output_type="pil",
).frames[0]

# Part 4. Downscale the video to the expected resolution
video = [frame.resize((expected_width, expected_height)) for frame in video]

export_to_video(video, "output.mp4", fps=24)
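
# ---------------------------------------------------------------------------
# Gradio demo: wraps the same three-stage recipe (low-resolution generation ->
# 2x latent upsampling -> short denoising pass) in an interactive web UI.
# ---------------------------------------------------------------------------
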
import torch
import gradio as gr
from diffusers import LTXConditionPipeline, LTXLatentUpsamplePipeline
from diffusers.utils import export_to_video

def generate_video(
    prompt,
    negative_prompt,
    expected_height,
    expected_width,
    downscale_factor,
    num_frames,
    num_inference_steps,
    denoise_strength,
    seed,
    progress=gr.Progress()
):
    # Gradio sliders pass their values as floats; cast size/frame inputs back to ints
    expected_height, expected_width = int(expected_height), int(expected_width)
    num_frames = int(num_frames)
    seed = int(seed)

    # Initialize pipelines (move this outside the function for production)
    progress(0.1, desc="Loading models...")
    pipe = LTXConditionPipeline.from_pretrained("Lightricks/LTX-Video-0.9.7-dev", torch_dtype=torch.bfloat16)
    pipe_upsample = LTXLatentUpsamplePipeline.from_pretrained("Lightricks/ltxv-spatial-upscaler-0.9.7", vae=pipe.vae, torch_dtype=torch.bfloat16)
    pipe.to("cuda")
    pipe_upsample.to("cuda")
    pipe.vae.enable_tiling()
    
    # Part 1. Generate video at smaller resolution
    progress(0.2, desc="Generating initial video...")
    downscaled_height, downscaled_width = int(expected_height * downscale_factor), int(expected_width * downscale_factor)
    # Round down to a resolution the VAE accepts (divisible by its spatial compression ratio)
    downscaled_height -= downscaled_height % pipe.vae_spatial_compression_ratio
    downscaled_width -= downscaled_width % pipe.vae_spatial_compression_ratio
    generator = torch.Generator().manual_seed(seed)
    
    latents = pipe(
        conditions=None,
        prompt=prompt,
        negative_prompt=negative_prompt,
        width=downscaled_width,
        height=downscaled_height,
        num_frames=num_frames,
        num_inference_steps=num_inference_steps,
        generator=generator,
        output_type="latent",
    ).frames
    
    # Part 2. Upscale generated video
    progress(0.5, desc="Upscaling video...")
    upscaled_height, upscaled_width = downscaled_height * 2, downscaled_width * 2
    upscaled_latents = pipe_upsample(
        latents=latents,
        output_type="latent"
    ).frames
    
    # Part 3. Denoise the upscaled video
    progress(0.7, desc="Refining video quality...")
    # Re-seed so the refinement pass is reproducible (the first pass consumed the generator),
    # matching the standalone script above
    generator = torch.Generator().manual_seed(seed)
    video = pipe(
        prompt=prompt,
        negative_prompt=negative_prompt,
        width=upscaled_width,
        height=upscaled_height,
        num_frames=num_frames,
        denoise_strength=denoise_strength,
        num_inference_steps=10,
        latents=upscaled_latents,
        decode_timestep=0.05,
        image_cond_noise_scale=0.025,
        generator=generator,
        output_type="pil",
    ).frames[0]
    
    # Part 4. Downscale the video to the expected resolution
    progress(0.9, desc="Finalizing video...")
    video = [frame.resize((expected_width, expected_height)) for frame in video]
    
    # Save and return video
    output_path = "output.mp4"
    export_to_video(video, output_path, fps=24)
    
    return output_path

# Create Gradio interface
with gr.Blocks(title="LTX Video Generator") as demo:
    gr.Markdown("# LTX Video Generator")
    gr.Markdown("Generate videos from text prompts using Lightricks' LTX model")
    
    with gr.Row():
        with gr.Column():
            prompt = gr.Textbox(
                label="Prompt",
                value="The video depicts a winding mountain road covered in snow, with a single vehicle traveling along it. The road is flanked by steep, rocky cliffs and sparse vegetation. The landscape is characterized by rugged terrain and a river visible in the distance. The scene captures the solitude and beauty of a winter drive through a mountainous region.",
                lines=4
            )
            negative_prompt = gr.Textbox(
                label="Negative Prompt",
                value="worst quality, inconsistent motion, blurry, jittery, distorted",
                lines=2
            )
            
            with gr.Row():
                expected_height = gr.Slider(
                    label="Output Height",
                    minimum=256,
                    maximum=1024,
                    step=64,
                    value=704
                )
                expected_width = gr.Slider(
                    label="Output Width",
                    minimum=256,
                    maximum=1024,
                    step=64,
                    value=512
                )
            
            with gr.Row():
                downscale_factor = gr.Slider(
                    label="Initial Downscale Factor",
                    minimum=0.3,
                    maximum=0.9,
                    step=0.05,
                    value=2/3
                )
                num_frames = gr.Slider(
                    label="Number of Frames",
                    minimum=25,
                    maximum=241,
                    step=8,  # LTX expects frame counts of the form 8k + 1 (e.g. 121)
                    value=121
                )
            
            with gr.Row():
                num_inference_steps = gr.Slider(
                    label="Inference Steps",
                    minimum=10,
                    maximum=50,
                    step=1,
                    value=30
                )
                denoise_strength = gr.Slider(
                    label="Denoise Strength",
                    minimum=0.1,
                    maximum=0.9,
                    step=0.05,
                    value=0.4
                )
                seed = gr.Number(
                    label="Seed",
                    value=0,
                    precision=0
                )
            
            submit_btn = gr.Button("Generate Video", variant="primary")
        
        with gr.Column():
            output_video = gr.Video(label="Generated Video")
    
    submit_btn.click(
        fn=generate_video,
        inputs=[
            prompt,
            negative_prompt,
            expected_height,
            expected_width,
            downscale_factor,
            num_frames,
            num_inference_steps,
            denoise_strength,
            seed
        ],
        outputs=output_video
    )

if __name__ == "__main__":
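    # gr.Progress relies on Gradio's request queue (on by default in Gradio 4+;
    # on 3.x, enable it explicitly with demo.queue() before launching).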
    demo.launch()