dagloop5 committed on
Commit
7b323d0
·
verified ·
1 Parent(s): 131276a

Delete app(old).py

Browse files
Files changed (1) hide show
  1. app(old).py +0 -858
app(old).py DELETED
@@ -1,858 +0,0 @@
1
import os
import subprocess
import sys

# Disable torch.compile / dynamo before any torch import
os.environ["TORCH_COMPILE_DISABLE"] = "1"
os.environ["TORCHDYNAMO_DISABLE"] = "1"

# Install xformers for memory-efficient attention
# check=False: a failed install is tolerated; the attention patch below falls back.
subprocess.run([sys.executable, "-m", "pip", "install", "xformers==0.0.32.post2", "--no-build-isolation"], check=False)

# Clone LTX-2 repo and install packages
LTX_REPO_URL = "https://github.com/Lightricks/LTX-2.git"
LTX_REPO_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "LTX-2")

LTX_COMMIT = "ae855f8538843825f9015a419cf4ba5edaf5eec2"  # known working commit with decode_video

# Clone once; the pinned commit is only checked out on a fresh clone, so an
# existing directory is assumed to already be at the right revision.
if not os.path.exists(LTX_REPO_DIR):
    print(f"Cloning {LTX_REPO_URL}...")
    subprocess.run(["git", "clone", LTX_REPO_URL, LTX_REPO_DIR], check=True)
    subprocess.run(["git", "checkout", LTX_COMMIT], cwd=LTX_REPO_DIR, check=True)

print("Installing ltx-core and ltx-pipelines from cloned repo...")
# Editable installs with --no-deps so the Space's pinned environment is untouched.
subprocess.run(
    [sys.executable, "-m", "pip", "install", "--force-reinstall", "--no-deps", "-e",
     os.path.join(LTX_REPO_DIR, "packages", "ltx-core"),
     "-e", os.path.join(LTX_REPO_DIR, "packages", "ltx-pipelines")],
    check=True,
)

# Make the freshly installed packages importable in THIS process (no restart).
sys.path.insert(0, os.path.join(LTX_REPO_DIR, "packages", "ltx-pipelines", "src"))
sys.path.insert(0, os.path.join(LTX_REPO_DIR, "packages", "ltx-core", "src"))
-
34
import logging
import random
import tempfile
from pathlib import Path
import gc
import hashlib

import torch
# Belt-and-braces: also disable dynamo from Python (env vars were set above).
torch._dynamo.config.suppress_errors = True
torch._dynamo.config.disable = True

import spaces
import gradio as gr
import numpy as np
from huggingface_hub import hf_hub_download, snapshot_download

from ltx_core.components.diffusion_steps import EulerDiffusionStep
from ltx_core.components.noisers import GaussianNoiser
from ltx_core.model.audio_vae import encode_audio as vae_encode_audio
from ltx_core.model.upsampler import upsample_video
from ltx_core.model.video_vae import TilingConfig, get_video_chunks_number, decode_video as vae_decode_video
from ltx_core.quantization import QuantizationPolicy
from ltx_core.types import Audio, AudioLatentShape, VideoPixelShape
from ltx_pipelines.distilled import DistilledPipeline
from ltx_pipelines.utils import euler_denoising_loop
from ltx_pipelines.utils.args import ImageConditioningInput
from ltx_pipelines.utils.constants import DISTILLED_SIGMA_VALUES, STAGE_2_DISTILLED_SIGMA_VALUES
from ltx_pipelines.utils.helpers import (
    cleanup_memory,
    combined_image_conditionings,
    denoise_video_only,
    encode_prompts,
    simple_denoising_func,
)
from ltx_pipelines.utils.media_io import decode_audio_from_file, encode_video
from ltx_core.loader.primitives import LoraPathStrengthAndSDOps
from ltx_core.loader.sd_ops import LTXV_LORA_COMFY_RENAMING_MAP

# Force-patch xformers attention into the LTX attention module.
# Rebinds the module-level symbol so all subsequent attention calls use xformers;
# on failure the original implementation is left in place.
from ltx_core.model.transformer import attention as _attn_mod
print(f"[ATTN] Before patch: memory_efficient_attention={_attn_mod.memory_efficient_attention}")
try:
    from xformers.ops import memory_efficient_attention as _mea
    _attn_mod.memory_efficient_attention = _mea
    print(f"[ATTN] After patch: memory_efficient_attention={_attn_mod.memory_efficient_attention}")
except Exception as e:
    print(f"[ATTN] xformers patch FAILED: {type(e).__name__}: {e}")

logging.getLogger().setLevel(logging.INFO)
84
# Upper bound for the seed slider / random seed draw.
MAX_SEED = np.iinfo(np.int32).max
# Example prompt shown in docs; the UI textbox ships its own default value.
DEFAULT_PROMPT = (
    "An astronaut hatches from a fragile egg on the surface of the Moon, "
    "the shell cracking and peeling apart in gentle low-gravity motion. "
    "Fine lunar dust lifts and drifts outward with each movement, floating "
    "in slow arcs before settling back onto the ground."
)
# Frame rate used for every generation (frames = duration * rate, snapped to 8n+1).
DEFAULT_FRAME_RATE = 24.0

# Resolution presets: (width, height)
RESOLUTIONS = {
    "high": {"16:9": (1536, 1024), "9:16": (1024, 1536), "1:1": (1024, 1024), "9:7": (1408, 1088), "7:9": (1088, 1408), "19:13": (1472, 1008), "13:19": (1008, 1472)},
    "low": {"16:9": (768, 512), "9:16": (512, 768), "1:1": (768, 768), "9:7": (704, 544), "7:9": (544, 704), "19:13": (736, 504), "13:19": (504, 736)},
}
-
99
-
100
class LTX23DistilledA2VPipeline(DistilledPipeline):
    """DistilledPipeline with optional audio conditioning.

    When ``audio_path`` is given, the supplied audio track is VAE-encoded and
    fed as a fixed audio latent through the two-stage denoise (only the video
    latent is denoised — see ``denoise_video_only``), so the generated video
    follows the provided sound. Without audio, behavior is identical to the
    parent ``DistilledPipeline``.
    """

    def __call__(
        self,
        prompt: str,
        seed: int,
        height: int,
        width: int,
        num_frames: int,
        frame_rate: float,
        images: list[ImageConditioningInput],
        audio_path: str | None = None,
        tiling_config: TilingConfig | None = None,
        enhance_prompt: bool = False,
    ):
        # Standard path when no audio input is provided.
        print(prompt)  # debug echo of the incoming prompt
        if audio_path is None:
            return super().__call__(
                prompt=prompt,
                seed=seed,
                height=height,
                width=width,
                num_frames=num_frames,
                frame_rate=frame_rate,
                images=images,
                tiling_config=tiling_config,
                enhance_prompt=enhance_prompt,
            )

        # Audio-conditioned path: seed-deterministic noise source.
        generator = torch.Generator(device=self.device).manual_seed(seed)
        noiser = GaussianNoiser(generator=generator)
        stepper = EulerDiffusionStep()
        dtype = torch.bfloat16

        # Encode the text prompt (optionally Gemma-enhanced, using the first
        # conditioning image as enhancement context when available).
        (ctx_p,) = encode_prompts(
            [prompt],
            self.model_ledger,
            enhance_first_prompt=enhance_prompt,
            enhance_prompt_image=images[0].path if len(images) > 0 else None,
        )
        video_context, audio_context = ctx_p.video_encoding, ctx_p.audio_encoding

        # Extract exactly the video's duration worth of audio from the file.
        video_duration = num_frames / frame_rate
        decoded_audio = decode_audio_from_file(audio_path, self.device, 0.0, video_duration)
        if decoded_audio is None:
            raise ValueError(f"Could not extract audio stream from {audio_path}")

        encoded_audio_latent = vae_encode_audio(decoded_audio, self.model_ledger.audio_encoder())
        audio_shape = AudioLatentShape.from_duration(batch=1, duration=video_duration, channels=8, mel_bins=16)
        expected_frames = audio_shape.frames
        actual_frames = encoded_audio_latent.shape[2]

        # Trim or zero-pad the audio latent along its frame axis (dim=2) so
        # its length matches what the transformer expects for this duration.
        if actual_frames > expected_frames:
            encoded_audio_latent = encoded_audio_latent[:, :, :expected_frames, :]
        elif actual_frames < expected_frames:
            pad = torch.zeros(
                encoded_audio_latent.shape[0],
                encoded_audio_latent.shape[1],
                expected_frames - actual_frames,
                encoded_audio_latent.shape[3],
                device=encoded_audio_latent.device,
                dtype=encoded_audio_latent.dtype,
            )
            encoded_audio_latent = torch.cat([encoded_audio_latent, pad], dim=2)

        video_encoder = self.model_ledger.video_encoder()
        transformer = self.model_ledger.transformer()
        stage_1_sigmas = torch.tensor(DISTILLED_SIGMA_VALUES, device=self.device)

        def denoising_loop(sigmas, video_state, audio_state, stepper):
            # Euler loop over the distilled sigma schedule; the audio latent is
            # carried along as conditioning context.
            return euler_denoising_loop(
                sigmas=sigmas,
                video_state=video_state,
                audio_state=audio_state,
                stepper=stepper,
                denoise_fn=simple_denoising_func(
                    video_context=video_context,
                    audio_context=audio_context,
                    transformer=transformer,
                ),
            )

        # Stage 1: denoise at half spatial resolution.
        stage_1_output_shape = VideoPixelShape(
            batch=1,
            frames=num_frames,
            width=width // 2,
            height=height // 2,
            fps=frame_rate,
        )
        stage_1_conditionings = combined_image_conditionings(
            images=images,
            height=stage_1_output_shape.height,
            width=stage_1_output_shape.width,
            video_encoder=video_encoder,
            dtype=dtype,
            device=self.device,
        )
        video_state = denoise_video_only(
            output_shape=stage_1_output_shape,
            conditionings=stage_1_conditionings,
            noiser=noiser,
            sigmas=stage_1_sigmas,
            stepper=stepper,
            denoising_loop_fn=denoising_loop,
            components=self.pipeline_components,
            dtype=dtype,
            device=self.device,
            initial_audio_latent=encoded_audio_latent,
        )

        torch.cuda.synchronize()
        cleanup_memory()

        # Stage 2: 2x spatial upsample of the stage-1 latent, then refine at
        # full resolution starting from the upsampled latent.
        upscaled_video_latent = upsample_video(
            latent=video_state.latent[:1],
            video_encoder=video_encoder,
            upsampler=self.model_ledger.spatial_upsampler(),
        )
        stage_2_sigmas = torch.tensor(STAGE_2_DISTILLED_SIGMA_VALUES, device=self.device)
        stage_2_output_shape = VideoPixelShape(batch=1, frames=num_frames, width=width, height=height, fps=frame_rate)
        stage_2_conditionings = combined_image_conditionings(
            images=images,
            height=stage_2_output_shape.height,
            width=stage_2_output_shape.width,
            video_encoder=video_encoder,
            dtype=dtype,
            device=self.device,
        )
        video_state = denoise_video_only(
            output_shape=stage_2_output_shape,
            conditionings=stage_2_conditionings,
            noiser=noiser,
            sigmas=stage_2_sigmas,
            stepper=stepper,
            denoising_loop_fn=denoising_loop,
            components=self.pipeline_components,
            dtype=dtype,
            device=self.device,
            noise_scale=stage_2_sigmas[0],
            initial_video_latent=upscaled_video_latent,
            initial_audio_latent=encoded_audio_latent,
        )

        torch.cuda.synchronize()
        # Free the two largest models before VAE decode to lower peak VRAM.
        del transformer
        del video_encoder
        cleanup_memory()

        decoded_video = vae_decode_video(
            video_state.latent,
            self.model_ledger.video_decoder(),
            tiling_config,
            generator,
        )
        # Return the ORIGINAL decoded input audio (not generated audio) so the
        # caller muxes the user's own track into the output video.
        original_audio = Audio(
            waveform=decoded_audio.waveform.squeeze(0),
            sampling_rate=decoded_audio.sampling_rate,
        )
        return decoded_video, original_audio
261
-
262
-
263
# Model repos
LTX_MODEL_REPO = "Lightricks/LTX-2.3"
GEMMA_REPO ="Lightricks/gemma-3-12b-it-qat-q4_0-unquantized"


# Download model checkpoints
print("=" * 80)
print("Downloading LTX-2.3 distilled model + Gemma...")
print("=" * 80)

# LoRA cache directory and currently-applied key
LORA_CACHE_DIR = Path("lora_cache")
LORA_CACHE_DIR.mkdir(exist_ok=True)
current_lora_key: str | None = None  # key of the LoRA combo currently fused into the live transformer

# Prepared-but-not-yet-applied LoRA state, populated by prepare_lora_cache()
# and consumed by apply_prepared_lora_state_to_pipeline().
PENDING_LORA_KEY: str | None = None
PENDING_LORA_STATE: dict[str, torch.Tensor] | None = None
PENDING_LORA_STATUS: str = "No LoRA state prepared yet."

weights_dir = Path("weights")
weights_dir.mkdir(exist_ok=True)
checkpoint_path = hf_hub_download(
    repo_id=LTX_MODEL_REPO,
    filename="ltx-2.3-22b-distilled.safetensors",
    local_dir=str(weights_dir),
    local_dir_use_symlinks=False,
)
spatial_upsampler_path = hf_hub_download(repo_id=LTX_MODEL_REPO, filename="ltx-2.3-spatial-upscaler-x2-1.0.safetensors")
gemma_root = snapshot_download(repo_id=GEMMA_REPO)

# ---- Insert block (LoRA downloads) between lines 268 and 269 ----
# LoRA repo + download the requested LoRA adapters
LORA_REPO = "dagloop5/LoRA"

print("=" * 80)
print("Downloading LoRA adapters from dagloop5/LoRA...")
print("=" * 80)
# Trailing comments note each adapter's trigger tokens per its upstream card.
pose_lora_path = hf_hub_download(repo_id=LORA_REPO, filename="LTX2_3_NSFW_furry_concat_v2.safetensors")
general_lora_path = hf_hub_download(repo_id=LORA_REPO, filename="LTX2.3_Reasoning_V1.safetensors")
motion_lora_path = hf_hub_download(repo_id=LORA_REPO, filename="motion_helper.safetensors")
dreamlay_lora_path = hf_hub_download(repo_id=LORA_REPO, filename="DR34ML4Y_LTXXX_PREVIEW_RC1.safetensors")
mself_lora_path = hf_hub_download(repo_id=LORA_REPO, filename="Furry Hyper Masturbation - LTX-2 I2V v1.safetensors")
dramatic_lora_path = hf_hub_download(repo_id=LORA_REPO, filename="LTX-2.3 - Orgasm.safetensors")
fluid_lora_path = hf_hub_download(repo_id="valiantcat/LTX-2.3-Transition-LORA", filename="ltx2.3-transition.safetensors")
liquid_lora_path = hf_hub_download(repo_id=LORA_REPO, filename="liquid_wet_dr1pp_ltx2_v1.0_scaled.safetensors")
demopose_lora_path = hf_hub_download(repo_id=LORA_REPO, filename="clapping-cheeks-audio-v001-alpha.safetensors")

print(f"Pose LoRA: {pose_lora_path}")
print(f"General LoRA: {general_lora_path}")
print(f"Motion LoRA: {motion_lora_path}")
print(f"Dreamlay LoRA: {dreamlay_lora_path}")
print(f"Mself LoRA: {mself_lora_path}")
print(f"Dramatic LoRA: {dramatic_lora_path}")
print(f"Fluid LoRA: {fluid_lora_path}")
print(f"Liquid LoRA: {liquid_lora_path}")
print(f"Demopose LoRA: {demopose_lora_path}")
# ----------------------------------------------------------------

print(f"Checkpoint: {checkpoint_path}")
print(f"Spatial upsampler: {spatial_upsampler_path}")
print(f"Gemma root: {gemma_root}")

# Initialize pipeline WITH text encoder and optional audio support
# ---- Replace block (pipeline init) lines 275-281 ----
pipeline = LTX23DistilledA2VPipeline(
    distilled_checkpoint_path=checkpoint_path,
    spatial_upsampler_path=spatial_upsampler_path,
    gemma_root=gemma_root,
    loras=[],
    quantization=QuantizationPolicy.fp8_cast(),  # keep FP8 quantization unchanged
)
# ----------------------------------------------------------------
-
336
def _make_lora_key(pose_strength: float, general_strength: float, motion_strength: float, dreamlay_strength: float, mself_strength: float, dramatic_strength: float, fluid_strength: float, liquid_strength: float, demopose_strength: float) -> tuple[str, str]:
    """Build a stable cache key for a LoRA strength combination.

    Each strength is rounded to 2 decimals and paired with its adapter path in
    a fixed order; the joined string is hashed so the key is filesystem-safe.

    Returns:
        (sha256 hex digest, the human-readable key string it was derived from)
    """
    # Fixed path order must match the `entries` order in prepare_lora_cache.
    adapter_paths = (
        pose_lora_path, general_lora_path, motion_lora_path, dreamlay_lora_path,
        mself_lora_path, dramatic_lora_path, fluid_lora_path, liquid_lora_path,
        demopose_lora_path,
    )
    strengths = (
        pose_strength, general_strength, motion_strength, dreamlay_strength,
        mself_strength, dramatic_strength, fluid_strength, liquid_strength,
        demopose_strength,
    )
    # Same "path:rounded" format as the original hand-unrolled version.
    key_str = "|".join(f"{p}:{round(float(s), 2)}" for p, s in zip(adapter_paths, strengths))
    key = hashlib.sha256(key_str.encode("utf-8")).hexdigest()
    return key, key_str
-
350
-
351
def prepare_lora_cache(
    pose_strength: float,
    general_strength: float,
    motion_strength: float,
    dreamlay_strength: float,
    mself_strength: float,
    dramatic_strength: float,
    fluid_strength: float,
    liquid_strength: float,
    demopose_strength: float,
    progress=gr.Progress(track_tqdm=True),
):
    """
    CPU-only step:
      - checks cache
      - loads cached fused transformer state_dict, or
      - builds fused transformer on CPU and saves it
    The resulting state_dict is stored in memory (PENDING_LORA_*) and can be
    applied later by apply_prepared_lora_state_to_pipeline().

    Returns a human-readable status string displayed in the UI.
    """
    global PENDING_LORA_KEY, PENDING_LORA_STATE, PENDING_LORA_STATUS

    ledger = pipeline.model_ledger
    key, _ = _make_lora_key(pose_strength, general_strength, motion_strength, dreamlay_strength, mself_strength, dramatic_strength, fluid_strength, liquid_strength, demopose_strength)
    cache_path = LORA_CACHE_DIR / f"{key}.pt"

    progress(0.05, desc="Preparing LoRA state")
    # Fast path: a fused state for this exact strength combination is on disk.
    if cache_path.exists():
        try:
            progress(0.20, desc="Loading cached fused state")
            state = torch.load(cache_path, map_location="cpu")
            PENDING_LORA_KEY = key
            PENDING_LORA_STATE = state
            PENDING_LORA_STATUS = f"Loaded cached LoRA state: {cache_path.name}"
            return PENDING_LORA_STATUS
        except Exception as e:
            # Corrupt/incompatible cache file: log and fall through to rebuild.
            print(f"[LoRA] Cache load failed: {type(e).__name__}: {e}")

    # (path, strength) pairs in the same fixed order as _make_lora_key.
    entries = [
        (pose_lora_path, round(float(pose_strength), 2)),
        (general_lora_path, round(float(general_strength), 2)),
        (motion_lora_path, round(float(motion_strength), 2)),
        (dreamlay_lora_path, round(float(dreamlay_strength), 2)),
        (mself_lora_path, round(float(mself_strength), 2)),
        (dramatic_lora_path, round(float(dramatic_strength), 2)),
        (fluid_lora_path, round(float(fluid_strength), 2)),
        (liquid_lora_path, round(float(liquid_strength), 2)),
        (demopose_lora_path, round(float(demopose_strength), 2)),
    ]
    # Adapters with zero strength are skipped entirely.
    loras_for_builder = [
        LoraPathStrengthAndSDOps(path, strength, LTXV_LORA_COMFY_RENAMING_MAP)
        for path, strength in entries
        if path is not None and float(strength) != 0.0
    ]

    if not loras_for_builder:
        PENDING_LORA_KEY = None
        PENDING_LORA_STATE = None
        PENDING_LORA_STATUS = "No non-zero LoRA strengths selected; nothing to prepare."
        return PENDING_LORA_STATUS

    tmp_ledger = None
    new_transformer_cpu = None
    try:
        progress(0.35, desc="Building fused CPU transformer")
        # Build a throwaway ledger on CPU whose loader fuses the selected
        # LoRAs into the transformer weights at load time.
        tmp_ledger = pipeline.model_ledger.__class__(
            dtype=ledger.dtype,
            device=torch.device("cpu"),
            checkpoint_path=str(checkpoint_path),
            spatial_upsampler_path=str(spatial_upsampler_path),
            gemma_root_path=str(gemma_root),
            loras=tuple(loras_for_builder),
            quantization=getattr(ledger, "quantization", None),
        )
        new_transformer_cpu = tmp_ledger.transformer()

        progress(0.70, desc="Extracting fused state_dict")
        state = new_transformer_cpu.state_dict()
        torch.save(state, cache_path)

        PENDING_LORA_KEY = key
        PENDING_LORA_STATE = state
        PENDING_LORA_STATUS = f"Built and cached LoRA state: {cache_path.name}"
        return PENDING_LORA_STATUS

    except Exception as e:
        import traceback
        print(f"[LoRA] Prepare failed: {type(e).__name__}: {e}")
        print(traceback.format_exc())
        PENDING_LORA_KEY = None
        PENDING_LORA_STATE = None
        PENDING_LORA_STATUS = f"LoRA prepare failed: {type(e).__name__}: {e}"
        return PENDING_LORA_STATUS

    finally:
        # Drop the CPU-side copies promptly to keep host RAM headroom.
        try:
            del new_transformer_cpu
        except Exception:
            pass
        try:
            del tmp_ledger
        except Exception:
            pass
        gc.collect()
-
455
-
456
def apply_prepared_lora_state_to_pipeline():
    """
    Fast step: copy the already prepared CPU state into the live transformer.
    This is the only part that should remain near generation time.

    Returns:
        True when a prepared LoRA state is active afterwards (either just
        applied or already matching), False when nothing was available.
    """
    global current_lora_key, PENDING_LORA_KEY, PENDING_LORA_STATE

    if PENDING_LORA_STATE is None or PENDING_LORA_KEY is None:
        print("[LoRA] No prepared LoRA state available; skipping.")
        return False

    # This exact combination is already fused into the live weights.
    if current_lora_key == PENDING_LORA_KEY:
        print("[LoRA] Prepared LoRA state already active; skipping.")
        return True

    # _transformer is the module-level preloaded transformer instance.
    existing_transformer = _transformer
    existing_params = {name: param for name, param in existing_transformer.named_parameters()}
    existing_buffers = {name: buf for name, buf in existing_transformer.named_buffers()}

    # In-place copy_ keeps the existing (GPU) allocations; each tensor is
    # moved to its destination's device first. Keys not present in the live
    # model are silently skipped.
    with torch.no_grad():
        for k, v in PENDING_LORA_STATE.items():
            if k in existing_params:
                existing_params[k].data.copy_(v.to(existing_params[k].device))
            elif k in existing_buffers:
                existing_buffers[k].data.copy_(v.to(existing_buffers[k].device))

    current_lora_key = PENDING_LORA_KEY
    print("[LoRA] Prepared LoRA state applied to the pipeline.")
    return True
-
486
# ---- REPLACE PRELOAD BLOCK START ----
# Preload all models for ZeroGPU tensor packing.
print("Preloading all models (including Gemma and audio components)...")
ledger = pipeline.model_ledger

# Save the original factory methods so we can rebuild individual components later.
# These are bound callables on ledger that will call the builder when invoked.
_orig_transformer_factory = ledger.transformer
_orig_video_encoder_factory = ledger.video_encoder
_orig_video_decoder_factory = ledger.video_decoder
_orig_audio_encoder_factory = ledger.audio_encoder
_orig_audio_decoder_factory = ledger.audio_decoder
_orig_vocoder_factory = ledger.vocoder
_orig_spatial_upsampler_factory = ledger.spatial_upsampler
_orig_text_encoder_factory = ledger.text_encoder
_orig_gemma_embeddings_factory = ledger.gemma_embeddings_processor

# Call the original factories once to create the cached instances we will serve by default.
_transformer = _orig_transformer_factory()
_video_encoder = _orig_video_encoder_factory()
_video_decoder = _orig_video_decoder_factory()
_audio_encoder = _orig_audio_encoder_factory()
_audio_decoder = _orig_audio_decoder_factory()
_vocoder = _orig_vocoder_factory()
_spatial_upsampler = _orig_spatial_upsampler_factory()
_text_encoder = _orig_text_encoder_factory()
_embeddings_processor = _orig_gemma_embeddings_factory()

# Replace ledger methods with lightweight lambdas that return the cached instances.
# We keep the original factories above so we can call them later to rebuild components.
ledger.transformer = lambda: _transformer
ledger.video_encoder = lambda: _video_encoder
ledger.video_decoder = lambda: _video_decoder
ledger.audio_encoder = lambda: _audio_encoder
ledger.audio_decoder = lambda: _audio_decoder
ledger.vocoder = lambda: _vocoder
ledger.spatial_upsampler = lambda: _spatial_upsampler
ledger.text_encoder = lambda: _text_encoder
ledger.gemma_embeddings_processor = lambda: _embeddings_processor

print("All models preloaded (including Gemma text encoder and audio encoder)!")
# ---- REPLACE PRELOAD BLOCK END ----

print("=" * 80)
print("Pipeline ready!")
print("=" * 80)
-
533
-
534
def log_memory(tag: str):
    """Print a one-line CUDA memory snapshot labelled with *tag*.

    No-op when CUDA is unavailable. Values are reported in GiB.
    """
    if not torch.cuda.is_available():
        return
    gib = 1024**3
    allocated = torch.cuda.memory_allocated() / gib
    peak = torch.cuda.max_memory_allocated() / gib
    free, total = torch.cuda.mem_get_info()
    print(f"[VRAM {tag}] allocated={allocated:.2f}GB peak={peak:.2f}GB free={free / 1024**3:.2f}GB total={total / 1024**3:.2f}GB")
-
541
-
542
def detect_aspect_ratio(image) -> str:
    """Return the RESOLUTIONS aspect-ratio key closest to *image*'s w/h ratio.

    Accepts PIL images (``.size`` -> (w, h)) or arrays (``.shape`` -> (h, w, ...)).
    Falls back to "16:9" when no image is given, dimensions cannot be read,
    or a dimension is zero.
    """
    if image is None:
        return "16:9"
    if hasattr(image, "size"):
        w, h = image.size          # PIL convention: (width, height)
    elif hasattr(image, "shape"):
        h, w = image.shape[:2]     # array convention: (height, width, ...)
    else:
        return "16:9"
    # Guard against degenerate dimensions (would otherwise divide by zero).
    if not w or not h:
        return "16:9"
    ratio = w / h
    # Compare against EVERY preset defined in RESOLUTIONS. The original only
    # listed 16:9 / 9:16 / 1:1, which made the 9:7, 7:9, 19:13 and 13:19
    # presets unreachable from image uploads.
    candidates = {
        "16:9": 16 / 9, "9:16": 9 / 16, "1:1": 1.0,
        "9:7": 9 / 7, "7:9": 7 / 9, "19:13": 19 / 13, "13:19": 13 / 19,
    }
    return min(candidates, key=lambda k: abs(ratio - candidates[k]))
-
555
-
556
def on_image_upload(first_image, last_image, high_res):
    """Update the width/height inputs to the preset matching the uploaded image.

    Prefers the first-frame image as the reference; falls back to the last-frame
    image. Returns gr.update values for (width, height).
    """
    reference = last_image if first_image is None else first_image
    tier = "high" if high_res else "low"
    new_width, new_height = RESOLUTIONS[tier][detect_aspect_ratio(reference)]
    return gr.update(value=new_width), gr.update(value=new_height)
-
563
-
564
def on_highres_toggle(first_image, last_image, high_res):
    """Recompute the width/height presets when the High Resolution toggle flips.

    The logic is identical to on_image_upload (same inputs, same lookup), so
    delegate to it instead of keeping a byte-for-byte duplicate in sync.
    Returns gr.update values for (width, height).
    """
    return on_image_upload(first_image, last_image, high_res)
-
571
-
572
def get_gpu_duration(
    first_image,
    last_image,
    input_audio,
    prompt: str,
    duration: float,
    gpu_duration: float,
    enhance_prompt: bool = True,
    seed: int = 42,
    randomize_seed: bool = True,
    height: int = 1024,
    width: int = 1536,
    pose_strength: float = 0.0,
    general_strength: float = 0.0,
    motion_strength: float = 0.0,
    dreamlay_strength: float = 0.0,
    mself_strength: float = 0.0,
    dramatic_strength: float = 0.0,
    fluid_strength: float = 0.0,
    liquid_strength: float = 0.0,
    demopose_strength: float = 0.0,
    progress=None,
):
    """ZeroGPU duration resolver for the @spaces.GPU decorator.

    Mirrors generate_video's signature (the decorator passes the same args),
    but only the user-chosen gpu_duration slider value matters: it is
    truncated to whole seconds and returned.
    """
    requested_seconds = int(gpu_duration)
    return requested_seconds
-
597
@spaces.GPU(duration=get_gpu_duration)
@torch.inference_mode()
def generate_video(
    first_image,
    last_image,
    input_audio,
    prompt: str,
    duration: float,
    gpu_duration: float,
    enhance_prompt: bool = True,
    seed: int = 42,
    randomize_seed: bool = True,
    height: int = 1024,
    width: int = 1536,
    pose_strength: float = 0.0,
    general_strength: float = 0.0,
    motion_strength: float = 0.0,
    dreamlay_strength: float = 0.0,
    mself_strength: float = 0.0,
    dramatic_strength: float = 0.0,
    fluid_strength: float = 0.0,
    liquid_strength: float = 0.0,
    demopose_strength: float = 0.0,
    progress=gr.Progress(track_tqdm=True),
):
    """Run the full generation pipeline and encode the result to an mp4.

    Returns:
        (path_to_mp4 | None, seed_used) — None on failure; the error is logged.
    """
    # BUGFIX: resolve the seed BEFORE the try-block. The except handler below
    # references current_seed; previously any exception raised before the
    # assignment inside the try body caused a NameError that masked the
    # original error.
    current_seed = random.randint(0, MAX_SEED) if randomize_seed else int(seed)
    try:
        torch.cuda.reset_peak_memory_stats()
        log_memory("start")

        frame_rate = DEFAULT_FRAME_RATE
        # Snap the frame count to the model's 8n+1 requirement, rounding up.
        num_frames = int(duration * frame_rate) + 1
        num_frames = ((num_frames - 1 + 7) // 8) * 8 + 1

        print(f"Generating: {height}x{width}, {num_frames} frames ({duration}s), seed={current_seed}")

        images = []
        output_dir = Path("outputs")
        output_dir.mkdir(exist_ok=True)

        # First/last frame conditioning: PIL images are saved to disk first,
        # path-like inputs are used as-is.
        if first_image is not None:
            temp_first_path = output_dir / f"temp_first_{current_seed}.jpg"
            if hasattr(first_image, "save"):
                first_image.save(temp_first_path)
            else:
                temp_first_path = Path(first_image)
            images.append(ImageConditioningInput(path=str(temp_first_path), frame_idx=0, strength=1.0))

        if last_image is not None:
            temp_last_path = output_dir / f"temp_last_{current_seed}.jpg"
            if hasattr(last_image, "save"):
                last_image.save(temp_last_path)
            else:
                temp_last_path = Path(last_image)
            images.append(ImageConditioningInput(path=str(temp_last_path), frame_idx=num_frames - 1, strength=1.0))

        tiling_config = TilingConfig.default()
        video_chunks_number = get_video_chunks_number(num_frames, tiling_config)

        log_memory("before pipeline call")

        # Copy any previously prepared LoRA weights into the live transformer.
        apply_prepared_lora_state_to_pipeline()

        video, audio = pipeline(
            prompt=prompt,
            seed=current_seed,
            height=int(height),
            width=int(width),
            num_frames=num_frames,
            frame_rate=frame_rate,
            images=images,
            audio_path=input_audio,
            tiling_config=tiling_config,
            enhance_prompt=enhance_prompt,
        )

        log_memory("after pipeline call")

        # NOTE(review): tempfile.mktemp is deprecated/race-prone; kept because
        # the encoder expects a not-yet-existing target path — consider
        # mkstemp + close if encode_video tolerates an existing file.
        output_path = tempfile.mktemp(suffix=".mp4")
        encode_video(
            video=video,
            fps=frame_rate,
            audio=audio,
            output_path=output_path,
            video_chunks_number=video_chunks_number,
        )

        log_memory("after encode_video")
        return str(output_path), current_seed

    except Exception as e:
        import traceback
        log_memory("on error")
        print(f"Error: {str(e)}\n{traceback.format_exc()}")
        return None, current_seed
-
694
-
695
# Gradio UI: three-column layout — inputs, LoRA strengths, output/video.
with gr.Blocks(title="LTX-2.3 Distilled") as demo:
    gr.Markdown("# LTX-2.3 F2LF with Fast Audio-Video Generation with Frame Conditioning")

    with gr.Row():
        # Column 1: conditioning inputs and generation controls.
        with gr.Column():
            with gr.Row():
                first_image = gr.Image(label="First Frame (Optional)", type="pil")
                last_image = gr.Image(label="Last Frame (Optional)", type="pil")
            input_audio = gr.Audio(label="Audio Input (Optional)", type="filepath")
            prompt = gr.Textbox(
                label="Prompt",
                info="for best results - make it as elaborate as possible",
                value="Make this image come alive with cinematic motion, smooth animation",
                lines=3,
                placeholder="Describe the motion and animation you want...",
            )
            duration = gr.Slider(label="Duration (seconds)", minimum=1.0, maximum=30.0, value=10.0, step=0.1)

            generate_btn = gr.Button("Generate Video", variant="primary", size="lg")

            with gr.Accordion("Advanced Settings", open=False):
                seed = gr.Slider(label="Seed", minimum=0, maximum=MAX_SEED, value=10, step=1)
                randomize_seed = gr.Checkbox(label="Randomize Seed", value=True)
                with gr.Row():
                    width = gr.Number(label="Width", value=1536, precision=0)
                    height = gr.Number(label="Height", value=1024, precision=0)
                with gr.Row():
                    enhance_prompt = gr.Checkbox(label="Enhance Prompt", value=False)
                    high_res = gr.Checkbox(label="High Resolution", value=True)

        # Column 2: per-adapter LoRA strengths (0 disables an adapter).
        with gr.Column():
            gr.Markdown("### LoRA adapter strengths (set to 0 to disable)")
            pose_strength = gr.Slider(
                label="Anthro Enhancer strength",
                minimum=0.0, maximum=2.0, value=0.0, step=0.01
            )
            general_strength = gr.Slider(
                label="Reasoning Enhancer strength",
                minimum=0.0, maximum=2.0, value=0.0, step=0.01
            )
            motion_strength = gr.Slider(
                label="Anthro Posing Helper strength",
                minimum=0.0, maximum=2.0, value=0.0, step=0.01
            )
            dreamlay_strength = gr.Slider(
                label="Dreamlay strength",
                minimum=0.0, maximum=2.0, value=0.0, step=0.01
            )
            mself_strength = gr.Slider(
                label="Mself strength",
                minimum=0.0, maximum=2.0, value=0.0, step=0.01
            )
            dramatic_strength = gr.Slider(
                label="Dramatic strength",
                minimum=0.0, maximum=2.0, value=0.0, step=0.01
            )
            fluid_strength = gr.Slider(
                label="Fluid Helper strength",
                minimum=0.0, maximum=2.0, value=0.0, step=0.01
            )
            liquid_strength = gr.Slider(
                label="Liquid Helper strength",
                minimum=0.0, maximum=2.0, value=0.0, step=0.01
            )
            demopose_strength = gr.Slider(
                label="Demopose Helper strength",
                minimum=0.0, maximum=2.0, value=0.0, step=0.01
            )
            prepare_lora_btn = gr.Button("Prepare / Load LoRA Cache", variant="secondary")
            lora_status = gr.Textbox(
                label="LoRA Cache Status",
                value="No LoRA state prepared yet.",
                interactive=False,
            )

        # Column 3: result video and ZeroGPU time budget.
        with gr.Column():
            output_video = gr.Video(label="Generated Video", autoplay=False)
            gpu_duration = gr.Slider(
                label="ZeroGPU duration (seconds)",
                minimum=40.0,
                maximum=240.0,
                value=85.0,
                step=1.0,
            )

    # One canned example; values are positional and must match `inputs` below.
    gr.Examples(
        examples=[
            [
                None,
                "pinkknit.jpg",
                None,
                "The camera falls downward through darkness as if dropped into a tunnel. "
                "As it slows, five friends wearing pink knitted hats and sunglasses lean "
                "over and look down toward the camera with curious expressions. The lens "
                "has a strong fisheye effect, creating a circular frame around them. They "
                "crowd together closely, forming a symmetrical cluster while staring "
                "directly into the lens.",
                3.0,
                80.0,
                False,
                42,
                True,
                1024,
                1024,
                0.0,  # pose_strength (example)
                0.0,  # general_strength (example)
                0.0,  # motion_strength (example)
                0.0,
                0.0,
                0.0,
                0.0,
                0.0,
                0.0,
            ],
        ],
        inputs=[
            first_image, last_image, input_audio, prompt, duration, gpu_duration,
            enhance_prompt, seed, randomize_seed, height, width,
            pose_strength, general_strength, motion_strength, dreamlay_strength, mself_strength, dramatic_strength, fluid_strength, liquid_strength, demopose_strength,
        ],
    )

    # Auto-pick a resolution preset whenever a reference image changes.
    first_image.change(
        fn=on_image_upload,
        inputs=[first_image, last_image, high_res],
        outputs=[width, height],
    )

    last_image.change(
        fn=on_image_upload,
        inputs=[first_image, last_image, high_res],
        outputs=[width, height],
    )

    # ...and when the high/low resolution tier is toggled.
    high_res.change(
        fn=on_highres_toggle,
        inputs=[first_image, last_image, high_res],
        outputs=[width, height],
    )

    # CPU-side LoRA fuse/cache; applied lazily at generation time.
    prepare_lora_btn.click(
        fn=prepare_lora_cache,
        inputs=[pose_strength, general_strength, motion_strength, dreamlay_strength, mself_strength, dramatic_strength, fluid_strength, liquid_strength, demopose_strength],
        outputs=[lora_status],
    )

    generate_btn.click(
        fn=generate_video,
        inputs=[
            first_image, last_image, input_audio, prompt, duration, gpu_duration, enhance_prompt,
            seed, randomize_seed, height, width,
            pose_strength, general_strength, motion_strength, dreamlay_strength, mself_strength, dramatic_strength, fluid_strength, liquid_strength, demopose_strength,
        ],
        outputs=[output_video, seed],
    )
-
852
-
853
- css = """
854
- .fillable{max-width: 1200px !important}
855
- """
856
-
857
- if __name__ == "__main__":
858
- demo.launch(theme=gr.themes.Citrus(), css=css)