jiuhai commited on
Commit
a3c20e1
·
verified ·
1 Parent(s): 4ab332d

Add files using upload-large-folder tool

Browse files
Files changed (50) hide show
  1. packages/ltx-core/pyproject.toml +55 -0
  2. packages/ltx-core/src/ltx_core/model/transformer/__pycache__/__init__.cpython-312.pyc +0 -0
  3. packages/ltx-core/src/ltx_core/model/transformer/__pycache__/attention.cpython-312.pyc +0 -0
  4. packages/ltx-core/src/ltx_core/model/transformer/__pycache__/feed_forward.cpython-312.pyc +0 -0
  5. packages/ltx-core/src/ltx_core/model/transformer/__pycache__/gelu_approx.cpython-312.pyc +0 -0
  6. packages/ltx-core/src/ltx_core/model/transformer/__pycache__/modality.cpython-312.pyc +0 -0
  7. packages/ltx-core/src/ltx_core/model/transformer/__pycache__/model.cpython-312.pyc +0 -0
  8. packages/ltx-core/src/ltx_core/model/transformer/__pycache__/model_configurator.cpython-312.pyc +0 -0
  9. packages/ltx-core/src/ltx_core/model/transformer/__pycache__/rope.cpython-312.pyc +0 -0
  10. packages/ltx-core/src/ltx_core/model/transformer/__pycache__/text_projection.cpython-312.pyc +0 -0
  11. packages/ltx-core/src/ltx_core/model/transformer/__pycache__/timestep_embedding.cpython-312.pyc +0 -0
  12. packages/ltx-core/src/ltx_core/model/transformer/__pycache__/transformer.cpython-312.pyc +0 -0
  13. packages/ltx-core/src/ltx_core/text_encoders/gemma/encoders/base_encoder.py +202 -0
  14. packages/ltx-trainer/configs/accelerate/ddp.yaml +16 -0
  15. packages/ltx-trainer/configs/accelerate/ddp_compile.yaml +21 -0
  16. packages/ltx-trainer/configs/accelerate/fsdp.yaml +29 -0
  17. packages/ltx-trainer/configs/accelerate/fsdp_compile.yaml +34 -0
  18. packages/ltx-trainer/configs/ltx2_av_lora.yaml +313 -0
  19. packages/ltx-trainer/configs/ltx2_av_lora_low_vram.yaml +325 -0
  20. packages/ltx-trainer/configs/ltx2_v2v_ic_lora.yaml +329 -0
  21. packages/ltx-trainer/docs/configuration-reference.md +372 -0
  22. packages/ltx-trainer/docs/custom-training-strategies.md +510 -0
  23. packages/ltx-trainer/docs/dataset-preparation.md +342 -0
  24. packages/ltx-trainer/docs/quick-start.md +130 -0
  25. packages/ltx-trainer/docs/training-guide.md +203 -0
  26. packages/ltx-trainer/docs/training-modes.md +277 -0
  27. packages/ltx-trainer/docs/troubleshooting.md +300 -0
  28. packages/ltx-trainer/docs/utility-scripts.md +274 -0
  29. packages/ltx-trainer/scripts/caption_videos.py +486 -0
  30. packages/ltx-trainer/scripts/compute_reference.py +288 -0
  31. packages/ltx-trainer/scripts/decode_latents.py +369 -0
  32. packages/ltx-trainer/scripts/process_captions.py +435 -0
  33. packages/ltx-trainer/scripts/process_dataset.py +317 -0
  34. packages/ltx-trainer/scripts/process_videos.py +1039 -0
  35. packages/ltx-trainer/scripts/split_scenes.py +417 -0
  36. packages/ltx-trainer/scripts/train.py +64 -0
  37. packages/ltx-trainer/src/ltx_trainer/__pycache__/__init__.cpython-312.pyc +0 -0
  38. packages/ltx-trainer/src/ltx_trainer/__pycache__/model_loader.cpython-312.pyc +0 -0
  39. packages/ltx-trainer/src/ltx_trainer/captioning.py +401 -0
  40. packages/ltx-trainer/src/ltx_trainer/gemma_8bit.py +85 -0
  41. packages/ltx-trainer/src/ltx_trainer/gpu_utils.py +90 -0
  42. packages/ltx-trainer/src/ltx_trainer/progress.py +236 -0
  43. packages/ltx-trainer/src/ltx_trainer/quantization.py +195 -0
  44. packages/ltx-trainer/src/ltx_trainer/trainer.py +1000 -0
  45. packages/ltx-trainer/src/ltx_trainer/training_strategies/__init__.py +58 -0
  46. packages/ltx-trainer/src/ltx_trainer/training_strategies/base_strategy.py +262 -0
  47. packages/ltx-trainer/src/ltx_trainer/training_strategies/text_to_video.py +291 -0
  48. packages/ltx-trainer/src/ltx_trainer/training_strategies/video_to_video.py +303 -0
  49. packages/ltx-trainer/src/ltx_trainer/utils.py +88 -0
  50. packages/ltx-trainer/templates/model_card.md +59 -0
packages/ltx-core/pyproject.toml ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [project]
2
+ name = "ltx-core"
3
+ version = "1.0.0"
4
+ description = "Core implementation of Lightricks' LTX-2 model"
5
+ readme = "README.md"
6
+ requires-python = ">=3.10"
7
+ dependencies = [
8
+ "torch~=2.7",
9
+ "torchaudio",
10
+ "einops",
11
+ "numpy",
12
+ "transformers>=4.52",
13
+ "safetensors",
14
+ "accelerate",
15
+ "scipy>=1.14",
16
+ ]
17
+
18
+ [project.optional-dependencies]
19
+ xformers = ["xformers"]
20
+ fp8-trtllm = [
21
+ "tensorrt-llm==1.0.0",
22
+ "onnx>=1.16.0,<1.20.0",
23
+ "openmpi",
24
+ ]
25
+
26
+ [tool.uv]
27
+ conflicts = [
28
+ [
29
+ { extra = "xformers" },
30
+ { extra = "fp8-trtllm" },
31
+ ],
32
+ ]
33
+
34
+ [tool.uv.sources]
35
+ xformers = { index = "pytorch" }
36
+ tensorrt-llm = { index = "nvidia" }
37
+
38
+ [[tool.uv.index]]
39
+ name = "pytorch"
40
+ url = "https://download.pytorch.org/whl/cu129"
41
+ explicit = true
42
+
43
+ [[tool.uv.index]]
44
+ name = "nvidia"
45
+ url = "https://pypi.nvidia.com/"
46
+ explicit = true
47
+
48
+ [build-system]
49
+ requires = ["uv_build>=0.9.8,<0.10.0"]
50
+ build-backend = "uv_build"
51
+
52
+ [dependency-groups]
53
+ dev = [
54
+ "scikit-image>=0.25.2",
55
+ ]
packages/ltx-core/src/ltx_core/model/transformer/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (624 Bytes). View file
 
packages/ltx-core/src/ltx_core/model/transformer/__pycache__/attention.cpython-312.pyc ADDED
Binary file (13.3 kB). View file
 
packages/ltx-core/src/ltx_core/model/transformer/__pycache__/feed_forward.cpython-312.pyc ADDED
Binary file (1.52 kB). View file
 
packages/ltx-core/src/ltx_core/model/transformer/__pycache__/gelu_approx.cpython-312.pyc ADDED
Binary file (1.28 kB). View file
 
packages/ltx-core/src/ltx_core/model/transformer/__pycache__/modality.cpython-312.pyc ADDED
Binary file (2.29 kB). View file
 
packages/ltx-core/src/ltx_core/model/transformer/__pycache__/model.cpython-312.pyc ADDED
Binary file (20.6 kB). View file
 
packages/ltx-core/src/ltx_core/model/transformer/__pycache__/model_configurator.cpython-312.pyc ADDED
Binary file (9.1 kB). View file
 
packages/ltx-core/src/ltx_core/model/transformer/__pycache__/rope.cpython-312.pyc ADDED
Binary file (10.6 kB). View file
 
packages/ltx-core/src/ltx_core/model/transformer/__pycache__/text_projection.cpython-312.pyc ADDED
Binary file (2.75 kB). View file
 
packages/ltx-core/src/ltx_core/model/transformer/__pycache__/timestep_embedding.cpython-312.pyc ADDED
Binary file (7.2 kB). View file
 
packages/ltx-core/src/ltx_core/model/transformer/__pycache__/transformer.cpython-312.pyc ADDED
Binary file (18 kB). View file
 
packages/ltx-core/src/ltx_core/text_encoders/gemma/encoders/base_encoder.py ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import functools
2
+ from pathlib import Path
3
+
4
+ import torch
5
+ from transformers import AutoImageProcessor, Gemma3ForConditionalGeneration, Gemma3Processor
6
+
7
+ from ltx_core.loader.module_ops import ModuleOps
8
+ from ltx_core.text_encoders.gemma.tokenizer import LTXVGemmaTokenizer
9
+ from ltx_core.utils import find_matching_file
10
+
11
+
12
+ class GemmaTextEncoder(torch.nn.Module):
13
+ """Pure Gemma text encoder — runs the LLM and returns raw hidden states.
14
+ Prompt enhancement (generate) is also supported since the full
15
+ Gemma3ForConditionalGeneration model (including lm_head) is loaded.
16
+ """
17
+
18
+ def __init__(
19
+ self,
20
+ model: Gemma3ForConditionalGeneration | None = None,
21
+ tokenizer: LTXVGemmaTokenizer | None = None,
22
+ processor: Gemma3Processor | None = None,
23
+ dtype: torch.dtype = torch.bfloat16,
24
+ ):
25
+ super().__init__()
26
+ self.model = model
27
+ self.tokenizer = tokenizer
28
+ self.processor = processor
29
+ self._dtype = dtype
30
+
31
+ def encode(
32
+ self,
33
+ text: str,
34
+ padding_side: str = "left", # noqa: ARG002
35
+ ) -> tuple[tuple[torch.Tensor, ...], torch.Tensor]:
36
+ """Run Gemma LLM and return raw hidden states + attention mask.
37
+ Calls the inner model (self.model.model) to skip lm_head logits computation (~500 MiB saving).
38
+ Returns:
39
+ (hidden_states, attention_mask) where hidden_states is a tuple of per-layer tensors.
40
+ """
41
+ token_pairs = self.tokenizer.tokenize_with_weights(text)["gemma"]
42
+ input_ids = torch.tensor([[t[0] for t in token_pairs]], device=self.model.device)
43
+ attention_mask = torch.tensor([[w[1] for w in token_pairs]], device=self.model.device)
44
+ outputs = self.model.model(input_ids=input_ids, attention_mask=attention_mask, output_hidden_states=True)
45
+ hidden_states = outputs.hidden_states
46
+ del outputs
47
+ return hidden_states, attention_mask
48
+
49
+ # --- Prompt enhancement methods ---
50
+
51
+ def _enhance(
52
+ self,
53
+ messages: list[dict[str, str]],
54
+ image: torch.Tensor | None = None,
55
+ max_new_tokens: int = 512,
56
+ seed: int = 10,
57
+ ) -> str:
58
+ text = self.processor.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
59
+
60
+ model_inputs = self.processor(
61
+ text=text,
62
+ images=image,
63
+ return_tensors="pt",
64
+ ).to(self.model.device)
65
+ pad_token_id = self.processor.tokenizer.pad_token_id if self.processor.tokenizer.pad_token_id is not None else 0
66
+ model_inputs = _pad_inputs_for_attention_alignment(model_inputs, pad_token_id=pad_token_id)
67
+
68
+ with torch.inference_mode(), torch.random.fork_rng(devices=[self.model.device]):
69
+ torch.manual_seed(seed)
70
+ outputs = self.model.generate(
71
+ **model_inputs,
72
+ max_new_tokens=max_new_tokens,
73
+ do_sample=True,
74
+ temperature=0.7,
75
+ )
76
+ generated_ids = outputs[0][len(model_inputs.input_ids[0]) :]
77
+ enhanced_prompt = self.processor.tokenizer.decode(generated_ids, skip_special_tokens=True)
78
+
79
+ return enhanced_prompt
80
+
81
+ def enhance_t2v(
82
+ self,
83
+ prompt: str,
84
+ max_new_tokens: int = 512,
85
+ system_prompt: str | None = None,
86
+ seed: int = 10,
87
+ ) -> str:
88
+ """Enhance a text prompt for T2V generation."""
89
+ system_prompt = system_prompt or self.default_gemma_t2v_system_prompt
90
+
91
+ messages = [
92
+ {"role": "system", "content": system_prompt},
93
+ {"role": "user", "content": f"user prompt: {prompt}"},
94
+ ]
95
+
96
+ return self._enhance(messages, max_new_tokens=max_new_tokens, seed=seed)
97
+
98
+ def enhance_i2v(
99
+ self,
100
+ prompt: str,
101
+ image: torch.Tensor,
102
+ max_new_tokens: int = 512,
103
+ system_prompt: str | None = None,
104
+ seed: int = 10,
105
+ ) -> str:
106
+ """Enhance a text prompt for I2V generation using a reference image."""
107
+ system_prompt = system_prompt or self.default_gemma_i2v_system_prompt
108
+ messages = [
109
+ {"role": "system", "content": system_prompt},
110
+ {
111
+ "role": "user",
112
+ "content": [
113
+ {"type": "image"},
114
+ {"type": "text", "text": f"User Raw Input Prompt: {prompt}."},
115
+ ],
116
+ },
117
+ ]
118
+ return self._enhance(messages, image=image, max_new_tokens=max_new_tokens, seed=seed)
119
+
120
+ @functools.cached_property
121
+ def default_gemma_i2v_system_prompt(self) -> str:
122
+ return _load_system_prompt("gemma_i2v_system_prompt.txt")
123
+
124
+ @functools.cached_property
125
+ def default_gemma_t2v_system_prompt(self) -> str:
126
+ return _load_system_prompt("gemma_t2v_system_prompt.txt")
127
+
128
+
129
+ # --- Standalone utility functions ---
130
+
131
+
132
+ @functools.lru_cache(maxsize=2)
133
+ def _load_system_prompt(prompt_name: str) -> str:
134
+ with open(Path(__file__).parent / "prompts" / f"{prompt_name}", "r") as f:
135
+ return f.read()
136
+
137
+
138
+ def _cat_with_padding(
139
+ tensor: torch.Tensor,
140
+ padding_length: int,
141
+ value: int | float,
142
+ ) -> torch.Tensor:
143
+ """Concatenate a tensor with a padding tensor of the given value."""
144
+ return torch.cat(
145
+ [
146
+ tensor,
147
+ torch.full(
148
+ (1, padding_length),
149
+ value,
150
+ dtype=tensor.dtype,
151
+ device=tensor.device,
152
+ ),
153
+ ],
154
+ dim=1,
155
+ )
156
+
157
+
158
+ def _pad_inputs_for_attention_alignment(
159
+ model_inputs: dict[str, torch.Tensor],
160
+ pad_token_id: int = 0,
161
+ alignment: int = 8,
162
+ ) -> dict[str, torch.Tensor]:
163
+ """Pad sequence length to multiple of alignment for Flash Attention compatibility."""
164
+ seq_len = model_inputs.input_ids.shape[1]
165
+ padded_len = ((seq_len + alignment - 1) // alignment) * alignment
166
+ padding_length = padded_len - seq_len
167
+
168
+ if padding_length > 0:
169
+ model_inputs["input_ids"] = _cat_with_padding(model_inputs.input_ids, padding_length, pad_token_id)
170
+ model_inputs["attention_mask"] = _cat_with_padding(model_inputs.attention_mask, padding_length, 0)
171
+ if "token_type_ids" in model_inputs and model_inputs["token_type_ids"] is not None:
172
+ model_inputs["token_type_ids"] = _cat_with_padding(model_inputs["token_type_ids"], padding_length, 0)
173
+
174
+ return model_inputs
175
+
176
+
177
+ def module_ops_from_gemma_root(gemma_root: str) -> tuple[ModuleOps, ...]:
178
+ tokenizer_root = str(find_matching_file(gemma_root, "tokenizer.model").parent)
179
+ processor_root = str(find_matching_file(gemma_root, "preprocessor_config.json").parent)
180
+
181
+ def load_tokenizer(module: GemmaTextEncoder) -> GemmaTextEncoder:
182
+ module.tokenizer = LTXVGemmaTokenizer(tokenizer_root, 1024)
183
+ return module
184
+
185
+ def load_processor(module: GemmaTextEncoder) -> GemmaTextEncoder:
186
+ image_processor = AutoImageProcessor.from_pretrained(processor_root, local_files_only=True)
187
+ if not module.tokenizer:
188
+ raise ValueError("Tokenizer model operation must be performed before processor model operation")
189
+ module.processor = Gemma3Processor(image_processor=image_processor, tokenizer=module.tokenizer.tokenizer)
190
+ return module
191
+
192
+ tokenizer_load_ops = ModuleOps(
193
+ "TokenizerLoad",
194
+ matcher=lambda module: isinstance(module, GemmaTextEncoder) and module.tokenizer is None,
195
+ mutator=load_tokenizer,
196
+ )
197
+ processor_load_ops = ModuleOps(
198
+ "ProcessorLoad",
199
+ matcher=lambda module: isinstance(module, GemmaTextEncoder) and module.processor is None,
200
+ mutator=load_processor,
201
+ )
202
+ return (tokenizer_load_ops, processor_load_ops)
packages/ltx-trainer/configs/accelerate/ddp.yaml ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ compute_environment: LOCAL_MACHINE
2
+ debug: false
3
+ distributed_type: MULTI_GPU
4
+ downcast_bf16: 'no'
5
+ enable_cpu_affinity: false
6
+ machine_rank: 0
7
+ main_training_function: main
8
+ mixed_precision: bf16
9
+ num_machines: 1
10
+ num_processes: 4
11
+ rdzv_backend: static
12
+ same_network: true
13
+ tpu_env: []
14
+ tpu_use_cluster: false
15
+ tpu_use_sudo: false
16
+ use_cpu: false
packages/ltx-trainer/configs/accelerate/ddp_compile.yaml ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ compute_environment: LOCAL_MACHINE
2
+ dynamo_config:
3
+ dynamo_backend: INDUCTOR
4
+ dynamo_mode: default
5
+ dynamo_use_fullgraph: false
6
+ dynamo_use_dynamic: true
7
+ debug: false
8
+ distributed_type: MULTI_GPU
9
+ downcast_bf16: 'no'
10
+ enable_cpu_affinity: false
11
+ machine_rank: 0
12
+ main_training_function: main
13
+ mixed_precision: bf16
14
+ num_machines: 1
15
+ num_processes: 4
16
+ rdzv_backend: static
17
+ same_network: true
18
+ tpu_env: [ ]
19
+ tpu_use_cluster: false
20
+ tpu_use_sudo: false
21
+ use_cpu: false
packages/ltx-trainer/configs/accelerate/fsdp.yaml ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ compute_environment: LOCAL_MACHINE
2
+ debug: false
3
+ distributed_type: FSDP
4
+ downcast_bf16: 'no'
5
+ enable_cpu_affinity: false
6
+ fsdp_config:
7
+ fsdp_activation_checkpointing: false
8
+ fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
9
+ fsdp_backward_prefetch: BACKWARD_PRE
10
+ fsdp_cpu_ram_efficient_loading: true
11
+ fsdp_forward_prefetch: false
12
+ fsdp_offload_params: false
13
+ fsdp_reshard_after_forward: FULL_SHARD
14
+ fsdp_state_dict_type: SHARDED_STATE_DICT
15
+ fsdp_sync_module_states: true
16
+ fsdp_transformer_layer_cls_to_wrap: BasicAVTransformerBlock
17
+ fsdp_use_orig_params: true
18
+ fsdp_version: 1
19
+ machine_rank: 0
20
+ main_training_function: main
21
+ mixed_precision: bf16
22
+ num_machines: 1
23
+ num_processes: 4
24
+ rdzv_backend: static
25
+ same_network: true
26
+ tpu_env: []
27
+ tpu_use_cluster: false
28
+ tpu_use_sudo: false
29
+ use_cpu: false
packages/ltx-trainer/configs/accelerate/fsdp_compile.yaml ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ compute_environment: LOCAL_MACHINE
2
+ debug: false
3
+ distributed_type: FSDP
4
+ downcast_bf16: 'no'
5
+ dynamo_config:
6
+ dynamo_backend: INDUCTOR
7
+ dynamo_mode: default
8
+ dynamo_use_fullgraph: false
9
+ dynamo_use_dynamic: true
10
+ enable_cpu_affinity: false
11
+ fsdp_config:
12
+ fsdp_activation_checkpointing: false
13
+ fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
14
+ fsdp_backward_prefetch: BACKWARD_PRE
15
+ fsdp_cpu_ram_efficient_loading: true
16
+ fsdp_forward_prefetch: false
17
+ fsdp_offload_params: false
18
+ fsdp_reshard_after_forward: FULL_SHARD
19
+ fsdp_state_dict_type: SHARDED_STATE_DICT
20
+ fsdp_sync_module_states: true
21
+ fsdp_transformer_layer_cls_to_wrap: BasicAVTransformerBlock
22
+ fsdp_use_orig_params: true
23
+ fsdp_version: 1
24
+ machine_rank: 0
25
+ main_training_function: main
26
+ mixed_precision: bf16
27
+ num_machines: 1
28
+ num_processes: 4
29
+ rdzv_backend: static
30
+ same_network: true
31
+ tpu_env: []
32
+ tpu_use_cluster: false
33
+ tpu_use_sudo: false
34
+ use_cpu: false
packages/ltx-trainer/configs/ltx2_av_lora.yaml ADDED
@@ -0,0 +1,313 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # =============================================================================
2
+ # LTX-2 Audio-Video LoRA Training Configuration
3
+ # =============================================================================
4
+ #
5
+ # This configuration is for training LoRA adapters on the LTX-2 model for
6
+ # text-to-video generation. It supports both video-only and joint audio-video
7
+ # training modes.
8
+ #
9
+ # Use this configuration when you want to:
10
+ # - Fine-tune LTX-2 on your own video dataset
11
+ # - Train with or without audio generation
12
+ # - Create custom video generation styles or audiovisual concepts
13
+ #
14
+ # Dataset structure for text-to-video training:
15
+ # preprocessed_data_root/
16
+ # ├── latents/ # Video latents (VAE-encoded videos)
17
+ # ├── conditions/ # Text embeddings for each video
18
+ # └── audio_latents/ # Audio latents (only if with_audio: true)
19
+ #
20
+ # =============================================================================
21
+
22
+ # -----------------------------------------------------------------------------
23
+ # Model Configuration
24
+ # -----------------------------------------------------------------------------
25
+ # Specifies the base model to fine-tune and the training mode.
26
+ model:
27
+ # Path to the LTX-2 model checkpoint (.safetensors file)
28
+ # This should be a local path to your downloaded model
29
+ model_path: "path/to/ltx-2-model.safetensors"
30
+
31
+ # Path to the text encoder model directory
32
+ # For LTX-2, this is typically the Gemma-based text encoder
33
+ text_encoder_path: "path/to/gemma-text-encoder"
34
+
35
+ # Training mode: "lora" for efficient adapter training, "full" for full fine-tuning
36
+ # LoRA is recommended for most use cases (faster, less memory, prevents overfitting)
37
+ training_mode: "lora"
38
+
39
+ # Optional: Path to resume training from a checkpoint
40
+ # Can be a checkpoint file (.safetensors) or directory (uses latest checkpoint)
41
+ load_checkpoint: null
42
+
43
+ # -----------------------------------------------------------------------------
44
+ # LoRA Configuration
45
+ # -----------------------------------------------------------------------------
46
+ # Controls the Low-Rank Adaptation parameters for efficient fine-tuning.
47
+ lora:
48
+ # Rank of the LoRA matrices (higher = more capacity but more parameters)
49
+ # Typical values: 8, 16, 32, 64. Start with 32 for general fine-tuning.
50
+ rank: 32
51
+
52
+ # Alpha scaling factor (usually set equal to rank)
53
+ # The effective scaling is alpha/rank, so alpha=rank means scaling of 1.0
54
+ alpha: 32
55
+
56
+ # Dropout probability for LoRA layers (0.0 = no dropout)
57
+ # Can help with regularization if overfitting occurs
58
+ dropout: 0.0
59
+
60
+ # Which transformer modules to apply LoRA to
61
+ # The LTX-2 transformer has separate attention and FFN blocks for video and audio:
62
+ #
63
+ # VIDEO MODULES:
64
+ # - attn1.to_k, attn1.to_q, attn1.to_v, attn1.to_out.0 (video self-attention)
65
+ # - attn2.to_k, attn2.to_q, attn2.to_v, attn2.to_out.0 (video cross-attention to text)
66
+ # - ff.net.0.proj, ff.net.2 (video feed-forward)
67
+ #
68
+ # AUDIO MODULES:
69
+ # - audio_attn1.to_k, audio_attn1.to_q, audio_attn1.to_v, audio_attn1.to_out.0 (audio self-attention)
70
+ # - audio_attn2.to_k, audio_attn2.to_q, audio_attn2.to_v, audio_attn2.to_out.0 (audio cross-attention to text)
71
+ # - audio_ff.net.0.proj, audio_ff.net.2 (audio feed-forward)
72
+ #
73
+ # AUDIO-VIDEO CROSS-ATTENTION MODULES (for cross-modal interaction):
74
+ # - audio_to_video_attn.to_k, audio_to_video_attn.to_q, audio_to_video_attn.to_v, audio_to_video_attn.to_out.0
75
+ # (Q from video, K/V from audio - allows video to attend to audio features)
76
+ # - video_to_audio_attn.to_k, video_to_audio_attn.to_q, video_to_audio_attn.to_v, video_to_audio_attn.to_out.0
77
+ # (Q from audio, K/V from video - allows audio to attend to video features)
78
+ #
79
+ # Using short patterns like "to_k" matches ALL attention modules (video, audio, and cross-modal).
80
+ # For audio-video training, this is the recommended approach.
81
+ target_modules:
82
+ # Attention layers (matches both video and audio branches)
83
+ - "to_k"
84
+ - "to_q"
85
+ - "to_v"
86
+ - "to_out.0"
87
+ # Uncomment below to also train feed-forward layers (can increase the LoRA's capacity):
88
+ # - "ff.net.0.proj"
89
+ # - "ff.net.2"
90
+ # - "audio_ff.net.0.proj"
91
+ # - "audio_ff.net.2"
92
+
93
+ # -----------------------------------------------------------------------------
94
+ # Training Strategy Configuration
95
+ # -----------------------------------------------------------------------------
96
+ # Defines the text-to-video training approach.
97
+ training_strategy:
98
+ # Strategy name: "text_to_video" for standard text-to-video training
99
+ name: "text_to_video"
100
+
101
+ # Probability of conditioning on the first frame during training
102
+ # Higher values train the model to perform better in image-to-video (I2V) mode,
103
+ # where a clean first frame is provided and the model generates the rest of the video
104
+ # Increase this value to train the model to perform better in image-to-video (I2V) mode
105
+ first_frame_conditioning_p: 0.5
106
+
107
+ # Enable joint audio-video training
108
+ # Set to true if your dataset includes audio and you want to train the audio branch
109
+ with_audio: true
110
+
111
+ # Directory name (within preprocessed_data_root) containing audio latents
112
+ # Only used when with_audio is true
113
+ audio_latents_dir: "audio_latents"
114
+
115
+ # -----------------------------------------------------------------------------
116
+ # Optimization Configuration
117
+ # -----------------------------------------------------------------------------
118
+ # Controls the training optimization parameters.
119
+ optimization:
120
+ # Learning rate for the optimizer
121
+ # Typical range for LoRA: 1e-5 to 1e-4
122
+ learning_rate: 1e-4
123
+
124
+ # Total number of training steps
125
+ steps: 2000
126
+
127
+ # Batch size per GPU
128
+ # Reduce if running out of memory
129
+ batch_size: 1
130
+
131
+ # Number of gradient accumulation steps
132
+ # Effective batch size = batch_size * gradient_accumulation_steps * num_gpus
133
+ gradient_accumulation_steps: 1
134
+
135
+ # Maximum gradient norm for clipping (helps training stability)
136
+ max_grad_norm: 1.0
137
+
138
+ # Optimizer type: "adamw" (standard) or "adamw8bit" (memory-efficient)
139
+ optimizer_type: "adamw"
140
+
141
+ # Learning rate scheduler type
142
+ # Options: "constant", "linear", "cosine", "cosine_with_restarts", "polynomial"
143
+ scheduler_type: "linear"
144
+
145
+ # Additional scheduler parameters (depends on scheduler_type)
146
+ scheduler_params: { }
147
+
148
+ # Enable gradient checkpointing to reduce memory usage
149
+ # Recommended for training with limited GPU memory
150
+ enable_gradient_checkpointing: true
151
+
152
+ # -----------------------------------------------------------------------------
153
+ # Acceleration Configuration
154
+ # -----------------------------------------------------------------------------
155
+ # Hardware acceleration and memory optimization settings.
156
+ acceleration:
157
+ # Mixed precision training mode
158
+ # Options: "no" (fp32), "fp16" (half precision), "bf16" (bfloat16, recommended)
159
+ mixed_precision_mode: "bf16"
160
+
161
+ # Model quantization for reduced memory usage
162
+ # Options: null (none), "int8-quanto", "int4-quanto", "int2-quanto", "fp8-quanto", "fp8uz-quanto"
163
+ quantization: null
164
+
165
+ # Load text encoder in 8-bit precision to save memory
166
+ # Useful when GPU memory is limited
167
+ load_text_encoder_in_8bit: false
168
+
169
+ # -----------------------------------------------------------------------------
170
+ # Data Configuration
171
+ # -----------------------------------------------------------------------------
172
+ # Specifies the training data location and loading parameters.
173
+ data:
174
+ # Root directory containing preprocessed training data
175
+ # Should contain: latents/, conditions/, and optionally audio_latents/
176
+ preprocessed_data_root: "/path/to/preprocessed/data"
177
+
178
+ # Number of worker processes for data loading
179
+ # Used for parallel data loading to speed up data loading
180
+ num_dataloader_workers: 2
181
+
182
+ # -----------------------------------------------------------------------------
183
+ # Validation Configuration
184
+ # -----------------------------------------------------------------------------
185
+ # Controls validation video generation during training.
186
+ # NOTE: Validation sampling use simplified inference pipelines and prioritizes speed over
187
+ # maximum quality. For production-quality inference, use `packages/ltx-pipelines`.
188
+ validation:
189
+ # Text prompts for validation video generation
190
+ # Provide prompts representative of your training data
191
+ # LTX-2 prefers longer, detailed prompts that describe both visual content and audio
192
+ prompts:
193
+ - "A woman with long brown hair sits at a wooden desk in a cozy home office, typing on a laptop while occasionally glancing at notes beside her. Soft natural light streams through a large window, casting warm shadows across the room. She pauses to take a sip from a ceramic mug, then continues working with focused concentration. The audio captures the gentle clicking of keyboard keys, the soft rustle of papers, and ambient room tone with occasional distant bird chirps from outside."
194
+ - "A chef in a white uniform stands in a professional kitchen, carefully plating a gourmet dish with precise movements. Steam rises from freshly cooked vegetables as he arranges them with tweezers. The stainless steel surfaces gleam under bright overhead lights, and various pots simmer on the stove behind him. The audio features the sizzling of pans, the clinking of utensils against plates, and the ambient hum of kitchen ventilation."
195
+
196
+ # Negative prompt to avoid unwanted artifacts
197
+ negative_prompt: "worst quality, inconsistent motion, blurry, jittery, distorted"
198
+
199
+ # Optional: First frame images for image-to-video validation
200
+ # If provided, must have one image per prompt
201
+ images: null
202
+
203
+ # Output video dimensions [width, height, frames]
204
+ # Width and height must be divisible by 32
205
+ # Frames must satisfy: frames % 8 == 1 (e.g., 1, 9, 17, 25, 33, 41, 49, 57, 65, 73, 81, 89, ...)
206
+ video_dims: [ 576, 576, 89 ]
207
+
208
+ # Frame rate for generated videos
209
+ frame_rate: 25.0
210
+
211
+ # Random seed for reproducible validation outputs
212
+ seed: 42
213
+
214
+ # Number of denoising steps for validation inference
215
+ # Higher values = better quality but slower generation
216
+ inference_steps: 30
217
+
218
+ # Generate validation videos every N training steps
219
+ # Set to null to disable validation during training
220
+ interval: 100
221
+
222
+ # Number of videos to generate per prompt
223
+ videos_per_prompt: 1
224
+
225
+ # Classifier-free guidance scale
226
+ # Higher values = stronger adherence to prompt but may introduce artifacts
227
+ guidance_scale: 4.0
228
+
229
+ # STG (Spatio-Temporal Guidance) parameters for improved video quality
230
+ # STG is combined with CFG for better temporal coherence
231
+ stg_scale: 1.0 # Recommended: 1.0 (0.0 disables STG)
232
+ stg_blocks: [29] # Recommended: single block 29
233
+ stg_mode: "stg_av" # "stg_av" perturbs both audio and video, "stg_v" video only
234
+
235
+ # Whether to generate audio in validation samples
236
+ # Independent of training_strategy.with_audio - you can generate audio
237
+ # in validation even when not training the audio branch
238
+ generate_audio: true
239
+
240
+ # Skip validation at the beginning of training (step 0)
241
+ skip_initial_validation: false
242
+
243
+ # -----------------------------------------------------------------------------
244
+ # Checkpoint Configuration
245
+ # -----------------------------------------------------------------------------
246
+ # Controls model checkpoint saving during training.
247
+ checkpoints:
248
+ # Save a checkpoint every N steps
249
+ # Set to null to disable intermediate checkpoints
250
+ interval: 250
251
+
252
+ # Number of most recent checkpoints to keep
253
+ # Set to -1 to keep all checkpoints
254
+ keep_last_n: -1
255
+
256
+ # Precision to use when saving checkpoint weights
257
+ # Options: "bfloat16" (default, smaller files) or "float32" (full precision)
258
+ precision: "bfloat16"
259
+
260
+ # -----------------------------------------------------------------------------
261
+ # Flow Matching Configuration
262
+ # -----------------------------------------------------------------------------
263
+ # Parameters for the flow matching training objective.
264
+ flow_matching:
265
+ # Timestep sampling mode
266
+ # "shifted_logit_normal" is recommended for LTX-2 models
267
+ timestep_sampling_mode: "shifted_logit_normal"
268
+
269
+ # Additional parameters for timestep sampling
270
+ timestep_sampling_params: { }
271
+
272
+ # -----------------------------------------------------------------------------
273
+ # Hugging Face Hub Configuration
274
+ # -----------------------------------------------------------------------------
275
+ # Settings for uploading trained models to the Hugging Face Hub.
276
+ hub:
277
+ # Whether to push the trained model to the Hub
278
+ push_to_hub: false
279
+
280
+ # Repository ID on Hugging Face Hub (e.g., "username/my-lora-model")
281
+ # Required if push_to_hub is true
282
+ hub_model_id: null
283
+
284
+ # -----------------------------------------------------------------------------
285
+ # Weights & Biases Configuration
286
+ # -----------------------------------------------------------------------------
287
+ # Settings for experiment tracking with W&B.
288
+ wandb:
289
+ # Enable W&B logging
290
+ enabled: false
291
+
292
+ # W&B project name
293
+ project: "ltx-2-trainer"
294
+
295
+ # W&B username or team (null uses default account)
296
+ entity: null
297
+
298
+ # Tags to help organize runs
299
+ tags: [ "ltx2", "lora" ]
300
+
301
+ # Log validation videos to W&B
302
+ log_validation_videos: true
303
+
304
+ # -----------------------------------------------------------------------------
305
+ # General Configuration
306
+ # -----------------------------------------------------------------------------
307
+ # Global settings for the training run.
308
+
309
+ # Random seed for reproducibility
310
+ seed: 42
311
+
312
+ # Directory to save outputs (checkpoints, validation videos, logs)
313
+ output_dir: "outputs/ltx2_av_lora"
packages/ltx-trainer/configs/ltx2_av_lora_low_vram.yaml ADDED
@@ -0,0 +1,325 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # =============================================================================
2
+ # LTX-2 Audio-Video LoRA Training Configuration (Low VRAM)
3
+ # =============================================================================
4
+ #
5
+ # This is a memory-optimized variant of the standard audio-video LoRA config.
6
+ # It uses 8-bit optimizer, int8 quantization, and reduced LoRA rank to minimize
7
+ # GPU memory usage while maintaining good training quality.
8
+ #
9
+ # Memory optimizations applied:
10
+ # - 8-bit AdamW optimizer (reduces optimizer state memory by ~75%)
11
+ # - INT8 model quantization (reduces model memory by ~50%)
12
+ # - Lower LoRA rank (16 vs 32, reduces trainable parameters)
13
+ # - Gradient checkpointing enabled
14
+ #
15
+ # Recommended for GPUs with 32GB VRAM (e.g., RTX 5090).
16
+ #
17
+ # Use this configuration when you want to:
18
+ # - Fine-tune LTX-2 on your own video dataset
19
+ # - Train with or without audio generation
20
+ # - Create custom video generation styles or audiovisual concepts
21
+ #
22
+ # Dataset structure for text-to-video training:
23
+ # preprocessed_data_root/
24
+ # ├── latents/ # Video latents (VAE-encoded videos)
25
+ # ├── conditions/ # Text embeddings for each video
26
+ # └── audio_latents/ # Audio latents (only if with_audio: true)
27
+ #
28
+ # =============================================================================
29
+
30
+ # -----------------------------------------------------------------------------
31
+ # Model Configuration
32
+ # -----------------------------------------------------------------------------
33
+ # Specifies the base model to fine-tune and the training mode.
34
+ model:
35
+ # Path to the LTX-2 model checkpoint (.safetensors file)
36
+ # This should be a local path to your downloaded model
37
+ model_path: "path/to/ltx-2-model.safetensors"
38
+
39
+ # Path to the text encoder model directory
40
+ # For LTX-2, this is typically the Gemma-based text encoder
41
+ text_encoder_path: "path/to/gemma-text-encoder"
42
+
43
+ # Training mode: "lora" for efficient adapter training, "full" for full fine-tuning
44
+ # LoRA is recommended for most use cases (faster, less memory, prevents overfitting)
45
+ training_mode: "lora"
46
+
47
+ # Optional: Path to resume training from a checkpoint
48
+ # Can be a checkpoint file (.safetensors) or directory (uses latest checkpoint)
49
+ load_checkpoint: null
50
+
51
+ # -----------------------------------------------------------------------------
52
+ # LoRA Configuration
53
+ # -----------------------------------------------------------------------------
54
+ # Controls the Low-Rank Adaptation parameters for efficient fine-tuning.
55
+ # Using a lower rank (16) to reduce trainable parameters and memory usage.
56
+ # This still provides good capacity for many fine-tuning tasks.
57
+ lora:
58
+ # Rank of the LoRA matrices (higher = more capacity but more parameters)
59
+ # Typical values: 8, 16, 32, 64. Using 16 for low VRAM configuration.
60
+ rank: 16
61
+
62
+ # Alpha scaling factor (usually set equal to rank)
63
+ # The effective scaling is alpha/rank, so alpha=rank means scaling of 1.0
64
+ alpha: 16
65
+
66
+ # Dropout probability for LoRA layers (0.0 = no dropout)
67
+ # Can help with regularization if overfitting occurs
68
+ dropout: 0.0
69
+
70
+ # Which transformer modules to apply LoRA to
71
+ # The LTX-2 transformer has separate attention and FFN blocks for video and audio:
72
+ #
73
+ # VIDEO MODULES:
74
+ # - attn1.to_k, attn1.to_q, attn1.to_v, attn1.to_out.0 (video self-attention)
75
+ # - attn2.to_k, attn2.to_q, attn2.to_v, attn2.to_out.0 (video cross-attention to text)
76
+ # - ff.net.0.proj, ff.net.2 (video feed-forward)
77
+ #
78
+ # AUDIO MODULES:
79
+ # - audio_attn1.to_k, audio_attn1.to_q, audio_attn1.to_v, audio_attn1.to_out.0 (audio self-attention)
80
+ # - audio_attn2.to_k, audio_attn2.to_q, audio_attn2.to_v, audio_attn2.to_out.0 (audio cross-attention to text)
81
+ # - audio_ff.net.0.proj, audio_ff.net.2 (audio feed-forward)
82
+ #
83
+ # AUDIO-VIDEO CROSS-ATTENTION MODULES (for cross-modal interaction):
84
+ # - audio_to_video_attn.to_k, audio_to_video_attn.to_q, audio_to_video_attn.to_v, audio_to_video_attn.to_out.0
85
+ # (Q from video, K/V from audio - allows video to attend to audio features)
86
+ # - video_to_audio_attn.to_k, video_to_audio_attn.to_q, video_to_audio_attn.to_v, video_to_audio_attn.to_out.0
87
+ # (Q from audio, K/V from video - allows audio to attend to video features)
88
+ #
89
+ # Using short patterns like "to_k" matches ALL attention modules (video, audio, and cross-modal).
90
+ # For audio-video training, this is the recommended approach.
91
+ target_modules:
92
+ # Attention layers (matches both video and audio branches)
93
+ - "to_k"
94
+ - "to_q"
95
+ - "to_v"
96
+ - "to_out.0"
97
+ # Uncomment below to also train feed-forward layers (can increase the LoRA's capacity):
98
+ # - "ff.net.0.proj"
99
+ # - "ff.net.2"
100
+ # - "audio_ff.net.0.proj"
101
+ # - "audio_ff.net.2"
102
+
103
+ # -----------------------------------------------------------------------------
104
+ # Training Strategy Configuration
105
+ # -----------------------------------------------------------------------------
106
+ # Defines the text-to-video training approach.
107
+ training_strategy:
108
+ # Strategy name: "text_to_video" for standard text-to-video training
109
+ name: "text_to_video"
110
+
111
+ # Probability of conditioning on the first frame during training
112
+ # Higher values train the model to perform better in image-to-video (I2V) mode,
113
+ # where a clean first frame is provided and the model generates the rest of the video
114
+ # Increase this value to train the model to perform better in image-to-video (I2V) mode
115
+ first_frame_conditioning_p: 0.5
116
+
117
+ # Enable joint audio-video training
118
+ # Set to true if your dataset includes audio and you want to train the audio branch
119
+ with_audio: true
120
+
121
+ # Directory name (within preprocessed_data_root) containing audio latents
122
+ # Only used when with_audio is true
123
+ audio_latents_dir: "audio_latents"
124
+
125
+ # -----------------------------------------------------------------------------
126
+ # Optimization Configuration
127
+ # -----------------------------------------------------------------------------
128
+ # Controls the training optimization parameters.
129
+ optimization:
130
+ # Learning rate for the optimizer
131
+ # Typical range for LoRA: 1e-5 to 1e-4
132
+ learning_rate: 1e-4
133
+
134
+ # Total number of training steps
135
+ steps: 2000
136
+
137
+ # Batch size per GPU
138
+ # Reduce if running out of memory
139
+ batch_size: 1
140
+
141
+ # Number of gradient accumulation steps
142
+ # Effective batch size = batch_size * gradient_accumulation_steps * num_gpus
143
+ gradient_accumulation_steps: 1
144
+
145
+ # Maximum gradient norm for clipping (helps training stability)
146
+ max_grad_norm: 1.0
147
+
148
+ # Optimizer type: "adamw" (standard) or "adamw8bit" (memory-efficient)
149
+ # Using 8-bit AdamW to reduce optimizer state memory by ~75%
150
+ optimizer_type: "adamw8bit"
151
+
152
+ # Learning rate scheduler type
153
+ # Options: "constant", "linear", "cosine", "cosine_with_restarts", "polynomial"
154
+ scheduler_type: "linear"
155
+
156
+ # Additional scheduler parameters (depends on scheduler_type)
157
+ scheduler_params: { }
158
+
159
+ # Enable gradient checkpointing to reduce memory usage
160
+ # Recommended for training with limited GPU memory
161
+ enable_gradient_checkpointing: true
162
+
163
+ # -----------------------------------------------------------------------------
164
+ # Acceleration Configuration
165
+ # -----------------------------------------------------------------------------
166
+ # Hardware acceleration and memory optimization settings.
167
+ acceleration:
168
+ # Mixed precision training mode
169
+ # Options: "no" (fp32), "fp16" (half precision), "bf16" (bfloat16, recommended)
170
+ mixed_precision_mode: "bf16"
171
+
172
+ # Model quantization for reduced memory usage
173
+ # Options: null (none), "int8-quanto", "int4-quanto", "int2-quanto", "fp8-quanto", "fp8uz-quanto"
174
+ # Using INT8 quantization to reduce base model memory consumption by ~50%
175
+ quantization: "int8-quanto"
176
+
177
+ # Load text encoder in 8-bit precision to save memory
178
+ # Useful when GPU memory is limited
179
+ load_text_encoder_in_8bit: true
180
+
181
+ # -----------------------------------------------------------------------------
182
+ # Data Configuration
183
+ # -----------------------------------------------------------------------------
184
+ # Specifies the training data location and loading parameters.
185
+ data:
186
+ # Root directory containing preprocessed training data
187
+ # Should contain: latents/, conditions/, and optionally audio_latents/
188
+ preprocessed_data_root: "/path/to/preprocessed/data"
189
+
190
+ # Number of worker processes for data loading
191
+ # Used for parallel data loading to speed up data loading
192
+ num_dataloader_workers: 2
193
+
194
+ # -----------------------------------------------------------------------------
195
+ # Validation Configuration
196
+ # -----------------------------------------------------------------------------
197
+ # Controls validation video generation during training.
198
+ # NOTE: Validation sampling use simplified inference pipelines and prioritizes speed over
199
+ # maximum quality. For production-quality inference, use `packages/ltx-pipelines`.
200
+ validation:
201
+ # Text prompts for validation video generation
202
+ # Provide prompts representative of your training data
203
+ # LTX-2 prefers longer, detailed prompts that describe both visual content and audio
204
+ prompts:
205
+ - "A woman with long brown hair sits at a wooden desk in a cozy home office, typing on a laptop while occasionally glancing at notes beside her. Soft natural light streams through a large window, casting warm shadows across the room. She pauses to take a sip from a ceramic mug, then continues working with focused concentration. The audio captures the gentle clicking of keyboard keys, the soft rustle of papers, and ambient room tone with occasional distant bird chirps from outside."
206
+ - "A chef in a white uniform stands in a professional kitchen, carefully plating a gourmet dish with precise movements. Steam rises from freshly cooked vegetables as he arranges them with tweezers. The stainless steel surfaces gleam under bright overhead lights, and various pots simmer on the stove behind him. The audio features the sizzling of pans, the clinking of utensils against plates, and the ambient hum of kitchen ventilation."
207
+
208
+ # Negative prompt to avoid unwanted artifacts
209
+ negative_prompt: "worst quality, inconsistent motion, blurry, jittery, distorted"
210
+
211
+ # Optional: First frame images for image-to-video validation
212
+ # If provided, must have one image per prompt
213
+ images: null
214
+
215
+ # Output video dimensions [width, height, frames]
216
+ # Width and height must be divisible by 32
217
+ # Frames must satisfy: frames % 8 == 1 (e.g., 1, 9, 17, 25, 33, 41, 49, 57, 65, 73, 81, 89, ...)
218
+ video_dims: [ 576, 576, 49 ]
219
+
220
+ # Frame rate for generated videos
221
+ frame_rate: 25.0
222
+
223
+ # Random seed for reproducible validation outputs
224
+ seed: 42
225
+
226
+ # Number of denoising steps for validation inference
227
+ # Higher values = better quality but slower generation
228
+ inference_steps: 30
229
+
230
+ # Generate validation videos every N training steps
231
+ # Set to null to disable validation during training
232
+ interval: 100
233
+
234
+ # Number of videos to generate per prompt
235
+ videos_per_prompt: 1
236
+
237
+ # Classifier-free guidance scale
238
+ # Higher values = stronger adherence to prompt but may introduce artifacts
239
+ guidance_scale: 4.0
240
+
241
+ # STG (Spatio-Temporal Guidance) parameters for improved video quality
242
+ # STG is combined with CFG for better temporal coherence
243
+ stg_scale: 1.0 # Recommended: 1.0 (0.0 disables STG)
244
+ stg_blocks: [ 29 ] # Recommended: single block 29
245
+ stg_mode: "stg_av" # "stg_av" perturbs both audio and video, "stg_v" video only
246
+
247
+ # Whether to generate audio in validation samples
248
+ # Independent of training_strategy.with_audio - you can generate audio
249
+ # in validation even when not training the audio branch
250
+ generate_audio: true
251
+
252
+ # Skip validation at the beginning of training (step 0)
253
+ skip_initial_validation: false
254
+
255
+ # -----------------------------------------------------------------------------
256
+ # Checkpoint Configuration
257
+ # -----------------------------------------------------------------------------
258
+ # Controls model checkpoint saving during training.
259
+ checkpoints:
260
+ # Save a checkpoint every N steps
261
+ # Set to null to disable intermediate checkpoints
262
+ interval: 250
263
+
264
+ # Number of most recent checkpoints to keep
265
+ # Set to -1 to keep all checkpoints
266
+ keep_last_n: -1
267
+
268
+ # Precision to use when saving checkpoint weights
269
+ # Options: "bfloat16" (default, smaller files) or "float32" (full precision)
270
+ precision: "bfloat16"
271
+
272
+ # -----------------------------------------------------------------------------
273
+ # Flow Matching Configuration
274
+ # -----------------------------------------------------------------------------
275
+ # Parameters for the flow matching training objective.
276
+ flow_matching:
277
+ # Timestep sampling mode
278
+ # "shifted_logit_normal" is recommended for LTX-2 models
279
+ timestep_sampling_mode: "shifted_logit_normal"
280
+
281
+ # Additional parameters for timestep sampling
282
+ timestep_sampling_params: { }
283
+
284
+ # -----------------------------------------------------------------------------
285
+ # Hugging Face Hub Configuration
286
+ # -----------------------------------------------------------------------------
287
+ # Settings for uploading trained models to the Hugging Face Hub.
288
+ hub:
289
+ # Whether to push the trained model to the Hub
290
+ push_to_hub: false
291
+
292
+ # Repository ID on Hugging Face Hub (e.g., "username/my-lora-model")
293
+ # Required if push_to_hub is true
294
+ hub_model_id: null
295
+
296
+ # -----------------------------------------------------------------------------
297
+ # Weights & Biases Configuration
298
+ # -----------------------------------------------------------------------------
299
+ # Settings for experiment tracking with W&B.
300
+ wandb:
301
+ # Enable W&B logging
302
+ enabled: false
303
+
304
+ # W&B project name
305
+ project: "ltx-2-trainer"
306
+
307
+ # W&B username or team (null uses default account)
308
+ entity: null
309
+
310
+ # Tags to help organize runs
311
+ tags: [ "ltx2", "lora" ]
312
+
313
+ # Log validation videos to W&B
314
+ log_validation_videos: true
315
+
316
+ # -----------------------------------------------------------------------------
317
+ # General Configuration
318
+ # -----------------------------------------------------------------------------
319
+ # Global settings for the training run.
320
+
321
+ # Random seed for reproducibility
322
+ seed: 42
323
+
324
+ # Directory to save outputs (checkpoints, validation videos, logs)
325
+ output_dir: "outputs/ltx2_av_lora"
packages/ltx-trainer/configs/ltx2_v2v_ic_lora.yaml ADDED
@@ -0,0 +1,329 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # =============================================================================
2
+ # LTX-2 Video-to-Video (IC-LoRA) Training Configuration
3
+ # =============================================================================
4
+ #
5
+ # This configuration is for training In-Context LoRA (IC-LoRA) adapters that
6
+ # enable video-to-video transformations. IC-LoRA learns to apply visual
7
+ # transformations (e.g., depth-to-video, pose control, style transfer, etc.)
8
+ # by conditioning on reference videos.
9
+ #
10
+ # Key differences from text-to-video LoRA:
11
+ # - Uses reference videos as conditioning input alongside text prompts
12
+ # - Requires preprocessed reference latents in addition to target latents
13
+ # - Validation requires reference videos to demonstrate the transformation
14
+ #
15
+ # Dataset structure for IC-LoRA training:
16
+ # preprocessed_data_root/
17
+ # ├── latents/ # Target video latents (what the model learns to generate)
18
+ # ├── conditions/ # Text embeddings for each video
19
+ # └── reference_latents/ # Reference video latents (conditioning input)
20
+ #
21
+ # =============================================================================
22
+
23
+ # -----------------------------------------------------------------------------
24
+ # Model Configuration
25
+ # -----------------------------------------------------------------------------
26
+ # Specifies the base model to fine-tune and the training mode.
27
+ model:
28
+ # Path to the LTX-2 model checkpoint (.safetensors file)
29
+ # This should be a local path to your downloaded model
30
+ model_path: "path/to/ltx-2-model.safetensors"
31
+
32
+ # Path to the text encoder model directory
33
+ # For LTX-2, this is typically the Gemma-based text encoder
34
+ text_encoder_path: "path/to/gemma-text-encoder"
35
+
36
+ # Training mode: "lora" for efficient adapter training, "full" for full fine-tuning
37
+ # Note: video_to_video strategy requires "lora" mode
38
+ training_mode: "lora"
39
+
40
+ # Optional: Path to resume training from a checkpoint
41
+ # Can be a checkpoint file (.safetensors) or directory (uses latest checkpoint)
42
+ load_checkpoint: null
43
+
44
+ # -----------------------------------------------------------------------------
45
+ # LoRA Configuration
46
+ # -----------------------------------------------------------------------------
47
+ # Controls the Low-Rank Adaptation parameters for efficient fine-tuning.
48
+ lora:
49
+ # Rank of the LoRA matrices (higher = more capacity but more parameters)
50
+ # Typical values: 8, 16, 32, 64. Start with 16-32 for IC-LoRA.
51
+ rank: 32
52
+
53
+ # Alpha scaling factor (usually set equal to rank)
54
+ # The effective scaling is alpha/rank, so alpha=rank means scaling of 1.0
55
+ alpha: 32
56
+
57
+ # Dropout probability for LoRA layers (0.0 = no dropout)
58
+ # Can help with regularization if overfitting occurs
59
+ dropout: 0.0
60
+
61
+ # Which transformer modules to apply LoRA to
62
+ # The LTX-2 transformer has separate attention and FFN blocks for video and audio:
63
+ #
64
+ # VIDEO MODULES:
65
+ # - attn1.to_k, attn1.to_q, attn1.to_v, attn1.to_out.0 (video self-attention)
66
+ # - attn2.to_k, attn2.to_q, attn2.to_v, attn2.to_out.0 (video cross-attention to text)
67
+ # - ff.net.0.proj, ff.net.2 (video feed-forward)
68
+ #
69
+ # AUDIO MODULES (not used for video-only IC-LoRA):
70
+ # - audio_attn1.to_k, audio_attn1.to_q, audio_attn1.to_v, audio_attn1.to_out.0 (audio self-attention)
71
+ # - audio_attn2.to_k, audio_attn2.to_q, audio_attn2.to_v, audio_attn2.to_out.0 (audio cross-attention to text)
72
+ # - audio_ff.net.0.proj, audio_ff.net.2 (audio feed-forward)
73
+ #
74
+ # AUDIO-VIDEO CROSS-ATTENTION MODULES (for cross-modal interaction, not used for video-only IC-LoRA):
75
+ # - audio_to_video_attn.to_k, audio_to_video_attn.to_q, audio_to_video_attn.to_v, audio_to_video_attn.to_out.0
76
+ # (Q from video, K/V from audio - allows video to attend to audio features)
77
+ # - video_to_audio_attn.to_k, video_to_audio_attn.to_q, video_to_audio_attn.to_v, video_to_audio_attn.to_out.0
78
+ # (Q from audio, K/V from video - allows audio to attend to video features)
79
+ #
80
+ # For IC-LoRA (video-only), we explicitly target video modules.
81
+ # Including FFN layers often improves transformation quality.
82
+ target_modules:
83
+ # Video self-attention
84
+ - "attn1.to_k"
85
+ - "attn1.to_q"
86
+ - "attn1.to_v"
87
+ - "attn1.to_out.0"
88
+ # Video cross-attention
89
+ - "attn2.to_k"
90
+ - "attn2.to_q"
91
+ - "attn2.to_v"
92
+ - "attn2.to_out.0"
93
+ # Video feed-forward (often improves transformation quality)
94
+ - "ff.net.0.proj"
95
+ - "ff.net.2"
96
+
97
+ # -----------------------------------------------------------------------------
98
+ # Training Strategy Configuration
99
+ # -----------------------------------------------------------------------------
100
+ # Defines the video-to-video (IC-LoRA) training approach.
101
+ training_strategy:
102
+ # Strategy name: "video_to_video" for IC-LoRA training
103
+ name: "video_to_video"
104
+
105
+ # Probability of conditioning on the first frame during training
106
+ # Higher values train the model to perform better in image-to-video (I2V) mode,
107
+ # where a clean first frame is provided and the model generates the rest of the video
108
+ # Increase this value to train the model to perform better in image-to-video (I2V) mode
109
+ first_frame_conditioning_p: 0.2
110
+
111
+ # Directory name (within preprocessed_data_root) containing reference video latents
112
+ # These are the conditioning inputs that guide the transformation
113
+ reference_latents_dir: "reference_latents"
114
+
115
+ # -----------------------------------------------------------------------------
116
+ # Optimization Configuration
117
+ # -----------------------------------------------------------------------------
118
+ # Controls the training optimization parameters.
119
+ optimization:
120
+ # Learning rate for the optimizer
121
+ # Typical range for LoRA: 1e-5 to 1e-4
122
+ learning_rate: 2e-4
123
+
124
+ # Total number of training steps
125
+ steps: 3000
126
+
127
+ # Batch size per GPU
128
+ # Reduce if running out of memory
129
+ batch_size: 1
130
+
131
+ # Number of gradient accumulation steps
132
+ # Effective batch size = batch_size * gradient_accumulation_steps * num_gpus
133
+ gradient_accumulation_steps: 1
134
+
135
+ # Maximum gradient norm for clipping (helps training stability)
136
+ max_grad_norm: 1.0
137
+
138
+ # Optimizer type: "adamw" (standard) or "adamw8bit" (memory-efficient)
139
+ optimizer_type: "adamw"
140
+
141
+ # Learning rate scheduler type
142
+ # Options: "constant", "linear", "cosine", "cosine_with_restarts", "polynomial"
143
+ scheduler_type: "linear"
144
+
145
+ # Additional scheduler parameters (depends on scheduler_type)
146
+ scheduler_params: { }
147
+
148
+ # Enable gradient checkpointing to reduce memory usage
149
+ # Recommended for training with limited GPU memory
150
+ enable_gradient_checkpointing: true
151
+
152
+ # -----------------------------------------------------------------------------
153
+ # Acceleration Configuration
154
+ # -----------------------------------------------------------------------------
155
+ # Hardware acceleration and memory optimization settings.
156
+ acceleration:
157
+ # Mixed precision training mode
158
+ # Options: "no" (fp32), "fp16" (half precision), "bf16" (bfloat16, recommended)
159
+ mixed_precision_mode: "bf16"
160
+
161
+ # Model quantization for reduced memory usage
162
+ # Options: null (none), "int8-quanto", "int4-quanto", "int2-quanto", "fp8-quanto", "fp8uz-quanto"
163
+ quantization: null
164
+
165
+ # Load text encoder in 8-bit precision to save memory
166
+ # Useful when GPU memory is limited
167
+ load_text_encoder_in_8bit: false
168
+
169
+ # -----------------------------------------------------------------------------
170
+ # Data Configuration
171
+ # -----------------------------------------------------------------------------
172
+ # Specifies the training data location and loading parameters.
173
+ data:
174
+ # Root directory containing preprocessed training data
175
+ # Should contain: latents/, conditions/, and reference_latents/ subdirectories
176
+ preprocessed_data_root: "/path/to/preprocessed/data"
177
+
178
+ # Number of worker processes for data loading
179
+ # More workers load data in parallel, which can speed up training
180
+ num_dataloader_workers: 2
181
+
182
+ # -----------------------------------------------------------------------------
183
+ # Validation Configuration
184
+ # -----------------------------------------------------------------------------
185
+ # Controls validation video generation during training.
186
+ # NOTE: Validation sampling uses a simplified inference pipeline and prioritizes speed over
187
+ # maximum quality. For production-quality inference, use `packages/ltx-pipelines`.
188
+ validation:
189
+ # Text prompts for validation video generation
190
+ # Provide prompts representative of your training data
191
+ # LTX-2 prefers longer, detailed prompts that describe both visual content and audio
192
+ prompts:
193
+ - "A man in a casual blue jacket walks along a winding path through a lush green park on a bright sunny afternoon. Tall oak trees line the pathway, their leaves rustling gently in the breeze. Dappled sunlight creates shifting patterns on the ground as he strolls at a relaxed pace, occasionally looking up at the scenery around him. The audio captures footsteps on gravel, birds singing in the trees, distant children playing, and the soft whisper of wind through the foliage."
194
+ - "A fluffy orange tabby cat sits perfectly still on a wooden windowsill, its green eyes intently tracking small birds hopping on a branch just outside the glass. The cat's ears twitch and rotate, following every movement. Warm afternoon light illuminates its fur, creating a soft golden glow. Behind the cat, a cozy living room with a bookshelf and houseplants is visible. The audio features gentle purring, occasional soft meows, muffled bird chirps through the window, and quiet ambient room sounds."
195
+
196
+ # Reference videos for validation (REQUIRED for video_to_video strategy)
197
+ # Must provide one reference video per prompt
198
+ # These are the conditioning inputs for generating validation outputs
199
+ reference_videos:
200
+ - "/path/to/reference_video_1.mp4"
201
+ - "/path/to/reference_video_2.mp4"
202
+
203
+ # Downscale factor for reference videos (for efficient IC-LoRA training)
204
+ # When > 1, reference videos are processed at 1/n resolution
205
+ # Must match the --reference-downscale-factor used during dataset preprocessing
206
+ # Examples: 1 = same resolution, 2 = half resolution (384x384 ref for 768x768 target)
207
+ reference_downscale_factor: 1
208
+
209
+ # Negative prompt to avoid unwanted artifacts
210
+ negative_prompt: "worst quality, inconsistent motion, blurry, jittery, distorted"
211
+
212
+ # Optional: First frame images for additional conditioning
213
+ # If provided, must have one image per prompt
214
+ images: null
215
+
216
+ # Output video dimensions [width, height, frames]
217
+ # Width and height must be divisible by 32
218
+ # Frames must satisfy: frames % 8 == 1 (e.g., 1, 9, 17, 25, 33, 41, 49, 57, 65, 73, 81, 89, ...)
219
+ video_dims: [ 512, 512, 81 ]
220
+
221
+ # Frame rate for generated videos
222
+ frame_rate: 25.0
223
+
224
+ # Random seed for reproducible validation outputs
225
+ seed: 42
226
+
227
+ # Number of denoising steps for validation inference
228
+ # Higher values = better quality but slower generation
229
+ inference_steps: 30
230
+
231
+ # Generate validation videos every N training steps
232
+ # Set to null to disable validation during training
233
+ interval: 100
234
+
235
+ # Number of videos to generate per prompt
236
+ videos_per_prompt: 1
237
+
238
+ # Classifier-free guidance scale
239
+ # Higher values = stronger adherence to prompt but may introduce artifacts
240
+ guidance_scale: 4.0
241
+
242
+ # STG (Spatio-Temporal Guidance) parameters for improved video quality
243
+ # STG is combined with CFG for better temporal coherence
244
+ stg_scale: 1.0 # Recommended: 1.0 (0.0 disables STG)
245
+ stg_blocks: [29] # Recommended: single block 29
246
+ stg_mode: "stg_v" # "stg_v" for video-only (no audio training)
247
+
248
+ # Whether to generate audio in validation samples
249
+ # Can be enabled even when not training the audio branch
250
+ generate_audio: false
251
+
252
+ # Skip validation at the beginning of training (step 0)
253
+ skip_initial_validation: false
254
+
255
+ # Concatenate reference video side-by-side with generated output
256
+ # Useful for visually comparing the transformation quality
257
+ include_reference_in_output: true
258
+
259
+ # -----------------------------------------------------------------------------
260
+ # Checkpoint Configuration
261
+ # -----------------------------------------------------------------------------
262
+ # Controls model checkpoint saving during training.
263
+ checkpoints:
264
+ # Save a checkpoint every N steps
265
+ # Set to null to disable intermediate checkpoints
266
+ interval: 250
267
+
268
+ # Number of most recent checkpoints to keep
269
+ # Set to -1 to keep all checkpoints
270
+ keep_last_n: 3
271
+
272
+ # Precision to use when saving checkpoint weights
273
+ # Options: "bfloat16" (default, smaller files) or "float32" (full precision)
274
+ precision: "bfloat16"
275
+
276
+ # -----------------------------------------------------------------------------
277
+ # Flow Matching Configuration
278
+ # -----------------------------------------------------------------------------
279
+ # Parameters for the flow matching training objective.
280
+ flow_matching:
281
+ # Timestep sampling mode
282
+ # "shifted_logit_normal" is recommended for LTX-2 models
283
+ timestep_sampling_mode: "shifted_logit_normal"
284
+
285
+ # Additional parameters for timestep sampling
286
+ timestep_sampling_params: { }
287
+
288
+ # -----------------------------------------------------------------------------
289
+ # Hugging Face Hub Configuration
290
+ # -----------------------------------------------------------------------------
291
+ # Settings for uploading trained models to the Hugging Face Hub.
292
+ hub:
293
+ # Whether to push the trained model to the Hub
294
+ push_to_hub: false
295
+
296
+ # Repository ID on Hugging Face Hub (e.g., "username/my-ic-lora-model")
297
+ # Required if push_to_hub is true
298
+ hub_model_id: null
299
+
300
+ # -----------------------------------------------------------------------------
301
+ # Weights & Biases Configuration
302
+ # -----------------------------------------------------------------------------
303
+ # Settings for experiment tracking with W&B.
304
+ wandb:
305
+ # Enable W&B logging
306
+ enabled: false
307
+
308
+ # W&B project name
309
+ project: "ltx-2-trainer"
310
+
311
+ # W&B username or team (null uses default account)
312
+ entity: null
313
+
314
+ # Tags to help organize runs
315
+ tags: [ "ltx2", "ic-lora", "video-to-video" ]
316
+
317
+ # Log validation videos to W&B
318
+ log_validation_videos: true
319
+
320
+ # -----------------------------------------------------------------------------
321
+ # General Configuration
322
+ # -----------------------------------------------------------------------------
323
+ # Global settings for the training run.
324
+
325
+ # Random seed for reproducibility
326
+ seed: 42
327
+
328
+ # Directory to save outputs (checkpoints, validation videos, logs)
329
+ output_dir: "outputs/ltx2_v2v_ic_lora"
packages/ltx-trainer/docs/configuration-reference.md ADDED
@@ -0,0 +1,372 @@
1
+ # Configuration Reference
2
+
3
+ The trainer uses structured Pydantic models for configuration, making it easy to customize training parameters.
4
+ This guide covers all available configuration options and their usage.
5
+
6
+ ## 📋 Overview
7
+
8
+ The main configuration class is [`LtxTrainerConfig`](../src/ltx_trainer/config.py), which includes the following
9
+ sub-configurations:
10
+
11
+ - **ModelConfig**: Base model and training mode settings
12
+ - **LoraConfig**: LoRA training parameters
13
+ - **TrainingStrategyConfig**: Training strategy settings (text-to-video or video-to-video)
14
+ - **OptimizationConfig**: Learning rate, batch sizes, and scheduler settings
15
+ - **AccelerationConfig**: Mixed precision and quantization settings
16
+ - **DataConfig**: Data loading parameters
17
+ - **ValidationConfig**: Validation and inference settings
18
+ - **CheckpointsConfig**: Checkpoint saving frequency and retention settings
19
+ - **HubConfig**: Hugging Face Hub integration settings
20
+ - **WandbConfig**: Weights & Biases logging settings
21
+ - **FlowMatchingConfig**: Timestep sampling parameters
22
+
23
+ ## 📄 Example Configuration Files
24
+
25
+ Check out our example configurations in the `configs` directory:
26
+
27
+ - 📄 [Audio-Video LoRA Training](../configs/ltx2_av_lora.yaml) - Joint audio-video generation training
28
+ - 📄 [Audio-Video LoRA Training (Low VRAM)](../configs/ltx2_av_lora_low_vram.yaml) - Memory-optimized config for 32GB
29
+ GPUs (uses 8-bit optimizer, INT8 quantization, and reduced LoRA rank)
30
+ - 📄 [IC-LoRA Training](../configs/ltx2_v2v_ic_lora.yaml) - Video-to-video transformation training
31
+
32
+ ## ⚙️ Configuration Sections
33
+
34
+ ### ModelConfig
35
+
36
+ Controls the base model and training mode settings.
37
+
38
+ ```yaml
39
+ model:
40
+ model_path: "/path/to/ltx-2-model.safetensors" # Local path to model checkpoint
41
+ text_encoder_path: "/path/to/gemma-model" # Path to Gemma text encoder directory
42
+ training_mode: "lora" # "lora" or "full"
43
+ load_checkpoint: null # Path to checkpoint to resume from
44
+ ```
45
+
46
+ **Key parameters:**
47
+
48
+ | Parameter | Description |
49
+ |---------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------|
50
+ | `model_path` | **Required.** Local path to the LTX-2 model checkpoint (`.safetensors` file). URLs are not supported. |
51
+ | `text_encoder_path` | **Required.** Path to the Gemma text encoder model directory. Download from [HuggingFace](https://huggingface.co/google/gemma-3-12b-it-qat-q4_0-unquantized/). |
52
+ | `training_mode` | Training approach - `"lora"` for LoRA training or `"full"` for full-rank fine-tuning. |
53
+ | `load_checkpoint` | Optional path to resume training from a checkpoint file or directory. |
54
+
55
+ > [!NOTE]
56
+ > LTX-2 requires both a model checkpoint and a Gemma text encoder. Both must be local paths.
57
+
58
+ ### LoraConfig
59
+
60
+ LoRA-specific fine-tuning parameters (only used when `training_mode: "lora"`).
61
+
62
+ ```yaml
63
+ lora:
64
+ rank: 32 # LoRA rank (higher = more parameters)
65
+ alpha: 32 # LoRA alpha scaling factor
66
+ dropout: 0.0 # Dropout probability (0.0-1.0)
67
+ target_modules: # Modules to apply LoRA to
68
+ - "to_k"
69
+ - "to_q"
70
+ - "to_v"
71
+ - "to_out.0"
72
+ ```
73
+
74
+ **Key parameters:**
75
+
76
+ | Parameter | Description |
77
+ |------------------|---------------------------------------------------------------------------------|
78
+ | `rank` | LoRA rank - higher values mean more trainable parameters (typical range: 8-128) |
79
+ | `alpha` | Alpha scaling factor - typically set equal to rank |
80
+ | `dropout` | Dropout probability for regularization |
81
+ | `target_modules` | List of transformer modules to apply LoRA adapters to (see below) |
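+
+ In standard LoRA implementations the adapter update is scaled by `alpha / rank`, so setting `alpha` equal to `rank`
+ keeps that scale at 1.0, which is why the example configs use matching values.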
82
+
83
+ #### Understanding Target Modules
84
+
85
+ The LTX-2 transformer has separate attention and feed-forward blocks for video and audio, as well as cross-attention
86
+ modules that enable the two modalities to exchange information. Choosing the right `target_modules` is critical for
87
+ achieving good results, especially when training with audio.
88
+
89
+ **Video-only modules:**
90
+
91
+ | Module Pattern | Description |
92
+ |------------------------------------------------------------|---------------------------------|
93
+ | `attn1.to_k`, `attn1.to_q`, `attn1.to_v`, `attn1.to_out.0` | Video self-attention |
94
+ | `attn2.to_k`, `attn2.to_q`, `attn2.to_v`, `attn2.to_out.0` | Video cross-attention (to text) |
95
+ | `ff.net.0.proj`, `ff.net.2` | Video feed-forward network |
96
+
97
+ **Audio-only modules:**
98
+
99
+ | Module Pattern | Description |
100
+ |------------------------------------------------------------------------------------|---------------------------------|
101
+ | `audio_attn1.to_k`, `audio_attn1.to_q`, `audio_attn1.to_v`, `audio_attn1.to_out.0` | Audio self-attention |
102
+ | `audio_attn2.to_k`, `audio_attn2.to_q`, `audio_attn2.to_v`, `audio_attn2.to_out.0` | Audio cross-attention (to text) |
103
+ | `audio_ff.net.0.proj`, `audio_ff.net.2` | Audio feed-forward network |
104
+
105
+ **Audio-video cross-attention modules:**
106
+
107
+ These modules enable bidirectional information flow between the audio and video modalities:
108
+
109
+ | Module Pattern | Description |
110
+ |--------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------|
111
+ | `audio_to_video_attn.to_k`, `audio_to_video_attn.to_q`, `audio_to_video_attn.to_v`, `audio_to_video_attn.to_out.0` | Video attends to audio (Q from video, K/V from audio) |
112
+ | `video_to_audio_attn.to_k`, `video_to_audio_attn.to_q`, `video_to_audio_attn.to_v`, `video_to_audio_attn.to_out.0` | Audio attends to video (Q from audio, K/V from video) |
113
+
114
+ **Recommended configurations:**
115
+
116
+ For **video-only training**, target the video attention layers:
117
+
118
+ ```yaml
119
+ target_modules:
120
+ - "attn1.to_k"
121
+ - "attn1.to_q"
122
+ - "attn1.to_v"
123
+ - "attn1.to_out.0"
124
+ - "attn2.to_k"
125
+ - "attn2.to_q"
126
+ - "attn2.to_v"
127
+ - "attn2.to_out.0"
128
+ ```
129
+
130
+ For **audio-video training**, use patterns that match both branches:
131
+
132
+ ```yaml
133
+ target_modules:
134
+ - "to_k"
135
+ - "to_q"
136
+ - "to_v"
137
+ - "to_out.0"
138
+ ```
139
+
140
+ > [!NOTE]
141
+ > Using shorter patterns like `"to_k"` will match all attention modules including `attn1.to_k`, `audio_attn1.to_k`,
142
+ > `audio_to_video_attn.to_k`, and `video_to_audio_attn.to_k`, effectively training video, audio, and cross-modal
143
+ > attention branches together.
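+
+ To make the matching behaviour concrete, the sketch below assumes suffix-based matching (as in common LoRA
+ implementations); the module names are illustrative and the trainer's exact matching rule may differ:
+
+ ```python
+ module_names = [
+     "blocks.0.attn1.to_k",
+     "blocks.0.audio_attn1.to_k",
+     "blocks.0.audio_to_video_attn.to_k",
+     "blocks.0.ff.net.0.proj",
+ ]
+
+ target_modules = ["to_k"]  # short pattern
+
+ # Suffix matching: every *.to_k module matches, the feed-forward projection does not
+ matched = [name for name in module_names if any(name.endswith(t) for t in target_modules)]
+ print(matched)
+ ```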
144
+
145
+ > [!TIP]
146
+ > You can also target the feed-forward (FFN) modules (`ff.net.0.proj`, `ff.net.2` for video,
147
+ > `audio_ff.net.0.proj`, `audio_ff.net.2` for audio) to increase the LoRA's capacity and potentially
148
+ > help it capture the target distribution better.
149
+
150
+ ### TrainingStrategyConfig
151
+
152
+ Configures the training strategy. The trainer includes two built-in strategies described below.
153
+ For custom use cases, see [Implementing Custom Training Strategies](custom-training-strategies.md).
154
+
155
+ #### Text-to-Video Strategy
156
+
157
+ ```yaml
158
+ training_strategy:
159
+ name: "text_to_video"
160
+ first_frame_conditioning_p: 0.1 # Probability of first-frame conditioning
161
+ with_audio: false # Enable joint audio-video training
162
+ audio_latents_dir: "audio_latents" # Directory for audio latents (when with_audio: true)
163
+ ```
164
+
165
+ #### Video-to-Video Strategy (IC-LoRA)
166
+
167
+ ```yaml
168
+ training_strategy:
169
+ name: "video_to_video"
170
+ first_frame_conditioning_p: 0.1
171
+ reference_latents_dir: "reference_latents" # Directory for reference video latents
172
+ ```
173
+
174
+ **Key parameters:**
175
+
176
+ | Parameter | Description |
177
+ |------------------------------|------------------------------------------------------------------|
178
+ | `name` | Strategy type: `"text_to_video"` or `"video_to_video"` |
179
+ | `first_frame_conditioning_p` | Probability of using first frame as conditioning (0.0-1.0) |
180
+ | `with_audio` | (text_to_video only) Enable joint audio-video training |
181
+ | `audio_latents_dir` | (text_to_video only) Directory name for audio latents |
182
+ | `reference_latents_dir` | (video_to_video only) Directory name for reference video latents |
183
+
184
+ ### OptimizationConfig
185
+
186
+ Training optimization parameters including learning rates, batch sizes, and schedulers.
187
+
188
+ ```yaml
189
+ optimization:
190
+ learning_rate: 1e-4 # Learning rate
191
+ steps: 2000 # Total training steps
192
+ batch_size: 1 # Batch size per GPU
193
+ gradient_accumulation_steps: 1 # Steps to accumulate gradients
194
+ max_grad_norm: 1.0 # Gradient clipping threshold
195
+ optimizer_type: "adamw" # "adamw" or "adamw8bit"
196
+ scheduler_type: "linear" # Scheduler type
197
+ scheduler_params: { } # Additional scheduler parameters
198
+ enable_gradient_checkpointing: true # Memory optimization
199
+ ```
200
+
201
+ **Key parameters:**
202
+
203
+ | Parameter | Description |
204
+ |---------------------------------|----------------------------------------------------------------------------------------------|
205
+ | `learning_rate` | Learning rate for optimization (typical range: 1e-5 to 1e-3) |
206
+ | `steps` | Total number of training steps |
207
+ | `batch_size` | Batch size per GPU (reduce if running out of memory) |
208
+ | `gradient_accumulation_steps` | Accumulate gradients over multiple steps |
209
+ | `scheduler_type` | LR scheduler: `"constant"`, `"linear"`, `"cosine"`, `"cosine_with_restarts"`, `"polynomial"` |
210
+ | `enable_gradient_checkpointing` | Trade training speed for GPU memory savings (recommended for large models) |
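+
+ The effective batch size is `batch_size * gradient_accumulation_steps * num_gpus`; for example, `batch_size: 1` with
+ `gradient_accumulation_steps: 4` on 2 GPUs gives an effective batch size of 8.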
211
+
212
+ ### AccelerationConfig
213
+
214
+ Hardware acceleration and compute optimization settings.
215
+
216
+ ```yaml
217
+ acceleration:
218
+ mixed_precision_mode: "bf16" # "no", "fp16", or "bf16"
219
+ quantization: null # Quantization options
220
+ load_text_encoder_in_8bit: false # Load text encoder in 8-bit
221
+ ```
222
+
223
+ **Key parameters:**
224
+
225
+ | Parameter | Description |
226
+ |-----------------------------|------------------------------------------------------------------------------------|
227
+ | `mixed_precision_mode` | Precision mode - `"bf16"` recommended for modern GPUs |
228
+ | `quantization` | Model quantization: `null`, `"int8-quanto"`, `"int4-quanto"`, `"fp8-quanto"`, etc. |
229
+ | `load_text_encoder_in_8bit` | Load the Gemma text encoder in 8-bit to save GPU memory |
230
+
231
+ ### DataConfig
232
+
233
+ Data loading and processing configuration.
234
+
235
+ ```yaml
236
+ data:
237
+ preprocessed_data_root: "/path/to/preprocessed/data" # Path to precomputed dataset
238
+ num_dataloader_workers: 2 # Background data loading workers
239
+ ```
240
+
241
+ **Key parameters:**
242
+
243
+ | Parameter | Description |
244
+ |--------------------------|--------------------------------------------------------------------------------------------|
245
+ | `preprocessed_data_root` | Path to your preprocessed dataset (contains `latents/`, `conditions/`, etc.) |
246
+ | `num_dataloader_workers` | Number of parallel data loading processes (0 = synchronous loading, useful when debugging) |
247
+
248
+ ### ValidationConfig
249
+
250
+ Validation and inference settings for monitoring training progress.
251
+
252
+ ```yaml
253
+ validation:
254
+ prompts: # Validation prompts
255
+ - "A cat playing with a ball"
256
+ - "A dog running in a field"
257
+ negative_prompt: "worst quality, inconsistent motion, blurry, jittery, distorted"
258
+ images: null # Optional image paths for image-to-video
259
+ reference_videos: null # Reference video paths (IC-LoRA only)
260
+ video_dims: [ 576, 576, 89 ] # Video dimensions [width, height, frames]
261
+ frame_rate: 25.0 # Frame rate for generated videos
262
+ seed: 42 # Random seed for reproducibility
263
+ inference_steps: 30 # Number of inference steps
264
+ interval: 100 # Steps between validation runs
265
+ videos_per_prompt: 1 # Videos generated per prompt
266
+ guidance_scale: 4.0 # CFG guidance strength
267
+ stg_scale: 1.0 # STG guidance strength (0.0 to disable)
268
+ stg_blocks: [ 29 ] # Transformer blocks to perturb for STG
269
+ stg_mode: "stg_av" # "stg_av" or "stg_v" (video only)
270
+ generate_audio: true # Whether to generate audio
271
+ skip_initial_validation: false # Skip validation at step 0
272
+ include_reference_in_output: false # Include reference video side-by-side (IC-LoRA)
273
+ ```
274
+
275
+ **Key parameters:**
276
+
277
+ | Parameter | Description |
278
+ |-------------------------------|--------------------------------------------------------------------------------------------------------------------------|
279
+ | `prompts` | List of text prompts for validation video generation |
280
+ | `images` | List of image paths for image-to-video validation (must match number of prompts) |
281
+ | `reference_videos` | List of reference video paths for IC-LoRA validation (must match number of prompts) |
282
+ | `video_dims` | Output dimensions `[width, height, frames]`. Width/height must be divisible by 32, frames must satisfy `frames % 8 == 1` |
283
+ | `interval` | Steps between validation runs (set to `null` to disable) |
284
+ | `guidance_scale` | CFG (Classifier-Free Guidance) scale. Recommended: 4.0 |
285
+ | `stg_scale` | STG (Spatio-Temporal Guidance) scale. 0.0 disables STG. Recommended: 1.0 |
286
+ | `stg_blocks` | Transformer blocks to perturb for STG. Recommended: `[29]` (single block) |
287
+ | `stg_mode` | STG mode: `"stg_av"` perturbs both audio and video, `"stg_v"` perturbs video only |
288
+ | `generate_audio` | Whether to generate audio in validation samples |
289
+ | `include_reference_in_output` | For IC-LoRA: concatenate reference video side-by-side with output |
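+
+ As a quick sanity check, the snippet below validates a candidate `video_dims` value against these constraints
+ (a minimal sketch; the trainer performs its own validation):
+
+ ```python
+ width, height, frames = 512, 512, 81  # candidate video_dims
+
+ assert width % 32 == 0 and height % 32 == 0, "width/height must be divisible by 32"
+ assert frames % 8 == 1, "frames must satisfy frames % 8 == 1"
+ ```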
290
+
291
+ ### CheckpointsConfig
292
+
293
+ Model checkpointing configuration.
294
+
295
+ ```yaml
296
+ checkpoints:
297
+ interval: 250 # Steps between checkpoint saves (null = disabled)
298
+ keep_last_n: 3 # Number of recent checkpoints to retain
299
+ precision: bfloat16 # Precision for saved weights (bfloat16 or float32)
300
+ ```
301
+
302
+ **Key parameters:**
303
+
304
+ | Parameter | Description |
305
+ |---------------|-------------------------------------------------------------------------------|
306
+ | `interval` | Steps between intermediate checkpoint saves (set to `null` to disable) |
307
+ | `keep_last_n` | Number of most recent checkpoints to keep (-1 = keep all) |
308
+ | `precision` | Precision for saved checkpoint weights: `"bfloat16"` (default) or `"float32"` |
309
+
310
+ ### HubConfig
311
+
312
+ Hugging Face Hub integration for automatic model uploads.
313
+
314
+ ```yaml
315
+ hub:
316
+ push_to_hub: false # Enable Hub uploading
317
+ hub_model_id: "username/model-name" # Hub repository ID
318
+ ```
319
+
320
+ **Key parameters:**
321
+
322
+ | Parameter | Description |
323
+ |----------------|------------------------------------------------------------------|
324
+ | `push_to_hub` | Whether to automatically push trained models to Hugging Face Hub |
325
+ | `hub_model_id` | Repository ID in format `"username/repository-name"` |
326
+
327
+ ### WandbConfig
328
+
329
+ Weights & Biases logging configuration.
330
+
331
+ ```yaml
332
+ wandb:
333
+ enabled: false # Enable W&B logging
334
+ project: "ltx-2-trainer" # W&B project name
335
+ entity: null # W&B username or team
336
+ tags: [ ] # Tags for the run
337
+ log_validation_videos: true # Log validation videos to W&B
338
+ ```
339
+
340
+ **Key parameters:**
341
+
342
+ | Parameter | Description |
343
+ |-------------------------|--------------------------------------------------|
344
+ | `enabled` | Whether to enable W&B logging |
345
+ | `project` | W&B project name |
346
+ | `entity` | W&B username or team (null uses default account) |
347
+ | `log_validation_videos` | Whether to log validation videos to W&B |
348
+
349
+ ### FlowMatchingConfig
350
+
351
+ Flow matching training configuration for timestep sampling.
352
+
353
+ ```yaml
354
+ flow_matching:
355
+ timestep_sampling_mode: "shifted_logit_normal" # Timestep sampling strategy
356
+ timestep_sampling_params: { } # Additional sampling parameters
357
+ ```
358
+
359
+ **Key parameters:**
360
+
361
+ | Parameter | Description |
362
+ |----------------------------|------------------------------------------------------------|
363
+ | `timestep_sampling_mode` | Sampling strategy: `"uniform"` or `"shifted_logit_normal"` |
364
+ | `timestep_sampling_params` | Additional parameters for the sampling strategy |
365
+
366
+ ## 🚀 Next Steps
367
+
368
+ Once you've configured your training parameters:
369
+
370
+ - Set up your dataset using [Dataset Preparation](dataset-preparation.md)
371
+ - Choose your training approach in [Training Modes](training-modes.md)
372
+ - Start training with the [Training Guide](training-guide.md)
packages/ltx-trainer/docs/custom-training-strategies.md ADDED
@@ -0,0 +1,510 @@
1
+ # Implementing Custom Training Strategies
2
+
3
+ This guide explains how to implement your own training strategy for specialized use cases like audio-only training,
4
+ video inpainting, or other custom training recipes.
5
+
6
+ ## 📋 Overview
7
+
8
+ The trainer uses the **Strategy Pattern** to separate training logic from the core training loop. Each strategy defines:
9
+
10
+ 1. **What data is needed** - Which preprocessed data directories to load
11
+ 2. **How to prepare inputs** - Transform batch data into model inputs
12
+ 3. **How to compute loss** - Calculate the training objective
13
+
14
+ This architecture lets you implement new training modes without modifying the core trainer code.
15
+
16
+ ### When You Need a Custom Strategy
17
+
18
+ Consider implementing a custom strategy when you need:
19
+
20
+ - **Different input modalities** (e.g., audio-only, audio-to-video conditioning)
21
+ - **Additional conditioning signals** (e.g., masks for inpainting, depth maps)
22
+ - **Custom loss computation** (e.g., weighted losses, auxiliary losses)
23
+ - **Different noise application patterns** (e.g., partial masking)
24
+
25
+ ## 🏗️ Architecture Overview
26
+
27
+ ### How Strategies Fit Into the Trainer
28
+
29
+ The trainer delegates all training-mode-specific logic to the strategy:
30
+
31
+ 1. **Initialization** — The trainer calls `get_data_sources()` to determine which preprocessed data directories to load
32
+ 2. **Each training step:**
33
+ - Calls `prepare_training_inputs()` to transform the raw batch into model-ready inputs
34
+ - Runs the transformer forward pass
35
+ - Calls `compute_loss()` to compute the training objective
36
+
37
+ The trainer handles everything else: optimization, checkpointing, validation, and distributed training.
38
+
39
+ ### Key Components
40
+
41
+ | Component | Purpose |
42
+ |-----------------------------------------------------------------------------------------|--------------------------------------------------------------|
43
+ | [`TrainingStrategyConfigBase`](../src/ltx_trainer/training_strategies/base_strategy.py) | Base class for strategy configuration (Pydantic model) |
44
+ | [`TrainingStrategy`](../src/ltx_trainer/training_strategies/base_strategy.py) | Abstract base class defining the strategy interface |
45
+ | [`ModelInputs`](../src/ltx_trainer/training_strategies/base_strategy.py) | Dataclass containing prepared inputs for the transformer |
46
+ | [`Modality`](../../ltx-core/src/ltx_core/model/transformer/modality.py) | ltx-core dataclass representing video or audio modality data |
47
+
48
+ ## 📝 Step-by-Step Implementation
49
+
50
+ ### Step 1: Plan Your Strategy
51
+
52
+ Before writing code, answer these questions:
53
+
54
+ 1. **What additional data does your strategy need?**
55
+ - Example: Inpainting needs mask latents alongside video latents
56
+ - Example: Audio-to-video needs reference audio embeddings
57
+
58
+ 2. **What does conditioning look like?**
59
+ - Which tokens should be noised vs. kept clean?
60
+ - How should conditioning tokens be structured (e.g., first frame, reference video, mask)?
61
+
62
+ 3. **How should loss be computed?**
63
+ - Which tokens contribute to the loss?
64
+ - Are there multiple loss terms to combine?
65
+
66
+ ### Step 2: Extend Data Preprocessing (If Needed)
67
+
68
+ If your strategy requires additional preprocessed data beyond video latents, audio latents, and text embeddings, you'll
69
+ need to extend the preprocessing pipeline.
70
+
71
+ #### Option A: Modify `process_dataset.py`
72
+
73
+ For integrated preprocessing, add new arguments and processing steps to the main script. For example, to add mask
74
+ preprocessing:
75
+
76
+ ```python
77
+ # In process_dataset.py, add a new argument
78
+ @app.command()
79
+ def main(
80
+ # ... existing arguments ...
81
+ mask_column: str | None = typer.Option(
82
+ default=None,
83
+ help="Column name containing mask video paths (for inpainting)",
84
+ ),
85
+ ) -> None:
86
+ # ... existing processing ...
87
+
88
+ # Process masks if provided
89
+ if mask_column:
90
+ logger.info("Processing mask videos for inpainting training...")
91
+ mask_latents_dir = output_base / "mask_latents"
92
+
93
+ compute_latents(
94
+ dataset_file=dataset_path,
95
+ video_column=mask_column,
96
+ resolution_buckets=parsed_resolution_buckets,
97
+ output_dir=str(mask_latents_dir),
98
+ model_path=model_path,
99
+ # ... other args ...
100
+ )
101
+ ```
102
+
103
+ #### Option B: Create a Standalone Script
104
+
105
+ For complex preprocessing that doesn't fit naturally into the existing pipeline, create a dedicated script
106
+ (e.g., `scripts/process_masks.py`). Use [`scripts/compute_reference.py`](../scripts/compute_reference.py) as a
107
+ template - it shows how to process paired data and update the dataset JSON.
108
+
109
+ #### Expected Output Structure
110
+
111
+ Your preprocessing should create a directory structure that the strategy can reference:
112
+
113
+ ```
114
+ preprocessed_data_root/
115
+ ├── latents/ # Video latents (standard)
116
+ ├── conditions/ # Text embeddings (standard)
117
+ ├── audio_latents/ # Audio latents (if with_audio)
118
+ ├── mask_latents/ # Your custom data directory
119
+ └── reference_latents/ # Reference videos (for IC-LoRA)
120
+ ```
121
+
122
+ ### Step 3: Create the Strategy Configuration
123
+
124
+ Create a new file for your strategy (e.g., `src/ltx_trainer/training_strategies/inpainting.py`):
125
+
126
+ ```python
127
+ """Inpainting training strategy.
128
+
129
+ This strategy implements video inpainting training where:
130
+ - Mask latents indicate which regions to inpaint
131
+ - Loss is computed only on masked (inpainted) regions
132
+ """
133
+
134
+ from typing import Any, Literal
135
+
136
+ import torch
137
+ from pydantic import Field
138
+ from torch import Tensor
139
+
140
+ from ltx_core.model.transformer.modality import Modality
141
+ from ltx_trainer.timestep_samplers import TimestepSampler
142
+ from ltx_trainer.training_strategies.base_strategy import (
143
+ ModelInputs,
144
+ TrainingStrategy,
145
+ TrainingStrategyConfigBase,
146
+ )
147
+
148
+
149
+ class InpaintingConfig(TrainingStrategyConfigBase):
150
+ """Configuration for inpainting training strategy."""
151
+
152
+ # The 'name' field acts as a discriminator for the config union
153
+ name: Literal["inpainting"] = "inpainting"
154
+
155
+ mask_latents_dir: str = Field(
156
+ default="mask_latents",
157
+ description="Directory name for mask latents",
158
+ )
159
+
160
+ # Add any strategy-specific parameters
161
+ mask_threshold: float = Field(
162
+ default=0.5,
163
+ description="Threshold for binary mask conversion",
164
+ ge=0.0,
165
+ le=1.0,
166
+ )
167
+ ```
168
+
169
+ **Key points:**
170
+
171
+ - Inherit from `TrainingStrategyConfigBase`
172
+ - Use `Literal["your_strategy_name"]` for the `name` field - this enables automatic strategy selection
173
+ - Use Pydantic `Field` for validation and documentation
174
+
175
+ ### Step 4: Implement the Strategy Class
176
+
177
+ ```python
178
+ class InpaintingStrategy(TrainingStrategy):
179
+ """Inpainting training strategy.
180
+
181
+ Trains the model to fill in masked regions of videos while
182
+ keeping unmasked regions as conditioning.
183
+ """
184
+
185
+ config: InpaintingConfig
186
+
187
+ def __init__(self, config: InpaintingConfig):
188
+ super().__init__(config)
189
+
190
+ @property
191
+ def requires_audio(self) -> bool:
192
+ """Whether this strategy requires audio components."""
193
+ return False # Set to True if your strategy needs audio
194
+
195
+ def get_data_sources(self) -> dict[str, str]:
196
+ """Define which data directories to load.
197
+
198
+ Returns a mapping of directory names to batch keys.
199
+ The trainer will load .pt files from each directory and
200
+ make them available in the batch under the specified key.
201
+ """
202
+ return {
203
+ "latents": "latents", # -> batch["latents"]
204
+ "conditions": "conditions", # -> batch["conditions"]
205
+ self.config.mask_latents_dir: "masks", # -> batch["masks"]
206
+ }
207
+
208
+ def prepare_training_inputs(
209
+ self,
210
+ batch: dict[str, Any],
211
+ timestep_sampler: TimestepSampler,
212
+ ) -> ModelInputs:
213
+ """Transform batch data into model inputs.
214
+
215
+ This is where the core training logic lives:
216
+ 1. Extract and patchify latents
217
+ 2. Sample noise and apply it appropriately
218
+ 3. Create conditioning masks
219
+ 4. Build Modality objects for the transformer
220
+ """
221
+ # Get video latents [B, C, F, H, W]
222
+ latents_data = batch["latents"]
223
+ video_latents = latents_data["latents"]
224
+
225
+ # Get dimensions
226
+ num_frames = latents_data["num_frames"][0].item()
227
+ height = latents_data["height"][0].item()
228
+ width = latents_data["width"][0].item()
229
+
230
+ # Patchify: [B, C, F, H, W] -> [B, seq_len, C]
231
+ video_latents = self._video_patchifier.patchify(video_latents)
232
+
233
+ batch_size, seq_len, _ = video_latents.shape
234
+ device = video_latents.device
235
+ dtype = video_latents.dtype
236
+
237
+ # Get mask latents and process them
238
+ mask_data = batch["masks"]
239
+ mask_latents = mask_data["latents"]
240
+ mask_latents = self._video_patchifier.patchify(mask_latents)
241
+
242
+ # Create binary mask: True = inpaint this region, False = keep original
243
+ inpaint_mask = mask_latents.mean(dim=-1) > self.config.mask_threshold
244
+
245
+ # Sample noise and sigmas
246
+ sigmas = timestep_sampler.sample_for(video_latents)
247
+ noise = torch.randn_like(video_latents)
248
+
249
+ # Apply noise only to inpaint regions
250
+ sigmas_expanded = sigmas.view(-1, 1, 1)
251
+ noisy_latents = (1 - sigmas_expanded) * video_latents + sigmas_expanded * noise
252
+
253
+ # Keep original latents for non-inpaint regions (conditioning)
254
+ inpaint_mask_expanded = inpaint_mask.unsqueeze(-1)
255
+ noisy_latents = torch.where(inpaint_mask_expanded, noisy_latents, video_latents)
256
+
257
+ # Create per-token timesteps
258
+ # Conditioning tokens (non-inpaint) get timestep=0
259
+ # Inpaint tokens get the sampled sigma
260
+ timesteps = self._create_per_token_timesteps(~inpaint_mask, sigmas.squeeze())
261
+
262
+ # Compute targets (velocity prediction: noise - clean)
263
+ targets = noise - video_latents
264
+
265
+ # Get text embeddings
266
+ conditions = batch["conditions"]
267
+ video_prompt_embeds = conditions["video_prompt_embeds"]
268
+ prompt_attention_mask = conditions["prompt_attention_mask"]
269
+
270
+ # Generate position embeddings
271
+ positions = self._get_video_positions(
272
+ num_frames=num_frames,
273
+ height=height,
274
+ width=width,
275
+ batch_size=batch_size,
276
+ fps=24.0, # Or get from latents_data
277
+ device=device,
278
+ dtype=dtype,
279
+ )
280
+
281
+ # Create video Modality
282
+ video_modality = Modality(
283
+ enabled=True,
284
+ latent=noisy_latents,
285
+ sigma=sigmas,
286
+ timesteps=timesteps,
287
+ positions=positions,
288
+ context=video_prompt_embeds,
289
+ context_mask=prompt_attention_mask,
290
+ )
291
+
292
+ # Loss mask: only compute loss on inpaint regions
293
+ loss_mask = inpaint_mask
294
+
295
+ return ModelInputs(
296
+ video=video_modality,
297
+ audio=None,
298
+ video_targets=targets,
299
+ audio_targets=None,
300
+ video_loss_mask=loss_mask,
301
+ audio_loss_mask=None,
302
+ )
303
+
304
+ def compute_loss(
305
+ self,
306
+ video_pred: Tensor,
307
+ audio_pred: Tensor | None,
308
+ inputs: ModelInputs,
309
+ ) -> Tensor:
310
+ """Compute training loss on inpaint regions only."""
311
+ # MSE loss
312
+ loss = (video_pred - inputs.video_targets).pow(2)
313
+
314
+ # Apply loss mask
315
+ loss_mask = inputs.video_loss_mask.unsqueeze(-1).float()
316
+ loss = loss.mul(loss_mask).div(loss_mask.mean() + 1e-8)
317
+
318
+ return loss.mean()
319
+ ```
320
+
321
+ ### Step 5: Register the Strategy
322
+
323
+ You need to register your strategy in two places:
324
+
325
+ **1. Update [`src/ltx_trainer/training_strategies/__init__.py`](../src/ltx_trainer/training_strategies/__init__.py):**
326
+
327
+ ```python
328
+ # Add import for your strategy
329
+ from ltx_trainer.training_strategies.inpainting import InpaintingConfig, InpaintingStrategy
330
+
331
+ # Add to the TrainingStrategyConfig type alias
332
+ TrainingStrategyConfig = TextToVideoConfig | VideoToVideoConfig | InpaintingConfig
333
+
334
+ # Add to __all__
335
+ __all__ = [
336
+ # ... existing exports ...
337
+ "InpaintingConfig",
338
+ "InpaintingStrategy",
339
+ ]
340
+
341
+
342
+ # Add case in get_training_strategy()
343
+ def get_training_strategy(config: TrainingStrategyConfig) -> TrainingStrategy:
344
+ match config:
345
+ # ... existing cases ...
346
+ case InpaintingConfig():
347
+ strategy = InpaintingStrategy(config)
348
+ ```
349
+
350
+ **2. Update [`src/ltx_trainer/config.py`](../src/ltx_trainer/config.py):**
351
+
352
+ ```python
353
+ # Add import
354
+ from ltx_trainer.training_strategies.inpainting import InpaintingConfig
355
+
356
+ # Add to the TrainingStrategyConfig union with a Tag matching your strategy name
357
+ TrainingStrategyConfig = Annotated[
358
+ Annotated[TextToVideoConfig, Tag("text_to_video")]
359
+ | Annotated[VideoToVideoConfig, Tag("video_to_video")]
360
+ | Annotated[InpaintingConfig, Tag("inpainting")], # Add your config
361
+ Discriminator(_get_strategy_discriminator),
362
+ ]
363
+ ```
364
+
365
+ ### Step 6: Create a Configuration File
366
+
367
+ Create an example config in `configs/`:
368
+
369
+ ```yaml
370
+ # configs/ltx2_inpainting_lora.yaml
371
+
372
+ model:
373
+ model_path: "/path/to/ltx2.safetensors"
374
+ text_encoder_path: "/path/to/gemma"
375
+ training_mode: "lora"
376
+
377
+ training_strategy:
378
+ name: "inpainting" # Must match your Literal type
379
+ mask_latents_dir: "mask_latents"
380
+ mask_threshold: 0.5
381
+
382
+ lora:
383
+ rank: 32
384
+ alpha: 32
385
+ target_modules:
386
+ - "to_k"
387
+ - "to_q"
388
+ - "to_v"
389
+ - "to_out.0"
390
+
391
+ data:
392
+ preprocessed_data_root: "/path/to/preprocessed/dataset"
393
+
394
+ optimization:
395
+ learning_rate: 1e-4
396
+ steps: 2000
397
+ batch_size: 1
398
+
399
+ # ... other config sections ...
400
+ ```
401
+
402
+ ## 🔧 Helper Methods Reference
403
+
404
+ The base `TrainingStrategy` class provides these helper methods:
405
+
406
+ | Method | Purpose |
407
+ |----------------------------------------------|-------------------------------------------------|
408
+ | `_video_patchifier.patchify(latents)` | Convert `[B, C, F, H, W]` → `[B, seq_len, C]` |
409
+ | `_audio_patchifier.patchify(latents)` | Convert `[B, C, T, F]` → `[B, T, C*F]` |
410
+ | `_get_video_positions(...)` | Generate position embeddings for video |
411
+ | `_get_audio_positions(...)` | Generate position embeddings for audio |
412
+ | `_create_per_token_timesteps(mask, sigma)` | Create timesteps with 0 for conditioning tokens |
413
+ | `_create_first_frame_conditioning_mask(...)` | Create mask for first-frame conditioning |
414
+
415
+ ## 📊 Understanding ModelInputs
416
+
417
+ The `ModelInputs` dataclass contains everything needed for the forward pass and loss computation:
418
+
419
+ ```python
420
+ @dataclass
421
+ class ModelInputs:
422
+ video: Modality # Video modality data
423
+ audio: Modality | None # Audio modality (None if video-only)
424
+
425
+ video_targets: Tensor # Target values for loss (velocity)
426
+ audio_targets: Tensor | None
427
+
428
+ video_loss_mask: Tensor # Boolean: True = compute loss for this token
429
+ audio_loss_mask: Tensor | None
430
+
431
+ ref_seq_len: int | None = None # For IC-LoRA: reference sequence length
432
+ ```
433
+
434
+ ## 📊 Understanding Modality
435
+
436
+ The `Modality` dataclass (from ltx-core) represents a single modality's data:
437
+
438
+ ```python
439
+ @dataclass(frozen=True)
440
+ class Modality:
441
+ enabled: bool # Whether this modality is active
442
+ latent: Tensor # [B, seq_len, C] - the latent tokens
443
+ timesteps: Tensor # [B, seq_len] - per-token timesteps (sigmas)
444
+ positions: Tensor # [B, dims, seq_len, 2] - position bounds
445
+ context: Tensor # [B, ctx_len, C] - text embeddings
446
+ context_mask: Tensor # [B, ctx_len] - attention mask for context
447
+ ```
448
+
449
+ > [!NOTE]
450
+ > **Per-token timesteps:** Each token in the sequence has its own timestep. Conditioning tokens—those that should remain
451
+ > un-noised—must have `timestep=0`. This is how the model distinguishes clean reference tokens from tokens to denoise. Use
452
+ > `_create_per_token_timesteps(conditioning_mask, sigma)` to set this up correctly.
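+
+ The snippet below illustrates the idea with plain tensors (illustrative only, not the helper's actual implementation):
+
+ ```python
+ import torch
+
+ seq_len = 8
+ # First two tokens are clean conditioning (e.g. first-frame tokens), the rest are denoised
+ conditioning_mask = torch.tensor([True, True, False, False, False, False, False, False])
+ sigma = 0.7  # sampled noise level for this example
+
+ timesteps = torch.full((seq_len,), sigma)
+ timesteps[conditioning_mask] = 0.0  # conditioning tokens get timestep 0
+ ```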
453
+
454
+ > [!NOTE]
455
+ > `Modality` is immutable (frozen dataclass). Use `dataclasses.replace()` to create modified copies.
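+
+ A minimal illustration of `dataclasses.replace()` with a stand-in frozen dataclass:
+
+ ```python
+ from dataclasses import dataclass, replace
+
+ @dataclass(frozen=True)
+ class Frozen:  # stand-in for a frozen dataclass such as Modality
+     latent: tuple
+     enabled: bool
+
+ original = Frozen(latent=(1, 2, 3), enabled=True)
+ updated = replace(original, latent=(4, 5, 6))  # new instance; `original` is unchanged
+ ```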
456
+
457
+ ## ✅ Testing Your Strategy
458
+
459
+ 1. **Verify your training configuration is valid:**
460
+ ```bash
461
+ uv run python -c "
462
+ from ltx_trainer.config import LtxTrainerConfig
463
+ import yaml
464
+
465
+ with open('configs/ltx2_inpainting_lora.yaml') as f:
466
+ config = LtxTrainerConfig(**yaml.safe_load(f))
467
+ print(f'Strategy: {config.training_strategy.name}')
468
+ "
469
+ ```
470
+
471
+ 2. **Test strategy instantiation:**
472
+ ```bash
473
+ uv run python -c "
474
+ from ltx_trainer.training_strategies import get_training_strategy
475
+ from ltx_trainer.training_strategies.inpainting import InpaintingConfig
476
+
477
+ config = InpaintingConfig()
478
+ strategy = get_training_strategy(config)
479
+ print(f'Data sources: {strategy.get_data_sources()}')
480
+ "
481
+ ```
482
+
483
+ 3. **Run a short training test:**
484
+ ```bash
485
+ uv run python scripts/train.py configs/ltx2_inpainting_lora.yaml
486
+ ```
487
+
488
+ ## 💡 Tips and Best Practices
489
+
490
+ ### Debugging
491
+
492
+ - Set `data.num_dataloader_workers: 0` to get clearer error messages
493
+ - Use a small dataset and few steps for initial testing
494
+ - Check tensor shapes at each step with print statements
495
+
496
+ ## 🔗 Related Documentation
497
+
498
+ - [Training Modes](training-modes.md) - Overview of built-in training modes
499
+ - [Configuration Reference](configuration-reference.md) - All configuration options
500
+ - [Dataset Preparation](dataset-preparation.md) - Preprocessing workflow
501
+ - [ltx-core Documentation](../../ltx-core/README.md) - Core model components
502
+
503
+ ## 📚 Reference: Existing Strategies
504
+
505
+ Study these implementations for guidance:
506
+
507
+ | Strategy | Complexity | Key Features |
508
+ |------------------------------------------------------------------------------------|------------|------------------------------------------------|
509
+ | [`TextToVideoStrategy`](../src/ltx_trainer/training_strategies/text_to_video.py) | Simple | First-frame conditioning, optional audio |
510
+ | [`VideoToVideoStrategy`](../src/ltx_trainer/training_strategies/video_to_video.py) | Medium | Reference video concatenation, split loss mask |
packages/ltx-trainer/docs/dataset-preparation.md ADDED
@@ -0,0 +1,342 @@
1
+ # Dataset Preparation Guide
2
+
3
+ This guide covers the complete workflow for preparing and preprocessing your dataset for training.
4
+
5
+ ## 📋 Overview
6
+
7
+ The general dataset preparation workflow is:
8
+
9
+ 1. **(Optional)** Split long videos into scenes using `split_scenes.py`
10
+ 2. **(Optional)** Generate captions for your videos using `caption_videos.py`
11
+ 3. **Preprocess your dataset** using `process_dataset.py` to compute and cache video/audio latents and text embeddings
12
+ 4. **Run the trainer** with your preprocessed dataset
13
+
14
+ ## 🎬 Step 1: Split Scenes
15
+
16
+ If you're starting with raw, long-form videos (e.g., downloaded from YouTube), you should first split them into shorter, coherent scenes.
17
+
18
+ ```bash
19
+ uv run python scripts/split_scenes.py input.mp4 scenes_output_dir/ \
20
+ --filter-shorter-than 5s
21
+ ```
22
+
23
+ This will create multiple video clips in `scenes_output_dir`.
24
+ These clips will be the input for the captioning step, if you choose to use it.
25
+
26
+ The script supports many configuration options for scene detection (detector algorithms, thresholds, minimum scene lengths, etc.):
27
+
28
+ ```bash
29
+ uv run python scripts/split_scenes.py --help
30
+ ```
31
+
32
+ ## 📝 Step 2: Caption Videos
33
+
34
+ If your dataset doesn't include captions, you can automatically generate them using multimodal models that understand both video and audio.
35
+
36
+ ```bash
37
+ uv run python scripts/caption_videos.py scenes_output_dir/ \
38
+ --output scenes_output_dir/dataset.json
39
+ ```
40
+
41
+ If you're running into VRAM issues, try enabling 8-bit quantization to reduce memory usage:
42
+
43
+ ```bash
44
+ uv run python scripts/caption_videos.py scenes_output_dir/ \
45
+ --output scenes_output_dir/dataset.json \
46
+ --use-8bit
47
+ ```
48
+
49
+ This will create a `dataset.json` file containing video paths and their captions.
50
+
51
+ **Captioning options:**
52
+
53
+ | Option | Description |
54
+ |--------|-------------|
55
+ | `--captioner-type` | `qwen_omni` (default, local) or `gemini_flash` (API) |
56
+ | `--use-8bit` | Enable 8-bit quantization for lower VRAM usage |
57
+ | `--no-audio` | Disable audio processing (video-only captions) |
58
+ | `--override` | Re-caption files that already have captions |
59
+ | `--api-key` | API key for Gemini Flash (or set `GOOGLE_API_KEY` env var) |
60
+
61
+ **Caption format:**
62
+
63
+ The captioner produces structured captions with sections for:
64
+ - **Visual content**: People, objects, actions, settings, colors, movements
65
+ - **Speech transcription**: Word-for-word transcription of spoken content
66
+ - **Sounds**: Music, ambient sounds, sound effects
67
+ - **On-screen text**: Any visible text overlays
68
+
69
+ > [!NOTE]
70
+ > The automatically generated captions may contain inaccuracies or hallucinated content.
71
+ > We recommend reviewing and correcting the generated captions in your `dataset.json` file before proceeding to preprocessing.
72
+
73
+ ## ⚡ Step 3: Dataset Preprocessing
74
+
75
+ This step preprocesses your video dataset by:
76
+
77
+ 1. Resizing and cropping videos to fit specified resolution buckets
78
+ 2. Computing and caching video latent representations
79
+ 3. Computing and caching text embeddings for captions
80
+ 4. (Optional) Computing and caching audio latents
81
+
82
+ > [!WARNING]
83
+ > Very large videos (especially high spatial resolution and/or many frames) can cause GPU out-of-memory (OOM)
84
+ > during preprocessing/encoding.
85
+ > The simplest fix is to reduce the target resolution (spatially: width/height) and/or the number of frames
86
+ > (temporally) by using `--resolution-buckets` with smaller dimensions (lower width/height and/or fewer frames).
87
+
88
+ ### Basic Usage
89
+
90
+ ```bash
91
+ uv run python scripts/process_dataset.py dataset.json \
92
+ --resolution-buckets "960x544x49" \
93
+ --model-path /path/to/ltx-2-model.safetensors \
94
+ --text-encoder-path /path/to/gemma-model
95
+ ```
96
+
97
+ ### With Audio Processing
98
+
99
+ For audio-video training, add the `--with-audio` flag:
100
+
101
+ ```bash
102
+ uv run python scripts/process_dataset.py dataset.json \
103
+ --resolution-buckets "960x544x49" \
104
+ --model-path /path/to/ltx-2-model.safetensors \
105
+ --text-encoder-path /path/to/gemma-model \
106
+ --with-audio
107
+ ```
108
+
109
+ ### 📊 Dataset Format
110
+
111
+ The trainer supports either videos or single images.
112
+ Note that your dataset must be homogeneous - either all videos or all images; mixing is not supported.
113
+
114
+ > [!TIP]
115
+ > **Image Datasets:** When using images, follow the same preprocessing steps and format requirements as with videos,
116
+ > but use `1` for the frame count in the resolution bucket (e.g., `960x544x1`).
117
+
118
+ The dataset must be a CSV, JSON, or JSONL metadata file with columns for captions and video paths:
119
+
120
+ **JSON format example:**
121
+
122
+ ```json
123
+ [
124
+ {
125
+ "caption": "A cat playing with a ball of yarn",
126
+ "media_path": "videos/cat_playing.mp4"
127
+ },
128
+ {
129
+ "caption": "A dog running in the park",
130
+ "media_path": "videos/dog_running.mp4"
131
+ }
132
+ ]
133
+ ```
134
+
135
+ **JSONL format example:**
136
+
137
+ ```jsonl
138
+ {"caption": "A cat playing with a ball of yarn", "media_path": "videos/cat_playing.mp4"}
139
+ {"caption": "A dog running in the park", "media_path": "videos/dog_running.mp4"}
140
+ ```
141
+
142
+ **CSV format example:**
143
+
144
+ ```csv
145
+ caption,media_path
146
+ "A cat playing with a ball of yarn","videos/cat_playing.mp4"
147
+ "A dog running in the park","videos/dog_running.mp4"
148
+ ```
149
+
150
+ ### 📐 Resolution Buckets
151
+
152
+ Videos are organized into "buckets" of specific dimensions (width × height × frames).
153
+ Each video is assigned to the nearest matching bucket.
154
+ You can preprocess with one or multiple resolution buckets.
155
+ When training with multiple resolution buckets, you must use a batch size of 1.
156
+
157
+ The dimensions of each bucket must follow these constraints due to LTX-2's VAE architecture:
158
+
159
+ - **Spatial dimensions** (width and height) must be multiples of 32
160
+ - **Number of frames** must satisfy `frames % 8 == 1` (e.g., 1, 9, 17, 25, 33, 41, 49, 57, 65, 73, 81, 89, 97, 121, etc.)
161
+
162
+ **Guidelines for choosing training resolution:**
163
+
164
+ - For high-quality, detailed videos: use larger spatial dimensions (e.g. 768x448) with fewer frames (e.g. 89)
165
+ - For longer, motion-focused videos: use smaller spatial dimensions (512×512) with more frames (121)
166
+ - Memory usage increases with both spatial and temporal dimensions
167
+
168
+ **Example usage:**
169
+
170
+ ```bash
171
+ uv run python scripts/process_dataset.py dataset.json \
172
+ --resolution-buckets "960x544x49" \
173
+ --model-path /path/to/ltx-2-model.safetensors \
174
+ --text-encoder-path /path/to/gemma-model
175
+ ```
176
+
177
+ Multiple buckets are supported by separating entries with `;`:
178
+
179
+ ```bash
180
+ uv run python scripts/process_dataset.py dataset.json \
181
+ --resolution-buckets "960x544x49;512x512x49" \
182
+ --model-path /path/to/ltx-2-model.safetensors \
183
+ --text-encoder-path /path/to/gemma-model
184
+ ```
185
+
186
+ **Video processing workflow:**
187
+
188
+ 1. Videos are **resized** maintaining aspect ratio until either width or height matches the target
189
+ 2. The larger dimension is **center cropped** to match the bucket's dimensions
190
+ 3. Only the **first X frames are taken** to match the bucket's frame count, remaining frames are ignored
191
+
192
+ > [!NOTE]
193
+ > The sequence length processed by the transformer model can be calculated as:
194
+ >
195
+ > ```
196
+ > sequence_length = (H/32) * (W/32) * ((F-1)/8 + 1)
197
+ > ```
198
+ >
199
+ > Where:
200
+ > - H = Height of video
201
+ > - W = Width of video
202
+ > - F = Number of frames
203
+ > - 32 = VAE's spatial downsampling factor
204
+ > - 8 = VAE's temporal downsampling factor
205
+ >
206
+ > For example, a 768×448×89 video would have sequence length:
207
+ > ```
208
+ > (768/32) * (448/32) * ((89-1)/8 + 1) = 24 * 14 * 12 = 4,032
209
+ > ```
210
+ >
211
+ > Keep this in mind when choosing video dimensions, as longer sequences require more GPU memory.
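+ >
+ > The same calculation as a small Python helper (a sketch that simply implements the formula above):
+ >
+ > ```python
+ > def ltx2_sequence_length(width: int, height: int, frames: int) -> int:
+ >     assert width % 32 == 0 and height % 32 == 0 and frames % 8 == 1
+ >     return (width // 32) * (height // 32) * ((frames - 1) // 8 + 1)
+ >
+ > print(ltx2_sequence_length(768, 448, 89))  # 4032
+ > ```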
212
+
213
+ > [!WARNING]
214
+ > When training with multiple resolution buckets, you must use a batch size of 1
215
+ > (i.e., set `optimization.batch_size: 1` in your training config).
216
+
217
+ ### 📁 Output Structure
218
+
219
+ The preprocessed data is saved in a `.precomputed` directory:
220
+
221
+ ```
222
+ dataset/
223
+ └── .precomputed/
224
+ ├── latents/ # Cached video latents
225
+ ├── conditions/ # Cached text embeddings
226
+ ├── audio_latents/ # (only if --with-audio) Cached audio latents
227
+ └── reference_latents/ # (only for IC-LoRA) Cached reference video latents
228
+ ```
229
+
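+ After preprocessing finishes, a quick file-count check over these directories can catch incomplete runs
+ (for example, a clip that failed to encode). A minimal sketch; the directory names come from the layout above,
+ while the dataset location is a placeholder:
+
+ ```python
+ from pathlib import Path
+
+ precomputed = Path("dataset/.precomputed")  # adjust to your dataset root
+
+ for name in ("latents", "conditions", "audio_latents", "reference_latents"):
+     subdir = precomputed / name
+     if subdir.is_dir():
+         n_files = sum(1 for f in subdir.rglob("*") if f.is_file())
+         print(f"{name}: {n_files} files")
+     else:
+         print(f"{name}: not present (only created for some configurations)")
+ ```
+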
230
+ ## 🪄 IC-LoRA Reference Video Preprocessing
231
+
232
+ For IC-LoRA training, you need to preprocess datasets that include reference videos.
233
+ Reference videos provide the conditioning input while target videos represent the desired transformed output.
234
+
235
+ ### Dataset Format with Reference Videos
236
+
237
+ **JSON format:**
238
+
239
+ ```json
240
+ [
241
+ {
242
+ "caption": "A cat playing with a ball of yarn",
243
+ "media_path": "videos/cat_playing.mp4",
244
+ "reference_path": "references/cat_playing_depth.mp4"
245
+ }
246
+ ]
247
+ ```
248
+
249
+ **JSONL format:**
250
+
251
+ ```jsonl
252
+ {"caption": "A cat playing with a ball of yarn", "media_path": "videos/cat_playing.mp4", "reference_path": "references/cat_playing_depth.mp4"}
253
+ {"caption": "A dog running in the park", "media_path": "videos/dog_running.mp4", "reference_path": "references/dog_running_depth.mp4"}
254
+ ```
255
+
256
+ ### Preprocessing with Reference Videos
257
+
258
+ To preprocess a dataset with reference videos, add the `--reference-column` argument specifying the name of the field
259
+ in your dataset JSON/JSONL/CSV that contains the reference video paths:
260
+
261
+ ```bash
262
+ uv run python scripts/process_dataset.py dataset.json \
263
+ --resolution-buckets "960x544x49" \
264
+ --model-path /path/to/ltx-2-model.safetensors \
265
+ --text-encoder-path /path/to/gemma-model \
266
+ --reference-column "reference_path"
267
+ ```
268
+
269
+ This will create an additional `reference_latents/` directory containing the preprocessed reference video latents.
270
+
271
+
272
+ ### Generating Reference Videos
273
+
274
+ **Dataset Requirements for IC-LoRA:**
275
+
276
+ - Your dataset must contain paired videos where each target video has a corresponding reference video
277
+ - Reference and target videos must have *identical* resolution and length
278
+ - Both reference and target videos should be preprocessed together using the same resolution buckets
279
+
280
+ We provide an example script, [`scripts/compute_reference.py`](../scripts/compute_reference.py), to generate reference
281
+ videos for a given dataset. The default implementation generates Canny edge reference videos.
282
+
283
+ ```bash
284
+ uv run python scripts/compute_reference.py scenes_output_dir/ \
285
+ --output scenes_output_dir/dataset.json
286
+ ```
287
+
288
+ The script accepts a JSON file as the dataset configuration and updates it in-place by adding the filenames of the generated reference videos.
289
+
290
+ If you want to generate a different type of condition (depth maps, pose skeletons, etc.), modify or replace the `compute_reference()` function within this script.
291
+
292
+ ### Example Dataset
293
+
294
+ For reference, see our **[Canny Control Dataset](https://huggingface.co/datasets/Lightricks/Canny-Control-Dataset)** which demonstrates proper IC-LoRA dataset structure with paired videos and Canny edge maps.
295
+
296
+
297
+ ## 🎯 LoRA Trigger Words
298
+
299
+ When training a LoRA, you can specify a trigger token that will be prepended to all captions:
300
+
301
+ ```bash
302
+ uv run python scripts/process_dataset.py dataset.json \
303
+ --resolution-buckets "960x544x49" \
304
+ --model-path /path/to/ltx-2-model.safetensors \
305
+ --text-encoder-path /path/to/gemma-model \
306
+ --lora-trigger "MYTRIGGER"
307
+ ```
308
+
309
+ This acts as a trigger word that activates the LoRA during inference when you include the same token in your prompts.
310
+
311
+ > [!NOTE]
312
+ > There is no need to manually insert the trigger word into your dataset JSON/JSONL/CSV file.
313
+ > The trigger word specified with `--lora-trigger` is automatically prepended to each caption during preprocessing.
314
+
315
+ ## 🔍 Decoding Videos for Verification
316
+
317
+ If you add the `--decode` flag, the script will VAE-decode the precomputed latents and save the resulting videos
318
+ in `.precomputed/decoded_videos`. When audio preprocessing is enabled (`--with-audio`), audio latents will also be
319
+ decoded and saved to `.precomputed/decoded_audio`. This allows you to visually and audibly inspect the processed data.
320
+
321
+ ```bash
322
+ uv run python scripts/process_dataset.py dataset.json \
323
+ --resolution-buckets "960x544x49" \
324
+ --model-path /path/to/ltx-2-model.safetensors \
325
+ --text-encoder-path /path/to/gemma-model \
326
+ --decode
327
+ ```
328
+
329
+ For single-frame images, the decoded latents will be saved as PNG files rather than MP4 videos.
330
+
331
+ ## 🚀 Next Steps
332
+
333
+ Once your dataset is preprocessed, you can proceed to:
334
+
335
+ - Configure your training parameters in [Configuration Reference](configuration-reference.md)
336
+ - Choose your training approach in [Training Modes](training-modes.md)
337
+ - Start training with the [Training Guide](training-guide.md)
338
+
339
+ > [!TIP]
340
+ > If your training recipe requires additional preprocessed data (e.g., masks, conditioning signals), see
341
+ > [Implementing Custom Training Strategies](custom-training-strategies.md) for guidance on extending the
342
+ > preprocessing pipeline.
packages/ltx-trainer/docs/quick-start.md ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Quick Start Guide
2
+
3
+ Get up and running with LTX-2 training in just a few steps!
4
+
5
+ ## 📋 Prerequisites
6
+
7
+ Before you begin, ensure you have:
8
+
9
+ 1. **LTX-2 Model Checkpoint** - A local `.safetensors` file containing the LTX-2 model weights.
10
+ Download `ltx-2-19b-dev.safetensors` from: [HuggingFace Hub](https://huggingface.co/Lightricks/LTX-2)
11
+ 2. **Gemma Text Encoder** - A local directory containing the Gemma model (required for LTX-2).
12
+ Download from: [HuggingFace Hub](https://huggingface.co/google/gemma-3-12b-it-qat-q4_0-unquantized/)
13
+ 3. **Linux with CUDA** - The trainer requires `triton`, which is Linux-only
14
+ 4. **GPU with sufficient VRAM** - 80GB recommended for the standard config. For GPUs with 32GB VRAM (e.g., RTX 5090),
15
+ use the [low VRAM config](../configs/ltx2_av_lora_low_vram.yaml) which enables INT8 quantization and other
16
+ memory optimizations
17
+
18
+ ## ⚡ Installation
19
+
20
+ First, install [uv](https://docs.astral.sh/uv/getting-started/installation/) if you haven't already.
21
+ Then clone the repository and install the dependencies:
22
+
23
+ ```bash
24
+ git clone https://github.com/Lightricks/LTX-2
+ cd LTX-2
25
+ ```
26
+
27
+ The `ltx-trainer` package is part of the `LTX-2` monorepo. Install the dependencies from the repository root,
28
+ then navigate to the trainer package:
29
+
30
+ ```bash
31
+ # From the repository root
32
+ uv sync
33
+ cd packages/ltx-trainer
34
+ ```
35
+
36
+ > [!NOTE]
37
+ > The trainer depends on [`ltx-core`](../../ltx-core/) and [`ltx-pipelines`](../../ltx-pipelines/)
38
+ > packages which are automatically installed from the monorepo.
39
+
40
+ ## 🏋 Training Workflow
41
+
42
+ ### 1. Prepare Your Dataset
43
+
44
+ Organize your videos and captions, then preprocess them:
45
+
46
+ ```bash
47
+ # Split long videos into scenes (optional)
48
+ uv run python scripts/split_scenes.py input.mp4 scenes_output_dir/ --filter-shorter-than 5s
49
+
50
+ # Generate captions for videos (optional)
51
+ uv run python scripts/caption_videos.py scenes_output_dir/ --output dataset.json
52
+
53
+ # Preprocess the dataset (compute latents and embeddings)
54
+ uv run python scripts/process_dataset.py dataset.json \
55
+ --resolution-buckets "960x544x49" \
56
+ --model-path /path/to/ltx-2-model.safetensors \
57
+ --text-encoder-path /path/to/gemma-model
58
+ ```
59
+
60
+ See [Dataset Preparation](dataset-preparation.md) for detailed instructions.
61
+
62
+ ### 2. Configure Training
63
+
64
+ Create or modify a configuration YAML file. Start with one of the example configs:
65
+
66
+ - [`configs/ltx2_av_lora.yaml`](../configs/ltx2_av_lora.yaml) - Audio-video LoRA training
67
+ - [`configs/ltx2_av_lora_low_vram.yaml`](../configs/ltx2_av_lora_low_vram.yaml) - Audio-video LoRA training (optimized for 32GB VRAM)
68
+ - [`configs/ltx2_v2v_ic_lora.yaml`](../configs/ltx2_v2v_ic_lora.yaml) - IC-LoRA video-to-video
69
+
70
+ Key settings to update:
71
+
72
+ ```yaml
73
+ model:
74
+ model_path: "/path/to/ltx-2-model.safetensors"
75
+ text_encoder_path: "/path/to/gemma-model"
76
+
77
+ data:
78
+ preprocessed_data_root: "/path/to/preprocessed/data"
79
+
80
+ output_dir: "outputs/my_training_run"
81
+ ```
82
+
83
+ See [Configuration Reference](configuration-reference.md) for all available options.
84
+
85
+ ### 3. Start Training
86
+
87
+ ```bash
88
+ uv run python scripts/train.py configs/ltx2_av_lora.yaml
89
+ ```
90
+
91
+ For multi-GPU training:
92
+
93
+ ```bash
94
+ uv run accelerate launch scripts/train.py configs/ltx2_av_lora.yaml
95
+ ```
96
+
97
+ See [Training Guide](training-guide.md) for distributed training and advanced options.
98
+
99
+ ## 🎯 Training Modes
100
+
101
+ The trainer supports several training modes:
102
+
103
+ | Mode | Description | Config Example |
104
+ |----------------------|--------------------------------|--------------------------------------------|
105
+ | **LoRA** | Efficient adapter training | `training_strategy.name: "text_to_video"` |
106
+ | **Audio-Video LoRA** | Joint audio-video training | `training_strategy.with_audio: true` |
107
+ | **IC-LoRA** | Video-to-video transformations | `training_strategy.name: "video_to_video"` |
108
+ | **Full Fine-tuning** | Full model training | `model.training_mode: "full"` |
109
+
110
+ See [Training Modes](training-modes.md) for detailed explanations,
111
+ or [Custom Training Strategies](custom-training-strategies.md) if you need to implement your own training recipe.
112
+
113
+ ## Next Steps
114
+
115
+ Once you've completed your first training run, you can:
116
+
117
+ - **Use your trained LoRA for inference** - The [`ltx-pipelines`](../../ltx-pipelines/) package provides
118
+ production-ready inference
119
+ pipelines for various use cases (T2V, I2V, IC-LoRA, etc.). See the package documentation for details.
120
+ - Learn more about [Dataset Preparation](dataset-preparation.md) for advanced preprocessing
121
+ - Explore different [Training Modes](training-modes.md) (LoRA, Audio-Video, IC-LoRA)
122
+ - Dive deeper into [Training Configuration](configuration-reference.md)
123
+ - Understand the model architecture in [LTX-Core Documentation](../../ltx-core/README.md)
124
+
125
+ ## Need Help?
126
+
127
+ If you run into issues at any step, see the [Troubleshooting Guide](troubleshooting.md) for solutions to common
128
+ problems.
129
+
130
+ Join our [Discord community](https://discord.gg/ltxplatform) for real-time help and discussion!
packages/ltx-trainer/docs/training-guide.md ADDED
@@ -0,0 +1,203 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Training Guide
2
+
3
+ This guide covers how to run training jobs, from basic single-GPU training to advanced distributed setups and automatic
4
+ model uploads.
5
+
6
+ ## ⚡ Basic Training (Single GPU)
7
+
8
+ After preprocessing your dataset and preparing a configuration file, you can start training using the trainer script:
9
+
10
+ ```bash
11
+ uv run python scripts/train.py configs/ltx2_av_lora.yaml
12
+ ```
13
+
14
+ The trainer will:
15
+
16
+ 1. **Load your configuration** and validate all parameters
17
+ 2. **Initialize models** and apply optimizations
18
+ 3. **Run the training loop** with progress tracking
19
+ 4. **Generate validation videos** (if configured)
20
+ 5. **Save the trained weights** in your output directory
21
+
22
+ ### Output Files
23
+
24
+ **For LoRA training:**
25
+
26
+ - `lora_weights.safetensors` - Main LoRA weights file
27
+ - `training_config.yaml` - Copy of training configuration
28
+ - `validation_samples/` - Generated validation videos (if enabled)
29
+
30
+ **For full model fine-tuning:**
31
+
32
+ - `model_weights.safetensors` - Full model weights
33
+ - `training_config.yaml` - Copy of training configuration
34
+ - `validation_samples/` - Generated validation videos (if enabled)
35
+
36
+ ## 🖥️ Distributed / Multi-GPU Training
37
+
38
+ We use Hugging Face 🤗 [Accelerate](https://huggingface.co/docs/accelerate/index) for multi-GPU DDP and FSDP.
39
+
40
+ ### Configure Accelerate
41
+
42
+ Run the interactive wizard once to set up your environment (DDP / FSDP, GPU count, etc.):
43
+
44
+ ```bash
45
+ uv run accelerate config
46
+ ```
47
+
48
+ This stores your preferences in `~/.cache/huggingface/accelerate/default_config.yaml`.
49
+
50
+ ### Use the Provided Accelerate Configs (Recommended)
51
+
52
+ We include ready-to-use Accelerate config files in `configs/accelerate/`:
53
+
54
+ - [ddp.yaml](../configs/accelerate/ddp.yaml) — Standard DDP
55
+ - [ddp_compile.yaml](../configs/accelerate/ddp_compile.yaml) — DDP with `torch.compile` (Inductor)
56
+ - [fsdp.yaml](../configs/accelerate/fsdp.yaml) — Standard FSDP (auto-wraps `BasicAVTransformerBlock`)
57
+ - [fsdp_compile.yaml](../configs/accelerate/fsdp_compile.yaml) — FSDP with `torch.compile` (Inductor)
58
+
59
+ Launch with a specific config using `--config_file`:
60
+
61
+ ```bash
62
+ # DDP (2 GPUs shown as example)
63
+ CUDA_VISIBLE_DEVICES=0,1 \
64
+ uv run accelerate launch --config_file configs/accelerate/ddp.yaml \
65
+ scripts/train.py configs/ltx2_av_lora.yaml
66
+
67
+ # DDP + torch.compile
68
+ CUDA_VISIBLE_DEVICES=0,1 \
69
+ uv run accelerate launch --config_file configs/accelerate/ddp_compile.yaml \
70
+ scripts/train.py configs/ltx2_av_lora.yaml
71
+
72
+ # FSDP (4 GPUs shown as example)
73
+ CUDA_VISIBLE_DEVICES=0,1,2,3 \
74
+ uv run accelerate launch --config_file configs/accelerate/fsdp.yaml \
75
+ scripts/train.py configs/ltx2_av_lora.yaml
76
+
77
+ # FSDP + torch.compile
78
+ CUDA_VISIBLE_DEVICES=0,1,2,3 \
79
+ uv run accelerate launch --config_file configs/accelerate/fsdp_compile.yaml \
80
+ scripts/train.py configs/ltx2_av_lora.yaml
81
+ ```
82
+
83
+ **Notes:**
84
+
85
+ - The number of processes is taken from the Accelerate config (`num_processes`). Override with `--num_processes X` or
86
+ restrict GPUs with `CUDA_VISIBLE_DEVICES`.
87
+ - The compile variants enable `torch.compile` with the Inductor backend via Accelerate's `dynamo_config`.
88
+ - FSDP configs auto-wrap the transformer blocks (`fsdp_transformer_layer_cls_to_wrap: BasicAVTransformerBlock`).
89
+
90
+ ### Launch with Your Default Accelerate Config
91
+
92
+ If you prefer to use your default Accelerate profile:
93
+
94
+ ```bash
95
+ # Use settings from your default accelerate config
96
+ uv run accelerate launch scripts/train.py configs/ltx2_av_lora.yaml
97
+
98
+ # Override number of processes on the fly (e.g., 2 GPUs)
99
+ uv run accelerate launch --num_processes 2 scripts/train.py configs/ltx2_av_lora.yaml
100
+
101
+ # Select specific GPUs
102
+ CUDA_VISIBLE_DEVICES=0,1 uv run accelerate launch scripts/train.py configs/ltx2_av_lora.yaml
103
+ ```
104
+
105
+ > [!TIP]
106
+ > You can disable the in-terminal progress bars with the `--disable-progress-bars` flag in the trainer CLI if desired.
107
+
108
+ ### Benefits of Distributed Training
109
+
110
+ - **Faster training**: Distribute workload across multiple GPUs
111
+ - **Larger effective batch sizes**: Combine gradients from multiple GPUs
112
+ - **Memory efficiency**: Each GPU handles a portion of the batch
113
+
114
+ > [!NOTE]
115
+ > Distributed training requires that all GPUs have sufficient memory for the model and batch size. The effective batch
116
+ > size becomes `batch_size × num_processes`.
117
+
118
+ ## 🤗 Pushing Models to Hugging Face Hub
119
+
120
+ You can automatically push your trained models to the Hugging Face Hub by adding the following to your configuration:
121
+
122
+ ```yaml
123
+ hub:
124
+ push_to_hub: true
125
+ hub_model_id: "your-username/your-model-name"
126
+ ```
127
+
128
+ ### Prerequisites
129
+
130
+ Before pushing, make sure you:
131
+
132
+ 1. **Have a Hugging Face account** - Sign up at [huggingface.co](https://huggingface.co)
133
+ 2. **Are logged in** via `huggingface-cli login` or have set the `HUGGING_FACE_HUB_TOKEN` environment variable
134
+ 3. **Have write access** to the specified repository (it will be created if it doesn't exist)
135
+
136
+ ### Login Options
137
+
138
+ **Option 1: Interactive login**
139
+
140
+ ```bash
141
+ uv run huggingface-cli login
142
+ ```
143
+
144
+ **Option 2: Environment variable**
145
+
146
+ ```bash
147
+ export HUGGING_FACE_HUB_TOKEN="your_token_here"
148
+ ```
149
+
150
+ ### What Gets Uploaded
151
+
152
+ The trainer will automatically:
153
+
154
+ - **Create a model card** with training details and sample outputs
155
+ - **Upload model weights**
156
+ - **Push sample videos as GIFs** in the model card
157
+ - **Include training configuration and prompts**
158
+
159
+ ## 📊 Weights & Biases Logging
160
+
161
+ Enable experiment tracking with W&B by adding to your configuration:
162
+
163
+ ```yaml
164
+ wandb:
165
+ enabled: true
166
+ project: "ltx-2-trainer"
167
+ entity: null # Your W&B username or team
168
+ tags: [ "ltx2", "lora" ]
169
+ log_validation_videos: true
170
+ ```
171
+
172
+ This will log:
173
+
174
+ - Training loss and learning rate
175
+ - Validation videos
176
+ - Model configuration
177
+ - Training progress
178
+
179
+ ## 🚀 Next Steps
180
+
181
+ After training completes:
182
+
183
+ - **Run inference with your trained LoRA** - The [`ltx-pipelines`](../../ltx-pipelines/) package provides
184
+ production-ready inference
185
+ pipelines that support loading custom LoRAs. Available pipelines include text-to-video, image-to-video,
186
+ IC-LoRA video-to-video, and more. See the [`ltx-pipelines`](../../ltx-pipelines/) package for usage details.
187
+ - **Test your model** with validation prompts
188
+ - **Iterate and improve** based on validation results
189
+ - **Share your results** by pushing to Hugging Face Hub
190
+
191
+ ## 💡 Tips for Successful Training
192
+
193
+ - **Start small**: Begin with a small dataset and a few hundred steps to verify everything works
194
+ - **Monitor validation**: Keep an eye on validation samples to catch overfitting
195
+ - **Adjust learning rate**: Lower learning rates often produce better results
196
+ - **Use gradient checkpointing**: Essential for training with limited GPU memory
197
+ - **Save checkpoints**: Regular checkpoints help recover from interruptions
198
+
199
+ ## Need Help?
200
+
201
+ If you encounter issues during training, see the [Troubleshooting Guide](troubleshooting.md).
202
+
203
+ Join our [Discord community](https://discord.gg/ltxplatform) for real-time help!
packages/ltx-trainer/docs/training-modes.md ADDED
@@ -0,0 +1,277 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Training Modes Guide
2
+
3
+ The trainer supports several training modes, each suited for different use cases and requirements.
4
+
5
+ ## 🎯 Standard LoRA Training (Video-Only)
6
+
7
+ Standard LoRA (Low-Rank Adaptation) training fine-tunes the model by adding small, trainable adapter layers while
8
+ keeping the base model frozen. This approach:
9
+
10
+ - **Requires significantly less memory and compute** than full fine-tuning
11
+ - **Produces small, portable weight files** (typically a few hundred MB)
12
+ - **Is ideal for learning specific styles, effects, or concepts**
13
+ - **Can be easily combined with other LoRAs** during inference
14
+
15
+ Configure standard LoRA training with:
16
+
17
+ ```yaml
18
+ model:
19
+ training_mode: "lora"
20
+
21
+ training_strategy:
22
+ name: "text_to_video"
23
+ first_frame_conditioning_p: 0.1
24
+ with_audio: false # Video-only training
25
+ ```
26
+
27
+ ## 🔊 Audio-Video LoRA Training
28
+
29
+ LTX-2 supports joint audio-video generation. You can train LoRA adapters that affect both video and audio output:
30
+
31
+ - **Synchronized audio-video generation** - Audio matches the visual content
32
+ - **Same efficient LoRA approach** - Just enable audio training
33
+ - **Requires audio latents** - Dataset must include preprocessed audio
34
+
35
+ Configure audio-video training with:
36
+
37
+ ```yaml
38
+ model:
39
+ training_mode: "lora"
40
+
41
+ training_strategy:
42
+ name: "text_to_video"
43
+ first_frame_conditioning_p: 0.1
44
+ with_audio: true # Enable audio training
45
+ audio_latents_dir: "audio_latents" # Directory containing audio latents
46
+ ```
47
+
48
+ **Example configuration file:**
49
+
50
+ - 📄 [Audio-Video LoRA Training](../configs/ltx2_av_lora.yaml)
51
+
52
+ **Dataset structure for audio-video training:**
53
+
54
+ ```
55
+ preprocessed_data_root/
56
+ ├── latents/ # Video latents
57
+ ├── conditions/ # Text embeddings
58
+ └── audio_latents/ # Audio latents (required when with_audio: true)
59
+ ```
60
+
61
+ > [!IMPORTANT]
62
+ > When training audio-video LoRAs, ensure your `target_modules` configuration captures video, audio, and
63
+ > cross-modal attention branches. Use patterns like `"to_k"` instead of `"attn1.to_k"` to match:
64
+ > - Video modules: `attn1.to_k`, `attn2.to_k`
65
+ > - Audio modules: `audio_attn1.to_k`, `audio_attn2.to_k`
66
+ > - Cross-modal modules: `audio_to_video_attn.to_k`, `video_to_audio_attn.to_k`
67
+ >
68
+ > The cross-modal attention modules (`audio_to_video_attn` and `video_to_audio_attn`) enable bidirectional
69
+ > information flow between audio and video, which is critical for synchronized audiovisual generation.
70
+ > See [Understanding Target Modules](configuration-reference.md#understanding-target-modules) for detailed guidance.
71
+
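+ To check which branches a pattern would reach, you can dry-run it against the module names listed above. This sketch
+ assumes simple suffix matching on the dotted module path (PEFT-style target modules); the trainer's exact matching
+ rules may differ:
+
+ ```python
+ # Module names from the note above (the real model repeats these across many blocks).
+ module_names = [
+     "attn1.to_k", "attn2.to_k",
+     "audio_attn1.to_k", "audio_attn2.to_k",
+     "audio_to_video_attn.to_k", "video_to_audio_attn.to_k",
+ ]
+
+ def matches(name: str, patterns: list[str]) -> bool:
+     # Suffix match on the dotted path: "to_k" matches "audio_attn1.to_k", etc.
+     return any(name == p or name.endswith("." + p) for p in patterns)
+
+ print([m for m in module_names if matches(m, ["attn1.to_k"])])  # only the video self-attention
+ print([m for m in module_names if matches(m, ["to_k"])])        # all six modules above
+ ```
+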
72
+ > [!NOTE]
73
+ > You can generate audio during validation even if you're not training the audio branch.
74
+ > Set `validation.generate_audio: true` independently of `training_strategy.with_audio`.
75
+
76
+ ## 🔥 Full Model Fine-tuning
77
+
78
+ Full model fine-tuning updates all parameters of the base model, providing maximum flexibility but
79
+ requiring substantial computational resources and larger training datasets:
80
+
81
+ - **Offers the highest potential quality and capability improvements**
82
+ - **Requires multiple GPUs** and distributed training techniques (e.g., FSDP)
83
+ - **Produces large checkpoint files** (several GB)
84
+ - **Best for major model adaptations** or when LoRA limitations are reached
85
+
86
+ Configure full fine-tuning with:
87
+
88
+ ```yaml
89
+ model:
90
+ training_mode: "full"
91
+
92
+ training_strategy:
93
+ name: "text_to_video"
94
+ first_frame_conditioning_p: 0.1
95
+ ```
96
+
97
+ > [!IMPORTANT]
98
+ > Full fine-tuning of LTX-2 requires multiple high-end GPUs (e.g., 4-8× H100 80GB) and distributed
99
+ > training with FSDP. See [Training Guide](training-guide.md) for multi-GPU setup instructions.
100
+
101
+ ## 🔄 In-Context LoRA (IC-LoRA) Training
102
+
103
+ IC-LoRA is a specialized training mode for video-to-video transformations.
104
+ Unlike standard training modes that learn from individual videos, IC-LoRA learns transformations from pairs of videos.
105
+ IC-LoRA enables a wide range of advanced video-to-video applications, such as:
106
+
107
+ - **Control adapters** (e.g., Depth, Pose): Learn to map from a control signal (like a depth map or pose skeleton) to a
108
+ target video
109
+ - **Video deblurring**: Transform blurry input videos into sharp, high-quality outputs
110
+ - **Style transfer**: Apply the style of a reference video to a target video sequence
111
+ - **Colorization**: Convert grayscale reference videos into colorized outputs
112
+ - **Restoration and enhancement**: Denoise, upscale, or restore old or degraded videos
113
+
114
+ By providing paired reference and target videos, IC-LoRA can learn complex transformations that go beyond caption-based
115
+ conditioning.
116
+
117
+ IC-LoRA training fundamentally differs from standard LoRA and full fine-tuning:
118
+
119
+ - **Reference videos** provide clean, unnoised conditioning input showing the "before" state
120
+ - **Target videos** are noised during training and represent the desired "after" state
121
+ - **The model learns transformations** from reference videos to target videos
122
+ - **Loss is applied only to the target portion**, not the reference
123
+ - **Training and inference time increase significantly** due to the doubled sequence length
124
+
125
+ To enable IC-LoRA training, configure your YAML file with:
126
+
127
+ ```yaml
128
+ model:
129
+ training_mode: "lora" # Required: IC-LoRA uses LoRA mode
130
+
131
+ training_strategy:
132
+ name: "video_to_video"
133
+ first_frame_conditioning_p: 0.1
134
+ reference_latents_dir: "reference_latents" # Directory for reference video latents
135
+ ```
136
+
137
+ **Example configuration file:**
138
+
139
+ - 📄 [IC-LoRA Training](../configs/ltx2_v2v_ic_lora.yaml) - Video-to-video transformation training
140
+
141
+ ### Dataset Requirements for IC-LoRA
142
+
143
+ - Your dataset must contain **paired videos** where each target video has a corresponding reference video
144
+ - Reference and target videos must have the **same frame count** (length)
145
+ - Reference videos can optionally be at **lower spatial resolution** than target videos (
146
+ see [Scaled Reference Conditioning](#scaled-reference-conditioning) below)
147
+ - Both reference and target videos should be **preprocessed** before training
148
+
149
+ **Dataset structure for IC-LoRA training:**
150
+
151
+ ```
152
+ preprocessed_data_root/
153
+ ├── latents/ # Target video latents (what the model learns to generate)
154
+ ├── conditions/ # Text embeddings for each video
155
+ └── reference_latents/ # Reference video latents (conditioning input)
156
+ ```
157
+
158
+ ### Generating Reference Videos
159
+
160
+ We provide an example script to generate reference videos (e.g., Canny edge maps) for a given dataset.
161
+ The script takes a JSON file as input (e.g., output of `caption_videos.py`) and updates it with the generated reference
162
+ video paths.
163
+
164
+ ```bash
165
+ uv run python scripts/compute_reference.py scenes_output_dir/ \
166
+ --output scenes_output_dir/dataset.json
167
+ ```
168
+
169
+ To compute a different condition (depth maps, pose skeletons, etc.), modify the `compute_reference()` function in the
170
+ script.
171
+
172
+ ### Configuration Requirements for IC-LoRA
173
+
174
+ - You **must** provide `reference_videos` in your validation configuration when using IC-LoRA training
175
+ - The number of reference videos must match the number of validation prompts
176
+
177
+ Example validation configuration for IC-LoRA:
178
+
179
+ ```yaml
180
+ validation:
181
+ prompts:
182
+ - "First prompt describing the desired output"
183
+ - "Second prompt describing the desired output"
184
+ reference_videos:
185
+ - "/path/to/reference1.mp4"
186
+ - "/path/to/reference2.mp4"
187
+ reference_downscale_factor: 1 # Set to match preprocessing (e.g., 2 for half resolution)
188
+ include_reference_in_output: true # Show reference side-by-side with output
189
+ ```
190
+
191
+ ### Scaled Reference Conditioning
192
+
193
+ For more efficient training and inference, you can use **downscaled reference videos** while keeping target videos at
194
+ full resolution. This reduces the number of conditioning tokens, leading to:
195
+
196
+ - **Faster training** due to shorter sequence lengths
197
+ - **Faster inference** with reduced memory usage
198
+ - **Same aspect ratio** maintained between reference and target
199
+
200
+ #### How It Works
201
+
202
+ When the reference video has resolution `H/n × W/n` and the target video has resolution `H × W`, the trainer
203
+ automatically detects this scale factor `n` and adjusts the positional encodings so that the reference positions
204
+ map to the correct locations in the target coordinate space.
205
+
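+ Concretely, the factor can be recovered from the two resolutions. A small illustrative check (the trainer performs
+ its own validation; this helper is not part of its API):
+
+ ```python
+ def infer_reference_scale(target_hw: tuple[int, int], reference_hw: tuple[int, int]) -> int:
+     """Infer n given a target of (H, W) and a reference of (H/n, W/n)."""
+     (th, tw), (rh, rw) = target_hw, reference_hw
+     if th % rh or tw % rw or th // rh != tw // rw:
+         raise ValueError("Reference must be the target divided by the same integer on both axes")
+     if rh % 32 or rw % 32:
+         raise ValueError("Reference dimensions must still be divisible by 32")
+     return th // rh
+
+ print(infer_reference_scale((768, 768), (384, 384)))  # 2
+ ```
+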
206
+ #### Preprocessing Datasets with Scaled References
207
+
208
+ Use the `--reference-downscale-factor` option when running `process_dataset.py`:
209
+
210
+ ```bash
211
+ # Process dataset with scaled reference videos (half resolution)
212
+ uv run python scripts/process_dataset.py dataset.json \
213
+ --resolution-buckets 768x768x25 \
214
+ --model-path /path/to/ltx2.safetensors \
215
+ --text-encoder-path /path/to/gemma \
216
+ --reference-column "reference_path" \
217
+ --reference-downscale-factor 2
218
+ ```
219
+
220
+ This will:
221
+
222
+ - Process target videos at 768×768 resolution
223
+ - Process reference videos at 384×384 resolution (768 / 2)
224
+ - The trainer will automatically infer the scale factor from the dimension ratio
225
+
226
+ **Important**: Set `reference_downscale_factor: 2` in your validation configuration to match the preprocessing:
227
+
228
+ ```yaml
229
+ validation:
230
+ reference_downscale_factor: 2 # Must match the preprocessing factor
231
+ reference_videos:
232
+ - "/path/to/reference1.mp4"
233
+ - "/path/to/reference2.mp4"
234
+ ```
235
+
236
+ > [!NOTE]
237
+ > The scale factor must be a positive integer, and all dimensions must be divisible by 32.
238
+ > Common scale factors are 1 (no scaling), 2 (half resolution), or 4 (quarter resolution).
239
+
240
+ ## 📊 Training Mode Comparison
241
+
242
+ | Aspect | LoRA | Audio-Video LoRA | Full Fine-tuning | IC-LoRA |
243
+ |----------------------|--------------------------------|--------------------------------|------------------|--------------------------------|
244
+ | **Memory Usage** | Low | Low-Medium | High | Medium |
245
+ | **Training Speed** | Fast | Fast | Slow | Medium |
246
+ | **Output Size** | 100MB-few GB (depends on rank) | 100MB-few GB (depends on rank) | Tens of GB | 100MB-few GB (depends on rank) |
247
+ | **Flexibility** | Medium | Medium | High | Specialized |
248
+ | **Audio Support** | Optional | Yes | Optional | No |
249
+ | **Reference Videos** | No | No | No | Yes (required) |
250
+
251
+ ## 🎬 Using Trained Models for Inference
252
+
253
+ After training, use the [`ltx-pipelines`](../../ltx-pipelines/) package for production inference with your trained
254
+ LoRAs:
255
+
256
+ | Training Mode | Recommended Pipeline |
257
+ |-------------------------|-------------------------------------------------------|
258
+ | LoRA / Audio-Video LoRA | `TI2VidOneStagePipeline` or `TI2VidTwoStagesPipeline` |
259
+ | IC-LoRA | `ICLoraPipeline` |
260
+
261
+ All pipelines support loading custom LoRAs via the `loras` parameter. See the [`ltx-pipelines`](../../ltx-pipelines/)
262
+ package
263
+ documentation for detailed usage instructions.
264
+
265
+ ## 🚀 Next Steps
266
+
267
+ Once you've chosen your training mode:
268
+
269
+ - Set up your dataset using [Dataset Preparation](dataset-preparation.md)
270
+ - Configure your training parameters in [Configuration Reference](configuration-reference.md)
271
+ - Start training with the [Training Guide](training-guide.md)
272
+
273
+ > [!TIP]
274
+ > Need a training mode that's not covered here?
275
+ > See [Implementing Custom Training Strategies](custom-training-strategies.md)
276
+ > to learn how to create your own strategy for specialized use cases like video inpainting, audio-only training, or
277
+ > custom conditioning.
packages/ltx-trainer/docs/troubleshooting.md ADDED
@@ -0,0 +1,300 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Troubleshooting Guide
2
+
3
+ This guide covers common issues and solutions when training with the LTX-2 trainer.
4
+
5
+ ## 🔧 VRAM and Memory Issues
6
+
7
+ Memory management is crucial for successful training with LTX-2.
8
+
9
+ > [!TIP]
10
+ > For GPUs with 32GB VRAM, use the pre-configured low VRAM config:
11
+ > [`configs/ltx2_av_lora_low_vram.yaml`](../configs/ltx2_av_lora_low_vram.yaml)
12
+ > which combines 8-bit optimizer, INT8 quantization, and reduced LoRA rank.
13
+
14
+ ### Memory Optimization Techniques
15
+
16
+ #### 1. Enable Gradient Checkpointing
17
+
18
+ Gradient checkpointing trades training speed for memory savings. **Highly recommended** for most training runs:
19
+
20
+ ```yaml
21
+ optimization:
22
+ enable_gradient_checkpointing: true
23
+ ```
24
+
25
+ #### 2. Enable 8-bit Text Encoder
26
+
27
+ Load the Gemma text encoder in 8-bit precision to save GPU memory:
28
+
29
+ ```yaml
30
+ acceleration:
31
+ load_text_encoder_in_8bit: true
32
+ ```
33
+
34
+ #### 3. Reduce Batch Size
35
+
36
+ Lower the batch size if you encounter out-of-memory errors:
37
+
38
+ ```yaml
39
+ optimization:
40
+ batch_size: 1 # Start with 1 and increase gradually
41
+ ```
42
+
43
+ Use gradient accumulation to maintain a larger effective batch size:
44
+
45
+ ```yaml
46
+ optimization:
47
+ batch_size: 1
48
+ gradient_accumulation_steps: 4 # Effective batch size = 4
49
+ ```
50
+
51
+ #### 4. Use Lower Resolution
52
+
53
+ Reduce spatial or temporal dimensions to save memory:
54
+
55
+ ```bash
56
+ # Smaller spatial resolution
57
+ uv run python scripts/process_dataset.py dataset.json \
58
+ --resolution-buckets "512x512x49" \
59
+ --model-path /path/to/model.safetensors \
60
+ --text-encoder-path /path/to/gemma
61
+
62
+ # Fewer frames
63
+ uv run python scripts/process_dataset.py dataset.json \
64
+ --resolution-buckets "960x544x25" \
65
+ --model-path /path/to/model.safetensors \
66
+ --text-encoder-path /path/to/gemma
67
+ ```
68
+
69
+ #### 5. Enable Model Quantization
70
+
71
+ Use quantization to reduce memory usage:
72
+
73
+ ```yaml
74
+ acceleration:
75
+ quantization: "int8-quanto" # Options: int8-quanto, int4-quanto, fp8-quanto
76
+ ```
77
+
78
+ #### 6. Use 8-bit Optimizer
79
+
80
+ The 8-bit AdamW optimizer uses less memory:
81
+
82
+ ```yaml
83
+ optimization:
84
+ optimizer_type: "adamw8bit"
85
+ ```
86
+
87
+ ---
88
+
89
+ ## ⚠️ Common Usage Issues
90
+
91
+ ### Issue: "No module named 'ltx_trainer'" Error
92
+
93
+ **Solution:**
94
+ Ensure you've installed the dependencies and are using `uv run` to execute scripts:
95
+
96
+ ```bash
97
+ # From the repository root
98
+ uv sync
99
+ cd packages/ltx-trainer
100
+ uv run python scripts/train.py configs/ltx2_av_lora.yaml
101
+ ```
102
+
103
+ > [!TIP]
104
+ > Always use `uv run` to execute Python scripts. This automatically uses the correct virtual environment
105
+ > without requiring manual activation.
106
+
107
+ ### Issue: "Gemma model path is not a directory" Error
108
+
109
+ **Solution:**
110
+ The `text_encoder_path` must point to a directory containing the Gemma model, not a file:
111
+
112
+ ```yaml
113
+ model:
114
+ model_path: "/path/to/ltx-2-model.safetensors" # File path
115
+ text_encoder_path: "/path/to/gemma-model/" # Directory path
116
+ ```
117
+
118
+ ### Issue: "Model path does not exist" Error
119
+
120
+ **Solution:**
121
+ LTX-2 requires local model paths. URLs are not supported:
122
+
123
+ ```yaml
124
+ # ✅ Correct - local path
125
+ model:
126
+ model_path: "/path/to/ltx-2-model.safetensors"
127
+
128
+ # ❌ Wrong - URL not supported
129
+ model:
130
+ model_path: "https://huggingface.co/..."
131
+ ```
132
+
133
+ ### Issue: "Frames must satisfy frames % 8 == 1" Error
134
+
135
+ **Solution:**
136
+ LTX-2 requires the number of frames to satisfy `frames % 8 == 1` (a helper for snapping to a valid count is sketched below):
137
+
138
+ - ✅ Valid: 1, 9, 17, 25, 33, 41, 49, 57, 65, 73, 81, 89, 97, 121
139
+ - ❌ Invalid: 24, 32, 48, 64, 100
140
+
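+ If your clips have an arbitrary frame count, you can snap down to the nearest valid value before choosing a bucket.
+ A small illustrative helper:
+
+ ```python
+ def nearest_valid_frames(frames: int) -> int:
+     """Round down to the closest count satisfying frames % 8 == 1."""
+     return max(1, frames - (frames - 1) % 8)
+
+ print(nearest_valid_frames(100))  # 97
+ print(nearest_valid_frames(48))   # 41
+ ```
+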
141
+ ### Issue: Slow Training Speed
142
+
143
+ **Optimizations:**
144
+
145
+ 1. **Disable gradient checkpointing** (if you have enough VRAM):
146
+
147
+ ```yaml
148
+ optimization:
149
+ enable_gradient_checkpointing: false
150
+ ```
151
+
152
+
153
+ 2. **Use torch.compile** via Accelerate:
154
+
155
+ ```bash
156
+ uv run accelerate launch --config_file configs/accelerate/ddp_compile.yaml \
157
+ scripts/train.py configs/ltx2_av_lora.yaml
158
+ ```
159
+
160
+ ### Issue: Poor Quality Validation Outputs
161
+
162
+ **Solutions:**
163
+
164
+ 1. **Use Image-to-Video Validation:**
165
+ For more reliable validation, use image-to-video (first-frame conditioning) rather than pure text-to-video:
166
+
167
+ ```yaml
168
+ validation:
169
+ prompts:
170
+ - "a professional portrait video of a person"
171
+ images:
172
+ - "/path/to/first_frame.png" # One image per prompt
173
+ ```
174
+
175
+ 2. **Increase inference steps:**
176
+
177
+ ```yaml
178
+ validation:
179
+ inference_steps: 50 # Default is 30
180
+ ```
181
+
182
+ 3. **Adjust guidance settings:**
183
+
184
+ ```yaml
185
+ validation:
186
+ guidance_scale: 4.0 # CFG scale (recommended: 4.0)
187
+ stg_scale: 1.0 # STG scale for temporal coherence (recommended: 1.0)
188
+ stg_blocks: [29] # Transformer block to perturb
189
+ ```
190
+
191
+ 4. **Check caption quality:**
192
+ Review and manually edit captions for accuracy if using auto-generated captions.
193
+ LTX-2 prefers long, detailed captions that describe both visual content and audio (e.g., ambient sounds, speech,
194
+ music).
195
+
196
+ 5. **Check target modules:**
197
+ Ensure your `target_modules` configuration matches your training goals. For audio-video training,
198
+ use patterns that match both branches (e.g., `"to_k"` instead of `"attn1.to_k"`).
199
+ See [Understanding Target Modules](configuration-reference.md#understanding-target-modules) for details.
200
+
201
+ 6. **Adjust LoRA rank:**
202
+ Try higher values for more capacity:
203
+
204
+ ```yaml
205
+ lora:
206
+ rank: 64 # Or 128 for more capacity
207
+ ```
208
+
209
+ 7. **Increase training steps:**
210
+
211
+ ```yaml
212
+ optimization:
213
+ steps: 3000
214
+ ```
215
+
216
+ ---
217
+
218
+ ## 🔍 Debugging Tools
219
+
220
+ ### Monitor GPU Memory Usage
221
+
222
+ Track memory usage during training:
223
+
224
+ ```bash
225
+ # Watch GPU memory in real-time
226
+ watch -n 1 nvidia-smi
227
+
228
+ # Log memory usage to file
229
+ nvidia-smi --query-gpu=memory.used,memory.total --format=csv --loop=5 > memory_log.csv
230
+ ```
231
+
232
+ ### Verify Preprocessed Data
233
+
234
+ Decode latents to visualize the preprocessed videos:
235
+
236
+ ```bash
237
+ uv run python scripts/decode_latents.py dataset/.precomputed/latents debug_output \
238
+ --model-path /path/to/model.safetensors
239
+ ```
240
+
241
+ To also decode audio latents, add the `--with-audio` flag:
242
+
243
+ ```bash
244
+ uv run python scripts/decode_latents.py dataset/.precomputed/latents debug_output \
245
+ --model-path /path/to/model.safetensors \
246
+ --with-audio
247
+ ```
248
+
249
+ Compare decoded videos and audio with originals to ensure quality.
250
+
251
+ ---
252
+
253
+ ## 💡 Best Practices
254
+
255
+ ### Before Training
256
+
257
+ - [ ] Test preprocessing with a small subset first
258
+ - [ ] Verify all video files are accessible
259
+ - [ ] Check available GPU memory
260
+ - [ ] Review configuration against hardware capabilities
261
+ - [ ] Ensure model and text encoder paths are correct
262
+
263
+ ### During Training
264
+
265
+ - [ ] Monitor GPU memory usage
266
+ - [ ] Check loss convergence regularly
267
+ - [ ] Review validation samples periodically
268
+ - [ ] Save checkpoints frequently
269
+
270
+ ### After Training
271
+
272
+ - [ ] Test trained model with diverse prompts
273
+ - [ ] Document training parameters and results
274
+ - [ ] Archive training data and configs
275
+
276
+ ## 🆘 Getting Help
277
+
278
+ If you're still experiencing issues:
279
+
280
+ 1. **Check logs:** Review console output for error details
281
+ 2. **Search issues:** Look through GitHub issues for similar problems
282
+ 3. **Provide details:** When reporting issues, include:
283
+ - Hardware specifications (GPU model, VRAM)
284
+ - Configuration file used
285
+ - Complete error message
286
+ - Steps to reproduce the issue
287
+
288
+ ---
289
+
290
+ ## 🤝 Join the Community
291
+
292
+ Have questions, want to share your results, or need real-time help?
293
+ Join our [community Discord server](https://discord.gg/ltxplatform)
294
+ to connect with other users and the development team!
295
+
296
+ - Get troubleshooting help
297
+ - Share your training results and workflows
298
+ - Stay up to date with announcements and updates
299
+
300
+ We look forward to seeing you there!
packages/ltx-trainer/docs/utility-scripts.md ADDED
@@ -0,0 +1,274 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Utility Scripts Reference
2
+
3
+ This guide covers the various utility scripts available for preprocessing, conversion, and debugging tasks.
4
+
5
+ ## 🎬 Dataset Processing Scripts
6
+
7
+ ### Video Scene Splitting
8
+
9
+ The `scripts/split_scenes.py` script automatically splits long videos into shorter, coherent scenes.
10
+
11
+ ```bash
12
+ # Basic scene splitting
13
+ uv run python scripts/split_scenes.py input.mp4 output_dir/ --filter-shorter-than 5s
14
+ ```
15
+
16
+ **Key features:**
17
+
18
+ - **Automatic scene detection**: Uses PySceneDetect for intelligent splitting
19
+ - **Multiple algorithms**: Content-based, adaptive, threshold, and histogram detection
20
+ - **Filtering options**: Remove scenes shorter than specified duration
21
+ - **Customizable parameters**: Thresholds, window sizes, and detection modes
22
+
23
+ **Common options:**
24
+
25
+ ```bash
26
+ # See all available options
27
+ uv run python scripts/split_scenes.py --help
28
+
29
+ # Use adaptive detection with custom threshold
30
+ uv run python scripts/split_scenes.py video.mp4 scenes/ --detector adaptive --threshold 30.0
31
+
32
+ # Limit to maximum number of scenes
33
+ uv run python scripts/split_scenes.py video.mp4 scenes/ --max-scenes 50
34
+ ```
35
+
36
+ ### Automatic Video Captioning
37
+
38
+ The `scripts/caption_videos.py` script generates captions for videos (with audio) using multimodal models.
39
+
40
+ ```bash
41
+ # Generate captions for all videos in a directory (uses Qwen2.5-Omni by default)
42
+ uv run python scripts/caption_videos.py videos_dir/ --output dataset.json
43
+
44
+ # Use 8-bit quantization to reduce VRAM usage
45
+ uv run python scripts/caption_videos.py videos_dir/ --output dataset.json --use-8bit
46
+
47
+ # Use Gemini Flash API instead (requires API key)
48
+ uv run python scripts/caption_videos.py videos_dir/ --output dataset.json \
49
+ --captioner-type gemini_flash --api-key YOUR_API_KEY
50
+
51
+ # Caption without audio processing (video-only)
52
+ uv run python scripts/caption_videos.py videos_dir/ --output dataset.json --no-audio
53
+
54
+ # Force re-caption all files
55
+ uv run python scripts/caption_videos.py videos_dir/ --output dataset.json --override
56
+ ```
57
+
58
+ **Key features:**
59
+
60
+ - **Audio-visual captioning**: Processes both video and audio content, including speech transcription
61
+ - **Multiple backends**:
62
+ - `qwen_omni` (default): Local Qwen2.5-Omni model - processes video + audio locally
63
+ - `gemini_flash`: Google Gemini Flash API - cloud-based, requires API key
64
+ - **Structured output**: Captions include visual description, speech transcription, sounds, and on-screen text
65
+ - **Memory optimization**: 8-bit quantization option for limited VRAM
66
+ - **Incremental processing**: Skips already-captioned files by default
67
+ - **Multiple output formats**: JSON, JSONL, CSV, or TXT
68
+
69
+ **Caption format:**
70
+
71
+ The captioner produces structured captions with four sections (a small parsing sketch follows the list):
72
+ - `[VISUAL]`: Detailed description of visual content
73
+ - `[SPEECH]`: Word-for-word transcription of spoken content
74
+ - `[SOUNDS]`: Description of music, ambient sounds, sound effects
75
+ - `[TEXT]`: Any on-screen text visible in the video
76
+
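+ If you post-process the captions (for example, to keep only the visual description), the sections can be separated
+ with a small parser. A sketch, assuming the labels appear literally in the caption text:
+
+ ```python
+ import re
+
+ SECTION_LABELS = ("VISUAL", "SPEECH", "SOUNDS", "TEXT")
+
+ def split_caption_sections(caption: str) -> dict[str, str]:
+     """Split a structured caption into its [VISUAL]/[SPEECH]/[SOUNDS]/[TEXT] parts."""
+     parts = re.split(r"\[(" + "|".join(SECTION_LABELS) + r")\]", caption)
+     # re.split with a capturing group yields [prefix, label, text, label, text, ...]
+     return {label: text.strip() for label, text in zip(parts[1::2], parts[2::2])}
+
+ example = "[VISUAL] A cat bats a ball of yarn. [SOUNDS] Soft purring. [TEXT] none"
+ print(split_caption_sections(example)["VISUAL"])  # A cat bats a ball of yarn.
+ ```
+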
77
+ **Environment variables (for Gemini Flash):**
78
+
79
+ Set one of these to use Gemini Flash without passing `--api-key`:
80
+ - `GOOGLE_API_KEY`
81
+ - `GEMINI_API_KEY`
82
+
83
+ ### Dataset Preprocessing
84
+
85
+ The `scripts/process_dataset.py` script processes videos and caches latents for training.
86
+
87
+ ```bash
88
+ # Basic preprocessing
89
+ uv run python scripts/process_dataset.py dataset.json \
90
+ --resolution-buckets "960x544x49" \
91
+ --model-path /path/to/ltx-2-model.safetensors \
92
+ --text-encoder-path /path/to/gemma-model
93
+
94
+ # With audio processing
95
+ uv run python scripts/process_dataset.py dataset.json \
96
+ --resolution-buckets "960x544x49" \
97
+ --model-path /path/to/ltx-2-model.safetensors \
98
+ --text-encoder-path /path/to/gemma-model \
99
+ --with-audio
100
+
101
+ # With video decoding for verification
102
+ uv run python scripts/process_dataset.py dataset.json \
103
+ --resolution-buckets "960x544x49" \
104
+ --model-path /path/to/ltx-2-model.safetensors \
105
+ --text-encoder-path /path/to/gemma-model \
106
+ --decode
107
+ ```
108
+
109
+ Multiple resolution buckets can be specified, separated by `;`:
110
+
111
+ ```bash
112
+ uv run python scripts/process_dataset.py dataset.json \
113
+ --resolution-buckets "960x544x49;512x512x81" \
114
+ --model-path /path/to/ltx-2-model.safetensors \
115
+ --text-encoder-path /path/to/gemma-model
116
+ ```
117
+
118
+ > [!NOTE]
119
+ > When training with multiple resolution buckets, set `optimization.batch_size: 1`.
120
+
121
+ For detailed usage, see the [Dataset Preparation Guide](dataset-preparation.md).
122
+
123
+ ### Reference Video Generation
124
+
125
+ The `scripts/compute_reference.py` script provides a template for creating reference videos needed for IC-LoRA training.
126
+ The default implementation generates Canny edge reference videos.
127
+
128
+ ```bash
129
+ # Generate Canny edge reference videos
130
+ uv run python scripts/compute_reference.py videos_dir/ --output dataset.json
131
+ ```
132
+
133
+ **Key features:**
134
+
135
+ - **Canny edge detection**: Creates edge-based reference videos
136
+ - **In-place editing**: Updates existing dataset JSON files
137
+ - **Customizable**: Modify the `compute_reference()` function for different conditions (depth, pose, etc.)
138
+
139
+ > [!TIP]
140
+ > You can edit this script to generate other types of reference videos for IC-LoRA training,
141
+ > such as depth maps, segmentation masks, or any custom video transformation.
142
+
143
+ ## 🔍 Debugging and Verification Scripts
144
+
145
+ ### Latents Decoding
146
+
147
+ The `scripts/decode_latents.py` script decodes precomputed video latents back into video files for visual inspection.
148
+
149
+ ```bash
150
+ # Basic usage
151
+ uv run python scripts/decode_latents.py /path/to/latents/dir \
152
+ --output-dir /path/to/output \
153
+ --model-path /path/to/ltx-2-model.safetensors
154
+
155
+ # With VAE tiling for large videos
156
+ uv run python scripts/decode_latents.py /path/to/latents/dir \
157
+ --output-dir /path/to/output \
158
+ --model-path /path/to/ltx-2-model.safetensors \
159
+ --vae-tiling
160
+
161
+ # Decode both video and audio latents
162
+ uv run python scripts/decode_latents.py /path/to/latents/dir \
163
+ --output-dir /path/to/output \
164
+ --model-path /path/to/ltx-2-model.safetensors \
165
+ --with-audio
166
+ ```
167
+
168
+ **The script will:**
169
+
170
+ 1. **Load the VAE model** from the specified path
171
+ 2. **Process all `.pt` latent files** in the input directory
172
+ 3. **Decode each latent** back into a video using the VAE
173
+ 4. **Save resulting videos** as MP4 files in the output directory
174
+
175
+ **When to use:**
176
+
177
+ - **Verify preprocessing quality**: Check that your videos were encoded correctly
178
+ - **Debug training data**: Visualize what the model actually sees during training
179
+ - **Quality assessment**: Ensure latent encoding preserves important visual details
180
+
181
+
182
+ ### Inference Script
183
+
184
+ The `scripts/inference.py` script runs inference with a trained model.
185
+
186
+ > [!TIP]
187
+ > For production inference, consider using the [`ltx-pipelines`](../../ltx-pipelines/) package which provides optimized,
188
+ > feature-rich pipelines for various use cases:
189
+ > - **Text/Image-to-Video**: `TI2VidOneStagePipeline`, `TI2VidTwoStagesPipeline`
190
+ > - **Distilled (fast) inference**: `DistilledPipeline`
191
+ > - **IC-LoRA video-to-video**: `ICLoraPipeline`
192
+ > - **Keyframe interpolation**: `KeyframeInterpolationPipeline`
193
+ >
194
+ > All pipelines support loading custom LoRAs trained with this trainer.
195
+
196
+ ```bash
197
+ # Text-to-video inference (with audio by default)
198
+ # By default, uses CFG scale 4.0 and STG scale 1.0 with block 29
199
+ uv run python scripts/inference.py \
200
+ --checkpoint /path/to/model.safetensors \
201
+ --text-encoder-path /path/to/gemma \
202
+ --prompt "A cat playing with a ball" \
203
+ --output output.mp4
204
+
205
+ # Video-only (skip audio generation)
206
+ uv run python scripts/inference.py \
207
+ --checkpoint /path/to/model.safetensors \
208
+ --text-encoder-path /path/to/gemma \
209
+ --prompt "A cat playing with a ball" \
210
+ --skip-audio \
211
+ --output output.mp4
212
+
213
+ # Image-to-video with conditioning image
214
+ uv run python scripts/inference.py \
215
+ --checkpoint /path/to/model.safetensors \
216
+ --text-encoder-path /path/to/gemma \
217
+ --prompt "A cat walking" \
218
+ --condition-image first_frame.png \
219
+ --output output.mp4
220
+
221
+ # Custom guidance settings
222
+ uv run python scripts/inference.py \
223
+ --checkpoint /path/to/model.safetensors \
224
+ --text-encoder-path /path/to/gemma \
225
+ --prompt "A cat playing with a ball" \
226
+ --guidance-scale 4.0 \
227
+ --stg-scale 1.0 \
228
+ --stg-blocks 29 \
229
+ --output output.mp4
230
+
231
+ # Disable STG (CFG only)
232
+ uv run python scripts/inference.py \
233
+ --checkpoint /path/to/model.safetensors \
234
+ --text-encoder-path /path/to/gemma \
235
+ --prompt "A cat playing with a ball" \
236
+ --stg-scale 0.0 \
237
+ --output output.mp4
238
+ ```
239
+
240
+ **Guidance parameters:**
241
+
242
+ | Parameter | Default | Description |
243
+ |-----------|---------|-------------|
244
+ | `--guidance-scale` | 4.0 | CFG (Classifier-Free Guidance) scale |
245
+ | `--stg-scale` | 1.0 | STG (Spatio-Temporal Guidance) scale. 0.0 disables STG |
246
+ | `--stg-blocks` | 29 | Transformer block(s) to perturb for STG |
247
+ | `--stg-mode` | stg_av | `stg_av` perturbs both audio and video, `stg_v` video only |
248
+
249
+ ## 🚀 Training Scripts
250
+
251
+ ### Basic and Distributed Training
252
+
253
+ Use `scripts/train.py` for both single GPU and multi-GPU runs:
254
+
255
+ ```bash
256
+ # Single-GPU training
257
+ uv run python scripts/train.py configs/ltx2_av_lora.yaml
258
+
259
+ # Multi-GPU (uses your accelerate config)
260
+ uv run accelerate launch scripts/train.py configs/ltx2_av_lora.yaml
261
+
262
+ # Override number of processes
263
+ uv run accelerate launch --num_processes 4 scripts/train.py configs/ltx2_av_lora.yaml
264
+ ```
265
+
266
+ For detailed usage, see the [Training Guide](training-guide.md).
267
+
268
+ ## 💡 Tips for Using Utility Scripts
269
+
270
+ - **Start with `--help`**: Always check available options for each script
271
+ - **Test on small datasets**: Verify workflows with a few files before processing large datasets
272
+ - **Use decode verification**: Always decode a few samples to verify preprocessing quality
273
+ - **Monitor VRAM usage**: Use `--use-8bit` or quantization flags when running into memory issues
274
+ - **Keep backups**: Make copies of important dataset files before running conversion scripts
packages/ltx-trainer/scripts/caption_videos.py ADDED
@@ -0,0 +1,486 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+
3
+ """
4
+ Auto-caption videos with audio using multimodal models.
5
+ This script provides a command-line interface for generating captions for videos
6
+ (including audio) using multimodal models. It supports:
7
+ - Qwen2.5-Omni: Local model for audio-visual captioning (default)
8
+ - Gemini Flash: Cloud-based API for audio-visual captioning
9
+ The paths to videos in the generated dataset/captions file will be RELATIVE to the
10
+ directory where the output file is stored. This makes the dataset more portable and
11
+ easier to use in different environments.
12
+ Basic usage:
13
+ # Caption a single video (includes audio by default)
14
+ caption_videos.py video.mp4 --output captions.json
15
+ # Caption all videos in a directory
16
+ caption_videos.py videos_dir/ --output captions.csv
17
+ # Caption with custom instruction
18
+ caption_videos.py video.mp4 --instruction "Describe what happens in this video in detail."
19
+ Advanced usage:
20
+ # Use Gemini Flash API (requires GEMINI_API_KEY or GOOGLE_API_KEY env var)
21
+ caption_videos.py videos_dir/ --captioner-type gemini_flash
22
+ # Disable audio processing (video-only captions)
23
+ caption_videos.py videos_dir/ --no-audio
24
+ # Process videos with specific extensions and save as JSON
25
+ caption_videos.py videos_dir/ --extensions mp4,mov,avi --output captions.json
26
+ """
27
+
28
+ import csv
29
+ import json
30
+ from enum import Enum
31
+ from pathlib import Path
32
+
33
+ import torch
34
+ import typer
35
+ from rich.console import Console
36
+ from rich.progress import (
37
+ BarColumn,
38
+ MofNCompleteColumn,
39
+ Progress,
40
+ SpinnerColumn,
41
+ TextColumn,
42
+ TimeElapsedColumn,
43
+ TimeRemainingColumn,
44
+ )
45
+ from transformers.utils.logging import disable_progress_bar
46
+
47
+ from ltx_trainer.captioning import CaptionerType, MediaCaptioningModel, create_captioner
48
+
49
+ VIDEO_EXTENSIONS = ["mp4", "avi", "mov", "mkv", "webm"]
50
+ IMAGE_EXTENSIONS = ["jpg", "jpeg", "png"]
51
+ MEDIA_EXTENSIONS = VIDEO_EXTENSIONS + IMAGE_EXTENSIONS
52
+ SAVE_INTERVAL = 5
53
+
54
+ console = Console()
55
+ app = typer.Typer(
56
+ pretty_exceptions_enable=False,
57
+ no_args_is_help=True,
58
+ help="Auto-caption videos with audio using multimodal models.",
59
+ )
60
+
61
+ disable_progress_bar()
62
+
63
+
64
+ class OutputFormat(str, Enum):
65
+ """Available output formats for captions."""
66
+
67
+ TXT = "txt" # Separate files for captions and video paths, one caption / video path per line
68
+ CSV = "csv" # CSV file with video path and caption columns
69
+ JSON = "json" # JSON file with video paths as keys and captions as values
70
+ JSONL = "jsonl" # JSON Lines file with one JSON object per line
71
+
72
+
73
+ def caption_media(
74
+ input_path: Path,
75
+ output_path: Path,
76
+ captioner: MediaCaptioningModel,
77
+ extensions: list[str],
78
+ recursive: bool,
79
+ fps: int,
80
+ include_audio: bool,
81
+ clean_caption: bool,
82
+ output_format: OutputFormat,
83
+ override: bool,
84
+ ) -> None:
85
+ """Caption videos and images using the provided captioning model.
86
+ Args:
87
+ input_path: Path to input video file or directory
88
+ output_path: Path to output caption file
89
+ captioner: Media captioning model
90
+ extensions: List of media file extensions to include
91
+ recursive: Whether to search subdirectories recursively
92
+ fps: Frames per second to sample from videos (ignored for images)
93
+ include_audio: Whether to include audio in captioning
94
+ clean_caption: Whether to clean up captions
95
+ output_format: Format to save the captions in
96
+ override: Whether to override existing captions
97
+ """
98
+
99
+ # Get list of media files to process
100
+ media_files = _get_media_files(input_path, extensions, recursive)
101
+
102
+ if not media_files:
103
+ console.print("[bold yellow]No media files found to process.[/]")
104
+ return
105
+
106
+ console.print(f"Found [bold]{len(media_files)}[/] media files to process.")
107
+
108
+ # Load existing captions and determine which files need processing
109
+ base_dir = output_path.parent.resolve()
110
+ existing_captions = _load_existing_captions(output_path, output_format)
111
+ existing_abs_paths = {str((base_dir / p).resolve()) for p in existing_captions}
112
+
113
+ if override:
114
+ media_to_process = media_files
115
+ else:
116
+ media_to_process = [f for f in media_files if str(f.resolve()) not in existing_abs_paths]
117
+ if skipped := len(media_files) - len(media_to_process):
118
+ console.print(f"[bold yellow]Skipping {skipped} media that already have captions.[/]")
119
+
120
+ if not media_to_process:
121
+ console.print("[bold yellow]All media already have captions. Use --override to recaption.[/]")
122
+ return
123
+
124
+ # Process media files
125
+ captions = existing_captions.copy()
126
+ successfully_captioned = 0
127
+ progress = Progress(
128
+ SpinnerColumn(),
129
+ TextColumn("{task.description}"),
130
+ BarColumn(bar_width=40),
131
+ MofNCompleteColumn(),
132
+ TimeElapsedColumn(),
133
+ TextColumn("•"),
134
+ TimeRemainingColumn(),
135
+ console=console,
136
+ )
137
+
138
+ with progress:
139
+ task = progress.add_task("Captioning", total=len(media_to_process))
140
+
141
+ for i, media_file in enumerate(media_to_process):
142
+ progress.update(task, description=f"Captioning [bold blue]{media_file.name}[/]")
143
+
144
+ try:
145
+ # Generate caption for the media
146
+ caption = captioner.caption(
147
+ path=media_file,
148
+ fps=fps,
149
+ include_audio=include_audio,
150
+ clean_caption=clean_caption,
151
+ )
152
+
153
+ # Convert absolute path to relative path (relative to the output file's directory)
154
+ rel_path = str(media_file.resolve().relative_to(base_dir))
155
+ # Store the caption with the relative path as key
156
+ captions[rel_path] = caption
157
+ successfully_captioned += 1
158
+ except Exception as e:
159
+ console.print(f"[bold red]Error captioning {media_file}: {e}[/]")
160
+
161
+ if i % SAVE_INTERVAL == 0:
162
+ _save_captions(captions, output_path, output_format)
163
+
164
+ # Advance progress bar
165
+ progress.advance(task)
166
+
167
+ # Save captions to file
168
+ _save_captions(captions, output_path, output_format)
169
+
170
+ # Print summary
171
+ console.print(
172
+ f"[bold green]✓[/] Captioned [bold]{successfully_captioned}/{len(media_to_process)}[/] media successfully.",
173
+ )
174
+
175
+
176
+ def _get_media_files(
177
+ input_path: Path,
178
+ extensions: list[str] = MEDIA_EXTENSIONS,
179
+ recursive: bool = False,
180
+ ) -> list[Path]:
181
+ """Get all media files from the input path."""
182
+ input_path = Path(input_path)
183
+ # Normalize extensions to lowercase without dots
184
+ extensions_set = {ext.lower().lstrip(".") for ext in extensions}
185
+
186
+ if input_path.is_file():
187
+ # If input is a file, check if it has a valid extension
188
+ if input_path.suffix.lstrip(".").lower() in extensions_set:
189
+ return [input_path]
190
+ else:
191
+ typer.echo(f"Warning: {input_path} is not a recognized media file. Skipping.")
192
+ return []
193
+ elif input_path.is_dir():
194
+ # Find all files and filter by extension case-insensitively
195
+ glob_pattern = "**/*" if recursive else "*"
196
+ media_files = [
197
+ f for f in input_path.glob(glob_pattern) if f.is_file() and f.suffix.lstrip(".").lower() in extensions_set
198
+ ]
199
+ return sorted(media_files)
200
+ else:
201
+ typer.echo(f"Error: {input_path} does not exist.")
202
+ raise typer.Exit(code=1)
203
+
204
+
205
+ def _save_captions(
206
+ captions: dict[str, str],
207
+ output_path: Path,
208
+ format_type: OutputFormat,
209
+ ) -> None:
210
+ """Save captions to a file in the specified format.
211
+ Args:
212
+ captions: Dictionary mapping media paths to captions
213
+ output_path: Path to save the output file
214
+ format_type: Format to save the captions in
215
+ """
216
+ # Create parent directories if they don't exist
217
+ output_path.parent.mkdir(parents=True, exist_ok=True)
218
+
219
+ console.print("[bold blue]Saving captions...[/]")
220
+
221
+ match format_type:
222
+ case OutputFormat.TXT:
223
+ # Create two separate files for captions and media paths
224
+ captions_file = output_path.with_stem(f"{output_path.stem}_captions")
225
+ paths_file = output_path.with_stem(f"{output_path.stem}_paths")
226
+
227
+ with captions_file.open("w", encoding="utf-8") as f:
228
+ for caption in captions.values():
229
+ f.write(f"{caption}\n")
230
+
231
+ with paths_file.open("w", encoding="utf-8") as f:
232
+ for media_path in captions:
233
+ f.write(f"{media_path}\n")
234
+
235
+ console.print(f"[bold green]✓[/] Captions saved to [cyan]{captions_file}[/]")
236
+ console.print(f"[bold green]✓[/] Media paths saved to [cyan]{paths_file}[/]")
237
+
238
+ case OutputFormat.CSV:
239
+ with output_path.open("w", encoding="utf-8", newline="") as f:
240
+ writer = csv.writer(f)
241
+ writer.writerow(["caption", "media_path"])
242
+ for media_path, caption in captions.items():
243
+ writer.writerow([caption, media_path])
244
+
245
+ console.print(f"[bold green]✓[/] Captions saved to [cyan]{output_path}[/]")
246
+
247
+ case OutputFormat.JSON:
248
+ # Format as list of dictionaries with caption and media_path keys
249
+ json_data = [{"caption": caption, "media_path": media_path} for media_path, caption in captions.items()]
250
+
251
+ with output_path.open("w", encoding="utf-8") as f:
252
+ json.dump(json_data, f, indent=2, ensure_ascii=False)
253
+
254
+ console.print(f"[bold green]✓[/] Captions saved to [cyan]{output_path}[/]")
255
+
256
+ case OutputFormat.JSONL:
257
+ with output_path.open("w", encoding="utf-8") as f:
258
+ for media_path, caption in captions.items():
259
+ f.write(json.dumps({"caption": caption, "media_path": media_path}, ensure_ascii=False) + "\n")
260
+
261
+ console.print(f"[bold green]✓[/] Captions saved to [cyan]{output_path}[/]")
262
+
263
+ case _:
264
+ raise ValueError(f"Unsupported output format: {format_type}")
265
+
266
+
267
+ def _load_existing_captions( # noqa: PLR0912
268
+ output_path: Path,
269
+ format_type: OutputFormat,
270
+ ) -> dict[str, str]:
271
+ """Load existing captions from a file.
272
+ Args:
273
+ output_path: Path to the captions file
274
+ format_type: Format of the captions file
275
+ Returns:
276
+ Dictionary mapping media paths to captions, or empty dict if file doesn't exist
277
+ """
278
+ if not output_path.exists():
279
+ return {}
280
+
281
+ console.print(f"[bold blue]Loading existing captions from [cyan]{output_path}[/]...[/]")
282
+
283
+ existing_captions = {}
284
+
285
+ try:
286
+ match format_type:
287
+ case OutputFormat.TXT:
288
+ # For TXT format, we have two separate files
289
+ captions_file = output_path.with_stem(f"{output_path.stem}_captions")
290
+ paths_file = output_path.with_stem(f"{output_path.stem}_paths")
291
+
292
+ if captions_file.exists() and paths_file.exists():
293
+ captions = captions_file.read_text(encoding="utf-8").splitlines()
294
+ paths = paths_file.read_text(encoding="utf-8").splitlines()
295
+
296
+ if len(captions) == len(paths):
297
+ existing_captions = dict(zip(paths, captions, strict=False))
298
+
299
+ case OutputFormat.CSV:
300
+ with output_path.open("r", encoding="utf-8", newline="") as f:
301
+ reader = csv.reader(f)
302
+ # Skip header
303
+ next(reader, None)
304
+ for row in reader:
305
+ if len(row) >= 2:
306
+ caption, media_path = row[0], row[1]
307
+ existing_captions[media_path] = caption
308
+
309
+ case OutputFormat.JSON:
310
+ with output_path.open("r", encoding="utf-8") as f:
311
+ json_data = json.load(f)
312
+ for item in json_data:
313
+ if "caption" in item and "media_path" in item:
314
+ existing_captions[item["media_path"]] = item["caption"]
315
+
316
+ case OutputFormat.JSONL:
317
+ with output_path.open("r", encoding="utf-8") as f:
318
+ for line in f:
319
+ item = json.loads(line)
320
+ if "caption" in item and "media_path" in item:
321
+ existing_captions[item["media_path"]] = item["caption"]
322
+
323
+ case _:
324
+ raise ValueError(f"Unsupported output format: {format_type}")
325
+
326
+ console.print(f"[bold green]✓[/] Loaded [bold]{len(existing_captions)}[/] existing captions")
327
+ return existing_captions
328
+
329
+ except Exception as e:
330
+ console.print(f"[bold yellow]Warning: Could not load existing captions: {e}[/]")
331
+ return {}
332
+
333
+
334
+ @app.command()
335
+ def main( # noqa: PLR0913
336
+ input_path: Path = typer.Argument( # noqa: B008
337
+ ...,
338
+ help="Path to input video/image file or directory containing media files",
339
+ exists=True,
340
+ ),
341
+ output: Path | None = typer.Option( # noqa: B008
342
+ None,
343
+ "--output",
344
+ "-o",
345
+ help="Path to output file for captions. Format determined by file extension.",
346
+ ),
347
+ captioner_type: CaptionerType = typer.Option( # noqa: B008
348
+ CaptionerType.QWEN_OMNI,
349
+ "--captioner-type",
350
+ "-c",
351
+ help="Type of captioner to use. Valid values: 'qwen_omni' (local), 'gemini_flash' (API)",
352
+ case_sensitive=False,
353
+ ),
354
+ device: str | None = typer.Option(
355
+ None,
356
+ "--device",
357
+ "-d",
358
+ help="Device to use for inference (e.g., 'cuda', 'cuda:0', 'cpu'). Only for local models.",
359
+ ),
360
+ use_8bit: bool = typer.Option(
361
+ False,
362
+ "--use-8bit",
363
+ help="Whether to use 8-bit precision for the captioning model (reduces memory usage)",
364
+ ),
365
+ instruction: str | None = typer.Option(
366
+ None,
367
+ "--instruction",
368
+ "-i",
369
+ help="Custom instruction for the captioning model. If not provided, uses an appropriate default.",
370
+ ),
371
+ extensions: str = typer.Option(
372
+ ",".join(MEDIA_EXTENSIONS),
373
+ "--extensions",
374
+ "-e",
375
+ help="Comma-separated list of media file extensions to process",
376
+ ),
377
+ recursive: bool = typer.Option(
378
+ False,
379
+ "--recursive",
380
+ "-r",
381
+ help="Search for media files in subdirectories recursively",
382
+ ),
383
+ fps: int = typer.Option(
384
+ 3,
385
+ "--fps",
386
+ "-f",
387
+ help="Frames per second to sample from videos (ignored for images)",
388
+ ),
389
+ include_audio: bool = typer.Option(
390
+ True,
391
+ "--audio/--no-audio",
392
+ help="Whether to include audio in captioning (for videos with audio tracks)",
393
+ ),
394
+ clean_caption: bool = typer.Option(
395
+ True,
396
+ "--clean-caption/--raw-caption",
397
+ help="Whether to clean up captions by removing common VLM patterns",
398
+ ),
399
+ override: bool = typer.Option(
400
+ False,
401
+ "--override",
402
+ help="Whether to override existing captions for media",
403
+ ),
404
+ api_key: str | None = typer.Option(
405
+ None,
406
+ "--api-key",
407
+ envvar=["GOOGLE_API_KEY", "GEMINI_API_KEY"],
408
+ help="API key for Gemini Flash (can also use GOOGLE_API_KEY or GEMINI_API_KEY env var)",
409
+ ),
410
+ ) -> None:
411
+ """Auto-caption videos with audio using multimodal models.
412
+ This script supports audio-visual captioning using:
413
+ - Qwen2.5-Omni: Local model (default) - processes both video and audio
414
+ - Gemini Flash: Cloud API - requires GOOGLE_API_KEY environment variable
415
+ The paths in the output file will be relative to the output file's directory.
416
+ Examples:
417
+ # Caption videos with audio using Qwen2.5-Omni (default)
418
+ caption_videos.py videos_dir/ -o captions.json
419
+ # Caption using Gemini Flash API
420
+ caption_videos.py videos_dir/ -o captions.json -c gemini_flash
421
+ # Caption without audio (video-only)
422
+ caption_videos.py videos_dir/ -o captions.json --no-audio
423
+ # Caption with custom instruction
424
+ caption_videos.py video.mp4 -o captions.json -i "Describe this video in detail"
425
+ """
426
+
427
+ # Determine device for local models
428
+ device_str = device or ("cuda" if torch.cuda.is_available() else "cpu")
429
+
430
+ # Parse extensions
431
+ ext_list = [ext.strip() for ext in extensions.split(",")]
432
+
433
+ # Determine output path and format
434
+ if output is None:
435
+ output_format = OutputFormat.JSON
436
+ if input_path.is_file(): # noqa: SIM108
437
+ # Default to a JSON file with the same name as the input media
438
+ output = input_path.with_suffix(".dataset.json")
439
+ else:
440
+ # Default to a JSON file in the input directory
441
+ output = input_path / "dataset.json"
442
+ else:
443
+ # Determine format from file extension
444
+ output_format = OutputFormat(Path(output).suffix.lstrip(".").lower())
445
+
446
+ # Ensure output path is absolute
447
+ output = Path(output).resolve()
448
+ console.print(f"Output will be saved to [bold blue]{output}[/]")
449
+
450
+ # Initialize captioning model
451
+ with console.status("Loading captioning model...", spinner="dots"):
452
+ if captioner_type == CaptionerType.QWEN_OMNI:
453
+ captioner = create_captioner(
454
+ captioner_type=captioner_type,
455
+ device=device_str,
456
+ use_8bit=use_8bit,
457
+ instruction=instruction,
458
+ )
459
+ elif captioner_type == CaptionerType.GEMINI_FLASH:
460
+ captioner = create_captioner(
461
+ captioner_type=captioner_type,
462
+ api_key=api_key,
463
+ instruction=instruction,
464
+ )
465
+ else:
466
+ raise ValueError(f"Unsupported captioner type: {captioner_type}")
467
+
468
+ console.print(f"[bold green]✓[/] {captioner_type.value} captioning model loaded successfully")
469
+
470
+ # Caption media files
471
+ caption_media(
472
+ input_path=input_path,
473
+ output_path=output,
474
+ captioner=captioner,
475
+ extensions=ext_list,
476
+ recursive=recursive,
477
+ fps=fps,
478
+ include_audio=include_audio,
479
+ clean_caption=clean_caption,
480
+ output_format=output_format,
481
+ override=override,
482
+ )
483
+
484
+
485
+ if __name__ == "__main__":
486
+ app()
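For reference, a minimal sketch of the dataset file this script writes when the output format is JSON (field names follow `_save_captions` above; file names and captions are illustrative, and paths are relative to the output file's directory):

    # Contents of captions.json, expressed as the Python structure passed to json.dump()
    json_data = [
        {"caption": "A dog runs across a grassy field while birds chirp.", "media_path": "clips/dog.mp4"},
        {"caption": "Close-up of rain falling on a window at night.", "media_path": "clips/rain.mp4"},
    ]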
packages/ltx-trainer/scripts/compute_reference.py ADDED
@@ -0,0 +1,288 @@
1
+ """
2
+ Compute reference videos for IC-LoRA training.
3
+ This script provides a command-line interface for generating reference videos to be used for IC-LoRA training.
4
+ Note that it reads and writes to the same file (the output of caption_videos.py),
5
+ where it adds the "reference_path" field to the JSON.
6
+ Basic usage:
7
+ # Compute reference videos for all videos in a directory
8
+ compute_reference.py videos_dir/ --output videos_dir/captions.json
9
+ """
10
+
11
+ # Standard library imports
12
+ import json
13
+ from pathlib import Path
14
+ from typing import Dict
15
+
16
+ # Third-party imports
17
+ import cv2
18
+ import torch
19
+ import torchvision.transforms.functional as TF # noqa: N812
20
+ import typer
21
+ from rich.console import Console
22
+ from rich.progress import (
23
+ BarColumn,
24
+ MofNCompleteColumn,
25
+ Progress,
26
+ SpinnerColumn,
27
+ TextColumn,
28
+ TimeElapsedColumn,
29
+ TimeRemainingColumn,
30
+ )
31
+ from transformers.utils.logging import disable_progress_bar
32
+
33
+ # Local imports
34
+ from ltx_trainer.video_utils import read_video, save_video
35
+
36
+ # Initialize console and disable progress bars
37
+ console = Console()
38
+ disable_progress_bar()
39
+
40
+
41
+ def compute_reference(
42
+ images: torch.Tensor,
43
+ ) -> torch.Tensor:
44
+ """Compute Canny edge detection on a batch of images.
45
+ Args:
46
+ images: Batch of images tensor of shape [B, C, H, W]
47
+ Returns:
48
+ Edge map tensor of shape [B, 3, H, W] (Canny edges replicated across 3 channels)
49
+ """
50
+ # Convert to grayscale if needed
51
+ if images.shape[1] == 3:
52
+ images = TF.rgb_to_grayscale(images)
53
+
54
+ # Ensure images are in [0, 1] range
55
+ if images.max() > 1.0:
56
+ images = images / 255.0
57
+
58
+ # Compute Canny edges
59
+ edge_masks = []
60
+ for image in images:
61
+ # Convert to numpy for OpenCV
62
+ image_np = (image.squeeze().cpu().numpy() * 255).astype("uint8")
63
+
64
+ # Apply Canny edge detection
65
+ edges = cv2.Canny(
66
+ image_np,
67
+ threshold1=100,
68
+ threshold2=200,
69
+ )
70
+
71
+ # Convert back to tensor
72
+ edge_mask = torch.from_numpy(edges).float()
73
+ edge_masks.append(edge_mask)
74
+
75
+ edges = torch.stack(edge_masks)
76
+ edges = torch.stack([edges] * 3, dim=1) # Convert to 3-channel
77
+ return edges
78
+
79
+
80
+ def _get_meta_data(
81
+ output_path: Path,
82
+ ) -> list[dict[str, str]]:
83
+ """Get set of existing reference video paths without loading the actual files.
84
+ Args:
85
+ output_path: Path to the dataset JSON file (the output of caption_videos.py)
86
+ Returns:
87
+ List of dataset entries (each with at least a "media_path" key), or an empty list if the file doesn't exist
88
+ """
89
+ if not output_path.exists():
90
+ return []
91
+
92
+ console.print(f"[bold blue]Reading meta data from [cyan]{output_path}[/]...[/]")
93
+
94
+ try:
95
+ with output_path.open("r", encoding="utf-8") as f:
96
+ json_data = json.load(f)
97
+ return json_data
98
+
99
+ except Exception as e:
100
+ console.print(f"[bold yellow]Warning: Could not check meta data: {e}[/]")
101
+ return []
102
+
103
+
104
+ def _save_dataset_json(
105
+ reference_paths: Dict[str, str],
106
+ output_path: Path,
107
+ ) -> None:
108
+ """Save dataset json with reference video paths.
109
+ Args:
110
+ reference_paths: Dictionary mapping media paths to reference video paths
111
+ output_path: Path to save the output file
112
+ """
113
+
114
+ with output_path.open("r", encoding="utf-8") as f:
115
+ json_data = json.load(f)
116
+ new_json_data = json_data.copy()
117
+ for i, item in enumerate(json_data):
118
+ media_path = item["media_path"]
119
+ reference_path = reference_paths[media_path]
120
+ new_json_data[i]["reference_path"] = reference_path
121
+
122
+ with output_path.open("w", encoding="utf-8") as f:
123
+ json.dump(new_json_data, f, indent=2, ensure_ascii=False)
124
+
125
+ console.print(f"[bold green]✓[/] Reference video paths saved to [cyan]{output_path}[/]")
126
+ console.print("[bold yellow]Note:[/] Use these files with ImageOrVideoDataset by setting:")
127
+ console.print(" reference_column='[cyan]reference_path[/]'")
128
+ console.print(" video_column='[cyan]media_path[/]'")
129
+
130
+
131
+ def process_media(
132
+ input_path: Path,
133
+ output_path: Path,
134
+ override: bool,
135
+ batch_size: int = 100,
136
+ ) -> None:
137
+ """Process videos and images to compute condition on videos.
138
+ Args:
139
+ input_path: Path to input video/image file or directory
140
+ output_path: Path to output reference video file
141
+ override: Whether to override existing reference video files
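+ batch_size: Number of frames processed per batch when computing edge maps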
142
+ """
143
+ if not output_path.exists():
144
+ raise FileNotFoundError(
145
+ f"Output file does not exist: {output_path}. This is also the input file for the dataset."
146
+ )
147
+
148
+ # Check for existing reference video files
149
+ meta_data = _get_meta_data(output_path)
150
+
151
+ base_dir = input_path.resolve()
152
+ console.print(f"Using [bold blue]{base_dir}[/] as base directory for relative paths")
153
+
154
+ # Filter media files
155
+ media_to_process = []
156
+ skipped_media = []
157
+
158
+ def media_path_to_reference_path(media_file: Path) -> Path:
159
+ return media_file.parent / (media_file.stem + "_reference" + media_file.suffix)
160
+
161
+ media_files = [base_dir / Path(sample["media_path"]) for sample in meta_data]
162
+ for media_file in media_files:
163
+ reference_path = media_path_to_reference_path(media_file)
164
+ media_to_process.append(media_file)
165
+
166
+ console.print(f"Processing [bold]{len(media_to_process)}[/] media.")
167
+
168
+ # Initialize progress tracking
169
+ progress = Progress(
170
+ SpinnerColumn(),
171
+ TextColumn("{task.description}"),
172
+ BarColumn(bar_width=40),
173
+ MofNCompleteColumn(),
174
+ TimeElapsedColumn(),
175
+ TextColumn("•"),
176
+ TimeRemainingColumn(),
177
+ console=console,
178
+ )
179
+
180
+ # Process media files
181
+ media_paths = [item["media_path"] for item in meta_data]
182
+ reference_paths = {rel_path: str(media_path_to_reference_path(Path(rel_path))) for rel_path in media_paths}
183
+
184
+ with progress:
185
+ task = progress.add_task("Computing condition on videos", total=len(media_to_process))
186
+
187
+ for media_file in media_to_process:
188
+ progress.update(task, description=f"Processing [bold blue]{media_file.name}[/]")
189
+
190
+ rel_path = str(media_file.resolve().relative_to(base_dir))
191
+ reference_path = media_path_to_reference_path(media_file)
192
+ reference_paths[rel_path] = str(reference_path.relative_to(base_dir))
193
+
194
+ if not reference_path.resolve().exists() or override:
195
+ try:
196
+ video, fps = read_video(media_file)
197
+
198
+ # Process frames in batches
199
+ condition_frames = []
200
+
201
+ for i in range(0, len(video), batch_size):
202
+ batch = video[i : i + batch_size]
203
+ condition_batch = compute_reference(batch)
204
+ condition_frames.append(condition_batch)
205
+
206
+ # Concatenate all edge frames
207
+ all_condition = torch.cat(condition_frames, dim=0)
208
+
209
+ # Save the edge video
210
+ save_video(all_condition, reference_path.resolve(), fps=fps)
211
+
212
+ except Exception as e:
213
+ console.print(f"[bold red]Error processing [bold blue]{media_file}[/]: {e}[/]")
214
+ reference_paths.pop(rel_path)
215
+ else:
216
+ skipped_media.append(media_file)
217
+
218
+ progress.advance(task)
219
+
220
+ # Save results
221
+ _save_dataset_json(reference_paths, output_path)
222
+
223
+ # Print summary
224
+ total_to_process = len(media_files) - len(skipped_media)
225
+ console.print(
226
+ f"[bold green]✓[/] Processed [bold]{total_to_process}/{len(media_files)}[/] media successfully.",
227
+ )
228
+
229
+
230
+ app = typer.Typer(
231
+ pretty_exceptions_enable=False,
232
+ no_args_is_help=True,
233
+ help="Compute reference videos for IC-LoRA training.",
234
+ )
235
+
236
+
237
+ @app.command()
238
+ def main(
239
+ input_path: Path = typer.Argument( # noqa: B008
240
+ ...,
241
+ help="Path to input video/image file or directory containing media files",
242
+ exists=True,
243
+ ),
244
+ output: Path = typer.Option(  # noqa: B008
245
+ ...,
246
+ "--output",
247
+ "-o",
248
+ help="Path to json output file for reference video paths. "
249
+ "This is also the input file for the dataset, the output of compute_captions.py.",
250
+ ),
251
+ override: bool = typer.Option(
252
+ False,
253
+ "--override",
254
+ help="Whether to override existing reference video files",
255
+ ),
256
+ batch_size: int = typer.Option(
257
+ 100,
258
+ "--batch-size",
259
+ help="Batch size for processing videos",
260
+ ),
261
+ ) -> None:
262
+ """Compute reference videos for IC-LoRA training.
263
+ This script generates reference videos (e.g., Canny edge maps) for given videos.
264
+ The paths in the output file will be relative to the output file's directory.
265
+ Examples:
266
+ # Process all videos in a directory
267
+ compute_reference.py videos_dir/ -o videos_dir/captions.json
268
+ """
269
+
270
+ # Ensure output path is absolute
271
+ output = Path(output).resolve()
272
+ console.print(f"Output will be saved to [bold blue]{output}[/]")
273
+
274
+ # Verify output path exists
275
+ if not output.exists():
276
+ raise FileNotFoundError(f"Output file does not exist: {output}. This is also the input file for the dataset.")
277
+
278
+ # Process media files
279
+ process_media(
280
+ input_path=input_path,
281
+ output_path=output,
282
+ override=override,
283
+ batch_size=batch_size,
284
+ )
285
+
286
+
287
+ if __name__ == "__main__":
288
+ app()
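A minimal sketch of how this script extends each dataset entry (the `_reference` naming follows `media_path_to_reference_path` above; file names are illustrative):

    # Entry as written by caption_videos.py:
    entry = {"caption": "A dog runs across a grassy field.", "media_path": "clips/dog.mp4"}

    # compute_reference.py saves a Canny-edge video next to the source clip and records its
    # relative path on the same entry before rewriting the dataset JSON in place:
    entry["reference_path"] = "clips/dog_reference.mp4"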
packages/ltx-trainer/scripts/decode_latents.py ADDED
@@ -0,0 +1,369 @@
1
+ #!/usr/bin/env python3
2
+
3
+ """
4
+ Decode precomputed video latents back into videos using the VAE.
5
+ This script loads latent files saved during preprocessing and decodes them
6
+ back into video clips using the same VAE model.
7
+ Basic usage:
8
+ python scripts/decode_latents.py /path/to/latents/dir /path/to/output \
9
+ --model-path /path/to/ltx2.safetensors
10
+ """
11
+
12
+ from pathlib import Path
13
+
14
+ import torch
15
+ import torchaudio
16
+ import torchvision.utils
17
+ import typer
18
+ from einops import rearrange
19
+ from rich.console import Console
20
+ from rich.progress import (
21
+ BarColumn,
22
+ MofNCompleteColumn,
23
+ Progress,
24
+ SpinnerColumn,
25
+ TextColumn,
26
+ TimeElapsedColumn,
27
+ TimeRemainingColumn,
28
+ )
29
+ from transformers.utils.logging import disable_progress_bar
30
+
31
+ from ltx_core.model.video_vae import SpatialTilingConfig, TemporalTilingConfig, TilingConfig
32
+ from ltx_trainer import logger
33
+ from ltx_trainer.model_loader import load_audio_vae_decoder, load_video_vae_decoder, load_vocoder
34
+ from ltx_trainer.video_utils import save_video
35
+
36
+ DEFAULT_TILE_SIZE_PIXELS = 512 # Spatial tile size in pixels (must be ≥64 and divisible by 32)
37
+ DEFAULT_TILE_OVERLAP_PIXELS = 128 # Spatial tile overlap in pixels (must be divisible by 32)
38
+ DEFAULT_TILE_SIZE_FRAMES = 128 # Temporal tile size in frames (must be ≥16 and divisible by 8)
39
+ DEFAULT_TILE_OVERLAP_FRAMES = 24 # Temporal tile overlap in frames (must be divisible by 8)
40
+
41
+ disable_progress_bar()
42
+ console = Console()
43
+ app = typer.Typer(
44
+ pretty_exceptions_enable=False,
45
+ no_args_is_help=True,
46
+ help="Decode precomputed video latents back into videos using the VAE.",
47
+ )
48
+
49
+
50
+ class LatentsDecoder:
51
+ def __init__(
52
+ self,
53
+ model_path: str,
54
+ device: str = "cuda",
55
+ vae_tiling: bool = False,
56
+ with_audio: bool = False,
57
+ ):
58
+ """Initialize the decoder with model configuration.
59
+ Args:
60
+ model_path: Path to LTX-2 checkpoint (.safetensors)
61
+ device: Device to use for computation
62
+ vae_tiling: Whether to enable VAE tiling for larger video resolutions
63
+ with_audio: Whether to load audio VAE for audio decoding
64
+ """
65
+ self.device = torch.device(device)
66
+ self.model_path = model_path
67
+ self.vae = None
68
+ self.audio_vae = None
69
+ self.vocoder = None
70
+ self.vae_tiling = vae_tiling
71
+
72
+ self._load_model(model_path, with_audio)
73
+
74
+ def _load_model(self, model_path: str, with_audio: bool = False) -> None:
75
+ """Initialize and load the VAE model(s)."""
76
+ with console.status(f"[bold]Loading video VAE decoder from {model_path}...", spinner="dots"):
77
+ self.vae = load_video_vae_decoder(model_path, device=self.device, dtype=torch.bfloat16)
78
+
79
+ if with_audio:
80
+ with console.status(f"[bold]Loading audio VAE decoder from {model_path}...", spinner="dots"):
81
+ self.audio_vae = load_audio_vae_decoder(model_path, device=self.device, dtype=torch.bfloat16)
82
+
83
+ with console.status(f"[bold]Loading vocoder from {model_path}...", spinner="dots"):
84
+ self.vocoder = load_vocoder(model_path, device=self.device)
85
+
86
+ @torch.inference_mode()
87
+ def decode(self, latents_dir: Path, output_dir: Path, seed: int | None = None) -> None:
88
+ """Decode all latent files in the directory recursively.
89
+ Args:
90
+ latents_dir: Directory containing latent files (.pt)
91
+ output_dir: Directory to save decoded videos
92
+ seed: Optional random seed for noise generation
93
+ """
94
+ # Find all .pt files recursively
95
+ latent_files = list(latents_dir.rglob("*.pt"))
96
+
97
+ if not latent_files:
98
+ logger.warning(f"No .pt files found in {latents_dir}")
99
+ return
100
+
101
+ logger.info(f"Found {len(latent_files):,} latent files to decode")
102
+
103
+ # Process files with progress bar
104
+ with Progress(
105
+ SpinnerColumn(),
106
+ TextColumn("[progress.description]{task.description}"),
107
+ BarColumn(),
108
+ MofNCompleteColumn(),
109
+ TimeElapsedColumn(),
110
+ TimeRemainingColumn(),
111
+ console=console,
112
+ ) as progress:
113
+ task = progress.add_task("Decoding latents", total=len(latent_files))
114
+
115
+ for latent_file in latent_files:
116
+ # Calculate relative path to maintain directory structure
117
+ rel_path = latent_file.relative_to(latents_dir)
118
+ output_subdir = output_dir / rel_path.parent
119
+ output_subdir.mkdir(parents=True, exist_ok=True)
120
+
121
+ try:
122
+ self._process_file(latent_file, output_subdir, seed)
123
+ except Exception as e:
124
+ logger.error(f"Error processing {latent_file}: {e}")
125
+ continue
126
+
127
+ progress.advance(task)
128
+
129
+ logger.info(f"Decoding complete! Videos saved to {output_dir}")
130
+
131
+ @torch.inference_mode()
132
+ def decode_audio(self, latents_dir: Path, output_dir: Path) -> None:
133
+ """Decode all audio latent files in the directory recursively.
134
+ Args:
135
+ latents_dir: Directory containing audio latent files (.pt)
136
+ output_dir: Directory to save decoded audio files
137
+ """
138
+ # Check if audio VAE is loaded
139
+ if self.audio_vae is None or self.vocoder is None:
140
+ logger.warning("Audio VAE or vocoder not loaded. Skipping audio decoding.")
141
+ return
142
+
143
+ # Find all .pt files recursively
144
+ latent_files = list(latents_dir.rglob("*.pt"))
145
+
146
+ if not latent_files:
147
+ logger.warning(f"No .pt files found in {latents_dir}")
148
+ return
149
+
150
+ logger.info(f"Found {len(latent_files):,} audio latent files to decode")
151
+
152
+ # Process files with progress bar
153
+ with Progress(
154
+ SpinnerColumn(),
155
+ TextColumn("[progress.description]{task.description}"),
156
+ BarColumn(),
157
+ MofNCompleteColumn(),
158
+ TimeElapsedColumn(),
159
+ TimeRemainingColumn(),
160
+ console=console,
161
+ ) as progress:
162
+ task = progress.add_task("Decoding audio latents", total=len(latent_files))
163
+
164
+ for latent_file in latent_files:
165
+ # Calculate relative path to maintain directory structure
166
+ rel_path = latent_file.relative_to(latents_dir)
167
+ output_subdir = output_dir / rel_path.parent
168
+ output_subdir.mkdir(parents=True, exist_ok=True)
169
+
170
+ try:
171
+ self._process_audio_file(latent_file, output_subdir)
172
+ except Exception as e:
173
+ logger.error(f"Error processing audio {latent_file}: {e}")
174
+ continue
175
+
176
+ progress.advance(task)
177
+
178
+ logger.info(f"Audio decoding complete! Audio files saved to {output_dir}")
179
+
180
+ def _process_file(self, latent_file: Path, output_dir: Path, seed: int | None) -> None:
181
+ """Process a single latent file."""
182
+ # Load the latent data
183
+ data = torch.load(latent_file, map_location=self.device, weights_only=False)
184
+
185
+ # Get latents - handle both old patchified [seq_len, C] and new [C, F, H, W] formats
186
+ latents = data["latents"]
187
+ num_frames = data["num_frames"]
188
+ height = data["height"]
189
+ width = data["width"]
190
+
191
+ # Check if latents need reshaping (old patchified format)
192
+ if latents.dim() == 2:
193
+ # Old format: [seq_len, C] -> reshape to [C, F, H, W]
194
+ latents = rearrange(latents, "(f h w) c -> c f h w", f=num_frames, h=height, w=width)
195
+
196
+ # Add batch dimension: [C, F, H, W] -> [1, C, F, H, W]
197
+ latents = latents.unsqueeze(0).to(device=self.device, dtype=torch.bfloat16)
198
+
199
+ # Create generator only if seed is provided
200
+ generator = None
201
+ if seed is not None:
202
+ generator = torch.Generator(device=self.device)
203
+ generator.manual_seed(seed)
204
+
205
+ # Decode the video
206
+ video = self._decode_video(latents, generator)
207
+
208
+ # Determine output format and save
209
+ is_image = video.shape[0] == 1
210
+ if is_image:
211
+ # Save as PNG for single frame
212
+ output_path = output_dir / f"{latent_file.stem}.png"
213
+ torchvision.utils.save_image(
214
+ video[0], # [C, H, W] in [0, 1]
215
+ str(output_path),
216
+ )
217
+ else:
218
+ # Save as MP4 for video using PyAV-based save_video
219
+ output_path = output_dir / f"{latent_file.stem}.mp4"
220
+ fps = data.get("fps", 24) # Use stored FPS or default to 24
221
+ save_video(
222
+ video_tensor=video, # [F, C, H, W] in [0, 1]
223
+ output_path=output_path,
224
+ fps=fps,
225
+ )
226
+
227
+ def _decode_video(self, latents: torch.Tensor, generator: torch.Generator | None = None) -> torch.Tensor:
228
+ """Decode latents to video frames."""
229
+ if self.vae_tiling:
230
+ # Use tiled decoding for reduced VRAM
231
+ tiling_config = TilingConfig(
232
+ spatial_config=SpatialTilingConfig(
233
+ tile_size_in_pixels=DEFAULT_TILE_SIZE_PIXELS,
234
+ tile_overlap_in_pixels=DEFAULT_TILE_OVERLAP_PIXELS,
235
+ ),
236
+ temporal_config=TemporalTilingConfig(
237
+ tile_size_in_frames=DEFAULT_TILE_SIZE_FRAMES,
238
+ tile_overlap_in_frames=DEFAULT_TILE_OVERLAP_FRAMES,
239
+ ),
240
+ )
241
+ chunks = list(
242
+ self.vae.tiled_decode(
243
+ latents,
244
+ tiling_config=tiling_config,
245
+ generator=generator,
246
+ )
247
+ )
248
+ # Concatenate along temporal dimension
249
+ video = torch.cat(chunks, dim=2) # [B, C, F, H, W]
250
+ else:
251
+ # Standard full decoding
252
+ video = self.vae(latents, generator=generator) # [B, C, F, H, W]
253
+
254
+ # Convert to [F, C, H, W] format and normalize to [0, 1]
255
+ video = rearrange(video, "1 c f h w -> f c h w")
256
+ video = (video + 1) / 2 # Denormalize from [-1, 1] to [0, 1]
257
+ video = video.clamp(0, 1)
258
+
259
+ return video
260
+
261
+ def _process_audio_file(self, latent_file: Path, output_dir: Path) -> None:
262
+ """Process a single audio latent file."""
263
+ # Load the latent data
264
+ data = torch.load(latent_file, map_location=self.device, weights_only=False)
265
+
266
+ latents = data["latents"].to(device=self.device, dtype=torch.float32)
267
+ num_time_steps = data["num_time_steps"]
268
+ freq_bins = data["frequency_bins"]
269
+
270
+ # Handle both old patchified [seq_len, C] and new [C, T, F] formats
271
+ if latents.dim() == 2:
272
+ # Old format: [seq_len, channels] where seq_len = time * freq
273
+ # Reshape to [C, T, F]
274
+ latents = rearrange(latents, "(t f) c -> c t f", t=num_time_steps, f=freq_bins)
275
+
276
+ # Add batch dimension: [C, T, F] -> [1, C, T, F]
277
+ latents = latents.unsqueeze(0)
278
+
279
+ # Set correct dtype for audio VAE
280
+ latents = latents.to(dtype=torch.bfloat16)
281
+
282
+ # Decode audio using audio VAE decoder (produces mel spectrogram)
283
+ mel_spectrogram = self.audio_vae(latents)
284
+
285
+ # Convert mel spectrogram to waveform using vocoder
286
+ waveform = self.vocoder(mel_spectrogram)
287
+
288
+ # Save as WAV
289
+ output_path = output_dir / f"{latent_file.stem}.wav"
290
+ sample_rate = self.vocoder.output_sampling_rate
291
+ torchaudio.save(str(output_path), waveform[0].cpu(), sample_rate)
292
+
293
+
294
+ @app.command()
295
+ def main(
296
+ latents_dir: str = typer.Argument(
297
+ ...,
298
+ help="Directory containing the precomputed latent files (searched recursively)",
299
+ ),
300
+ output_dir: str = typer.Argument(
301
+ ...,
302
+ help="Directory to save the decoded videos (maintains same folder hierarchy as input)",
303
+ ),
304
+ model_path: str = typer.Option(
305
+ ...,
306
+ help="Path to LTX-2 checkpoint (.safetensors file)",
307
+ ),
308
+ device: str = typer.Option(
309
+ default="cuda",
310
+ help="Device to use for computation",
311
+ ),
312
+ vae_tiling: bool = typer.Option(
313
+ default=False,
314
+ help="Enable VAE tiling for larger video resolutions",
315
+ ),
316
+ seed: int | None = typer.Option(
317
+ default=None,
318
+ help="Random seed for noise generation during decoding",
319
+ ),
320
+ with_audio: bool = typer.Option(
321
+ default=False,
322
+ help="Also decode audio latents (requires audio_latents directory)",
323
+ ),
324
+ audio_latents_dir: str | None = typer.Option(
325
+ default=None,
326
+ help="Directory containing audio latent files (defaults to 'audio_latents' sibling of latents_dir)",
327
+ ),
328
+ ) -> None:
329
+ """Decode precomputed video latents back into videos using the VAE.
330
+ This script recursively searches for .pt latent files in the input directory
331
+ and decodes them to videos, maintaining the same folder hierarchy in the output.
332
+ Examples:
333
+ # Basic usage
334
+ python scripts/decode_latents.py /path/to/latents /path/to/videos \\
335
+ --model-path /path/to/ltx2.safetensors
336
+ # With VAE tiling for large videos
337
+ python scripts/decode_latents.py /path/to/latents /path/to/videos \\
338
+ --model-path /path/to/ltx2.safetensors --vae-tiling
339
+ # With audio decoding
340
+ python scripts/decode_latents.py /path/to/latents /path/to/videos \\
341
+ --model-path /path/to/ltx2.safetensors --with-audio
342
+ """
343
+ latents_path = Path(latents_dir)
344
+ output_path = Path(output_dir)
345
+
346
+ if not latents_path.exists() or not latents_path.is_dir():
347
+ raise typer.BadParameter(f"Latents directory does not exist: {latents_path}")
348
+
349
+ decoder = LatentsDecoder(
350
+ model_path=model_path,
351
+ device=device,
352
+ vae_tiling=vae_tiling,
353
+ with_audio=with_audio,
354
+ )
355
+ decoder.decode(latents_path, output_path, seed=seed)
356
+
357
+ # Decode audio if requested
358
+ if with_audio:
359
+ audio_path = Path(audio_latents_dir) if audio_latents_dir else latents_path.parent / "audio_latents"
360
+
361
+ if audio_path.exists():
362
+ audio_output_path = output_path.parent / "decoded_audio"
363
+ decoder.decode_audio(audio_path, audio_output_path)
364
+ else:
365
+ logger.warning(f"Audio latents directory not found: {audio_path}")
366
+
367
+
368
+ if __name__ == "__main__":
369
+ app()
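A minimal sketch of the .pt payloads this decoder expects (keys follow `_process_file` and `_process_audio_file` above; shapes, channel counts, and file names are illustrative and depend on the preprocessing step that produced the latents):

    import torch

    # Video latent file: new [C, F, H, W] layout (the old patchified [seq_len, C] layout is also handled)
    video_payload = {
        "latents": torch.randn(128, 8, 16, 16),  # [C, F, H, W]
        "num_frames": 8,
        "height": 16,
        "width": 16,
        "fps": 24,  # optional; the decoder falls back to 24 when missing
    }
    torch.save(video_payload, "clip_0001.pt")

    # Audio latent file: [C, T, F] layout (old [seq_len, C] is also handled)
    audio_payload = {
        "latents": torch.randn(8, 32, 16),  # [C, T, F]
        "num_time_steps": 32,
        "frequency_bins": 16,
    }
    torch.save(audio_payload, "clip_0001_audio.pt")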
packages/ltx-trainer/scripts/process_captions.py ADDED
@@ -0,0 +1,435 @@
1
+ #!/usr/bin/env python
2
+
3
+ """
4
+ Compute text embeddings for video generation training.
5
+ This module provides functionality for processing text captions, including:
6
+ - Loading captions from various file formats (CSV, JSON, JSONL)
7
+ - Cleaning and preprocessing text (removing LLM prefixes, adding ID tokens)
8
+ - CaptionsDataset for caption-only preprocessing workflows
9
+ Can be used as a standalone script:
10
+ python scripts/process_captions.py dataset.json --output-dir /path/to/output \
11
+ --model-path /path/to/ltx2.safetensors --text-encoder-path /path/to/gemma
12
+ """
13
+
14
+ import json
15
+ import os
16
+ from pathlib import Path
17
+ from typing import Any
18
+
19
+ import pandas as pd
20
+ import torch
21
+ import typer
22
+ from rich.console import Console
23
+ from rich.progress import (
24
+ BarColumn,
25
+ MofNCompleteColumn,
26
+ Progress,
27
+ SpinnerColumn,
28
+ TaskProgressColumn,
29
+ TextColumn,
30
+ TimeElapsedColumn,
31
+ TimeRemainingColumn,
32
+ )
33
+ from torch.utils.data import DataLoader, Dataset
34
+ from transformers.utils.logging import disable_progress_bar
35
+
36
+ from ltx_trainer import logger
37
+ from ltx_trainer.model_loader import load_embeddings_processor, load_text_encoder
38
+
39
+ # Disable tokenizers parallelism to avoid warnings
40
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
41
+
42
+ disable_progress_bar()
43
+
44
+ # Common phrases that LLMs often add to captions that we might want to remove
45
+ COMMON_BEGINNING_PHRASES: tuple[str, ...] = (
46
+ "This video",
47
+ "The video",
48
+ "This clip",
49
+ "The clip",
50
+ "The animation",
51
+ "This image",
52
+ "The image",
53
+ "This picture",
54
+ "The picture",
55
+ )
56
+
57
+ COMMON_CONTINUATION_WORDS: tuple[str, ...] = (
58
+ "shows",
59
+ "depicts",
60
+ "features",
61
+ "captures",
62
+ "highlights",
63
+ "introduces",
64
+ "presents",
65
+ )
66
+
67
+ COMMON_LLM_START_PHRASES: tuple[str, ...] = (
68
+ "In the video,",
69
+ "In this video,",
70
+ "In this video clip,",
71
+ "In the clip,",
72
+ "Caption:",
73
+ *(
74
+ f"{beginning} {continuation}"
75
+ for beginning in COMMON_BEGINNING_PHRASES
76
+ for continuation in COMMON_CONTINUATION_WORDS
77
+ ),
78
+ )
79
+
80
+ app = typer.Typer(
81
+ pretty_exceptions_enable=False,
82
+ no_args_is_help=True,
83
+ help="Process text captions and save embeddings for video generation training.",
84
+ )
85
+
86
+
87
+ class CaptionsDataset(Dataset):
88
+ """
89
+ Dataset for processing text captions only.
90
+ This dataset is designed for caption preprocessing workflows where you only need
91
+ to process text without loading videos. Useful for:
92
+ - Precomputing text embeddings
93
+ - Caption cleaning and preprocessing
94
+ - Text-only preprocessing pipelines
95
+ """
96
+
97
+ def __init__(
98
+ self,
99
+ dataset_file: str | Path,
100
+ caption_column: str,
101
+ media_column: str = "media_path",
102
+ lora_trigger: str | None = None,
103
+ remove_llm_prefixes: bool = False,
104
+ ) -> None:
105
+ """
106
+ Initialize the captions dataset.
107
+ Args:
108
+ dataset_file: Path to CSV/JSON/JSONL metadata file
109
+ caption_column: Column name for captions in the metadata file
110
+ media_column: Column name for media paths (used for output naming)
111
+ lora_trigger: Optional trigger word to prepend to each caption
112
+ remove_llm_prefixes: Whether to remove common LLM-generated prefixes
113
+ """
114
+ super().__init__()
115
+
116
+ self.dataset_file = Path(dataset_file)
117
+ self.caption_column = caption_column
118
+ self.media_column = media_column
119
+ self.lora_trigger = f"{lora_trigger.strip()} " if lora_trigger else ""
120
+
121
+ # Load captions with their corresponding output embedding paths
122
+ self.caption_data = self._load_caption_data()
123
+
124
+ # Convert to lists for indexing
125
+ self.output_paths = list(self.caption_data.keys())
126
+ self.prompts = list(self.caption_data.values())
127
+
128
+ # Clean LLM start phrases if requested
129
+ if remove_llm_prefixes:
130
+ self._clean_llm_prefixes()
131
+
132
+ def __len__(self) -> int:
133
+ return len(self.prompts)
134
+
135
+ def __getitem__(self, index: int) -> dict[str, Any]:
136
+ """Get a single caption with optional trigger word prepended and output path."""
137
+ prompt = self.lora_trigger + self.prompts[index]
138
+ return {
139
+ "prompt": prompt,
140
+ "output_path": self.output_paths[index],
141
+ "index": index,
142
+ }
143
+
144
+ def _load_caption_data(self) -> dict[str, str]:
145
+ """Load captions and compute their output embedding paths."""
146
+ if self.dataset_file.suffix == ".csv":
147
+ return self._load_caption_data_from_csv()
148
+ elif self.dataset_file.suffix == ".json":
149
+ return self._load_caption_data_from_json()
150
+ elif self.dataset_file.suffix == ".jsonl":
151
+ return self._load_caption_data_from_jsonl()
152
+ else:
153
+ raise ValueError("Expected `dataset_file` to be a path to a CSV, JSON, or JSONL file.")
154
+
155
+ def _load_caption_data_from_csv(self) -> dict[str, str]:
156
+ """Load captions from a CSV file and compute output embedding paths."""
157
+ df = pd.read_csv(self.dataset_file)
158
+
159
+ if self.caption_column not in df.columns:
160
+ raise ValueError(f"Column '{self.caption_column}' not found in CSV file")
161
+ if self.media_column not in df.columns:
162
+ raise ValueError(f"Column '{self.media_column}' not found in CSV file")
163
+
164
+ caption_data = {}
165
+ for _, row in df.iterrows():
166
+ media_path = Path(row[self.media_column].strip())
167
+ # Convert media path to embedding output path (same structure, .pt extension)
168
+ output_path = str(media_path.with_suffix(".pt"))
169
+ caption_data[output_path] = row[self.caption_column]
170
+
171
+ return caption_data
172
+
173
+ def _load_caption_data_from_json(self) -> dict[str, str]:
174
+ """Load captions from a JSON file and compute output embedding paths."""
175
+ with open(self.dataset_file, "r", encoding="utf-8") as file:
176
+ data = json.load(file)
177
+
178
+ if not isinstance(data, list):
179
+ raise ValueError("JSON file must contain a list of objects")
180
+
181
+ caption_data = {}
182
+ for entry in data:
183
+ if self.caption_column not in entry:
184
+ raise ValueError(f"Key '{self.caption_column}' not found in JSON entry: {entry}")
185
+ if self.media_column not in entry:
186
+ raise ValueError(f"Key '{self.media_column}' not found in JSON entry: {entry}")
187
+
188
+ media_path = Path(entry[self.media_column].strip())
189
+ # Convert media path to embedding output path (same structure, .pt extension)
190
+ output_path = str(media_path.with_suffix(".pt"))
191
+ caption_data[output_path] = entry[self.caption_column]
192
+
193
+ return caption_data
194
+
195
+ def _load_caption_data_from_jsonl(self) -> dict[str, str]:
196
+ """Load captions from a JSONL file and compute output embedding paths."""
197
+ caption_data = {}
198
+ with open(self.dataset_file, "r", encoding="utf-8") as file:
199
+ for line in file:
200
+ entry = json.loads(line)
201
+ if self.caption_column not in entry:
202
+ raise ValueError(f"Key '{self.caption_column}' not found in JSONL entry: {entry}")
203
+ if self.media_column not in entry:
204
+ raise ValueError(f"Key '{self.media_column}' not found in JSONL entry: {entry}")
205
+
206
+ media_path = Path(entry[self.media_column].strip())
207
+ # Convert media path to embedding output path (same structure, .pt extension)
208
+ output_path = str(media_path.with_suffix(".pt"))
209
+ caption_data[output_path] = entry[self.caption_column]
210
+
211
+ return caption_data
212
+
213
+ def _clean_llm_prefixes(self) -> None:
214
+ """Remove common LLM-generated prefixes from captions."""
215
+ for i in range(len(self.prompts)):
216
+ self.prompts[i] = self.prompts[i].strip()
217
+ for phrase in COMMON_LLM_START_PHRASES:
218
+ if self.prompts[i].startswith(phrase):
219
+ self.prompts[i] = self.prompts[i].removeprefix(phrase).strip()
220
+ break
221
+
222
+
223
+ def compute_captions_embeddings( # noqa: PLR0913
224
+ dataset_file: str | Path,
225
+ output_dir: str,
226
+ model_path: str,
227
+ text_encoder_path: str,
228
+ caption_column: str = "caption",
229
+ media_column: str = "media_path",
230
+ lora_trigger: str | None = None,
231
+ remove_llm_prefixes: bool = False,
232
+ batch_size: int = 8,
233
+ device: str = "cuda",
234
+ load_in_8bit: bool = False,
235
+ ) -> None:
236
+ """
237
+ Process captions and save text embeddings.
238
+ Args:
239
+ dataset_file: Path to metadata file (CSV/JSON/JSONL) containing captions and media paths
240
+ output_dir: Directory to save embeddings
241
+ model_path: Path to LTX-2 checkpoint (.safetensors)
242
+ text_encoder_path: Path to Gemma text encoder directory
243
+ caption_column: Column name containing captions in the metadata file
244
+ media_column: Column name containing media paths (used for output naming)
245
+ lora_trigger: Optional trigger word to prepend to each caption
246
+ remove_llm_prefixes: Whether to remove common LLM-generated prefixes
247
+ batch_size: Batch size for processing
248
+ device: Device to use for computation
249
+ load_in_8bit: Whether to load the Gemma text encoder in 8-bit precision
250
+ """
251
+
252
+ console = Console()
253
+
254
+ # Create dataset
255
+ dataset = CaptionsDataset(
256
+ dataset_file=dataset_file,
257
+ caption_column=caption_column,
258
+ media_column=media_column,
259
+ lora_trigger=lora_trigger,
260
+ remove_llm_prefixes=remove_llm_prefixes,
261
+ )
262
+ logger.info(f"Loaded {len(dataset):,} captions")
263
+
264
+ output_path = Path(output_dir)
265
+ output_path.mkdir(parents=True, exist_ok=True)
266
+
267
+ # Load text encoder and embeddings processor
268
+ with console.status("[bold]Loading Gemma text encoder...", spinner="dots"):
269
+ text_encoder = load_text_encoder(
270
+ text_encoder_path,
271
+ device=device,
272
+ dtype=torch.bfloat16,
273
+ load_in_8bit=load_in_8bit,
274
+ )
275
+ embeddings_processor = load_embeddings_processor(
276
+ model_path,
277
+ device=device,
278
+ dtype=torch.bfloat16,
279
+ )
280
+
281
+ logger.info("Text encoder and embeddings processor loaded successfully")
282
+
283
+ # TODO(batch-tokenization): The current Gemma tokenizer doesn't support batched tokenization.
284
+ if batch_size > 1:
285
+ logger.warning(
286
+ "Batch size greater than 1 is not currently supported with the Gemma tokenizer. "
287
+ "Overriding batch_size to 1. This will be fixed in a future update."
288
+ )
289
+ batch_size = 1
290
+
291
+ # Create dataloader
292
+ dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False, num_workers=2)
293
+
294
+ # Process batches
295
+ total_batches = len(dataloader)
296
+ logger.info(f"Processing captions in {total_batches:,} batches...")
297
+
298
+ with Progress(
299
+ SpinnerColumn(),
300
+ TextColumn("[progress.description]{task.description}"),
301
+ BarColumn(),
302
+ TaskProgressColumn(),
303
+ MofNCompleteColumn(),
304
+ TimeElapsedColumn(),
305
+ TimeRemainingColumn(),
306
+ console=console,
307
+ ) as progress:
308
+ task = progress.add_task("Processing captions", total=len(dataloader))
309
+ for batch in dataloader:
310
+ # Encode prompts using text_encoder.encode() + feature_extractor
311
+ # (returns video/audio features before connector).
312
+ # The connector is applied during training via embeddings_processor
313
+ with torch.inference_mode():
314
+ # TODO(batch-tokenization): When tokenizer supports batching, encode all prompts at once.
315
+ # For now, process one at a time:
316
+ for i in range(len(batch["prompt"])):
317
+ hidden_states, prompt_attention_mask = text_encoder.encode(batch["prompt"][i], padding_side="left")
318
+ video_prompt_embeds, audio_prompt_embeds = embeddings_processor.feature_extractor(
319
+ hidden_states, prompt_attention_mask, "left"
320
+ )
321
+
322
+ output_rel_path = Path(batch["output_path"][i])
323
+
324
+ # Create output directory maintaining structure
325
+ output_dir_path = output_path / output_rel_path.parent
326
+ output_dir_path.mkdir(parents=True, exist_ok=True)
327
+
328
+ embedding_data = {
329
+ "video_prompt_embeds": video_prompt_embeds[0].cpu().contiguous(),
330
+ "prompt_attention_mask": prompt_attention_mask[0].cpu().contiguous(),
331
+ }
332
+ if audio_prompt_embeds is not None:
333
+ embedding_data["audio_prompt_embeds"] = audio_prompt_embeds[0].cpu().contiguous()
334
+
335
+ output_file = output_path / output_rel_path
336
+ torch.save(embedding_data, output_file)
337
+
338
+ progress.advance(task)
339
+
340
+ logger.info(f"Processed {len(dataset):,} captions. Embeddings saved to {output_path}")
341
+
342
+
343
+ @app.command()
344
+ def main( # noqa: PLR0913
345
+ dataset_file: str = typer.Argument(
346
+ ...,
347
+ help="Path to metadata file (CSV/JSON/JSONL) containing captions and media paths",
348
+ ),
349
+ output_dir: str = typer.Option(
350
+ ...,
351
+ help="Output directory to save text embeddings",
352
+ ),
353
+ model_path: str = typer.Option(
354
+ ...,
355
+ help="Path to LTX-2 checkpoint (.safetensors file)",
356
+ ),
357
+ text_encoder_path: str = typer.Option(
358
+ ...,
359
+ help="Path to Gemma text encoder directory",
360
+ ),
361
+ caption_column: str = typer.Option(
362
+ default="caption",
363
+ help="Column name containing captions in the dataset JSON/JSONL/CSV file",
364
+ ),
365
+ media_column: str = typer.Option(
366
+ default="media_path",
367
+ help="Column name in the dataset JSON/JSONL/CSV file containing media paths "
368
+ "(used for output file naming and folder structure)",
369
+ ),
370
+ batch_size: int = typer.Option(
371
+ default=8,
372
+ help="Batch size for processing",
373
+ ),
374
+ device: str = typer.Option(
375
+ default="cuda",
376
+ help="Device to use for computation",
377
+ ),
378
+ lora_trigger: str | None = typer.Option(
379
+ default=None,
380
+ help="Optional trigger word to prepend to each caption (activates the LoRA during inference)",
381
+ ),
382
+ remove_llm_prefixes: bool = typer.Option(
383
+ default=False,
384
+ help="Remove common LLM-generated prefixes from captions",
385
+ ),
386
+ load_text_encoder_in_8bit: bool = typer.Option(
387
+ default=False,
388
+ help="Load the Gemma text encoder in 8-bit precision to save GPU memory (requires bitsandbytes)",
389
+ ),
390
+ ) -> None:
391
+ """Process text captions and save embeddings for video generation training.
392
+ This script processes captions from metadata files and saves text embeddings
393
+ that can be used for training video generation models. The output embeddings
394
+ will maintain the same folder structure and naming as the corresponding media files.
395
+ Note: This script is designed for LTX-2 models which use the Gemma text encoder.
396
+ Examples:
397
+ # Process captions with LTX-2 model
398
+ python scripts/process_captions.py dataset.json --output-dir ./embeddings \\
399
+ --model-path /path/to/ltx2_checkpoint.safetensors \\
400
+ --text-encoder-path /path/to/gemma
401
+ # Add a trigger word for LoRA training
402
+ python scripts/process_captions.py dataset.json --output-dir ./embeddings \\
403
+ --model-path /path/to/ltx2.safetensors --text-encoder-path /path/to/gemma \\
404
+ --lora-trigger "mytoken"
405
+ # Remove LLM-generated prefixes from captions
406
+ python scripts/process_captions.py dataset.json --output-dir ./embeddings \\
407
+ --model-path /path/to/ltx2.safetensors --text-encoder-path /path/to/gemma \\
408
+ --remove-llm-prefixes
409
+ """
410
+
411
+ # Validate dataset file
412
+ if not Path(dataset_file).is_file():
413
+ raise typer.BadParameter(f"Dataset file not found: {dataset_file}")
414
+
415
+ if lora_trigger:
416
+ logger.info(f'LoRA trigger word "{lora_trigger}" will be prepended to all captions')
417
+
418
+ # Process embeddings
419
+ compute_captions_embeddings(
420
+ dataset_file=dataset_file,
421
+ output_dir=output_dir,
422
+ model_path=model_path,
423
+ text_encoder_path=text_encoder_path,
424
+ caption_column=caption_column,
425
+ media_column=media_column,
426
+ lora_trigger=lora_trigger,
427
+ remove_llm_prefixes=remove_llm_prefixes,
428
+ batch_size=batch_size,
429
+ device=device,
430
+ load_in_8bit=load_text_encoder_in_8bit,
431
+ )
432
+
433
+
434
+ if __name__ == "__main__":
435
+ app()
packages/ltx-trainer/scripts/process_dataset.py ADDED
@@ -0,0 +1,317 @@
1
+ #!/usr/bin/env python3
2
+
3
+ """
4
+ Preprocess a video dataset by computing video clips latents and text captions embeddings.
5
+ This script provides a command-line interface for preprocessing video datasets by computing
6
+ latent representations of video clips and text embeddings of their captions. The preprocessed
7
+ data can be used to accelerate training of video generation models and to save GPU memory.
8
+ Basic usage:
9
+ python scripts/process_dataset.py /path/to/dataset.json --resolution-buckets 768x768x49 \
10
+ --model-path /path/to/ltx2.safetensors --text-encoder-path /path/to/gemma
11
+ The dataset must be a CSV, JSON, or JSONL file with columns for captions and video paths.
12
+ """
13
+
14
+ from pathlib import Path
15
+
16
+ import typer
17
+ from decode_latents import LatentsDecoder
18
+ from process_captions import compute_captions_embeddings
19
+ from process_videos import compute_latents, compute_scaled_resolution_buckets, parse_resolution_buckets
20
+ from rich.console import Console
21
+
22
+ from ltx_trainer import logger
23
+ from ltx_trainer.gpu_utils import free_gpu_memory_context
24
+
25
+ console = Console()
26
+
27
+ app = typer.Typer(
28
+ pretty_exceptions_enable=False,
29
+ no_args_is_help=True,
30
+ help="Preprocess a video dataset by computing video clips latents and text captions embeddings. "
31
+ "The dataset must be a CSV, JSON, or JSONL file with columns for captions and video paths.",
32
+ )
33
+
34
+
35
+ def preprocess_dataset( # noqa: PLR0913
36
+ dataset_file: str,
37
+ caption_column: str,
38
+ video_column: str,
39
+ resolution_buckets: list[tuple[int, int, int]],
40
+ batch_size: int,
41
+ output_dir: str | None,
42
+ lora_trigger: str | None,
43
+ vae_tiling: bool,
44
+ decode: bool,
45
+ model_path: str,
46
+ text_encoder_path: str,
47
+ device: str,
48
+ remove_llm_prefixes: bool = False,
49
+ reference_column: str | None = None,
50
+ reference_downscale_factor: int = 1,
51
+ with_audio: bool = False,
52
+ load_text_encoder_in_8bit: bool = False,
53
+ ) -> None:
54
+ """Run the preprocessing pipeline with the given arguments."""
55
+ # Validate dataset file
56
+ _validate_dataset_file(dataset_file)
57
+
58
+ # Set up output directories
59
+ output_base = Path(output_dir) if output_dir else Path(dataset_file).parent / ".precomputed"
60
+ conditions_dir = output_base / "conditions"
61
+ latents_dir = output_base / "latents"
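+ # Illustrative layout of the preprocessed outputs under output_base (directory names are taken from
+ # the code in this script; some directories only appear depending on the flags used):
+ #   <output_base>/
+ #     conditions/           text embeddings for the captions
+ #     latents/              video latents
+ #     audio_latents/        only with --with-audio
+ #     reference_latents/    only with --reference-column
+ #     decoded_videos/       only with --decode (plus decoded_reference_videos/ and decoded_audio/)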
62
+
63
+ if lora_trigger:
64
+ logger.info(f'LoRA trigger word "{lora_trigger}" will be prepended to all captions')
65
+
66
+ with free_gpu_memory_context():
67
+ # Process captions using the dedicated function
68
+ compute_captions_embeddings(
69
+ dataset_file=dataset_file,
70
+ output_dir=str(conditions_dir),
71
+ model_path=model_path,
72
+ text_encoder_path=text_encoder_path,
73
+ caption_column=caption_column,
74
+ media_column=video_column,
75
+ lora_trigger=lora_trigger,
76
+ remove_llm_prefixes=remove_llm_prefixes,
77
+ batch_size=batch_size,
78
+ device=device,
79
+ load_in_8bit=load_text_encoder_in_8bit,
80
+ )
81
+
82
+ # Process videos using the dedicated function
83
+ audio_latents_dir = None
84
+ if with_audio:
85
+ logger.info("Audio preprocessing enabled - will extract and encode audio from videos")
86
+ audio_latents_dir = output_base / "audio_latents"
87
+
88
+ with free_gpu_memory_context():
89
+ compute_latents(
90
+ dataset_file=dataset_file,
91
+ video_column=video_column,
92
+ resolution_buckets=resolution_buckets,
93
+ output_dir=str(latents_dir),
94
+ model_path=model_path,
95
+ batch_size=batch_size,
96
+ device=device,
97
+ vae_tiling=vae_tiling,
98
+ with_audio=with_audio,
99
+ audio_output_dir=str(audio_latents_dir) if audio_latents_dir else None,
100
+ )
101
+
102
+ # Process reference videos if reference_column is provided
103
+ if reference_column:
104
+ # Validate: scaled references with multiple buckets can cause ambiguous bucket matching
105
+ if reference_downscale_factor > 1 and len(resolution_buckets) > 1:
106
+ raise ValueError(
107
+ "When using --reference-downscale-factor > 1, only a single resolution bucket is supported. "
108
+ "Using multiple buckets with scaled references can cause ambiguous bucket matching "
109
+ "(e.g., a 512x256 reference could match either the scaled-down 1024x512 bucket or the 512x256 "
110
+ "bucket). Please use a single resolution bucket or set --reference-downscale-factor to 1."
111
+ )
112
+
113
+ # Calculate and validate scaled resolution buckets for reference videos
114
+ reference_buckets = compute_scaled_resolution_buckets(resolution_buckets, reference_downscale_factor)
115
+
116
+ if reference_downscale_factor > 1:
117
+ logger.info(
118
+ f"Processing reference videos for IC-LoRA training at 1/{reference_downscale_factor} resolution..."
119
+ )
120
+ logger.info(f"Reference resolution buckets: {reference_buckets}")
121
+ else:
122
+ logger.info("Processing reference videos for IC-LoRA training...")
123
+
124
+ reference_latents_dir = output_base / "reference_latents"
125
+
126
+ compute_latents(
127
+ dataset_file=dataset_file,
128
+ main_media_column=video_column,
129
+ video_column=reference_column,
130
+ resolution_buckets=reference_buckets,
131
+ output_dir=str(reference_latents_dir),
132
+ model_path=model_path,
133
+ batch_size=batch_size,
134
+ device=device,
135
+ vae_tiling=vae_tiling,
136
+ )
137
+
138
+ # Handle decoding if requested (for verification)
139
+ if decode:
140
+ logger.info("Decoding latents for verification...")
141
+
142
+ decoder = LatentsDecoder(
143
+ model_path=model_path,
144
+ device=device,
145
+ vae_tiling=vae_tiling,
146
+ with_audio=with_audio,
147
+ )
148
+ decoder.decode(latents_dir, output_base / "decoded_videos")
149
+
150
+ # Also decode reference videos if they exist
151
+ if reference_column:
152
+ reference_latents_dir = output_base / "reference_latents"
153
+ if reference_latents_dir.exists():
154
+ logger.info("Decoding reference videos...")
155
+ decoder.decode(reference_latents_dir, output_base / "decoded_reference_videos")
156
+
157
+ # Decode audio latents if they exist
158
+ if with_audio and audio_latents_dir and audio_latents_dir.exists():
159
+ logger.info("Decoding audio latents...")
160
+ decoder.decode_audio(audio_latents_dir, output_base / "decoded_audio")
161
+
162
+ # Print summary
163
+ logger.info(f"Dataset preprocessing complete! Results saved to {output_base}")
164
+ if reference_column:
165
+ logger.info("Reference videos processed and saved to reference_latents/ directory for IC-LoRA training")
166
+ if with_audio:
167
+ logger.info("Audio latents saved to audio_latents/ directory for audio-video training")
168
+
169
+
170
+ def _validate_dataset_file(dataset_path: str) -> None:
171
+ """Validate that the dataset file exists and has the correct format."""
172
+ dataset_file = Path(dataset_path)
173
+
174
+ if not dataset_file.exists():
175
+ raise FileNotFoundError(f"Dataset file does not exist: {dataset_file}")
176
+
177
+ if not dataset_file.is_file():
178
+ raise ValueError(f"Dataset path must be a file, not a directory: {dataset_file}")
179
+
180
+ if dataset_file.suffix.lower() not in [".csv", ".json", ".jsonl"]:
181
+ raise ValueError(f"Dataset file must be CSV, JSON, or JSONL format: {dataset_file}")
182
+
183
+
184
+ @app.command()
185
+ def main( # noqa: PLR0913
186
+ dataset_path: str = typer.Argument(
187
+ ...,
188
+ help="Path to metadata file (CSV/JSON/JSONL) containing captions and video paths",
189
+ ),
190
+ resolution_buckets: str = typer.Option(
191
+ ...,
192
+ help='Resolution buckets in format "WxHxF;WxHxF;..." (e.g. "768x768x25;512x512x49")',
193
+ ),
194
+ model_path: str = typer.Option(
195
+ ...,
196
+ help="Path to LTX-2 checkpoint (.safetensors file)",
197
+ ),
198
+ text_encoder_path: str = typer.Option(
199
+ ...,
200
+ help="Path to Gemma text encoder directory",
201
+ ),
202
+ caption_column: str = typer.Option(
203
+ default="caption",
204
+ help="Column name containing captions in the dataset JSON/JSONL/CSV file",
205
+ ),
206
+ video_column: str = typer.Option(
207
+ default="media_path",
208
+ help="Column name containing video paths in the dataset JSON/JSONL/CSV file",
209
+ ),
210
+ batch_size: int = typer.Option(
211
+ default=1,
212
+ help="Batch size for preprocessing",
213
+ ),
214
+ device: str = typer.Option(
215
+ default="cuda",
216
+ help="Device to use for computation",
217
+ ),
218
+ vae_tiling: bool = typer.Option(
219
+ default=False,
220
+ help="Enable VAE tiling for larger video resolutions",
221
+ ),
222
+ output_dir: str | None = typer.Option(
223
+ default=None,
224
+ help="Output directory (defaults to .precomputed in dataset directory)",
225
+ ),
226
+ lora_trigger: str | None = typer.Option(
227
+ default=None,
228
+ help="Optional trigger word to prepend to each caption (activates the LoRA during inference)",
229
+ ),
230
+ decode: bool = typer.Option(
231
+ default=False,
232
+ help="Decode and save latents after encoding (videos and audio) for verification",
233
+ ),
234
+ remove_llm_prefixes: bool = typer.Option(
235
+ default=False,
236
+ help="Remove LLM prefixes from captions",
237
+ ),
238
+ reference_column: str | None = typer.Option(
239
+ default=None,
240
+ help="Column name containing reference video paths (for video-to-video training)",
241
+ ),
242
+ with_audio: bool = typer.Option(
243
+ default=False,
244
+ help="Extract and encode audio from video files",
245
+ ),
246
+ load_text_encoder_in_8bit: bool = typer.Option(
247
+ default=False,
248
+ help="Load the Gemma text encoder in 8-bit precision to save GPU memory (requires bitsandbytes)",
249
+ ),
250
+ reference_downscale_factor: int = typer.Option(
251
+ default=1,
252
+ help="Downscale factor for reference video resolution. When > 1, reference videos are processed at "
253
+ "1/n resolution (e.g., 2 means half resolution). Used for efficient IC-LoRA training.",
254
+ ),
255
+ ) -> None:
256
+ """Preprocess a video dataset by computing and saving latents and text embeddings.
257
+ The dataset must be a CSV, JSON, or JSONL file with columns for captions and video paths.
258
+ This script is designed for LTX-2 models which use the Gemma text encoder.
259
+ Examples:
260
+ # Process a dataset with LTX-2 model
261
+ python scripts/process_dataset.py dataset.json --resolution-buckets 768x768x25 \\
262
+ --model-path /path/to/ltx2.safetensors --text-encoder-path /path/to/gemma
263
+ # Process dataset with custom column names
264
+ python scripts/process_dataset.py dataset.json --resolution-buckets 768x768x25 \\
265
+ --model-path /path/to/ltx2.safetensors --text-encoder-path /path/to/gemma \\
266
+ --caption-column "text" --video-column "video_path"
267
+ # Process dataset with reference videos for IC-LoRA training
268
+ python scripts/process_dataset.py dataset.json --resolution-buckets 768x768x25 \\
269
+ --model-path /path/to/ltx2.safetensors --text-encoder-path /path/to/gemma \\
270
+ --reference-column "reference_path"
271
+ # Process dataset with scaled reference videos (half resolution) for efficient IC-LoRA
272
+ python scripts/process_dataset.py dataset.json --resolution-buckets 768x768x25 \\
273
+ --model-path /path/to/ltx2.safetensors --text-encoder-path /path/to/gemma \\
274
+ --reference-column "reference_path" --reference-downscale-factor 2
275
+ # Process dataset with audio for audio-video training
276
+ python scripts/process_dataset.py dataset.json --resolution-buckets 768x512x97 \\
277
+ --model-path /path/to/ltx2.safetensors --text-encoder-path /path/to/gemma \\
278
+ --with-audio
279
+ """
280
+ parsed_resolution_buckets = parse_resolution_buckets(resolution_buckets)
281
+
282
+ if len(parsed_resolution_buckets) > 1:
283
+ logger.warning(
284
+ "Using multiple resolution buckets. "
285
+ "When training with multiple resolution buckets, you must use a batch size of 1."
286
+ )
287
+
288
+ # Validate reference_downscale_factor
289
+ if reference_downscale_factor < 1:
290
+ raise typer.BadParameter("--reference-downscale-factor must be >= 1")
291
+
292
+ if reference_downscale_factor > 1 and not reference_column:
293
+ logger.warning("--reference-downscale-factor specified but no --reference-column provided. Ignoring.")
294
+
295
+ preprocess_dataset(
296
+ dataset_file=dataset_path,
297
+ caption_column=caption_column,
298
+ video_column=video_column,
299
+ resolution_buckets=parsed_resolution_buckets,
300
+ batch_size=batch_size,
301
+ output_dir=output_dir,
302
+ lora_trigger=lora_trigger,
303
+ vae_tiling=vae_tiling,
304
+ decode=decode,
305
+ model_path=model_path,
306
+ text_encoder_path=text_encoder_path,
307
+ device=device,
308
+ remove_llm_prefixes=remove_llm_prefixes,
309
+ reference_column=reference_column,
310
+ reference_downscale_factor=reference_downscale_factor,
311
+ with_audio=with_audio,
312
+ load_text_encoder_in_8bit=load_text_encoder_in_8bit,
313
+ )
314
+
315
+
316
+ if __name__ == "__main__":
317
+ app()
packages/ltx-trainer/scripts/process_videos.py ADDED
@@ -0,0 +1,1039 @@
1
+ #!/usr/bin/env python3
2
+
3
+ """
4
+ Compute latent representations for video generation training.
5
+ This module provides functionality for processing video and image files, including:
6
+ - Loading videos/images from various file formats (CSV, JSON, JSONL)
7
+ - Resizing, cropping, and transforming media
8
+ - MediaDataset for video-only preprocessing workflows
9
+ - Resolution-bucket selection for grouping videos by size and frame count
10
+ Can be used as a standalone script:
11
+ python scripts/process_videos.py dataset.csv --resolution-buckets 768x768x25 \
12
+ --output-dir /path/to/output --model-path /path/to/ltx2.safetensors
13
+ """
14
+
15
+ import json
16
+ import math
17
+ from pathlib import Path
18
+ from typing import Any
19
+
20
+ import numpy as np
21
+ import pandas as pd
22
+ import torch
23
+ import torchaudio
24
+ import typer
25
+ from pillow_heif import register_heif_opener
26
+ from rich.console import Console
27
+ from rich.progress import (
28
+ BarColumn,
29
+ MofNCompleteColumn,
30
+ Progress,
31
+ SpinnerColumn,
32
+ TaskProgressColumn,
33
+ TextColumn,
34
+ TimeElapsedColumn,
35
+ TimeRemainingColumn,
36
+ )
37
+ from torch.utils.data import DataLoader, Dataset
38
+ from torchvision import transforms
39
+ from torchvision.transforms import InterpolationMode
40
+ from torchvision.transforms.functional import crop, resize, to_tensor
41
+ from transformers.utils.logging import disable_progress_bar
42
+
43
+ from ltx_core.model.audio_vae import AudioProcessor
44
+ from ltx_core.types import Audio
45
+ from ltx_trainer import logger
46
+ from ltx_trainer.model_loader import load_audio_vae_encoder, load_video_vae_encoder
47
+ from ltx_trainer.utils import open_image_as_srgb
48
+ from ltx_trainer.video_utils import get_video_frame_count, read_video
49
+
50
+ disable_progress_bar()
51
+
52
+ # Register HEIF/HEIC support
53
+ register_heif_opener()
54
+
55
+ # Constants for validation
56
+ VAE_SPATIAL_FACTOR = 32
57
+ VAE_TEMPORAL_FACTOR = 8
58
+
59
+ # Audio constants
60
+ AUDIO_LATENT_CHANNELS = 8
61
+ AUDIO_FREQUENCY_BINS = 16
62
+
63
+ DEFAULT_TILE_SIZE = 512 # Spatial tile size in pixels (must be ≥64 and divisible by 32)
64
+ DEFAULT_TILE_OVERLAP = 128 # Spatial tile overlap in pixels (must be divisible by 32)
65
+
66
+ app = typer.Typer(
67
+ pretty_exceptions_enable=False,
68
+ no_args_is_help=True,
69
+ help="Process videos/images and save latent representations for video generation training.",
70
+ )
71
+
72
+
73
+ class MediaDataset(Dataset):
74
+ """
75
+ Dataset for processing video and image files.
76
+ This dataset is designed for media preprocessing workflows where you need to:
77
+ - Load and preprocess videos/images
78
+ - Apply resizing and cropping transformations
79
+ - Handle different resolution buckets
80
+ - Filter out invalid media files
81
+ - Optionally extract audio from video files
82
+ """
83
+
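+ # Construction sketch (illustrative arguments): MediaDataset("dataset.json", main_media_column="media_path",
+ # video_column="media_path", resolution_buckets=[(25, 768, 768)]) loads the paths, drops clips that are
+ # too short, and resizes/crops each item to its nearest bucket in __getitem__.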
84
+ def __init__(
85
+ self,
86
+ dataset_file: str | Path,
87
+ main_media_column: str,
88
+ video_column: str,
89
+ resolution_buckets: list[tuple[int, int, int]],
90
+ reshape_mode: str = "center",
91
+ with_audio: bool = False,
92
+ ) -> None:
93
+ """
94
+ Initialize the media dataset.
95
+ Args:
96
+ dataset_file: Path to CSV/JSON/JSONL metadata file
97
+ main_media_column: Column name for the primary media paths (used for output file naming)
+ video_column: Column name for the video paths to encode (may differ from main_media_column, e.g. for reference videos)
98
+ resolution_buckets: List of (frames, height, width) tuples
99
+ reshape_mode: How to crop videos ("center", "random")
100
+ with_audio: Whether to extract audio from video files
101
+ """
102
+ super().__init__()
103
+
104
+ self.dataset_file = Path(dataset_file)
105
+ self.main_media_column = main_media_column
106
+ self.resolution_buckets = resolution_buckets
107
+ self.reshape_mode = reshape_mode
108
+ self.with_audio = with_audio
109
+
110
+ # First load main media paths
111
+ self.main_media_paths = self._load_video_paths(main_media_column)
112
+
113
+ # Then load reference video paths
114
+ self.video_paths = self._load_video_paths(video_column)
115
+
116
+ # Filter out videos with insufficient frames
117
+ self._filter_valid_videos()
118
+
119
+ self.max_target_frames = max(self.resolution_buckets, key=lambda x: x[0])[0]
120
+
121
+ # Set up video transforms
122
+ self.transforms = transforms.Compose(
123
+ [
124
+ transforms.Lambda(lambda x: x.clamp_(0, 1)),
125
+ transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True),
126
+ ]
127
+ )
128
+
129
+ def __len__(self) -> int:
130
+ return len(self.video_paths)
131
+
132
+ def __getitem__(self, index: int) -> dict[str, Any]:
133
+ """Get a single video/image with metadata, and optionally audio."""
134
+ if isinstance(index, list):
135
+ # Special case for BucketSampler - return cached data
136
+ return index
137
+
138
+ video_path: Path = self.video_paths[index]
139
+
140
+ # Compute relative path of the video
141
+ data_root = self.dataset_file.parent
142
+ relative_path = str(video_path.relative_to(data_root))
143
+ media_relative_path = str(self.main_media_paths[index].relative_to(data_root))
144
+
145
+ if video_path.suffix.lower() in [".png", ".jpg", ".jpeg"]:
146
+ media_tensor = self._preprocess_image(video_path)
147
+ fps = 1.0
148
+ audio_data = None # Images don't have audio
149
+ else:
150
+ media_tensor, fps = self._preprocess_video(video_path)
151
+
152
+ # Extract audio if enabled
153
+ if self.with_audio:
154
+ # Calculate target duration from the processed video frames
155
+ # This ensures audio is trimmed to match the exact video duration
156
+ # media_tensor is [C, F, H, W] so shape[1] is num_frames
157
+ target_duration = media_tensor.shape[1] / fps
158
+ audio_data = self._extract_audio(video_path, target_duration)
159
+ else:
160
+ audio_data = None
161
+
162
+ # media_tensor is [C, F, H, W] format for VAE compatibility
163
+ _, num_frames, height, width = media_tensor.shape
164
+
165
+ result = {
166
+ "video": media_tensor,
167
+ "relative_path": relative_path,
168
+ "main_media_relative_path": media_relative_path,
169
+ "video_metadata": {
170
+ "num_frames": num_frames,
171
+ "height": height,
172
+ "width": width,
173
+ "fps": fps,
174
+ },
175
+ }
176
+
177
+ # Add audio data if available
178
+ if audio_data is not None:
179
+ result["audio"] = audio_data
180
+
181
+ return result
182
+
183
+ @staticmethod
184
+ def _extract_audio(video_path: Path, target_duration: float) -> dict[str, torch.Tensor | int] | None:
185
+ """Extract audio track from a video file, trimmed to match video duration."""
186
+ try:
187
+ # torchaudio can extract audio from video files directly
188
+ # waveform shape: [channels, samples]
189
+ waveform, sample_rate = torchaudio.load(str(video_path))
190
+
191
+ # Trim or pad to target duration
192
+ target_samples = int(target_duration * sample_rate)
193
+ current_samples = waveform.shape[-1]
194
+
195
+ if current_samples > target_samples:
196
+ # Trim to target duration
197
+ waveform = waveform[..., :target_samples]
198
+ elif current_samples < target_samples:
199
+ # Pad with zeros to target duration
200
+ padding = target_samples - current_samples
201
+ waveform = torch.nn.functional.pad(waveform, (0, padding))
202
+ logger.warning(f"Padded audio to {target_duration:.2f} seconds for {video_path}")
203
+
204
+ return {"waveform": waveform, "sample_rate": sample_rate}
205
+
206
+ except Exception as e:
207
+ logger.debug(f"Could not extract audio from {video_path}: {e}")
208
+ return None
209
+
210
+ def _load_video_paths(self, column: str) -> list[Path]:
211
+ """Load video paths from the specified data source."""
212
+ if self.dataset_file.suffix == ".csv":
213
+ return self._load_video_paths_from_csv(column)
214
+ elif self.dataset_file.suffix == ".json":
215
+ return self._load_video_paths_from_json(column)
216
+ elif self.dataset_file.suffix == ".jsonl":
217
+ return self._load_video_paths_from_jsonl(column)
218
+ else:
219
+ raise ValueError("Expected `dataset_file` to be a path to a CSV, JSON, or JSONL file.")
220
+
221
+ def _load_video_paths_from_csv(self, column: str) -> list[Path]:
222
+ """Load video paths from a CSV file."""
223
+ df = pd.read_csv(self.dataset_file)
224
+ if column not in df.columns:
225
+ raise ValueError(f"Column '{column}' not found in CSV file")
226
+
227
+ data_root = self.dataset_file.parent
228
+ video_paths = [data_root / Path(line.strip()) for line in df[column].tolist()]
229
+
230
+ # Validate that all paths exist
231
+ invalid_paths = [path for path in video_paths if not path.is_file()]
232
+ if invalid_paths:
233
+ raise ValueError(f"Found {len(invalid_paths)} invalid video paths. First few: {invalid_paths[:5]}")
234
+
235
+ return video_paths
236
+
237
+ def _load_video_paths_from_json(self, column: str) -> list[Path]:
238
+ """Load video paths from a JSON file."""
239
+ with open(self.dataset_file, "r", encoding="utf-8") as file:
240
+ data = json.load(file)
241
+
242
+ if not isinstance(data, list):
243
+ raise ValueError("JSON file must contain a list of objects")
244
+
245
+ data_root = self.dataset_file.parent
246
+ video_paths = []
247
+ for entry in data:
248
+ if column not in entry:
249
+ raise ValueError(f"Key '{column}' not found in JSON entry")
250
+ video_paths.append(data_root / Path(entry[column].strip()))
251
+
252
+ # Validate that all paths exist
253
+ invalid_paths = [path for path in video_paths if not path.is_file()]
254
+ if invalid_paths:
255
+ raise ValueError(f"Found {len(invalid_paths)} invalid video paths. First few: {invalid_paths[:5]}")
256
+
257
+ return video_paths
258
+
259
+ def _load_video_paths_from_jsonl(self, column: str) -> list[Path]:
260
+ """Load video paths from a JSONL file."""
261
+ data_root = self.dataset_file.parent
262
+ video_paths = []
263
+ with open(self.dataset_file, "r", encoding="utf-8") as file:
264
+ for line in file:
265
+ entry = json.loads(line)
266
+ if column not in entry:
267
+ raise ValueError(f"Key '{column}' not found in JSONL entry")
268
+ video_paths.append(data_root / Path(entry[column].strip()))
269
+
270
+ # Validate that all paths exist
271
+ invalid_paths = [path for path in video_paths if not path.is_file()]
272
+ if invalid_paths:
273
+ raise ValueError(f"Found {len(invalid_paths)} invalid video paths. First few: {invalid_paths[:5]}")
274
+
275
+ return video_paths
276
+
277
+ def _filter_valid_videos(self) -> None:
278
+ """Filter out videos with insufficient frames."""
279
+ original_length = len(self.video_paths)
280
+ valid_video_paths = []
281
+ valid_main_media_paths = []
282
+ min_frames_required = min(self.resolution_buckets, key=lambda x: x[0])[0]
283
+
284
+ for i, video_path in enumerate(self.video_paths):
285
+ if video_path.suffix.lower() in [".png", ".jpg", ".jpeg"]:
286
+ valid_video_paths.append(video_path)
287
+ valid_main_media_paths.append(self.main_media_paths[i])
288
+ continue
289
+
290
+ try:
291
+ frame_count = get_video_frame_count(video_path)
292
+
293
+ if frame_count >= min_frames_required:
294
+ valid_video_paths.append(video_path)
295
+ valid_main_media_paths.append(self.main_media_paths[i])
296
+ else:
297
+ logger.warning(
298
+ f"Skipping video at {video_path} - has {frame_count} frames, "
299
+ f"which is less than the minimum required frames ({min_frames_required})"
300
+ )
301
+ except Exception as e:
302
+ logger.warning(f"Failed to read video at {video_path}: {e!s}")
303
+
304
+ # Update both path lists to maintain synchronization
305
+ self.video_paths = valid_video_paths
306
+ self.main_media_paths = valid_main_media_paths
307
+
308
+ if len(self.video_paths) < original_length:
309
+ logger.warning(
310
+ f"Filtered out {original_length - len(self.video_paths)} videos with insufficient frames. "
311
+ f"Proceeding with {len(self.video_paths)} valid videos."
312
+ )
313
+
314
+ def _preprocess_image(self, path: Path) -> torch.Tensor:
315
+ """Preprocess a single image by resizing and applying transforms."""
316
+ image = open_image_as_srgb(path)
317
+ image = to_tensor(image)
318
+ image = image.unsqueeze(0) # Add frame dimension [1, C, H, W] for bucket selection
319
+
320
+ # Find nearest resolution bucket and resize
321
+ nearest_bucket = self._get_resolution_bucket_for_item(image)
322
+ _, target_height, target_width = nearest_bucket
323
+ image_resized = self._resize_and_crop(image, target_height, target_width)
324
+ # _resize_and_crop returns [C, H, W] for single-frame input (squeeze removes dim 0)
325
+
326
+ # Apply transforms
327
+ image = self.transforms(image_resized) # [C, H, W] -> [C, H, W]
328
+
329
+ # Add frame dimension in VAE format: [C, H, W] -> [C, 1, H, W]
330
+ image = image.unsqueeze(1)
331
+ return image
332
+
333
+ def _preprocess_video(self, path: Path) -> tuple[torch.Tensor, float]:
334
+ """Preprocess a video by loading, resizing, and applying transforms.
335
+ Returns:
336
+ Tuple of (video tensor in [C, F, H, W] format, fps)
337
+ """
338
+ # Load video frames up to max_target_frames
339
+ video, fps = read_video(path, max_frames=self.max_target_frames)
340
+
341
+ nearest_bucket = self._get_resolution_bucket_for_item(video)
342
+ target_num_frames, target_height, target_width = nearest_bucket
343
+ frames_resized = self._resize_and_crop(video, target_height, target_width)
344
+
345
+ # Trim video to target number of frames
346
+ frames_resized = frames_resized[:target_num_frames]
347
+
348
+ # Apply transforms to each frame and stack
349
+ video = torch.stack([self.transforms(frame) for frame in frames_resized], dim=0)
350
+
351
+ # Permute [F,C,H,W] -> [C,F,H,W] for VAE compatibility
352
+ # After DataLoader batching, this becomes [B,C,F,H,W] which VAE expects
353
+ video = video.permute(1, 0, 2, 3).contiguous()
354
+
355
+ return video, fps
356
+
357
+ def _get_resolution_bucket_for_item(self, media_tensor: torch.Tensor) -> tuple[int, int, int]:
358
+ """Get the nearest resolution bucket for the given media tensor."""
359
+ num_frames, _, height, width = media_tensor.shape
360
+
361
+ def distance(bucket: tuple[int, int, int]) -> tuple:
362
+ bucket_num_frames, bucket_height, bucket_width = bucket
363
+ # Lexicographic key:
364
+ # 1) minimize aspect-ratio difference (compared in log-scale so wide and tall deviations are penalized symmetrically)
365
+ # 2) prefer buckets with more frames (by using negative)
366
+ # 3) prefer buckets with larger spatial area (by using negative)
367
+ return (
368
+ abs(math.log(width / height) - math.log(bucket_width / bucket_height)),
369
+ -bucket_num_frames,
370
+ -(bucket_height * bucket_width),
371
+ )
372
+
373
+ # Keep only buckets with <= available frames
374
+ relevant_buckets = [b for b in self.resolution_buckets if b[0] <= num_frames]
375
+ if not relevant_buckets:
376
+ raise ValueError(f"No resolution buckets have <= {num_frames} frames. Available: {self.resolution_buckets}")
377
+
378
+ # Find the bucket with the minimal distance (according to the function above) to the media item's shape.
379
+ nearest_bucket = min(relevant_buckets, key=distance)
380
+
381
+ return nearest_bucket
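+ # Illustrative example of the lexicographic selection above (hypothetical values): a 1920x1080 video
+ # with 200 frames and buckets [(49, 512, 512), (25, 768, 768)] ties on aspect-ratio distance
+ # (both buckets are 1:1), so rule 2 prefers the 49-frame bucket.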
382
+
383
+ def _resize_and_crop(self, media_tensor: torch.Tensor, target_height: int, target_width: int) -> torch.Tensor:
384
+ """Resize and crop tensor to target size."""
385
+ # Get current dimensions
386
+ current_height, current_width = media_tensor.shape[2], media_tensor.shape[3]
387
+
388
+ # Calculate aspect ratios to determine which dimension to resize first
389
+ current_aspect = current_width / current_height
390
+ target_aspect = target_width / target_height
391
+
392
+ # Resize while maintaining aspect ratio - scale to make the smaller dimension fit
393
+ if current_aspect > target_aspect:
394
+ # Current is wider than target, so scale by height
395
+ new_width = int(current_width * target_height / current_height)
396
+ media_tensor = resize(
397
+ media_tensor,
398
+ size=[target_height, new_width], # type: ignore
399
+ interpolation=InterpolationMode.BICUBIC,
400
+ )
401
+ else:
402
+ # Current is taller than target, so scale by width
403
+ new_height = int(current_height * target_width / current_width)
404
+ media_tensor = resize(
405
+ media_tensor,
406
+ size=[new_height, target_width],
407
+ interpolation=InterpolationMode.BICUBIC,
408
+ )
409
+
410
+ # Update dimensions after resize
411
+ current_height, current_width = media_tensor.shape[2], media_tensor.shape[3]
412
+ media_tensor = media_tensor.squeeze(0)
413
+
414
+ # Calculate how much we need to crop from each dimension
415
+ delta_h = current_height - target_height
416
+ delta_w = current_width - target_width
417
+
418
+ # Determine crop position based on reshape mode
419
+ if self.reshape_mode == "random":
420
+ # Random crop position
421
+ top = np.random.randint(0, delta_h + 1)
422
+ left = np.random.randint(0, delta_w + 1)
423
+ elif self.reshape_mode == "center":
424
+ # Center crop
425
+ top, left = delta_h // 2, delta_w // 2
426
+ else:
427
+ raise ValueError(f"Unsupported reshape mode: {self.reshape_mode}")
428
+
429
+ # Perform the final crop to exact target dimensions
430
+ media_tensor = crop(media_tensor, top=top, left=left, height=target_height, width=target_width)
431
+ return media_tensor
432
+
433
+
434
+ def compute_latents( # noqa: PLR0913, PLR0915
435
+ dataset_file: str | Path,
436
+ video_column: str,
437
+ resolution_buckets: list[tuple[int, int, int]],
438
+ output_dir: str,
439
+ model_path: str,
440
+ main_media_column: str | None = None,
441
+ reshape_mode: str = "center",
442
+ batch_size: int = 1,
443
+ device: str = "cuda",
444
+ vae_tiling: bool = False,
445
+ with_audio: bool = False,
446
+ audio_output_dir: str | None = None,
447
+ ) -> None:
448
+ """
449
+ Process videos and save latent representations.
450
+ Args:
451
+ dataset_file: Path to metadata file (CSV/JSON/JSONL) containing video paths
452
+ video_column: Column name for video paths in the metadata file
453
+ resolution_buckets: List of (frames, height, width) tuples
454
+ output_dir: Directory to save video latents
455
+ model_path: Path to LTX-2 checkpoint (.safetensors)
456
+ reshape_mode: How to crop videos ("center", "random")
457
+ main_media_column: Column name for main media paths (if different from video_column)
458
+ batch_size: Batch size for processing
459
+ device: Device to use for computation
460
+ vae_tiling: Whether to enable VAE tiling
461
+ with_audio: Whether to extract and encode audio from videos
462
+ audio_output_dir: Directory to save audio latents (required if with_audio=True)
463
+ """
464
+ # Validate audio parameters
465
+ if with_audio and audio_output_dir is None:
466
+ raise ValueError("audio_output_dir must be provided when with_audio=True")
467
+
468
+ console = Console()
469
+ torch_device = torch.device(device)
470
+
471
+ # Create dataset
472
+ dataset = MediaDataset(
473
+ dataset_file=dataset_file,
474
+ main_media_column=main_media_column or video_column,
475
+ video_column=video_column,
476
+ resolution_buckets=resolution_buckets,
477
+ reshape_mode=reshape_mode,
478
+ with_audio=with_audio,
479
+ )
480
+ logger.info(f"Loaded {len(dataset)} valid media files")
481
+
482
+ output_path = Path(output_dir)
483
+ output_path.mkdir(parents=True, exist_ok=True)
484
+
485
+ # Set up audio output directory if needed
486
+ audio_output_path = None
487
+ if with_audio:
488
+ audio_output_path = Path(audio_output_dir)
489
+ audio_output_path.mkdir(parents=True, exist_ok=True)
490
+
491
+ # Load video VAE encoder
492
+ with console.status(f"[bold]Loading video VAE encoder from [cyan]{model_path}[/]...", spinner="dots"):
493
+ vae = load_video_vae_encoder(model_path, device=torch_device, dtype=torch.bfloat16)
494
+
495
+ # Load audio VAE encoder and audio processor if needed
496
+ audio_vae_encoder = None
497
+ audio_processor = None
498
+ if with_audio:
499
+ with console.status(f"[bold]Loading audio VAE encoder from [cyan]{model_path}[/]...", spinner="dots"):
500
+ audio_vae_encoder = load_audio_vae_encoder(
501
+ checkpoint_path=model_path,
502
+ device=torch_device,
503
+ dtype=torch.float32, # Audio VAE needs float32 for quality. TODO: re-test with bfloat16.
504
+ )
505
+ # Create audio processor for waveform-to-spectrogram conversion
506
+ audio_processor = AudioProcessor(
507
+ target_sample_rate=audio_vae_encoder.sample_rate,
508
+ mel_bins=audio_vae_encoder.mel_bins,
509
+ mel_hop_length=audio_vae_encoder.mel_hop_length,
510
+ n_fft=audio_vae_encoder.n_fft,
511
+ ).to(torch_device)
512
+
513
+ # Create dataloader
514
+ # Note: batch_size=1 required when with_audio because audio extraction can fail for some videos,
515
+ # and the default collate function can't handle mixed None/dict values across a batch.
516
+ if with_audio and batch_size > 1:
517
+ logger.warning("Audio processing requires batch_size=1. Overriding batch_size to 1.")
518
+ batch_size = 1
519
+ dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False, num_workers=4)
520
+
521
+ # Track audio statistics
522
+ audio_success_count = 0
523
+ audio_skip_count = 0
524
+
525
+ # Process batches
526
+ with Progress(
527
+ SpinnerColumn(),
528
+ TextColumn("[progress.description]{task.description}"),
529
+ BarColumn(),
530
+ TaskProgressColumn(),
531
+ MofNCompleteColumn(),
532
+ TimeElapsedColumn(),
533
+ TimeRemainingColumn(),
534
+ console=console,
535
+ ) as progress:
536
+ task = progress.add_task("Processing videos", total=len(dataloader))
537
+
538
+ for batch in dataloader:
539
+ # Get video tensor - shape is [B, C, F, H, W] after the DataLoader batches the [C, F, H, W] items
540
+ video = batch["video"]
541
+
542
+ # Encode video
543
+ with torch.inference_mode():
544
+ video_latent_data = encode_video(vae=vae, video=video, use_tiling=vae_tiling)
545
+
546
+ # Save latents for each item in batch
547
+ for i in range(len(batch["relative_path"])):
548
+ output_rel_path = Path(batch["main_media_relative_path"][i]).with_suffix(".pt")
549
+ output_file = output_path / output_rel_path
550
+
551
+ # Create output directory maintaining structure
552
+ output_file.parent.mkdir(parents=True, exist_ok=True)
553
+
554
+ # Index into batch to get this item's latents
555
+ latent_data = {
556
+ "latents": video_latent_data["latents"][i].cpu().contiguous(), # [C, F', H', W']
557
+ "num_frames": video_latent_data["num_frames"],
558
+ "height": video_latent_data["height"],
559
+ "width": video_latent_data["width"],
560
+ "fps": batch["video_metadata"]["fps"][i].item(),
561
+ }
562
+
563
+ torch.save(latent_data, output_file)
564
+
565
+ # Process audio if enabled (audio is already extracted by the dataset)
566
+ if with_audio:
567
+ audio_batch = batch.get("audio")
568
+ if audio_batch is not None:
569
+ # Extract the i-th item from batched audio data
570
+ # DataLoader collates [channels, samples] -> [batch, channels, samples]
571
+ audio_data = Audio(
572
+ waveform=audio_batch["waveform"][i],
573
+ sampling_rate=audio_batch["sample_rate"][i].item(),
574
+ )
575
+
576
+ # Encode audio
577
+ with torch.inference_mode():
578
+ audio_latents = encode_audio(audio_vae_encoder, audio_processor, audio_data)
579
+
580
+ # Save audio latents
581
+ audio_output_file = audio_output_path / output_rel_path
582
+ audio_output_file.parent.mkdir(parents=True, exist_ok=True)
583
+
584
+ audio_save_data = {
585
+ "latents": audio_latents["latents"].cpu().contiguous(),
586
+ "num_time_steps": audio_latents["num_time_steps"],
587
+ "frequency_bins": audio_latents["frequency_bins"],
588
+ "duration": audio_latents["duration"],
589
+ }
590
+
591
+ torch.save(audio_save_data, audio_output_file)
592
+ audio_success_count += 1
593
+ else:
594
+ # Video has no audio track
595
+ audio_skip_count += 1
596
+
597
+ progress.advance(task)
598
+
599
+ # Log summary
600
+ logger.info(f"Processed {len(dataset)} videos. Latents saved to {output_path}")
601
+ if with_audio:
602
+ logger.info(
603
+ f"Audio processing: {audio_success_count} videos with audio, "
604
+ f"{audio_skip_count} videos without audio (skipped)"
605
+ )
606
+
607
+
608
+ def encode_video(
609
+ vae: torch.nn.Module,
610
+ video: torch.Tensor,
611
+ dtype: torch.dtype | None = None,
612
+ use_tiling: bool = False,
613
+ tile_size: int = DEFAULT_TILE_SIZE,
614
+ tile_overlap: int = DEFAULT_TILE_OVERLAP,
615
+ ) -> dict[str, torch.Tensor | int]:
616
+ """Encode video into non-patchified latent representation.
617
+ Args:
618
+ vae: Video VAE encoder model
619
+ video: Input tensor of shape [B, C, F, H, W] (batch, channels, frames, height, width)
620
+ This is the format expected by the VAE encoder.
621
+ dtype: Target dtype for output latents
622
+ use_tiling: Whether to use spatial tiling for memory efficiency
623
+ tile_size: Tile size in pixels (must be divisible by 32)
624
+ tile_overlap: Overlap between tiles in pixels (must be divisible by 32)
625
+ Returns:
626
+ Dict containing non-patchified latents and shape information:
627
+ {
628
+ "latents": Tensor[B, C, F', H', W'], # Non-patchified format with batch dim
629
+ "num_frames": int, # Latent frame count
630
+ "height": int, # Latent height
631
+ "width": int, # Latent width
632
+ }
633
+ """
634
+ device = next(vae.parameters()).device
635
+ vae_dtype = next(vae.parameters()).dtype
636
+
637
+ # Add batch dimension if needed
638
+ if video.ndim == 4:
639
+ video = video.unsqueeze(0) # [C, F, H, W] -> [B, C, F, H, W]
640
+
641
+ video = video.to(device=device, dtype=vae_dtype)
642
+
643
+ # Choose encoding method based on tiling flag
644
+ if use_tiling:
645
+ latents = tiled_encode_video(
646
+ vae=vae,
647
+ video=video,
648
+ tile_size=tile_size,
649
+ tile_overlap=tile_overlap,
650
+ )
651
+ else:
652
+ # Encode video - VAE expects [B, C, F, H, W], returns [B, C, F', H', W']
653
+ latents = vae(video)
654
+
655
+ if dtype is not None:
656
+ latents = latents.to(dtype=dtype)
657
+
658
+ _, _, num_frames, height, width = latents.shape
659
+
660
+ return {
661
+ "latents": latents, # [B, C, F', H', W']
662
+ "num_frames": num_frames,
663
+ "height": height,
664
+ "width": width,
665
+ }
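+ # Shape example (illustrative): a single 25-frame 768x768 RGB clip, video.shape == [1, 3, 25, 768, 768],
+ # yields latents of shape [1, 128, 4, 24, 24], since F' = 1 + (25 - 1) / 8, H' = 768 / 32 and W' = 768 / 32.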
666
+
667
+
668
+ def tiled_encode_video( # noqa: PLR0912, PLR0915
669
+ vae: torch.nn.Module,
670
+ video: torch.Tensor,
671
+ tile_size: int = DEFAULT_TILE_SIZE,
672
+ tile_overlap: int = DEFAULT_TILE_OVERLAP,
673
+ ) -> torch.Tensor:
674
+ """Encode video using spatial tiling for memory efficiency.
675
+ Splits the video into overlapping spatial tiles, encodes each tile separately,
676
+ and blends the results using linear feathering in the overlap regions.
677
+ Args:
678
+ vae: Video VAE encoder model
679
+ video: Input tensor of shape [B, C, F, H, W]
680
+ tile_size: Tile size in pixels (must be divisible by 32)
681
+ tile_overlap: Overlap between tiles in pixels (must be divisible by 32)
682
+ Returns:
683
+ Encoded latent tensor [B, C_latent, F_latent, H_latent, W_latent]
684
+ """
685
+ batch, _channels, frames, height, width = video.shape
686
+ device = video.device
687
+ dtype = video.dtype
688
+
689
+ # Validate tile parameters
690
+ if tile_size % VAE_SPATIAL_FACTOR != 0:
691
+ raise ValueError(f"tile_size must be divisible by {VAE_SPATIAL_FACTOR}, got {tile_size}")
692
+ if tile_overlap % VAE_SPATIAL_FACTOR != 0:
693
+ raise ValueError(f"tile_overlap must be divisible by {VAE_SPATIAL_FACTOR}, got {tile_overlap}")
694
+ if tile_overlap >= tile_size:
695
+ raise ValueError(f"tile_overlap ({tile_overlap}) must be less than tile_size ({tile_size})")
696
+
697
+ # If video fits in a single tile, use regular encoding
698
+ if height <= tile_size and width <= tile_size:
699
+ return vae(video)
700
+
701
+ # Calculate output dimensions
702
+ # VAE compresses: H -> H/32, W -> W/32, F -> 1 + (F-1)/8
703
+ output_height = height // VAE_SPATIAL_FACTOR
704
+ output_width = width // VAE_SPATIAL_FACTOR
705
+ output_frames = 1 + (frames - 1) // VAE_TEMPORAL_FACTOR
706
+
707
+ # Latent channels (128 for LTX-2)
708
+ # Hard-coded here; could alternatively be probed with a small test encode
709
+ latent_channels = 128
710
+
711
+ # Initialize output and weight tensors
712
+ output = torch.zeros(
713
+ (batch, latent_channels, output_frames, output_height, output_width),
714
+ device=device,
715
+ dtype=dtype,
716
+ )
717
+ weights = torch.zeros(
718
+ (batch, 1, output_frames, output_height, output_width),
719
+ device=device,
720
+ dtype=dtype,
721
+ )
722
+
723
+ # Calculate tile positions with overlap
724
+ # Step size is tile_size - tile_overlap
725
+ step_h = tile_size - tile_overlap
726
+ step_w = tile_size - tile_overlap
727
+
728
+ h_positions = list(range(0, max(1, height - tile_overlap), step_h))
729
+ w_positions = list(range(0, max(1, width - tile_overlap), step_w))
730
+
731
+ # Ensure last tile covers the edge
732
+ if h_positions[-1] + tile_size < height:
733
+ h_positions.append(height - tile_size)
734
+ if w_positions[-1] + tile_size < width:
735
+ w_positions.append(width - tile_size)
736
+
737
+ # Remove duplicates and sort
738
+ h_positions = sorted(set(h_positions))
739
+ w_positions = sorted(set(w_positions))
740
+
741
+ # Overlap in latent space
742
+ overlap_out_h = tile_overlap // VAE_SPATIAL_FACTOR
743
+ overlap_out_w = tile_overlap // VAE_SPATIAL_FACTOR
744
+
745
+ # Process each tile
746
+ for h_pos in h_positions:
747
+ for w_pos in w_positions:
748
+ # Calculate tile boundaries in input space
749
+ h_start = max(0, h_pos)
750
+ w_start = max(0, w_pos)
751
+ h_end = min(h_start + tile_size, height)
752
+ w_end = min(w_start + tile_size, width)
753
+
754
+ # Ensure tile dimensions are divisible by VAE_SPATIAL_FACTOR
755
+ tile_h = ((h_end - h_start) // VAE_SPATIAL_FACTOR) * VAE_SPATIAL_FACTOR
756
+ tile_w = ((w_end - w_start) // VAE_SPATIAL_FACTOR) * VAE_SPATIAL_FACTOR
757
+
758
+ if tile_h < VAE_SPATIAL_FACTOR or tile_w < VAE_SPATIAL_FACTOR:
759
+ continue
760
+
761
+ # Adjust end positions
762
+ h_end = h_start + tile_h
763
+ w_end = w_start + tile_w
764
+
765
+ # Extract tile
766
+ tile = video[:, :, :, h_start:h_end, w_start:w_end]
767
+
768
+ # Encode tile
769
+ encoded_tile = vae(tile)
770
+
771
+ # Get actual encoded dimensions
772
+ _, _, tile_out_frames, tile_out_height, tile_out_width = encoded_tile.shape
773
+
774
+ # Calculate output positions
775
+ out_h_start = h_start // VAE_SPATIAL_FACTOR
776
+ out_w_start = w_start // VAE_SPATIAL_FACTOR
777
+ out_h_end = min(out_h_start + tile_out_height, output_height)
778
+ out_w_end = min(out_w_start + tile_out_width, output_width)
779
+
780
+ # Trim encoded tile if necessary
781
+ actual_tile_h = out_h_end - out_h_start
782
+ actual_tile_w = out_w_end - out_w_start
783
+ encoded_tile = encoded_tile[:, :, :, :actual_tile_h, :actual_tile_w]
784
+
785
+ # Create blending mask with linear feathering at edges
786
+ mask = torch.ones(
787
+ (1, 1, tile_out_frames, actual_tile_h, actual_tile_w),
788
+ device=device,
789
+ dtype=dtype,
790
+ )
791
+
792
+ # Apply feathering at edges (linear blend in overlap regions)
793
+ # Top edge (fade-in along the height dimension)
794
+ if h_pos > 0 and overlap_out_h > 0 and overlap_out_h < actual_tile_h:
795
+ fade_in = torch.linspace(0.0, 1.0, overlap_out_h + 2, device=device, dtype=dtype)[1:-1]
796
+ mask[:, :, :, :overlap_out_h, :] *= fade_in.view(1, 1, 1, -1, 1)
797
+
798
+ # Bottom edge (fade-out along the height dimension)
799
+ if h_end < height and overlap_out_h > 0 and overlap_out_h < actual_tile_h:
800
+ fade_out = torch.linspace(1.0, 0.0, overlap_out_h + 2, device=device, dtype=dtype)[1:-1]
801
+ mask[:, :, :, -overlap_out_h:, :] *= fade_out.view(1, 1, 1, -1, 1)
802
+
803
+ # Left edge (fade-in along the width dimension)
804
+ if w_pos > 0 and overlap_out_w > 0 and overlap_out_w < actual_tile_w:
805
+ fade_in = torch.linspace(0.0, 1.0, overlap_out_w + 2, device=device, dtype=dtype)[1:-1]
806
+ mask[:, :, :, :, :overlap_out_w] *= fade_in.view(1, 1, 1, 1, -1)
807
+
808
+ # Right edge (fade-out along the width dimension)
809
+ if w_end < width and overlap_out_w > 0 and overlap_out_w < actual_tile_w:
810
+ fade_out = torch.linspace(1.0, 0.0, overlap_out_w + 2, device=device, dtype=dtype)[1:-1]
811
+ mask[:, :, :, :, -overlap_out_w:] *= fade_out.view(1, 1, 1, 1, -1)
812
+
813
+ # Accumulate weighted results
814
+ output[:, :, :, out_h_start:out_h_end, out_w_start:out_w_end] += encoded_tile * mask
815
+ weights[:, :, :, out_h_start:out_h_end, out_w_start:out_w_end] += mask
816
+
817
+ # Normalize by weights (avoid division by zero)
818
+ output = output / (weights + 1e-8)
819
+
820
+ return output
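+ # Worked example of the tiling grid above (illustrative): for a 1024-pixel-wide input with the defaults
+ # tile_size=512 and tile_overlap=128, step_w = 384 and w_positions = [0, 384, 768]; each tile is encoded
+ # separately and blended with the linear feathering masks in the overlap regions.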
821
+
822
+
823
+ def encode_audio(
824
+ audio_vae_encoder: torch.nn.Module,
825
+ audio_processor: torch.nn.Module,
826
+ audio: Audio,
827
+ ) -> dict[str, torch.Tensor | int | float]:
828
+ """Encode audio waveform into latent representation.
829
+ Args:
830
+ audio_vae_encoder: Audio VAE encoder model from ltx-core
831
+ audio_processor: AudioProcessor for waveform-to-spectrogram conversion
832
+ audio: Audio container with waveform tensor and sampling rate.
833
+ Returns:
834
+ Dict containing audio latents and shape information:
835
+ {
836
+ "latents": Tensor[C, T, F], # Non-patchified format
837
+ "num_time_steps": int,
838
+ "frequency_bins": int,
839
+ "duration": float,
840
+ }
841
+ """
842
+ device = next(audio_vae_encoder.parameters()).device
843
+ dtype = next(audio_vae_encoder.parameters()).dtype
844
+
845
+ waveform = audio.waveform.to(device=device, dtype=dtype)
846
+
847
+ # Add batch dimension if needed: [channels, samples] -> [batch, channels, samples]
848
+ if waveform.dim() == 2:
849
+ waveform = waveform.unsqueeze(0)
850
+
851
+ # Calculate duration
852
+ duration = waveform.shape[-1] / audio.sampling_rate
853
+
854
+ # Convert waveform to mel spectrogram using AudioProcessor
855
+ mel_spectrogram = audio_processor.waveform_to_mel(Audio(waveform=waveform, sampling_rate=audio.sampling_rate))
856
+ mel_spectrogram = mel_spectrogram.to(dtype=dtype)
857
+
858
+ # Encode mel spectrogram to latents
859
+ latents = audio_vae_encoder(mel_spectrogram)
860
+
861
+ # latents shape: [batch, channels, time, freq] = [1, 8, T, 16]
862
+ _, _channels, time_steps, freq_bins = latents.shape
863
+
864
+ return {
865
+ "latents": latents.squeeze(0), # [C, T, F] - remove batch dim
866
+ "num_time_steps": time_steps,
867
+ "frequency_bins": freq_bins,
868
+ "duration": duration,
869
+ }
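+ # Shape example (illustrative): for a stereo clip, audio.waveform is [2, samples] and the returned
+ # latents have shape [8, T, 16] (AUDIO_LATENT_CHANNELS x time steps x AUDIO_FREQUENCY_BINS), where T
+ # depends on the clip duration and the mel hop length.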
870
+
871
+
872
+ def parse_resolution_buckets(resolution_buckets_str: str) -> list[tuple[int, int, int]]:
873
+ """Parse resolution buckets from string format to list of tuples (frames, height, width)"""
874
+ resolution_buckets = []
875
+ for bucket_str in resolution_buckets_str.split(";"):
876
+ w, h, f = map(int, bucket_str.split("x"))
877
+
878
+ if w % VAE_SPATIAL_FACTOR != 0 or h % VAE_SPATIAL_FACTOR != 0:
879
+ raise typer.BadParameter(
880
+ f"Width and height must be multiples of {VAE_SPATIAL_FACTOR}, got {w}x{h}",
881
+ param_hint="resolution-buckets",
882
+ )
883
+
884
+ if f % VAE_TEMPORAL_FACTOR != 1:
885
+ raise typer.BadParameter(
886
+ f"Number of frames must be a multiple of {VAE_TEMPORAL_FACTOR} plus 1, got {f}",
887
+ param_hint="resolution-buckets",
888
+ )
889
+
890
+ resolution_buckets.append((f, h, w))
891
+ return resolution_buckets
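+ # Example (illustrative): parse_resolution_buckets("768x768x25;512x512x49") returns
+ # [(25, 768, 768), (49, 512, 512)]; note that buckets are stored as (frames, height, width).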
892
+
893
+
894
+ def compute_scaled_resolution_buckets(
895
+ resolution_buckets: list[tuple[int, int, int]],
896
+ scale_factor: int,
897
+ ) -> list[tuple[int, int, int]]:
898
+ """Compute scaled resolution buckets and validate the results."""
899
+ if scale_factor == 1:
900
+ return resolution_buckets
901
+
902
+ scaled_buckets = []
903
+ for frames, height, width in resolution_buckets:
904
+ # Validate that scale factor evenly divides the dimensions
905
+ if height % scale_factor != 0:
906
+ raise ValueError(
907
+ f"Height {height} is not evenly divisible by scale factor {scale_factor}. "
908
+ f"Choose a scale factor that divides {height} evenly."
909
+ )
910
+ if width % scale_factor != 0:
911
+ raise ValueError(
912
+ f"Width {width} is not evenly divisible by scale factor {scale_factor}. "
913
+ f"Choose a scale factor that divides {width} evenly."
914
+ )
915
+
916
+ scaled_height = height // scale_factor
917
+ scaled_width = width // scale_factor
918
+
919
+ # Validate scaled dimensions are divisible by VAE spatial factor
920
+ if scaled_height % VAE_SPATIAL_FACTOR != 0:
921
+ raise ValueError(
922
+ f"Scaled height {scaled_height} (from {height} / {scale_factor}) "
923
+ f"is not divisible by {VAE_SPATIAL_FACTOR}. "
924
+ f"Choose a different scale factor or adjust your resolution buckets."
925
+ )
926
+ if scaled_width % VAE_SPATIAL_FACTOR != 0:
927
+ raise ValueError(
928
+ f"Scaled width {scaled_width} (from {width} / {scale_factor}) "
929
+ f"is not divisible by {VAE_SPATIAL_FACTOR}. "
930
+ f"Choose a different scale factor or adjust your resolution buckets."
931
+ )
932
+
933
+ scaled_buckets.append((frames, scaled_height, scaled_width))
934
+
935
+ return scaled_buckets
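+ # Example (illustrative): compute_scaled_resolution_buckets([(25, 1024, 512)], scale_factor=2) returns
+ # [(25, 512, 256)]; a ValueError is raised if a scaled dimension is no longer divisible by 32.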
936
+
937
+
938
+ @app.command()
939
+ def main( # noqa: PLR0913
940
+ dataset_file: str = typer.Argument(
941
+ ...,
942
+ help="Path to metadata file (CSV/JSON/JSONL) containing video paths",
943
+ ),
944
+ resolution_buckets: str = typer.Option(
945
+ ...,
946
+ help='Resolution buckets in format "WxHxF;WxHxF;..." (e.g. "768x768x25;512x512x49")',
947
+ ),
948
+ output_dir: str = typer.Option(
949
+ ...,
950
+ help="Output directory to save video latents",
951
+ ),
952
+ model_path: str = typer.Option(
953
+ ...,
954
+ help="Path to LTX-2 checkpoint (.safetensors file)",
955
+ ),
956
+ video_column: str = typer.Option(
957
+ default="media_path",
958
+ help="Column name in the dataset JSON/JSONL/CSV file containing video paths",
959
+ ),
960
+ batch_size: int = typer.Option(
961
+ default=1,
962
+ help="Batch size for processing",
963
+ ),
964
+ device: str = typer.Option(
965
+ default="cuda",
966
+ help="Device to use for computation",
967
+ ),
968
+ vae_tiling: bool = typer.Option(
969
+ default=False,
970
+ help="Enable VAE tiling for larger video resolutions",
971
+ ),
972
+ reshape_mode: str = typer.Option(
973
+ default="center",
974
+ help="How to crop videos: 'center' or 'random'",
975
+ ),
976
+ with_audio: bool = typer.Option(
977
+ default=False,
978
+ help="Extract and encode audio from video files",
979
+ ),
980
+ audio_output_dir: str | None = typer.Option(
981
+ default=None,
982
+ help="Output directory for audio latents (required if --with-audio is set)",
983
+ ),
984
+ ) -> None:
985
+ """Process videos/images and save latent representations for video generation training.
986
+ This script processes videos and images from metadata files and saves latent representations
987
+ that can be used for training video generation models. The output latents will maintain
988
+ the same folder structure and naming as the corresponding media files.
989
+ Examples:
990
+ # Process videos from a CSV file
991
+ python scripts/process_videos.py dataset.csv --resolution-buckets 768x768x25 \\
992
+ --output-dir ./latents --model-path /path/to/ltx2.safetensors
993
+ # Process videos from a JSON file with custom video column
994
+ python scripts/process_videos.py dataset.json --resolution-buckets 768x768x25 \\
995
+ --output-dir ./latents --model-path /path/to/ltx2.safetensors --video-column "video_path"
996
+ # Enable VAE tiling to save GPU VRAM
997
+ python scripts/process_videos.py dataset.csv --resolution-buckets 1024x1024x25 \\
998
+ --output-dir ./latents --model-path /path/to/ltx2.safetensors --vae-tiling
999
+ # Process videos with audio
1000
+ python scripts/process_videos.py dataset.csv --resolution-buckets 768x768x25 \\
1001
+ --output-dir ./latents --model-path /path/to/ltx2.safetensors \\
1002
+ --with-audio --audio-output-dir ./audio_latents
1003
+ """
1004
+
1005
+ # Validate dataset file exists
1006
+ if not Path(dataset_file).is_file():
1007
+ raise typer.BadParameter(f"Dataset file not found: {dataset_file}")
1008
+
1009
+ # Validate audio parameters
1010
+ if with_audio and audio_output_dir is None:
1011
+ raise typer.BadParameter("--audio-output-dir is required when --with-audio is set")
1012
+
1013
+ # Parse resolution buckets
1014
+ parsed_resolution_buckets = parse_resolution_buckets(resolution_buckets)
1015
+
1016
+ if len(parsed_resolution_buckets) > 1:
1017
+ logger.warning(
1018
+ "Using multiple resolution buckets. "
1019
+ "When training with multiple resolution buckets, you must use a batch size of 1."
1020
+ )
1021
+
1022
+ # Process latents
1023
+ compute_latents(
1024
+ dataset_file=dataset_file,
1025
+ video_column=video_column,
1026
+ resolution_buckets=parsed_resolution_buckets,
1027
+ output_dir=output_dir,
1028
+ model_path=model_path,
1029
+ reshape_mode=reshape_mode,
1030
+ batch_size=batch_size,
1031
+ device=device,
1032
+ vae_tiling=vae_tiling,
1033
+ with_audio=with_audio,
1034
+ audio_output_dir=audio_output_dir,
1035
+ )
1036
+
1037
+
1038
+ if __name__ == "__main__":
1039
+ app()
packages/ltx-trainer/scripts/split_scenes.py ADDED
@@ -0,0 +1,417 @@
1
+ #!/usr/bin/env python3
2
+
3
+ """
4
+ Split video into scenes using PySceneDetect.
5
+ This script provides a command-line interface for splitting videos into scenes using various detection algorithms.
6
+ It supports multiple detection methods, preview image generation, and customizable parameters for fine-tuning
7
+ the scene detection process.
8
+ Basic usage:
9
+ # Split video using default content-based detection
10
+ split_scenes.py input.mp4 output_dir/
11
+ # Save 3 preview images per scene
12
+ split_scenes.py input.mp4 output_dir/ --save-images 3
13
+ # Process specific duration and filter short scenes
14
+ split_scenes.py input.mp4 output_dir/ --duration 60s --filter-shorter-than 2s
15
+ Advanced usage:
16
+ # Content detection with minimum scene length and frame skip
17
+ split_scenes.py input.mp4 output_dir/ --detector content --min-scene-length 30 --frame-skip 2
18
+ # Use adaptive detection with custom detector and detector parameters
19
+ split_scenes.py input.mp4 output_dir/ --detector adaptive --threshold 3.0 --adaptive-window 10
20
+ """
21
+
22
+ from enum import Enum
23
+ from pathlib import Path
24
+ from typing import List, Optional, Tuple
25
+
26
+ import typer
27
+ from scenedetect import (
28
+ AdaptiveDetector,
29
+ ContentDetector,
30
+ HistogramDetector,
31
+ SceneManager,
32
+ ThresholdDetector,
33
+ open_video,
34
+ )
35
+ from scenedetect.frame_timecode import FrameTimecode
36
+ from scenedetect.scene_manager import SceneDetector, write_scene_list_html
37
+ from scenedetect.scene_manager import save_images as save_scene_images
38
+ from scenedetect.stats_manager import StatsManager
39
+ from scenedetect.video_splitter import split_video_ffmpeg
40
+
41
+ app = typer.Typer(no_args_is_help=True, help="Split video into scenes using PySceneDetect.")
42
+
43
+
44
+ class DetectorType(str, Enum):
45
+ """Available scene detection algorithms."""
46
+
47
+ CONTENT = "content" # Detects fast cuts using HSV color space
48
+ ADAPTIVE = "adaptive" # Two-pass content detection; more robust to fast camera movement
49
+ THRESHOLD = "threshold" # Detects cuts and slow fades in/out relative to a fixed brightness threshold
50
+ HISTOGRAM = "histogram" # Detects based on YUV histogram differences in adjacent frames
51
+
52
+
53
+ def create_detector(
54
+ detector_type: DetectorType,
55
+ threshold: Optional[float] = None,
56
+ min_scene_len: Optional[int] = None,
57
+ luma_only: Optional[bool] = None,
58
+ adaptive_window: Optional[int] = None,
59
+ fade_bias: Optional[float] = None,
60
+ ) -> SceneDetector:
61
+ """Create a scene detector based on the specified type and parameters.
62
+ Args:
63
+ detector_type: Type of detector to create
64
+ threshold: Detection threshold (meaning varies by detector)
65
+ min_scene_len: Minimum scene length in frames
66
+ luma_only: If True, only use brightness for content detection
67
+ adaptive_window: Window size for adaptive detection
68
+ fade_bias: Bias for fade in/out detection (-1.0 to 1.0)
69
+ Note: Parameters set to None will use the detector's built-in default values.
70
+ Returns:
71
+ Configured scene detector instance
72
+ """
73
+ # Set common arguments
74
+ kwargs = {}
75
+ if threshold is not None:
76
+ kwargs["threshold"] = threshold
77
+
78
+ if min_scene_len is not None:
79
+ kwargs["min_scene_len"] = min_scene_len
80
+
81
+ match detector_type:
82
+ case DetectorType.CONTENT:
83
+ if luma_only is not None:
84
+ kwargs["luma_only"] = luma_only
85
+ return ContentDetector(**kwargs)
86
+ case DetectorType.ADAPTIVE:
87
+ if adaptive_window is not None:
88
+ kwargs["window_width"] = adaptive_window
89
+ if luma_only is not None:
90
+ kwargs["luma_only"] = luma_only
91
+ if "threshold" in kwargs:
92
+ # Special case for adaptive detector which uses different param name
93
+ kwargs["adaptive_threshold"] = kwargs.pop("threshold")
94
+ return AdaptiveDetector(**kwargs)
95
+ case DetectorType.THRESHOLD:
96
+ if fade_bias is not None:
97
+ kwargs["fade_bias"] = fade_bias
98
+ return ThresholdDetector(**kwargs)
99
+ case DetectorType.HISTOGRAM:
100
+ return HistogramDetector(**kwargs)
101
+ case _:
102
+ raise ValueError(f"Unknown detector type: {detector_type}")
103
+
104
+
105
+ def validate_output_dir(output_dir: str) -> Path:
106
+ """Validate and create output directory if it doesn't exist.
107
+ Args:
108
+ output_dir: Path to the output directory
109
+ Returns:
110
+ Path object of the validated output directory
111
+ """
112
+ path = Path(output_dir)
113
+
114
+ if path.exists() and not path.is_dir():
115
+ raise typer.BadParameter(f"{output_dir} exists but is not a directory")
116
+
117
+ # Create the directory if needed, as promised in the docstring
+ path.mkdir(parents=True, exist_ok=True)
+ return path
118
+
119
+
120
+ def parse_timecode(video: any, time_str: Optional[str]) -> Optional[FrameTimecode]:
121
+ """Parse a timecode string into a FrameTimecode object.
122
+ Supports formats:
123
+ - Frames: '123'
124
+ - Seconds: '123s' or '123.45s'
125
+ - Timecode: '00:02:03' or '00:02:03.456'
126
+ Args:
127
+ video: Video object to get framerate from
128
+ time_str: String to parse, or None
129
+ Returns:
130
+ FrameTimecode object or None if input is None
131
+ """
132
+ if time_str is None:
133
+ return None
134
+
135
+ try:
136
+ if time_str.endswith("s"):
137
+ # Seconds format
138
+ seconds = float(time_str[:-1])
139
+ return FrameTimecode(timecode=seconds, fps=video.frame_rate)
140
+ elif ":" in time_str:
141
+ # Timecode format
142
+ return FrameTimecode(timecode=time_str, fps=video.frame_rate)
143
+ else:
144
+ # Frame number format
145
+ return FrameTimecode(timecode=int(time_str), fps=video.frame_rate)
146
+ except ValueError as e:
147
+ raise typer.BadParameter(
148
+ f"Invalid timecode format: {time_str}. Use frames (123), "
149
+ f"seconds (123s/123.45s), or timecode (HH:MM:SS[.nnn])",
150
+ ) from e
151
+
152
+
153
+ def detect_and_split_scenes( # noqa: PLR0913
154
+ video_path: str,
155
+ output_dir: Path,
156
+ detector_type: DetectorType,
157
+ threshold: Optional[float] = None,
158
+ min_scene_len: Optional[int] = None,
159
+ max_scenes: Optional[int] = None,
160
+ filter_shorter_than: Optional[str] = None,
161
+ skip_start: Optional[int] = None, # noqa: ARG001
162
+ skip_end: Optional[int] = None, # noqa: ARG001
163
+ save_images_per_scene: int = 0,
164
+ stats_file: Optional[str] = None,
165
+ luma_only: bool = False,
166
+ adaptive_window: Optional[int] = None,
167
+ fade_bias: Optional[float] = None,
168
+ downscale_factor: Optional[int] = None,
169
+ frame_skip: int = 0,
170
+ duration: Optional[str] = None,
171
+ ) -> List[Tuple[FrameTimecode, FrameTimecode]]:
172
+ """Detect and split scenes in a video using the specified parameters.
173
+ Args:
174
+ video_path: Path to input video.
175
+ output_dir: Directory to save output split scenes.
176
+ detector_type: Type of scene detector to use.
177
+ threshold: Detection threshold.
178
+ min_scene_len: Minimum scene length in frames.
179
+ max_scenes: Maximum number of scenes to detect.
180
+ filter_shorter_than: Filter out scenes shorter than this duration (frames/seconds/timecode)
181
+ skip_start: Number of frames to skip at start.
182
+ skip_end: Number of frames to skip at end.
183
+ save_images_per_scene: Number of images to save per scene (0 to disable).
184
+ stats_file: Path to save detection statistics (optional).
185
+ luma_only: Only use brightness for content detection.
186
+ adaptive_window: Window size for adaptive detection.
187
+ fade_bias: Bias for fade detection (-1.0 to 1.0).
188
+ downscale_factor: Factor to downscale frames by during detection.
189
+ frame_skip: Number of frames to skip (i.e. process every 1 in N+1 frames,
190
+ where N is frame_skip, so only 1/(N+1) of the frames are processed,
191
+ speeding up the detection time at the expense of accuracy).
192
+ frame_skip must be 0 (the default) when using a StatsManager.
193
+ duration: How much of the video to process from start position.
194
+ Can be specified as frames (123), seconds (123s/123.45s),
195
+ or timecode (HH:MM:SS[.nnn]).
196
+ Returns:
197
+ List of detected scenes as (start, end) FrameTimecode pairs.
198
+ """
199
+ # Create video stream
200
+ video = open_video(video_path, backend="opencv")
201
+
202
+ # Parse duration if specified
203
+ duration_tc = parse_timecode(video, duration)
204
+
205
+ # Parse filter_shorter_than if specified
206
+ filter_shorter_than_tc = parse_timecode(video, filter_shorter_than)
207
+
208
+ # Initialize scene manager with optional stats manager
209
+ stats_manager = StatsManager() if stats_file else None
210
+ scene_manager = SceneManager(stats_manager)
211
+
212
+ # Configure scene manager
213
+ if downscale_factor:
214
+ scene_manager.auto_downscale = False
215
+ scene_manager.downscale = downscale_factor
216
+
217
+ # Create and add detector
218
+ detector = create_detector(
219
+ detector_type=detector_type,
220
+ threshold=threshold,
221
+ min_scene_len=min_scene_len,
222
+ luma_only=luma_only,
223
+ adaptive_window=adaptive_window,
224
+ fade_bias=fade_bias,
225
+ )
226
+ scene_manager.add_detector(detector)
227
+
228
+ # Detect scenes
229
+ typer.echo("Detecting scenes...")
230
+ scene_manager.detect_scenes(
231
+ video=video,
232
+ show_progress=True,
233
+ frame_skip=frame_skip,
234
+ duration=duration_tc,
235
+ )
236
+
237
+ # Get scene list
238
+ scenes = scene_manager.get_scene_list()
239
+
240
+ # Filter out scenes that are too short if filter_shorter_than is specified
241
+ if filter_shorter_than_tc:
242
+ original_count = len(scenes)
243
+ scenes = [
244
+ (start, end)
245
+ for start, end in scenes
246
+ if (end.get_frames() - start.get_frames()) >= filter_shorter_than_tc.get_frames()
247
+ ]
248
+ if len(scenes) < original_count:
249
+ typer.echo(
250
+ f"Filtered out {original_count - len(scenes)} scenes shorter "
251
+ f"than {filter_shorter_than_tc.get_seconds():.1f} seconds "
252
+ f"({filter_shorter_than_tc.get_frames()} frames)",
253
+ )
254
+
255
+ # Apply max scenes limit if specified
256
+ if max_scenes and len(scenes) > max_scenes:
257
+ typer.echo(f"Dropping last {len(scenes) - max_scenes} scenes to meet max_scenes ({max_scenes}) limit")
258
+ scenes = scenes[:max_scenes]
259
+
260
+ # Print scene information
261
+ typer.echo(f"Found {len(scenes)} scenes:")
262
+ for i, (start, end) in enumerate(scenes, 1):
263
+ typer.echo(
264
+ f"Scene {i}: {start.get_timecode()} to {end.get_timecode()} "
265
+ f"({end.get_frames() - start.get_frames()} frames)",
266
+ )
267
+
268
+ # Save stats if requested
269
+ if stats_file:
270
+ typer.echo(f"Saving detection stats to {stats_file}")
271
+ stats_manager.save_to_csv(stats_file)
272
+
273
+ # Split video into scenes
274
+ typer.echo("Splitting video into scenes...")
275
+ try:
276
+ split_video_ffmpeg(
277
+ input_video_path=video_path,
278
+ scene_list=scenes,
279
+ output_dir=output_dir,
280
+ show_progress=True,
281
+ )
282
+ typer.echo(f"Scenes have been saved to: {output_dir}")
283
+ except Exception as e:
284
+ raise typer.BadParameter(f"Error splitting video: {e}") from e
285
+
286
+ # Save preview images if requested
287
+ if save_images_per_scene > 0:
288
+ typer.echo(f"Saving {save_images_per_scene} preview images per scene...")
289
+ image_filenames = save_scene_images(
290
+ scene_list=scenes,
291
+ video=video,
292
+ num_images=save_images_per_scene,
293
+ output_dir=str(output_dir),
294
+ show_progress=True,
295
+ )
296
+
297
+ # Generate HTML report with scene information and previews
298
+ html_path = output_dir / "scene_report.html"
299
+ write_scene_list_html(
300
+ output_html_filename=str(html_path),
301
+ scene_list=scenes,
302
+ image_filenames=image_filenames,
303
+ )
304
+ typer.echo(f"Scene report saved to: {html_path}")
305
+
306
+ return scenes
307
+
308
+
309
+ @app.command()
310
+ def main( # noqa: PLR0913
311
+ video_path: Path = typer.Argument( # noqa: B008
312
+ ...,
313
+ help="Path to the input video file",
314
+ exists=True,
315
+ dir_okay=False,
316
+ ),
317
+ output_dir: str = typer.Argument(
318
+ ...,
319
+ help="Directory where split scenes will be saved",
320
+ ),
321
+ detector: DetectorType = typer.Option( # noqa: B008
322
+ DetectorType.CONTENT,
323
+ help="Scene detection algorithm to use",
324
+ ),
325
+ threshold: Optional[float] = typer.Option(
326
+ None,
327
+ help="Detection threshold (meaning varies by detector)",
328
+ ),
329
+ max_scenes: Optional[int] = typer.Option(
330
+ None,
331
+ help="Maximum number of scenes to produce",
332
+ ),
333
+ min_scene_length: Optional[int] = typer.Option(
334
+ None,
335
+ help="Minimum scene length during detection. Forces the detector to produce scenes that are at least this many frames long. "
336
+ "This affects scene detection behavior but does not filter out short scenes.",
337
+ ),
338
+ filter_shorter_than: Optional[str] = typer.Option(
339
+ None,
340
+ help="Filter out scenes shorter than this duration. Can be specified as frames (123), "
341
+ "seconds (123s/123.45s), or timecode (HH:MM:SS[.nnn]). These scenes will be detected but not saved.",
342
+ ),
343
+ skip_start: Optional[int] = typer.Option(
344
+ None,
345
+ help="Number of frames to skip at the start of the video",
346
+ ),
347
+ skip_end: Optional[int] = typer.Option(
348
+ None,
349
+ help="Number of frames to skip at the end of the video",
350
+ ),
351
+ duration: Optional[str] = typer.Option(
352
+ None,
353
+ "-d",
354
+ help="How much of the video to process. Can be specified as frames (123), "
355
+ "seconds (123s/123.45s), or timecode (HH:MM:SS[.nnn])",
356
+ ),
357
+ save_images: int = typer.Option(
358
+ 0,
359
+ help="Number of preview images to save per scene (0 to disable)",
360
+ ),
361
+ stats_file: Optional[str] = typer.Option(
362
+ None,
363
+ help="Path to save detection statistics CSV",
364
+ ),
365
+ luma_only: bool = typer.Option(
366
+ False,
367
+ help="Only use brightness for content detection",
368
+ ),
369
+ adaptive_window: Optional[int] = typer.Option(
370
+ None,
371
+ help="Window size for adaptive detection",
372
+ ),
373
+ fade_bias: Optional[float] = typer.Option(
374
+ None,
375
+ help="Bias for fade detection (-1.0 to 1.0)",
376
+ ),
377
+ downscale: Optional[int] = typer.Option(
378
+ None,
379
+ help="Factor to downscale frames by during detection",
380
+ ),
381
+ frame_skip: int = typer.Option(
382
+ 0,
383
+ help="Number of frames to skip during processing",
384
+ ),
385
+ ) -> None:
386
+ """Split video into scenes using PySceneDetect."""
387
+ if skip_start or skip_end:
388
+ typer.echo("Skipping start and end frames is not supported yet.")
389
+ return
390
+
391
+ # Validate output directory
392
+ output_path = validate_output_dir(output_dir)
393
+
394
+ # Detect and split scenes
395
+ detect_and_split_scenes(
396
+ video_path=str(video_path),
397
+ output_dir=output_path,
398
+ detector_type=detector,
399
+ threshold=threshold,
400
+ min_scene_len=min_scene_length,
401
+ max_scenes=max_scenes,
402
+ filter_shorter_than=filter_shorter_than,
403
+ skip_start=skip_start,
404
+ skip_end=skip_end,
405
+ duration=duration,
406
+ save_images_per_scene=save_images,
407
+ stats_file=stats_file,
408
+ luma_only=luma_only,
409
+ adaptive_window=adaptive_window,
410
+ fade_bias=fade_bias,
411
+ downscale_factor=downscale,
412
+ frame_skip=frame_skip,
413
+ )
414
+
415
+
416
+ if __name__ == "__main__":
417
+ app()
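
For completeness, a hedged sketch of calling the scene-splitting logic programmatically rather than via the CLI. It only uses `detect_and_split_scenes` with the signature shown above; the import path and the input/output paths are placeholders.

# Editor's sketch (assumption: this script is importable; paths are placeholders).
from pathlib import Path
from split_scenes import DetectorType, detect_and_split_scenes  # hypothetical import path

scenes = detect_and_split_scenes(
    video_path="input.mp4",
    output_dir=Path("scenes_out"),
    detector_type=DetectorType.CONTENT,
    threshold=None,                  # None keeps ContentDetector's default
    min_scene_len=30,                # detected scenes must span at least 30 frames
    filter_shorter_than="2s",        # drop scenes shorter than 2 seconds
    save_images_per_scene=3,         # write 3 preview images per scene + HTML report
)
print(f"Split into {len(scenes)} scenes")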
packages/ltx-trainer/scripts/train.py ADDED
@@ -0,0 +1,64 @@
1
+ #!/usr/bin/env python
2
+
3
+ """
4
+ Train LTXV models using configuration from YAML files.
5
+ This script provides a command-line interface for training LTXV models using
6
+ either LoRA fine-tuning or full model fine-tuning. It loads configuration from
7
+ a YAML file and passes it to the trainer.
8
+ Basic usage:
9
+ python scripts/train.py CONFIG_PATH [--disable-progress-bars]
10
+ For multi-GPU/FSDP training, configure and launch via Accelerate:
11
+ accelerate config
12
+ accelerate launch scripts/train.py CONFIG_PATH
13
+ """
14
+
15
+ from pathlib import Path
16
+
17
+ import typer
18
+ import yaml
19
+ from rich.console import Console
20
+
21
+ from ltx_trainer.config import LtxTrainerConfig
22
+ from ltx_trainer.trainer import LtxvTrainer
23
+
24
+ console = Console()
25
+ app = typer.Typer(
26
+ pretty_exceptions_enable=False,
27
+ no_args_is_help=True,
28
+ help="Train LTXV models using configuration from YAML files.",
29
+ )
30
+
31
+
32
+ @app.command()
33
+ def main(
34
+ config_path: str = typer.Argument(..., help="Path to YAML configuration file"),
35
+ disable_progress_bars: bool = typer.Option(
36
+ False,
37
+ "--disable-progress-bars",
38
+ help="Disable progress bars (useful for multi-process runs)",
39
+ ),
40
+ ) -> None:
41
+ """Train the model using the provided configuration file."""
42
+ # Load the configuration from the YAML file
43
+ config_path = Path(config_path)
44
+ if not config_path.exists():
45
+ typer.echo(f"Error: Configuration file {config_path} does not exist.")
46
+ raise typer.Exit(code=1)
47
+
48
+ with open(config_path, "r") as file:
49
+ config_data = yaml.safe_load(file)
50
+
51
+ # Convert the loaded data to the LtxTrainerConfig object
52
+ try:
53
+ trainer_config = LtxTrainerConfig(**config_data)
54
+ except Exception as e:
55
+ typer.echo(f"Error: Invalid configuration data: {e}")
56
+ raise typer.Exit(code=1) from e
57
+
58
+ # Initialize the training process
59
+ trainer = LtxvTrainer(trainer_config)
60
+ trainer.train(disable_progress_bars=disable_progress_bars)
61
+
62
+
63
+ if __name__ == "__main__":
64
+ app()
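
The same flow can be expressed in a few lines of Python, which may be useful from a notebook. This sketch simply mirrors the command above; the config path is a placeholder for any trainer config YAML.

# Editor's sketch mirroring scripts/train.py (placeholder config path).
import yaml

from ltx_trainer.config import LtxTrainerConfig
from ltx_trainer.trainer import LtxvTrainer

with open("configs/ltx2_av_lora.yaml") as f:   # placeholder: any trainer config YAML
    config = LtxTrainerConfig(**yaml.safe_load(f))

LtxvTrainer(config).train(disable_progress_bars=False)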
packages/ltx-trainer/src/ltx_trainer/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (1.54 kB).
 
packages/ltx-trainer/src/ltx_trainer/__pycache__/model_loader.cpython-312.pyc ADDED
Binary file (13.9 kB).
 
packages/ltx-trainer/src/ltx_trainer/captioning.py ADDED
@@ -0,0 +1,401 @@
1
+ """
2
+ Audio-visual media captioning using multimodal models.
3
+ This module provides captioning capabilities for videos with audio using:
4
+ - Qwen2.5-Omni: Local model supporting text, audio, image, and video inputs (default)
5
+ - Gemini Flash: Cloud-based API for audio-visual captioning
6
+ Requirements:
7
+ - Qwen2.5-Omni: transformers>=4.50, torch
8
+ - Gemini Flash: google-generativeai (uv pip install google-generativeai)
9
+ Set GEMINI_API_KEY or GOOGLE_API_KEY environment variable
10
+ """
11
+
12
+ import itertools
13
+ import re
14
+ from abc import ABC, abstractmethod
15
+ from enum import Enum
16
+ from pathlib import Path
17
+
18
+ import torch
19
+
20
+ # Instruction for audio-visual captioning (default) - includes speech transcription and sounds
21
+ DEFAULT_CAPTION_INSTRUCTION = """\
22
+ Analyze this media and provide a detailed caption in the following EXACT format. Fill in ALL sections:
23
+
24
+ [VISUAL]: <Detailed description of people, objects, actions, settings, colors, and movements>
25
+ [SPEECH]: <Word-for-word transcription of everything spoken.
26
+ Listen carefully and transcribe the exact words. If no speech, write "None">
27
+ [SOUNDS]: <Description of music, ambient sounds, sound effects. If none, write "None">
28
+ [TEXT]: <Any on-screen text visible. If none, write "None">
29
+
30
+ You MUST fill in all four sections. For [SPEECH], transcribe the actual words spoken, not a summary."""
31
+
32
+ # Instruction for video-only captioning (no audio processing)
33
+ VIDEO_ONLY_CAPTION_INSTRUCTION = """\
34
+ Analyze this media and provide a detailed caption in the following EXACT format. Fill in ALL sections:
35
+
36
+ [VISUAL]: <Detailed description of people, objects, actions, settings, colors, and movements>
37
+ [TEXT]: <Any on-screen text visible. If none, write "None">
38
+
39
+ You MUST fill in both sections."""
40
+
41
+
42
+ class CaptionerType(str, Enum):
43
+ """Enum for different types of media captioners."""
44
+
45
+ QWEN_OMNI = "qwen_omni" # Local Qwen2.5-Omni model (audio + video)
46
+ GEMINI_FLASH = "gemini_flash" # Gemini Flash API (audio + video)
47
+
48
+
49
+ def create_captioner(captioner_type: CaptionerType, **kwargs) -> "MediaCaptioningModel":
50
+ """Factory function to create a media captioner.
51
+ Args:
52
+ captioner_type: The type of captioner to create
53
+ **kwargs: Additional arguments to pass to the captioner constructor
54
+ Returns:
55
+ An instance of a MediaCaptioningModel
56
+ """
57
+ match captioner_type:
58
+ case CaptionerType.QWEN_OMNI:
59
+ return QwenOmniCaptioner(**kwargs)
60
+ case CaptionerType.GEMINI_FLASH:
61
+ return GeminiFlashCaptioner(**kwargs)
62
+ case _:
63
+ raise ValueError(f"Unsupported captioner type: {captioner_type}")
64
+
65
+
66
+ class MediaCaptioningModel(ABC):
67
+ """Abstract base class for audio-visual media captioning models."""
68
+
69
+ @abstractmethod
70
+ def caption(self, path: str | Path, **kwargs) -> str:
71
+ """Generate a caption for the given video or image.
72
+ Args:
73
+ path: Path to the video/image file to caption
74
+ Returns:
75
+ A string containing the generated caption
76
+ """
77
+
78
+ @property
79
+ @abstractmethod
80
+ def supports_audio(self) -> bool:
81
+ """Whether this captioner supports audio input."""
82
+
83
+ @staticmethod
84
+ def _is_image_file(path: str | Path) -> bool:
85
+ """Check if the file is an image based on extension."""
86
+ return str(path).lower().endswith((".png", ".jpg", ".jpeg", ".heic", ".heif", ".webp"))
87
+
88
+ @staticmethod
89
+ def _is_video_file(path: str | Path) -> bool:
90
+ """Check if the file is a video based on extension."""
91
+ return str(path).lower().endswith((".mp4", ".avi", ".mov", ".mkv", ".webm"))
92
+
93
+ @staticmethod
94
+ def _clean_raw_caption(caption: str) -> str:
95
+ """Clean up the raw caption by removing common VLM patterns."""
96
+ start = ["The", "This"]
97
+ kind = ["video", "image", "scene", "animated sequence", "clip", "footage"]
98
+ act = ["displays", "shows", "features", "depicts", "presents", "showcases", "captures", "contains"]
99
+
100
+ for x, y, z in itertools.product(start, kind, act):
101
+ caption = caption.replace(f"{x} {y} {z} ", "", 1)
102
+
103
+ return caption
104
+
105
+
106
+ class QwenOmniCaptioner(MediaCaptioningModel):
107
+ """Audio-visual captioning using Alibaba's Qwen2.5-Omni model.
108
+ Qwen2.5-Omni is an end-to-end multimodal model that can perceive text, images, audio, and video.
109
+ It uses a Thinker-Talker architecture where the Thinker generates text and the Talker can
110
+ generate speech. For captioning, we use only the Thinker component for text generation.
111
+ Key features:
112
+ - Block-wise processing for streaming multimodal inputs
113
+ - TMRoPE (Time-aligned Multimodal RoPE) for synchronizing video and audio timestamps
114
+ - Can extract and process audio directly from video files
115
+ See: https://huggingface.co/docs/transformers/en/model_doc/qwen2_5_omni
116
+ Model: Qwen/Qwen2.5-Omni-7B (7B parameters)
117
+ """
118
+
119
+ MODEL_ID = "Qwen/Qwen2.5-Omni-7B"
120
+
121
+ # Default system prompt required by Qwen2.5-Omni for proper audio processing
122
+ DEFAULT_SYSTEM_PROMPT = (
123
+ "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, "
124
+ "capable of perceiving auditory and visual inputs, as well as generating text and speech."
125
+ )
126
+
127
+ def __init__(
128
+ self,
129
+ device: str | torch.device | None = None,
130
+ use_8bit: bool = False,
131
+ instruction: str | None = None,
132
+ ):
133
+ """
134
+ Initialize the Qwen2.5-Omni captioner.
135
+ Args:
136
+ device: Device to use for inference (e.g., 'cuda', 'cuda:0', 'cpu')
137
+ use_8bit: Whether to use 8-bit quantization for reduced memory usage
138
+ instruction: Custom instruction prompt. If None, uses the default instruction
139
+ """
140
+ self.device = torch.device(device or ("cuda" if torch.cuda.is_available() else "cpu"))
141
+ self.instruction = instruction
142
+ self._load_model(use_8bit=use_8bit)
143
+
144
+ @property
145
+ def supports_audio(self) -> bool:
146
+ return True
147
+
148
+ def caption(
149
+ self,
150
+ path: str | Path,
151
+ fps: int = 1,
152
+ include_audio: bool = True,
153
+ clean_caption: bool = True,
154
+ ) -> str:
155
+ """Generate a caption for the given video or image.
156
+ Args:
157
+ path: Path to the video/image file to caption
158
+ fps: Frames per second to sample from videos
159
+ include_audio: Whether to include audio in the captioning (for videos)
160
+ clean_caption: Whether to clean up the raw caption by removing common VLM patterns
161
+ Returns:
162
+ A string containing the generated caption
163
+ """
164
+ path = Path(path)
165
+ is_image = self._is_image_file(path)
166
+ is_video = self._is_video_file(path)
167
+
168
+ # Determine if we should process audio
169
+ use_audio = include_audio and is_video
170
+
171
+ # Use custom instruction if provided, otherwise pick appropriate default
172
+ if self.instruction is not None:
173
+ instruction = self.instruction
174
+ else:
175
+ instruction = DEFAULT_CAPTION_INSTRUCTION if use_audio else VIDEO_ONLY_CAPTION_INSTRUCTION
176
+
177
+ # Build the user content based on media type
178
+ # Based on HuggingFace docs: https://huggingface.co/docs/transformers/en/model_doc/qwen2_5_omni
179
+ user_content = []
180
+
181
+ if is_image:
182
+ user_content.append({"type": "image", "image": str(path)})
183
+ elif is_video:
184
+ user_content.append({"type": "video", "video": str(path)})
185
+
186
+ # Add the instruction text
187
+ user_content.append({"type": "text", "text": instruction})
188
+
189
+ # Build conversation - use the default system prompt required by Qwen2.5-Omni
190
+ # Using a custom system prompt causes warnings and may affect audio processing
191
+ messages = [
192
+ {
193
+ "role": "system",
194
+ "content": [{"type": "text", "text": self.DEFAULT_SYSTEM_PROMPT}],
195
+ },
196
+ {"role": "user", "content": user_content},
197
+ ]
198
+
199
+ # Process inputs using the processor's apply_chat_template
200
+ # For videos with audio, use load_audio_from_video=True and use_audio_in_video=True
201
+ inputs = self.processor.apply_chat_template(
202
+ messages,
203
+ load_audio_from_video=use_audio,
204
+ add_generation_prompt=True,
205
+ tokenize=True,
206
+ return_dict=True,
207
+ return_tensors="pt",
208
+ fps=fps,
209
+ padding=True,
210
+ use_audio_in_video=use_audio,
211
+ ).to(self.model.device)
212
+
213
+ # Generate caption (text only, using Thinker-only model)
214
+ # Note: For Qwen2_5OmniThinkerForConditionalGeneration, use standard generate params
215
+ # (not thinker_ prefixed ones, those are for the full Qwen2_5OmniForConditionalGeneration)
216
+ input_len = inputs["input_ids"].shape[1]
217
+
218
+ output_tokens = self.model.generate(
219
+ **inputs,
220
+ use_audio_in_video=use_audio,
221
+ do_sample=False,
222
+ max_new_tokens=1024,
223
+ )
224
+
225
+ # Extract only the generated tokens (exclude the input/prompt tokens)
226
+ generated_tokens = output_tokens[:, input_len:]
227
+
228
+ # Decode only the generated response
229
+ caption_raw = self.processor.batch_decode(
230
+ generated_tokens,
231
+ skip_special_tokens=True,
232
+ clean_up_tokenization_spaces=False,
233
+ )[0]
234
+
235
+ # Remove hallucinated conversation turns (e.g., "Human\nHuman\n..." or "Human: ...")
236
+ # This is a known issue with chat models continuing to generate fake turns
237
+ # We look for patterns that are clearly hallucinated chat turns, not legitimate uses of "human"
238
+
239
+ # Match "\nHuman" followed by ":", "\n", or end of string (chat turn patterns)
240
+ # This won't match "A human walks..." or "...the human body..."
241
+ caption_raw = re.split(r"\nHuman(?::|(?:\s*\n)|$)", caption_raw, maxsplit=1)[0]
242
+ caption_raw = caption_raw.strip()
243
+
244
+ # Clean up caption if requested
245
+ return self._clean_raw_caption(caption_raw) if clean_caption else caption_raw
246
+
247
+ def _load_model(self, use_8bit: bool) -> None:
248
+ """Load the Qwen2.5-Omni model and processor.
249
+ Uses the Thinker-only model (Qwen2_5OmniThinkerForConditionalGeneration) for text generation
250
+ to save compute by not loading the audio generation components.
251
+ """
252
+ from transformers import ( # noqa: PLC0415
253
+ BitsAndBytesConfig,
254
+ Qwen2_5OmniProcessor,
255
+ Qwen2_5OmniThinkerForConditionalGeneration,
256
+ )
257
+
258
+ quantization_config = BitsAndBytesConfig(load_in_8bit=True) if use_8bit else None
259
+
260
+ # Use Thinker-only model for text generation (saves memory by not loading Talker)
261
+ self.model = Qwen2_5OmniThinkerForConditionalGeneration.from_pretrained(
262
+ self.MODEL_ID,
263
+ dtype=torch.bfloat16,
264
+ low_cpu_mem_usage=True,
265
+ quantization_config=quantization_config,
266
+ device_map="auto",
267
+ )
268
+
269
+ self.processor = Qwen2_5OmniProcessor.from_pretrained(self.MODEL_ID)
270
+
271
+
272
+ class GeminiFlashCaptioner(MediaCaptioningModel):
273
+ """Audio-visual captioning using Google's Gemini Flash API.
274
+ Gemini Flash is a cloud-based multimodal model that natively supports
275
+ audio and video understanding. Requires a Google API key.
276
+ Note: This captioner requires the `google-generativeai` package and a valid API key.
277
+ Set the GEMINI_API_KEY or GOOGLE_API_KEY environment variable, or pass the key directly.
278
+ """
279
+
280
+ MODEL_ID = "gemini-flash-lite-latest"
281
+
282
+ def __init__(
283
+ self,
284
+ api_key: str | None = None,
285
+ instruction: str | None = None,
286
+ ):
287
+ """Initialize the Gemini Flash captioner.
288
+ Args:
289
+ api_key: Google API key. If not provided, will look for
290
+ GEMINI_API_KEY or GOOGLE_API_KEY environment variable.
291
+ instruction: Custom instruction prompt. If None, uses the default instruction
292
+ """
293
+ self.instruction = instruction
294
+ self._init_client(api_key)
295
+
296
+ @property
297
+ def supports_audio(self) -> bool:
298
+ return True
299
+
300
+ def caption(
301
+ self,
302
+ path: str | Path,
303
+ fps: int = 3, # noqa: ARG002 - kept for API compatibility
304
+ include_audio: bool = True,
305
+ clean_caption: bool = True,
306
+ ) -> str:
307
+ """Generate a caption for the given video or image.
308
+ Args:
309
+ path: Path to the video/image file to caption
310
+ fps: Frames per second (not used for Gemini, kept for API compatibility)
311
+ include_audio: Whether to include audio content in the caption
312
+ clean_caption: Whether to clean up the raw caption
313
+ Returns:
314
+ A string containing the generated caption
315
+ """
316
+ import time # noqa: PLC0415
317
+
318
+ path = Path(path)
319
+ is_video = self._is_video_file(path)
320
+ use_audio = include_audio and is_video
321
+
322
+ # Use custom instruction if provided, otherwise pick appropriate default
323
+ if self.instruction is not None:
324
+ instruction = self.instruction
325
+ else:
326
+ instruction = DEFAULT_CAPTION_INSTRUCTION if use_audio else VIDEO_ONLY_CAPTION_INSTRUCTION
327
+
328
+ # Upload the file to Gemini
329
+ uploaded_file = self._genai.upload_file(path)
330
+
331
+ # Wait for processing to complete (videos need time to process)
332
+ while uploaded_file.state.name == "PROCESSING":
333
+ time.sleep(1)
334
+ uploaded_file = self._genai.get_file(uploaded_file.name)
335
+
336
+ if uploaded_file.state.name == "FAILED":
337
+ raise RuntimeError(f"File processing failed: {uploaded_file.state.name}")
338
+
339
+ # Generate caption
340
+ response = self._model.generate_content([uploaded_file, instruction])
341
+
342
+ caption_raw = response.text
343
+
344
+ # Clean up the uploaded file
345
+ self._genai.delete_file(uploaded_file.name)
346
+
347
+ # Clean up caption if requested
348
+ return self._clean_raw_caption(caption_raw) if clean_caption else caption_raw
349
+
350
+ def _init_client(self, api_key: str | None) -> None:
351
+ """Initialize the Gemini API client."""
352
+ import os # noqa: PLC0415
353
+
354
+ try:
355
+ import google.generativeai as genai # noqa: PLC0415
356
+ except ImportError as e:
357
+ raise ImportError(
358
+ "The `google-generativeai` package is required for Gemini Flash captioning. "
359
+ "Install it with: `uv pip install google-generativeai`"
360
+ ) from e
361
+
362
+ # Get API key from argument or environment
363
+ # GEMINI_API_KEY is the recommended variable, GOOGLE_API_KEY also works
364
+ resolved_api_key = api_key or os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY")
365
+
366
+ if not resolved_api_key:
367
+ raise ValueError(
368
+ "Gemini API key is required. Provide it via the `api_key` argument "
369
+ "or set the GEMINI_API_KEY or GOOGLE_API_KEY environment variable."
370
+ )
371
+
372
+ # Configure the genai library with the API key
373
+ genai.configure(api_key=resolved_api_key)
374
+
375
+ # Store reference to genai module for file operations
376
+ self._genai = genai
377
+
378
+ # Initialize the model
379
+ self._model = genai.GenerativeModel(self.MODEL_ID)
380
+
381
+
382
+ def example() -> None:
383
+ """Example usage of the captioning module."""
384
+ import sys # noqa: PLC0415
385
+
386
+ if len(sys.argv) < 2:
387
+ print(f"Usage: python {sys.argv[0]} <video_path> [captioner_type]") # noqa: T201
388
+ print(" captioner_type: qwen_omni (default) or gemini_flash") # noqa: T201
389
+ sys.exit(1)
390
+
391
+ video_path = sys.argv[1]
392
+ captioner_type = CaptionerType(sys.argv[2]) if len(sys.argv) > 2 else CaptionerType.QWEN_OMNI
393
+
394
+ print(f"Using {captioner_type.value} captioner:") # noqa: T201
395
+ captioner = create_captioner(captioner_type)
396
+ caption = captioner.caption(video_path)
397
+ print(f"CAPTION: {caption}") # noqa: T201
398
+
399
+
400
+ if __name__ == "__main__":
401
+ example()
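
As a small usage sketch beyond the built-in example(), the snippet below exercises both captioner back-ends with the constructor arguments defined above. The media path is a placeholder, and the Gemini path assumes a valid GEMINI_API_KEY (or GOOGLE_API_KEY) is set.

# Editor's sketch (placeholder media path; GEMINI_API_KEY assumed to be set).
from ltx_trainer.captioning import CaptionerType, create_captioner

video = "clip.mp4"  # placeholder

# Local Qwen2.5-Omni with 8-bit weights to reduce VRAM usage.
qwen = create_captioner(CaptionerType.QWEN_OMNI, use_8bit=True)
print(qwen.caption(video, fps=1, include_audio=True))

# Cloud-based Gemini Flash with a custom instruction prompt.
gemini = create_captioner(
    CaptionerType.GEMINI_FLASH,
    instruction="Describe the clip in one sentence.",
)
print(gemini.caption(video, include_audio=True))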
packages/ltx-trainer/src/ltx_trainer/gemma_8bit.py ADDED
@@ -0,0 +1,85 @@
1
+ # ruff: noqa: PLC0415
2
+
3
+ """
4
+ 8-bit Gemma text encoder loading utilities.
5
+ This module provides functionality for loading the Gemma text encoder in 8-bit precision
6
+ using bitsandbytes, which significantly reduces GPU memory usage.
7
+ Example usage:
8
+ from ltx_trainer.gemma_8bit import load_8bit_gemma
9
+ text_encoder = load_8bit_gemma(gemma_model_path="/path/to/gemma")
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import logging
15
+ from collections.abc import Generator
16
+ from contextlib import contextmanager
17
+ from pathlib import Path
18
+
19
+ import torch
20
+
21
+ from ltx_core.text_encoders.gemma.encoders.base_encoder import GemmaTextEncoder
22
+ from ltx_core.text_encoders.gemma.tokenizer import LTXVGemmaTokenizer
23
+
24
+
25
+ def load_8bit_gemma(gemma_model_path: str | Path, dtype: torch.dtype = torch.bfloat16) -> GemmaTextEncoder:
26
+ """Load the Gemma text encoder in 8-bit precision using bitsandbytes.
27
+ Only the Gemma LLM backbone is loaded here. The embeddings processor
28
+ (feature extractor + connectors) should be loaded separately via
29
+ :func:`ltx_trainer.model_loader.load_embeddings_processor`.
30
+ Args:
31
+ gemma_model_path: Path to Gemma model directory
32
+ dtype: Data type for non-quantized model weights
33
+ Returns:
34
+ GemmaTextEncoder with 8-bit quantized Gemma backbone
35
+ Raises:
36
+ ImportError: If bitsandbytes is not installed
37
+ FileNotFoundError: If required model files are not found
38
+ """
39
+ try:
40
+ from transformers import BitsAndBytesConfig, Gemma3ForConditionalGeneration
41
+ except ImportError as e:
42
+ raise ImportError(
43
+ "8-bit text encoder loading requires bitsandbytes. Install it with: uv pip install bitsandbytes"
44
+ ) from e
45
+
46
+ gemma_path = _find_gemma_subpath(gemma_model_path, "model*.safetensors")
47
+ tokenizer_path = _find_gemma_subpath(gemma_model_path, "tokenizer.model")
48
+
49
+ quantization_config = BitsAndBytesConfig(load_in_8bit=True)
50
+ with _suppress_accelerate_memory_warnings():
51
+ gemma_model = Gemma3ForConditionalGeneration.from_pretrained(
52
+ gemma_path,
53
+ quantization_config=quantization_config,
54
+ torch_dtype=torch.bfloat16,
55
+ device_map="auto",
56
+ local_files_only=True,
57
+ )
58
+
59
+ tokenizer = LTXVGemmaTokenizer(tokenizer_path, 1024)
60
+
61
+ return GemmaTextEncoder(
62
+ tokenizer=tokenizer,
63
+ model=gemma_model,
64
+ dtype=dtype,
65
+ )
66
+
67
+
68
+ def _find_gemma_subpath(root_path: str | Path, pattern: str) -> str:
69
+ """Find a file matching a glob pattern and return its parent directory."""
70
+ matches = list(Path(root_path).rglob(pattern))
71
+ if not matches:
72
+ raise FileNotFoundError(f"No files matching pattern '{pattern}' found under {root_path}")
73
+ return str(matches[0].parent)
74
+
75
+
76
+ @contextmanager
77
+ def _suppress_accelerate_memory_warnings() -> Generator[None, None, None]:
78
+ """Temporarily suppress INFO warnings from accelerate about memory allocation."""
79
+ accelerate_logger = logging.getLogger("accelerate.utils.modeling")
80
+ old_level = accelerate_logger.level
81
+ accelerate_logger.setLevel(logging.WARNING)
82
+ try:
83
+ yield
84
+ finally:
85
+ accelerate_logger.setLevel(old_level)
packages/ltx-trainer/src/ltx_trainer/gpu_utils.py ADDED
@@ -0,0 +1,90 @@
1
+ """GPU memory management utilities for training and inference."""
2
+
3
+ import functools
4
+ import gc
5
+ import subprocess
6
+ from typing import Callable, TypeVar
7
+
8
+ import torch
9
+
10
+ from ltx_trainer import logger
11
+
12
+ F = TypeVar("F", bound=Callable)
13
+
14
+
15
+ def free_gpu_memory(log: bool = False) -> None:
16
+ """Free GPU memory by running garbage collection and emptying CUDA cache.
17
+ Args:
18
+ log: If True, log memory stats after clearing
19
+ """
20
+ gc.collect()
21
+ if torch.cuda.is_available():
22
+ torch.cuda.empty_cache()
23
+ if log:
24
+ allocated = torch.cuda.memory_allocated() / 1024**3
25
+ reserved = torch.cuda.memory_reserved() / 1024**3
26
+ logger.debug(f"GPU memory freed. Allocated: {allocated:.2f}GB, Reserved: {reserved:.2f}GB")
27
+
28
+
29
+ class free_gpu_memory_context: # noqa: N801
30
+ """Context manager and decorator to free GPU memory before and/or after execution.
31
+ Can be used as a decorator:
32
+ @free_gpu_memory_context(after=True)
33
+ def my_function():
34
+ ...
35
+ Or as a context manager:
36
+ with free_gpu_memory_context():
37
+ heavy_operation()
38
+ Args:
39
+ before: Free memory before execution (default: False)
40
+ after: Free memory after execution (default: True)
41
+ log: Log memory stats when freeing (default: False)
42
+ """
43
+
44
+ def __init__(self, *, before: bool = False, after: bool = True, log: bool = False) -> None:
45
+ self.before = before
46
+ self.after = after
47
+ self.log = log
48
+
49
+ def __enter__(self) -> "free_gpu_memory_context":
50
+ if self.before:
51
+ free_gpu_memory(log=self.log)
52
+ return self
53
+
54
+ def __exit__(self, exc_type: type | None, exc_val: Exception | None, exc_tb: object) -> None:
55
+ if self.after:
56
+ free_gpu_memory(log=self.log)
57
+
58
+ def __call__(self, func: F) -> F:
59
+ @functools.wraps(func)
60
+ def wrapper(*args, **kwargs) -> object:
61
+ with self:
62
+ return func(*args, **kwargs)
63
+
64
+ return wrapper # type: ignore
65
+
66
+
67
+ def get_gpu_memory_gb(device: torch.device) -> float:
68
+ """Get current GPU memory usage in GB using nvidia-smi.
69
+ Args:
70
+ device: torch.device to get memory usage for
71
+ Returns:
72
+ Current GPU memory usage in GB
73
+ """
74
+ try:
75
+ device_id = device.index if device.index is not None else 0
76
+ result = subprocess.check_output(
77
+ [
78
+ "nvidia-smi",
79
+ "--query-gpu=memory.used",
80
+ "--format=csv,nounits,noheader",
81
+ "-i",
82
+ str(device_id),
83
+ ],
84
+ encoding="utf-8",
85
+ )
86
+ return float(result.strip()) / 1024 # Convert MB to GB
87
+ except (subprocess.CalledProcessError, FileNotFoundError, ValueError) as e:
88
+ logger.error(f"Failed to get GPU memory from nvidia-smi: {e}")
89
+ # Fallback to torch
90
+ return torch.cuda.memory_allocated(device) / 1024**3
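
A short sketch combining the two utilities above: wrap a memory-heavy call with `free_gpu_memory_context` and report usage via `get_gpu_memory_gb`. The wrapped function is a placeholder and assumes a CUDA device is available.

# Editor's sketch using the helpers defined above (requires a CUDA device).
import torch

from ltx_trainer.gpu_utils import free_gpu_memory_context, get_gpu_memory_gb

device = torch.device("cuda:0")

@free_gpu_memory_context(before=True, after=True)
def encode_batch(batch: torch.Tensor) -> torch.Tensor:
    # Placeholder for a VRAM-heavy operation (e.g., VAE encoding).
    return batch.to(device).float().mean(dim=0)

_ = encode_batch(torch.randn(8, 3, 256, 256))
print(f"GPU memory in use: {get_gpu_memory_gb(device):.2f} GB")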
packages/ltx-trainer/src/ltx_trainer/progress.py ADDED
@@ -0,0 +1,236 @@
1
+ """Progress tracking for LTX training.
2
+ This module provides a unified progress display for training and validation sampling,
3
+ encapsulating all Rich progress bar logic in one place.
4
+ """
5
+
6
+ from rich.progress import (
7
+ BarColumn,
8
+ Progress,
9
+ TaskID,
10
+ TextColumn,
11
+ TimeElapsedColumn,
12
+ TimeRemainingColumn,
13
+ )
14
+
15
+
16
+ class SamplingContext:
17
+ """Context for validation sampling progress tracking.
18
+ Provides a unified progress display showing current video and denoising step.
19
+ Display format: "Sampling X/Y [████████████] step Z/W"
20
+ The progress bar shows the denoising progress for the current video.
21
+ """
22
+
23
+ def __init__(self, progress: Progress | None, task: TaskID | None, num_prompts: int, num_steps: int):
24
+ self._progress = progress
25
+ self._task = task
26
+ self._num_prompts = num_prompts
27
+ self._num_steps = num_steps
28
+
29
+ def start_video(self, video_idx: int) -> None:
30
+ """Start tracking a new video (resets step progress)."""
31
+ if self._progress is None or self._task is None:
32
+ return
33
+ # Reset task for new video: completed=0, total=num_steps
34
+ self._progress.reset(self._task, total=self._num_steps)
35
+ self._progress.update(
36
+ self._task,
37
+ completed=0,
38
+ video=f"{video_idx + 1}/{self._num_prompts}",
39
+ info=f"step 0/{self._num_steps}",
40
+ )
41
+
42
+ def advance_step(self) -> None:
43
+ """Advance the denoising step by one."""
44
+ if self._progress is None or self._task is None:
45
+ return
46
+ self._progress.advance(self._task)
47
+ completed = int(self._progress.tasks[self._task].completed)
48
+ self._progress.update(self._task, info=f"step {completed}/{self._num_steps}")
49
+
50
+ def cleanup(self) -> None:
51
+ """Hide sampling task when done."""
52
+ if self._progress is None or self._task is None:
53
+ return
54
+ self._progress.update(self._task, visible=False)
55
+
56
+
57
+ class StandaloneSamplingProgress:
58
+ """Standalone progress display for inference scripts.
59
+ Unlike SamplingContext (which integrates with TrainingProgress), this class
60
+ manages its own Rich Progress instance for use in standalone inference scripts.
61
+ Usage:
62
+ with StandaloneSamplingProgress(num_steps=30) as ctx:
63
+ for step in range(30):
64
+ # ... denoising step ...
65
+ ctx.advance_step()
66
+ """
67
+
68
+ def __init__(self, num_steps: int, description: str = "Generating"):
69
+ """Initialize standalone sampling progress.
70
+ Args:
71
+ num_steps: Total number of denoising steps
72
+ description: Description to show in progress bar
73
+ """
74
+ self._num_steps = num_steps
75
+ self._description = description
76
+ self._progress: Progress | None = None
77
+ self._task: TaskID | None = None
78
+
79
+ def __enter__(self) -> "StandaloneSamplingProgress":
80
+ """Start the progress display."""
81
+ self._progress = Progress(
82
+ TextColumn("[progress.description]{task.description}"),
83
+ BarColumn(bar_width=40, style="blue"),
84
+ TextColumn("{task.fields[info]}", style="cyan"),
85
+ TimeElapsedColumn(),
86
+ TextColumn("ETA:"),
87
+ TimeRemainingColumn(compact=True),
88
+ )
89
+ self._progress.__enter__()
90
+ self._task = self._progress.add_task(
91
+ self._description,
92
+ total=self._num_steps,
93
+ info=f"step 0/{self._num_steps}",
94
+ )
95
+ return self
96
+
97
+ def __exit__(self, *args) -> None:
98
+ """Stop the progress display."""
99
+ if self._progress is not None:
100
+ self._progress.__exit__(*args)
101
+
102
+ def advance_step(self) -> None:
103
+ """Advance the denoising step by one."""
104
+ if self._progress is None or self._task is None:
105
+ return
106
+ self._progress.advance(self._task)
107
+ completed = int(self._progress.tasks[self._task].completed)
108
+ self._progress.update(self._task, info=f"step {completed}/{self._num_steps}")
109
+
110
+
111
+ class TrainingProgress:
112
+ """Manages Rich progress display for training and validation.
113
+ This class encapsulates all progress bar logic, providing a clean interface
114
+ for the trainer to update progress without dealing with Rich internals.
115
+ Usage:
116
+ with TrainingProgress(enabled=True, total_steps=1000) as progress:
117
+ for step in range(1000):
118
+ # ... training step ...
119
+ progress.update_training(loss=0.1, lr=1e-4, step_time=0.5)
120
+ if should_validate:
121
+ sampling_ctx = progress.start_sampling(num_prompts=3, num_steps=30)
122
+ sampler = ValidationSampler(..., sampling_context=sampling_ctx)
123
+ for prompt_idx, prompt in enumerate(prompts):
124
+ sampling_ctx.start_video(prompt_idx)
125
+ sampler.generate(...)
126
+ sampling_ctx.cleanup()
127
+ """
128
+
129
+ def __init__(self, enabled: bool, total_steps: int):
130
+ """Initialize progress tracking.
131
+ Args:
132
+ enabled: Whether to display progress bars (False for non-main processes)
133
+ total_steps: Total number of training steps
134
+ """
135
+ self._enabled = enabled
136
+ self._total_steps = total_steps
137
+ self._train_task: TaskID | None = None
138
+
139
+ if not enabled:
140
+ self._progress = None
141
+ return
142
+
143
+ # Single Progress instance with flexible columns
144
+ self._progress = Progress(
145
+ TextColumn("[progress.description]{task.description}"),
146
+ TextColumn("{task.fields[video]}", style="magenta"),
147
+ BarColumn(bar_width=40, style="blue"),
148
+ TextColumn("{task.fields[info]}", style="cyan"),
149
+ TimeElapsedColumn(),
150
+ TextColumn("ETA:"),
151
+ TimeRemainingColumn(compact=True),
152
+ )
153
+
154
+ def __enter__(self) -> "TrainingProgress":
155
+ """Enter the progress context, starting the live display."""
156
+ if self._progress is not None:
157
+ self._progress.__enter__()
158
+ self._train_task = self._progress.add_task(
159
+ "Training",
160
+ total=self._total_steps,
161
+ video=f"0/{self._total_steps}",
162
+ info="Starting...",
163
+ )
164
+ return self
165
+
166
+ def __exit__(self, *args) -> None:
167
+ """Exit the progress context, stopping the live display."""
168
+ if self._progress is not None:
169
+ self._progress.__exit__(*args)
170
+
171
+ @property
172
+ def enabled(self) -> bool:
173
+ """Whether progress display is enabled."""
174
+ return self._enabled
175
+
176
+ def update_training(
177
+ self,
178
+ *,
179
+ loss: float,
180
+ lr: float,
181
+ step_time: float,
182
+ advance: bool = True,
183
+ ) -> None:
184
+ """Update the training progress display.
185
+ Args:
186
+ loss: Current training loss
187
+ lr: Current learning rate
188
+ step_time: Time taken for this step in seconds
189
+ advance: Whether to advance the progress by one step
190
+ """
191
+ if self._progress is None or self._train_task is None:
192
+ return
193
+
194
+ info = f"Loss: {loss:.4f} | LR: {lr:.2e} | {step_time:.2f}s/step"
195
+ self._progress.update(
196
+ self._train_task,
197
+ advance=1 if advance else 0,
198
+ info=info,
199
+ )
200
+ # Update step count in video column
201
+ completed = int(self._progress.tasks[self._train_task].completed)
202
+ self._progress.update(self._train_task, video=f"{completed}/{self._total_steps}")
203
+
204
+ def start_sampling(self, num_prompts: int, num_steps: int) -> SamplingContext:
205
+ """Start validation sampling progress tracking.
206
+ Creates a task that shows current video and denoising step progress.
207
+ Format: "Sampling X/Y [████████████] step Z/W"
208
+ Args:
209
+ num_prompts: Number of validation prompts to sample
210
+ num_steps: Number of denoising steps per sample
211
+ Returns:
212
+ SamplingContext for tracking progress (no-op if progress is disabled)
213
+ """
214
+ if self._progress is None:
215
+ # Return a no-op context when progress is disabled
216
+ return SamplingContext(
217
+ progress=None,
218
+ task=None,
219
+ num_prompts=num_prompts,
220
+ num_steps=num_steps,
221
+ )
222
+
223
+ task = self._progress.add_task(
224
+ "Sampling",
225
+ total=num_steps,
226
+ completed=0,
227
+ video=f"0/{num_prompts}",
228
+ info=f"step 0/{num_steps}",
229
+ )
230
+
231
+ return SamplingContext(
232
+ progress=self._progress,
233
+ task=task,
234
+ num_prompts=num_prompts,
235
+ num_steps=num_steps,
236
+ )
packages/ltx-trainer/src/ltx_trainer/quantization.py ADDED
@@ -0,0 +1,195 @@
1
+ # Adapted from: https://github.com/bghira/SimpleTuner
2
+ # With improvements from: https://github.com/ostris/ai-toolkit
3
+ from typing import Literal
4
+
5
+ import torch
6
+ from rich.progress import BarColumn, Progress, SpinnerColumn, TaskProgressColumn, TextColumn
7
+
8
+ from ltx_trainer import logger
9
+
10
+ QuantizationOptions = Literal[
11
+ "int8-quanto",
12
+ "int4-quanto",
13
+ "int2-quanto",
14
+ "fp8-quanto",
15
+ "fp8uz-quanto",
16
+ ]
17
+
18
+ # Modules to exclude from quantization.
19
+ # These are glob patterns passed to quanto's `exclude` parameter.
20
+ # When quantizing the full model at once, these patterns match against full module paths.
21
+ # When quantizing block-by-block, we also use SKIP_ROOT_MODULES for top-level modules.
22
+ EXCLUDE_PATTERNS = [
23
+ # Input/output projection layers
24
+ "patchify_proj",
25
+ "audio_patchify_proj",
26
+ "proj_out",
27
+ "audio_proj_out",
28
+ # Timestep embedding layers - int4 tinygemm requires strict bfloat16 input
29
+ # and these receive float32 sinusoidal embeddings that are cast to bfloat16
30
+ "*adaln*",
31
+ "time_proj",
32
+ "timestep_embedder*",
33
+ # Caption/text projection layers
34
+ "caption_projection*",
35
+ "audio_caption_projection*",
36
+ # Normalization layers (usually excluded from quantization)
37
+ "*norm*",
38
+ ]
39
+
40
+ # Top-level modules to skip entirely during block-by-block quantization.
41
+ # These are exact matches against model.named_children() names.
42
+ # (Needed because quanto's exclude patterns don't work when calling quantize() directly on a module)
43
+ SKIP_ROOT_MODULES = {
44
+ "patchify_proj",
45
+ "audio_patchify_proj",
46
+ "proj_out",
47
+ "audio_proj_out",
48
+ "audio_caption_projection",
49
+ }
50
+
51
+
52
+ def quantize_model(
53
+ model: torch.nn.Module,
54
+ precision: QuantizationOptions,
55
+ quantize_activations: bool = False,
56
+ device: torch.device | str | None = None,
57
+ ) -> torch.nn.Module:
58
+ """
59
+ Quantize a model using optimum-quanto.
60
+ For large models with transformer_blocks, this function quantizes block-by-block
61
+ on GPU then moves back to CPU, which is much faster than quantizing on CPU and
62
+ uses less peak VRAM than loading the entire model to GPU at once.
63
+ Args:
64
+ model: The model to quantize.
65
+ precision: The quantization precision (e.g. "int8-quanto", "fp8-quanto").
66
+ quantize_activations: Whether to quantize activations in addition to weights.
67
+ device: Device to use for quantization. If None, uses CUDA if available, else CPU.
68
+ Returns:
69
+ The quantized model.
70
+ """
71
+ from optimum.quanto import freeze, quantize # noqa: PLC0415
72
+
73
+ if device is None:
74
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
75
+ elif isinstance(device, str):
76
+ device = torch.device(device)
77
+
78
+ weight_quant = _get_quanto_dtype(precision)
79
+
80
+ if quantize_activations:
81
+ logger.debug("Quantizing model weights and activations")
82
+ activations_quant = weight_quant
83
+ else:
84
+ activations_quant = None
85
+
86
+ # Remember original device to restore after quantization
87
+ original_device = next(model.parameters()).device
88
+
89
+ # Check if model has transformer_blocks for block-by-block quantization
90
+ if hasattr(model, "transformer_blocks"):
91
+ logger.debug("Quantizing model using block-by-block approach for memory efficiency")
92
+ _quantize_blockwise(
93
+ model,
94
+ weight_quant=weight_quant,
95
+ activations_quant=activations_quant,
96
+ device=device,
97
+ )
98
+ else:
99
+ # Fallback: quantize entire model at once
100
+ model.to(device)
101
+ quantize(model, weights=weight_quant, activations=activations_quant, exclude=EXCLUDE_PATTERNS)
102
+ freeze(model)
103
+
104
+ # Restore model to original device
105
+ model.to(original_device)
106
+
107
+ return model
108
+
109
+
110
+ def _quantize_blockwise(
111
+ model: torch.nn.Module,
112
+ weight_quant: torch.dtype,
113
+ activations_quant: torch.dtype | None,
114
+ device: torch.device,
115
+ ) -> None:
116
+ """Quantize a model block-by-block using optimum-quanto.
117
+ This approach:
118
+ 1. Moves each transformer block to GPU
119
+ 2. Quantizes on GPU (fast!)
120
+ 3. Freezes the quantized weights
121
+ 4. Moves back to CPU
122
+ This is much faster than quantizing on CPU and uses less peak VRAM
123
+ than loading the entire model to GPU.
124
+ """
125
+ from optimum.quanto import freeze, quantize # noqa: PLC0415
126
+
127
+ original_dtype = next(model.parameters()).dtype
128
+ transformer_blocks = list(model.transformer_blocks)
129
+
130
+ with Progress(
131
+ SpinnerColumn(),
132
+ TextColumn("[progress.description]{task.description}"),
133
+ BarColumn(),
134
+ TaskProgressColumn(),
135
+ transient=True,
136
+ ) as progress:
137
+ task = progress.add_task("Quantizing transformer blocks", total=len(transformer_blocks))
138
+
139
+ for block in transformer_blocks:
140
+ # Move block to GPU
141
+ block.to(device, dtype=original_dtype, non_blocking=True)
142
+
143
+ # Quantize on GPU
144
+ quantize(block, weights=weight_quant, activations=activations_quant, exclude=EXCLUDE_PATTERNS)
145
+ freeze(block)
146
+
147
+ # Move back to CPU to free up VRAM for next block
148
+ block.to("cpu", non_blocking=True)
149
+
150
+ progress.advance(task)
151
+
152
+ # Quantize remaining non-transformer-block modules (e.g., embeddings, timestep projections)
153
+ # Skip modules that should not be quantized (patchify_proj, proj_out, etc.)
154
+ logger.debug("Quantizing remaining model components")
155
+
156
+ for name, module in model.named_children():
157
+ if name == "transformer_blocks":
158
+ continue # Already quantized
159
+
160
+ if name in SKIP_ROOT_MODULES:
161
+ logger.debug(f"Skipping quantization for module: {name}")
162
+ continue # Don't quantize these modules
163
+
164
+ # Move to device, quantize, freeze, move back
165
+ module.to(device, dtype=original_dtype, non_blocking=True)
166
+ quantize(module, weights=weight_quant, activations=activations_quant, exclude=EXCLUDE_PATTERNS)
167
+ freeze(module)
168
+ module.to("cpu", non_blocking=True)
169
+
170
+
171
+ def _get_quanto_dtype(precision: QuantizationOptions) -> torch.dtype:
172
+ """Map precision string to quanto dtype."""
173
+ from optimum.quanto import ( # noqa: PLC0415
174
+ qfloat8,
175
+ qfloat8_e4m3fnuz,
176
+ qint2,
177
+ qint4,
178
+ qint8,
179
+ )
180
+
181
+ if precision == "int2-quanto":
182
+ return qint2
183
+ elif precision == "int4-quanto":
184
+ return qint4
185
+ elif precision == "int8-quanto":
186
+ return qint8
187
+ elif precision in ("fp8-quanto", "fp8uz-quanto"):
188
+ if torch.backends.mps.is_available():
189
+ raise ValueError("FP8 quantization is not supported on MPS devices. Use int2, int4, or int8 instead.")
190
+ if precision == "fp8-quanto":
191
+ return qfloat8
192
+ elif precision == "fp8uz-quanto":
193
+ return qfloat8_e4m3fnuz
194
+
195
+ raise ValueError(f"Invalid quantization precision: {precision}")
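For orientation, a minimal usage sketch of the quantization helper above (illustrative only, not part of the uploaded file; the trainer itself passes its transformer together with the configured `acceleration.quantization` value):

import torch
from ltx_trainer.quantization import quantize_model

# Tiny stand-in module; in the trainer this is the LTX-2 transformer.
model = torch.nn.Sequential(torch.nn.Linear(8, 8), torch.nn.GELU(), torch.nn.Linear(8, 8))

# Quantize weights to int8 with optimum-quanto; activations stay unquantized by default.
model = quantize_model(model, precision="int8-quanto")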
packages/ltx-trainer/src/ltx_trainer/trainer.py ADDED
@@ -0,0 +1,1000 @@
1
+ import os
2
+ import time
3
+ import warnings
4
+ from pathlib import Path
5
+ from typing import Callable
6
+
7
+ import torch
8
+ import wandb
9
+ import yaml
10
+ from accelerate import Accelerator, DistributedType
11
+ from accelerate.utils import set_seed
12
+ from peft import LoraConfig, get_peft_model, get_peft_model_state_dict, set_peft_model_state_dict
13
+ from peft.tuners.tuners_utils import BaseTunerLayer
14
+ from peft.utils import ModulesToSaveWrapper
15
+ from pydantic import BaseModel
16
+ from safetensors.torch import load_file, save_file
17
+ from torch import Tensor
18
+ from torch.optim import AdamW
19
+ from torch.optim.lr_scheduler import (
20
+ CosineAnnealingLR,
21
+ CosineAnnealingWarmRestarts,
22
+ LinearLR,
23
+ LRScheduler,
24
+ PolynomialLR,
25
+ StepLR,
26
+ )
27
+ from torch.utils.data import DataLoader
28
+ from torchvision.transforms import functional as F # noqa: N812
29
+
30
+ from ltx_core.text_encoders.gemma import convert_to_additive_mask
31
+ from ltx_trainer import logger
32
+ from ltx_trainer.config import LtxTrainerConfig
33
+ from ltx_trainer.config_display import print_config
34
+ from ltx_trainer.datasets import PrecomputedDataset
35
+ from ltx_trainer.gpu_utils import free_gpu_memory, free_gpu_memory_context, get_gpu_memory_gb
36
+ from ltx_trainer.hf_hub_utils import push_to_hub
37
+ from ltx_trainer.model_loader import load_embeddings_processor, load_text_encoder
38
+ from ltx_trainer.model_loader import load_model as load_ltx_model
39
+ from ltx_trainer.progress import TrainingProgress
40
+ from ltx_trainer.quantization import quantize_model
41
+ from ltx_trainer.timestep_samplers import SAMPLERS
42
+ from ltx_trainer.training_strategies import get_training_strategy
43
+ from ltx_trainer.utils import open_image_as_srgb, save_image
44
+ from ltx_trainer.validation_sampler import CachedPromptEmbeddings, GenerationConfig, ValidationSampler
45
+ from ltx_trainer.video_utils import read_video, save_video
46
+
47
+ # Silence the tokenizers parallelism fork warning from the transformers/tokenizers stack
48
+ os.environ["TOKENIZERS_PARALLELISM"] = "true"
49
+
50
+ # Silence bitsandbytes warnings about casting
51
+ warnings.filterwarnings(
52
+ "ignore", message="MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization"
53
+ )
54
+
55
+ # Disable progress bars if not main process
56
+ IS_MAIN_PROCESS = os.environ.get("LOCAL_RANK", "0") == "0"
57
+ if not IS_MAIN_PROCESS:
58
+ from transformers.utils.logging import disable_progress_bar
59
+
60
+ disable_progress_bar()
61
+
62
+ StepCallback = Callable[[int, int, list[Path] | None], None] # (step, total_steps, sampled_video_paths; None before the first validation run)
63
+
64
+ MEMORY_CHECK_INTERVAL = 200
65
+
66
+
67
+ class TrainingStats(BaseModel):
68
+ """Statistics collected during training"""
69
+
70
+ total_time_seconds: float
71
+ steps_per_second: float
72
+ samples_per_second: float
73
+ peak_gpu_memory_gb: float
74
+ global_batch_size: int
75
+ num_processes: int
76
+
77
+
78
+ class LtxvTrainer:
79
+ def __init__(self, trainer_config: LtxTrainerConfig) -> None:
80
+ self._config = trainer_config
81
+ if IS_MAIN_PROCESS:
82
+ print_config(trainer_config)
83
+ self._training_strategy = get_training_strategy(self._config.training_strategy)
84
+ self._cached_validation_embeddings = self._load_text_encoder_and_cache_embeddings()
85
+ self._load_models()
86
+ self._setup_accelerator()
87
+ self._collect_trainable_params()
88
+ self._load_checkpoint()
89
+ self._prepare_models_for_training()
90
+ self._dataset = None
91
+ self._global_step = -1
92
+ self._checkpoint_paths = []
93
+ self._init_wandb()
94
+
95
+ def train( # noqa: PLR0912, PLR0915
96
+ self,
97
+ disable_progress_bars: bool = False,
98
+ step_callback: StepCallback | None = None,
99
+ ) -> tuple[Path, TrainingStats]:
100
+ """
101
+ Start the training process.
102
+ Returns:
103
+ Tuple of (saved_model_path, training_stats)
104
+ """
105
+ device = self._accelerator.device
106
+ cfg = self._config
107
+ start_mem = get_gpu_memory_gb(device)
108
+
109
+ train_start_time = time.time()
110
+
111
+ # Use the same seed for all processes and ensure deterministic operations
112
+ set_seed(cfg.seed)
113
+ logger.debug(f"Process {self._accelerator.process_index} using seed: {cfg.seed}")
114
+
115
+ self._init_optimizer()
116
+ self._init_dataloader()
117
+ data_iter = iter(self._dataloader)
118
+ self._init_timestep_sampler()
119
+
120
+ # Synchronize all processes after initialization
121
+ self._accelerator.wait_for_everyone()
122
+
123
+ Path(cfg.output_dir).mkdir(parents=True, exist_ok=True)
124
+
125
+ # Save the training configuration as YAML
126
+ self._save_config()
127
+
128
+ logger.info("🚀 Starting training...")
129
+
130
+ # Create progress tracking (disabled for non-main processes or when explicitly disabled)
131
+ progress_enabled = IS_MAIN_PROCESS and not disable_progress_bars
132
+ progress = TrainingProgress(
133
+ enabled=progress_enabled,
134
+ total_steps=cfg.optimization.steps,
135
+ )
136
+
137
+ if IS_MAIN_PROCESS and disable_progress_bars:
138
+ logger.warning("Progress bars disabled. Intermediate status messages will be logged instead.")
139
+
140
+ self._transformer.train()
141
+ self._global_step = 0
142
+
143
+ peak_mem_during_training = start_mem
144
+
145
+ sampled_videos_paths = None
146
+
147
+ with progress:
148
+ # Initial validation before training starts
149
+ if cfg.validation.interval and not cfg.validation.skip_initial_validation:
150
+ sampled_videos_paths = self._sample_videos(progress)
151
+ if IS_MAIN_PROCESS and sampled_videos_paths and self._config.wandb.log_validation_videos:
152
+ self._log_validation_samples(sampled_videos_paths, cfg.validation.prompts)
153
+
154
+ self._accelerator.wait_for_everyone()
155
+
156
+ for step in range(cfg.optimization.steps * cfg.optimization.gradient_accumulation_steps):
157
+ # Get next batch, reset the dataloader if needed
158
+ try:
159
+ batch = next(data_iter)
160
+ except StopIteration:
161
+ data_iter = iter(self._dataloader)
162
+ batch = next(data_iter)
163
+
164
+ step_start_time = time.time()
165
+ with self._accelerator.accumulate(self._transformer):
166
+ is_optimization_step = (step + 1) % cfg.optimization.gradient_accumulation_steps == 0
167
+ if is_optimization_step:
168
+ self._global_step += 1
169
+
170
+ loss = self._training_step(batch)
171
+ self._accelerator.backward(loss)
172
+
173
+ if self._accelerator.sync_gradients and cfg.optimization.max_grad_norm > 0:
174
+ self._accelerator.clip_grad_norm_(
175
+ self._trainable_params,
176
+ cfg.optimization.max_grad_norm,
177
+ )
178
+
179
+ self._optimizer.step()
180
+ self._optimizer.zero_grad()
181
+
182
+ if self._lr_scheduler is not None:
183
+ self._lr_scheduler.step()
184
+
185
+ # Run validation if needed
186
+ if (
187
+ cfg.validation.interval
188
+ and self._global_step > 0
189
+ and self._global_step % cfg.validation.interval == 0
190
+ and is_optimization_step
191
+ ):
192
+ if self._accelerator.distributed_type == DistributedType.FSDP:
193
+ # FSDP: All processes must participate in validation
194
+ sampled_videos_paths = self._sample_videos(progress)
195
+ if IS_MAIN_PROCESS and sampled_videos_paths and self._config.wandb.log_validation_videos:
196
+ self._log_validation_samples(sampled_videos_paths, cfg.validation.prompts)
197
+ # DDP: Only main process runs validation
198
+ elif IS_MAIN_PROCESS:
199
+ sampled_videos_paths = self._sample_videos(progress)
200
+ if sampled_videos_paths and self._config.wandb.log_validation_videos:
201
+ self._log_validation_samples(sampled_videos_paths, cfg.validation.prompts)
202
+
203
+ # Save checkpoint if needed
204
+ if (
205
+ cfg.checkpoints.interval
206
+ and self._global_step > 0
207
+ and self._global_step % cfg.checkpoints.interval == 0
208
+ and is_optimization_step
209
+ ):
210
+ self._save_checkpoint()
211
+
212
+ self._accelerator.wait_for_everyone()
213
+
214
+ # Call step callback if provided
215
+ if step_callback and is_optimization_step:
216
+ step_callback(self._global_step, cfg.optimization.steps, sampled_videos_paths)
217
+
218
+ self._accelerator.wait_for_everyone()
219
+
220
+ # Update progress and log metrics
221
+ current_lr = self._optimizer.param_groups[0]["lr"]
222
+ step_time = (time.time() - step_start_time) * cfg.optimization.gradient_accumulation_steps
223
+
224
+ progress.update_training(
225
+ loss=loss.item(),
226
+ lr=current_lr,
227
+ step_time=step_time,
228
+ advance=is_optimization_step,
229
+ )
230
+
231
+ # Log metrics to W&B (only on main process and optimization steps)
232
+ if IS_MAIN_PROCESS and is_optimization_step:
233
+ self._log_metrics(
234
+ {
235
+ "train/loss": loss.item(),
236
+ "train/learning_rate": current_lr,
237
+ "train/step_time": step_time,
238
+ "train/global_step": self._global_step,
239
+ }
240
+ )
241
+
242
+ # Fallback logging when progress bars are disabled
243
+ if disable_progress_bars and IS_MAIN_PROCESS and self._global_step % 20 == 0:
244
+ elapsed = time.time() - train_start_time
245
+ progress_percentage = self._global_step / cfg.optimization.steps
246
+ if progress_percentage > 0:
247
+ total_estimated = elapsed / progress_percentage
248
+ total_time = f"{total_estimated // 3600:.0f}h {(total_estimated % 3600) // 60:.0f}m"
249
+ else:
250
+ total_time = "calculating..."
251
+ logger.info(
252
+ f"Step {self._global_step}/{cfg.optimization.steps} - "
253
+ f"Loss: {loss.item():.4f}, LR: {current_lr:.2e}, "
254
+ f"Time/Step: {step_time:.2f}s, Total Time: {total_time}",
255
+ )
256
+
257
+ # Sample GPU memory periodically
258
+ if step % MEMORY_CHECK_INTERVAL == 0:
259
+ current_mem = get_gpu_memory_gb(device)
260
+ peak_mem_during_training = max(peak_mem_during_training, current_mem)
261
+
262
+ # Collect final stats
263
+ train_end_time = time.time()
264
+ end_mem = get_gpu_memory_gb(device)
265
+ peak_mem = max(start_mem, end_mem, peak_mem_during_training)
266
+
267
+ # Calculate steps/second over entire training
268
+ total_time_seconds = train_end_time - train_start_time
269
+ steps_per_second = cfg.optimization.steps / total_time_seconds
270
+
271
+ samples_per_second = steps_per_second * self._accelerator.num_processes * cfg.optimization.batch_size
272
+
273
+ stats = TrainingStats(
274
+ total_time_seconds=total_time_seconds,
275
+ steps_per_second=steps_per_second,
276
+ samples_per_second=samples_per_second,
277
+ peak_gpu_memory_gb=peak_mem,
278
+ num_processes=self._accelerator.num_processes,
279
+ global_batch_size=cfg.optimization.batch_size * self._accelerator.num_processes,
280
+ )
281
+
282
+ saved_path = self._save_checkpoint()
283
+
284
+ if IS_MAIN_PROCESS:
285
+ # Log the training statistics
286
+ self._log_training_stats(stats)
287
+
288
+ # Upload artifacts to hub if enabled
289
+ if cfg.hub.push_to_hub:
290
+ push_to_hub(saved_path, sampled_videos_paths, self._config)
291
+
292
+ # Log final stats to W&B
293
+ if self._wandb_run is not None:
294
+ self._log_metrics(
295
+ {
296
+ "stats/total_time_minutes": stats.total_time_seconds / 60,
297
+ "stats/steps_per_second": stats.steps_per_second,
298
+ "stats/samples_per_second": stats.samples_per_second,
299
+ "stats/peak_gpu_memory_gb": stats.peak_gpu_memory_gb,
300
+ }
301
+ )
302
+ self._wandb_run.finish()
303
+
304
+ self._accelerator.wait_for_everyone()
305
+ self._accelerator.end_training()
306
+
307
+ return saved_path, stats
308
+
309
+ def _training_step(self, batch: dict[str, dict[str, Tensor]]) -> Tensor:
310
+ """Perform a single training step using the configured strategy."""
311
+ # Apply embedding connectors to transform pre-computed text embeddings
312
+ conditions = batch["conditions"]
313
+
314
+ if "video_prompt_embeds" in conditions:
315
+ # New format: separate video/audio features from precompute()
316
+ video_features = conditions["video_prompt_embeds"]
317
+ audio_features = conditions.get("audio_prompt_embeds")
318
+ else:
319
+ # Legacy format: single prompt_embeds tensor — duplicate for both modalities
320
+ video_features = conditions["prompt_embeds"]
321
+ audio_features = conditions["prompt_embeds"]
322
+
323
+ mask = conditions["prompt_attention_mask"]
324
+ additive_mask = convert_to_additive_mask(mask, video_features.dtype)
325
+ video_embeds, audio_embeds, attention_mask = self._embeddings_processor.create_embeddings(
326
+ video_features, audio_features, additive_mask
327
+ )
328
+
329
+ conditions["video_prompt_embeds"] = video_embeds
330
+ conditions["audio_prompt_embeds"] = audio_embeds
331
+ conditions["prompt_attention_mask"] = attention_mask
332
+
333
+ # Use strategy to prepare training inputs (returns ModelInputs with Modality objects)
334
+ model_inputs = self._training_strategy.prepare_training_inputs(batch, self._timestep_sampler)
335
+
336
+ # Run transformer forward pass with Modality-based interface
337
+ video_pred, audio_pred = self._transformer(
338
+ video=model_inputs.video,
339
+ audio=model_inputs.audio,
340
+ perturbations=None,
341
+ )
342
+
343
+ # Use strategy to compute loss
344
+ loss = self._training_strategy.compute_loss(video_pred, audio_pred, model_inputs)
345
+
346
+ return loss
347
+
348
+ @free_gpu_memory_context(after=True)
349
+ def _load_text_encoder_and_cache_embeddings(self) -> list[CachedPromptEmbeddings] | None:
350
+ """Load text encoder + embeddings processor, compute and cache validation embeddings."""
351
+
352
+ # This method:
353
+ # 1. Loads the pure Gemma text encoder on GPU
354
+ # 2. Loads the embeddings processor (feature extractor + connectors)
355
+ # 3. If validation prompts are configured, computes and caches their embeddings
356
+ # 4. Unloads the Gemma model entirely, keeps the embeddings processor for training
357
+
358
+ # Load text encoder (pure Gemma LLM) on GPU
359
+ logger.debug("Loading text encoder...")
360
+ text_encoder = load_text_encoder(
361
+ gemma_model_path=self._config.model.text_encoder_path,
362
+ device="cuda",
363
+ dtype=torch.bfloat16,
364
+ load_in_8bit=self._config.acceleration.load_text_encoder_in_8bit,
365
+ )
366
+
367
+ # Load embeddings processor (feature extractor + connectors)
368
+ logger.debug("Loading embeddings processor...")
369
+ self._embeddings_processor = load_embeddings_processor(
370
+ checkpoint_path=self._config.model.model_path,
371
+ device="cuda",
372
+ dtype=torch.bfloat16,
373
+ )
374
+
375
+ # Cache validation embeddings if prompts are configured
376
+ cached_embeddings = None
377
+ if self._config.validation.prompts:
378
+ logger.info(f"Pre-computing embeddings for {len(self._config.validation.prompts)} validation prompts...")
379
+ cached_embeddings = []
380
+ with torch.inference_mode():
381
+ for prompt in self._config.validation.prompts:
382
+ pos_hs, pos_mask = text_encoder.encode(prompt)
383
+ pos_out = self._embeddings_processor.process_hidden_states(pos_hs, pos_mask)
384
+
385
+ neg_hs, neg_mask = text_encoder.encode(self._config.validation.negative_prompt)
386
+ neg_out = self._embeddings_processor.process_hidden_states(neg_hs, neg_mask)
387
+
388
+ cached_embeddings.append(
389
+ CachedPromptEmbeddings(
390
+ video_context_positive=pos_out.video_encoding.cpu(),
391
+ audio_context_positive=pos_out.audio_encoding.cpu(),
392
+ video_context_negative=neg_out.video_encoding.cpu(),
393
+ audio_context_negative=(
394
+ neg_out.audio_encoding.cpu() if neg_out.audio_encoding is not None else None
395
+ ),
396
+ )
397
+ )
398
+
399
+ # Unload Gemma model and feature extractor, keep only connectors for training
400
+ del text_encoder
401
+ self._embeddings_processor.feature_extractor = None
402
+
403
+ logger.debug("Validation prompt embeddings cached. Gemma model unloaded")
404
+ return cached_embeddings
405
+
406
+ def _load_models(self) -> None:
407
+ """Load the LTX-2 model components."""
408
+ # Load audio components if:
409
+ # 1. Training strategy requires audio (training the audio branch), OR
410
+ # 2. Validation is configured to generate audio (even if not training audio)
411
+ load_audio = self._training_strategy.requires_audio or self._config.validation.generate_audio
412
+
413
+ # Check if we need VAE encoder (for image or reference video conditioning)
414
+ need_vae_encoder = (
415
+ self._config.validation.images is not None or self._config.validation.reference_videos is not None
416
+ )
417
+
418
+ # Load all model components (except text encoder - already handled)
419
+ components = load_ltx_model(
420
+ checkpoint_path=self._config.model.model_path,
421
+ device="cpu",
422
+ dtype=torch.bfloat16,
423
+ with_video_vae_encoder=need_vae_encoder, # Needed for image conditioning
424
+ with_video_vae_decoder=True, # Needed for validation sampling
425
+ with_audio_vae_decoder=load_audio,
426
+ with_vocoder=load_audio,
427
+ with_text_encoder=False, # Text encoder handled separately
428
+ )
429
+
430
+ # Extract components
431
+ self._transformer = components.transformer
432
+ self._vae_decoder = components.video_vae_decoder.to(dtype=torch.bfloat16)
433
+ self._vae_encoder = components.video_vae_encoder
434
+ if self._vae_encoder is not None:
435
+ self._vae_encoder = self._vae_encoder.to(dtype=torch.bfloat16)
436
+ self._scheduler = components.scheduler
437
+ self._audio_vae = components.audio_vae_decoder
438
+ self._vocoder = components.vocoder
439
+ # Note: self._embeddings_processor was set in _load_text_encoder_and_cache_embeddings
440
+
441
+ # Determine initial dtype based on training mode.
442
+ # Note: For FSDP + LoRA, we'll cast to FP32 later in _prepare_models_for_training()
443
+ # after the accelerator is set up, and we can detect FSDP.
444
+ transformer_dtype = torch.bfloat16 if self._config.model.training_mode == "lora" else torch.float32
445
+ self._transformer = self._transformer.to(dtype=transformer_dtype)
446
+
447
+ if self._config.acceleration.quantization is not None:
448
+ if self._config.model.training_mode == "full":
449
+ raise ValueError("Quantization is not supported in full training mode.")
450
+
451
+ logger.info(f'Quantizing model with "{self._config.acceleration.quantization}". This may take a while...')
452
+ self._transformer = quantize_model(
453
+ self._transformer,
454
+ precision=self._config.acceleration.quantization,
455
+ )
456
+
457
+ # Freeze all models. We later unfreeze the transformer based on training mode.
458
+ # Note: embedding_connectors are already frozen (they come from the frozen text encoder)
459
+ self._vae_decoder.requires_grad_(False)
460
+ if self._vae_encoder is not None:
461
+ self._vae_encoder.requires_grad_(False)
462
+ self._transformer.requires_grad_(False)
463
+ if self._audio_vae is not None:
464
+ self._audio_vae.requires_grad_(False)
465
+ if self._vocoder is not None:
466
+ self._vocoder.requires_grad_(False)
467
+
468
+ def _collect_trainable_params(self) -> None:
469
+ """Collect trainable parameters based on training mode."""
470
+ if self._config.model.training_mode == "lora":
471
+ # For LoRA training, first set up LoRA layers
472
+ self._setup_lora()
473
+ elif self._config.model.training_mode == "full":
474
+ # For full training, unfreeze all transformer parameters
475
+ self._transformer.requires_grad_(True)
476
+ else:
477
+ raise ValueError(f"Unknown training mode: {self._config.model.training_mode}")
478
+
479
+ self._trainable_params = [p for p in self._transformer.parameters() if p.requires_grad]
480
+ logger.debug(f"Trainable params count: {sum(p.numel() for p in self._trainable_params):,}")
481
+
482
+ def _init_timestep_sampler(self) -> None:
483
+ """Initialize the timestep sampler based on the config."""
484
+ sampler_cls = SAMPLERS[self._config.flow_matching.timestep_sampling_mode]
485
+ self._timestep_sampler = sampler_cls(**self._config.flow_matching.timestep_sampling_params)
486
+
487
+ def _setup_lora(self) -> None:
488
+ """Configure LoRA adapters for the transformer. Only called in LoRA training mode."""
489
+ logger.debug(f"Adding LoRA adapter with rank {self._config.lora.rank}")
490
+ lora_config = LoraConfig(
491
+ r=self._config.lora.rank,
492
+ lora_alpha=self._config.lora.alpha,
493
+ target_modules=self._config.lora.target_modules,
494
+ lora_dropout=self._config.lora.dropout,
495
+ init_lora_weights=True,
496
+ )
497
+ # Wrap the transformer with PEFT to add LoRA layers
498
+ # noinspection PyTypeChecker
499
+ self._transformer = get_peft_model(self._transformer, lora_config)
500
+
501
+ def _load_checkpoint(self) -> None:
502
+ """Load checkpoint if specified in config."""
503
+ if not self._config.model.load_checkpoint:
504
+ return
505
+
506
+ checkpoint_path = self._find_checkpoint(self._config.model.load_checkpoint)
507
+ if not checkpoint_path:
508
+ logger.warning(f"⚠️ Could not find checkpoint at {self._config.model.load_checkpoint}")
509
+ return
510
+
511
+ logger.info(f"📥 Loading checkpoint from {checkpoint_path}")
512
+
513
+ if self._config.model.training_mode == "full":
514
+ self._load_full_checkpoint(checkpoint_path)
515
+ else: # LoRA mode
516
+ self._load_lora_checkpoint(checkpoint_path)
517
+
518
+ def _load_full_checkpoint(self, checkpoint_path: Path) -> None:
519
+ """Load full model checkpoint."""
520
+ state_dict = load_file(checkpoint_path)
521
+ self._transformer.load_state_dict(state_dict, strict=True)
522
+
523
+ logger.info("✅ Full model checkpoint loaded successfully")
524
+
525
+ def _load_lora_checkpoint(self, checkpoint_path: Path) -> None:
526
+ """Load LoRA checkpoint with DDP/FSDP compatibility."""
527
+ state_dict = load_file(checkpoint_path)
528
+
529
+ # Adjust layer names to match internal format.
530
+ # (Weights are saved in ComfyUI-compatible format, with "diffusion_model." prefix)
531
+ state_dict = {k.replace("diffusion_model.", "", 1): v for k, v in state_dict.items()}
532
+
533
+ # Load the LoRA adapter weights into the base model
534
+ base_model = self._transformer.get_base_model()
535
+ set_peft_model_state_dict(base_model, state_dict)
536
+
537
+ logger.info("✅ LoRA checkpoint loaded successfully")
538
+
539
+ def _prepare_models_for_training(self) -> None:
540
+ """Prepare models for training with Accelerate."""
541
+
542
+ # For FSDP + LoRA: Cast entire model to FP32.
543
+ # FSDP requires uniform dtype across all parameters in wrapped modules.
544
+ # In LoRA mode, PEFT creates LoRA params in FP32 while base model is BF16.
545
+ # We cast the base model to FP32 to match the LoRA params.
546
+ if self._accelerator.distributed_type == DistributedType.FSDP and self._config.model.training_mode == "lora":
547
+ logger.debug("FSDP: casting transformer to FP32 for uniform dtype")
548
+ self._transformer = self._transformer.to(dtype=torch.float32)
549
+
550
+ # Enable gradient checkpointing if requested
551
+ # For PeftModel, we need to access the underlying base model
552
+ transformer = (
553
+ self._transformer.get_base_model() if hasattr(self._transformer, "get_base_model") else self._transformer
554
+ )
555
+
556
+ transformer.set_gradient_checkpointing(self._config.optimization.enable_gradient_checkpointing)
557
+
558
+ # Keep frozen models on CPU for memory efficiency
559
+ self._vae_decoder = self._vae_decoder.to("cpu")
560
+ if self._vae_encoder is not None:
561
+ self._vae_encoder = self._vae_encoder.to("cpu")
562
+
563
+ # Embedding connectors are already on GPU from _load_text_encoder_and_cache_embeddings
564
+
565
+ # noinspection PyTypeChecker
566
+ self._transformer = self._accelerator.prepare(self._transformer)
567
+
568
+ # Log GPU memory usage after model preparation
569
+ vram_usage_gb = torch.cuda.memory_allocated() / 1024**3
570
+ logger.debug(f"GPU memory usage after models preparation: {vram_usage_gb:.2f} GB")
571
+
572
+ @staticmethod
573
+ def _find_checkpoint(checkpoint_path: str | Path) -> Path | None:
574
+ """Find the checkpoint file to load, handling both file and directory paths."""
575
+ checkpoint_path = Path(checkpoint_path)
576
+
577
+ if checkpoint_path.is_file():
578
+ if checkpoint_path.suffix != ".safetensors":
579
+ raise ValueError(f"Checkpoint file must have a .safetensors extension: {checkpoint_path}")
580
+ return checkpoint_path
581
+
582
+ if checkpoint_path.is_dir():
583
+ # Look for checkpoint files in the directory
584
+ checkpoints = list(checkpoint_path.rglob("*step_*.safetensors"))
585
+
586
+ if not checkpoints:
587
+ return None
588
+
589
+ # Sort by step number and return the latest
590
+ def _get_step_num(p: Path) -> int:
591
+ try:
592
+ return int(p.stem.split("step_")[1])
593
+ except (IndexError, ValueError):
594
+ return -1
595
+
596
+ latest = max(checkpoints, key=_get_step_num)
597
+ return latest
598
+
599
+ else:
600
+ raise ValueError(f"Invalid checkpoint path: {checkpoint_path}. Must be a file or directory.")
601
+
602
+ def _init_dataloader(self) -> None:
603
+ """Initialize the training data loader using the strategy's data sources."""
604
+ if self._dataset is None:
605
+ # Get data sources from the training strategy
606
+ data_sources = self._training_strategy.get_data_sources()
607
+
608
+ self._dataset = PrecomputedDataset(self._config.data.preprocessed_data_root, data_sources=data_sources)
609
+ logger.debug(f"Loaded dataset with {len(self._dataset):,} samples from sources: {list(data_sources)}")
610
+
611
+ num_workers = self._config.data.num_dataloader_workers
612
+ dataloader = DataLoader(
613
+ self._dataset,
614
+ batch_size=self._config.optimization.batch_size,
615
+ shuffle=True,
616
+ drop_last=True,
617
+ num_workers=num_workers,
618
+ pin_memory=num_workers > 0,
619
+ persistent_workers=num_workers > 0,
620
+ )
621
+
622
+ self._dataloader = self._accelerator.prepare(dataloader)
623
+
624
+ def _init_lora_weights(self) -> None:
625
+ """Initialize LoRA weights for the transformer."""
626
+ logger.debug("Initializing LoRA weights...")
627
+ for _, module in self._transformer.named_modules():
628
+ if isinstance(module, (BaseTunerLayer, ModulesToSaveWrapper)):
629
+ module.reset_lora_parameters(adapter_name="default", init_lora_weights=True)
630
+
631
+ def _init_optimizer(self) -> None:
632
+ """Initialize the optimizer and learning rate scheduler."""
633
+ opt_cfg = self._config.optimization
634
+
635
+ lr = opt_cfg.learning_rate
636
+ if opt_cfg.optimizer_type == "adamw":
637
+ optimizer = AdamW(self._trainable_params, lr=lr)
638
+ elif opt_cfg.optimizer_type == "adamw8bit":
639
+ # noinspection PyUnresolvedReferences
640
+ from bitsandbytes.optim import AdamW8bit # noqa: PLC0415
641
+
642
+ optimizer = AdamW8bit(self._trainable_params, lr=lr)
643
+ else:
644
+ raise ValueError(f"Unknown optimizer type: {opt_cfg.optimizer_type}")
645
+
646
+ # Add scheduler initialization
647
+ lr_scheduler = self._create_scheduler(optimizer)
648
+
649
+ # noinspection PyTypeChecker
650
+ self._optimizer, self._lr_scheduler = self._accelerator.prepare(optimizer, lr_scheduler)
651
+
652
+ def _create_scheduler(self, optimizer: torch.optim.Optimizer) -> LRScheduler | None:
653
+ """Create learning rate scheduler based on config."""
654
+ scheduler_type = self._config.optimization.scheduler_type
655
+ steps = self._config.optimization.steps
656
+ params = self._config.optimization.scheduler_params or {}
657
+
658
+ if scheduler_type is None:
659
+ return None
660
+
661
+ if scheduler_type == "linear":
662
+ scheduler = LinearLR(
663
+ optimizer,
664
+ start_factor=params.pop("start_factor", 1.0),
665
+ end_factor=params.pop("end_factor", 0.1),
666
+ total_iters=steps,
667
+ **params,
668
+ )
669
+ elif scheduler_type == "cosine":
670
+ scheduler = CosineAnnealingLR(
671
+ optimizer,
672
+ T_max=steps,
673
+ eta_min=params.pop("eta_min", 0),
674
+ **params,
675
+ )
676
+ elif scheduler_type == "cosine_with_restarts":
677
+ scheduler = CosineAnnealingWarmRestarts(
678
+ optimizer,
679
+ T_0=params.pop("T_0", steps // 4), # First restart cycle length
680
+ T_mult=params.pop("T_mult", 1), # Multiplicative factor for cycle lengths
681
+ eta_min=params.pop("eta_min", 5e-5),
682
+ **params,
683
+ )
684
+ elif scheduler_type == "polynomial":
685
+ scheduler = PolynomialLR(
686
+ optimizer,
687
+ total_iters=steps,
688
+ power=params.pop("power", 1.0),
689
+ **params,
690
+ )
691
+ elif scheduler_type == "step":
692
+ scheduler = StepLR(
693
+ optimizer,
694
+ step_size=params.pop("step_size", steps // 2),
695
+ gamma=params.pop("gamma", 0.1),
696
+ **params,
697
+ )
698
+ elif scheduler_type == "constant":
699
+ scheduler = None
700
+ else:
701
+ raise ValueError(f"Unknown scheduler type: {scheduler_type}")
702
+
703
+ return scheduler
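For reference, the scheduler branches above are driven by two config fields; a hypothetical optimization section (shown as a Python dict, the exact schema lives in LtxTrainerConfig) might look like:

optimization = {
    "steps": 2000,
    "scheduler_type": "cosine_with_restarts",
    # Remaining keys are popped as defaults, the rest is forwarded as **params.
    "scheduler_params": {"T_0": 500, "T_mult": 1, "eta_min": 5e-5},
}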
704
+
705
+ def _setup_accelerator(self) -> None:
706
+ """Initialize the Accelerator with the appropriate settings."""
707
+
708
+ # All distributed setup (DDP/FSDP, number of processes, etc.) is controlled by
709
+ # the user's Accelerate configuration (accelerate config / accelerate launch).
710
+ self._accelerator = Accelerator(
711
+ mixed_precision=self._config.acceleration.mixed_precision_mode,
712
+ gradient_accumulation_steps=self._config.optimization.gradient_accumulation_steps,
713
+ )
714
+
715
+ if self._accelerator.num_processes > 1:
716
+ logger.info(
717
+ f"{self._accelerator.distributed_type.value} distributed training enabled "
718
+ f"with {self._accelerator.num_processes} processes"
719
+ )
720
+
721
+ local_batch = self._config.optimization.batch_size
722
+ global_batch = self._config.optimization.batch_size * self._accelerator.num_processes
723
+ logger.info(f"Local batch size: {local_batch}, global batch size: {global_batch}")
724
+
725
+ # Log torch.compile status from Accelerate's dynamo plugin
726
+ is_compile_enabled = (
727
+ hasattr(self._accelerator.state, "dynamo_plugin") and self._accelerator.state.dynamo_plugin.backend != "NO"
728
+ )
729
+ if is_compile_enabled:
730
+ plugin = self._accelerator.state.dynamo_plugin
731
+ logger.info(f"🔥 torch.compile enabled via Accelerate: backend={plugin.backend}, mode={plugin.mode}")
732
+
733
+ if self._accelerator.distributed_type == DistributedType.FSDP:
734
+ logger.warning(
735
+ "⚠️ FSDP + torch.compile is experimental and may hang on the first training iteration. "
736
+ "If this occurs, disable torch.compile by removing dynamo_config from your Accelerate config."
737
+ )
738
+
739
+ if self._accelerator.distributed_type == DistributedType.FSDP and self._config.acceleration.quantization:
740
+ logger.warning(
741
+ f"FSDP with quantization ({self._config.acceleration.quantization}) may have compatibility issues."
742
+ "Monitor training stability and consider disabling quantization if issues arise."
743
+ )
744
+
745
+ # Note: Use @torch.no_grad() instead of @torch.inference_mode() to avoid FSDP inplace update errors after validation
746
+ @torch.no_grad()
747
+ @free_gpu_memory_context(after=True)
748
+ def _sample_videos(self, progress: TrainingProgress) -> list[Path] | None:
749
+ """Run validation by generating videos from validation prompts."""
750
+ use_images = self._config.validation.images is not None
751
+ use_reference_videos = self._config.validation.reference_videos is not None
752
+ generate_audio = self._config.validation.generate_audio
753
+ inference_steps = self._config.validation.inference_steps
754
+
755
+ # Zero gradients and free GPU memory to reclaim memory before validation sampling
756
+ self._optimizer.zero_grad(set_to_none=True)
757
+ free_gpu_memory()
758
+
759
+ # Start sampling progress tracking
760
+ sampling_ctx = progress.start_sampling(
761
+ num_prompts=len(self._config.validation.prompts),
762
+ num_steps=inference_steps,
763
+ )
764
+
765
+ # Create validation sampler with loaded models and progress tracking
766
+ sampler = ValidationSampler(
767
+ transformer=self._transformer,
768
+ vae_decoder=self._vae_decoder,
769
+ vae_encoder=self._vae_encoder,
770
+ text_encoder=None,
771
+ audio_decoder=self._audio_vae if generate_audio else None,
772
+ vocoder=self._vocoder if generate_audio else None,
773
+ sampling_context=sampling_ctx,
774
+ )
775
+
776
+ output_dir = Path(self._config.output_dir) / "samples"
777
+ output_dir.mkdir(exist_ok=True, parents=True)
778
+
779
+ video_paths = []
780
+ width, height, num_frames = self._config.validation.video_dims
781
+
782
+ for prompt_idx, prompt in enumerate(self._config.validation.prompts):
783
+ # Update progress to show current video
784
+ sampling_ctx.start_video(prompt_idx)
785
+
786
+ # Load conditioning image if provided
787
+ condition_image = None
788
+ if use_images:
789
+ image_path = self._config.validation.images[prompt_idx]
790
+ image = open_image_as_srgb(image_path)
791
+ # Convert PIL image to tensor [C, H, W] in [0, 1]
792
+ condition_image = F.to_tensor(image)
793
+
794
+ # Load reference video if provided (for IC-LoRA)
795
+ reference_video = None
796
+ if use_reference_videos:
797
+ ref_video_path = self._config.validation.reference_videos[prompt_idx]
798
+ # read_video returns [F, C, H, W] in [0, 1]
799
+ reference_video, _ = read_video(ref_video_path, max_frames=num_frames)
800
+
801
+ # Get cached embeddings for this prompt if available
802
+ cached_embeddings = (
803
+ self._cached_validation_embeddings[prompt_idx]
804
+ if self._cached_validation_embeddings is not None
805
+ else None
806
+ )
807
+
808
+ # Create generation config
809
+ gen_config = GenerationConfig(
810
+ prompt=prompt,
811
+ negative_prompt=self._config.validation.negative_prompt,
812
+ height=height,
813
+ width=width,
814
+ num_frames=num_frames,
815
+ frame_rate=self._config.validation.frame_rate,
816
+ num_inference_steps=inference_steps,
817
+ guidance_scale=self._config.validation.guidance_scale,
818
+ seed=self._config.validation.seed,
819
+ condition_image=condition_image,
820
+ reference_video=reference_video,
821
+ reference_downscale_factor=self._config.validation.reference_downscale_factor,
822
+ generate_audio=generate_audio,
823
+ include_reference_in_output=self._config.validation.include_reference_in_output,
824
+ cached_embeddings=cached_embeddings,
825
+ stg_scale=self._config.validation.stg_scale,
826
+ stg_blocks=self._config.validation.stg_blocks,
827
+ stg_mode=self._config.validation.stg_mode,
828
+ )
829
+
830
+ # Generate sample
831
+ video, audio = sampler.generate(
832
+ config=gen_config,
833
+ device=self._accelerator.device,
834
+ )
835
+
836
+ # Save output (image for single frame, video otherwise)
837
+ if IS_MAIN_PROCESS:
838
+ ext = "png" if num_frames == 1 else "mp4"
839
+ output_path = output_dir / f"step_{self._global_step:06d}_{prompt_idx + 1}.{ext}"
840
+ if num_frames == 1:
841
+ save_image(video, output_path)
842
+ else:
843
+ save_video(
844
+ video_tensor=video,
845
+ output_path=output_path,
846
+ fps=self._config.validation.frame_rate,
847
+ audio=audio,
848
+ audio_sample_rate=self._vocoder.output_sampling_rate if audio is not None else None,
849
+ )
850
+ video_paths.append(output_path)
851
+
852
+ # Clean up progress tasks
853
+ sampling_ctx.cleanup()
854
+
855
+ rel_outputs_path = output_dir.relative_to(self._config.output_dir)
856
+ logger.info(f"🎥 Validation samples for step {self._global_step} saved in {rel_outputs_path}")
857
+ return video_paths
858
+
859
+ @staticmethod
860
+ def _log_training_stats(stats: TrainingStats) -> None:
861
+ """Log training statistics."""
862
+ stats_str = (
863
+ "📊 Training Statistics:\n"
864
+ f" - Total time: {stats.total_time_seconds / 60:.1f} minutes\n"
865
+ f" - Training speed: {stats.steps_per_second:.2f} steps/second\n"
866
+ f" - Samples/second: {stats.samples_per_second:.2f}\n"
867
+ f" - Peak GPU memory: {stats.peak_gpu_memory_gb:.2f} GB"
868
+ )
869
+ if stats.num_processes > 1:
870
+ stats_str += f"\n - Number of processes: {stats.num_processes}\n"
871
+ stats_str += f" - Global batch size: {stats.global_batch_size}"
872
+ logger.info(stats_str)
873
+
874
+ def _save_checkpoint(self) -> Path | None:
875
+ """Save the model weights."""
876
+ is_lora = self._config.model.training_mode == "lora"
877
+ is_fsdp = self._accelerator.distributed_type == DistributedType.FSDP
878
+
879
+ # Prepare paths
880
+ save_dir = Path(self._config.output_dir) / "checkpoints"
881
+ prefix = "lora" if is_lora else "model"
882
+ filename = f"{prefix}_weights_step_{self._global_step:05d}.safetensors"
883
+ saved_weights_path = save_dir / filename
884
+
885
+ # Get state dict (collective operation - all processes must participate)
886
+ self._accelerator.wait_for_everyone()
887
+ full_state_dict = self._accelerator.get_state_dict(self._transformer)
888
+
889
+ if not IS_MAIN_PROCESS:
890
+ return None
891
+
892
+ save_dir.mkdir(exist_ok=True, parents=True)
893
+
894
+ # Determine save precision
895
+ save_dtype = torch.bfloat16 if self._config.checkpoints.precision == "bfloat16" else torch.float32
896
+
897
+ # For LoRA: extract only adapter weights; for full: use as-is
898
+ if is_lora:
899
+ unwrapped = self._accelerator.unwrap_model(self._transformer, keep_torch_compile=False)
900
+ # For FSDP, pass full_state_dict since model params aren't directly accessible
901
+ state_dict = get_peft_model_state_dict(unwrapped, state_dict=full_state_dict if is_fsdp else None)
902
+
903
+ # Remove "base_model.model." prefix added by PEFT
904
+ state_dict = {k.replace("base_model.model.", "", 1): v for k, v in state_dict.items()}
905
+
906
+ # Convert to ComfyUI-compatible format (add "diffusion_model." prefix)
907
+ state_dict = {f"diffusion_model.{k}": v for k, v in state_dict.items()}
908
+
909
+ # Cast to configured precision
910
+ state_dict = {k: v.to(save_dtype) if isinstance(v, Tensor) else v for k, v in state_dict.items()}
911
+
912
+ # Build metadata for safetensors file
913
+ metadata = self._build_checkpoint_metadata()
914
+
915
+ # Save to disk with metadata
916
+ save_file(state_dict, saved_weights_path, metadata=metadata)
917
+ else:
918
+ # Cast to configured precision
919
+ full_state_dict = {k: v.to(save_dtype) if isinstance(v, Tensor) else v for k, v in full_state_dict.items()}
920
+
921
+ # Save to disk
922
+ self._accelerator.save(full_state_dict, saved_weights_path)
923
+
924
+ rel_path = saved_weights_path.relative_to(self._config.output_dir)
925
+ logger.info(f"💾 {prefix.capitalize()} weights for step {self._global_step} saved in {rel_path}")
926
+
927
+ # Keep track of checkpoint paths, and cleanup old checkpoints if needed
928
+ self._checkpoint_paths.append(saved_weights_path)
929
+ self._cleanup_checkpoints()
930
+ return saved_weights_path
931
+
932
+ def _cleanup_checkpoints(self) -> None:
933
+ """Clean up old checkpoints."""
934
+ if 0 < self._config.checkpoints.keep_last_n < len(self._checkpoint_paths):
935
+ checkpoints_to_remove = self._checkpoint_paths[: -self._config.checkpoints.keep_last_n]
936
+ for old_checkpoint in checkpoints_to_remove:
937
+ if old_checkpoint.exists():
938
+ old_checkpoint.unlink()
939
+ logger.info(f"Removed old checkpoint: {old_checkpoint}")
940
+ # Update the list to only contain kept checkpoints
941
+ self._checkpoint_paths = self._checkpoint_paths[-self._config.checkpoints.keep_last_n :]
942
+
943
+ def _build_checkpoint_metadata(self) -> dict[str, str]:
944
+ """Build metadata dictionary for safetensors checkpoint.
945
+ Delegates to the training strategy to get strategy-specific metadata
946
+ that downstream inference pipelines may need.
947
+ Returns:
948
+ Dictionary of string key-value pairs for safetensors metadata.
949
+ Values are converted to strings for safetensors compatibility.
950
+ """
951
+ raw_metadata = self._training_strategy.get_checkpoint_metadata()
952
+ # Convert all values to strings for safetensors compatibility
953
+ metadata = {k: str(v) for k, v in raw_metadata.items()}
954
+ if metadata:
955
+ logger.info(f"Saving checkpoint metadata: {metadata}")
956
+ return metadata
957
+
958
+ def _save_config(self) -> None:
959
+ """Save the training configuration as a YAML file in the output directory."""
960
+ if not IS_MAIN_PROCESS:
961
+ return
962
+
963
+ config_path = Path(self._config.output_dir) / "training_config.yaml"
964
+ with open(config_path, "w") as f:
965
+ yaml.dump(self._config.model_dump(), f, default_flow_style=False, indent=2)
966
+
967
+ logger.info(f"💾 Training configuration saved to: {config_path.relative_to(self._config.output_dir)}")
968
+
969
+ def _init_wandb(self) -> None:
970
+ """Initialize Weights & Biases run."""
971
+ if not self._config.wandb.enabled or not IS_MAIN_PROCESS:
972
+ self._wandb_run = None
973
+ return
974
+
975
+ wandb_config = self._config.wandb
976
+ run = wandb.init(
977
+ project=wandb_config.project,
978
+ entity=wandb_config.entity,
979
+ name=Path(self._config.output_dir).name,
980
+ tags=wandb_config.tags,
981
+ config=self._config.model_dump(),
982
+ )
983
+ self._wandb_run = run
984
+
985
+ def _log_metrics(self, metrics: dict[str, float]) -> None:
986
+ """Log metrics to Weights & Biases."""
987
+ if self._wandb_run is not None:
988
+ self._wandb_run.log(metrics)
989
+
990
+ def _log_validation_samples(self, sample_paths: list[Path], prompts: list[str]) -> None:
991
+ """Log validation samples (videos or images) to Weights & Biases."""
992
+ if not self._config.wandb.log_validation_videos or self._wandb_run is None:
993
+ return
994
+
995
+ # Determine if outputs are images or videos based on file extension
996
+ is_image = sample_paths and sample_paths[0].suffix.lower() in (".png", ".jpg", ".jpeg", ".heic", ".webp")
997
+ media_cls = wandb.Image if is_image else wandb.Video
998
+
999
+ samples = [media_cls(str(path), caption=prompt) for path, prompt in zip(sample_paths, prompts, strict=True)]
1000
+ self._wandb_run.log({"validation_samples": samples}, step=self._global_step)
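A minimal usage sketch of the trainer class, assuming the training config is loaded from a YAML file matching the LtxTrainerConfig schema (the path below is a placeholder; model_validate on the parsed dict is one plausible way to construct the Pydantic config):

import yaml

from ltx_trainer.config import LtxTrainerConfig
from ltx_trainer.trainer import LtxvTrainer

with open("path/to/training_config.yaml") as f:  # placeholder path
    config = LtxTrainerConfig.model_validate(yaml.safe_load(f))

trainer = LtxvTrainer(config)
checkpoint_path, stats = trainer.train()
print(f"Saved weights to {checkpoint_path} ({stats.steps_per_second:.2f} steps/s)")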
packages/ltx-trainer/src/ltx_trainer/training_strategies/__init__.py ADDED
@@ -0,0 +1,58 @@
1
+ """Training strategies for different conditioning modes.
2
+ This package implements the Strategy Pattern to handle different training modes:
3
+ - Text-to-video training (standard generation, optionally with audio)
4
+ - Video-to-video training (IC-LoRA mode with reference videos)
5
+ Each strategy encapsulates the specific logic for preparing model inputs and computing loss.
6
+ """
7
+
8
+ from ltx_trainer import logger
9
+ from ltx_trainer.training_strategies.base_strategy import (
10
+ DEFAULT_FPS,
11
+ VIDEO_SCALE_FACTORS,
12
+ ModelInputs,
13
+ TrainingStrategy,
14
+ TrainingStrategyConfigBase,
15
+ )
16
+ from ltx_trainer.training_strategies.text_to_video import TextToVideoConfig, TextToVideoStrategy
17
+ from ltx_trainer.training_strategies.video_to_video import VideoToVideoConfig, VideoToVideoStrategy
18
+
19
+ # Type alias for all strategy config types
20
+ TrainingStrategyConfig = TextToVideoConfig | VideoToVideoConfig
21
+
22
+ __all__ = [
23
+ "DEFAULT_FPS",
24
+ "VIDEO_SCALE_FACTORS",
25
+ "ModelInputs",
26
+ "TextToVideoConfig",
27
+ "TextToVideoStrategy",
28
+ "TrainingStrategy",
29
+ "TrainingStrategyConfig",
30
+ "TrainingStrategyConfigBase",
31
+ "VideoToVideoConfig",
32
+ "VideoToVideoStrategy",
33
+ "get_training_strategy",
34
+ ]
35
+
36
+
37
+ def get_training_strategy(config: TrainingStrategyConfig) -> TrainingStrategy:
38
+ """Factory function to create the appropriate training strategy.
39
+ The strategy is determined by the `name` field in the configuration.
40
+ Args:
41
+ config: Strategy-specific configuration with a `name` field
42
+ Returns:
43
+ The appropriate training strategy instance
44
+ Raises:
45
+ ValueError: If strategy name is not supported
46
+ """
47
+
48
+ match config:
49
+ case TextToVideoConfig():
50
+ strategy = TextToVideoStrategy(config)
51
+ case VideoToVideoConfig():
52
+ strategy = VideoToVideoStrategy(config)
53
+ case _:
54
+ raise ValueError(f"Unknown training strategy config type: {type(config).__name__}")
55
+
56
+ audio_mode = "(audio enabled 🔈)" if getattr(config, "with_audio", False) else "(audio disabled 🔇)"
57
+ logger.debug(f"🎯 Using {strategy.__class__.__name__} training strategy {audio_mode}")
58
+ return strategy
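A small usage sketch of the factory (the concrete config values are illustrative):

from ltx_trainer.training_strategies import TextToVideoConfig, get_training_strategy

config = TextToVideoConfig(first_frame_conditioning_p=0.1, with_audio=True)
strategy = get_training_strategy(config)

strategy.requires_audio      # True -> the trainer will load the audio VAE and vocoder
strategy.get_data_sources()  # {"latents": "latents", "conditions": "conditions", "audio_latents": "audio_latents"}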
packages/ltx-trainer/src/ltx_trainer/training_strategies/base_strategy.py ADDED
@@ -0,0 +1,262 @@
1
+ """Base class for training strategies.
2
+ This module defines the abstract base class that all training strategies must implement,
3
+ along with the base configuration class.
4
+ """
5
+
6
+ import random
7
+ from abc import ABC, abstractmethod
8
+ from dataclasses import dataclass
9
+ from typing import Any, Literal
10
+
11
+ import torch
12
+ from pydantic import BaseModel, ConfigDict, Field
13
+ from torch import Tensor
14
+
15
+ from ltx_core.components.patchifiers import (
16
+ AudioPatchifier,
17
+ VideoLatentPatchifier,
18
+ get_pixel_coords,
19
+ )
20
+ from ltx_core.model.transformer.modality import Modality
21
+ from ltx_core.types import AudioLatentShape, SpatioTemporalScaleFactors, VideoLatentShape
22
+ from ltx_trainer.timestep_samplers import TimestepSampler
23
+
24
+ # Default frames per second for videos that are missing FPS metadata
25
+ DEFAULT_FPS = 24
26
+
27
+ # VAE scale factors for LTX-2
28
+ VIDEO_SCALE_FACTORS = SpatioTemporalScaleFactors.default()
29
+
30
+
31
+ class TrainingStrategyConfigBase(BaseModel):
32
+ """Base configuration class for training strategies.
33
+ All strategy-specific configuration classes should inherit from this.
34
+ """
35
+
36
+ model_config = ConfigDict(extra="forbid")
37
+
38
+ name: Literal["text_to_video", "video_to_video"] = Field(
39
+ description="Unique name identifying the training strategy type"
40
+ )
41
+
42
+
43
+ @dataclass
44
+ class ModelInputs:
45
+ """Container for model inputs using the Modality-based interface."""
46
+
47
+ video: Modality
48
+ audio: Modality | None
49
+
50
+ # Training targets (for loss computation)
51
+ video_targets: Tensor
52
+ audio_targets: Tensor | None
53
+
54
+ # Masks for loss computation
55
+ video_loss_mask: Tensor # Boolean mask: True = compute loss for this token
56
+ audio_loss_mask: Tensor | None
57
+
58
+ # Metadata needed for loss computation in some strategies
59
+ ref_seq_len: int | None = None # For IC-LoRA: length of reference sequence
60
+
61
+
62
+ class TrainingStrategy(ABC):
63
+ """Abstract base class for training strategies.
64
+ Each strategy encapsulates the logic for a specific training mode,
65
+ handling input preparation and loss computation.
66
+ """
67
+
68
+ def __init__(self, config: TrainingStrategyConfigBase):
69
+ """Initialize strategy with configuration.
70
+ Args:
71
+ config: Strategy-specific configuration
72
+ """
73
+ self.config = config
74
+ self._video_patchifier = VideoLatentPatchifier(patch_size=1)
75
+ self._audio_patchifier = AudioPatchifier(patch_size=1)
76
+
77
+ @property
78
+ def requires_audio(self) -> bool:
79
+ """Whether this training strategy requires audio components.
80
+ Override this property in subclasses that support audio training.
81
+ The trainer uses this to determine whether to load audio VAE and vocoder.
82
+ Returns:
83
+ True if audio components should be loaded, False otherwise.
84
+ """
85
+ return False
86
+
87
+ @abstractmethod
88
+ def get_data_sources(self) -> list[str] | dict[str, str]:
89
+ """Get the required data sources for this training strategy.
90
+ Returns:
91
+ Either a list of data directory names (where output keys match directory names)
92
+ or a dictionary mapping data directory names to custom output keys for the dataset
93
+ """
94
+
95
+ @abstractmethod
96
+ def prepare_training_inputs(
97
+ self,
98
+ batch: dict[str, Any],
99
+ timestep_sampler: TimestepSampler,
100
+ ) -> ModelInputs:
101
+ """Prepare training inputs from a raw data batch.
102
+ Args:
103
+ batch: Raw batch data from the dataset. Contains:
104
+ - "latents": Video latent data
105
+ - "conditions": Text embeddings with keys:
106
+ - "video_prompt_embeds": Already processed by embedding connectors
107
+ - "audio_prompt_embeds": Already processed by embedding connectors
108
+ - "prompt_attention_mask": Attention mask
109
+ - Additional keys depending on strategy (e.g., "ref_latents" for IC-LoRA)
110
+ timestep_sampler: Sampler for generating timesteps and noise
111
+ Returns:
112
+ ModelInputs containing Modality objects and training targets
113
+ """
114
+
115
+ @abstractmethod
116
+ def compute_loss(
117
+ self,
118
+ video_pred: Tensor,
119
+ audio_pred: Tensor | None,
120
+ inputs: ModelInputs,
121
+ ) -> Tensor:
122
+ """Compute the training loss.
123
+ Args:
124
+ video_pred: Video prediction from the transformer model
125
+ audio_pred: Audio prediction from the transformer model (None for video-only)
126
+ inputs: The prepared model inputs containing targets and masks
127
+ Returns:
128
+ Scalar loss tensor
129
+ """
130
+
131
+ def get_checkpoint_metadata(self) -> dict[str, Any]:
132
+ """Get strategy-specific metadata to include in checkpoint files.
133
+ Override this method in subclasses to add custom metadata,
134
+ e.g. any parameters that a downstream inference pipeline may need.
135
+ Returns:
136
+ Dictionary of metadata key-value pairs (values must be JSON-serializable)
137
+ """
138
+ return {}
139
+
140
+ def _get_video_positions(
141
+ self,
142
+ num_frames: int,
143
+ height: int,
144
+ width: int,
145
+ batch_size: int,
146
+ fps: float,
147
+ device: torch.device,
148
+ dtype: torch.dtype,
149
+ ) -> Tensor:
150
+ """Generate video position embeddings using ltx_core's native implementation.
151
+ Args:
152
+ num_frames: Number of latent frames
153
+ height: Latent height
154
+ width: Latent width
155
+ batch_size: Batch size
156
+ fps: Frames per second
157
+ device: Target device
158
+ dtype: Target dtype
159
+ Returns:
160
+ Position tensor of shape [B, 3, seq_len, 2]
161
+ """
162
+ latent_coords = self._video_patchifier.get_patch_grid_bounds(
163
+ output_shape=VideoLatentShape(
164
+ frames=num_frames,
165
+ height=height,
166
+ width=width,
167
+ batch=batch_size,
168
+ channels=128, # Video latent channels
169
+ ),
170
+ device=device,
171
+ )
172
+
173
+ # Convert latent coords to pixel coords with causal fix
174
+ pixel_coords = get_pixel_coords(
175
+ latent_coords=latent_coords,
176
+ scale_factors=VIDEO_SCALE_FACTORS,
177
+ causal_fix=True,
178
+ ).to(dtype)
179
+
180
+ # Scale temporal dimension by 1/fps to get time in seconds
181
+ pixel_coords[:, 0, ...] = pixel_coords[:, 0, ...] / fps
182
+
183
+ return pixel_coords
184
+
185
+ def _get_audio_positions(
186
+ self,
187
+ num_time_steps: int,
188
+ batch_size: int,
189
+ device: torch.device,
190
+ dtype: torch.dtype,
191
+ ) -> Tensor:
192
+ """Generate audio position embeddings using ltx_core's native implementation.
193
+ Args:
194
+ num_time_steps: Number of audio time steps (T, not T*mel_bins)
195
+ batch_size: Batch size
196
+ device: Target device
197
+ dtype: Target dtype
198
+ Returns:
199
+ Position tensor of shape [B, 1, num_time_steps, 2]
200
+ Note:
201
+ Audio latents should be in patchified format [B, T, C*F] = [B, T, 128]
202
+ where T is the number of time steps, C=8 channels, F=16 mel bins.
203
+ This matches the format produced by AudioPatchifier.patchify().
204
+ """
205
+ mel_bins = 16
206
+
207
+ latent_coords = self._audio_patchifier.get_patch_grid_bounds(
208
+ output_shape=AudioLatentShape(
209
+ frames=num_time_steps,
210
+ mel_bins=mel_bins,
211
+ batch=batch_size,
212
+ channels=8, # Audio latent channels
213
+ ),
214
+ device=device,
215
+ )
216
+
217
+ return latent_coords.to(dtype)
218
+
219
+ @staticmethod
220
+ def _create_per_token_timesteps(conditioning_mask: Tensor, sampled_sigma: Tensor) -> Tensor:
221
+ """Create per-token timesteps based on conditioning mask.
222
+ Args:
223
+ conditioning_mask: Boolean mask of shape (batch_size, sequence_length),
224
+ where True = conditioning token (timestep=0), False = target token (use sigma)
225
+ sampled_sigma: Sampled sigma values of shape (batch_size,) or (batch_size, 1, 1)
226
+ Returns:
227
+ Timesteps tensor of shape [batch_size, sequence_length]
228
+ """
229
+ # Expand to match conditioning mask shape [B, seq_len]
230
+ expanded_sigma = sampled_sigma.view(-1, 1).expand_as(conditioning_mask)
231
+
232
+ # Conditioning tokens get 0, target tokens get the sampled sigma
233
+ return torch.where(conditioning_mask, torch.zeros_like(expanded_sigma), expanded_sigma)
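A tiny worked example of the helper above (values are illustrative): conditioning tokens receive timestep 0, while target tokens keep the sampled sigma.

import torch

mask = torch.tensor([[True, True, False, False]])  # first two tokens are conditioning
sigma = torch.tensor([0.7])
# TrainingStrategy._create_per_token_timesteps(mask, sigma)
# -> tensor([[0.0000, 0.0000, 0.7000, 0.7000]])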
234
+
235
+ @staticmethod
236
+ def _create_first_frame_conditioning_mask(
237
+ batch_size: int,
238
+ sequence_length: int,
239
+ height: int,
240
+ width: int,
241
+ device: torch.device,
242
+ first_frame_conditioning_p: float = 0.0,
243
+ ) -> Tensor:
244
+ """Create conditioning mask for first frame conditioning.
245
+ Args:
246
+ batch_size: Batch size
247
+ sequence_length: Total sequence length
248
+ height: Latent height
249
+ width: Latent width
250
+ device: Target device
251
+ first_frame_conditioning_p: Probability of conditioning on the first frame
252
+ Returns:
253
+ Boolean mask where True indicates first frame tokens (if conditioning is enabled)
254
+ """
255
+ conditioning_mask = torch.zeros(batch_size, sequence_length, dtype=torch.bool, device=device)
256
+
257
+ if first_frame_conditioning_p > 0 and random.random() < first_frame_conditioning_p:
258
+ first_frame_end_idx = height * width
259
+ if first_frame_end_idx < sequence_length:
260
+ conditioning_mask[:, :first_frame_end_idx] = True
261
+
262
+ return conditioning_mask
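To illustrate the extension point, a skeletal custom strategy follows (class name and the loss shown are hypothetical; a real strategy also needs a config class dispatched by get_training_strategy and, for a genuinely new name, a widened Literal on the base config):

from typing import Any

from torch import Tensor

from ltx_trainer.timestep_samplers import TimestepSampler
from ltx_trainer.training_strategies.base_strategy import ModelInputs, TrainingStrategy


class MyStrategy(TrainingStrategy):
    def get_data_sources(self) -> list[str] | dict[str, str]:
        return ["latents", "conditions"]

    def prepare_training_inputs(self, batch: dict[str, Any], timestep_sampler: TimestepSampler) -> ModelInputs:
        raise NotImplementedError  # build Modality inputs, noised latents, targets and loss masks here

    def compute_loss(self, video_pred: Tensor, audio_pred: Tensor | None, inputs: ModelInputs) -> Tensor:
        # One plausible masked MSE against the prepared targets.
        mse = (video_pred - inputs.video_targets) ** 2
        return mse[inputs.video_loss_mask].mean()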
packages/ltx-trainer/src/ltx_trainer/training_strategies/text_to_video.py ADDED
@@ -0,0 +1,291 @@
1
+ """Text-to-video training strategy.
2
+ This strategy implements standard text-to-video generation training where:
3
+ - Only target latents are used (no reference videos)
4
+ - Standard noise application and loss computation
5
+ - Supports first frame conditioning
6
+ - Optionally supports joint audio-video training
7
+ """
8
+
9
+ from typing import Any, Literal
10
+
11
+ import torch
12
+ from pydantic import Field
13
+ from torch import Tensor
14
+
15
+ from ltx_core.model.transformer.modality import Modality
16
+ from ltx_trainer import logger
17
+ from ltx_trainer.timestep_samplers import TimestepSampler
18
+ from ltx_trainer.training_strategies.base_strategy import (
19
+ DEFAULT_FPS,
20
+ ModelInputs,
21
+ TrainingStrategy,
22
+ TrainingStrategyConfigBase,
23
+ )
24
+
25
+
26
+ class TextToVideoConfig(TrainingStrategyConfigBase):
27
+ """Configuration for text-to-video training strategy."""
28
+
29
+ name: Literal["text_to_video"] = "text_to_video"
30
+
31
+ first_frame_conditioning_p: float = Field(
32
+ default=0.1,
33
+ description="Probability of conditioning on the first frame during training",
34
+ ge=0.0,
35
+ le=1.0,
36
+ )
37
+
38
+ with_audio: bool = Field(
39
+ default=False,
40
+ description="Whether to include audio in training (joint audio-video generation)",
41
+ )
42
+
43
+ audio_latents_dir: str = Field(
44
+ default="audio_latents",
45
+ description="Directory name for audio latents when with_audio is True",
46
+ )
47
+
48
+
49
+ class TextToVideoStrategy(TrainingStrategy):
50
+ """Text-to-video training strategy.
51
+ This strategy implements standard text-to-video generation training where:
52
+ - Only target latents are used (no reference videos)
53
+ - Standard noise application and loss computation
54
+ - Supports first frame conditioning
55
+ - Optionally supports joint audio-video training when with_audio=True
56
+ """
57
+
58
+ config: TextToVideoConfig
59
+
60
+ def __init__(self, config: TextToVideoConfig):
61
+ """Initialize strategy with configuration.
62
+ Args:
63
+ config: Text-to-video configuration
64
+ """
65
+ super().__init__(config)
66
+
67
+ @property
68
+ def requires_audio(self) -> bool:
69
+ """Whether this training strategy requires audio components."""
70
+ return self.config.with_audio
71
+
72
+ def get_data_sources(self) -> list[str] | dict[str, str]:
73
+ """
74
+ Text-to-video training requires latents and text conditions.
75
+ When with_audio is True, also requires audio latents.
76
+ """
77
+ sources = {
78
+ "latents": "latents",
79
+ "conditions": "conditions",
80
+ }
81
+
82
+ if self.config.with_audio:
83
+ sources[self.config.audio_latents_dir] = "audio_latents"
84
+
85
+ return sources
86
+
87
+ def prepare_training_inputs(
88
+ self,
89
+ batch: dict[str, Any],
90
+ timestep_sampler: TimestepSampler,
91
+ ) -> ModelInputs:
92
+ """Prepare inputs for text-to-video training."""
93
+ # Get pre-encoded latents - dataset provides uniform non-patchified format [B, C, F, H, W]
94
+ latents = batch["latents"]
95
+ video_latents = latents["latents"]
96
+
97
+ # Get video dimensions (assume same for all batch elements)
98
+ num_frames = latents["num_frames"][0].item()
99
+ height = latents["height"][0].item()
100
+ width = latents["width"][0].item()
101
+
102
+ # Patchify latents: [B, C, F, H, W] -> [B, seq_len, C]
103
+ video_latents = self._video_patchifier.patchify(video_latents)
104
+
105
+ # Handle FPS with backward compatibility
106
+ fps = latents.get("fps", None)
107
+ if fps is not None and not torch.all(fps == fps[0]):
108
+ logger.warning(
109
+ f"Different FPS values found in the batch. Found: {fps.tolist()}, using the first one: {fps[0].item()}"
110
+ )
111
+ fps = fps[0].item() if fps is not None else DEFAULT_FPS
112
+
113
+ # Get text embeddings (already processed by embedding connectors in trainer)
114
+ conditions = batch["conditions"]
115
+ video_prompt_embeds = conditions["video_prompt_embeds"]
116
+ audio_prompt_embeds = conditions["audio_prompt_embeds"]
117
+ prompt_attention_mask = conditions["prompt_attention_mask"]
118
+
119
+ batch_size = video_latents.shape[0]
120
+ video_seq_len = video_latents.shape[1]
121
+ device = video_latents.device
122
+ dtype = video_latents.dtype
123
+
124
+ # Create conditioning mask (first frame conditioning)
125
+ video_conditioning_mask = self._create_first_frame_conditioning_mask(
126
+ batch_size=batch_size,
127
+ sequence_length=video_seq_len,
128
+ height=height,
129
+ width=width,
130
+ device=device,
131
+ first_frame_conditioning_p=self.config.first_frame_conditioning_p,
132
+ )
133
+
134
+ # Sample noise and sigmas
135
+ sigmas = timestep_sampler.sample_for(video_latents)
136
+ video_noise = torch.randn_like(video_latents)
137
+
138
+ # Apply noise: noisy = (1 - sigma) * clean + sigma * noise
139
+ sigmas_expanded = sigmas.view(-1, 1, 1)
140
+ noisy_video = (1 - sigmas_expanded) * video_latents + sigmas_expanded * video_noise
141
+
142
+ # For conditioning tokens, use clean latents
143
+ conditioning_mask_expanded = video_conditioning_mask.unsqueeze(-1)
144
+ noisy_video = torch.where(conditioning_mask_expanded, video_latents, noisy_video)
145
+
146
+ # Compute video targets (velocity prediction)
147
+ video_targets = video_noise - video_latents
148
+
149
+ # Create per-token timesteps
150
+ video_timesteps = self._create_per_token_timesteps(video_conditioning_mask, sigmas.squeeze())
151
+
152
+ # Generate video positions using ltx_core's native implementation
153
+ video_positions = self._get_video_positions(
154
+ num_frames=num_frames,
155
+ height=height,
156
+ width=width,
157
+ batch_size=batch_size,
158
+ fps=fps,
159
+ device=device,
160
+ dtype=dtype,
161
+ )
162
+
163
+ # Create video Modality
164
+ video_modality = Modality(
165
+ enabled=True,
166
+ sigma=sigmas,
167
+ latent=noisy_video,
168
+ timesteps=video_timesteps,
169
+ positions=video_positions,
170
+ context=video_prompt_embeds,
171
+ context_mask=prompt_attention_mask,
172
+ )
173
+
174
+ # Video loss mask: True for tokens we want to compute loss on (non-conditioning tokens)
175
+ video_loss_mask = ~video_conditioning_mask
176
+
177
+ # Handle audio if enabled
178
+ audio_modality = None
179
+ audio_targets = None
180
+ audio_loss_mask = None
181
+
182
+ if self.config.with_audio:
183
+ audio_modality, audio_targets, audio_loss_mask = self._prepare_audio_inputs(
184
+ batch=batch,
185
+ sigmas=sigmas,
186
+ audio_prompt_embeds=audio_prompt_embeds,
187
+ prompt_attention_mask=prompt_attention_mask,
188
+ batch_size=batch_size,
189
+ device=device,
190
+ dtype=dtype,
191
+ )
192
+
193
+ return ModelInputs(
194
+ video=video_modality,
195
+ audio=audio_modality,
196
+ video_targets=video_targets,
197
+ audio_targets=audio_targets,
198
+ video_loss_mask=video_loss_mask,
199
+ audio_loss_mask=audio_loss_mask,
200
+ )
201
+
202
+ def _prepare_audio_inputs(
203
+ self,
204
+ batch: dict[str, Any],
205
+ sigmas: Tensor,
206
+ audio_prompt_embeds: Tensor,
207
+ prompt_attention_mask: Tensor,
208
+ batch_size: int,
209
+ device: torch.device,
210
+ dtype: torch.dtype,
211
+ ) -> tuple[Modality, Tensor, Tensor]:
212
+ """Prepare audio inputs for joint audio-video training.
213
+ Args:
214
+ batch: Raw batch data containing audio_latents
215
+ sigmas: Sampled sigma values (same as video)
216
+ audio_prompt_embeds: Audio context embeddings
217
+ prompt_attention_mask: Attention mask for context
218
+ batch_size: Batch size
219
+ device: Target device
220
+ dtype: Target dtype
221
+ Returns:
222
+ Tuple of (audio_modality, audio_targets, audio_loss_mask)
223
+ """
224
+ # Get audio latents - dataset provides uniform non-patchified format [B, C, T, F]
225
+ audio_data = batch["audio_latents"]
226
+ audio_latents = audio_data["latents"]
227
+
228
+ # Patchify audio latents: [B, C, T, F] -> [B, T, C*F]
229
+ audio_latents = self._audio_patchifier.patchify(audio_latents)
230
+
231
+ audio_seq_len = audio_latents.shape[1]
232
+
233
+ # Sample audio noise
234
+ audio_noise = torch.randn_like(audio_latents)
235
+
236
+ # Apply noise to audio (same sigma as video)
237
+ sigmas_expanded = sigmas.view(-1, 1, 1)
238
+ noisy_audio = (1 - sigmas_expanded) * audio_latents + sigmas_expanded * audio_noise
239
+
240
+ # Compute audio targets
241
+ audio_targets = audio_noise - audio_latents
242
+
243
+ # Audio timesteps: all tokens use the sampled sigma (no conditioning mask)
244
+ audio_timesteps = sigmas.view(-1, 1).expand(-1, audio_seq_len)
245
+
246
+ # Generate audio positions
247
+ audio_positions = self._get_audio_positions(
248
+ num_time_steps=audio_seq_len,
249
+ batch_size=batch_size,
250
+ device=device,
251
+ dtype=dtype,
252
+ )
253
+
254
+ # Create audio Modality
255
+ audio_modality = Modality(
256
+ enabled=True,
257
+ latent=noisy_audio,
258
+ sigma=sigmas,
259
+ timesteps=audio_timesteps,
260
+ positions=audio_positions,
261
+ context=audio_prompt_embeds,
262
+ context_mask=prompt_attention_mask,
263
+ )
264
+
265
+ # Audio loss mask: all tokens contribute to loss (no conditioning)
266
+ audio_loss_mask = torch.ones(batch_size, audio_seq_len, dtype=torch.bool, device=device)
267
+
268
+ return audio_modality, audio_targets, audio_loss_mask
269
+
270
+ def compute_loss(
271
+ self,
272
+ video_pred: Tensor,
273
+ audio_pred: Tensor | None,
274
+ inputs: ModelInputs,
275
+ ) -> Tensor:
276
+ """Compute masked MSE loss for video and optionally audio."""
277
+ # Video loss
278
+ video_loss = (video_pred - inputs.video_targets).pow(2)
279
+ video_loss_mask = inputs.video_loss_mask.unsqueeze(-1).float()
280
+ video_loss = video_loss.mul(video_loss_mask).div(video_loss_mask.mean())
281
+ video_loss = video_loss.mean()
282
+
283
+ # If no audio, return video loss only
284
+ if not self.config.with_audio or audio_pred is None or inputs.audio_targets is None:
285
+ return video_loss
286
+
287
+ # Audio loss (no conditioning mask)
288
+ audio_loss = (audio_pred - inputs.audio_targets).pow(2).mean()
289
+
290
+ # Combined loss
291
+ return video_loss + audio_loss
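A short sketch of the noising and loss arithmetic used by `TextToVideoStrategy`, with invented shapes purely for illustration (the real tensors come from the patchifier and dataset):

```python
import torch

# Illustrative shapes only: batch of 2, 4 tokens, 8 latent channels.
latents = torch.randn(2, 4, 8)
noise = torch.randn_like(latents)
sigmas = torch.tensor([0.8, 0.2]).view(-1, 1, 1)

# Noising as in prepare_training_inputs: interpolate clean latents toward noise.
noisy = (1 - sigmas) * latents + sigmas * noise

# Velocity target the transformer is trained to predict.
targets = noise - latents

# Masked MSE as in compute_loss: the first token of each sample is treated as a
# conditioning token and excluded from the loss.
loss_mask = torch.tensor([[False, True, True, True],
                          [False, True, True, True]])
pred = torch.randn_like(latents)  # stand-in for the model output
mask = loss_mask.unsqueeze(-1).float()
loss = (pred - targets).pow(2).mul(mask).div(mask.mean()).mean()
print(loss.item())
```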
packages/ltx-trainer/src/ltx_trainer/training_strategies/video_to_video.py ADDED
@@ -0,0 +1,303 @@
1
+ """Video-to-video training strategy for IC-LoRA.
2
+ This strategy implements training with reference video conditioning where:
3
+ - Reference latents (clean) are concatenated with target latents (noised)
4
+ - Video coordinates handle both reference and target sequences
5
+ - Loss is computed only on the target portion
6
+ """
7
+
8
+ from typing import Any, Literal
9
+
10
+ import torch
11
+ from pydantic import Field
12
+ from torch import Tensor
13
+
14
+ from ltx_core.model.transformer.modality import Modality
15
+ from ltx_trainer import logger
16
+ from ltx_trainer.timestep_samplers import TimestepSampler
17
+ from ltx_trainer.training_strategies.base_strategy import (
18
+ DEFAULT_FPS,
19
+ ModelInputs,
20
+ TrainingStrategy,
21
+ TrainingStrategyConfigBase,
22
+ )
23
+
24
+
25
+ class VideoToVideoConfig(TrainingStrategyConfigBase):
26
+ """Configuration for video-to-video (IC-LoRA) training strategy."""
27
+
28
+ name: Literal["video_to_video"] = "video_to_video"
29
+
30
+ first_frame_conditioning_p: float = Field(
31
+ default=0.1,
32
+ description="Probability of conditioning on the first frame during training",
33
+ ge=0.0,
34
+ le=1.0,
35
+ )
36
+
37
+ reference_latents_dir: str = Field(
38
+ default="reference_latents",
39
+ description="Directory name for latents of reference videos",
40
+ )
41
+
42
+
43
+ class VideoToVideoStrategy(TrainingStrategy):
44
+ """Video-to-video training strategy for IC-LoRA.
45
+ This strategy implements training with reference video conditioning where:
46
+ - Reference latents (clean) are concatenated with target latents (noised)
47
+ - Video coordinates handle both reference and target sequences
48
+ - Loss is computed only on the target portion
49
+ Attributes:
50
+ reference_downscale_factor: The inferred downscale factor of reference videos.
51
+ This is computed from the first batch and cached for metadata export.
52
+ """
53
+
54
+ config: VideoToVideoConfig
55
+ reference_downscale_factor: int | None
56
+
57
+ def __init__(self, config: VideoToVideoConfig):
58
+ """Initialize strategy with configuration.
59
+ Args:
60
+ config: Video-to-video configuration
61
+ """
62
+ super().__init__(config)
63
+ self.reference_downscale_factor = None # Will be inferred from first batch
64
+
65
+ def get_data_sources(self) -> dict[str, str]:
66
+ """IC-LoRA training requires latents, conditions, and reference latents."""
67
+ return {
68
+ "latents": "latents",
69
+ "conditions": "conditions",
70
+ self.config.reference_latents_dir: "ref_latents",
71
+ }
72
+
73
+ def prepare_training_inputs( # noqa: PLR0915
74
+ self,
75
+ batch: dict[str, Any],
76
+ timestep_sampler: TimestepSampler,
77
+ ) -> ModelInputs:
78
+ """Prepare inputs for IC-LoRA training with reference videos."""
79
+ # Get pre-encoded latents - dataset provides uniform non-patchified format [B, C, F, H, W]
80
+ latents = batch["latents"]
81
+ target_latents = latents["latents"]
82
+ ref_latents = batch["ref_latents"]["latents"]
83
+
84
+ # Get dimensions
85
+ num_frames = latents["num_frames"][0].item()
86
+ height = latents["height"][0].item()
87
+ width = latents["width"][0].item()
88
+
89
+ ref_latents_info = batch["ref_latents"]
90
+ ref_frames = ref_latents_info["num_frames"][0].item()
91
+ ref_height = ref_latents_info["height"][0].item()
92
+ ref_width = ref_latents_info["width"][0].item()
93
+
94
+ # Infer reference downscale factor from dimension ratios
95
+ # This allows training with downscaled reference videos for efficiency
96
+ reference_downscale_factor = self._infer_reference_downscale_factor(
97
+ target_height=height,
98
+ target_width=width,
99
+ ref_height=ref_height,
100
+ ref_width=ref_width,
101
+ )
102
+
103
+ # Cache the scale factor for metadata export (only on first batch)
104
+ if self.reference_downscale_factor is None:
105
+ self.reference_downscale_factor = reference_downscale_factor
106
+ elif self.reference_downscale_factor != reference_downscale_factor:
107
+ raise ValueError(
108
+ f"Inconsistent reference downscale factor across batches. "
109
+ f"First batch had factor={self.reference_downscale_factor}, "
110
+ f"but current batch has factor={reference_downscale_factor}. "
111
+ f"All training samples must use the same reference/target resolution ratio."
112
+ )
113
+
114
+ # Patchify latents: [B, C, F, H, W] -> [B, seq_len, C]
115
+ target_latents = self._video_patchifier.patchify(target_latents)
116
+ ref_latents = self._video_patchifier.patchify(ref_latents)
117
+
118
+ # Handle FPS
119
+ fps = latents.get("fps", None)
120
+ if fps is not None and not torch.all(fps == fps[0]):
121
+ logger.warning(
122
+ f"Different FPS values found in the batch. Found: {fps.tolist()}, using the first one: {fps[0].item()}"
123
+ )
124
+ fps = fps[0].item() if fps is not None else DEFAULT_FPS
125
+
126
+ # Get text embeddings (already processed by embedding connectors in trainer)
127
+ # Video-to-video uses only video embeddings
128
+ conditions = batch["conditions"]
129
+ prompt_embeds = conditions["video_prompt_embeds"]
130
+ prompt_attention_mask = conditions["prompt_attention_mask"]
131
+
132
+ batch_size = target_latents.shape[0]
133
+ ref_seq_len = ref_latents.shape[1]
134
+ target_seq_len = target_latents.shape[1]
135
+ device = target_latents.device
136
+ dtype = target_latents.dtype
137
+
138
+ # Create conditioning mask
139
+ # Reference tokens are always conditioning (timestep=0)
140
+ ref_conditioning_mask = torch.ones(batch_size, ref_seq_len, dtype=torch.bool, device=device)
141
+
142
+ # Target tokens: check for first frame conditioning
143
+ target_conditioning_mask = self._create_first_frame_conditioning_mask(
144
+ batch_size=batch_size,
145
+ sequence_length=target_seq_len,
146
+ height=height,
147
+ width=width,
148
+ device=device,
149
+ first_frame_conditioning_p=self.config.first_frame_conditioning_p,
150
+ )
151
+
152
+ # Combined conditioning mask
153
+ conditioning_mask = torch.cat([ref_conditioning_mask, target_conditioning_mask], dim=1)
154
+
155
+ # Sample noise and sigmas for target
156
+ sigmas = timestep_sampler.sample_for(target_latents)
157
+ noise = torch.randn_like(target_latents)
158
+ sigmas_expanded = sigmas.view(-1, 1, 1)
159
+
160
+ # Apply noise to target
161
+ noisy_target = (1 - sigmas_expanded) * target_latents + sigmas_expanded * noise
162
+
163
+ # For first frame conditioning in target, use clean latents
164
+ target_conditioning_mask_expanded = target_conditioning_mask.unsqueeze(-1)
165
+ noisy_target = torch.where(target_conditioning_mask_expanded, target_latents, noisy_target)
166
+
167
+ # Targets for loss computation
168
+ targets = noise - target_latents
169
+
170
+ # Concatenate reference (clean) and target (noisy)
171
+ combined_latents = torch.cat([ref_latents, noisy_target], dim=1)
172
+
173
+ # Create per-token timesteps
174
+ timesteps = self._create_per_token_timesteps(conditioning_mask, sigmas.squeeze())
175
+
176
+ # Generate positions for reference and target separately, then concatenate
177
+ ref_positions = self._get_video_positions(
178
+ num_frames=ref_frames,
179
+ height=ref_height,
180
+ width=ref_width,
181
+ batch_size=batch_size,
182
+ fps=fps,
183
+ device=device,
184
+ dtype=dtype,
185
+ )
186
+
187
+ # Scale reference positions to match target coordinate space
188
+ # This maps ref positions from (0, ref_H, ref_W) to (0, target_H, target_W)
189
+ # Position tensor shape: [B, 3, seq_len, 2] where dim 1 is (time, height, width)
190
+ if reference_downscale_factor != 1:
191
+ ref_positions = ref_positions.clone()
192
+ ref_positions[:, 1, ...] *= reference_downscale_factor # height axis
193
+ ref_positions[:, 2, ...] *= reference_downscale_factor # width axis
194
+ # Time axis (index 0) remains unchanged
195
+
196
+ target_positions = self._get_video_positions(
197
+ num_frames=num_frames,
198
+ height=height,
199
+ width=width,
200
+ batch_size=batch_size,
201
+ fps=fps,
202
+ device=device,
203
+ dtype=dtype,
204
+ )
205
+
206
+ # Concatenate positions along sequence dimension
207
+ positions = torch.cat([ref_positions, target_positions], dim=2)
208
+
209
+ # Create video Modality
210
+ video_modality = Modality(
211
+ enabled=True,
212
+ latent=combined_latents,
213
+ sigma=sigmas,
214
+ timesteps=timesteps,
215
+ positions=positions,
216
+ context=prompt_embeds,
217
+ context_mask=prompt_attention_mask,
218
+ )
219
+
220
+ # Loss mask: only compute loss on non-conditioning target tokens
221
+ # Reference tokens: all False (no loss)
222
+ # Target tokens: True where not conditioning
223
+ ref_loss_mask = torch.zeros(batch_size, ref_seq_len, dtype=torch.bool, device=device)
224
+ target_loss_mask = ~target_conditioning_mask
225
+ video_loss_mask = torch.cat([ref_loss_mask, target_loss_mask], dim=1)
226
+
227
+ return ModelInputs(
228
+ video=video_modality,
229
+ audio=None,
230
+ video_targets=targets,
231
+ audio_targets=None,
232
+ video_loss_mask=video_loss_mask,
233
+ audio_loss_mask=None,
234
+ ref_seq_len=ref_seq_len,
235
+ )
236
+
237
+ def compute_loss(
238
+ self,
239
+ video_pred: Tensor,
240
+ _audio_pred: Tensor | None,
241
+ inputs: ModelInputs,
242
+ ) -> Tensor:
243
+ """Compute masked loss only on target portion."""
244
+ # Extract target portion of prediction
245
+ ref_seq_len = inputs.ref_seq_len
246
+ target_pred = video_pred[:, ref_seq_len:, :]
247
+
248
+ # Get target portion of loss mask
249
+ target_loss_mask = inputs.video_loss_mask[:, ref_seq_len:]
250
+
251
+ # Compute loss
252
+ loss = (target_pred - inputs.video_targets).pow(2)
253
+
254
+ # Apply loss mask
255
+ loss_mask = target_loss_mask.unsqueeze(-1).float()
256
+ loss = loss.mul(loss_mask).div(loss_mask.mean())
257
+
258
+ return loss.mean()
259
+
260
+ def get_checkpoint_metadata(self) -> dict[str, Any]:
261
+ """Get metadata for checkpoint files."""
262
+ metadata: dict[str, Any] = {}
263
+ # Always include reference_downscale_factor for IC-LoRAs so inference
264
+ # pipelines know the expected scale factor for reference videos.
265
+ if self.reference_downscale_factor is not None:
266
+ metadata["reference_downscale_factor"] = self.reference_downscale_factor
267
+ return metadata
268
+
269
+ @staticmethod
270
+ def _infer_reference_downscale_factor(
271
+ target_height: int,
272
+ target_width: int,
273
+ ref_height: int,
274
+ ref_width: int,
275
+ ) -> int:
276
+ """Infer the reference downscale factor from target and reference dimensions."""
277
+ # If dimensions match, no scaling needed
278
+ if target_height == ref_height and target_width == ref_width:
279
+ return 1
280
+
281
+ # Calculate scale factors for each dimension
282
+ if target_height % ref_height != 0 or target_width % ref_width != 0:
283
+ raise ValueError(
284
+ f"Target dimensions ({target_height}x{target_width}) must be exact multiples "
285
+ f"of reference dimensions ({ref_height}x{ref_width})"
286
+ )
287
+
288
+ scale_h = target_height // ref_height
289
+ scale_w = target_width // ref_width
290
+
291
+ if scale_h != scale_w:
292
+ raise ValueError(
293
+ f"Reference scale must be uniform. Got height scale {scale_h} and width scale {scale_w}. "
294
+ f"Target: {target_height}x{target_width}, Reference: {ref_height}x{ref_width}"
295
+ )
296
+
297
+ if scale_h < 1:
298
+ raise ValueError(
299
+ f"Reference dimensions ({ref_height}x{ref_width}) cannot be larger than "
300
+ f"target dimensions ({target_height}x{target_width})"
301
+ )
302
+
303
+ return scale_h
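A toy sketch of the core IC-LoRA layout implemented above: clean reference tokens are concatenated in front of the noised target tokens, and only the target slice contributes to the loss. Shapes are invented for illustration:

```python
import torch

# Toy shapes only: 6 reference tokens followed by 10 target tokens, 8 channels.
batch_size, ref_seq_len, target_seq_len, channels = 2, 6, 10, 8
ref_latents = torch.randn(batch_size, ref_seq_len, channels)      # kept clean
noisy_target = torch.randn(batch_size, target_seq_len, channels)  # noised

# Reference and target are concatenated along the sequence dimension,
# as in prepare_training_inputs.
combined = torch.cat([ref_latents, noisy_target], dim=1)          # [B, 16, C]

# Loss is computed only on the target slice, as in compute_loss.
video_pred = torch.randn_like(combined)                           # model output stand-in
target_pred = video_pred[:, ref_seq_len:, :]
targets = torch.randn(batch_size, target_seq_len, channels)
loss = (target_pred - targets).pow(2).mean()
```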
packages/ltx-trainer/src/ltx_trainer/utils.py ADDED
@@ -0,0 +1,88 @@
1
+ import io
2
+ from pathlib import Path
3
+
4
+ import numpy as np
5
+ import torch
6
+ from PIL import ExifTags, Image, ImageCms, ImageOps
7
+ from PIL.Image import Image as PilImage
8
+
9
+
10
+ def open_image_as_srgb(image_path: str | Path | io.BytesIO) -> PilImage:
11
+ """
12
+ Opens an image file, applies rotation (if it's set in metadata) and converts it
13
+ to the sRGB color space respecting the original image color space .
14
+ Args:
15
+ image_path: Path to the image file
16
+ Returns:
17
+ PIL Image in sRGB color space
18
+ """
19
+ exif_colorspace_srgb = 1
20
+
21
+ with Image.open(image_path) as img_raw:
22
+ img = ImageOps.exif_transpose(img_raw)
23
+
24
+ input_icc_profile = img.info.get("icc_profile")
25
+
26
+ # Try to convert to sRGB if the image has ICC profile metadata
27
+ srgb_profile = ImageCms.createProfile(colorSpace="sRGB")
28
+ if input_icc_profile is not None:
29
+ input_profile = ImageCms.ImageCmsProfile(io.BytesIO(input_icc_profile))
30
+ srgb_img = ImageCms.profileToProfile(img, input_profile, srgb_profile, outputMode="RGB")
31
+ else:
32
+ # Try fall back to checking EXIF
33
+ exif_data = img.getexif()
34
+ if exif_data is not None:
35
+ # Assume sRGB if no ICC profile and EXIF has no ColorSpace tag
36
+ color_space_value = exif_data.get(ExifTags.Base.ColorSpace.value)
37
+ if color_space_value is not None and color_space_value != exif_colorspace_srgb:
38
+ raise ValueError(
39
+ "Image has colorspace tag in EXIF but it isn't set to sRGB,"
40
+ " conversion is not supported."
41
+ f" EXIF ColorSpace tag value is {color_space_value}",
42
+ )
43
+
44
+ srgb_img = img.convert("RGB")
45
+
46
+ # Set sRGB profile in metadata since now the image is assumed to be in sRGB.
47
+ srgb_profile_data = ImageCms.ImageCmsProfile(srgb_profile).tobytes()
48
+ srgb_img.info["icc_profile"] = srgb_profile_data
49
+
50
+ return srgb_img
51
+
52
+
53
+ def save_image(image_tensor: torch.Tensor, output_path: Path | str) -> None:
54
+ """Save an image tensor to a file.
55
+ Args:
56
+ image_tensor: Image tensor of shape [C, H, W] or [C, 1, H, W] in range [0, 1] or [0, 255].
57
+ C must be 3 (RGB).
58
+ output_path: Path to save the image (any PIL-supported format, e.g., .png or .jpg)
59
+ """
60
+ output_path = Path(output_path)
61
+ output_path.parent.mkdir(parents=True, exist_ok=True)
62
+
63
+ # Handle [C, 1, H, W] format (single frame from video tensor)
64
+ if image_tensor.ndim == 4:
65
+ # Squeeze frame dimension: [C, 1, H, W] -> [C, H, W]
66
+ if image_tensor.shape[1] == 1:
67
+ image_tensor = image_tensor.squeeze(1)
68
+ else:
69
+ raise ValueError(f"Expected single-frame tensor with shape [C, 1, H, W], got shape {image_tensor.shape}")
70
+
71
+ if image_tensor.ndim != 3:
72
+ raise ValueError(f"Expected 3D tensor [C, H, W], got {image_tensor.ndim}D tensor")
73
+
74
+ if image_tensor.shape[0] != 3:
75
+ raise ValueError(f"Expected 3 channels (RGB), got {image_tensor.shape[0]} channels")
76
+
77
+ # Normalize to [0, 255] uint8
78
+ if torch.is_floating_point(image_tensor) and image_tensor.max() <= 1.0:
79
+ image_tensor = image_tensor * 255
80
+
81
+ # Clamp to valid uint8 range to prevent overflow
82
+ image_tensor = image_tensor.clamp(0, 255)
83
+
84
+ # [C, H, W] -> [H, W, C]
85
+ image_np: np.ndarray = image_tensor.permute(1, 2, 0).to(torch.uint8).cpu().numpy()
86
+
87
+ # Save using PIL
88
+ Image.fromarray(image_np).save(output_path)
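Hypothetical usage of the two helpers above; the file paths are placeholders chosen for illustration:

```python
import torch
from ltx_trainer.utils import open_image_as_srgb, save_image

# Load an image, honoring EXIF rotation and converting to sRGB.
img = open_image_as_srgb("frame_0001.png")  # placeholder path

# save_image expects a [3, H, W] (or [3, 1, H, W]) tensor in [0, 1] or [0, 255].
frame = torch.rand(3, 256, 256)             # e.g. a decoded RGB frame
save_image(frame, "outputs/preview.png")    # placeholder output path
```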
packages/ltx-trainer/templates/model_card.md ADDED
@@ -0,0 +1,59 @@
1
+ ---
2
+ tags:
3
+ - ltx-2
4
+ - ltx-video
5
+ - text-to-video
6
+ - audio-video
7
+ pinned: true
8
+ language:
9
+ - en
10
+ license: other
11
+ pipeline_tag: text-to-video
12
+ library_name: diffusers
13
+ ---
14
+
15
+ # {model_name}
16
+
17
+ This is a fine-tuned version of [`{base_model}`]({base_model_link}) trained on custom data.
18
+
19
+ ## Model Details
20
+
21
+ - **Base Model:** [`{base_model}`]({base_model_link})
22
+ - **Training Type:** {training_type}
23
+ - **Training Steps:** {training_steps}
24
+ - **Learning Rate:** {learning_rate}
25
+ - **Batch Size:** {batch_size}
26
+
27
+ ## Sample Outputs
28
+
29
+ | | | | |
30
+ |:---:|:---:|:---:|:---:|
31
+ {sample_grid}
32
+
33
+ ## Usage
34
+
35
+ This model is designed to be used with the LTX-2 (Lightricks Audio-Video) pipeline.
36
+
37
+ ### 🔌 Using Trained LoRAs in ComfyUI
38
+
39
+ In order to use the trained LoRA in ComfyUI, follow these steps:
40
+
41
+ 1. Copy your trained LoRA checkpoint (`.safetensors` file) to the `models/loras` folder in your ComfyUI installation.
42
+ 2. In your ComfyUI workflow:
43
+ - Add the "Load LoRA" node to choose your LoRA file
44
+ - Connect it to the "Load Checkpoint" node to apply the LoRA to the base model
45
+
46
+ You can find reference Text-to-Video (T2V) and Image-to-Video (I2V) workflows in the
47
+ official [LTX-2 repository](https://github.com/Lightricks/LTX-2).
48
+
49
+ ### Example Prompts
50
+
51
+ {validation_prompts}
52
+
53
+
54
+ This model inherits the license of the base model ([`{base_model}`]({base_model_link})).
55
+
56
+ ## Acknowledgments
57
+
58
+ - Base model: [Lightricks](https://huggingface.co/Lightricks/LTX-2)
59
+ - Trainer: [LTX-2](https://github.com/Lightricks/LTX-2)
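One way the placeholders in this template could be filled is with plain `str.format`; the snippet below is a sketch under that assumption (the trainer's actual rendering code is not part of this diff, and all field values are placeholders):

```python
from pathlib import Path

# Hypothetical rendering of the model-card template above.
template = Path("packages/ltx-trainer/templates/model_card.md").read_text()
card = template.format(
    model_name="my-ltx2-lora",
    base_model="Lightricks/LTX-2",
    base_model_link="https://huggingface.co/Lightricks/LTX-2",
    training_type="LoRA",
    training_steps=2000,
    learning_rate=1e-4,
    batch_size=1,
    sample_grid="| sample | sample | sample | sample |",
    validation_prompts="- A close-up of ocean waves at sunset",
)
Path("README.md").write_text(card)
```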