Sapir committed on
Commit
b6c994f
1 Parent(s): de2eaeb

README: added installation and inference guidelines; made inference clearer.

README.md CHANGED
@@ -1 +1,70 @@
- # xora-core
+ <div align="center">
+
+ # Xora
+ </div>
+
+ This is the official repository for Xora.
+
+ ## Table of Contents
+
+ * [Introduction](#introduction)
+ * [Installation](#installation)
+ * [Inference](#inference)
+   * [Inference Code](#inference-code)
+ * [Acknowledgement](#acknowledgement)
+
+ ## Introduction
+
+ The performance of Diffusion Transformers is heavily influenced by the number of generated latent pixels (or tokens). In video generation, the token count becomes substantial as the number of frames increases. To address this, we designed a carefully optimized VAE that compresses videos into a smaller number of tokens while utilizing a deeper latent space. This approach enables our model to generate high-quality 768x512 videos at 24 FPS, achieving near real-time speeds.
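For a rough sense of scale, the sketch below compares the raw pixel count of a short clip with the corresponding latent token count. The downsampling factors used here are illustrative assumptions made for the arithmetic only, not the model's published compression rates:

```python
# Back-of-the-envelope token count for a 5-second 768x512 clip at 24 FPS.
# NOTE: spatial_ds and temporal_ds are assumed values for illustration only.
height, width, fps, seconds = 512, 768, 24, 5
frames = fps * seconds

raw_pixels = height * width * frames  # pixels the clip actually contains

spatial_ds, temporal_ds = 32, 8  # hypothetical VAE downsampling factors
latent_tokens = (height // spatial_ds) * (width // spatial_ds) * (frames // temporal_ds)

print(f"raw pixels:    {raw_pixels:,}")     # ~47 million
print(f"latent tokens: {latent_tokens:,}")  # a few thousand
```

The fewer tokens the transformer has to attend over, the cheaper each denoising step becomes, which is what makes near real-time generation feasible.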
+
+ ## Installation
+
+ ### Setup
+
+ The codebase currently uses Python 3.10.5 and CUDA 12.2, and supports PyTorch >= 2.1.2.
+
+ ```bash
+ git clone https://github.com/LightricksResearch/xora-core.git
+ cd xora-core
+
+ # create env
+ python -m venv env
+ source env/bin/activate
+ python -m pip install -e .\[inference-script\]
+ ```
+
+ Then, download the model from [Hugging Face](https://huggingface.co/Lightricks/Xora):
+
+ ```python
+ from huggingface_hub import snapshot_download
+
+ model_path = 'PATH'  # The local directory to save the downloaded checkpoint
+ snapshot_download("Lightricks/Xora", local_dir=model_path, local_dir_use_symlinks=False, repo_type='model')
+ ```
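Alternatively, assuming your `huggingface_hub` version ships the `huggingface-cli` tool, the same snapshot can be fetched from the shell (`PATH` is the same placeholder as above):

```bash
huggingface-cli download Lightricks/Xora --local-dir PATH
```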
+
+ ## Inference
+
+ ### Inference Code
+
+ To use our model, please follow the inference code in [`inference.py`](https://github.com/LightricksResearch/xora-core/blob/main/inference.py):
+
+ For text-to-video generation:
+
+ ```bash
+ python inference.py --ckpt_dir 'PATH' --prompt "PROMPT" --height HEIGHT --width WIDTH
+ ```
+
+ For image-to-video generation:
+
+ ```bash
+ python inference.py --ckpt_dir 'PATH' --prompt "PROMPT" --input_image_path IMAGE_PATH --height HEIGHT --width WIDTH
+ ```
+
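As a concrete, hypothetical example, the following text-to-video call uses one of the height/width/frame-count combinations listed in `RECOMMENDED_RESOLUTIONS` in `inference.py`; the checkpoint path and prompt are placeholders:

```bash
python inference.py --ckpt_dir './checkpoints' \
    --prompt "A sailboat glides across a calm lake at sunset" \
    --height 512 --width 768 --num_frames 97
```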
+
+ ## Acknowledgement
+
+ We are grateful to the following awesome projects, which we drew on when implementing Xora:
+ * [DiT](https://github.com/facebookresearch/DiT) and [PixArt-alpha](https://github.com/PixArt-alpha/PixArt-alpha): vision transformers for image generation.
+
+ [//]: # (## Citation)
xora/examples/image_to_video.py → inference.py RENAMED
@@ -16,9 +16,39 @@ import cv2
 from PIL import Image
 import random
 
+RECOMMENDED_RESOLUTIONS = [
+    (704, 1216, 41),
+    (704, 1088, 49),
+    (640, 1056, 57),
+    (608, 992, 65),
+    (608, 896, 73),
+    (544, 896, 81),
+    (544, 832, 89),
+    (512, 800, 97),
+    (512, 768, 97),
+    (480, 800, 105),
+    (480, 736, 113),
+    (480, 704, 121),
+    (448, 704, 129),
+    (448, 672, 137),
+    (416, 640, 153),
+    (384, 672, 161),
+    (384, 640, 169),
+    (384, 608, 177),
+    (384, 576, 185),
+    (352, 608, 193),
+    (352, 576, 201),
+    (352, 544, 209),
+    (352, 512, 225),
+    (352, 512, 233),
+    (320, 544, 241),
+    (320, 512, 249),
+    (320, 512, 257),
+]
+
 
 def load_vae(vae_dir):
-    vae_ckpt_path = vae_dir / "diffusion_pytorch_model.safetensors"
+    vae_ckpt_path = vae_dir / "vae_diffusion_pytorch_model.safetensors"
     vae_config_path = vae_dir / "config.json"
     with open(vae_config_path, "r") as f:
         vae_config = json.load(f)
@@ -29,7 +59,7 @@ def load_vae(vae_dir):
 
 
 def load_unet(unet_dir):
-    unet_ckpt_path = unet_dir / "diffusion_pytorch_model.safetensors"
+    unet_ckpt_path = unet_dir / "unet_diffusion_pytorch_model.safetensors"
     unet_config_path = unet_dir / "config.json"
     transformer_config = Transformer3DModel.load_config(unet_config_path)
     transformer = Transformer3DModel.from_config(transformer_config)
@@ -60,7 +90,7 @@ def center_crop_and_resize(frame, target_height, target_width):
     return frame_resized
 
 
-def load_video_to_tensor_with_resize(video_path, target_height=512, target_width=768):
+def load_video_to_tensor_with_resize(video_path, target_height, target_width):
     cap = cv2.VideoCapture(video_path)
     frames = []
     while True:
@@ -68,7 +98,12 @@ def load_video_to_tensor_with_resize(video_path, target_height=512, target_width
         if not ret:
             break
         frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-        frame_resized = center_crop_and_resize(frame_rgb, target_height, target_width)
+        if target_height is not None:
+            frame_resized = center_crop_and_resize(
+                frame_rgb, target_height, target_width
+            )
+        else:
+            frame_resized = frame_rgb
         frames.append(frame_resized)
     cap.release()
     video_np = (np.array(frames) / 127.5) - 1.0
@@ -99,9 +134,19 @@ def main():
         help="Path to the directory containing unet, vae, and scheduler subdirectories",
     )
     parser.add_argument(
-        "--video_path", type=str, help="Path to the input video file (first frame used)"
+        "--input_video_path",
+        type=str,
+        help="Path to the input video file (first frame used)",
+    )
+    parser.add_argument(
+        "--input_image_path", type=str, help="Path to the input image file"
+    )
+    parser.add_argument(
+        "--output_path",
+        type=str,
+        default=None,
+        help="Path to save output video, if None will save in working directory.",
     )
-    parser.add_argument("--image_path", type=str, help="Path to the input image file")
     parser.add_argument("--seed", type=int, default="171198")
 
     # Pipeline parameters
@@ -121,10 +166,16 @@ def main():
         help="Guidance scale for the pipeline",
     )
     parser.add_argument(
-        "--height", type=int, default=512, help="Height of the output video frames"
+        "--height",
+        type=int,
+        default=None,
+        help="Height of the output video frames. Optional if an input image provided.",
    )
     parser.add_argument(
-        "--width", type=int, default=768, help="Width of the output video frames"
+        "--width",
+        type=int,
+        default=None,
+        help="Width of the output video frames. If None will infer from input image.",
     )
     parser.add_argument(
         "--num_frames",
@@ -136,12 +187,6 @@ def main():
         "--frame_rate", type=int, default=25, help="Frame rate for the output video"
     )
 
-    parser.add_argument(
-        "--mixed_precision",
-        action="store_true",
-        help="Mixed precision in float32 and bfloat16",
-    )
-
     parser.add_argument(
         "--bfloat16",
         action="store_true",
@@ -152,7 +197,6 @@ def main():
     parser.add_argument(
         "--prompt",
         type=str,
-        default='A man wearing a black leather jacket and blue jeans is riding a Harley Davidson motorcycle down a paved road. The man has short brown hair and is wearing a black helmet. The motorcycle is a dark red color with a large front fairing. The road is surrounded by green grass and trees. There is a gas station on the left side of the road with a red and white sign that says "Oil" and "Diner".',
         help="Text prompt to guide generation",
     )
     parser.add_argument(
@@ -161,9 +205,42 @@ def main():
         default="worst quality, inconsistent motion, blurry, jittery, distorted",
         help="Negative prompt for undesired features",
     )
+    parser.add_argument(
+        "--custom_resolution",
+        action="store_true",
+        default=False,
+        help="Enable custom resolution (not in recommneded resolutions) if specified (default: False)",
+    )
 
     args = parser.parse_args()
 
+    if args.input_image_path is None and args.input_video_path is None:
+        assert (
+            args.height is not None and args.width is not None
+        ), "Must enter height and width for text to image generation."
+
+    # Load media (video or image)
+    if args.input_video_path:
+        media_items = load_video_to_tensor_with_resize(
+            args.input_video_path, args.height, args.width
+        ).unsqueeze(0)
+    elif args.input_image_path:
+        media_items = load_image_to_tensor_with_resize(
+            args.input_image_path, args.height, args.width
+        )
+    else:
+        media_items = None
+
+    height = args.height if args.height else media_items.shape[-2]
+    width = args.width if args.width else media_items.shape[-1]
+    assert height % 32 == 0, f"Height ({height}) should be divisible by 32."
+    assert width % 32 == 0, f"Width ({width}) should be divisible by 32."
+    assert (
+        height,
+        width,
+        args.num_frames,
+    ) in RECOMMENDED_RESOLUTIONS or args.custom_resolution, f"The selected resolution + num frames combination is not supported, results would be suboptimal. Supported (h,w,f) are: {RECOMMENDED_RESOLUTIONS}. Use --custom_resolution to enable working with this resolution."
+
     # Paths for the separate mode directories
     ckpt_dir = Path(args.ckpt_dir)
     unet_dir = ckpt_dir / "unet"
@@ -197,18 +274,6 @@ def main():
 
     pipeline = XoraVideoPipeline(**submodel_dict).to("cuda")
 
-    # Load media (video or image)
-    if args.video_path:
-        media_items = load_video_to_tensor_with_resize(
-            args.video_path, args.height, args.width
-        ).unsqueeze(0)
-    elif args.image_path:
-        media_items = load_image_to_tensor_with_resize(
-            args.image_path, args.height, args.width
-        )
-    else:
-        raise ValueError("Either --video_path or --image_path must be provided.")
-
     # Prepare input for the pipeline
     sample = {
         "prompt": args.prompt,
@@ -231,15 +296,19 @@ def main():
         generator=generator,
         output_type="pt",
         callback_on_step_end=None,
-        height=args.height,
-        width=args.width,
+        height=height,
+        width=width,
         num_frames=args.num_frames,
         frame_rate=args.frame_rate,
         **sample,
         is_video=True,
         vae_per_channel_normalize=True,
-        conditioning_method=ConditioningMethod.FIRST_FRAME,
-        mixed_precision=args.mixed_precision,
+        conditioning_method=(
+            ConditioningMethod.FIRST_FRAME
+            if media_items is not None
+            else ConditioningMethod.UNCONDITIONAL
+        ),
+        mixed_precision=not args.bfloat16,
     ).images
 
     # Save output video
@@ -257,16 +326,29 @@ def main():
         video_np = (video_np * 255).astype(np.uint8)
         fps = args.frame_rate
         height, width = video_np.shape[1:3]
-        output_filename = get_unique_filename(f"video_output_{i}", ".mp4", ".")
-
-        out = cv2.VideoWriter(
-            output_filename, cv2.VideoWriter_fourcc(*"mp4v"), fps, (width, height)
-        )
-
-        for frame in video_np[..., ::-1]:
-            out.write(frame)
-
-        out.release()
+        if video_np.shape[0] == 1:
+            output_filename = (
+                args.output_path
+                if args.output_path is not None
+                else get_unique_filename(f"image_output_{i}", ".png", ".")
+            )
+            cv2.imwrite(
+                output_filename, video_np[0][..., ::-1]
+            )  # Save single frame as image
+        else:
+            output_filename = (
+                args.output_path
+                if args.output_path is not None
+                else get_unique_filename(f"video_output_{i}", ".mp4", ".")
+            )
+
+            out = cv2.VideoWriter(
+                output_filename, cv2.VideoWriter_fourcc(*"mp4v"), fps, (width, height)
+            )
+
+            for frame in video_np[..., ::-1]:
+                out.write(frame)
+            out.release()
 
 
 if __name__ == "__main__":
scripts/to_safetensors.py CHANGED
@@ -100,10 +100,10 @@ def main(
 
     # Save unet and vae safetensors with the name diffusion_pytorch_model.safetensors
     safetensors.torch.save_file(
-        unet, unet_dir / "diffusion_pytorch_model.safetensors"
+        unet, unet_dir / "unet_diffusion_pytorch_model.safetensors"
     )
     safetensors.torch.save_file(
-        vae, vae_dir / "diffusion_pytorch_model.safetensors"
+        vae, vae_dir / "vae_diffusion_pytorch_model.safetensors"
     )
 
     # Save config files for unet, vae, and scheduler
xora/examples/text_to_video.py DELETED
@@ -1,138 +0,0 @@
-import torch
-from xora.models.autoencoders.causal_video_autoencoder import CausalVideoAutoencoder
-from xora.models.transformers.transformer3d import Transformer3DModel
-from xora.models.transformers.symmetric_patchifier import SymmetricPatchifier
-from xora.schedulers.rf import RectifiedFlowScheduler
-from xora.pipelines.pipeline_xora_video import XoraVideoPipeline
-from pathlib import Path
-from transformers import T5EncoderModel, T5Tokenizer
-import safetensors.torch
-import json
-import argparse
-
-
-def load_vae(vae_dir):
-    vae_ckpt_path = vae_dir / "diffusion_pytorch_model.safetensors"
-    vae_config_path = vae_dir / "config.json"
-    with open(vae_config_path, "r") as f:
-        vae_config = json.load(f)
-    vae = CausalVideoAutoencoder.from_config(vae_config)
-    vae_state_dict = safetensors.torch.load_file(vae_ckpt_path)
-    vae.load_state_dict(vae_state_dict)
-    return vae.cuda().to(torch.bfloat16)
-
-
-def load_unet(unet_dir):
-    unet_ckpt_path = unet_dir / "diffusion_pytorch_model.safetensors"
-    unet_config_path = unet_dir / "config.json"
-    transformer_config = Transformer3DModel.load_config(unet_config_path)
-    transformer = Transformer3DModel.from_config(transformer_config)
-    unet_state_dict = safetensors.torch.load_file(unet_ckpt_path)
-    transformer.load_state_dict(unet_state_dict, strict=True)
-    return transformer.cuda()
-
-
-def load_scheduler(scheduler_dir):
-    scheduler_config_path = scheduler_dir / "scheduler_config.json"
-    scheduler_config = RectifiedFlowScheduler.load_config(scheduler_config_path)
-    return RectifiedFlowScheduler.from_config(scheduler_config)
-
-
-def main():
-    # Parse command line arguments
-    parser = argparse.ArgumentParser(
-        description="Load models from separate directories"
-    )
-    parser.add_argument(
-        "--separate_dir",
-        type=str,
-        required=True,
-        help="Path to the directory containing unet, vae, and scheduler subdirectories",
-    )
-    parser.add_argument(
-        "--mixed_precision",
-        action="store_true",
-        help="Mixed precision in float32 and bfloat16",
-    )
-    parser.add_argument(
-        "--bfloat16",
-        action="store_true",
-        help="Denoise in bfloat16",
-    )
-    args = parser.parse_args()
-
-    # Paths for the separate mode directories
-    separate_dir = Path(args.separate_dir)
-    unet_dir = separate_dir / "unet"
-    vae_dir = separate_dir / "vae"
-    scheduler_dir = separate_dir / "scheduler"
-
-    # Load models
-    vae = load_vae(vae_dir)
-    unet = load_unet(unet_dir)
-    scheduler = load_scheduler(scheduler_dir)
-
-    # Patchifier (remains the same)
-    patchifier = SymmetricPatchifier(patch_size=1)
-
-    text_encoder = T5EncoderModel.from_pretrained(
-        "PixArt-alpha/PixArt-XL-2-1024-MS", subfolder="text_encoder"
-    ).to("cuda")
-    tokenizer = T5Tokenizer.from_pretrained(
-        "PixArt-alpha/PixArt-XL-2-1024-MS", subfolder="tokenizer"
-    )
-
-    if args.bfloat16 and unet.dtype != torch.bfloat16:
-        unet = unet.to(torch.bfloat16)
-
-    # Use submodels for the pipeline
-    submodel_dict = {
-        "transformer": unet,  # using unet for transformer
-        "patchifier": patchifier,
-        "scheduler": scheduler,
-        "text_encoder": text_encoder,
-        "tokenizer": tokenizer,
-        "vae": vae,
-    }
-
-    pipeline = XoraVideoPipeline(**submodel_dict).to("cuda")
-
-    # Sample input
-    num_inference_steps = 20
-    num_images_per_prompt = 2
-    guidance_scale = 3
-    height = 512
-    width = 768
-    num_frames = 57
-    frame_rate = 25
-    sample = {
-        "prompt": "A middle-aged man with glasses and a salt-and-pepper beard is driving a car and talking, gesturing with his right hand. "
-        "The man is wearing a dark blue zip-up jacket and a light blue collared shirt. He is sitting in the driver's seat of a car with a black interior. The car is moving on a road with trees and bushes on either side. The man has a serious expression on his face and is looking straight ahead.",
-        "prompt_attention_mask": None,  # Adjust attention masks as needed
-        "negative_prompt": "Ugly deformed",
-        "negative_prompt_attention_mask": None,
-    }
-
-    # Generate images (video frames)
-    _ = pipeline(
-        num_inference_steps=num_inference_steps,
-        num_images_per_prompt=num_images_per_prompt,
-        guidance_scale=guidance_scale,
-        generator=None,
-        output_type="pt",
-        callback_on_step_end=None,
-        height=height,
-        width=width,
-        num_frames=num_frames,
-        frame_rate=frame_rate,
-        **sample,
-        is_video=True,
-        vae_per_channel_normalize=True,
-        mixed_precision=args.mixed_precision,
-    ).images
-
-    print("Generated images (video frames).")
-
-
-if __name__ == "__main__":
-    main()