|
import spaces |
|
import gradio as gr |
|
import torch |
|
from huggingface_hub import snapshot_download |
|
|
|
from xora.models.autoencoders.causal_video_autoencoder import CausalVideoAutoencoder |
|
from xora.models.transformers.transformer3d import Transformer3DModel |
|
from xora.models.transformers.symmetric_patchifier import SymmetricPatchifier |
|
from xora.schedulers.rf import RectifiedFlowScheduler |
|
from xora.pipelines.pipeline_xora_video import XoraVideoPipeline |
|
from transformers import T5EncoderModel, T5Tokenizer |
|
from xora.utils.conditioning_method import ConditioningMethod |
|
from pathlib import Path |
|
import safetensors.torch |
|
import json |
|
import numpy as np |
|
import cv2 |
|
from PIL import Image |
|
import tempfile |
|
import os |
|
|
|
|
|
# Optional Hugging Face access token, taken from the environment (None if unset).
hf_token = os.environ.get("HF_TOKEN")

# Local directory for the model snapshot; the download is skipped when the
# directory already exists.
model_path = "asset"
if not os.path.exists(model_path):
    snapshot_download(
        "Lightricks/Xora",
        local_dir=model_path,
        repo_type='model',
        token=hf_token,
    )

# One sub-directory of the snapshot per pipeline component.
vae_dir, unet_dir, scheduler_dir = (
    Path(model_path) / component for component in ('vae', 'unet', 'scheduler')
)

# Run on CUDA when it is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
|
|
|
|
|
def load_vae(vae_dir):
    """Build the video autoencoder from a checkpoint directory.

    Reads ``config.json`` and ``vae_diffusion_pytorch_model.safetensors`` from
    ``vae_dir``, creates a ``CausalVideoAutoencoder`` from the config and loads
    the checkpoint weights into it.

    Args:
        vae_dir: Directory containing the two files; must support the ``/``
            join operator (e.g. ``pathlib.Path``).

    Returns:
        The autoencoder, moved to the module-level ``device`` and cast to
        ``torch.bfloat16``.
    """
    vae_ckpt_path = vae_dir / "vae_diffusion_pytorch_model.safetensors"
    vae_config_path = vae_dir / "config.json"
    with open(vae_config_path, 'r') as f:
        vae_config = json.load(f)
    vae = CausalVideoAutoencoder.from_config(vae_config)
    vae_state_dict = safetensors.torch.load_file(vae_ckpt_path)
    vae.load_state_dict(vae_state_dict)
    # Use the shared `device` (which falls back to CPU) instead of an
    # unconditional .cuda(), so loading also works on hosts without CUDA and
    # matches how the other components are placed.
    return vae.to(device).to(torch.bfloat16)
|
|
|
|
|
def load_unet(unet_dir):
    """Build the 3D transformer from a checkpoint directory.

    Args:
        unet_dir: Directory holding ``config.json`` and
            ``unet_diffusion_pytorch_model.safetensors``; must support the
            ``/`` join operator.

    Returns:
        A ``Transformer3DModel`` with the checkpoint weights loaded strictly,
        moved to the module-level ``device``.
    """
    weights_file = unet_dir / "unet_diffusion_pytorch_model.safetensors"
    config_file = unet_dir / "config.json"

    model = Transformer3DModel.from_config(Transformer3DModel.load_config(config_file))
    # strict=True: every checkpoint key must match the model exactly.
    model.load_state_dict(safetensors.torch.load_file(weights_file), strict=True)
    return model.to(device)
|
|
|
|
|
def load_scheduler(scheduler_dir):
    """Create a ``RectifiedFlowScheduler`` from ``scheduler_config.json``.

    Args:
        scheduler_dir: Directory containing ``scheduler_config.json``; must
            support the ``/`` join operator.

    Returns:
        The scheduler built from the loaded configuration.
    """
    config = RectifiedFlowScheduler.load_config(scheduler_dir / "scheduler_config.json")
    return RectifiedFlowScheduler.from_config(config)
|
|
|
|
|
|
|
def center_crop_and_resize(frame, target_height, target_width):
    """Center-crop ``frame`` to the target aspect ratio, then resize it.

    The frame is cropped symmetrically along whichever axis is too long for
    the target aspect ratio, and the crop is then resized with ``cv2.resize``.

    Args:
        frame: Array whose first two axes are height and width. A trailing
            channel axis is optional and is left untouched by the crop.
        target_height: Height of the returned frame, in pixels.
        target_width: Width of the returned frame, in pixels.

    Returns:
        The result of ``cv2.resize`` on the cropped frame, sized
        ``(target_width, target_height)``.
    """
    # Only the first two axes are needed, so single-channel (2-D) frames are
    # accepted as well as frames with a channel axis.
    h, w = frame.shape[:2]
    aspect_ratio_target = target_width / target_height
    aspect_ratio_frame = w / h
    if aspect_ratio_frame > aspect_ratio_target:
        # Frame is too wide: keep the full height and trim the sides equally.
        new_width = int(h * aspect_ratio_target)
        x_start = (w - new_width) // 2
        frame_cropped = frame[:, x_start:x_start + new_width]
    else:
        # Frame is too tall (or already matches): trim top and bottom equally.
        new_height = int(w / aspect_ratio_target)
        y_start = (h - new_height) // 2
        frame_cropped = frame[y_start:y_start + new_height, :]
    # cv2.resize takes the destination size as (width, height).
    frame_resized = cv2.resize(frame_cropped, (target_width, target_height))
    return frame_resized
|
|
|
|
|
def load_image_to_tensor_with_resize(image_path, target_height=512, target_width=768):
    """Load an image file as a normalized single-frame tensor.

    The image is converted to RGB, center-cropped and resized to the target
    size, and its 8-bit pixel values are rescaled from ``[0, 255]`` to
    ``[-1, 1]``.

    Args:
        image_path: Path (or file object) accepted by ``PIL.Image.open``.
        target_height: Height of the resulting frame, in pixels.
        target_width: Width of the resulting frame, in pixels.

    Returns:
        A float tensor of shape ``(1, 3, 1, target_height, target_width)``:
        the channels-first frame with singleton axes inserted at positions
        0 and 2.
    """
    # Image.open keeps the underlying file open; the context manager closes it
    # as soon as the pixel data has been copied into the NumPy array.
    with Image.open(image_path) as image:
        image_np = np.array(image.convert("RGB"))
    frame_resized = center_crop_and_resize(image_np, target_height, target_width)
    # (H, W, C) -> (C, H, W)
    frame_tensor = torch.tensor(frame_resized).permute(2, 0, 1).float()
    frame_tensor = (frame_tensor / 127.5) - 1.0
    return frame_tensor.unsqueeze(0).unsqueeze(2)
|
|
|
|
|
|
|
# Resolution presets offered in the UI. Each entry carries a display label plus
# the height, width and frame count it stands for; the final "Custom" entry
# has no fixed values.
preset_options = [
    {
        "label": f"{preset_height}x{preset_width}, {preset_frames} frames",
        "height": preset_height,
        "width": preset_width,
        "num_frames": preset_frames,
    }
    for preset_height, preset_width, preset_frames in (
        (704, 1216, 41),
        (704, 1088, 49),
        (640, 1056, 57),
        (608, 992, 65),
        (608, 896, 73),
        (544, 896, 81),
        (544, 832, 89),
        (512, 800, 97),
        (512, 768, 97),
        (480, 800, 105),
        (480, 736, 113),
        (480, 704, 121),
        (448, 704, 129),
        (448, 672, 137),
        (416, 640, 153),
        (384, 672, 161),
        (384, 640, 169),
        (384, 608, 177),
        (384, 576, 185),
        (352, 608, 193),
        (352, 576, 201),
        (352, 544, 209),
        (352, 512, 225),
        (352, 512, 233),
        (320, 544, 241),
        (320, 512, 249),
        (320, 512, 257),
    )
]
preset_options.append({"label": "Custom", "height": None, "width": None, "num_frames": None})
|
|
|
|
|
|
|
def preset_changed(preset):
    """Map a preset label to slider values and visibility updates.

    Args:
        preset: Label of the chosen entry in ``preset_options``.

    Returns:
        A 6-tuple ``(height, width, num_frames, update, update, update)``.
        For "Custom" the three values are ``None`` and each update sets
        ``visible=True``; for any other label the values come from the
        matching preset and each update sets ``visible=False``.

    Raises:
        StopIteration: If ``preset`` is neither "Custom" nor a known label.
    """
    if preset == "Custom":
        return None, None, None, gr.update(visible=True), gr.update(visible=True), gr.update(visible=True)

    chosen = next(option for option in preset_options if option["label"] == preset)
    hide_updates = tuple(gr.update(visible=False) for _ in range(3))
    return (chosen["height"], chosen["width"], chosen["num_frames"], *hide_updates)
|
|
|
|
|
|
|
# Instantiate every pipeline component once, at import time.
vae = load_vae(vae_dir)
unet = load_unet(unet_dir)
scheduler = load_scheduler(scheduler_dir)
patchifier = SymmetricPatchifier(patch_size=1)

# The text encoder and tokenizer come from the PixArt-alpha checkpoint.
text_encoder = T5EncoderModel.from_pretrained(
    "PixArt-alpha/PixArt-XL-2-1024-MS",
    subfolder="text_encoder",
).to(device)
tokenizer = T5Tokenizer.from_pretrained(
    "PixArt-alpha/PixArt-XL-2-1024-MS",
    subfolder="tokenizer",
)

# Assemble the components into the video pipeline and place it on `device`.
pipeline = XoraVideoPipeline(
    transformer=unet,
    patchifier=patchifier,
    text_encoder=text_encoder,
    tokenizer=tokenizer,
    scheduler=scheduler,
    vae=vae,
).to(device)
|
|
|
|
|
|
|
@spaces.GPU(duration=120)
def generate_video(image_path=None, prompt="", negative_prompt="",
                   seed=171198, num_inference_steps=40, num_images_per_prompt=1,
                   guidance_scale=3, height=512, width=768, num_frames=121, frame_rate=25,
                   progress=gr.Progress()):
    """Generate a video conditioned on a first-frame image and a text prompt.

    The image is loaded and resized to ``height`` x ``width``, the module-level
    ``pipeline`` is run with first-frame conditioning, and the first generated
    video is written to a temporary ``.mp4`` file.

    Args:
        image_path: Path of the conditioning image; required.
        prompt: Text prompt; must be at least 50 characters after stripping.
        negative_prompt: Negative text prompt passed to the pipeline.
        seed: Seed for the CPU random generator handed to the pipeline.
        num_inference_steps: Number of denoising steps; also used to scale
            the progress bar.
        num_images_per_prompt: Number of videos the pipeline generates; only
            the first one is saved.
        guidance_scale: Guidance scale passed to the pipeline.
        height: Output height in pixels.
        width: Output width in pixels.
        num_frames: Number of frames to generate.
        frame_rate: Frame rate passed to the pipeline and used for the file.
        progress: Gradio progress tracker updated after every step.

    Returns:
        Path of the written ``.mp4`` file.

    Raises:
        gr.Error: If the stripped prompt is shorter than 50 characters.
        ValueError: If ``image_path`` is empty or ``None``.
    """
    if len(prompt.strip()) < 50:
        raise gr.Error("Prompt must be at least 50 characters long. Please provide more details for the best results.", duration=5)

    if not image_path:
        raise ValueError("Image path must be provided.")
    media_items = load_image_to_tensor_with_resize(image_path, height, width).to(device)

    sample = {
        "prompt": prompt,
        'prompt_attention_mask': None,
        'negative_prompt': negative_prompt,
        'negative_prompt_attention_mask': None,
        'media_items': media_items,
    }

    # manual_seed only accepts integers; coerce in case the UI delivers the
    # slider value as a float.
    generator = torch.Generator(device="cpu").manual_seed(int(seed))

    def gradio_progress_callback(self, step, timestep, kwargs):
        # Report the fraction of completed steps to the Gradio progress bar.
        progress((step + 1) / num_inference_steps)

    images = pipeline(
        num_inference_steps=num_inference_steps,
        num_images_per_prompt=num_images_per_prompt,
        guidance_scale=guidance_scale,
        generator=generator,
        output_type="pt",
        height=height,
        width=width,
        num_frames=num_frames,
        frame_rate=frame_rate,
        **sample,
        is_video=True,
        vae_per_channel_normalize=True,
        conditioning_method=ConditioningMethod.FIRST_FRAME,
        mixed_precision=True,
        callback_on_step_end=gradio_progress_callback
    ).images

    # Index the first video instead of squeeze(0): squeeze is a no-op when more
    # than one video is generated, which would break the 4-axis permute below.
    # NOTE(review): assumes a batch-first (B, C, F, H, W) output — confirm
    # against the pipeline.
    video_np = images[0].permute(1, 2, 3, 0).cpu().float().numpy()
    video_np = (video_np * 255).astype(np.uint8)
    out_height, out_width = video_np.shape[1:3]

    # Reserve the output file atomically; tempfile.mktemp only returns a name
    # and is deprecated because another process can claim it first.
    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp_file:
        output_path = tmp_file.name

    out = cv2.VideoWriter(output_path, cv2.VideoWriter_fourcc(*'mp4v'), frame_rate, (out_width, out_height))
    try:
        # Reverse the channel axis (RGB -> BGR) for OpenCV.
        for frame in video_np[..., ::-1]:
            out.write(frame)
    finally:
        # Always release the writer so the file is finalized even on error.
        out.release()

    return output_path
|
|
|
|
|
|
|
def _apply_preset(preset):
    """Turn a preset selection into updates for the four custom-size sliders.

    Each slider receives a single update carrying both its new value (when the
    preset defines one) and its visibility, so no component has to be listed
    twice as an event output. The sliders are shown only for "Custom".
    """
    preset_height, preset_width, preset_frames = preset_changed(preset)[:3]
    show_sliders = preset == "Custom"

    def slider_update(value):
        # "Custom" yields no value: keep whatever the slider currently holds.
        if value is None:
            return gr.update(visible=show_sliders)
        return gr.update(value=value, visible=show_sliders)

    return (
        slider_update(preset_height),
        slider_update(preset_width),
        slider_update(preset_frames),
        gr.update(visible=show_sliders),
    )


# Build the Gradio interface: inputs on the left, generated video on the right.
with gr.Blocks() as iface:
    gr.Markdown("# Video Generation with Xora")

    with gr.Row():
        with gr.Column():
            image_input = gr.Image(type="filepath", label="Image Input")
            prompt = gr.Textbox(label="Prompt", value="A man riding a motorcycle down a winding road, surrounded by lush, green scenery and distant mountains. The sky is clear with a few wispy clouds, and the sunlight glistens on the motorcycle as it speeds along. The rider is dressed in a black leather jacket and helmet, leaning slightly forward as the wind rustles through nearby trees. The wheels kick up dust, creating a slight trail behind the motorcycle, adding a sense of speed and excitement to the scene.")
            negative_prompt = gr.Textbox(label="Negative Prompt", value="worst quality, inconsistent motion...")

            # Resolution / length preset selector.
            preset_dropdown = gr.Dropdown(
                choices=[p["label"] for p in preset_options],
                value="704x1216, 41 frames",
                label="Resolution Preset"
            )

            with gr.Accordion("Advanced Options", open=False):
                seed = gr.Slider(label="Seed", minimum=0, maximum=1000000, step=1, value=171198)
                inference_steps = gr.Slider(label="Inference Steps", minimum=1, maximum=100, step=1, value=40)
                images_per_prompt = gr.Slider(label="Images per Prompt", minimum=1, maximum=10, step=1, value=1)
                guidance_scale = gr.Slider(label="Guidance Scale", minimum=1.0, maximum=20.0, step=0.1, value=3.0)

                # Sliders shown only for the "Custom" preset. Their ranges and
                # steps cover every preset value (sizes are multiples of 32,
                # widths go up to 1216, frame counts up to 257), so a value
                # pushed from a preset is always valid for the slider.
                height_slider = gr.Slider(label="Height", minimum=256, maximum=1024, step=32, value=704, visible=False)
                width_slider = gr.Slider(label="Width", minimum=256, maximum=1280, step=32, value=1216, visible=False)
                num_frames_slider = gr.Slider(label="Number of Frames", minimum=1, maximum=257, step=1, value=41,
                                              visible=False)

                frame_rate = gr.Slider(label="Frame Rate", minimum=1, maximum=60, step=1, value=25, visible=False)

            generate_button = gr.Button("Generate Video")

        with gr.Column():
            output_video = gr.Video(label="Generated Video")

    # Selecting a preset sets the hidden size/length sliders and toggles the
    # visibility of all four custom sliders together.
    preset_dropdown.change(
        fn=_apply_preset,
        inputs=[preset_dropdown],
        outputs=[height_slider, width_slider, num_frames_slider, frame_rate]
    )

    generate_button.click(
        fn=generate_video,
        inputs=[image_input, prompt, negative_prompt, seed, inference_steps, images_per_prompt, guidance_scale,
                height_slider, width_slider, num_frames_slider, frame_rate],
        outputs=output_video
    )

iface.launch(share=True)
|
|