# Spaces:
# Runtime error
# Runtime error
import json
import os

import cv2
import numpy as np
import torch
from diffusers import (AutoencoderKL, CogVideoXDDIMScheduler, DDIMScheduler,
                       DPMSolverMultistepScheduler,
                       EulerAncestralDiscreteScheduler, EulerDiscreteScheduler,
                       PNDMScheduler)
from omegaconf import OmegaConf
from PIL import Image
from transformers import (CLIPImageProcessor, CLIPVisionModelWithProjection,
                          T5EncoderModel, T5Tokenizer)

from cogvideox.models.autoencoder_magvit import AutoencoderKLCogVideoX
from cogvideox.models.transformer3d import CogVideoXTransformer3DModel
from cogvideox.pipeline.pipeline_cogvideox import CogVideoX_Fun_Pipeline
from cogvideox.pipeline.pipeline_cogvideox_control import (
    CogVideoX_Fun_Pipeline_Control,
)
from cogvideox.utils.lora_utils import merge_lora, unmerge_lora
from cogvideox.utils.utils import get_video_to_video_latent, save_videos_grid
# Low GPU memory mode, intended for GPUs with less than 16GB of memory.
# When True the pipeline is set up with sequential CPU offload instead of
# model CPU offload.
low_gpu_memory_mode = False

# Model path
model_name = "models/Diffusion_Transformer/CogVideoX-Fun-V1.1-2b-Pose"

# Sampler name. It is used as a key into the scheduler mapping further down,
# so it must be one of:
# "Euler", "Euler A", "DPM++", "PNDM", "DDIM_Cog" or "DDIM_Origin".
sampler_name = "DDIM_Origin"

# Optional checkpoints loaded on top of the pretrained model (None = skip)
transformer_path = None
vae_path = None
lora_path = None

# Other params
# sample_size[0] is passed to the pipeline as height, sample_size[1] as width.
sample_size = [672, 384]
video_length = 49
fps = 8

# Use torch.float16 if the GPU does not support torch.bfloat16.
# Some graphics cards, such as V100 and 2080 Ti, do not support torch.bfloat16.
weight_dtype = torch.bfloat16

# Video that drives the generation (passed to the pipeline as control_video)
control_video = "asset/pose.mp4"

# Prompts
prompt = "A person wearing a knee-length white sleeveless dress and white high-heeled sandals performs a dance in a well-lit room with wooden flooring. The room's background features a closed door, a shelf displaying clear glass bottles of alcoholic beverages, and a partially visible dark-colored sofa. "
negative_prompt = "The video is not of a high quality, it has a low resolution. Watermark present in each frame. The background is solid. Strange body and strange trajectory. Distortion. "

# Sampling params
guidance_scale = 6.0
seed = 43
num_inference_steps = 50

# Weight used when merging/unmerging the LoRA (only relevant if lora_path is set)
lora_weight = 0.55

# Directory that receives the generated image/video
save_path = "samples/cogvideox-fun-videos_control"
# Build the transformer from the "transformer" subfolder of the model directory
# and cast it to the configured weight dtype.
transformer = CogVideoXTransformer3DModel.from_pretrained_2d(
    model_name,
    subfolder="transformer",
).to(weight_dtype)

# Optionally overlay weights from a standalone checkpoint file.
if transformer_path is not None:
    print(f"From checkpoint: {transformer_path}")
    if transformer_path.endswith("safetensors"):
        from safetensors.torch import load_file

        state_dict = load_file(transformer_path)
    else:
        # NOTE(review): torch.load unpickles the file; only load checkpoints
        # that come from a trusted source.
        state_dict = torch.load(transformer_path, map_location="cpu")
    # Unwrap checkpoints that store the weights under a "state_dict" key.
    state_dict = state_dict.get("state_dict", state_dict)

    # strict=False tolerates key mismatches; the counts are reported below.
    m, u = transformer.load_state_dict(state_dict, strict=False)
    print(f"missing keys: {len(m)}, unexpected keys: {len(u)}")
# Get Vae
# Built from the "vae" subfolder of the model directory and cast to the
# configured weight dtype.
vae = AutoencoderKLCogVideoX.from_pretrained(
    model_name,
    subfolder="vae",
).to(weight_dtype)

# Optionally overlay weights from a standalone checkpoint file.
if vae_path is not None:
    print(f"From checkpoint: {vae_path}")
    if vae_path.endswith("safetensors"):
        from safetensors.torch import load_file

        state_dict = load_file(vae_path)
    else:
        # NOTE(review): torch.load unpickles the file; only load checkpoints
        # that come from a trusted source.
        state_dict = torch.load(vae_path, map_location="cpu")
    # Unwrap checkpoints that store the weights under a "state_dict" key.
    state_dict = state_dict.get("state_dict", state_dict)

    # strict=False tolerates key mismatches; the counts are reported below.
    m, u = vae.load_state_dict(state_dict, strict=False)
    print(f"missing keys: {len(m)}, unexpected keys: {len(u)}")
# Get text encoder
# Loaded from the "text_encoder" subfolder of the model directory, directly in
# the configured weight dtype.
text_encoder = T5EncoderModel.from_pretrained(
    model_name,
    subfolder="text_encoder",
    torch_dtype=weight_dtype,
)
# Get Scheduler
# Maps every supported sampler name to its scheduler class. The mapping and
# the selected class are kept in separate names so that scheduler_dict really
# is the dict.
scheduler_dict = {
    "Euler": EulerDiscreteScheduler,
    "Euler A": EulerAncestralDiscreteScheduler,
    "DPM++": DPMSolverMultistepScheduler,
    "PNDM": PNDMScheduler,
    "DDIM_Cog": CogVideoXDDIMScheduler,
    "DDIM_Origin": DDIMScheduler,
}
# A sampler_name that is not a key above raises KeyError here.
Choosen_Scheduler = scheduler_dict[sampler_name]

# Instantiate the chosen scheduler from the "scheduler" subfolder of the model
# directory.
scheduler = Choosen_Scheduler.from_pretrained(
    model_name,
    subfolder="scheduler",
)
# Assemble the control pipeline from the components built above.
pipeline = CogVideoX_Fun_Pipeline_Control.from_pretrained(
    model_name,
    vae=vae,
    text_encoder=text_encoder,
    transformer=transformer,
    scheduler=scheduler,
    torch_dtype=weight_dtype,
)

# Pick the CPU offload strategy according to low_gpu_memory_mode.
if low_gpu_memory_mode:
    pipeline.enable_sequential_cpu_offload()
else:
    pipeline.enable_model_cpu_offload()

# Seeded CUDA generator, passed to the pipeline call for sampling.
generator = torch.Generator(device="cuda").manual_seed(seed)
# Merge the LoRA weights into the pipeline for the duration of the run.
if lora_path is not None:
    pipeline = merge_lora(pipeline, lora_path, lora_weight, "cuda")

try:
    # Round the frame count down so that (video_length - 1) is a multiple of
    # the VAE temporal compression ratio; a single frame is left as is.
    if video_length != 1:
        temporal_ratio = vae.config.temporal_compression_ratio
        video_length = int((video_length - 1) // temporal_ratio * temporal_ratio) + 1

    # Only input_video is used below; the mask and clip image are returned by
    # the helper but not consumed by this script.
    input_video, input_video_mask, clip_image = get_video_to_video_latent(
        control_video,
        video_length=video_length,
        sample_size=sample_size,
        fps=fps,
    )

    # Inference only: no gradients are needed.
    with torch.no_grad():
        sample = pipeline(
            prompt,
            num_frames=video_length,
            negative_prompt=negative_prompt,
            height=sample_size[0],
            width=sample_size[1],
            generator=generator,
            guidance_scale=guidance_scale,
            num_inference_steps=num_inference_steps,
            control_video=input_video,
        ).videos
finally:
    # Undo the LoRA merge even if generation failed, so the pipeline weights
    # are not left merged (matters when this is re-run in the same session).
    if lora_path is not None:
        pipeline = unmerge_lora(pipeline, lora_path, lora_weight, "cuda")
# Make sure the output directory exists (no-op when it already does).
os.makedirs(save_path, exist_ok=True)

# Name the output after the number of entries already in the directory,
# zero-padded to 8 digits (e.g. "00000001").
index = len(os.listdir(save_path)) + 1
prefix = str(index).zfill(8)

if video_length == 1:
    # Single frame: save as a PNG image.
    save_sample_path = os.path.join(save_path, prefix + ".png")

    # Take index 0 on axes 0 and 2, leaving a 3-D tensor.
    # presumably sample is (batch, channel, frame, height, width) — TODO confirm
    image = sample[0, :, 0]
    # Move the first remaining axis to the end (the layout Image.fromarray
    # expects if that axis is the channel axis).
    image = image.permute(1, 2, 0)
    # Scale to 0..255 bytes; assumes values are in [0, 1] — TODO confirm
    image = (image * 255).numpy().astype(np.uint8)
    image = Image.fromarray(image)
    image.save(save_sample_path)
else:
    # Multiple frames: save as an MP4 video at the configured fps.
    video_path = os.path.join(save_path, prefix + ".mp4")
    save_videos_grid(sample, video_path, fps=fps)