
# Wan2.1 I2V model (720p)

Diffusers-format weights for the Wan2.1 image-to-video (I2V) model at 720p resolution.

## Example

Generate an 81-frame clip from a single input image:

```python
import numpy as np
import torch
from diffusers import AutoencoderKLWan, UniPCMultistepScheduler, WanImageToVideoPipeline
from diffusers.utils import export_to_video, load_image
from transformers import AutoTokenizer, UMT5EncoderModel

tokenizer = AutoTokenizer.from_pretrained("google/umt5-xxl")
text_encoder = UMT5EncoderModel.from_pretrained("google/umt5-xxl", torch_dtype=torch.bfloat16)
# Keep the VAE in float32 for decoding quality.
vae = AutoencoderKLWan.from_pretrained("StevenZhang/Wan2.1-VAE_Diff", torch_dtype=torch.float32)

pipe = WanImageToVideoPipeline.from_pretrained(
    "ypyp/wan2.1_i2v_720p",
    tokenizer=tokenizer,
    text_encoder=text_encoder,
    vae=vae,
    torch_dtype=torch.bfloat16,
)
# flow_shift=5.0 is the recommended value for 720p output (3.0 for 480p).
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config, flow_shift=5.0)
# Moves sub-models to the GPU one at a time; do not also call pipe.to("cuda").
pipe.enable_model_cpu_offload()

image = load_image(
    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/astronaut.jpg"
)

# Resize so that height * width stays within the 720p pixel budget while both
# dimensions remain multiples of the VAE / transformer patch granularity.
max_area = 720 * 1280
aspect_ratio = image.height / image.width
mod_value = pipe.vae_scale_factor_spatial * pipe.transformer.config.patch_size[1]
height = round(np.sqrt(max_area * aspect_ratio)) // mod_value * mod_value
width = round(np.sqrt(max_area / aspect_ratio)) // mod_value * mod_value
image = image.resize((width, height))

prompt = (
    "An astronaut hatching from an egg, on the surface of the moon, the darkness and depth of space realised in "
    "the background. High quality, ultrarealistic detail and breath-taking movie-like camera shot."
)
generator = torch.Generator(device="cuda").manual_seed(0)

output = pipe(
    image=image,
    prompt=prompt,
    height=height,
    width=width,
    num_frames=81,
    num_inference_steps=50,
    guidance_scale=5.0,
    max_sequence_length=512,
    generator=generator,
    output_type="np",
).frames[0]
export_to_video(output, "output.mp4", fps=15)
```
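
The resizing step above picks the largest size under the 720p pixel budget that preserves the input's aspect ratio. As a minimal sketch of the arithmetic, assuming this checkpoint uses the common Wan2.1 configuration (spatial VAE downsampling factor 8 and transformer patch size 2, so `mod_value = 16`, values not confirmed by this repo), a square input image comes out at 960 × 960:

```python
import numpy as np

# Assumed config for this checkpoint: vae_scale_factor_spatial = 8, patch_size[1] = 2.
mod_value = 8 * 2      # = 16
max_area = 720 * 1280  # = 921600 pixels
aspect_ratio = 1.0     # e.g. a square 1024 x 1024 input

height = round(np.sqrt(max_area * aspect_ratio)) // mod_value * mod_value
width = round(np.sqrt(max_area / aspect_ratio)) // mod_value * mod_value
print(height, width)  # 960 960, since sqrt(921600) = 960 is already a multiple of 16
```

With `num_frames=81`, the exported clip runs about 5.4 seconds at `fps=15`.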