---

license: apache-2.0
---


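Minimal image-to-video example with `diffusers`: it loads the individual Wan components, then generates an 81-frame 480p clip from a single input image and a text prompt.
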
```python
import torch
from diffusers import WanI2VPipeline, WanTransformer3DModel
from diffusers.utils import load_image, export_to_video
from transformers import CLIPVisionModel, CLIPImageProcessor, UMT5EncoderModel

pretrained_model_name_or_path = "./wan_i2v"  # TODO replace with our hf id

# Image encoder that conditions the generated video on the input image.
image_encoder = CLIPVisionModel.from_pretrained(
    pretrained_model_name_or_path, subfolder="image_encoder", torch_dtype=torch.float16
)

# 480p image-to-video transformer; swap in the 720p variant below for higher resolution.
transformer_i2v = WanTransformer3DModel.from_pretrained(
    pretrained_model_name_or_path, subfolder="transformer_i2v_480p"
)
# transformer_i2v = WanTransformer3DModel.from_pretrained(
#     pretrained_model_name_or_path, subfolder="transformer_i2v_720p", torch_dtype=torch.bfloat16
# )

image_processor = CLIPImageProcessor.from_pretrained(pretrained_model_name_or_path, subfolder="image_processor")

text_encoder = UMT5EncoderModel.from_pretrained(
    pretrained_model_name_or_path, subfolder="text_encoder", torch_dtype=torch.bfloat16
)

pipe = WanI2VPipeline.from_pretrained(
    pretrained_model_name_or_path,
    transformer=transformer_i2v,
    text_encoder=text_encoder,
    image_encoder=image_encoder,
    image_processor=image_processor,
)

image = load_image(
    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/astronaut.jpg"
)

device = "cuda"
seed = 0
prompt = (
    "An astronaut hatching from an egg, on the surface of the moon, the darkness and depth of space realised in "
    "the background. High quality, ultrarealistic detail and breath-taking movie-like camera shot."
)
generator = torch.Generator(device=device).manual_seed(seed)

# Offload components to CPU between forward passes to reduce peak VRAM usage;
# use pipe.to(device) instead if the whole pipeline fits in GPU memory.
# pipe.to(device)
pipe.enable_model_cpu_offload()

# Recommended Chinese negative prompt; it lists artifacts to suppress (oversaturation, overexposure,
# static or blurry frames, subtitles, overall gray tone, low quality, JPEG artifacts, deformed or
# fused fingers, extra limbs, cluttered backgrounds, walking backwards, etc.).
negative_prompt = "色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走"

inputs = {
    "image": image,
    "prompt": prompt,
    "negative_prompt": negative_prompt,
    "max_area": 480 * 832,  # use 720 * 1280 for 720p
    "generator": generator,
    "num_inference_steps": 40,
    "guidance_scale": 5.0,
    "num_frames": 81,
    "max_sequence_length": 512,
    "output_type": "np",
    "flow_shift": 3.0,  # use 5.0 for 720p
}

output = pipe(**inputs).frames[0]
export_to_video(output, "output.mp4", fps=16)
```
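
The commented-out alternatives in the snippet above switch the pipeline to 720p. A 720p run differs only in the transformer subfolder, the target area, and the flow shift; a minimal sketch, reusing the component and input names defined above:

```python
# 720p configuration, assembled from the commented alternatives in the snippet above.
transformer_i2v = WanTransformer3DModel.from_pretrained(
    pretrained_model_name_or_path, subfolder="transformer_i2v_720p", torch_dtype=torch.bfloat16
)
pipe = WanI2VPipeline.from_pretrained(
    pretrained_model_name_or_path,
    transformer=transformer_i2v,
    text_encoder=text_encoder,
    image_encoder=image_encoder,
    image_processor=image_processor,
)
pipe.enable_model_cpu_offload()

# Only the resolution-dependent settings change.
inputs.update({"max_area": 720 * 1280, "flow_shift": 5.0})
output = pipe(**inputs).frames[0]
export_to_video(output, "output_720p.mp4", fps=16)
```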