yhzhai committed on
Commit cd50e5a · 1 Parent(s): 63816b7
Files changed (2)
  1. README.md +120 -3
  2. demo.py +114 -0
README.md CHANGED
@@ -5,13 +5,130 @@ pipeline_tag: text-to-video
 
 
 <h1 align="center">
-  <a href="https://arxiv.org/abs/2309.01246"><b>Motion Consistency Model: Accelerating Video Diffusion with Disentangled Motion-Appearance Distillation</b></a>
+  <a href="https://yhzhai.github.io/mcm/"><b>Motion Consistency Model: Accelerating Video Diffusion with Disentangled Motion-Appearance Distillation</b></a>
 </h1>
 
-[[Project page]](https://yhzhai.github.io/mcm/) [[Code]](https://github.com/yhZhai/mcm) [[arXiv]](https://arxiv.org/abs/2406.06890)
+[[Project page]](https://yhzhai.github.io/mcm/) [[Code]](https://github.com/yhZhai/mcm) [[arXiv]](https://arxiv.org/abs/2406.06890) [[Demo]](https://huggingface.co/spaces/yhzhai/mcm)
 
 **TL;DR**: Our motion consistency model not only accelerates the sampling process of text-to-video diffusion models, but can also benefit from an additional high-quality image dataset to improve the frame quality of generated videos.
 
 ![Our motion consistency model not only distills the motion prior from the teacher to accelerate sampling, but can also benefit from an additional high-quality image dataset to improve the frame quality of generated videos.](https://github.com/yhZhai/mcm/blob/main/static/images/illustration.png?raw=true)
 
-## Usage
+## Usage
+
+```python
+from typing import Optional
+
+import torch
+from diffusers import (
+    AnimateDiffPipeline,
+    DiffusionPipeline,
+    LCMScheduler,
+    MotionAdapter,
+)
+from diffusers.utils import export_to_video
+from peft import PeftModel
+
+
+def main():
+    # select model_path from ["animatediff-webvid", "animatediff-modelscope",
+    # "modelscopet2v-webvid", "modelscopet2v-laion", "modelscopet2v-anime",
+    # "modelscopet2v-real", "modelscopet2v-3d-cartoon"]
+    model_path = "modelscopet2v-laion"
+    prompts = ["A cat walking on a treadmill", "A dog walking on a treadmill"]
+    num_inference_steps = 4
+
+    model_id = "yhzhai/mcm"
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    if "animatediff" in model_path:
+        pipeline = get_animatediff_pipeline()
+    elif "modelscope" in model_path:
+        pipeline = get_modelscope_pipeline()
+    else:
+        raise ValueError(f"Unknown pipeline {model_path}")
+
+    lora = PeftModel.from_pretrained(
+        pipeline.unet,
+        model_id,
+        subfolder=model_path,
+        adapter_name="pretrained_lora",
+        torch_device="cpu",
+    )
+    lora.merge_and_unload()
+    pipeline.unet = lora
+
+    pipeline = pipeline.to(device)
+    output = pipeline(
+        prompt=prompts,
+        num_frames=16,
+        guidance_scale=1.0,
+        num_inference_steps=num_inference_steps,
+        generator=torch.Generator("cpu").manual_seed(42),
+    ).frames
+    if not isinstance(output, list):
+        output = [output[i] for i in range(output.shape[0])]
+
+    for j in range(len(prompts)):
+        export_to_video(
+            output[j],
+            f"{j}-{model_path}.mp4",
+            fps=7,
+        )
+
+
+def get_animatediff_pipeline(
+    real_variant: Optional[str] = "realvision",
+    motion_module_path: str = "guoyww/animatediff-motion-adapter-v1-5-2",
+):
+    if real_variant is None:
+        model_id = "runwayml/stable-diffusion-v1-5"
+    elif real_variant == "epicrealism":
+        model_id = "emilianJR/epiCRealism"
+    elif real_variant == "realvision":
+        model_id = "SG161222/Realistic_Vision_V6.0_B1_noVAE"
+    else:
+        raise ValueError(f"Unknown real_variant {real_variant}")
+
+    adapter = MotionAdapter.from_pretrained(
+        motion_module_path, torch_dtype=torch.float16
+    )
+    pipe = AnimateDiffPipeline.from_pretrained(
+        model_id,
+        motion_adapter=adapter,
+        torch_dtype=torch.float16,
+    )
+    scheduler = LCMScheduler.from_pretrained(
+        model_id,
+        subfolder="scheduler",
+        timestep_scaling=4.0,
+        clip_sample=False,
+        timestep_spacing="linspace",
+        beta_schedule="linear",
+        beta_start=0.00085,
+        beta_end=0.012,
+        steps_offset=1,
+    )
+    pipe.scheduler = scheduler
+    pipe.enable_vae_slicing()
+    return pipe
+
+
+def get_modelscope_pipeline():
+    model_id = "ali-vilab/text-to-video-ms-1.7b"
+    pipe = DiffusionPipeline.from_pretrained(
+        model_id, torch_dtype=torch.float16, variant="fp16"
+    )
+    scheduler = LCMScheduler.from_pretrained(
+        model_id,
+        subfolder="scheduler",
+        timestep_scaling=4.0,
+    )
+    pipe.scheduler = scheduler
+    pipe.enable_vae_slicing()
+
+    return pipe
+
+
+if __name__ == "__main__":
+    main()
+```
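
The usage example above writes MP4 files via `export_to_video`. If GIF previews are more convenient, `diffusers.utils` also provides `export_to_gif`. A minimal sketch, assuming one of the AnimateDiff variants (whose pipeline returns PIL frames by default; the ModelScopeT2V variants return numpy frames, which would need converting to PIL first), with `output` and `model_path` taken from the example above:

```python
from diffusers.utils import export_to_gif

# Save each generated clip as a GIF next to the MP4 output.
for j, frames in enumerate(output):
    export_to_gif(frames, f"{j}-{model_path}.gif")
```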
demo.py ADDED
@@ -0,0 +1,114 @@
+from typing import Optional
+
+import torch
+from diffusers import (
+    AnimateDiffPipeline,
+    DiffusionPipeline,
+    LCMScheduler,
+    MotionAdapter,
+)
+from diffusers.utils import export_to_video
+from peft import PeftModel
+
+
+def main():
+    # select model_path from ["animatediff-webvid", "animatediff-modelscope",
+    # "modelscopet2v-webvid", "modelscopet2v-laion", "modelscopet2v-anime",
+    # "modelscopet2v-real", "modelscopet2v-3d-cartoon"]
+    model_path = "modelscopet2v-laion"
+    prompts = ["A cat walking on a treadmill", "A dog walking on a treadmill"]
+    num_inference_steps = 4
+
+    model_id = "yhzhai/mcm"
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    if "animatediff" in model_path:
+        pipeline = get_animatediff_pipeline()
+    elif "modelscope" in model_path:
+        pipeline = get_modelscope_pipeline()
+    else:
+        raise ValueError(f"Unknown pipeline {model_path}")
+
+    lora = PeftModel.from_pretrained(
+        pipeline.unet,
+        model_id,
+        subfolder=model_path,
+        adapter_name="pretrained_lora",
+        torch_device="cpu",
+    )
+    lora.merge_and_unload()
+    pipeline.unet = lora
+
+    pipeline = pipeline.to(device)
+    output = pipeline(
+        prompt=prompts,
+        num_frames=16,
+        guidance_scale=1.0,
+        num_inference_steps=num_inference_steps,
+        generator=torch.Generator("cpu").manual_seed(42),
+    ).frames
+    if not isinstance(output, list):
+        output = [output[i] for i in range(output.shape[0])]
+
+    for j in range(len(prompts)):
+        export_to_video(
+            output[j],
+            f"{j}-{model_path}.mp4",
+            fps=7,
+        )
+
+
+def get_animatediff_pipeline(
+    real_variant: Optional[str] = "realvision",
+    motion_module_path: str = "guoyww/animatediff-motion-adapter-v1-5-2",
+):
+    if real_variant is None:
+        model_id = "runwayml/stable-diffusion-v1-5"
+    elif real_variant == "epicrealism":
+        model_id = "emilianJR/epiCRealism"
+    elif real_variant == "realvision":
+        model_id = "SG161222/Realistic_Vision_V6.0_B1_noVAE"
+    else:
+        raise ValueError(f"Unknown real_variant {real_variant}")
+
+    adapter = MotionAdapter.from_pretrained(
+        motion_module_path, torch_dtype=torch.float16
+    )
+    pipe = AnimateDiffPipeline.from_pretrained(
+        model_id,
+        motion_adapter=adapter,
+        torch_dtype=torch.float16,
+    )
+    scheduler = LCMScheduler.from_pretrained(
+        model_id,
+        subfolder="scheduler",
+        timestep_scaling=4.0,
+        clip_sample=False,
+        timestep_spacing="linspace",
+        beta_schedule="linear",
+        beta_start=0.00085,
+        beta_end=0.012,
+        steps_offset=1,
+    )
+    pipe.scheduler = scheduler
+    pipe.enable_vae_slicing()
+    return pipe
+
+
+def get_modelscope_pipeline():
+    model_id = "ali-vilab/text-to-video-ms-1.7b"
+    pipe = DiffusionPipeline.from_pretrained(
+        model_id, torch_dtype=torch.float16, variant="fp16"
+    )
+    scheduler = LCMScheduler.from_pretrained(
+        model_id,
+        subfolder="scheduler",
+        timestep_scaling=4.0,
+    )
+    pipe.scheduler = scheduler
+    pipe.enable_vae_slicing()
+
+    return pipe
+
+
+if __name__ == "__main__":
+    main()
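
Both helper functions load their pipelines in float16, and `main()` moves the whole pipeline onto the GPU with `pipeline.to(device)`. On GPUs with limited memory, a common alternative in diffusers is model offloading (this requires the `accelerate` package); a minimal sketch:

```python
# Keep submodules on the CPU and move each onto the GPU only while it runs.
# Use this in main() in place of `pipeline = pipeline.to(device)`.
pipeline.enable_model_cpu_offload()
```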