yhzhai committed on
Commit cd50e5a · 1 Parent(s): 63816b7
Files changed (2)
  1. README.md +120 -3
  2. demo.py +114 -0
README.md CHANGED
@@ -5,13 +5,130 @@ pipeline_tag: text-to-video
 
 
 <h1 align="center">
-  <a href="https://arxiv.org/abs/2309.01246"><b>Motion Consistency Model: Accelerating Video Diffusion with Disentangled Motion-Appearance Distillation</b></a>
+  <a href="https://yhzhai.github.io/mcm/"><b>Motion Consistency Model: Accelerating Video Diffusion with Disentangled Motion-Appearance Distillation</b></a>
 </h1>
 
-[[Project page]](https://yhzhai.github.io/mcm/) [[Code]](https://github.com/yhZhai/mcm) [[arXiv]](https://arxiv.org/abs/2406.06890)
+[[Project page]](https://yhzhai.github.io/mcm/) [[Code]](https://github.com/yhZhai/mcm) [[arXiv]](https://arxiv.org/abs/2406.06890) [[Demo]](https://huggingface.co/spaces/yhzhai/mcm)
 
 **TL;DR**: Our motion consistency model not only accelerates the sampling process of text-to-video diffusion models, but can also benefit from an additional high-quality image dataset to improve the frame quality of generated videos.
 
 ![Our motion consistency model not only distills the motion prior from the teacher to accelerate sampling, but can also benefit from an additional high-quality image dataset to improve the frame quality of generated videos.](https://github.com/yhZhai/mcm/blob/main/static/images/illustration.png?raw=true)
 
-## Usage
+## Usage
+
+```python
+from typing import Optional
+
+import torch
+from diffusers import (
+    AnimateDiffPipeline,
+    DiffusionPipeline,
+    LCMScheduler,
+    MotionAdapter,
+)
+from diffusers.utils import export_to_video
+from peft import PeftModel
+
+
+def main():
+    # select model_path from ["animatediff-webvid", "animatediff-modelscope",
+    # "modelscopet2v-webvid", "modelscopet2v-laion", "modelscopet2v-anime",
+    # "modelscopet2v-real", "modelscopet2v-3d-cartoon"]
+    model_path = "modelscopet2v-laion"
+    prompts = ["A cat walking on a treadmill", "A dog walking on a treadmill"]
+    num_inference_steps = 4
+
+    model_id = "yhzhai/mcm"
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    if "animatediff" in model_path:
+        pipeline = get_animatediff_pipeline()
+    elif "modelscope" in model_path:
+        pipeline = get_modelscope_pipeline()
+    else:
+        raise ValueError(f"Unknown pipeline {model_path}")
+
+    lora = PeftModel.from_pretrained(
+        pipeline.unet,
+        model_id,
+        subfolder=model_path,
+        adapter_name="pretrained_lora",
+        torch_device="cpu",
+    )
+    lora.merge_and_unload()
+    pipeline.unet = lora
+
+    pipeline = pipeline.to(device)
+    output = pipeline(
+        prompt=prompts,
+        num_frames=16,
+        guidance_scale=1.0,
+        num_inference_steps=num_inference_steps,
+        generator=torch.Generator("cpu").manual_seed(42),
+    ).frames
+    if not isinstance(output, list):
+        output = [output[i] for i in range(output.shape[0])]
+
+    for j in range(len(prompts)):
+        export_to_video(
+            output[j],
+            f"{j}-{model_path}.mp4",
+            fps=7,
+        )
+
+
+def get_animatediff_pipeline(
+    real_variant: Optional[str] = "realvision",
+    motion_module_path: str = "guoyww/animatediff-motion-adapter-v1-5-2",
+):
+    if real_variant is None:
+        model_id = "runwayml/stable-diffusion-v1-5"
+    elif real_variant == "epicrealism":
+        model_id = "emilianJR/epiCRealism"
+    elif real_variant == "realvision":
+        model_id = "SG161222/Realistic_Vision_V6.0_B1_noVAE"
+    else:
+        raise ValueError(f"Unknown real_variant {real_variant}")
+
+    adapter = MotionAdapter.from_pretrained(
+        motion_module_path, torch_dtype=torch.float16
+    )
+    pipe = AnimateDiffPipeline.from_pretrained(
+        model_id,
+        motion_adapter=adapter,
+        torch_dtype=torch.float16,
+    )
+    scheduler = LCMScheduler.from_pretrained(
+        model_id,
+        subfolder="scheduler",
+        timestep_scaling=4.0,
+        clip_sample=False,
+        timestep_spacing="linspace",
+        beta_schedule="linear",
+        beta_start=0.00085,
+        beta_end=0.012,
+        steps_offset=1,
+    )
+    pipe.scheduler = scheduler
+    pipe.enable_vae_slicing()
+    return pipe
+
+
+def get_modelscope_pipeline():
+    model_id = "ali-vilab/text-to-video-ms-1.7b"
+    pipe = DiffusionPipeline.from_pretrained(
+        model_id, torch_dtype=torch.float16, variant="fp16"
+    )
+    scheduler = LCMScheduler.from_pretrained(
+        model_id,
+        subfolder="scheduler",
+        timestep_scaling=4.0,
+    )
+    pipe.scheduler = scheduler
+    pipe.enable_vae_slicing()
+
+    return pipe
+
+
+if __name__ == "__main__":
+    main()
+```
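
The usage example above writes MP4 files via `export_to_video`. If GIF previews are more convenient, `diffusers.utils` also provides `export_to_gif`. A minimal sketch, assuming one of the AnimateDiff variants (whose pipeline returns PIL frames by default; the ModelScopeT2V variants return numpy frames, which would need converting to PIL first), with `output` and `model_path` taken from the example above:

```python
from diffusers.utils import export_to_gif

# Save each generated clip as a GIF next to the MP4 output.
for j, frames in enumerate(output):
    export_to_gif(frames, f"{j}-{model_path}.gif")
```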
demo.py ADDED
@@ -0,0 +1,114 @@
+from typing import Optional
+
+import torch
+from diffusers import (
+    AnimateDiffPipeline,
+    DiffusionPipeline,
+    LCMScheduler,
+    MotionAdapter,
+)
+from diffusers.utils import export_to_video
+from peft import PeftModel
+
+
+def main():
+    # select model_path from ["animatediff-webvid", "animatediff-modelscope",
+    # "modelscopet2v-webvid", "modelscopet2v-laion", "modelscopet2v-anime",
+    # "modelscopet2v-real", "modelscopet2v-3d-cartoon"]
+    model_path = "modelscopet2v-laion"
+    prompts = ["A cat walking on a treadmill", "A dog walking on a treadmill"]
+    num_inference_steps = 4
+
+    model_id = "yhzhai/mcm"
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    if "animatediff" in model_path:
+        pipeline = get_animatediff_pipeline()
+    elif "modelscope" in model_path:
+        pipeline = get_modelscope_pipeline()
+    else:
+        raise ValueError(f"Unknown pipeline {model_path}")
+
+    lora = PeftModel.from_pretrained(
+        pipeline.unet,
+        model_id,
+        subfolder=model_path,
+        adapter_name="pretrained_lora",
+        torch_device="cpu",
+    )
+    lora.merge_and_unload()
+    pipeline.unet = lora
+
+    pipeline = pipeline.to(device)
+    output = pipeline(
+        prompt=prompts,
+        num_frames=16,
+        guidance_scale=1.0,
+        num_inference_steps=num_inference_steps,
+        generator=torch.Generator("cpu").manual_seed(42),
+    ).frames
+    if not isinstance(output, list):
+        output = [output[i] for i in range(output.shape[0])]
+
+    for j in range(len(prompts)):
+        export_to_video(
+            output[j],
+            f"{j}-{model_path}.mp4",
+            fps=7,
+        )
+
+
+def get_animatediff_pipeline(
+    real_variant: Optional[str] = "realvision",
+    motion_module_path: str = "guoyww/animatediff-motion-adapter-v1-5-2",
+):
+    if real_variant is None:
+        model_id = "runwayml/stable-diffusion-v1-5"
+    elif real_variant == "epicrealism":
+        model_id = "emilianJR/epiCRealism"
+    elif real_variant == "realvision":
+        model_id = "SG161222/Realistic_Vision_V6.0_B1_noVAE"
+    else:
+        raise ValueError(f"Unknown real_variant {real_variant}")
+
+    adapter = MotionAdapter.from_pretrained(
+        motion_module_path, torch_dtype=torch.float16
+    )
+    pipe = AnimateDiffPipeline.from_pretrained(
+        model_id,
+        motion_adapter=adapter,
+        torch_dtype=torch.float16,
+    )
+    scheduler = LCMScheduler.from_pretrained(
+        model_id,
+        subfolder="scheduler",
+        timestep_scaling=4.0,
+        clip_sample=False,
+        timestep_spacing="linspace",
+        beta_schedule="linear",
+        beta_start=0.00085,
+        beta_end=0.012,
+        steps_offset=1,
+    )
+    pipe.scheduler = scheduler
+    pipe.enable_vae_slicing()
+    return pipe
+
+
+def get_modelscope_pipeline():
+    model_id = "ali-vilab/text-to-video-ms-1.7b"
+    pipe = DiffusionPipeline.from_pretrained(
+        model_id, torch_dtype=torch.float16, variant="fp16"
+    )
+    scheduler = LCMScheduler.from_pretrained(
+        model_id,
+        subfolder="scheduler",
+        timestep_scaling=4.0,
+    )
+    pipe.scheduler = scheduler
+    pipe.enable_vae_slicing()
+
+    return pipe
+
+
+if __name__ == "__main__":
+    main()
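
Both helper functions load their pipelines in float16, and `main()` moves the whole pipeline onto the GPU with `pipeline.to(device)`. On GPUs with limited memory, a common alternative in diffusers is model offloading (this requires the `accelerate` package); a minimal sketch:

```python
# Keep submodules on the CPU and move each onto the GPU only while it runs.
# Use this in main() in place of `pipeline = pipeline.to(device)`.
pipeline.enable_model_cpu_offload()
```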