import spaces
import os
import random
from datetime import datetime
from typing import Optional

import gradio as gr
import numpy as np
import torch
from diffusers import (
    AnimateDiffPipeline,
    DiffusionPipeline,
    LCMScheduler,
    MotionAdapter,
)
from diffusers.utils import export_to_video
from peft import PeftModel

device = "cuda"

mcm_id = "yhzhai/mcm"

basedir = os.getcwd()
savedir = os.path.join(
    basedir, "samples", datetime.now().strftime("Gradio-%Y-%m-%dT%H-%M-%S")
)

MAX_SEED = np.iinfo(np.int32).max


def get_modelscope_pipeline(
    mcm_variant: Optional[str] = "WebVid",
):
    """Build a ModelScope T2V pipeline with the requested MCM LoRA merged into its UNet."""
    model_id = "ali-vilab/text-to-video-ms-1.7b"
    pipe = DiffusionPipeline.from_pretrained(
        model_id, torch_dtype=torch.float16, variant="fp16"
    )
    # The LCM scheduler enables the few-step sampling that MCM is distilled for.
    scheduler = LCMScheduler.from_pretrained(
        model_id,
        subfolder="scheduler",
        timestep_scaling=4.0,
    )
    pipe.scheduler = scheduler
    pipe.enable_vae_slicing()

    variant_subfolders = {
        "WebVid": "modelscopet2v-webvid",
        "LAION-aes": "modelscopet2v-laion",
        "Anime": "modelscopet2v-anime",
        "Realistic": "modelscopet2v-real",
        "3D Cartoon": "modelscopet2v-3d-cartoon",
    }
    subfolder = variant_subfolders.get(mcm_variant, "modelscopet2v-laion")

    lora = PeftModel.from_pretrained(
        pipe.unet,
        model_id=mcm_id,
        subfolder=subfolder,
        adapter_name="lora",
        torch_device="cpu",
    )
    lora.merge_and_unload()
    pipe.unet = lora

    pipe = pipe.to(device)
    return pipe


def get_animatediff_pipeline(
    real_variant: Optional[str] = "realvision",
    motion_module_path: str = "guoyww/animatediff-motion-adapter-v1-5-2",
    mcm_variant: Optional[str] = "WebVid",
):
    """Build an AnimateDiff pipeline on an SD1.5 base with the requested MCM LoRA merged into its UNet."""
    if real_variant is None:
        model_id = "runwayml/stable-diffusion-v1-5"
    elif real_variant == "epicrealism":
        model_id = "emilianJR/epiCRealism"
    elif real_variant == "realvision":
        model_id = "SG161222/Realistic_Vision_V6.0_B1_noVAE"
    else:
        raise ValueError(f"Unknown real_variant {real_variant}")

    adapter = MotionAdapter.from_pretrained(motion_module_path)
    pipe = AnimateDiffPipeline.from_pretrained(
        model_id,
        motion_adapter=adapter,
        torch_dtype=torch.float16,
    )
    scheduler = LCMScheduler.from_pretrained(
        model_id,
        subfolder="scheduler",
        timestep_scaling=4.0,
        clip_sample=False,
        timestep_spacing="linspace",
        beta_schedule="linear",
        beta_start=0.00085,
        beta_end=0.012,
        steps_offset=1,
    )
    pipe.scheduler = scheduler
    pipe.enable_vae_slicing()

    # Only WebVid and LAION-aes MCM checkpoints exist for AnimateDiff.
    subfolder = "animatediff-webvid" if mcm_variant == "WebVid" else "animatediff-laion"

    lora = PeftModel.from_pretrained(
        pipe.unet,
        model_id=mcm_id,
        subfolder=subfolder,
        adapter_name="lora",
        torch_device="cpu",
    )
    lora.merge_and_unload()
    pipe.unet = lora

    pipe = pipe.to(device)
    return pipe


cache_pipeline = {
    "base_model": None,
    "variant": None,
    "pipeline": None,
}

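
# Only the most recently used pipeline is kept resident: switching the base
# model or MCM variant in the UI rebuilds the pipeline from scratch inside
# infer(), so the first generation after a switch takes noticeably longer.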
@spaces.GPU(duration=90)
def infer(
    base_model,
    variant,
    prompt,
    num_inference_steps=4,
    height=256,
    width=256,
    seed=0,
    randomize_seed=True,
    # Declaring gr.Progress as a default parameter lets Gradio inject it and
    # surface the diffusers tqdm bars in the UI.
    progress=gr.Progress(track_tqdm=True),
):
    """Generate a 16-frame video and return (saved .mp4 path, seed actually used)."""
    if not (
        cache_pipeline["base_model"] == base_model
        and cache_pipeline["variant"] == variant
    ):
        if base_model == "ModelScope T2V":
            pipeline = get_modelscope_pipeline(mcm_variant=variant)
        elif base_model == "AnimateDiff (SD1.5)":
            pipeline = get_animatediff_pipeline(
                real_variant=None,
                motion_module_path="guoyww/animatediff-motion-adapter-v1-5-2",
                mcm_variant=variant,
            )
        elif base_model == "AnimateDiff (RealisticVision)":
            pipeline = get_animatediff_pipeline(
                real_variant="realvision",
                motion_module_path="guoyww/animatediff-motion-adapter-v1-5-2",
                mcm_variant=variant,
            )
        elif base_model == "AnimateDiff (epiCRealism)":
            pipeline = get_animatediff_pipeline(
                real_variant="epicrealism",
                motion_module_path="guoyww/animatediff-motion-adapter-v1-5-2",
                mcm_variant=variant,
            )
        else:
            raise ValueError(f"Unknown base_model {base_model}")
        cache_pipeline["base_model"] = base_model
        cache_pipeline["variant"] = variant
        cache_pipeline["pipeline"] = pipeline

    if randomize_seed:
        seed = random.randint(0, MAX_SEED)
    generator = torch.Generator("cpu").manual_seed(seed)

    output = cache_pipeline["pipeline"](
        prompt=prompt,
        num_frames=16,
        guidance_scale=1.0,
        num_inference_steps=num_inference_steps,
        height=height,
        width=width,
        generator=generator,
    ).frames
    # Normalize to a list of per-video frame sequences (some pipelines return an array).
    if not isinstance(output, list):
        output = [output[i] for i in range(output.shape[0])]

    os.makedirs(savedir, exist_ok=True)
savedir, f"sample_{base_model}_{variant}_{seed}.mp4".replace(" ", "_") ) export_to_video( output[0], save_path, fps=7, ) print(f"Saved to {save_path}") # pipe_dict[base_model][variant] = pipe_dict[base_model][variant].to("cpu") return save_path, seed examples = [ [ "ModelScope T2V", "LAION-aes", "Aerial uhd 4k view. mid-air flight over fresh and clean mountain river at sunny summer morning. Green trees and sun rays on horizon. Direct on sun.", 4, 256, 256, ], ["ModelScope T2V", "Anime", "Timelapse misty mountain landscape", 4, 256, 256, ], [ "ModelScope T2V", "WebVid", "Back of woman in shorts going near pure creek in beautiful mountains.", 4, 256, 256, ], [ "ModelScope T2V", "3D Cartoon", "A rotating pandoro (a traditional italian sweet yeast bread, most popular around christmas and new year) being eaten in time-lapse.", 4, 256, 256, ], [ "ModelScope T2V", "Realistic", "Slow motion avocado with a stone falls and breaks into 2 parts with splashes", 4, 256, 256, ], [ "AnimateDiff (RealisticVision)", "LAION-aes", "Slow motion of delicious salmon sachimi set with green vegetables leaves served on wood plate. make homemade japanese food at home.-dan", 8, 512, 512, ], [ "AnimateDiff (RealisticVision)", "WebVid", "Blooming meadow panorama zoom-out shot heavenly clouds and upcoming thunderstorm in mountain range harz, germany.", 8, 512, 512, ], [ "AnimateDiff (RealisticVision)", "LAION-aes", "A young woman in a yellow sweater uses vr glasses, sitting on the shore of a pond on a background of dark waves. a strong wind develops her hair, the sun's rays are reflected from the water.", 8, 512, 512, ], [ "AnimateDiff (RealisticVision)", "LAION-aes", "Female running at sunset. healthy fitness concept", 8, 512, 512, ], ] css = """ #col-container { margin: 0 auto; } """ variants = { "ModelScope T2V": ["WebVid", "LAION-aes", "Anime", "Realistic", "3D Cartoon"], "AnimateDiff (SD1.5)": ["WebVid", "LAION-aes"], "AnimateDiff (RealisticVision)": ["WebVid", "LAION-aes"], "AnimateDiff (epiCRealism)": ["WebVid", "LAION-aes"], } def update_variant(rs): return gr.update(choices=variants[rs], value=None) # init_pipelines() with gr.Blocks(css=css) as demo: with gr.Column(elem_id="col-container"): gr.HTML( """
            <p>Currently running on <code>{device}</code>.</p>
            <p>Model loading takes extra time.</p>
""" ) #ModelScope T2V works the best for resolution 256x256, and AnimateDiff works the best for 512x512.
        with gr.Row():
            base_model = gr.Dropdown(
                label="Base model",
                choices=[
                    "ModelScope T2V",
                    "AnimateDiff (SD1.5)",
                    "AnimateDiff (RealisticVision)",
                    "AnimateDiff (epiCRealism)",
                ],
                value="ModelScope T2V",
                interactive=True,
            )
            variant_dropdown = gr.Dropdown(
                variants["ModelScope T2V"],
                label="MCM Variant",
                interactive=True,
                value=None,
            )
            base_model.change(
                update_variant, inputs=[base_model], outputs=[variant_dropdown]
            )

        with gr.Row():
            prompt = gr.Text(
                label="Prompt",
                show_label=False,
                max_lines=1,
                placeholder="Enter your prompt",
                container=False,
            )
            run_button = gr.Button("Run", scale=0)

        with gr.Row():
            with gr.Column():
                with gr.Accordion("Advanced Settings", open=True):
                    seed = gr.Slider(
                        label="Seed",
                        minimum=0,
                        maximum=MAX_SEED,
                        step=1,
                        value=0,
                    )
                    randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
                    with gr.Row():
                        num_inference_steps = gr.Slider(
                            label="Number of inference steps",
                            minimum=1,
                            maximum=12,
                            step=1,
                            value=4,
                        )
                    with gr.Group():
                        with gr.Row():
                            text_hint = gr.Textbox(
                                "Hint: ModelScope T2V works best at 256x256, while AnimateDiff works best at 512x512.",
                                interactive=False,
                                label="Hint",
                                container=False,
                            )
                        with gr.Row():
                            height = gr.Slider(
                                label="Height",
                                minimum=256,
                                maximum=1024,
                                step=64,
                                value=512,
                                interactive=True,
                            )
                            width = gr.Slider(
                                label="Width",
                                minimum=256,
                                maximum=1024,
                                step=64,
                                value=512,
                                interactive=True,
                            )
            with gr.Column():
                result = gr.Video(
                    label="Result",
                    show_label=False,
                    interactive=False,
                    autoplay=True,
                )

        gr.Examples(
            examples=examples,
            inputs=[
                base_model,
                variant_dropdown,
                prompt,
                num_inference_steps,
                height,
                width,
            ],
            fn=infer,
            outputs=[result, seed],
            # Without this, fn/outputs are ignored and clicking an example
            # only fills in the inputs.
            run_on_click=True,
        )

    run_button.click(
        fn=infer,
        inputs=[
            base_model,
            variant_dropdown,
            prompt,
            num_inference_steps,
            height,
            width,
            seed,
            randomize_seed,
        ],
        outputs=[result, seed],
    )

demo.queue().launch()
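
# A minimal local-run sketch (assumptions, not part of the original app: a CUDA
# GPU is available, the `spaces` package acts as a no-op outside Hugging Face
# Spaces, and this file is saved under the hypothetical name app.py):
#   pip install spaces gradio torch diffusers peft numpy
#   python app.py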