"""Gradio demo for Tune-A-Video text-to-video generation.

Builds a ``gr.Interface`` around :func:`tune_video_predict`, which loads a
Tune-A-Video UNet plus a user-selected Stable Diffusion checkpoint, renders a
short clip for a text prompt and returns the path of the saved GIF.
"""
import re

import gradio as gr
import torch

from tuneavideo.models.unet import UNet3DConditionModel
from tuneavideo.pipelines.pipeline_tuneavideo import TuneAVideoPipeline
from tuneavideo.util import save_videos_grid

# Base checkpoints offered in the "Model" dropdown.
model_list = [
    "runwayml/stable-diffusion-v1-5",
    "CompVis/stable-diffusion-v1-4",
    "prompthero/openjourney",
    "dreamlike-art/dreamlike-photoreal-2.0",
    "dreamlike-art/dreamlike-diffusion-1.0",
    # Dreambooth checkpoints used by the examples below; listed here so that
    # picking an example yields a value that is actually present in the
    # dropdown, as the description text promises.
    "sd-dreambooth-library/disco-diffusion-style",
    "sd-dreambooth-library/mr-potato-head",
]

# Tune-A-Video UNet weights that replace the base model's UNet.
_UNET_CHECKPOINT = "Tune-A-Video-library/a-man-is-surfing"

# Characters that are path separators or otherwise invalid in file names on
# common filesystems, plus ASCII control characters.
_UNSAFE_FILENAME_CHARS = re.compile(r'[<>:"/\\|?*\x00-\x1f]')

# Keeps the final name (stem + ".gif") below the usual 255-byte limit.
_MAX_FILENAME_BYTES = 200


def _safe_gif_name(prompt: str) -> str:
    """Return a ``.gif`` file name derived from *prompt* that is safe to write.

    Path separators and other filesystem-hostile characters are replaced with
    ``_`` so a prompt cannot point outside the output directory; the stem is
    truncated to ``_MAX_FILENAME_BYTES`` UTF-8 bytes, and an empty result
    falls back to ``output``. Ordinary prompts keep their original name.
    """
    stem = _UNSAFE_FILENAME_CHARS.sub("_", prompt).strip()
    # Truncate on bytes (not characters); "ignore" drops a multi-byte
    # character that the cut may have split in half.
    stem = stem.encode("utf-8")[:_MAX_FILENAME_BYTES].decode("utf-8", "ignore")
    return f"{stem or 'output'}.gif"


def tune_video_predict(
    pipe_id: str,
    prompt: str,
    video_length: int,
    height: int,
    width: int,
    num_inference_steps: int,
    guidance_scale: float,
):
    """Generate a video for *prompt* and return the path of the saved GIF.

    Args:
        pipe_id: Hub id of the base checkpoint passed to
            ``TuneAVideoPipeline.from_pretrained``.
        prompt: Text prompt; also used (sanitised) as the output file name.
        video_length: Forwarded to the pipeline as ``video_length``.
        height: Forwarded to the pipeline as ``height``.
        width: Forwarded to the pipeline as ``width``.
        num_inference_steps: Forwarded to the pipeline unchanged.
        guidance_scale: Forwarded to the pipeline unchanged.

    Returns:
        Whatever ``save_videos_grid`` returns; the interface treats it as the
        path of the written GIF.
        NOTE(review): relies on this project's ``save_videos_grid`` accepting
        ``save_path``/``path`` and returning the file path -- confirm.
    """
    # Both models are loaded in half precision and moved to the CUDA device on
    # every call, so a CUDA-capable GPU is required.
    unet = UNet3DConditionModel.from_pretrained(
        _UNET_CHECKPOINT,
        subfolder="unet",
        torch_dtype=torch.float16,
    ).to("cuda")
    pipe = TuneAVideoPipeline.from_pretrained(
        pipe_id,
        unet=unet,
        torch_dtype=torch.float16,
    ).to("cuda")

    video = pipe(
        prompt,
        video_length=video_length,
        height=height,
        width=width,
        num_inference_steps=num_inference_steps,
        guidance_scale=guidance_scale,
    ).videos

    # The prompt comes straight from the web UI, so it must not be used as a
    # file name verbatim (e.g. "a/b" or "../x" would leave the output folder).
    output_path = save_videos_grid(
        video,
        save_path="output",
        path=_safe_gif_name(prompt),
    )
    return output_path


# Input widgets, in the same order as the parameters of tune_video_predict.
demo_inputs = [
    gr.Dropdown(
        label="Model",
        choices=model_list,
        value="CompVis/stable-diffusion-v1-4",
    ),
    gr.Textbox(
        label="Prompt",
        value="a flower blooming",
    ),
    gr.Slider(
        label="Video Length",
        minimum=1,
        maximum=50,
        value=8,
        step=1,
    ),
    gr.Slider(
        label="Height",
        minimum=128,
        maximum=1280,
        value=416,
        step=32,
    ),
    gr.Slider(
        label="Width",
        minimum=128,
        maximum=1280,
        value=416,
        step=32,
    ),
    gr.Slider(
        label="Num Inference Steps",
        minimum=1,
        maximum=100,
        value=50,
        step=1,
    ),
    gr.Slider(
        label="Guidance Scale",
        minimum=0.0,
        maximum=100,
        value=7.5,
        step=0.5,
    ),
]

demo_outputs = gr.outputs.Video(type="gif", label="Output")

# Each row supplies one value per entry of demo_inputs.
examples = [
    ["CompVis/stable-diffusion-v1-4", "a panda is surfing", 5, 416, 416, 50, 7.5],
    ["sd-dreambooth-library/disco-diffusion-style", "ddfusion style on the church", 5, 416, 416, 50, 7.5],
    # Disabled example, kept for reference:
    # ["sd-dreambooth-library/nasa-space-v2-768", "nasa style galaxy moving", 5, 416, 416, 50, 7.5],
    ["sd-dreambooth-library/mr-potato-head", "sks mr potato head, wearing a pink hat, is surfing.", 5, 416, 416, 50, 7.5],
    ["sd-dreambooth-library/mr-potato-head", "sks mr potato head is surfing in the forest.", 5, 416, 416, 50, 7.5],
]

description = (
    "This is an application that generates video based on a text prompt. "
    "To get started, simply input text. "
    "The default model in the dropdown is a generic model that you can generate anything. "
    "Alternatively, for more photorealistic generations, you can use other models in the dropdown. "
    "These models are Dreambooth models, and they're trained with a specific object name, "
    "so make sure you know what the object is called. "
    "You can find an example prompt for a dreambooth model in Examples section right below the interface."
)
title = "Tune-A-Video: One-Shot Tuning of Image Diffusion Models for Text-to-Video Generation"

demo_app = gr.Interface(
    fn=tune_video_predict,
    inputs=demo_inputs,
    outputs=demo_outputs,
    examples=examples,
    cache_examples=False,
    title=title,
    theme="huggingface",
    description=description,
)

demo_app.launch(debug=True, enable_queue=True)