from tuneavideo.pipelines.pipeline_tuneavideo import TuneAVideoPipeline
from tuneavideo.models.unet import UNet3DConditionModel
from tuneavideo.util import save_videos_grid
import torch
import gradio as gr


def tune_video_predict(
    prompt: str,
    video_length: int,
    height: int,
    width: int,
    num_inference_steps: int,
    guidance_scale: float,
):
    # Load the fine-tuned 3D UNet and assemble the Tune-A-Video pipeline on top of
    # Stable Diffusion v1-4. Note: the models are (re)loaded on every request; see the
    # commented sketch at the bottom of this file for a module-level variant.
    unet = UNet3DConditionModel.from_pretrained(
        "Tune-A-Video-library/a-man-is-surfing", subfolder="unet", torch_dtype=torch.float16
    ).to("cuda")
    pipe = TuneAVideoPipeline.from_pretrained(
        "CompVis/stable-diffusion-v1-4", unet=unet, torch_dtype=torch.float16
    ).to("cuda")

    # Generate the video frames and save them as a GIF.
    video = pipe(
        prompt,
        video_length=video_length,
        height=height,
        width=width,
        num_inference_steps=num_inference_steps,
        guidance_scale=guidance_scale,
    ).videos
    output_path = save_videos_grid(video, save_path="output", path=f"{prompt}.gif")
    return output_path


demo_inputs = [
    gr.inputs.Textbox(
        label="Prompt",
        default="a panda is surfing",
    ),
    gr.inputs.Slider(
        label="Video Length",
        minimum=1,
        maximum=50,
        default=4,
        step=1,
    ),
    gr.inputs.Slider(
        label="Height",
        minimum=128,
        maximum=1280,
        default=128,
        step=32,
    ),
    gr.inputs.Slider(
        label="Width",
        minimum=128,
        maximum=1280,
        default=128,
        step=32,
    ),
    gr.inputs.Slider(
        label="Num Inference Steps",
        minimum=1,
        maximum=100,
        default=10,
        step=1,
    ),
    gr.inputs.Slider(
        label="Guidance Scale",
        minimum=0.0,
        maximum=50,
        default=7.5,
        step=0.5,
    ),
]

demo_outputs = gr.outputs.Video(type="gif", label="Output")

examples = [
    ["a panda is surfing", 4, 128, 128, 10, 7.5],
]

description = (
    "This demo generates a video from an input text prompt, using "
    "[one-shot tuning of diffusion models](https://arxiv.org/abs/2212.11565). "
    "To use it, simply enter a text prompt."
)

demo_app = gr.Interface(
    fn=tune_video_predict,
    inputs=demo_inputs,
    outputs=demo_outputs,
    examples=examples,
    cache_examples=False,
    title="Tune-A-Video",
    theme="huggingface",
    description=description,
)
demo_app.launch(debug=True, enable_queue=True)
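
# Optional variant (sketch, not used above): load the UNet and pipeline once at module
# scope so they are not re-initialized on every Gradio request. This assumes the same
# checkpoints as above and enough GPU memory to keep the pipeline resident between
# calls; `tune_video_predict` would then reuse the shared `pipe` object instead of
# building its own.
#
# unet = UNet3DConditionModel.from_pretrained(
#     "Tune-A-Video-library/a-man-is-surfing", subfolder="unet", torch_dtype=torch.float16
# ).to("cuda")
# pipe = TuneAVideoPipeline.from_pretrained(
#     "CompVis/stable-diffusion-v1-4", unet=unet, torch_dtype=torch.float16
# ).to("cuda")
#
# def tune_video_predict(prompt, video_length, height, width, num_inference_steps, guidance_scale):
#     video = pipe(
#         prompt,
#         video_length=video_length,
#         height=height,
#         width=width,
#         num_inference_steps=num_inference_steps,
#         guidance_scale=guidance_scale,
#     ).videos
#     return save_videos_grid(video, save_path="output", path=f"{prompt}.gif")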