import re

from tuneavideo.pipelines.pipeline_tuneavideo import TuneAVideoPipeline
from tuneavideo.models.unet import UNet3DConditionModel
from tuneavideo.util import save_videos_grid
import torch
import gradio as gr

# Anything outside word characters, dot and dash is collapsed to "_" so a
# user-supplied prompt can never introduce path separators or other
# filesystem-hostile characters into the output filename.
_UNSAFE_FILENAME_CHARS = re.compile(r"[^\w.-]+")

# Cap on the filename stem, in characters, so long prompts do not exceed
# common per-component filename limits (255 bytes on most filesystems).
_MAX_FILENAME_STEM = 60


def _prompt_to_filename(prompt: str) -> str:
    """Return a filesystem-safe ``.gif`` filename derived from *prompt*."""
    stem = _UNSAFE_FILENAME_CHARS.sub("_", prompt)
    # Drop leading/trailing dots and underscores: avoids hidden files and
    # names such as ".." after substitution.
    stem = stem.strip("._")[:_MAX_FILENAME_STEM]
    # Empty or punctuation-only prompts fall back to a fixed name.
    return f"{stem or 'video'}.gif"


def tune_video_predict(
    pipe_id: str,
    prompt: str,
    video_length: int,
    height: int,
    width: int,
    num_inference_steps: int,
    guidance_scale: float,
):
    """Generate a video for *prompt* and save it as a GIF.

    Args:
        pipe_id: Model id passed to ``TuneAVideoPipeline.from_pretrained``
            for every pipeline component except the UNet.
        prompt: Text prompt forwarded to the pipeline; also used (after
            sanitising) to name the output file.
        video_length: Forwarded to the pipeline as ``video_length``.
        height: Forwarded to the pipeline as ``height``.
        width: Forwarded to the pipeline as ``width``.
        num_inference_steps: Forwarded to the pipeline as
            ``num_inference_steps``.
        guidance_scale: Forwarded to the pipeline as ``guidance_scale``.

    Returns:
        Whatever ``save_videos_grid`` returns; it is handed to the Gradio
        video output, so presumably the path of the written GIF (confirm
        against ``tuneavideo.util``).
    """
    # The UNet always comes from the "a-man-is-surfing" checkpoint,
    # independent of ``pipe_id``; ``pipe_id`` only selects the rest of the
    # pipeline. Both are loaded in fp16 on CUDA on every call.
    unet = UNet3DConditionModel.from_pretrained(
        "Tune-A-Video-library/a-man-is-surfing",
        subfolder='unet',
        torch_dtype=torch.float16,
    ).to('cuda')
    pipe = TuneAVideoPipeline.from_pretrained(
        pipe_id,
        unet=unet,
        torch_dtype=torch.float16,
    ).to("cuda")

    # The annotated types are int; coerce defensively in case the UI
    # delivers slider values as floats (e.g. 416.0).
    video = pipe(
        prompt,
        video_length=int(video_length),
        height=int(height),
        width=int(width),
        num_inference_steps=int(num_inference_steps),
        guidance_scale=guidance_scale,
    ).videos

    # The prompt is untrusted web input: never interpolate it into a path
    # verbatim.
    output_path = save_videos_grid(
        video,
        save_path='output',
        path=_prompt_to_filename(prompt),
    )
    return output_path


# Input widgets, in the same order as the parameters of tune_video_predict.
demo_inputs = [
    gr.inputs.Dropdown(
        label="Model",
        choices=[
            "Tune-A-Video-library/a-man-is-surfing",
            "sd-dreambooth-library/mr-potato-head",
        ]
    ),
    gr.inputs.Textbox(
        label="Prompt",
        default='a flower blooming'
    ),
    gr.inputs.Slider(
        label="Video Length",
        minimum=1,
        maximum=50,
        default=8,
        step=1,
    ),
    gr.inputs.Slider(
        label="Height",
        minimum=128,
        maximum=1280,
        default=416,
        step=32,
    ),
    gr.inputs.Slider(
        label="Width",
        minimum=128,
        maximum=1280,
        default=416,
        step=32,
    ),
    gr.inputs.Slider(
        label="Num Inference Steps",
        minimum=1,
        maximum=100,
        default=50,
        step=1,
    ),
    gr.inputs.Slider(
        label="Guidance Scale",
        minimum=0.0,
        maximum=100,
        default=7.5,
        step=0.5,
    )
]

demo_outputs = gr.outputs.Video(type="gif", label="Output")

# Each example row matches the order of demo_inputs.
examples = [
    ["Tune-A-Video-library/a-man-is-surfing", "a panda is surfing", 5, 416, 416, 50, 7.5],
    ["Tune-A-Video-library/a-man-is-surfing", "a flower blooming", 5, 416, 416, 50, 7.5],
    ["sd-dreambooth-library/mr-potato-head", "sks mr potato head, wearing a pink hat, is surfing.", 5, 416, 416, 50, 7.5],
    ["sd-dreambooth-library/mr-potato-head", "sks mr potato head is surfing in the forest.", 5, 416, 416, 50, 7.5],
]

description = "This generates video from an input text, using [one-shot tuning of diffusion models](https://arxiv.org/abs/2212.11565). To use it, simply input a text."

demo_app = gr.Interface(
    fn=tune_video_predict,
    inputs=demo_inputs,
    outputs=demo_outputs,
    examples=examples,
    cache_examples=False,
    title="Tune-A-Video",
    theme="huggingface",
    description=description
)

demo_app.launch(debug=True, enable_queue=True)