"""Gradio demo that generates short GIF videos from text with Tune-A-Video.

At import time the script scrapes the Hugging Face model search pages for
"dreambooth" models to populate the model dropdown, builds the Gradio
interface, and launches it.
"""
import logging
import re

import gradio as gr
import requests
import torch
from bs4 import BeautifulSoup

from tuneavideo.models.unet import UNet3DConditionModel
from tuneavideo.pipelines.pipeline_tuneavideo import TuneAVideoPipeline
from tuneavideo.util import save_videos_grid

logger = logging.getLogger(__name__)

# Seconds to wait for each search page before giving up on it.
_REQUEST_TIMEOUT = 10

# CSS class of the <div> that wraps the model cards on a search result page.
_RESULTS_DIV_CLASS = 'grid gap-5 grid-cols-1 2xl:grid-cols-2'

# Upper bound on the prompt-derived part of the output filename, chosen to
# stay well below common 255-byte filename limits even for multi-byte text.
_MAX_FILENAME_CHARS = 60


def model_url_list():
    """Return the URLs of search result pages 1 through 8 for "dreambooth"."""
    return [
        f"https://huggingface.co/models?p={i}&sort=downloads&search=dreambooth"
        for i in range(1, 9)
    ]


def data_scraping(url_list):
    """Collect model ids linked from the given model search pages.

    Pages that cannot be fetched, return an HTTP error status, or do not
    contain the expected results container are skipped with a warning
    instead of aborting the whole scrape.

    Args:
        url_list: Iterable of search page URLs to fetch.

    Returns:
        A list of the link targets found inside each page's results
        container, with leading/trailing slashes removed, in page order.
    """
    model_list = []
    for url in url_list:
        try:
            response = requests.get(url, timeout=_REQUEST_TIMEOUT)
            response.raise_for_status()
        except requests.RequestException as err:
            logger.warning("Skipping model page %s: %s", url, err)
            continue

        soup = BeautifulSoup(response.text, "html.parser")
        div = soup.find('div', {'class': _RESULTS_DIV_CLASS})
        if div is None:
            # find() returns None when the page layout differs from what
            # this scraper expects (e.g. markup change or an error page).
            logger.warning("No model list container found on %s", url)
            continue

        for a in div.find_all('a', href=True):
            # The pipeline loader is given ids of the form "owner/name"
            # (see `examples` below); strip the slashes of site-relative
            # links so dropdown choices match that form.
            repo_id = a['href'].strip('/')
            if repo_id:
                model_list.append(repo_id)
    return model_list


# Scraped once at start-up; used as the choices of the "Model" dropdown.
model_list = data_scraping(model_url_list())


def _safe_filename(text, fallback="output"):
    """Turn free-form text into a filesystem-safe, bounded-length file stem."""
    cleaned = re.sub(r"[^\w\-]+", "_", text).strip("_")
    return cleaned[:_MAX_FILENAME_CHARS] or fallback


def tune_video_predict(
    pipe_id: str,
    prompt: str,
    video_length: int,
    height: int,
    width: int,
    num_inference_steps: int,
    guidance_scale: float,
):
    """Generate a video for ``prompt`` and save it as a GIF.

    The UNet is always loaded from "Tune-A-Video-library/a-man-is-surfing";
    the remaining pipeline components are loaded from ``pipe_id``. Both are
    loaded in float16 on the CUDA device on every call.

    Args:
        pipe_id: Model id (or path) passed to
            ``TuneAVideoPipeline.from_pretrained``.
        prompt: Text prompt describing the video.
        video_length: Value forwarded as the pipeline's ``video_length``.
        height: Value forwarded as the pipeline's ``height``.
        width: Value forwarded as the pipeline's ``width``.
        num_inference_steps: Value forwarded as ``num_inference_steps``.
        guidance_scale: Value forwarded as ``guidance_scale``.

    Returns:
        Whatever ``save_videos_grid`` returns; this script treats it as the
        path of the written GIF and hands it to the Gradio video output.
    """
    unet = UNet3DConditionModel.from_pretrained(
        "Tune-A-Video-library/a-man-is-surfing",
        subfolder='unet',
        torch_dtype=torch.float16,
    ).to('cuda')
    pipe = TuneAVideoPipeline.from_pretrained(
        pipe_id,
        unet=unet,
        torch_dtype=torch.float16,
    ).to("cuda")

    # UI sliders may deliver whole numbers as floats; the integer-typed
    # parameters are normalised before reaching the pipeline.
    video = pipe(
        prompt,
        video_length=int(video_length),
        height=int(height),
        width=int(width),
        num_inference_steps=int(num_inference_steps),
        guidance_scale=guidance_scale,
    ).videos

    # The prompt is user input: never use it verbatim as a file name, since
    # path separators, "..", or very long text would break or escape the
    # output directory.
    output_path = save_videos_grid(
        video,
        save_path='output',
        path=f"{_safe_filename(prompt)}.gif",
    )
    return output_path


# Input widgets, in the same order as the parameters of tune_video_predict.
demo_inputs = [
    gr.inputs.Dropdown(
        label="Model",
        choices=model_list,
    ),
    gr.inputs.Textbox(
        label="Prompt",
        default='a flower blooming',
    ),
    gr.inputs.Slider(
        label="Video Length",
        minimum=1,
        maximum=50,
        default=8,
        step=1,
    ),
    gr.inputs.Slider(
        label="Height",
        minimum=128,
        maximum=1280,
        default=416,
        step=32,
    ),
    gr.inputs.Slider(
        label="Width",
        minimum=128,
        maximum=1280,
        default=416,
        step=32,
    ),
    gr.inputs.Slider(
        label="Num Inference Steps",
        minimum=1,
        maximum=100,
        default=50,
        step=1,
    ),
    gr.inputs.Slider(
        label="Guidance Scale",
        minimum=0.0,
        maximum=100,
        default=7.5,
        step=0.5,
    ),
]

demo_outputs = gr.outputs.Video(type="gif", label="Output")

# Each row: model id, prompt, video length, height, width, steps, guidance.
examples = [
    ["Tune-A-Video-library/a-man-is-surfing", "a panda is surfing", 5, 416, 416, 50, 7.5],
    ["Tune-A-Video-library/a-man-is-surfing", "a flower blooming", 5, 416, 416, 50, 7.5],
    ["sd-dreambooth-library/mr-potato-head", "sks mr potato head, wearing a pink hat, is surfing.", 5, 416, 416, 50, 7.5],
    ["sd-dreambooth-library/mr-potato-head", "sks mr potato head is surfing in the forest.", 5, 416, 416, 50, 7.5],
]

description = "This generates video from an input text, using [one-shot tuning of diffusion models](https://arxiv.org/abs/2212.11565). To use it, simply input a text."

demo_app = gr.Interface(
    fn=tune_video_predict,
    inputs=demo_inputs,
    outputs=demo_outputs,
    examples=examples,
    cache_examples=False,
    title="Tune-A-Video",
    theme="huggingface",
    description=description,
)

demo_app.launch(debug=True, enable_queue=True)