# Gradio demo for Tune-A-Video: generates a short video from a text prompt by
# plugging a one-shot-tuned 3D UNet into a user-selected Stable Diffusion checkpoint.
from tuneavideo.pipelines.pipeline_tuneavideo import TuneAVideoPipeline
from tuneavideo.models.unet import UNet3DConditionModel
from tuneavideo.util import save_videos_grid
import torch
import gradio as gr
from bs4 import BeautifulSoup
import requests


def model_url_list():
    # First five pages of the Hugging Face Hub search for Dreambooth models, sorted by downloads.
    url_list = []
    for i in range(0, 5):
        url_list.append(f"https://huggingface.co/models?p={i}&sort=downloads&search=dreambooth")
    return url_list


def data_scraping(url_list):
    # Scrape model ids from the search result pages.
    model_list = []
    for url in url_list:
        response = requests.get(url)
        soup = BeautifulSoup(response.text, "html.parser")
        div_class = 'grid grid-cols-1 gap-5 2xl:grid-cols-2'
        div = soup.find('div', {'class': div_class})
        for a in div.find_all('a', href=True):
            model_list.append(a['href'])
    return model_list


model_list = data_scraping(model_url_list())
# Scraped hrefs look like "/owner/model"; drop the leading slash to get "owner/model" ids.
for i in range(len(model_list)):
    model_list[i] = model_list[i][1:]

# Curated base models pinned to the top of the dropdown.
best_model_list = [
    "runwayml/stable-diffusion-v1-5",
    "CompVis/stable-diffusion-v1-4",
    "prompthero/openjourney",
    "dreamlike-art/dreamlike-photoreal-2.0",
    "dreamlike-art/dreamlike-diffusion-1.0",
]

model_list = best_model_list + model_list


def tune_video_predict(
    pipe_id: str,
    prompt: str,
    video_length: int,
    height: int,
    width: int,
    num_inference_steps: int,
    guidance_scale: float,
):
    # Load the one-shot-tuned 3D UNet and combine it with the selected base model.
    unet = UNet3DConditionModel.from_pretrained(
        "Tune-A-Video-library/a-man-is-surfing", subfolder="unet", torch_dtype=torch.float16
    ).to("cuda")
    pipe = TuneAVideoPipeline.from_pretrained(pipe_id, unet=unet, torch_dtype=torch.float16).to("cuda")
    video = pipe(
        prompt,
        video_length=video_length,
        height=height,
        width=width,
        num_inference_steps=num_inference_steps,
        guidance_scale=guidance_scale,
    ).videos
    output_path = save_videos_grid(video, save_path="output", path=f"{prompt}.gif")
    return output_path


# gr.inputs / gr.outputs are the legacy component aliases (deprecated in newer Gradio releases).
demo_inputs = [
    gr.Dropdown(
        label="Model",
        choices=model_list,
        value="CompVis/stable-diffusion-v1-4",
    ),
    gr.inputs.Textbox(
        label="Prompt",
        default="a flower blooming",
    ),
    gr.inputs.Slider(
        label="Video Length",
        minimum=1,
        maximum=50,
        default=8,
        step=1,
    ),
    gr.inputs.Slider(
        label="Height",
        minimum=128,
        maximum=1280,
        default=416,
        step=32,
    ),
    gr.inputs.Slider(
        label="Width",
        minimum=128,
        maximum=1280,
        default=416,
        step=32,
    ),
    gr.inputs.Slider(
        label="Num Inference Steps",
        minimum=1,
        maximum=100,
        default=50,
        step=1,
    ),
    gr.inputs.Slider(
        label="Guidance Scale",
        minimum=0.0,
        maximum=100,
        default=7.5,
        step=0.5,
    ),
]

demo_outputs = gr.outputs.Video(type="gif", label="Output")

examples = [
    ["CompVis/stable-diffusion-v1-4", "a panda is surfing", 5, 416, 416, 50, 7.5],
    ["sd-dreambooth-library/disco-diffusion-style", "ddfusion style on the church", 5, 416, 416, 50, 7.5],
    # ["sd-dreambooth-library/nasa-space-v2-768", "nasa style galaxy moving", 5, 416, 416, 50, 7.5],
    ["sd-dreambooth-library/mr-potato-head", "sks mr potato head, wearing a pink hat, is surfing.", 5, 416, 416, 50, 7.5],
    ["sd-dreambooth-library/mr-potato-head", "sks mr potato head is surfing in the forest.", 5, 416, 416, 50, 7.5],
]

description = (
    "This application generates a video from a text prompt. To get started, simply enter a prompt. "
    "The default model in the dropdown is a generic model with which you can generate anything. "
    "Alternatively, for more photorealistic generations, you can pick one of the other models in the dropdown. "
    "These are Dreambooth models trained on a specific object name, so make sure you know what the object is called. "
    "You can find example prompts for Dreambooth models in the Examples section right below the interface."
)
title = "Tune-A-Video: One-Shot Tuning of Image Diffusion Models for Text-to-Video Generation" demo_app = gr.Interface( fn=tune_video_predict, inputs=demo_inputs, outputs=demo_outputs, examples=examples, cache_examples=True, title=title, theme="huggingface", description=description ) demo_app.launch(debug=True, enable_queue=True)