#!/usr/bin/env python

from __future__ import annotations

import os
import sys
import warnings

# One-time installation of the GroundedSAM experts (normally handled by the
# environment setup rather than at app start-up):
# os.system("cd Make-A-Protagonist/experts/GroundedSAM")
# os.system("python -m pip install -e segment_anything")
# os.system("python -m pip install -e GroundingDINO")
# os.system("cd ../../..")
# os.system("python -m pip install -e Make-A-Protagonist/experts/GroundedSAM/GroundingDINO")
# os.system("pip install --upgrade diffusers[torch]")

warnings.filterwarnings("ignore")

import gradio as gr

from inference import InferencePipeline


class InferenceUtil:

    def __init__(self, hf_token: str | None):
        self.hf_token = hf_token

    def load_model_info(self, model_id: str) -> tuple[str, str]:
        # TODO: the model card is the README of the Hugging Face model repo;
        # document the custom metadata fields it is expected to contain.
        try:
            card = InferencePipeline.get_model_card(model_id, self.hf_token)
        except Exception:
            return '', ''
        protagonist = getattr(card.data, 'protagonist', '')
        training_prompt = getattr(card.data, 'training_prompt', '')
        return protagonist, training_prompt
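
# Note: `InferencePipeline.get_model_card` is implemented in inference.py. As a
# rough, illustrative sketch (an assumption, not the confirmed implementation),
# it could simply wrap `huggingface_hub`, in which case the `protagonist` and
# `training_prompt` fields read above come from the YAML front matter of the
# model repo's README.md:
#
#     from huggingface_hub import ModelCard
#
#     def get_model_card(model_id: str, hf_token: str | None = None) -> ModelCard:
#         return ModelCard.load(model_id, token=hf_token)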

HF_TOKEN = os.getenv('HF_TOKEN')
pipe = InferencePipeline(HF_TOKEN)
app = InferenceUtil(HF_TOKEN)

with gr.Blocks(css='style.css') as demo:
    gr.HTML(
        """
        <div style="text-align: center;">
            <h1>Make-A-Protagonist:<br>
                Generic Video Editing with An Ensemble of Experts</h1>
            <p>
                Yuyang Zhao<sup>1</sup> &nbsp; Enze Xie<sup>2</sup> &nbsp;
                Lanqing Hong<sup>2</sup> &nbsp; Zhenguo Li<sup>2</sup> &nbsp;
                Gim Hee Lee<sup>1</sup>
            </p>
            <p>
                <sup>1</sup> National University of Singapore &nbsp;&nbsp;
                <sup>2</sup> Huawei Noah's Ark Lab
            </p>
            <p>[ arXiv ] &nbsp; [ Code ] &nbsp; [ Homepage ]</p>
            <p><b>TL;DR:</b> The first framework for generic video editing with
               both visual and textual clues.</p>
        </div>
        """)
    gr.HTML(
        """

        <div>
            <p>We provide a Demo Guidance to help users choose hyperparameters
               when editing videos.</p>
            <p>You may duplicate this Space and upgrade to a GPU for better
               performance and faster inference, without waiting in the queue.</p>
            <p>Alternatively, try our GitHub code on your own GPU.</p>
        </div>
        """)

    with gr.Row():
        with gr.Column():
            with gr.Box():
                model_id = gr.Dropdown(
                    label='Model ID',
                    choices=[
                        'Make-A-Protagonist/ikun',
                        'Make-A-Protagonist/huaqiang',
                        'Make-A-Protagonist/yanzi',
                        'Make-A-Protagonist/car-turn',
                    ],
                    value='Make-A-Protagonist/ikun')
                with gr.Row():
                    protagonist_used_for_training = gr.Textbox(
                        label='Protagonist', interactive=False, value='man')
                    prompt_used_for_training = gr.Textbox(
                        label='Training prompt',
                        interactive=False,
                        value='A man is playing basketball')
            with gr.Box():
                ref_image = gr.Image(label='Reference Image',
                                     type='pil',
                                     visible=True).style(height='auto')
                ref_pro_prompt = gr.Textbox(
                    label='Reference Image Protagonist Prompt',
                    max_lines=1,
                    placeholder='Example: "man"')
            prompt = gr.Textbox(label='Prompt',
                                max_lines=1,
                                placeholder='Example: "A panda is surfing"')
            video_length = gr.Slider(label='Video length',
                                     minimum=4,
                                     maximum=6,
                                     step=1,
                                     value=6)
            fps = gr.Slider(label='FPS', minimum=1, maximum=6, step=1, value=3)
            seed = gr.Slider(label='Seed',
                             minimum=0,
                             maximum=100000,
                             step=1,
                             value=0)
            with gr.Accordion('ControlNet Parameters', open=True):
                control_pose = gr.Slider(label='Pose',
                                         minimum=0,
                                         maximum=1,
                                         step=0.1,
                                         value=0.5)
                control_depth = gr.Slider(label='Depth',
                                          minimum=0,
                                          maximum=1,
                                          step=0.1,
                                          value=0.5)
            with gr.Accordion('Editing Function', open=True):
                # Set a slider to 1 to keep the corresponding part of the
                # source video (protagonist / background) during editing.
                with gr.Row():
                    source_pro = gr.Slider(label='Source Protagonist',
                                           minimum=0,
                                           maximum=1,
                                           step=1,
                                           value=0)
                    source_bg = gr.Slider(label='Source Background',
                                          minimum=0,
                                          maximum=1,
                                          step=1,
                                          value=0)
            with gr.Accordion('Other Parameters', open=False):
                num_steps = gr.Slider(label='Number of Steps',
                                      minimum=0,
                                      maximum=100,
                                      step=1,
                                      value=50)
                start_step = gr.Slider(label='Mask Starting Step',
                                       minimum=0,
                                       maximum=100,
                                       step=1,
                                       value=0)
                guidance_scale = gr.Slider(label='CFG Scale',
                                           minimum=0,
                                           maximum=50,
                                           step=0.1,
                                           value=12.5)
                noise_level = gr.Slider(label='Noise Level',
                                        minimum=0,
                                        maximum=999,
                                        step=1,
                                        value=0)

            run_button = gr.Button('Generate')

            gr.Markdown('''
                - It takes a few minutes to download the model the first time.
                - It takes about one minute to load the model and run DDIM inversion.
                ''')
        with gr.Column():
            result = gr.Video(label='Result')

    with gr.Row():
        examples = [
            [
                'Make-A-Protagonist/ikun',
                'A man is playing basketball on the beach, anime style.',
                6, 3, 33, 50, 12.5,
                'data/ikun/reference_images/zhongli.jpg', 'man',
                0, 0, 0.5, 0.5, 0, 0
            ],
            [
                'Make-A-Protagonist/huaqiang',
                'Elon Musk walking down the street.',
                6, 3, 33, 50, 12.5,
                'data/huaqiang/reference_images/musk.jpg', 'man',
                0, 0, 0.5, 0.5, 0, 1
            ],
            [
                'Make-A-Protagonist/yanzi',
                'A panda walking down the snowy street.',
                6, 3, 33, 50, 12.5,
                'data/yanzi/reference_images/panda.jpeg', 'panda',
                0, 0, 0.5, 0.5, 0, 0
            ],
            [
                'Make-A-Protagonist/car-turn',
                'A car moving in the desert.',
                6, 3, 33, 50, 12.5,
                'data/car-turn/reference_images/audi.jpeg', 'car',
                0, 0, 0.0, 1.0, 0, 0
            ],
            [
                'Make-A-Protagonist/car-turn',
                'A Suzuki Jimny driving down a mountain road in the rain.',
                6, 3, 33, 50, 12.5,
                'data/car-turn/images/0000.jpg', 'car',
                0, 0, 0.0, 1.0, 1, 0
            ],
        ]
        gr.Examples(examples=examples,
                    inputs=[
                        model_id,
                        prompt,
                        video_length,
                        fps,
                        seed,
                        num_steps,
                        guidance_scale,
                        ref_image,
                        ref_pro_prompt,
                        noise_level,
                        start_step,
                        control_pose,
                        control_depth,
                        source_pro,
                        source_bg,
                    ],
                    outputs=result,
                    fn=pipe.run,
                    cache_examples=os.getenv('SYSTEM') == 'spaces')

    # Refresh the protagonist / training-prompt boxes when the model changes.
    model_id.change(fn=app.load_model_info,
                    inputs=model_id,
                    outputs=[
                        protagonist_used_for_training,
                        prompt_used_for_training,
                    ])

    inputs = [
        model_id,
        prompt,
        video_length,
        fps,
        seed,
        num_steps,
        guidance_scale,
        ref_image,
        ref_pro_prompt,
        noise_level,
        start_step,
        control_pose,
        control_depth,
        source_pro,
        source_bg,
    ]
    prompt.submit(fn=pipe.run, inputs=inputs, outputs=result)
    run_button.click(fn=pipe.run, inputs=inputs, outputs=result)

demo.queue().launch()
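
# --- Illustrative usage sketch (not part of the original app) -----------------
# The Gradio `inputs` list above wires `pipe.run` to fifteen positional
# arguments in this order: model_id, prompt, video_length, fps, seed,
# num_steps, guidance_scale, ref_image, ref_pro_prompt, noise_level,
# start_step, control_pose, control_depth, source_pro, source_bg.
# A hedged sketch of calling it programmatically with the first example row
# (assuming `run` accepts a PIL image for `ref_image`, matching the
# `type='pil'` Image component above, and returns the value shown in the
# Result video component):
#
#     from PIL import Image
#
#     result_video = pipe.run(
#         'Make-A-Protagonist/ikun',
#         'A man is playing basketball on the beach, anime style.',
#         6, 3, 33, 50, 12.5,
#         Image.open('data/ikun/reference_images/zhongli.jpg'),
#         'man', 0, 0, 0.5, 0.5, 0, 0)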