#!/usr/bin/env python
"""Gradio demo app for Make-A-Protagonist.

Builds the web UI, wires its controls to ``InferencePipeline.run`` and
launches the app with a public share link.
"""

from __future__ import annotations

import os
import sys
import warnings

# NOTE: manual setup commands kept for reference (they are not executed here):
#   python -m pip install -e Make-A-Protagonist/experts/GroundedSAM/segment_anything
#   python -m pip install -e Make-A-Protagonist/experts/GroundedSAM/GroundingDINO
#   pip install --upgrade diffusers[torch]

# Runs before the imports below, so warnings emitted while importing them are
# silenced too.
warnings.filterwarnings("ignore")

import gradio as gr

from inference import InferencePipeline


class InferenceUtil:
    """Looks up model-card metadata for the model selected in the UI."""

    def __init__(self, hf_token: str | None):
        # Token forwarded to InferencePipeline.get_model_card; may be None.
        self.hf_token = hf_token

    def load_model_info(self, model_id: str) -> tuple[str, str]:
        """Return ``(protagonist, training_prompt)`` for ``model_id``.

        The values are read from the ``data`` attribute of the model card
        returned by ``InferencePipeline.get_model_card``. A field missing
        from the card yields ``''``. If fetching the card fails, the error
        is reported on stderr and ``('', '')`` is returned so the UI keeps
        working (best-effort lookup).
        """
        # TODO: the model card lives in the README of the Hugging Face repo;
        # document how it should be written.
        try:
            card = InferencePipeline.get_model_card(model_id, self.hf_token)
        except Exception as err:  # best-effort: never break the UI callback
            print(
                f'Could not load model card for {model_id!r}: '
                f'{type(err).__name__}: {err}',
                file=sys.stderr,
            )
            return '', ''
        protagonist = getattr(card.data, 'protagonist', '')
        training_prompt = getattr(card.data, 'training_prompt', '')
        return protagonist, training_prompt


HF_TOKEN = os.getenv('HF_TOKEN')
pipe = InferencePipeline(HF_TOKEN)
app = InferenceUtil(HF_TOKEN)

# Header shown at the top of the page (passed to gr.HTML below).
HEADER_HTML = """

Make-A-Protagonist:
Generic Video Editing with An Ensemble of Experts

Yuyang Zhao¹ Enze Xie² Lanqing Hong² Zhenguo Li² Gim Hee Lee¹

¹National University of Singapore ²Huawei Noah's Ark Lab

[ arXiv ] [ Code ] [ Homepage ]

TL;DR: The first framework for generic video editing with both visual and textual clues.

"""

# NOTE(review): the nesting of the layout containers below was reconstructed
# from a whitespace-flattened source -- confirm it matches the intended UI.
with gr.Blocks(css='style.css') as demo:
    gr.HTML(HEADER_HTML)

    with gr.Row():
        with gr.Column():
            with gr.Box():
                model_id = gr.Dropdown(
                    label='Model ID',
                    choices=[
                        'Make-A-Protagonist/ikun',
                        'Make-A-Protagonist/huaqiang',
                        'Make-A-Protagonist/yanzi',
                        'Make-A-Protagonist/car-turn',
                    ],
                    value='Make-A-Protagonist/ikun')
                with gr.Row():
                    # Despite its name, this textbox displays the protagonist
                    # (first value returned by load_model_info).
                    base_model_used_for_training = gr.Textbox(
                        label='Protagonist', interactive=False, value='man')
                    prompt_used_for_training = gr.Textbox(
                        label='Training prompt',
                        interactive=False,
                        value='A man is playing basketball')
            with gr.Box():
                ref_image = gr.Image(label='Reference Image',
                                     type='pil',
                                     visible=True).style(height="auto")
                ref_pro_prompt = gr.Textbox(
                    label='Reference Image Protagonist Prompt',
                    max_lines=1,
                    placeholder='Example: "man"')

            prompt = gr.Textbox(label='Prompt',
                                max_lines=1,
                                placeholder='Example: "A panda is surfing"')
            video_length = gr.Slider(label='Video length',
                                     minimum=4,
                                     maximum=8,
                                     step=1,
                                     value=8)
            fps = gr.Slider(label='FPS',
                            minimum=1,
                            maximum=8,
                            step=1,
                            value=4)
            seed = gr.Slider(label='Seed',
                             minimum=0,
                             maximum=100000,
                             step=1,
                             value=0)

            with gr.Accordion('ControlNet Parameters', open=True):
                control_pose = gr.Slider(label='Pose',
                                         minimum=0,
                                         maximum=1,
                                         step=0.1,
                                         value=.5)
                control_depth = gr.Slider(label='Depth',
                                          minimum=0,
                                          maximum=1,
                                          step=0.1,
                                          value=.5)

            with gr.Accordion('Editing Function', open=True):
                with gr.Row():
                    # step=1 on a 0..1 range makes these behave as 0/1 switches.
                    source_pro = gr.Slider(label='Source Protagonist',
                                           minimum=0,
                                           maximum=1,
                                           step=1,
                                           value=0)
                    source_bg = gr.Slider(label='Source Background',
                                          minimum=0,
                                          maximum=1,
                                          step=1,
                                          value=0)

            with gr.Accordion('Other Parameters', open=False):
                num_steps = gr.Slider(label='Number of Steps',
                                      minimum=0,
                                      maximum=100,
                                      step=1,
                                      value=50)
                guidance_scale = gr.Slider(label='CFG Scale',
                                           minimum=0,
                                           maximum=50,
                                           step=0.1,
                                           value=12.5)
                noise_level = gr.Slider(label='Noise Level',
                                        minimum=0,
                                        maximum=999,
                                        step=1,
                                        value=0)

            run_button = gr.Button('Generate')

            gr.Markdown(
                ' - It takes a few minutes to download model first. \n'
                '- It takes one minute to load model and conduct DDIM inverse ')
        with gr.Column():
            result = gr.Video(label='Result')

    # Single source of truth for the components passed to pipe.run, shared by
    # the examples table and both event handlers so they cannot drift apart.
    # Each row of `examples` below lists its values in this same order.
    inputs = [
        model_id,
        prompt,
        video_length,
        fps,
        seed,
        num_steps,
        guidance_scale,
        ref_image,
        ref_pro_prompt,
        noise_level,
        control_pose,
        control_depth,
        source_pro,
        source_bg,
    ]

    with gr.Row():
        examples = [
            [
                'Make-A-Protagonist/ikun',
                'A man is playing basketball on the beach, anime style.',
                8,
                4,
                33,
                50,
                12.5,
                'data/ikun/reference_images/zhongli.jpg',
                'man',
                0,
                0.5,
                0.5,
                0,
                0,
            ],
            [
                'Make-A-Protagonist/huaqiang',
                'Elon Musk walking down the street.',
                8,
                4,
                33,
                50,
                12.5,
                'data/huaqiang/reference_images/musk.jpg',
                'man',
                0,
                0.5,
                0.5,
                0,
                1,
            ],
            [
                'Make-A-Protagonist/yanzi',
                'A panda walking down the snowy street.',
                8,
                4,
                33,
                50,
                12.5,
                'data/yanzi/reference_images/panda.jpeg',
                'panda',
                0,
                0.5,
                0.5,
                0,
                0,
            ],
            [
                'Make-A-Protagonist/car-turn',
                'A car moving in the desert.',
                8,
                4,
                33,
                50,
                12.5,
                'data/car-turn/reference_images/audi.jpeg',
                'car',
                0,
                0.0,
                1.0,
                0,
                0,
            ],
            [
                'Make-A-Protagonist/car-turn',
                'A Suzuki Jimny driving down a mountain road in the rain.',
                8,
                4,
                33,
                50,
                12.5,
                'data/car-turn/images/0000.jpg',
                'car',
                0,
                0.0,
                1.0,
                1,
                0,
            ],
        ]
        gr.Examples(examples=examples,
                    inputs=inputs,
                    outputs=result,
                    fn=pipe.run,
                    # Example outputs are cached only when SYSTEM == 'spaces'.
                    cache_examples=os.getenv('SYSTEM') == 'spaces')

    # Refresh the read-only protagonist / training-prompt boxes whenever a
    # different model is selected.
    model_id.change(fn=app.load_model_info,
                    inputs=model_id,
                    outputs=[
                        base_model_used_for_training,
                        prompt_used_for_training,
                    ])

    # Pressing Enter in the prompt box and clicking "Generate" both run the
    # pipeline with the same inputs.
    prompt.submit(fn=pipe.run, inputs=inputs, outputs=result)
    run_button.click(fn=pipe.run, inputs=inputs, outputs=result)

demo.queue().launch(share=True)

Make-A-Protagonist: Generic Video Editing with An Ensemble of Experts

Yuyang Zhao¹ Enze Xie² Lanqing Hong² Zhenguo Li² Gim Hee Lee¹

¹National University of Singapore ²Huawei Noah's Ark Lab

[ arXiv ] [ Code ] [ Homepage ]

TL;DR: The first framework for generic video editing with both visual and textual clues.

Make-A-Protagonist:
Generic Video Editing with An Ensemble of Experts

Yuyang Zhao¹ Enze Xie² Lanqing Hong² Zhenguo Li² Gim Hee Lee¹

¹National University of Singapore ²Huawei Noah's Ark Lab