Spaces:

Make-A-Protagonist
/

Make-A-Protagonist-inference

Runtime error

HeliosZhao

upload

9ae2d76 over 1 year ago

11.8 kB

	#!/usr/bin/env python

	from __future__ import annotations

	import os
	import sys
	import warnings

	os.system("python -m pip install -e Make-A-Protagonist/experts/GroundedSAM/segment_anything")
	os.system("python -m pip install -e Make-A-Protagonist/experts/GroundedSAM/GroundingDINO")
	# os.system("pip install --upgrade diffusers[torch]")
	warnings.filterwarnings("ignore")

	import gradio as gr

	from inference import InferencePipeline


	class InferenceUtil:
	def __init__(self, hf_token: str \| None):
	self.hf_token = hf_token

	def load_model_info(self, model_id: str) -> tuple[str, str]:
	## TODO the modelcard is in the readme of huggingface repo, should know how to write it
	try:
	card = InferencePipeline.get_model_card(model_id, self.hf_token)
	except Exception:
	return '', ''
	# return ''
	base_model = getattr(card.data, 'base_model', '')
	protagonist = getattr(card.data, 'protagonist', '')
	training_prompt = getattr(card.data, 'training_prompt', '')
	return protagonist, training_prompt
	# return training_prompt


	# TITLE = '# [Tune-A-Video](https://tuneavideo.github.io/)'
	HF_TOKEN = os.getenv('HF_TOKEN')
	# print("HF Token ===> ", HF_TOKEN)
	pipe = InferencePipeline(HF_TOKEN)
	app = InferenceUtil(HF_TOKEN)

	with gr.Blocks(css='style.css') as demo:
	# gr.Markdown(TITLE)

	gr.HTML(
	"""
	<div style="text-align: center; max-width: 1200px; margin: 20px auto;">
	<h1 style="font-weight: 900; font-size: 2rem; margin: 0rem">
	Make-A-Protagonist:
	<br>
	Generic Video Editing with An Ensemble of Experts
	</h1>
	<h2 style="font-weight: 450; font-size: 1rem; margin: 0rem">
	<a href="https://yuyangzhao.com">Yuyang Zhao</a><sup>1</sup>
	<a href="https://xieenze.github.io/">Enze Xie</a><sup>2</sup>
	<a href="https://scholar.google.com.sg/citations?user=2p7x6OUAAAAJ&hl=en">Lanqing Hong</a><sup>2</sup>
	<a href="https://scholar.google.com.sg/citations?user=XboZC1AAAAAJ&hl=en">Zhenguo Li</a><sup>2</sup>
	<a href="https://www.comp.nus.edu.sg/~leegh/">Gim Hee Lee</a><sup>1</sup>
	</h2>

	<h2 style="font-weight: 450; font-size: 1rem; margin: 0rem">
	<sup>1 </sup>National University of Singapore
	<sup>2 </sup>Huawei Noah's Ark Lab</span>
	</h2>

	<h2 style="font-weight: 450; font-size: 1rem; margin: 0rem">
	<span class="link-block">
	[<a href="https://arxiv.org/abs/2305.08850" target="_blank"
	class="external-link ">
	<span class="icon">
	<i class="ai ai-arxiv"></i>
	</span>
	<span>arXiv</span>
	</a>]
	</span>

	<!-- Github link -->
	<span class="link-block">
	[<a href="https://github.com/Make-A-Protagonist/Make-A-Protagonist" target="_blank"
	class="external-link ">
	<span class="icon">
	<i class="fab fa-github"></i>
	</span>
	<span>Code</span>
	</a>]
	</span>

	<!-- Github link -->
	<span class="link-block">
	[<a href="https://make-a-protagonist.github.io/" target="_blank"
	class="external-link ">
	<span class="icon">
	<i class="fab fa-github"></i>
	</span>
	<span>Homepage</span>
	</a>]
	</span>

	</h2>
	<h2 style="font-weight: 450; font-size: 1rem; margin-top: 0.5rem; margin-bottom: 0.5rem">
	TL;DR: The first framework for generic video editing with both visual and textual clues.
	</h2>
	</div>
	""")


	with gr.Row():
	with gr.Column():
	with gr.Box():
	model_id = gr.Dropdown(
	label='Model ID',
	choices=[
	'Make-A-Protagonist/ikun',
	'Make-A-Protagonist/huaqiang',
	'Make-A-Protagonist/yanzi',
	'Make-A-Protagonist/car-turn',
	],
	value='Make-A-Protagonist/ikun')

	with gr.Row():
	base_model_used_for_training = gr.Textbox(
	label='Protagonist', interactive=False, value='man')
	prompt_used_for_training = gr.Textbox(
	label='Training prompt', interactive=False, value='A man is playing basketball')
	with gr.Box():
	ref_image = gr.Image(label='Reference Image', type='pil', visible=True).style(height="auto")
	ref_pro_prompt = gr.Textbox(label='Reference Image Protagonist Prompt',
	max_lines=1,
	placeholder='Example: "man"')

	prompt = gr.Textbox(label='Prompt',
	max_lines=1,
	placeholder='Example: "A panda is surfing"')
	video_length = gr.Slider(label='Video length',
	minimum=4,
	maximum=8,
	step=1,
	value=8)
	fps = gr.Slider(label='FPS',
	minimum=1,
	maximum=8,
	step=1,
	value=4)
	seed = gr.Slider(label='Seed',
	minimum=0,
	maximum=100000,
	step=1,
	value=0)

	with gr.Accordion('ControlNet Parameters', open=True):
	control_pose = gr.Slider(label='Pose',
	minimum=0,
	maximum=1,
	step=0.1,
	value=.5)
	control_depth = gr.Slider(label='Depth',
	minimum=0,
	maximum=1,
	step=0.1,
	value=.5)

	with gr.Accordion('Editing Function', open=True):
	with gr.Row():
	source_pro = gr.Slider(label='Source Protagonist',
	minimum=0,
	maximum=1,
	step=1,
	value=0)
	source_bg = gr.Slider(label='Source Background',
	minimum=0,
	maximum=1,
	step=1,
	value=0)

	with gr.Accordion('Other Parameters', open=False):
	num_steps = gr.Slider(label='Number of Steps',
	minimum=0,
	maximum=100,
	step=1,
	value=50)
	guidance_scale = gr.Slider(label='CFG Scale',
	minimum=0,
	maximum=50,
	step=0.1,
	value=12.5)

	noise_level = gr.Slider(label='Noise Level',
	minimum=0,
	maximum=999,
	step=1,
	value=0)


	run_button = gr.Button('Generate')

	gr.Markdown('''
	- It takes a few minutes to download model first.
	- It takes one minute to load model and conduct DDIM inverse
	''')
	with gr.Column():
	result = gr.Video(label='Result')
	with gr.Row():
	examples = [
	[
	'Make-A-Protagonist/ikun',
	'A man is playing basketball on the beach, anime style.',
	8,
	4,
	33,
	50,
	12.5,
	'data/ikun/reference_images/zhongli.jpg',
	'man',
	0,
	0.5,
	0.5,
	0,
	0
	],

	[
	'Make-A-Protagonist/huaqiang',
	'Elon Musk walking down the street.',
	8,
	4,
	33,
	50,
	12.5,
	'data/huaqiang/reference_images/musk.jpg',
	'man',
	0,
	0.5,
	0.5,
	0,
	1,
	],

	[
	'Make-A-Protagonist/yanzi',
	'A panda walking down the snowy street.',
	8,
	4,
	33,
	50,
	12.5,
	'data/yanzi/reference_images/panda.jpeg',
	'panda',
	0,
	0.5,
	0.5,
	0,
	0
	],

	[
	'Make-A-Protagonist/car-turn',
	'A car moving in the desert.',
	8,
	4,
	33,
	50,
	12.5,
	'data/car-turn/reference_images/audi.jpeg',
	'car',
	0,
	0.0,
	1.0,
	0,
	0
	],

	[
	'Make-A-Protagonist/car-turn',
	'A Suzuki Jimny driving down a mountain road in the rain.',
	8,
	4,
	33,
	50,
	12.5,
	'data/car-turn/images/0000.jpg',
	'car',
	0,
	0.0,
	1.0,
	1,
	0
	],

	]
	gr.Examples(examples=examples,
	inputs=[
	model_id,
	prompt,
	video_length,
	fps,
	seed,
	num_steps,
	guidance_scale,
	ref_image,
	ref_pro_prompt,
	noise_level,
	control_pose,
	control_depth,
	source_pro,
	source_bg,
	],
	outputs=result,
	fn=pipe.run,
	cache_examples=os.getenv('SYSTEM') == 'spaces')

	model_id.change(fn=app.load_model_info,
	inputs=model_id,
	outputs=[
	base_model_used_for_training,
	prompt_used_for_training,
	])



	inputs = [
	model_id,
	prompt,
	video_length,
	fps,
	seed,
	num_steps,
	guidance_scale,
	ref_image,
	ref_pro_prompt,
	noise_level,
	control_pose,
	control_depth,
	source_pro,
	source_bg,
	]
	prompt.submit(fn=pipe.run, inputs=inputs, outputs=result)
	run_button.click(fn=pipe.run, inputs=inputs, outputs=result)

	demo.queue().launch(share=True)