#!/usr/bin/env python
from __future__ import annotations
import os
import gradio as gr
from inference_fatezero import merge_config_then_run
# TITLE = '# [FateZero](http://fate-zero-edit.github.io/)'
HF_TOKEN = os.getenv('HF_TOKEN')
# pipe = InferencePipeline(HF_TOKEN)
pipe = merge_config_then_run()
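# pipe.run merges the options chosen in the UI into the default FateZero config
# and runs the editing pipeline on the uploaded video.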
# app = InferenceUtil(HF_TOKEN)
with gr.Blocks(css='style.css') as demo:
# gr.Markdown(TITLE)
gr.HTML(
"""
<div style="text-align: center; max-width: 1200px; margin: 20px auto;">
<h1 style="font-weight: 900; font-size: 2rem; margin: 0rem">
FateZero : Fusing Attentions for Zero-shot Text-based Video Editing
</h1>
<h2 style="font-weight: 450; font-size: 1rem; margin: 0rem">
<a href="https://chenyangqiqi.github.io/">Chenyang Qi</a>
<a href="https://vinthony.github.io/academic/">Xiaodong Cun</a> , <a href="https://yzhang2016.github.io/">Yong Zhang</a>,
<a href="https://chenyanglei.github.io">Chenyang Lei</a>, <a href="https://xinntao.github.io/"> Xintao Wang </a>,
<a href="https://scholar.google.com/citations?user=4oXBp9UAAAAJ&hl=zh-CN">Ying Shan</a>,
<a href="http://cqf.io">Qifeng Chen</a>
</h2>
<h2 style="font-weight: 450; font-size: 1rem; margin: 0rem">
<span class="link-block">
[<a href="https://arxiv.org/abs/2303.09535" target="_blank"
class="external-link ">
<span class="icon">
<i class="ai ai-arxiv"></i>
</span>
<span>arXiv</span>
</a>]
</span>
<!-- Github link -->
<span class="link-block">
[<a href="https://github.com/ChenyangQiQi/FateZero" target="_blank"
class="external-link ">
<span class="icon">
<i class="fab fa-github"></i>
</span>
<span>Code</span>
</a>]
</span>
<!-- Homepage link -->
<span class="link-block">
[<a href="http://fate-zero-edit.github.io/" target="_blank"
class="external-link ">
<span class="icon">
<i class="fab fa-github"></i>
</span>
<span>Homepage</span>
</a>]
</span>
<!-- Video link -->
<span class="link-block">
[<a href="https://hkustconnect-my.sharepoint.com/:v:/g/personal/cqiaa_connect_ust_hk/EXKDI_nahEhKtiYPvvyU9SkBDTG2W4G1AZ_vkC7ekh3ENw?e=ficp9t" target="_blank"
class="external-link ">
<span class="icon">
<i class="fab fa-youtube"></i>
</span>
<span>Video</span>
</a>]
</span>
</h2>
<h2 style="font-weight: 450; font-size: 1rem; margin-top: 0.5rem; margin-bottom: 0.5rem">
TL;DR: FateZero is the first zero-shot, training-free framework for text-driven video editing with pretrained diffusion models.
</h2>
</div>
""")
gr.HTML("""
<p>We provide an <a href="https://github.com/ChenyangQiQi/FateZero/blob/main/docs/EditingGuidance.md">Editing Guidance</a> to help users choose hyperparameters when editing in-the-wild videos.
<p>Note that, due to memory and computing limits on Hugging Face, the results here are only toy examples and editing takes longer.
<p>You may duplicate the Space and upgrade to a GPU in the settings for better performance and faster inference without waiting in the queue.
<br/>
<a href="https://huggingface.co/spaces/chenyangqi/FateZero?duplicate=true">
<img style="margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>
<p>Alternatively, try our GitHub <a href="https://github.com/ChenyangQiQi/FateZero">code</a> on your own GPU.
</p>""")
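# UI layout: the left column collects the source video, crop/sampling options,
# the model choice and the prompts; the right column shows the edited result and
# exposes the attention-fusing and DDIM hyper-parameters.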
with gr.Row():
with gr.Column():
with gr.Accordion('Input Video', open=True):
# user_input_video = gr.File(label='Input Source Video')
user_input_video = gr.Video(label='Input Source Video', source='upload', type='numpy', format="mp4", visible=True).style(height="auto")
with gr.Accordion('Temporal Crop offset and Sampling Stride', open=False):
n_sample_frame = gr.Slider(label='Number of Frames',
minimum=0,
maximum=32,
step=1,
value=8)
stride = gr.Slider(label='Temporal stride',
minimum=0,
maximum=20,
step=1,
value=1)
start_sample_frame = gr.Number(label='Start frame in the video',
value=0,
precision=0)
with gr.Accordion('Spatial Crop offset', open=False):
left_crop = gr.Number(label='Left crop',
value=0,
precision=0)
right_crop = gr.Number(label='Right crop',
value=0,
precision=0)
top_crop = gr.Number(label='Top crop',
value=0,
precision=0)
bottom_crop = gr.Number(label='Bottom crop',
value=0,
precision=0)
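# Bundle the spatial crop offsets and the temporal sampling controls in the
# order expected by the image-sequence dataset arguments of pipe.run.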
offset_list = [
left_crop,
right_crop,
top_crop,
bottom_crop,
]
ImageSequenceDataset_list = [
start_sample_frame,
n_sample_frame,
stride
] + offset_list
model_id = gr.Dropdown(
label='Model ID',
choices=[
'CompVis/stable-diffusion-v1-4',
# add shape editing ckpt here
],
value='CompVis/stable-diffusion-v1-4')
with gr.Accordion('Text Prompt', open=True):
source_prompt = gr.Textbox(label='Source Prompt',
info='A good prompt describes each frame and most objects in the video. In particular, it should contain the object or attribute that we want to edit or preserve.',
max_lines=1,
placeholder='Example: "a silver jeep driving down a curvy road in the countryside"',
value='a silver jeep driving down a curvy road in the countryside')
target_prompt = gr.Textbox(label='Target Prompt',
info='A reasonable composition of video and prompt may achieve better results (e.g., a "sunflower" video with a "Van Gogh" prompt works better than with a "Monet" prompt).',
max_lines=1,
placeholder='Example: "watercolor painting of a silver jeep driving down a curvy road in the countryside"',
value='watercolor painting of a silver jeep driving down a curvy road in the countryside')
run_button = gr.Button('Generate')
with gr.Column():
result = gr.Video(label='Result')
# result.style(height=512, width=512)
with gr.Accordion('FateZero Parameters for attention fusing', open=True):
cross_replace_steps = gr.Slider(label='Cross-att replace steps',
info='A larger value replaces cross-attention in more denoising steps, which better preserves the semantic layout.',
minimum=0.0,
maximum=1.0,
step=0.1,
value=0.7)
self_replace_steps = gr.Slider(label='Self-att replace steps',
info='A larger value replaces spatial-temporal self-attention in more denoising steps, which better preserves geometry and motion.',
minimum=0.0,
maximum=1.0,
step=0.1,
value=0.7)
enhance_words = gr.Textbox(label='Enhanced words',
info='Amplify the cross-attention of the target words.',
max_lines=1,
placeholder='Example: "watercolor"',
value='watercolor')
enhance_words_value = gr.Slider(label='Target cross-att amplification',
info='A larger value adds more visual elements of the target words.',
minimum=0.0,
maximum=20.0,
step=1,
value=10)
with gr.Accordion('DDIM Parameters', open=True):
num_steps = gr.Slider(label='Number of Steps',
info='A larger value gives better editing capacity but takes more time and memory (50 steps may produce out-of-memory errors).',
minimum=0,
maximum=30,
step=1,
value=15)
guidance_scale = gr.Slider(label='CFG Scale',
minimum=0,
maximum=50,
step=0.1,
value=7.5)
with gr.Row():
from example import style_example
examples = style_example
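# Preset editing examples; with cache_examples=True their outputs are
# pre-computed so visitors can browse results without running inference.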
gr.Examples(examples=examples,
inputs=[
model_id,
user_input_video,
source_prompt,
target_prompt,
cross_replace_steps,
self_replace_steps,
enhance_words,
enhance_words_value,
num_steps,
guidance_scale,
user_input_video,
*ImageSequenceDataset_list
],
outputs=result,
fn=pipe.run,
cache_examples=True,
# cache_examples=os.getenv('SYSTEM') == 'spaces'
)
inputs = [
model_id,
user_input_video,
source_prompt,
target_prompt,
cross_replace_steps,
self_replace_steps,
enhance_words,
enhance_words_value,
num_steps,
guidance_scale,
user_input_video,
*ImageSequenceDataset_list
]
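# The order of `inputs` must match the positional arguments of pipe.run
# (user_input_video appears twice here, mirroring the Examples inputs above).
# A direct, non-UI call would look roughly like the sketch below; the argument
# order is inferred from this list and the names are illustrative:
# pipe.run(model_id, video, source_prompt, target_prompt,
#          cross_replace_steps, self_replace_steps,
#          enhance_words, enhance_words_value, num_steps, guidance_scale,
#          video, start_sample_frame, n_sample_frame, stride,
#          left_crop, right_crop, top_crop, bottom_crop)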
target_prompt.submit(fn=pipe.run, inputs=inputs, outputs=result)
run_button.click(fn=pipe.run, inputs=inputs, outputs=result)
demo.queue().launch()