#!/usr/bin/env python
"""Gradio demo for Follow Your Pose (pose-guided text-to-video generation).

Running this script unpacks the bundled example videos into ``./data``,
builds the inference pipeline object, assembles the Blocks UI and launches it.
"""

from __future__ import annotations

import os
import subprocess
import sys

import gradio as gr

from inference_followyourpose import merge_config_then_run

# NOTE(review): the path is extended *after* the project import above, exactly
# as before; confirm whether the import relies on it being added earlier.
sys.path.append('FollowYourPose')

# --- Example data -----------------------------------------------------------
zip_file = './example_video.zip'
output_dir = './data'

# ``-n`` (never overwrite) keeps unzip from stopping at an interactive
# "replace?" prompt when the archive was already extracted by a previous run.
unzip_result = subprocess.run(['unzip', '-n', zip_file, '-d', output_dir])
if unzip_result.returncode != 0:
    # Report the failure instead of ignoring it; startup continues because the
    # data may already be present from an earlier extraction.
    print(f"warning: unzip exited with status {unzip_result.returncode}",
          file=sys.stderr)

# Startup diagnostics: show what is actually on disk.
current_dir = os.getcwd()
print("path is :", current_dir)
print("current_dir is :", os.listdir(current_dir))
print("dir is :", os.listdir(os.path.join(current_dir,'data')))
print("data/example_video is :", os.listdir(os.path.join(current_dir,'data/example_video')))

HF_TOKEN = os.getenv('HF_TOKEN')

# Pipeline object whose ``run`` method backs every UI callback below.
pipe = merge_config_then_run()

# --- UI ---------------------------------------------------------------------
with gr.Blocks(css='style.css') as demo:
    # Page header (title, authors, links, TL;DR).
    gr.HTML(
        """

🕺🕺🕺 Follow Your Pose 💃💃💃
Pose-Guided Text-to-Video Generation using Pose-Free Videos

Yue Ma* Yingqing He* , Xiaodong Cun, Xintao Wang , Ying Shan, Xiu Li, Qifeng Chen

[ arXiv ] [ Code ] [ Homepage ]

TL;DR: We tune 2D stable-diffusion to generate the character videos from pose and text description.

""")

    # Usage notes shown above the controls.
    gr.HTML("""

In order to run the demo successfully, we recommend the length of video is about 3~5 seconds. The temporal crop offset and sampling stride are used to adjust the starting point and interval of video samples. Due to the GPU limit of this demo, it currently generates 8-frame videos. For generating longer videos (e.g. 32 frames) shown on our webpage, we recommend trying our GitHub code on your own GPU.

""")

    with gr.Row():
        # Left column: inputs.
        with gr.Column():
            with gr.Accordion('Input Video', open=True):
                user_input_video = gr.Video(
                    label='Input Source Video',
                    source='upload',
                    type='numpy',
                    format="mp4",
                    visible=True,
                ).style(height="auto")
                video_type = gr.Dropdown(
                    label='The type of input video',
                    choices=[
                        "Raw Video",
                        "Skeleton Video",
                    ],
                    value="Raw Video")

            with gr.Accordion('Temporal Crop offset and Sampling Stride', open=False):
                # NOTE(review): minimum=0 permits zero frames / zero stride;
                # confirm the pipeline accepts those values.
                n_sample_frame = gr.Slider(label='Number of Frames',
                                           minimum=0,
                                           maximum=32,
                                           step=1,
                                           value=8)
                stride = gr.Slider(label='Temporal stride',
                                   minimum=0,
                                   maximum=20,
                                   step=1,
                                   value=1)

            with gr.Accordion('Spatial Crop offset', open=False):
                left_crop = gr.Number(label='Left crop', value=0, precision=0)
                right_crop = gr.Number(label='Right crop', value=0, precision=0)
                top_crop = gr.Number(label='Top crop', value=0, precision=0)
                bottom_crop = gr.Number(label='Bottom crop', value=0, precision=0)
                offset_list = [
                    left_crop,
                    right_crop,
                    top_crop,
                    bottom_crop,
                ]

            # Sampling controls followed by the four crop offsets, in the
            # order they are passed to ``pipe.run``.
            ImageSequenceDataset_list = [
                n_sample_frame,
                stride
            ] + offset_list

            with gr.Accordion('Text Prompt', open=True):
                target_prompt = gr.Textbox(
                    label='Target Prompt',
                    info='The simple background may achieve better results(e.g., "beach", "moon" prompt is better than "street" and "market")',
                    max_lines=1,
                    placeholder='Example: "Iron man on the beach"',
                    value='Iron man on the beach')

            run_button = gr.Button('Generate')

        # Right column: output video and sampler settings.
        with gr.Column():
            result = gr.Video(label='Result')

            with gr.Accordion('DDIM Parameters', open=True):
                num_steps = gr.Slider(
                    label='Number of Steps',
                    info='larger value has better editing capacity, but takes more time and memory.',
                    minimum=0,
                    maximum=50,
                    step=1,
                    value=50)
                guidance_scale = gr.Slider(label='CFG Scale',
                                           minimum=0,
                                           maximum=50,
                                           step=0.1,
                                           value=12.0)

    # One component list shared by the examples and both event handlers, so
    # the argument order handed to ``pipe.run`` cannot drift between them.
    inputs = [
        user_input_video,
        target_prompt,
        num_steps,
        guidance_scale,
        video_type,
        *ImageSequenceDataset_list,
    ]

    with gr.Row():
        # Imported here, as in the original, so that it still happens after
        # the example data has been unpacked and the pipeline created.
        from example import style_example
        examples = style_example

        gr.Examples(examples=examples,
                    inputs=inputs,
                    outputs=result,
                    fn=pipe.run,
                    cache_examples=True,
                    )

    # Pressing Enter in the prompt box and clicking the button both generate.
    target_prompt.submit(fn=pipe.run, inputs=inputs, outputs=result)
    run_button.click(fn=pipe.run, inputs=inputs, outputs=result)

demo.queue().launch()
# Alternative for a self-hosted server:
# demo.queue().launch(share=False, server_name='0.0.0.0', server_port=80)

🕺🕺🕺 Follow Your Pose 💃💃💃 Pose-Guided Text-to-Video Generation using Pose-Free Videos

Yue Ma*, Yingqing He*, Xiaodong Cun, Xintao Wang, Ying Shan, Xiu Li, Qifeng Chen

[ arXiv ] [ Code ] [ Homepage ]

TL;DR: We tune 2D stable-diffusion to generate the character videos from pose and text description.

🕺🕺🕺 Follow Your Pose 💃💃💃
Pose-Guided Text-to-Video Generation using Pose-Free Videos