"""Gradio demo for RAVE: text-guided video editing with ControlNet-conditioned Stable Diffusion."""

import gradio as gr
import os
import torch
import argparse
import sys
import yaml
import datetime

sys.path.append(os.path.dirname(os.getcwd()))

from pipelines.sd_controlnet_rave import RAVE
from pipelines.sd_multicontrolnet_rave import RAVE_MultiControlNet

import subprocess
import utils.constants as const
import utils.video_grid_utils as vgu

import warnings
warnings.filterwarnings("ignore")

import pprint
import glob


def init_device():
    """Select CUDA if available, otherwise fall back to CPU."""
    device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device_name)
    return device


def init_paths(input_ns):
    """Resolve output, inversion, and control paths for this run and create the directories."""
    if input_ns.save_folder is None or input_ns.save_folder == '':
        input_ns.save_folder = input_ns.video_name
    else:
        input_ns.save_folder = os.path.join(input_ns.save_folder, input_ns.video_name)

    save_dir = os.path.join(const.OUTPUT_PATH, input_ns.save_folder)
    os.makedirs(save_dir, exist_ok=True)
    existing_runs = os.listdir(save_dir)
    save_idx = max([int(x[-5:]) for x in existing_runs]) + 1 if existing_runs else 0
    input_ns.save_path = os.path.join(save_dir, f'{input_ns.positive_prompts}-{str(save_idx).zfill(5)}')

    if '-' in input_ns.preprocess_name:
        input_ns.hf_cn_path = [const.PREPROCESSOR_DICT[i] for i in input_ns.preprocess_name.split('-')]
    else:
        input_ns.hf_cn_path = const.PREPROCESSOR_DICT[input_ns.preprocess_name]
    input_ns.hf_path = "runwayml/stable-diffusion-v1-5"

    input_ns.inverse_path = os.path.join(
        const.GENERATED_DATA_PATH, 'inverses', input_ns.video_name,
        f'{input_ns.preprocess_name}_{input_ns.model_id}_{input_ns.grid_size}x{input_ns.grid_size}_{input_ns.pad}')
    input_ns.control_path = os.path.join(
        const.GENERATED_DATA_PATH, 'controls', input_ns.video_name,
        f'{input_ns.preprocess_name}_{input_ns.grid_size}x{input_ns.grid_size}_{input_ns.pad}')
    os.makedirs(input_ns.control_path, exist_ok=True)
    os.makedirs(input_ns.inverse_path, exist_ok=True)
    os.makedirs(input_ns.save_path, exist_ok=True)

    return input_ns


def install_civitai_model(model_id):
    """Download a CivitAI checkpoint and convert it to diffusers format; reuse it if already converted."""
    full_path = os.path.join(const.CWD, 'CIVIT_AI', 'diffusers_models', model_id, '*')
    if len(glob.glob(full_path)) > 0:
        full_path = glob.glob(full_path)[0]
        return full_path

    install_path = os.path.join(const.CWD, 'CIVIT_AI', 'safetensors')
    install_path_model = os.path.join(const.CWD, 'CIVIT_AI', 'safetensors', model_id)
    diffusers_path = os.path.join(const.CWD, 'CIVIT_AI', 'diffusers_models', model_id)
    convert_py_path = os.path.join(const.CWD, 'CIVIT_AI', 'convert.py')
    os.makedirs(install_path, exist_ok=True)
    os.makedirs(diffusers_path, exist_ok=True)

    subprocess.run(f'wget https://civitai.com/api/download/models/{model_id} --content-disposition --directory {install_path_model}'.split())
    model_name = glob.glob(os.path.join(install_path, model_id, '*'))[0]
    model_name2 = os.path.basename(model_name).replace('.safetensors', '')
    diffusers_path_model_name = os.path.join(const.CWD, 'CIVIT_AI', 'diffusers_models', model_id, model_name2)
    print(model_name)
    subprocess.run(f'python {convert_py_path} --checkpoint_path {model_name} --dump_path {diffusers_path_model_name} --from_safetensors'.split())
    subprocess.run(f'rm -rf {install_path}'.split())
    return diffusers_path_model_name


def run(*args):
    """Run RAVE on the uploaded video and return the paths of the edited and control GIFs."""
    # Fixed demo settings.
    batch_size = 4
    batch_size_vae = 1
    is_ddim_inversion = True
    is_shuffle = True
    num_inference_steps = 20
    num_inversion_step = 20
    cond_step_start = 0.0
    give_control_inversion = True
    inversion_prompt = ''
    save_folder = ''

    # Positional arguments arrive in the order of the `inputs` list wired to run_button.click below.
    list_of_inputs = list(args)
    input_ns = argparse.Namespace()
    input_ns.video_path = list_of_inputs[0]  # video_path
    input_ns.video_name = os.path.basename(input_ns.video_path).replace('.mp4', '').replace('.gif', '')
    input_ns.preprocess_name = list_of_inputs[1]
    input_ns.batch_size = batch_size
    input_ns.batch_size_vae = batch_size_vae
    input_ns.cond_step_start = cond_step_start
    input_ns.controlnet_conditioning_scale = list_of_inputs[2]
    input_ns.controlnet_guidance_end = list_of_inputs[3]
    input_ns.controlnet_guidance_start = list_of_inputs[4]
    input_ns.give_control_inversion = give_control_inversion
    input_ns.grid_size = list_of_inputs[5]
    input_ns.sample_size = list_of_inputs[6]
    input_ns.pad = list_of_inputs[7]
    input_ns.guidance_scale = list_of_inputs[8]
    input_ns.inversion_prompt = inversion_prompt
    input_ns.is_ddim_inversion = is_ddim_inversion
    input_ns.is_shuffle = is_shuffle
    input_ns.negative_prompts = list_of_inputs[9]
    input_ns.num_inference_steps = num_inference_steps
    input_ns.num_inversion_step = num_inversion_step
    input_ns.positive_prompts = list_of_inputs[10]
    input_ns.save_folder = save_folder
    input_ns.seed = list_of_inputs[11]
    input_ns.model_id = const.MODEL_IDS[list_of_inputs[12]]
    # input_ns.width = list_of_inputs[23]
    # input_ns.height = list_of_inputs[24]
    # input_ns.original_size = list_of_inputs[25]

    diffusers_model_path = os.path.join(const.CWD, 'CIVIT_AI', 'diffusers_models')
    os.makedirs(diffusers_model_path, exist_ok=True)
    if 'model_id' not in input_ns.__dict__:
        input_ns.model_id = "None"
    if str(input_ns.model_id) != 'None':
        input_ns.model_id = install_civitai_model(input_ns.model_id)

    device = init_device()
    input_ns = init_paths(input_ns)

    input_ns.image_pil_list = vgu.prepare_video_to_grid(input_ns.video_path, input_ns.sample_size, input_ns.grid_size, input_ns.pad)
    print(input_ns.video_path)
    input_ns.sample_size = len(input_ns.image_pil_list)
    print(f'Frame count: {len(input_ns.image_pil_list)}')

    # A '-' in the conditioning scale indicates multiple ControlNets.
    controlnet_class = RAVE_MultiControlNet if '-' in str(input_ns.controlnet_conditioning_scale) else RAVE
    CN = controlnet_class(device)
    CN.init_models(input_ns.hf_cn_path, input_ns.hf_path, input_ns.preprocess_name, input_ns.model_id)

    input_dict = vars(input_ns)
    pp = pprint.PrettyPrinter(indent=4)
    pp.pprint(input_dict)
    yaml_dict = {k: v for k, v in input_dict.items() if k != 'image_pil_list'}

    start_time = datetime.datetime.now()
    if '-' in str(input_ns.controlnet_conditioning_scale):
        res_vid, control_vid_1, control_vid_2 = CN(input_dict)
        control_vid = control_vid_1  # save the first control stream below
    else:
        res_vid, control_vid = CN(input_dict)
    end_time = datetime.datetime.now()

    # res_vid = [x.crop().resize((x.size[0], x.size[1])) for x in res_vid]
    # control_vid = [x[2:-2, 2:-2].resize((x.size[0], x.size[1])) for x in control_vid]

    save_name = (
        f"{'-'.join(input_ns.positive_prompts.split())}"
        f"_cstart-{input_ns.controlnet_guidance_start}"
        f"_gs-{input_ns.guidance_scale}"
        f"_pre-{'-'.join((input_ns.preprocess_name.replace('-', '+').split('_')))}"
        f"_cscale-{input_ns.controlnet_conditioning_scale}"
        f"_grid-{input_ns.grid_size}"
        f"_pad-{input_ns.pad}"
        f"_model-{os.path.basename(input_ns.model_id)}"
    )
    res_vid[0].save(os.path.join(input_ns.save_path, f'{save_name}.gif'),
                    save_all=True, append_images=res_vid[1:], loop=10000)
    control_vid[0].save(os.path.join(input_ns.save_path, f'control_{save_name}.gif'),
                        save_all=True, append_images=control_vid[1:], optimize=False, loop=10000)

    yaml_dict['total_time'] = (end_time - start_time).total_seconds()
    yaml_dict['total_number_of_frames'] = len(res_vid)
    yaml_dict['sec_per_frame'] = yaml_dict['total_time'] / yaml_dict['total_number_of_frames']
    with open(os.path.join(input_ns.save_path, 'config.yaml'), 'w') as yaml_file:
        yaml.dump(yaml_dict, yaml_file)

    return os.path.join(input_ns.save_path, f'{save_name}.gif'), os.path.join(input_ns.save_path, f'control_{save_name}.gif')


def output_video_fn(video_path, text_prompt):
    """Map an example input video to its pre-rendered output for the cached examples."""
    fold_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "example_videos")
    video_path = os.path.join(fold_path, os.path.basename(video_path).replace('input', 'output'))
    return video_path


block = gr.Blocks().queue()
with block:
    gr.HTML(
        """

        <h1>RAVE: Randomized Noise Shuffling for Fast and Consistent Video Editing with Diffusion Models</h1>

        <p>Ozgur Kara<sup>1</sup>, Bariscan Kurtkaya<sup>2</sup>, Hidir Yesiltepe<sup>4</sup>, James M. Rehg<sup>1,3</sup>, Pinar Yanardag<sup>4</sup></p>

        <p><sup>1</sup>Georgia Institute of Technology, <sup>2</sup>KUIS AI Center, <sup>3</sup>University of Illinois Urbana-Champaign, <sup>4</sup>Virginia Tech</p>

        <p>[arXiv] [GitHub] [Project Webpage]</p>

        <p><b>TL;DR:</b> RAVE is a zero-shot, lightweight, and fast framework for text-guided video editing that supports videos of any length by leveraging pretrained text-to-image diffusion models.</p>

        <p>Note that this page is a limited demo of RAVE. To run with more configurations, please check out our GitHub page.</p>

        """)

    with gr.Row():
        with gr.Column():
            with gr.Row():
                input_path = gr.File(label='Upload Input Video', file_types=['.mp4'], scale=1)
                input_video = gr.Video(label='Input Video', format='mp4', visible=True, interactive=False, scale=5)
            # Mirror the uploaded file into the video player.
            input_path.upload(lambda x: x, inputs=[input_path], outputs=[input_video])

            gr.Markdown('# Example Video Edits')
            with gr.Row():
                example_input = gr.Video(label='Input Example', format='mp4', visible=True, interactive=False)
                example_output = gr.Video(label='Output Example', format='mp4', visible=True, interactive=False)
            ex_prompt = gr.Textbox(label='Text Prompt', interactive=False)
            with gr.Row():
                ex_list = []
                ex_prompt_dict = {
                    '1': "A black panther",
                    '2': "A medieval knight",
                    '3': "Swarovski blue crystal swan",
                    '4': "Switzerland SBB CFF FFS train",
                    '5': "White cupcakes, moving on the table",
                }
                for i in range(1, 6):
                    ex_list.append([
                        os.path.join(os.path.dirname(os.path.abspath(__file__)), "example_videos", f"exp_input_{i}.mp4"),
                        ex_prompt_dict[str(i)],
                    ])
                ex = gr.Examples(
                    examples=ex_list,
                    inputs=[example_input, ex_prompt],
                    outputs=example_output,
                    fn=output_video_fn,
                    cache_examples=True,
                )

        with gr.Column():
            with gr.Row():
                result_video = gr.Image(label='Edited Video', interactive=False)
                control_video = gr.Image(label='Control Video', interactive=False)
            with gr.Row():
                positive_prompts = gr.Textbox(label='Positive prompts')
                negative_prompts = gr.Textbox(label='Negative prompts')
                model_id = gr.Dropdown(const.MODEL_IDS, label='Model id', value='SD 1.5')
            with gr.Row():
                preprocess_list = ['depth_zoe', 'lineart_realistic', 'lineart_standard', 'softedge_hed']
                preprocess_name = gr.Dropdown(preprocess_list, label='Control type', value='depth_zoe')
                guidance_scale = gr.Slider(label='Guidance scale', minimum=0, maximum=40, step=0.1, value=7.5)
                seed = gr.Slider(label='Seed', minimum=0, maximum=2147483647, step=1, value=0, randomize=True)
            run_button = gr.Button(value='Run All')
            with gr.Accordion('Configuration', open=False):
                with gr.Row():
                    controlnet_conditioning_scale = gr.Slider(label='ControlNet conditioning scale', minimum=0.0, maximum=1.0, value=1.0, step=0.01)
                    controlnet_guidance_end = gr.Slider(label='ControlNet guidance end', minimum=0.0, maximum=1.0, value=1.0, step=0.01)
                    controlnet_guidance_start = gr.Slider(label='ControlNet guidance start', minimum=0.0, maximum=1.0, value=0.0, step=0.01)
                with gr.Row():
                    grid_size = gr.Slider(label='Grid size (n x n)', minimum=2, maximum=3, value=3, step=1)
                    sample_size = gr.Slider(label='Number of grids', minimum=1, maximum=10, value=1, step=1)
                    pad = gr.Slider(label='Pad', minimum=1, maximum=5, value=2, step=1)

    # Argument order here must match the unpacking order in run().
    inputs = [input_path, preprocess_name, controlnet_conditioning_scale, controlnet_guidance_end,
              controlnet_guidance_start, grid_size, sample_size, pad, guidance_scale,
              negative_prompts, positive_prompts, seed, model_id]
    run_button.click(fn=run, inputs=inputs, outputs=[result_video, control_video])

if __name__ == "__main__":
    block.queue(max_size=20)
    block.launch(share=True)
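
# ---------------------------------------------------------------------------
# Sketch (not part of the original demo): how run() could be invoked directly,
# without the UI. The positional order mirrors the `inputs` list wired to
# run_button.click above. The video path and prompts are placeholders, and the
# last value is assumed to be a key of const.MODEL_IDS ('SD 1.5' is the
# dropdown default above).
#
# edited_gif, control_gif = run(
#     'example_videos/exp_input_1.mp4',  # video path
#     'depth_zoe',                       # control type (preprocess_name)
#     1.0,                               # ControlNet conditioning scale
#     1.0,                               # ControlNet guidance end
#     0.0,                               # ControlNet guidance start
#     3,                                 # grid size (n x n)
#     1,                                 # number of grids (sample_size)
#     2,                                 # pad
#     7.5,                               # guidance scale
#     '',                                # negative prompts
#     'A black panther',                 # positive prompts
#     0,                                 # seed
#     'SD 1.5',                          # model id
# )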