import os
import cv2
import torch
import spaces
import imageio
import numpy as np
import gradio as gr

# Treat @torch.jit.script as a no-op so decorated code runs eagerly.
torch.jit.script = lambda f: f

import argparse
from utils.batch_inference import (
    BSRInferenceLoop, BIDInferenceLoop
)
# import subprocess
# subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)

device = 'cuda' if torch.cuda.is_available() else 'cpu'


def get_example(task):
    """Return the example video list for the given task ("sr" or "dn")."""
    case = {
        "dn": [
            ['examples/bus.mp4',],
            ['examples/koala.mp4',],
            ['examples/flamingo.mp4',],
            ['examples/rhino.mp4',],
            ['examples/elephant.mp4',],
            ['examples/sheep.mp4',],
            ['examples/dog-agility.mp4',],
            # ['examples/dog-gooses.mp4',],
        ],
        "sr": [
            ['examples/bus_sr.mp4',],
            ['examples/koala_sr.mp4',],
            ['examples/flamingo_sr.mp4',],
            ['examples/rhino_sr.mp4',],
            ['examples/elephant_sr.mp4',],
            ['examples/sheep_sr.mp4',],
            ['examples/dog-agility_sr.mp4',],
            # ['examples/dog-gooses_sr.mp4',],
        ]
    }
    return case[task]


def update_prompt(input_video):
    # NOTE: relies on set_default_prompt, which is not defined in this file;
    # the input_video.change callbacks that would call it are commented out below.
    video_name = input_video.split('/')[-1]
    return set_default_prompt(video_name)


# Map example videos to their pre-extracted frame directories
video_to_image = {
    'bus.mp4': ['examples_frames/bus'],
    'koala.mp4': ['examples_frames/koala'],
    'dog-gooses.mp4': ['examples_frames/dog-gooses'],
    'flamingo.mp4': ['examples_frames/flamingo'],
    'rhino.mp4': ['examples_frames/rhino'],
    'elephant.mp4': ['examples_frames/elephant'],
    'sheep.mp4': ['examples_frames/sheep'],
    'dog-agility.mp4': ['examples_frames/dog-agility'],
    'bus_sr.mp4': ['examples_frames/bus_sr'],
    'koala_sr.mp4': ['examples_frames/koala_sr'],
    'dog-gooses_sr.mp4': ['examples_frames/dog_gooses_sr'],
    'flamingo_sr.mp4': ['examples_frames/flamingo_sr'],
    'rhino_sr.mp4': ['examples_frames/rhino_sr'],
    'elephant_sr.mp4': ['examples_frames/elephant_sr'],
    'sheep_sr.mp4': ['examples_frames/sheep_sr'],
    'dog-agility_sr.mp4': ['examples_frames/dog-agility_sr'],
}


def images_to_video(image_list, output_path, fps=10):
    # Convert PIL Images to numpy arrays and keep at most the first 20 frames
    frames = [np.array(img).astype(np.uint8) for img in image_list]
    frames = frames[:20]

    # Create video writer
    writer = imageio.get_writer(output_path, fps=fps, codec='libx264')
    for frame in frames:
        writer.append_data(frame)
    writer.close()


def video2frames(video_path):
    # Open the video file
    video = cv2.VideoCapture(video_path)

    # Frames go into a directory named after the video (extension stripped)
    img_path = video_path[:-4]
    os.makedirs(img_path, exist_ok=True)

    # Initialize frame counter
    frame_count = 0
    while True:
        # Read a frame from the video
        ret, frame = video.read()
        # If the frame was not successfully read, we have reached the end of the video
        if not ret:
            break
        # Write the frame to a JPG file
        frame_file = f"{img_path}/{frame_count:05}.jpg"
        cv2.imwrite(frame_file, frame)
        # Increment the frame counter
        frame_count += 1

    # Release the video file
    video.release()
    return img_path


@spaces.GPU(duration=120)
def DiffBIR_restore(input_video, prompt, sr_ratio, n_frames, n_steps, guidance_scale, seed, n_prompt, task):
    """Denoise ("dn") or super-resolve ("sr") the input video and return the path of the restored clip."""
    video_name = input_video.split('/')[-1]
    if video_name in video_to_image:
        frames_path = video_to_image[video_name][0]
    else:
        frames_path = video2frames(input_video)
    print(f"[INFO] input_video: {input_video}")
    print(f"[INFO] Frames path: {frames_path}")

    args = argparse.Namespace()
    args.task = task  # choices: "sr", "dn", "fr", "fr_bg"
    args.upscale = sr_ratio
    ### sampling parameters
    args.steps = n_steps
    args.better_start = True
    args.tiled = False
    args.tile_size = 512
    args.tile_stride = 256
    args.pos_prompt = prompt
    args.neg_prompt = n_prompt
    args.cfg_scale = guidance_scale
    ### input parameters
    args.input = frames_path
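    # The remaining fields mirror the options consumed by BSRInferenceLoop /
    # BIDInferenceLoop in utils.batch_inference; the demo pins them to fixed
    # values instead of exposing them in the UI.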
    args.n_samples = 1
    args.batch_size = 10
    args.final_size = (480, 854)
    args.config = "configs/inference/my_cldm.yaml"
    ### guidance parameters
    args.guidance = False
    args.g_loss = "w_mse"
    args.g_scale = 0.0
    args.g_start = 1001
    args.g_stop = -1
    args.g_space = "latent"
    args.g_repeat = 1
    ### output parameters
    args.output = " "
    ### common parameters
    args.seed = seed
    args.device = "cuda"
    args.n_frames = n_frames
    ### latent control parameters
    args.warp_period = [0, 0.1]
    args.merge_period = [0, 0]
    args.ToMe_period = [0, 1]
    args.merge_ratio = [0.6, 0]

    if args.task == "sr":
        restored_vid_path = BSRInferenceLoop(args).run()
    elif args.task == "dn":
        restored_vid_path = BIDInferenceLoop(args).run()
    else:
        raise ValueError(f"Unsupported task: {args.task}")

    torch.cuda.empty_cache()
    return restored_vid_path


########
# demo #
########
intro = """

DiffIR2VR
Zero-shot video restoration and upscaling

[Project page] [arXiv]
Note that this page is a limited demo of DiffIR2VR. For more configurations, please visit our GitHub page. The code will be released soon!
For super-resolution, it is recommended that the final frame size (original size * upscale ratio) be around 480x854; otherwise the demo may fail due to lengthy inference times.
""" with gr.Blocks(css="style.css") as demo: gr.HTML(intro) with gr.Tab(label="Super-resolution with DiffBIR"): with gr.Row(): input_video = gr.Video(label="Input Video") output_video = gr.Video(label="Restored Video", interactive=False, autoplay=True) with gr.Row(): run_button = gr.Button(value="Restore your video!", variant="primary") with gr.Accordion('Advanced options', open=False): prompt = gr.Textbox( label="Prompt", max_lines=1, placeholder="describe your video content" # value="bear, Van Gogh Style" ) sr_ratio = gr.Slider(label='Upscale ratio', minimum=1, maximum=16, value=4, step=0.5) n_frames = gr.Slider(label='Frames', minimum=1, maximum=60, value=10, step=1) n_steps = gr.Slider(label='Steps', minimum=1, maximum=100, value=5, step=1) guidance_scale = gr.Slider(label='Guidance Scale', minimum=0.1, maximum=30.0, value=4.0, step=0.1) seed = gr.Slider(label='Seed', info="-1=result is always different", minimum=-1, maximum=1000, step=1, randomize=True) n_prompt = gr.Textbox( label='Negative Prompt', value="low quality, blurry, spray, low-resolution, noisy, unsharp, weird textures, JPEG artifact, aliasing, over-smooth" ) task = gr.Textbox(value="sr", visible=False) # input_video.change( # fn = update_prompt, # inputs = [input_video], # outputs = [prompt], # queue = False) run_button.click(fn = DiffBIR_restore, inputs = [input_video, prompt, sr_ratio, n_frames, n_steps, guidance_scale, seed, n_prompt, task ], outputs = [output_video] ) gr.Examples( examples=get_example("sr"), label='Examples', inputs=[input_video], outputs=[output_video], examples_per_page=7 ) with gr.Tab(label="Denoise with DiffBIR"): with gr.Row(): input_video = gr.Video(label="Input Video") output_video = gr.Video(label="Restored Video", interactive=False, autoplay=True) with gr.Row(): run_button = gr.Button(value="Restore your video!", variant="primary") with gr.Accordion('Advanced options', open=False): prompt = gr.Textbox( label="Prompt", max_lines=1, placeholder="describe your video content" # value="bear, Van Gogh Style" ) n_frames = gr.Slider(label='Frames', minimum=1, maximum=60, value=10, step=1) n_steps = gr.Slider(label='Steps', minimum=1, maximum=100, value=5, step=1) guidance_scale = gr.Slider(label='Guidance Scale', minimum=0.1, maximum=30.0, value=4.0, step=0.1) seed = gr.Slider(label='Seed', info="-1=result is always different", minimum=-1, maximum=1000, step=1, randomize=True) n_prompt = gr.Textbox( label='Negative Prompt', value="low quality, blurry, spray, low-resolution, noisy, unsharp, weird textures, JPEG artifact, aliasing, over-smooth" ) task = gr.Textbox(value="dn", visible=False) sr_ratio = gr.Number(value=1, visible=False) # input_video.change( # fn = update_prompt, # inputs = [input_video], # outputs = [prompt], # queue = False) run_button.click(fn = DiffBIR_restore, inputs = [input_video, prompt, sr_ratio, n_frames, n_steps, guidance_scale, seed, n_prompt, task ], outputs = [output_video] ) gr.Examples( examples=get_example("dn"), label='Examples', inputs=[input_video], outputs=[output_video], examples_per_page=7 ) demo.queue() demo.launch()