# Source: Hugging Face Space "Koi953215/DiffIR2VR" (status: Running on Zero). File size: 11,993 bytes.

import os
import cv2
import torch
import spaces
import imageio
import numpy as np
import gradio as gr
torch.jit.script = lambda f: f

import argparse
from utils.batch_inference import (
    BSRInferenceLoop, BIDInferenceLoop
)

# import subprocess
# subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)

# Choose the torch backend once at import time: CUDA when torch reports a
# usable GPU, CPU otherwise.
# NOTE(review): no other statement in this file reads `device`; confirm whether
# it is still needed.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
def get_example(task):
    """Return the example rows shown in the demo for one task.

    Args:
        task: ``"dn"`` for the denoising examples or ``"sr"`` for the
            super-resolution examples.

    Returns:
        A new list of one-element lists, each holding the path of an example
        video under ``examples/``.

    Raises:
        KeyError: If ``task`` is neither ``"dn"`` nor ``"sr"``.
    """
    # Both tasks use the same clips; the "sr" files only carry an extra suffix.
    # ('dog-gooses' is deliberately left out of the example list.)
    clip_names = (
        'bus',
        'koala',
        'flamingo',
        'rhino',
        'elephant',
        'sheep',
        'dog-agility',
    )
    suffix_for_task = {"dn": "", "sr": "_sr"}
    suffix = suffix_for_task[task]
    return [[f'examples/{clip}{suffix}.mp4'] for clip in clip_names]



def update_prompt(input_video):
    """Return the default prompt for a video, looked up by its bare file name.

    Args:
        input_video: Path string of the video; only the part after the last
            ``'/'`` is used.

    Returns:
        Whatever ``set_default_prompt`` returns for that file name.
    """
    # NOTE(review): `set_default_prompt` is neither defined nor imported in the
    # visible part of this file; confirm it exists before wiring this callback
    # to a component, otherwise the call raises NameError.
    return set_default_prompt(input_video.rsplit('/', 1)[-1])


# Map videos to corresponding images
# Map each bundled example video file name to a one-element list holding the
# directory of frames used for it (DiffBIR_restore reads element 0 instead of
# decoding the video again).
_example_clips = (
    'bus',
    'koala',
    'dog-gooses',
    'flamingo',
    'rhino',
    'elephant',
    'sheep',
    'dog-agility',
)
video_to_image = {
    f'{clip}.mp4': [f'examples_frames/{clip}'] for clip in _example_clips
}
video_to_image.update(
    (f'{clip}_sr.mp4', [f'examples_frames/{clip}_sr']) for clip in _example_clips
)
# NOTE(review): this one directory is spelled with an underscore
# ('dog_gooses_sr') while every other entry keeps the hyphen of the file name;
# confirm against the folder that actually exists on disk.
video_to_image['dog-gooses_sr.mp4'] = ['examples_frames/dog_gooses_sr']


def images_to_video(image_list, output_path, fps=10, max_frames=20):
    """Encode the leading frames of an image sequence into an H.264 video.

    Args:
        image_list: Iterable of images; each item is converted with
            ``np.array(...)`` and cast to ``uint8`` (the original comment says
            these are PIL images).
        output_path: Destination file handed to ``imageio.get_writer``.
        fps: Frame rate of the written video.
        max_frames: Upper bound on the number of frames written. Defaults to
            20, the cap this function always applied; pass ``None`` to write
            every frame.
    """
    # Truncate *before* converting so frames that will be dropped anyway are
    # never turned into arrays. zip() stops at the shorter input, which also
    # keeps plain iterables (not just sliceable sequences) working.
    if max_frames is None:
        selected = image_list
    else:
        selected = (img for _, img in zip(range(max_frames), image_list))
    frames = [np.array(img).astype(np.uint8) for img in selected]

    writer = imageio.get_writer(output_path, fps=fps, codec='libx264')
    try:
        for frame in frames:
            writer.append_data(frame)
    finally:
        # Always close the writer, even when appending a frame fails, so the
        # encoder and the output file handle are not leaked.
        writer.close()

def video2frames(video_path):
    """Decode a video into numbered JPEG frames and return their directory.

    Frames are written as ``00000.jpg``, ``00001.jpg``, ... into a directory
    named after ``video_path`` with its extension removed. Files already
    present in that directory are not deleted.

    Args:
        video_path: Path of the video file to decode.

    Returns:
        The directory path containing the extracted frames. If no frame could
        be read the directory is still created and returned, but is left
        without new frames.
    """
    # Strip whatever extension is present instead of assuming exactly four
    # trailing characters (".mp4"); ".webm" or ".mpeg" would otherwise produce
    # a mangled directory name.
    img_path, extension = os.path.splitext(video_path)
    if not extension:
        # With no extension the directory name would equal the video file
        # itself and makedirs would fail, so use a sibling directory instead.
        img_path = f"{video_path}_frames"
    os.makedirs(img_path, exist_ok=True)

    video = cv2.VideoCapture(video_path)
    try:
        frame_count = 0
        while True:
            ret, frame = video.read()
            # A failed read means the end of the stream was reached.
            if not ret:
                break
            cv2.imwrite(f"{img_path}/{frame_count:05}.jpg", frame)
            frame_count += 1
    finally:
        # Release the capture handle even if writing a frame raises.
        video.release()

    return img_path

@spaces.GPU(duration=120)
def DiffBIR_restore(input_video, prompt, sr_ratio, n_frames, n_steps, guidance_scale, seed, n_prompt, task):
    """Run the inference loop for one video and return its result.

    Args:
        input_video: Path of the input video. Bundled example videos are
            mapped to their frame directory through ``video_to_image``; any
            other video is decoded with ``video2frames``.
        prompt: Positive prompt (stored as ``pos_prompt``).
        sr_ratio: Upscale factor (stored as ``upscale``).
        n_frames: Stored as ``n_frames``.
        n_steps: Number of sampling steps (stored as ``steps``).
        guidance_scale: Stored as ``cfg_scale``.
        seed: Stored as ``seed``.
        n_prompt: Negative prompt (stored as ``neg_prompt``).
        task: ``"sr"`` selects ``BSRInferenceLoop``, ``"dn"`` selects
            ``BIDInferenceLoop``.

    Returns:
        The value returned by the selected loop's ``run()``; presumably the
        path of the restored video, since it is fed to a ``gr.Video`` output.

    Raises:
        gr.Error: If no input video was supplied or ``task`` is not supported.
    """
    # Fail early with a message the UI can display instead of an
    # AttributeError (no upload) or UnboundLocalError (unknown task) later on.
    if not input_video:
        raise gr.Error("Please provide an input video before running the restoration.")

    inference_loops = {"sr": BSRInferenceLoop, "dn": BIDInferenceLoop}
    if task not in inference_loops:
        raise gr.Error(f"Unsupported task {task!r}; expected one of {sorted(inference_loops)}.")

    video_name = os.path.basename(input_video)
    if video_name in video_to_image:
        # Bundled examples already have a frame directory; skip decoding.
        frames_path = video_to_image[video_name][0]
    else:
        frames_path = video2frames(input_video)

    print(f"[INFO] input_video: {input_video}")
    print(f"[INFO] Frames path: {frames_path}")

    args = argparse.Namespace(
        task=task,
        upscale=sr_ratio,
        ### sampling parameters
        steps=n_steps,
        better_start=True,
        tiled=False,
        tile_size=512,
        tile_stride=256,
        pos_prompt=prompt,
        neg_prompt=n_prompt,
        cfg_scale=guidance_scale,
        ### input parameters
        input=frames_path,
        n_samples=1,
        batch_size=10,
        final_size=(480, 854),
        config="configs/inference/my_cldm.yaml",
        ### guidance parameters
        guidance=False,
        g_loss="w_mse",
        g_scale=0.0,
        g_start=1001,
        g_stop=-1,
        g_space="latent",
        g_repeat=1,
        ### output parameters
        output=" ",
        ### common parameters
        seed=seed,
        # NOTE(review): hard-coded rather than using the module-level `device`;
        # kept as-is because the function is pinned to a GPU by the decorator.
        device="cuda",
        n_frames=n_frames,
        ### latent control parameters
        warp_period=[0, 0.1],
        merge_period=[0, 0],
        ToMe_period=[0, 1],
        merge_ratio=[0.6, 0],
    )

    try:
        restored_vid_path = inference_loops[task](args).run()
    finally:
        # Free cached GPU memory whether inference succeeded or raised.
        torch.cuda.empty_cache()
    return restored_vid_path

########
# demo #
########


# HTML banner rendered at the top of the demo through gr.HTML: title, links to
# the project page and the paper, and usage notes for the limited demo.
intro = """
<div style="text-align:center">
<h1 style="font-weight: 1400; text-align: center; margin-bottom: 7px;">
   DiffIR2VR
   <br/>
   <small>Restores/upscales your zero-shot videos</small>
</h1>
<span>[<a target="_blank" href="https://jimmycv07.github.io/DiffIR2VR_web/">Project page</a>] [<a target="_blank" href="https://huggingface.co/papers/2406.06523">arXiv</a>]</span>
<div style="display:flex; justify-content: center;margin-top: 0.5em">Note that this page is a limited demo of DiffIR2VR. 
For more configurations, please visit our GitHub page. The code will be released soon!</div>
<div style="display:flex; justify-content: center;margin-top: 0.5em; color: red;">For super-resolution, 
it is recommended that the final frame size (original size * upscale ratio) be around 480x854, 
else the demo may fail due to lengthy inference times.</div>
</div>
"""
 

def _build_restoration_tab(tab_label, task_name, show_upscale_slider):
    """Create one demo tab and wire its button to ``DiffBIR_restore``.

    Must be called inside an active ``gr.Blocks`` context. Both tabs of the
    demo share this layout; they differ only in label, task id and whether the
    upscale ratio is user-adjustable.

    Args:
        tab_label: Title shown on the tab.
        task_name: Task id passed to ``DiffBIR_restore`` (``"sr"`` or ``"dn"``);
            also selects the example videos via ``get_example``.
        show_upscale_slider: When true, expose an 'Upscale ratio' slider;
            otherwise a hidden number fixed to 1 is passed instead.
    """
    with gr.Tab(label=tab_label):
        with gr.Row():
            input_video = gr.Video(label="Input Video")
            output_video = gr.Video(label="Restored Video", interactive=False, autoplay=True)

        with gr.Row():
            run_button = gr.Button(value="Restore your video!", variant="primary")

        with gr.Accordion('Advanced options', open=False):
            prompt = gr.Textbox(
                label="Prompt",
                max_lines=1,
                placeholder="describe your video content",
            )
            if show_upscale_slider:
                sr_ratio = gr.Slider(label='Upscale ratio',
                                     minimum=1,
                                     maximum=16,
                                     value=4,
                                     step=0.5)
            n_frames = gr.Slider(label='Frames',
                                 minimum=1,
                                 maximum=60,
                                 value=10,
                                 step=1)
            n_steps = gr.Slider(label='Steps',
                                minimum=1,
                                maximum=100,
                                value=5,
                                step=1)
            guidance_scale = gr.Slider(label='Guidance Scale',
                                       minimum=0.1,
                                       maximum=30.0,
                                       value=4.0,
                                       step=0.1)
            seed = gr.Slider(label='Seed',
                             info="-1=result is always different",
                             minimum=-1,
                             maximum=1000,
                             step=1,
                             randomize=True)
            n_prompt = gr.Textbox(
                label='Negative Prompt',
                value="low quality, blurry, spray, low-resolution, noisy, unsharp, weird textures, JPEG artifact, aliasing, over-smooth",
            )
            # Hidden components carry the per-tab constants into the callback.
            task = gr.Textbox(value=task_name, visible=False)
            if not show_upscale_slider:
                sr_ratio = gr.Number(value=1, visible=False)

        # An `input_video.change` hook that filled the prompt through
        # `update_prompt` is currently disabled and therefore not wired here.
        run_button.click(
            fn=DiffBIR_restore,
            inputs=[
                input_video,
                prompt,
                sr_ratio,
                n_frames,
                n_steps,
                guidance_scale,
                seed,
                n_prompt,
                task,
            ],
            outputs=[output_video],
        )
        gr.Examples(
            examples=get_example(task_name),
            label='Examples',
            inputs=[input_video],
            outputs=[output_video],
            examples_per_page=7,
        )


with gr.Blocks(css="style.css") as demo:

    gr.HTML(intro)

    _build_restoration_tab(
        "Super-resolution with DiffBIR",
        task_name="sr",
        show_upscale_slider=True,
    )
    _build_restoration_tab(
        "Denoise with DiffBIR",
        task_name="dn",
        show_upscale_slider=False,
    )

demo.queue()

demo.launch()