File size: 7,103 Bytes
c95d2d4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
import os
import gradio as gr
import pandas as pd


# Markdown link to the baseline method's repository (rendered inside the UI).
BASELINE = '<a target="_blank" href=https://github.com/showlab/loveu-tgve-2023 style="color: blue; text-decoration: underline;text-decoration-style: dotted;">Tune-A-Video (Baseline)</a>'
# Display order of the leaderboard columns; the arrows mean higher-is-better.
COLS = ["Method", "CLIPScore (Frame Consistency) ⬆️", "CLIPScore (Text Alignment) ⬆️", "PickScore ⬆️", "Human Preference ⬆️", "References"]
# Gradio datatype for each column, parallel to COLS.
TYPES = ["markdown", "number", "number", "number", "str", "markdown"]


def get_leaderboard():
    """Assemble the competition leaderboard as a pandas DataFrame.

    Returns:
        pd.DataFrame: one row per method, sorted by PickScore descending
        (methods without a score sort last), columns in COLS order, and
        missing metric cells blanked to '' so the UI table does not show NaN.
    """
    all_data = []

    # Official Tune-A-Video baseline with pre-computed automatic metrics.
    baseline_0 = {
        "Method": '**Tune-A-Video**',
        "CLIPScore (Frame Consistency) ⬆️": 0.92,
        "CLIPScore (Text Alignment) ⬆️": 27.12,
        "PickScore ⬆️": 20.36,
        "Human Preference ⬆️": '',
        "References": ','.join(['<a target="_blank" href="https://arxiv.org/abs/2212.11565" style="color: blue">Paper</a>',
                                '<a target="_blank" href="https://github.com/showlab/Tune-A-Video" style="color: blue">Code</a>',
                                '<a target="_blank" href="https://tuneavideo.github.io/" style="color: blue">Website</a>',
                                '<a target="_blank" href="https://huggingface.co/spaces/Tune-A-Video-library/Tune-A-Video-inference" style="color: blue">Demo</a>'])
    }
    # VideoCrafter entry: metrics not yet evaluated, so only reference links.
    baseline_1 = {
        "Method": 'VideoCrafter (todo)',
        "References": ','.join(['<a target="_blank" href="https://github.com/VideoCrafter/VideoCrafter" style="color: blue">Code</a>',
                                '<a target="_blank" href="https://huggingface.co/spaces/VideoCrafter/VideoCrafter" style="color: blue">Demo</a>'])
    }
    all_data += [baseline_0, baseline_1]

    dataframe = pd.DataFrame.from_records(all_data)
    # Rank by PickScore; rows without a score (NaN) fall to the bottom under
    # ascending=False (pandas places NaN last by default).
    dataframe = dataframe.sort_values(by=['PickScore ⬆️'], ascending=False)
    # Enforce display column order, then blank missing cells so the rendered
    # table shows empty strings instead of NaN.
    dataframe = dataframe[COLS].fillna('')
    return dataframe

# Materialize the leaderboard once at import time for the initial table render.
leaderboard = get_leaderboard()
    
def refresh():
    """Rebuild and return the leaderboard DataFrame.

    Wired to both the Refresh button and the page-load event below, so the
    table re-queries the data source instead of showing the cached snapshot.
    """
    return get_leaderboard()

def load_edited_video(source_video, *args):
    """Map a source-video path to its pre-rendered edited counterpart.

    Takes the filename of *source_video*, strips its ``.mp4`` suffix, and
    resolves ``files/<stem>-edit.mp4`` relative to this script's directory.
    Extra positional args (the example prompts) are accepted and ignored.
    """
    filename = source_video.split('/')[-1]
    stem = filename.split('.mp4')[0]
    edited_name = f"files/{stem}-edit.mp4"
    return os.path.join(os.path.dirname(__file__), edited_name)


# ---------------------------------------------------------------------------
# UI definition: a two-tab Gradio app — a leaderboard table and a demo of the
# baseline method. Built declaratively, then served via block.launch().
# ---------------------------------------------------------------------------
block = gr.Blocks()
with block:
    with gr.Tab("Leaderboard"): 
        with gr.Row():
            # Static competition description (markdown), rendered above the table.
            gr.Markdown(f"""
    # 🤗 LOVEU-TGVE @ CVPR 2023 Leaderboard
    <font size="4">
    <b>Welcome to the <a href="https://sites.google.com/view/loveucvpr23/track4" target="_blank">Text-Guided Video Editing (TGVE)</a> competition leaderboard of <a href="https://sites.google.com/view/loveucvpr23/home" target="_blank">LOVEU Workshop @ CVPR 2023</a>!</b>

    Leveraging AI for video editing has the potential to unleash creativity for artists across all skill levels. The rapidly-advancing field of Text-Guided Video Editing (TGVE) is here to address this challenge. Recent works in this field include <a href="https://tuneavideo.github.io/" target="_blank">Tune-A-Video</a>, <a href="https://research.runwayml.com/gen2" target="_blank">Gen-2</a>, and <a href="https://dreamix-video-editing.github.io/" target="_blank">Dreamix</a>. 
    In this competition track, we provide a standard set of videos and prompts. As a researcher, you will develop a model that takes a video and a prompt for how to edit it, and your model will produce an edited video. For instance, you might be given a video of “a man is surfing inside the barrel of a wave,” and your model will edit the video to “a man is surfing on a wave made of aurora borealis.”

    During the competition, evaluation results performed against the following 3 automatic metrics will be displayed on the leaderboard:
    - <a href="https://arxiv.org/abs/2103.00020" target="_blank">CLIPScore</a> (Frame Consistency) - the average cosine similarity between all pairs of CLIP image embeddings computed on all frames of output videos.
    - <a href="https://arxiv.org/abs/2103.00020" target="_blank">CLIPScore</a> (Text Alignment) - the average CLIP score between all frames of output videos and corresponding edited prompts.
    - <a href="https://arxiv.org/abs/2305.01569" target="_blank">PickScore</a> - the average PickScore between all frames of output videos.

    After all submissions are uploaded, we will run a human-evaluation of all submitted videos. Specifically, we will have human labelers compare all submitted videos. Labelers will evaluate videos on the following criteria:

    - Text alignment: How well does the generated video match the caption?
    - Structure: How well does the generated video preserve the structure of the original video?
    - Quality: Aesthetically, how good is this video?

    We will choose a winner and a runner-up based on the human evaluation results. 
    </font>

    The **bold** method name indicates that the implementation is **official** (by the author / developer of the original method).""")
        
        with gr.Row():
            # Initial contents come from the module-level `leaderboard` snapshot;
            # COLS/TYPES keep header order and per-column rendering in sync.
            leaderboard_table = gr.components.Dataframe(value=leaderboard, headers=COLS,
                                                        datatype=TYPES, max_rows=10)
        with gr.Row():
            # Manual refresh button re-queries the leaderboard on click.
            refresh_button = gr.Button("Refresh")
            refresh_button.click(refresh, inputs=[], outputs=[leaderboard_table]) 
        # Also refresh the table automatically on every page (re)load.
        block.load(refresh, inputs=[], outputs=[leaderboard_table])

    with gr.Tab("Baseline Demo"): 
        with gr.Row():
            gr.Markdown(f"""Some examples generated by {BASELINE} are shown below.""")
        with gr.Row():
            # Left column: the unedited source video and its describing prompt.
            with gr.Column():
                source_video = gr.Video(type="file", label='Source Video', format="mp4", interactive=True)
                source_prompt = gr.Textbox(label='Source Prompt',
                                # info='A good prompt describes each frame and most objects in video. Especially, it has the object or attribute that we want to edit or preserve.',
                                max_lines=2,
                                placeholder='Example: "A cat in the grass in the sun."',
                                # value='A cat in the grass in the sun.'
                                )

            # Right column: the edited result and the prompt that drove the edit.
            with gr.Column():
                result = gr.Video(type="file", label='Edited Video', format="mp4", interactive=True)
                editing_prompt = gr.Textbox(label='Editing Prompt',
                                    # info='A reasonable composition of video may achieve better results(e.g., "sunflower" video with "Van Gogh" prompt is better than "sunflower" with "Monet")',
                                    max_lines=2,
                                    placeholder='Example: "A dog in the grass in the sun."',
                                    # value='A dog in the grass in the sun.'
                                    )
        
        with gr.Row():
            # Project-local example list; each entry fills the three inputs and
            # load_edited_video resolves the matching pre-rendered output.
            # Outputs are cached so examples play without recomputation.
            from example import examples
            gr.Examples(examples=examples,
                    inputs=[source_video, source_prompt, editing_prompt],
                    outputs=result,
                    fn=load_edited_video,
                    cache_examples=True,
                    )
# Start the Gradio server (blocking call).
block.launch()