|
import os |
|
import gradio as gr |
|
import pandas as pd |
|
|
|
|
|
# HTML link to the baseline method's repo, rendered inside Markdown.
# Fix: quote the href value (it was the only unquoted attribute in the file)
# and drop the no-op f-string prefix.
BASELINE = '<a target="_blank" href="https://github.com/showlab/loveu-tgve-2023" style="color: blue; text-decoration: underline;text-decoration-style: dotted;">Tune-A-Video (Baseline)</a>'

# Leaderboard column headers; order is the display order.
COLS = ["Method", "CLIPScore (Frame Consistency) ⬆️", "CLIPScore (Text Alignment) ⬆️", "PickScore ⬆️", "Human Preference ⬆️", "References"]

# Gradio datatype for each column, index-aligned with COLS.
TYPES = ["markdown", "number", "number", "number", "str", "markdown"]
|
|
|
|
|
def get_leaderboard():
    """Assemble the leaderboard table as a pandas DataFrame.

    Returns:
        pd.DataFrame with exactly the columns in ``COLS`` (in that order),
        one row per method, sorted by "PickScore ⬆️" descending. Rows that
        lack a PickScore (placeholder entries) get NaN and sort last.
    """
    all_data = []

    # Official Tune-A-Video baseline with its published automatic metrics.
    baseline_0 = {
        "Method": '**Tune-A-Video**',
        "CLIPScore (Frame Consistency) ⬆️": 0.92,
        "CLIPScore (Text Alignment) ⬆️": 27.12,
        "PickScore ⬆️": 20.36,
        # Human evaluation is run only after all submissions are in.
        "Human Preference ⬆️": '',
        "References": ','.join(['<a target="_blank" href="https://arxiv.org/abs/2212.11565" style="color: blue">Paper</a>',
                                '<a target="_blank" href="https://github.com/showlab/Tune-A-Video" style="color: blue">Code</a>',
                                '<a target="_blank" href="https://tuneavideo.github.io/" style="color: blue">Website</a>',
                                '<a target="_blank" href="https://huggingface.co/spaces/Tune-A-Video-library/Tune-A-Video-inference" style="color: blue">Demo</a>'])
    }
    # Placeholder row: metric columns are intentionally absent and become
    # NaN in from_records, so it sorts below scored methods.
    baseline_1 = {
        "Method": 'VideoCrafter (todo)',
        "References": ','.join(['<a target="_blank" href="https://github.com/VideoCrafter/VideoCrafter" style="color: blue">Code</a>',
                                '<a target="_blank" href="https://huggingface.co/spaces/VideoCrafter/VideoCrafter" style="color: blue">Demo</a>'])
    }
    all_data += [baseline_0, baseline_1]

    dataframe = pd.DataFrame.from_records(all_data)
    # NaN sorts last by default, keeping placeholder rows at the bottom.
    dataframe = dataframe.sort_values(by=['PickScore ⬆️'], ascending=False)
    # Restrict and order the columns to the display schema.
    dataframe = dataframe[COLS]
    return dataframe
|
|
|
# Initial table contents, computed once at import time; the UI re-fetches
# fresh data on page load and via the Refresh button.
leaderboard = get_leaderboard()
|
|
|
def refresh():
    """Build and return a fresh leaderboard DataFrame.

    Thin callback wired to both the Refresh button and the initial
    page load of the Gradio app.
    """
    return get_leaderboard()
|
|
|
def load_edited_video(source_video, *args):
    """Map a source video path to its pre-rendered edited counterpart.

    Args:
        source_video: Path of the source clip; only the file name's stem
            is used to look up the edited file.
        *args: Ignored; absorbs the extra prompt inputs that gr.Examples
            passes alongside the video.

    Returns:
        Path to ``files/<stem>-edit.mp4`` located next to this script.
    """
    # os.path.basename/splitext are robust to OS-specific separators and
    # to '.mp4' appearing inside the file name, unlike manual splitting.
    stem = os.path.splitext(os.path.basename(source_video))[0]
    return os.path.join(os.path.dirname(__file__), f"files/{stem}-edit.mp4")
|
|
|
|
|
# ---------------------------------------------------------------------------
# Gradio UI: two tabs — the competition leaderboard and a baseline demo.
# Runs at import time; block.launch() at the bottom starts the server.
# ---------------------------------------------------------------------------
block = gr.Blocks()

with block:
    with gr.Tab("Leaderboard"):
        with gr.Row():
            # Competition description. The f-string body is deliberately
            # unindented: leading whitespace would break Markdown rendering.
            gr.Markdown(f"""
# 🤗 LOVEU-TGVE @ CVPR 2023 Leaderboard
<font size="4">
<b>Welcome to the <a href="https://sites.google.com/view/loveucvpr23/track4" target="_blank">Text-Guided Video Editing (TGVE)</a> competition leaderboard of <a href="https://sites.google.com/view/loveucvpr23/home" target="_blank">LOVEU Workshop @ CVPR 2023</a>!</b>

Leveraging AI for video editing has the potential to unleash creativity for artists across all skill levels. The rapidly-advancing field of Text-Guided Video Editing (TGVE) is here to address this challenge. Recent works in this field include <a href="https://tuneavideo.github.io/" target="_blank">Tune-A-Video</a>, <a href="https://research.runwayml.com/gen2" target="_blank">Gen-2</a>, and <a href="https://dreamix-video-editing.github.io/" target="_blank">Dreamix</a>.
In this competition track, we provide a standard set of videos and prompts. As a researcher, you will develop a model that takes a video and a prompt for how to edit it, and your model will produce an edited video. For instance, you might be given a video of “a man is surfing inside the barrel of a wave,” and your model will edit the video to “a man is surfing on a wave made of aurora borealis.”

During the competition, evaluation results performed against the following 3 automatic metrics will be displayed on the leaderboard:
- <a href="https://arxiv.org/abs/2103.00020" target="_blank">CLIPScore</a> (Frame Consistency) - the average cosine similarity between all pairs of CLIP image embeddings computed on all frames of output videos.
- <a href="https://arxiv.org/abs/2103.00020" target="_blank">CLIPScore</a> (Text Alignment) - the average CLIP score between all frames of output videos and corresponding edited prompts.
- <a href="https://arxiv.org/abs/2305.01569" target="_blank">PickScore</a> - the average PickScore between all frames of output videos.

After all submissions are uploaded, we will run a human-evaluation of all submitted videos. Specifically, we will have human labelers compare all submitted videos. Labelers will evaluate videos on the following criteria:

- Text alignment: How well does the generated video match the caption?
- Structure: How well does the generated video preserve the structure of the original video?
- Quality: Aesthetically, how good is this video?

We will choose a winner and a runner-up based on the human evaluation results.
</font>

The **bold** method name indicates that the implementation is **official** (by the author / developer of the original method).""")

        with gr.Row():
            # Seeded with the module-level snapshot; refreshed below.
            # NOTE(review): `max_rows` and `gr.components.Dataframe` are
            # legacy Gradio 3.x usage — confirm against the pinned version.
            leaderboard_table = gr.components.Dataframe(value=leaderboard, headers=COLS,
                                                        datatype=TYPES, max_rows=10)
        with gr.Row():
            refresh_button = gr.Button("Refresh")
            # Rebuild the table on demand and once when the page loads.
            refresh_button.click(refresh, inputs=[], outputs=[leaderboard_table])
            block.load(refresh, inputs=[], outputs=[leaderboard_table])

    with gr.Tab("Baseline Demo"):
        with gr.Row():
            gr.Markdown(f"""Some examples generated by {BASELINE} are shown below.""")
        with gr.Row():
            with gr.Column():
                # NOTE(review): gr.Video(type="file") is a Gradio 3.x
                # argument — confirm against the pinned version.
                source_video = gr.Video(type="file", label='Source Video', format="mp4", interactive=True)
                source_prompt = gr.Textbox(label='Source Prompt',
                                           max_lines=2,
                                           placeholder='Example: "A cat in the grass in the sun."',
                                           )

            with gr.Column():
                result = gr.Video(type="file", label='Edited Video', format="mp4", interactive=True)
                editing_prompt = gr.Textbox(label='Editing Prompt',
                                           max_lines=2,
                                           placeholder='Example: "A dog in the grass in the sun."',
                                           )

        with gr.Row():
            # `example` is a sibling module shipping the demo triples
            # (video, source prompt, editing prompt); imported lazily here.
            from example import examples
            # cache_examples=True pre-runs load_edited_video for each
            # example so clicking one shows the edited clip instantly.
            gr.Examples(examples=examples,
                        inputs=[source_video, source_prompt, editing_prompt],
                        outputs=result,
                        fn=load_edited_video,
                        cache_examples=True,
                        )

block.launch()