import os
import gradio as gr
import pandas as pd
BASELINE = 'Tune-A-Video (Baseline)'
COLS = ["Method", "Human Eval (Text Alignment) ⬆️", "Human Eval (Structure) ⬆️", "Human Eval (Quality) ⬆️", "Human Eval (Avg.) ⬆️",
"References"]
COLS_AUTO = ["Method", "CLIPScore (Text Alignment) ⬆️", "CLIPScore (Frame Consistency) ⬆️", "PickScore ⬆️"]
TYPES = ["markdown", "number", "number", "number", "number", "markdown"]
TYPES_AUTO = ["markdown", "number", "number", "number"]
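# The automated metrics in COLS_AUTO are described in the leaderboard text further
# down. The helper below is a minimal, illustrative sketch (not used by this app) of
# how the two CLIP-based metrics could be computed with the openai/CLIP package;
# the model choice ("ViT-B/32"), the 0-100 scaling, and the assumption that frames
# are already extracted as PIL images are ours. PickScore (not shown) would follow a
# similar image/text embedding-similarity pattern with its own model.
def compute_clip_scores(frames, edited_prompt, device="cpu"):
    """Return (frame_consistency, text_alignment) for the PIL frames of one video (>= 2 frames)."""
    import torch
    import clip  # https://github.com/openai/CLIP

    model, preprocess = clip.load("ViT-B/32", device=device)
    with torch.no_grad():
        # Embed every frame and L2-normalise so dot products are cosine similarities.
        images = torch.stack([preprocess(frame) for frame in frames]).to(device)
        image_feats = model.encode_image(images).float()
        image_feats = image_feats / image_feats.norm(dim=-1, keepdim=True)

        # Frame consistency: mean cosine similarity over all pairs of distinct frames.
        sims = image_feats @ image_feats.T
        n = sims.shape[0]
        frame_consistency = (sims.sum() - n) / (n * (n - 1))

        # Text alignment: mean similarity between each frame and the edited prompt.
        tokens = clip.tokenize([edited_prompt]).to(device)
        text_feats = model.encode_text(tokens).float()
        text_feats = text_feats / text_feats.norm(dim=-1, keepdim=True)
        text_alignment = (image_feats @ text_feats.T).mean()

    return 100 * frame_consistency.item(), 100 * text_alignment.item()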
def get_leaderboard():
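    """Return the human-eval and automated-metrics leaderboards as two DataFrames."""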
from result import submission_results
all_data = []
    baseline_0 = {
        "Method": '**Tune-A-Video**',
        "CLIPScore (Frame Consistency) ⬆️": 92.40,
        "CLIPScore (Text Alignment) ⬆️": 27.12,
        "PickScore ⬆️": 20.36,
        "References": ','.join(['Paper',
                                'Code',
                                'Website',
                                'Demo'])
    }
    baseline_1 = {
        "Method": 'VideoCrafter',
        "CLIPScore (Frame Consistency) ⬆️": 88.51,
        "CLIPScore (Text Alignment) ⬆️": 25.55,
        "PickScore ⬆️": 19.17,
        "References": ','.join(['Code',
                                'Demo'])
    }
all_data += [baseline_0, baseline_1]
all_data += submission_results
dataframe = pd.DataFrame.from_records(all_data)
dataframe = dataframe.sort_values(by=['Human Eval (Avg.) ⬆️'], ascending=False)
print(dataframe)
dataframe_human = dataframe[COLS]
dataframe_auto = dataframe[COLS_AUTO]
dataframe_auto = dataframe_auto.sort_values(by=['CLIPScore (Text Alignment) ⬆️'], ascending=False)
return dataframe_human, dataframe_auto
leaderboard, leaderboard_auto = get_leaderboard()
def refresh():
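    """Rebuild both leaderboard tables; wired to the Refresh button and to page load."""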
return get_leaderboard()
def load_edited_video(source_video, *args):
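    """Return the path to the pre-rendered edited clip that pairs with `source_video`."""
    # Each example "<name>.mp4" has its edited result stored as "files/<name>-edit.mp4".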
result = source_video.split('/')[-1].split('.mp4')[0] + '-edit.mp4'
return os.path.join(os.path.dirname(__file__), f"files/{result}")
block = gr.Blocks()
with block:
with gr.Tab("Leaderboard"):
with gr.Row():
gr.Markdown(f"""
# 🤗 LOVEU-TGVE @ CVPR 2023 Leaderboard
Welcome to the Text-Guided Video Editing (TGVE) competition leaderboard of LOVEU Workshop @ CVPR 2023!
Leveraging AI for video editing has the potential to unleash creativity for artists across all skill levels. The rapidly-advancing field of Text-Guided Video Editing (TGVE) is here to address this challenge. Recent works in this field include Tune-A-Video, Gen-2, and Dreamix.
In this competition track, we provide a standard set of videos and prompts. As a researcher, you will develop a model that takes a video and a prompt for how to edit it, and your model will produce an edited video. For instance, you might be given a video of “a man is surfing inside the barrel of a wave,” and your model will edit the video to “a man is surfing on a wave made of aurora borealis.”
To participate in the contest, you will submit the videos generated by your model. As you develop your model, you may want to visually evaluate your results and use automated metrics such as the CLIPScore and PickScore to track your progress:
- CLIPScore (Frame Consistency) - the average cosine similarity between all pairs of CLIP image embeddings computed on all frames of output videos.
- CLIPScore (Text Alignment) - the average CLIP score between all frames of output videos and corresponding edited prompts.
- PickScore - the average PickScore between all frames of output videos.
After all submissions are uploaded, we will run a human-evaluation of all submitted videos. Specifically, we will have human labelers compare all submitted videos to the baseline videos that were edited with the Tune-A-Video model. Labelers will evaluate videos on the following criteria:
- Text alignment: Which video better matches the caption?
- Structure: Which video better preserves the structure of the input video?
- Quality: Aesthetically, which video is better?
We will choose a winner and a runner-up based on the human evaluation results.
The **bold** method name indicates that the implementation is **official** (by the author / developer of the original method).""")
with gr.Row():
leaderboard_table = gr.components.Dataframe(value=leaderboard, headers=COLS,
datatype=TYPES, max_rows=10)
with gr.Accordion("Expand for automated metrics results", open=False):
with gr.Row():
leaderboard_table_auto = gr.components.Dataframe(value=leaderboard_auto, headers=COLS_AUTO,
datatype=TYPES_AUTO, max_rows=10)
with gr.Row():
refresh_button = gr.Button("Refresh")
refresh_button.click(refresh, inputs=[], outputs=[leaderboard_table, leaderboard_table_auto])
block.load(refresh, inputs=[], outputs=[leaderboard_table, leaderboard_table_auto])
with gr.Tab("Baseline Demo"):
with gr.Row():
gr.Markdown(f"""Some examples generated by {BASELINE} are shown below.""")
with gr.Row():
with gr.Column():
source_video = gr.Video(type="file", label='Source Video', format="mp4", interactive=True)
source_prompt = gr.Textbox(label='Source Prompt',
# info='A good prompt describes each frame and most objects in video. Especially, it has the object or attribute that we want to edit or preserve.',
max_lines=2,
placeholder='Example: "A cat in the grass in the sun."',
# value='A cat in the grass in the sun.'
)
with gr.Column():
result = gr.Video(type="file", label='Edited Video', format="mp4", interactive=True)
editing_prompt = gr.Textbox(label='Editing Prompt',
# info='A reasonable composition of video may achieve better results(e.g., "sunflower" video with "Van Gogh" prompt is better than "sunflower" with "Monet")',
max_lines=2,
placeholder='Example: "A dog in the grass in the sun."',
# value='A dog in the grass in the sun.'
)
with gr.Row():
from example import examples
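            # cache_examples=True makes Gradio run load_edited_video on every example at
            # startup and cache the outputs, so clicking an example plays the pre-rendered clip.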
gr.Examples(examples=examples,
inputs=[source_video, source_prompt, editing_prompt],
outputs=result,
fn=load_edited_video,
cache_examples=True,
)
block.launch()