import pandas as pd
import gradio as gr
import csv
import json
import os
import shutil
from huggingface_hub import Repository
HF_TOKEN = os.environ.get("HUGGINGFACE_TOKEN")
MODEL_INFO = [
"Model",
"Avg",
"Visual Quality",
"Temporal Consistency",
"Dynamic Degree",
"Text-to-Video Alignment",
"Factual Consistency"
]
DATA_TITILE_TYPE = ['markdown', 'number', 'number', 'number', 'number', 'number', 'number']  # one datatype per column in MODEL_INFO
SUBMISSION_NAME = "VideoScore-Leaderboard"
SUBMISSION_URL = os.path.join("https://huggingface.co/datasets/hexuan21/", SUBMISSION_NAME)
CSV_DIR = "./VideoScore-Leaderboard/leaderboard_res.csv"
COLUMN_NAMES = MODEL_INFO
LEADERBORAD_INTRODUCTION = """# VideoScore Leaderboard
πŸ† Welcome to the **VideoScore Leaderboard**! The leaderboard covers many popular text-to-video generative models and evaluates them on 4 dimensions: <br>
"Visual Quality", "Temporal Consistency", "Dynamic Degree", "Text-to-Video Alignment".
To demonstrate the performance of our VideoScore,
we use VideoScore to choose the best from videos with same prompt but different seeds.
Then we use some feature-based metrics mentioned in both <a href="https://arxiv.org/abs/2406.15252">VideoScore paper</a>
and <a href="https://arxiv.org/abs/2310.11440">EvalCrafter paper</a>,
see more info about these metrics in the second sheet "About" above.
<a href='https://hits.seeyoufarm.com'><img src='https://hits.seeyoufarm.com/api/count/incr/badge.svg?url=https%3A%2F%2Fhuggingface.co%2Fspaces%2Fhexuan21%2FVideoScore-Leaderboard&count_bg=%23C7C83D&title_bg=%23555555&icon=&icon_color=%23E7E7E7&title=hits&edge_flat=false'></a>
"""
TABLE_INTRODUCTION = """
"""
LEADERBORAD_INFO = """
Here is detailed information about the metrics used. <br>
<a href="https://arxiv.org/abs/2406.15252">VideoScore</a> and <a href="https://arxiv.org/abs/2310.11440">EvalCrafter</a> both
study the correlation between these feature-based metrics (such as CLIP-Score and SSIM) and human scoring of generated videos.
Some of these metrics correlate relatively well with human scores, while others correlate poorly. <br>
Below are the metrics for each dimension. Unless noted otherwise, the raw score of each metric lies in [0, 1] with larger being better, and is then scaled to [0, 100]. <br>
(1) Visual Quality = average(VQA_A, VQA_T) <br>
VQA_A and VQA_T are both from the EvalCrafter metrics suite. <br>
(2) Temporal Consistency = average(CLIP_Temp, Face_Consistency_Score, Warping_Error) <br>
CLIP_Temp, Face_Consistency_Score, and Warping_Error are all from the EvalCrafter metrics suite. <br>
Warping_Error is reported as "100*(1 - raw_result)" so that a larger score indicates better performance. <br>
(3) Dynamic Degree = average(SSIM_dyn, MSE_dyn) <br>
SSIM_dyn and MSE_dyn are both from VideoScore. <br>
SSIM_dyn is reported as "100*(1 - raw_result)" so that a larger score indicates better performance. <br>
MSE_dyn is reported as "100*(1 - raw_result/255^2)", since pixel values range from 0 to 255 and the theoretical maximum of MSE is 255*255. <br>
(4) Text-to-Video Alignment = average(CLIP-Score, BLIP-BLEU) <br>
CLIP-Score and BLIP-BLEU are both from the EvalCrafter metrics suite.
"""
CITATION_BUTTON_LABEL = "Copy the following snippet to cite the text-to-video models and the metrics used"
CITATION_BUTTON_TEXT = r"""
"""
def get_df():
    # Clone (or pull) the submission dataset repo so the local results CSV is up to date.
    repo = Repository(local_dir=SUBMISSION_NAME, clone_from=SUBMISSION_URL, use_auth_token=HF_TOKEN)
    repo.git_pull()
    df = pd.read_csv(CSV_DIR)
    # Rebuild the "Model" column as a markdown link of the form [name](url);
    # the CSV already stores it in that format, so this just re-emits it cleanly.
    df['Model'] = df['Model'].apply(lambda x: f"[{x.split(']')[0][1:]}]({x.split('(')[1][:-1]})")
    # Average of the five dimension scores, rounded to two decimals.
    df['Avg'] = df[["Visual Quality",
                    "Temporal Consistency",
                    "Dynamic Degree",
                    "Text-to-Video Alignment",
                    "Factual Consistency"]].mean(axis=1).round(2)
    df = df.sort_values(by=['Avg'], ascending=False)
    return df[COLUMN_NAMES]
def refresh_data():
    # Re-fetch the leaderboard CSV and rebuild the table.
    return get_df()
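# Illustration only (an assumption, since the rest of this file is not shown in this
# excerpt): a minimal sketch of how get_df() and refresh_data() could be wired into a
# Gradio Blocks UI. The tab names and the refresh button are hypothetical.
def _example_build_demo():
    with gr.Blocks() as demo:
        gr.Markdown(LEADERBORAD_INTRODUCTION)
        with gr.Tab("Leaderboard"):
            gr.Markdown(TABLE_INTRODUCTION)
            table = gr.Dataframe(value=get_df(), headers=COLUMN_NAMES,
                                 datatype=DATA_TITILE_TYPE, interactive=False)
            refresh_btn = gr.Button("Refresh")
            refresh_btn.click(fn=refresh_data, outputs=table)
        with gr.Tab("About"):
            gr.Markdown(LEADERBORAD_INFO)
    return demo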