|
import pandas as pd |
|
import gradio as gr |
|
import csv |
|
import json |
|
import os |
|
import shutil |
|
from huggingface_hub import Repository |
|
|
|
# Hugging Face access token read from the environment; None when the
# HUGGINGFACE_TOKEN variable is not set.
HF_TOKEN = os.environ.get("HUGGINGFACE_TOKEN")

# Leaderboard columns, in the order they are returned by get_df().
MODEL_INFO = [
    "Model",
    "Avg",
    "Visual Quality",
    "Temporal Consistency",
    "Dynamic Degree",
    "Text-to-Video Alignment",
    "Factual Consistency",
]

# One type per entry of MODEL_INFO: the "Model" column holds a markdown link
# (see get_df), every other column is numeric.  Derived from MODEL_INFO so the
# two lists cannot drift apart -- the previous hand-written list had only six
# entries for seven columns.
# NOTE(review): presumably passed as the Dataframe `datatype` list by the UI
# code, which is not visible here -- confirm.
DATA_TITILE_TYPE = ['markdown'] + ['number'] * (len(MODEL_INFO) - 1)

# Name of the dataset repository; also used as the local checkout directory.
SUBMISSION_NAME = "VideoScore-Leaderboard"
# URLs always use "/" as the separator, so build this with string formatting
# rather than os.path.join (which uses the platform's path separator).
SUBMISSION_URL = f"https://huggingface.co/datasets/hexuan21/{SUBMISSION_NAME}"
# Results file inside the local checkout of the dataset repository.
CSV_DIR = f"./{SUBMISSION_NAME}/leaderboard_res.csv"

# Alias: the columns returned by get_df() are exactly MODEL_INFO (same list
# object, not a copy).
COLUMN_NAMES = MODEL_INFO
|
|
|
# Markdown/HTML header shown at the top of the leaderboard page: title,
# project links, a short description of the five evaluation dimensions and
# the prompt source, followed by a hit-counter badge.
LEADERBORAD_INTRODUCTION = """# VideoScore Leaderboard

🏆 Welcome to the **VideoScore Leaderboard**! <br>

<div style="display: flex; flex-wrap: wrap; align-items: center; gap: 10px;">
<a href='https://arxiv.org/abs/2406.15252'>📃Paper</a>
<a href='https://tiger-ai-lab.github.io/VideoScore/'>🌐Website</a>
<a href='https://github.com/TIGER-AI-Lab/VideoScore'>💻Github</a>
<a href='https://huggingface.co/datasets/TIGER-Lab/VideoFeedback'>🛢️VideoFeedback (Dataset)</a>
<a href='https://huggingface.co/TIGER-Lab/VideoScore'>🤗VideoScore (Model)</a>
<a href='https://huggingface.co/spaces/TIGER-Lab/VideoScore'>🤗Demo</a>
<a href='https://api.wandb.ai/links/xuanhe/ptohlfcx'>📉Wandb</a>
</div>

The leaderboard covers many popular text-to-video generative models and evaluates them on 5 dimensions: <br>

"Visual Quality", "Temporal Consistency", "Dynamic Degree", "Text-to-Video Alignment", "Factual Consistency"

We sample 200 prompts from <a href="https://arxiv.org/abs/2403.06098">VidProM</a> to generate 200 videos using various T2V models (for those closed-source model, we generate 100).

<a href='https://hits.seeyoufarm.com'><img src='https://hits.seeyoufarm.com/api/count/incr/badge.svg?url=https%3A%2F%2Fhuggingface.co%2Fspaces%2FTIGER-Lab%2FVideoScore-Leaderboard&count_bg=%23C7C83D&title_bg=%23555555&icon=&icon_color=%23E7E7E7&title=hits&edge_flat=false'></a>
"""

# Placeholder text for the table section; currently just a newline.
TABLE_INTRODUCTION = """
"""

# Placeholder for additional leaderboard information; currently just a newline.
LEADERBORAD_INFO = """
"""
|
|
|
# Label displayed next to the copyable citation snippet.
CITATION_BUTTON_LABEL = "Copy the following snippet to cite the t2v models and related papers"

# BibTeX entries for VideoScore itself, the evaluated text-to-video models,
# and the VidProM prompt dataset.  Kept as a raw string so that BibTeX
# backslash commands such as \url are not treated as Python escapes.
# The `morphstudio` entry previously carried the copy-pasted title "Kling";
# it now names Morph Studio, matching its URL.
CITATION_BUTTON_TEXT = r"""
@article{he2024videoscore,
title = {VideoScore: Building Automatic Metrics to Simulate Fine-grained Human Feedback for Video Generation},
author = {He, Xuan and Jiang, Dongfu and Zhang, Ge and Ku, Max and Soni, Achint and Siu, Sherman and Chen, Haonan and Chandra, Abhranil and Jiang, Ziyan and Arulraj, Aaran and Wang, Kai and Do, Quy Duc and Ni, Yuansheng and Lyu, Bohan and Narsupalli, Yaswanth and Fan, Rongqi and Lyu, Zhiheng and Lin, Yuchen and Chen, Wenhu},
journal = {ArXiv},
year = {2024},
volume={abs/2406.15252},
url = {https://arxiv.org/abs/2406.15252},
}

@misc{pika,
title = {Pika {L}ab},
howpublished = {\url{https://www.pika.art/}},
}

@article{text2video-zero,
title={Text2Video-Zero: Text-to-Image Diffusion Models are Zero-Shot Video Generators},
author={Khachatryan, Levon and Movsisyan, Andranik and Tadevosyan, Vahram and Henschel, Roberto and Wang, Zhangyang and Navasardyan, Shant and Shi, Humphrey},
journal={arXiv preprint arXiv:2303.13439},
year={2023}
}

@misc{chen2024videocrafter2,
title={VideoCrafter2: Overcoming Data Limitations for High-Quality Video Diffusion Models},
author={Haoxin Chen and Yong Zhang and Xiaodong Cun and Menghan Xia and Xintao Wang and Chao Weng and Ying Shan},
year={2024},
eprint={2401.09047},
archivePrefix={arXiv},
primaryClass={cs.CV}
}

@article{Wang2023ModelScopeTT,
title={ModelScope Text-to-Video Technical Report},
author={Jiuniu Wang and Hangjie Yuan and Dayou Chen and Yingya Zhang and Xiang Wang and Shiwei Zhang},
journal={ArXiv},
year={2023},
volume={abs/2308.06571},
url={https://api.semanticscholar.org/CorpusID:260887737}
}

@article{wang2023lavie,
title={LAVIE: High-Quality Video Generation with Cascaded Latent Diffusion Models},
author={Wang, Yaohui and Chen, Xinyuan and Ma, Xin and Zhou, Shangchen and Huang, Ziqi and Wang, Yi and Yang, Ceyuan and He, Yinan and Yu, Jiashuo and Yang, Peiqing and others},
journal={arXiv preprint arXiv:2309.15103},
year={2023}
}

@article{guo2023animatediff,
title={AnimateDiff: Animate Your Personalized Text-to-Image Diffusion Models without Specific Tuning},
author={Guo, Yuwei and Yang, Ceyuan and Rao, Anyi and Liang, Zhengyang and Wang, Yaohui and Qiao, Yu and Agrawala, Maneesh and Lin, Dahua and Dai, Bo},
journal={International Conference on Learning Representations},
year={2024}
}

@article{guo2023sparsectrl,
title={SparseCtrl: Adding Sparse Controls to Text-to-Video Diffusion Models},
author={Guo, Yuwei and Yang, Ceyuan and Rao, Anyi and Agrawala, Maneesh and Lin, Dahua and Dai, Bo},
journal={arXiv preprint arXiv:2311.16933},
year={2023}
}

@article{he2022lvdm,
title={Latent Video Diffusion Models for High-Fidelity Long Video Generation},
author={Yingqing He and Tianyu Yang and Yong Zhang and Ying Shan and Qifeng Chen},
year={2022},
eprint={2211.13221},
archivePrefix={arXiv},
primaryClass={cs.CV}
}

@software{Mullan_Hotshot-XL_2023,
author = {Mullan, John and Crawbuck, Duncan and Sastry, Aakash},
license = {Apache-2.0},
month = oct,
title = {{Hotshot-XL}},
url = {https://github.com/hotshotco/hotshot-xl},
version = {1.0.0},
year = {2023}
}

@misc{zeroscope,
title = {ZeroScope v2},
author = {Spencer Sterling},
url = {https://huggingface.co/cerspense/zeroscope_v2_576w},
year={2024},
}

@article{yuan2024magictime,
title={MagicTime: Time-lapse Video Generation Models as Metamorphic Simulators},
author={Yuan, Shenghai and Huang, Jinfa and Shi, Yujun and Xu, Yongqi and Zhu, Ruijie and Lin, Bin and Cheng, Xinhua and Yuan, Li and Luo, Jiebo},
journal={arXiv preprint arXiv:2404.05014},
year={2024}
}

@misc{chen2023videocrafter1,
title={VideoCrafter1: Open Diffusion Models for High-Quality Video Generation},
author={Haoxin Chen and Menghan Xia and Yingqing He and Yong Zhang and Xiaodong Cun and Shaoshu Yang and Jinbo Xing and Yaofang Liu and Qifeng Chen and Xintao Wang and Chao Weng and Ying Shan},
year={2023},
eprint={2310.19512},
archivePrefix={arXiv},
primaryClass={cs.CV}
}

@article{xing2023dynamicrafter,
title={DynamiCrafter: Animating Open-domain Images with Video Diffusion Priors},
author={Jinbo Xing and Menghan Xia and Yong Zhang and Haoxin Chen and Xintao Wang and Tien-Tsin Wong and Ying Shan},
year={2023},
eprint={2310.12190},
archivePrefix={arXiv},
primaryClass={cs.CV}
}

@article{ma2024latte,
title={Latte: Latent Diffusion Transformer for Video Generation},
author={Ma, Xin and Wang, Yaohui and Jia, Gengyun and Chen, Xinyuan and Liu, Ziwei and Li, Yuan-Fang and Chen, Cunjian and Qiao, Yu},
journal={arXiv preprint arXiv:2401.03048},
year={2024}
}


@software{opensora,
author = {Zangwei Zheng and Xiangyu Peng and Tianji Yang and Chenhui Shen and Shenggui Li and Hongxin Liu and Yukun Zhou and Tianyi Li and Yang You},
title = {Open-Sora: Democratizing Efficient Video Production for All},
month = {March},
year = {2024},
url = {https://github.com/hpcaitech/Open-Sora}
}

@software{pku_yuan_lab_and_tuzhan_ai_etc_2024_10948109,
author = {PKU-Yuan Lab and Tuzhan AI etc.},
title = {Open-Sora-Plan},
month = apr,
year = 2024,
publisher = {GitHub},
doi = {10.5281/zenodo.10948109},
url = {https://doi.org/10.5281/zenodo.10948109}
}

@article{jin2023unified,
title={Unified Language-Vision Pretraining in LLM with Dynamic Discrete Visual Tokenization},
author={Jin, Yang and Xu, Kun and Xu, Kun and Chen, Liwei and Liao, Chao and Tan, Jianchao and Mu, Yadong and others},
journal={arXiv preprint arXiv:2309.04669},
year={2023}
}

@article{jin2024video,
title={Video-LaVIT: Unified Video-Language Pre-training with Decoupled Visual-Motional Tokenization},
author={Jin, Yang and Sun, Zhicheng and Xu, Kun and Chen, Liwei and Jiang, Hao and Huang, Quzhe and Song, Chengru and Liu, Yuliang and Zhang, Di and Song, Yang and others},
journal={arXiv preprint arXiv:2402.03161},
year={2024}
}



@misc{gen2,
title = {Gen-2},
howpublished = {\url{https://runwayml.com/research/gen-2?utm_source=creatorstoolbox.io/}},
}

@misc{morphstudio,
title = {Morph Studio},
howpublished = {\url{https://www.morphstudio.com/}},
}

@misc{kling,
title = {Kling},
howpublished = {\url{https://kling.kuaishou.com/}},
}


@article{wang2024vidprom,
title={Vidprom: A million-scale real prompt-gallery dataset for text-to-video diffusion models},
author={Wang, Wenhao and Yang, Yi},
journal={arXiv preprint arXiv:2403.06098},
year={2024}
}

"""
|
|
|
def _normalize_model_link(cell):
    """Return a ``Model`` cell as a clean ``[name](url)`` markdown link.

    Surrounding whitespace is stripped and the cell is split on the *last*
    ``](`` so that names containing parentheses or brackets survive intact.
    Cells that are not strings (e.g. NaN for a missing value) are returned
    unchanged, and strings that are not markdown links are returned stripped
    instead of raising.
    """
    if not isinstance(cell, str):
        return cell
    text = cell.strip()
    label, sep, url = text.rpartition("](")
    if sep and label.startswith("[") and url.endswith(")"):
        return f"[{label[1:]}]({url[:-1]})"
    return text


def get_df():
    """Load the leaderboard results and return them ready for display.

    Steps:
      1. Create a ``Repository`` for the submission dataset (local directory
         ``SUBMISSION_NAME``, remote ``SUBMISSION_URL``, authenticated with
         ``HF_TOKEN``) and pull the latest changes.
      2. Read ``CSV_DIR`` into a DataFrame.
      3. Normalize the ``Model`` column to markdown links.
      4. Compute ``Avg`` as the row-wise mean of the five score columns,
         rounded to two decimals (any existing ``Avg`` column is overwritten).
      5. Sort by ``Avg`` in descending order.

    Returns:
        pandas.DataFrame: the sorted table restricted to ``COLUMN_NAMES``.
    """
    # NOTE(review): `Repository` and `use_auth_token` are deprecated in recent
    # huggingface_hub releases -- confirm the pinned version before upgrading.
    repo = Repository(
        local_dir=SUBMISSION_NAME,
        clone_from=SUBMISSION_URL,
        use_auth_token=HF_TOKEN,
    )
    repo.git_pull()

    df = pd.read_csv(CSV_DIR)
    df['Model'] = df['Model'].apply(_normalize_model_link)

    score_columns = [
        "Visual Quality",
        "Temporal Consistency",
        "Dynamic Degree",
        "Text-to-Video Alignment",
        "Factual Consistency",
    ]
    df['Avg'] = df[score_columns].mean(axis=1).round(2)
    df = df.sort_values(by=['Avg'], ascending=False)
    return df[COLUMN_NAMES]
|
|
|
|
|
def refresh_data():
    """Rebuild the leaderboard table and return it.

    Delegates entirely to ``get_df()``; whatever that call returns is passed
    back to the caller unchanged.
    """
    latest = get_df()
    return latest