# Constants for the Video-Bench leaderboard: column layout, the results file
# path, and the Markdown/BibTeX text blocks defined below.
#
# NOTE: several public names are misspelled (DATA_TITILE_TYPE, LEADERBORAD_*).
# They are kept as-is because other modules presumably import them by these
# names -- verify callers before renaming.

# Header of the single identifying column.
MODEL_INFO = ["Model"]

# Score columns: the four aggregate averages first, then the individual tasks.
TASK_INFO_v2 = [
    "Avg. All",
    "Avg. Video-Exclusive",
    "Avg. Prior-Knowledge QA",
    "Avg. Decision-Making",
    "ActivityNet",
    "MSVD",
    "MSRVTT",
    "TGIF",
    "Youcook2",
    "Ucfcrime",
    "MOT",
    "TVQA",
    "MV",
    "NBA",
    "Driving-exam",
    "Driving-decision-making",
    "SQA3D",
]

# The aggregate columns only (same names as the first four score columns).
AVG_INFO = [
    "Avg. All",
    "Avg. Video-Exclusive",
    "Avg. Prior-Knowledge QA",
    "Avg. Decision-Making",
]

# One type tag per column of COLUMN_NAMES: "markdown" for the model column,
# "number" for every score column. Derived from the column lists so the two
# can never drift out of sync (the value equals the former 18-item literal).
# Presumably consumed as a per-column datatype list by the table widget --
# TODO confirm against the caller.
DATA_TITILE_TYPE = ["markdown"] * len(MODEL_INFO) + ["number"] * len(TASK_INFO_v2)

# Location of the results CSV, relative to the working directory.
CSV_DIR = "./file/result.csv"

# Previously: COLUMN_NAMES = MODEL_INFO + TASK_INFO
COLUMN_NAMES = MODEL_INFO + TASK_INFO_v2

# Markdown shown at the top of the leaderboard page.
LEADERBORAD_INTRODUCTION = """# Video-Bench Leaderboard

Welcome to the leaderboard of the Video-Bench! 🏆

Video-Bench consists of 15K questions with human-like video for evaluating Video-LLMs, covering three-level and 13 evaluation dimensions including both the spatial and temporal understanding.
Please refer to [our paper](https://arxiv.org/abs/2311.16103) for more details.
"""

# Markdown instructions for submitting results.
# NOTE(review): the example sentence mentions Video-ChatGPT while the steps use
# 'Chat-UniVi-7B'; the wording is left untouched here -- pick one model name.
SUBMIT_INTRODUCTION = """# Submit Introduction

Obtain `Video-Bench-Input.json` from our [github repository](https://github.com/PKU-YuanGroup/Video-Bench#%EF%B8%8F-evaluate-your-own-model) after evaluation.

## Submit Example

For example, if you want to upload Video-ChatGPT's result in the leaderboard, you need to:

1. Fill in 'Chat-UniVi-7B' in 'Model Name' if it is your first time to submit your result (You can leave 'Revision Model Name' blank).
2. Fill in 'Chat-UniVi-7B' in 'Revision Model Name' if you want to update your result (You can leave 'Model Name' blank).
3. Fill in 'https://github.com/x/x' in 'Model Link'.
4. Upload `Video-Bench-Input.json`.
5. Click the 'Submit Eval' button.
6. Click 'Refresh' to obtain the uploaded leaderboard.
"""

# Short caption for the results table.
TABLE_INTRODUCTION = """In the table below, we summarize each task performance of all the models.
We use accuracy(%) as the primary evaluation metric for each task.
"""

# NOTE(review): this paragraph describes SEED-Bench (19K questions, 12
# dimensions), not Video-Bench -- it looks carried over from another
# leaderboard. The text is kept verbatim; replace it with the Video-Bench
# description.
LEADERBORAD_INFO = """
Based on powerful Large Language Models (LLMs), recent generative Multimodal Large Language Models (MLLMs) have gained prominence as a pivotal research area, exhibiting remarkable capability for both comprehension and generation.
In this work, we address the evaluation of generative comprehension in MLLMs as a preliminary step towards a comprehensive assessment of generative models, by introducing a benchmark named SEED-Bench.
SEED-Bench consists of 19K multiple choice questions with accurate human annotations (x6 larger than existing benchmarks), which spans 12 evaluation dimensions including the comprehension of both the image and video modality.
We develop an advanced pipeline for generating multiple-choice questions that target specific evaluation dimensions, integrating both automatic filtering and manual verification processes.
Multiple-choice questions with groundtruth options derived from human annotation enables an objective and efficient assessment of model performance, eliminating the need for human or GPT intervention during evaluation.
We further evaluate the performance of 18 models across all 12 dimensions, covering both the spatial and temporal understanding.
By revealing the limitations of existing MLLMs through evaluation results, we aim for SEED-Bench to provide insights for motivating future research.
"""

CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"

# BibTeX entry for the Video-Bench paper. Raw string so that any backslashes
# added to the entry later are kept literally. The stray trailing "}" of the
# previous version was removed so the entry's braces balance.
CITATION_BUTTON_TEXT = r"""@misc{ning2023videobench,
      title={Video-Bench: A Comprehensive Benchmark and Toolkit for Evaluating Video-based Large Language Models},
      author={Munan Ning and Bin Zhu and Yujia Xie and Bin Lin and Jiaxi Cui and Lu Yuan and Dongdong Chen and Li Yuan},
      year={2023},
      eprint={2311.16103},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}"""