# constants.py: constants for the SEED-Bench Leaderboard
MODEL_INFO = ["Model Type", "Model", "Language Model"]
MODEL_INFO_V2 = ["Model", "Language Model"]
MODEL_SIZE = ["<10B", ">=10B"]
DIMENSION_LEVEL = ["L1", "L2", "L3"]
LEADERBOARD_VERSION = ["Version1", "Version2"]
TASK_INFO = ["Avg. All", "Avg. Img", "Avg. Video", "Scene Understanding", "Instance Identity", "Instance Attribute", "Instance Location", "Instance Counting", "Spatial Relation", "Instance Interaction", "Visual Reasoning", "Text Recognition", "Action Recognition", "Action Prediction", "Procedure Understanding"]
TASK_V2_INFO = ["Avg. P1", "Avg. P2", "Avg. P3", "Scene Understanding", "Instance Identity", "Instance Attribute", "Instance Location", "Instance Counting", "Spatial Relation", "Instance Interaction", "Visual Reasoning", "Text Recognition", "Celebrity Recognition", "Landmark Recognition", "Chart Understanding", "Visual Referring Expression", "Science Knowledge", "Emotion Recognition", "Visual Mathematics", "Difference Spotting", "Meme Comprehension", "Global Video Understanding", "Action Recognition", "Action Prediction", "Procedure Understanding", "In-Context Captioning", "Interleaved Image-Text Analysis", "Text-to-Image Generation", "Next Image Prediction", "Text-Image Creation"]
AVG_INFO = ["Avg. All", "Avg. Img", "Avg. Video"]
AVG_V2_INFO = ["Avg. P1", "Avg. P2", "Avg. P3"]
DATA_TITILE_TYPE = ["markdown", "markdown", "markdown", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number"]
DATA_TITILE_V2_TYPE = ["markdown", "markdown", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number"]
CSV_DIR = "./file/result.csv"
CSV_V2_DIR = "./file/result_v2.csv"
COLUMN_NAMES = MODEL_INFO + TASK_INFO
COLUMN_V2_NAMES = MODEL_INFO_V2 + TASK_V2_INFO
DATA_NUM = [3158, 1831, 4649, 978, 2447, 657, 97, 331, 85, 1740, 2077, 1192]
DATA_NUM_V2 = [3158, 1831, 4649, 978, 2447, 657, 97, 331, 435, 330, 500, 501, 199, 277, 501, 132, 501, 159, 1594, 1509, 1225, 1023, 120, 49, 1008, 81, 79]
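
# A minimal sanity-check sketch added for illustration (not part of the original
# app logic): the per-column type lists presumably correspond one-to-one with the
# column names passed to the table widget, and the CSV at CSV_DIR is assumed to
# carry the COLUMN_NAMES header. `pandas` is assumed to be available as a
# dependency of the leaderboard app.
def _check_leaderboard_config():
    assert len(DATA_TITILE_TYPE) == len(COLUMN_NAMES), "v1 type/column count mismatch"
    assert len(DATA_TITILE_V2_TYPE) == len(COLUMN_V2_NAMES), "v2 type/column count mismatch"

    import pandas as pd  # assumed dependency of the leaderboard app
    df = pd.read_csv(CSV_DIR)
    missing = [c for c in COLUMN_NAMES if c not in df.columns]
    return {"v1_rows": len(df), "missing_v1_columns": missing}


if __name__ == "__main__":
    # Running this module directly performs the check without affecting the app
    # at import time.
    print(_check_leaderboard_config())
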
LEADERBORAD_INTRODUCTION = """# SEED-Bench Leaderboard
Welcome to the SEED-Bench leaderboard! 🏆
SEED-Bench consists of 19K multiple-choice questions with accurate human annotations for evaluating Multimodal LLMs, covering 12 evaluation dimensions including both spatial and temporal understanding.
Please refer to [our paper](https://arxiv.org/abs/2307.16125) for more details.
"""
SUBMIT_INTRODUCTION = """# Introduction to Submitting on SEED-Bench v1
1. Obtain the JSON result file from our [github repository](https://github.com/AILab-CVC/SEED-Bench#leaderboard-submit) after evaluation. For example, you can obtain InstructBLIP's JSON file as results/results.json after running:
```shell
python eval.py --model instruct_blip --anno_path SEED-Bench.json --output-dir results
```
2. If you want to update model performance by uploading new results, please ensure that 'Revision Model Name' matches the model name shown in the leaderboard. For example, if you want to modify InstructBLIP's performance, you need to fill in 'InstructBLIP' in 'Revision Model Name'.
3. Please provide the correct link to your model's repository for each submission.
4. For 'Evaluation Dimension', you can choose 'All', 'Image', or 'Video'; the results of dimensions that are not evaluated will be set to zero.
5. After clicking 'Submit Eval', you can click 'Refresh' to obtain the latest result in the leaderboard.
## Submit Example
For example, if you want to upload InstructBLIP's result in the leaderboard, you need to:
1. Fill in 'InstructBLIP' in 'Model Name' if this is your first time submitting your result (you can leave 'Revision Model Name' blank).
2. Fill in 'InstructBLIP' in 'Revision Model Name' if you want to update your result (you can leave 'Model Name' blank).
3. Select 'ImageLLM' in 'Model Type'.
4. Fill in 'https://github.com/salesforce/LAVIS' in 'Model Link'.
5. Select 'Flan-T5-XL' in 'LLM Type'.
6. Select 'All' in 'Evaluation Dimension'.
7. Upload results.json.
8. Click the 'Submit Eval' button.
9. Click 'Refresh' to obtain the updated leaderboard.
"""
TABLE_INTRODUCTION = """In the table below, we summarize the per-task performance of all models.
We use accuracy (%) as the primary evaluation metric for each task.
"""
LEADERBORAD_INFO = """
Based on powerful Large Language Models (LLMs), recent generative Multimodal Large Language Models (MLLMs) have gained prominence as a pivotal research area, exhibiting remarkable capability for both comprehension and generation.
In this work, we address the evaluation of generative comprehension in MLLMs as a preliminary step towards a comprehensive assessment of generative models, by introducing a benchmark named SEED-Bench.
SEED-Bench consists of 19K multiple-choice questions with accurate human annotations (6x larger than existing benchmarks), spanning 12 evaluation dimensions including the comprehension of both image and video modalities.
We develop an advanced pipeline for generating multiple-choice questions that target specific evaluation dimensions, integrating both automatic filtering and manual verification processes.
Multiple-choice questions with ground-truth options derived from human annotation enable an objective and efficient assessment of model performance, eliminating the need for human or GPT intervention during evaluation.
We further evaluate the performance of 18 models across all 12 dimensions, covering both spatial and temporal understanding.
By revealing the limitations of existing MLLMs through evaluation results, we aim for SEED-Bench to provide insights for motivating future research.
"""
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
CITATION_BUTTON_TEXT = r"""@article{li2023seed,
title={SEED-Bench: Benchmarking Multimodal LLMs with Generative Comprehension},
author={Li, Bohao and Wang, Rui and Wang, Guangzhi and Ge, Yuying and Ge, Yixiao and Shan, Ying},
journal={arXiv preprint arXiv:2307.16125},
year={2023}
}"""