# This .py file stores the constants used by the SEED-Bench leaderboard.
MODEL_INFO = ["Model Type", "Model", "Language Model"]
TASK_INFO = ["Scene Understanding", "Instance Identity", "Instance Attributes", "Instance Localization", "Instance Counting", "Spatial Relation", "Instance Interaction", "Visual Reasoning", "Text Recognition", "Avg. Img", "Action Recognition", "Action Prediction", "Procedure Understanding", "Avg. Video", "Avg. All"]
AVG_INFO = ["Avg. Img", "Avg. Video", "Avg. All"]
DATA_TITILE_TYPE = ["markdown", "markdown", "markdown", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number"]
CSV_DIR = "./file/result.csv"
COLUMN_NAMES = MODEL_INFO + TASK_INFO
DATA_NUM = [3158, 1831, 4649, 978, 2447, 657, 97, 331, 85, 1740, 2077, 1192]
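
# --- Illustrative usage sketch (added example; not part of the original file) ---
# The constants above are typically consumed when the leaderboard table is
# rendered: CSV_DIR points at the result file, COLUMN_NAMES fixes the column
# order, and DATA_TITILE_TYPE looks like a per-column `datatype` list for a
# Gradio Dataframe component (that usage is inferred, not confirmed here).
# The helper below is hypothetical and assumes the CSV at CSV_DIR has one
# column per entry in COLUMN_NAMES and that pandas is installed.
def load_leaderboard_example(csv_path: str = CSV_DIR):
    """Load the result CSV, keep the declared columns, and rank by overall average."""
    import pandas as pd  # assumed dependency of the leaderboard app

    df = pd.read_csv(csv_path)
    df = df[COLUMN_NAMES]  # keep only the declared columns, in the declared order
    return df.sort_values(by="Avg. All", ascending=False)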
UNTUNED_MODEL_RESULTS = '''LLM & Flan-T5 & Flan-T5-XL &23.0 &29.0 &32.8 &31.8 &20.5 &31.8 &33.0 &18.2 &19.4 &23.2 &34.9 &25.4 \\
LLM & Vicuna & Vicuna-7B &23.4 &30.7 &29.7 &30.9 &30.8 &28.6 &29.8 &18.5 &13.4 &27.3 &34.5 &23.8 \\
LLM & LLaMA & LLaMA-7B &26.3 &27.4 &26.2 &28.3 &25.1 &28.8 &19.2 &37.0 & 9.0 &33.0 &23.1 &26.2 \\
ImageLLM & BLIP-2 & Flan-T5-XL &59.1 &53.9 &49.2 &42.3 &43.2 &36.7 &55.7 &45.6 &25.9 &32.6 &47.5 &24.0 \\
ImageLLM & InstructBLIP & Flan-T5-XL &60.3 &58.5 &63.4 &40.6 &58.4 &38.7 &51.6 &45.9 &25.9 &33.1 &49.1 &27.1 \\
ImageLLM & InstructBLIP-Vicuna & Vicuna-7B &60.2 &58.9 &65.6 &43.6 &57.2 &40.3 &52.6 &47.7 &43.5 &34.5 &49.6 &23.1 \\
ImageLLM & LLaVA & LLaMA-7B &42.7 &34.9 &33.5 &28.4 &41.9 &30.8 &27.8 &46.8 &27.7 &29.7 &21.4 &19.1 \\
ImageLLM & MiniGPT-4 & Flan-T5-XL &56.3 &49.2 &45.8 &37.9 &45.3 &32.6 &47.4 &57.1 &11.8 &38.2 &24.5 &27.1 \\
ImageLLM & VPGTrans & LLaMA-7B &51.9 &44.1 &39.9 &36.1 &33.7 &36.4 &32.0 &53.2 &30.6 &39.5 &24.3 &31.9 \\
ImageLLM & MultiModal-GPT & LLaMA-7B &43.6 &37.9 &31.5 &30.8 &27.3 &30.1 &29.9 &51.4 &18.8 &36.9 &25.8 &24.0 \\
ImageLLM & Otter & LLaMA-7B &44.9 &38.6 &32.2 &30.9 &26.3 &31.8 &32.0 &51.4 &31.8 &37.9 &27.2 &24.8 \\
ImageLLM & OpenFlamingo & LLaMA-7B &43.9 &38.1 &31.3 &30.1 &27.3 &30.6 &29.9 &50.2 &20.0 &37.2 &25.4 &24.2 \\
ImageLLM & LLaMA-Adapter V2 & LLaMA-7B &45.2 &38.5 &29.3 &33.0 &29.7 &35.5 &39.2 &52.0 &24.7 &38.6 &18.5 &19.6 \\
ImageLLM & GVT & Vicuna-7B &41.7 &35.5 &31.8 &29.5 &36.2 &32.0 &32.0 &51.1 &27.1 &33.9 &25.4 &23.0 \\
ImageLLM & mPLUG-Owl & LLaMA-7B &49.7 &45.3 &32.5 &36.7 &27.3 &32.7 &44.3 &54.7 &28.8 &26.7 &17.9 &26.5 \\
VideoLLM & VideoChat & Vicuna-7B &47.1 &43.8 &34.9 &40.0 &32.8 &34.6 &42.3 &50.5 &17.7 &34.9 &36.4 &27.3 \\
VideoLLM & Video-ChatGPT & LLaMA-7B &37.2 &31.4 &33.2 &28.4 &35.5 &29.5 &23.7 &42.3 &25.9 &27.6 &21.3 &21.1 \\
VideoLLM & Valley & LLaMA-13B &39.3 &32.9 &31.6 &27.9 &24.2 &30.1 &27.8 &43.8 &11.8 &31.3 &23.2 &20.7 \\'''
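
# --- Illustrative parsing sketch (added example; not part of the original file) ---
# Each row of UNTUNED_MODEL_RESULTS is a LaTeX-style table line:
# "<Model Type> & <Model> & <Language Model> & <12 per-dimension scores> \\".
# The hypothetical helper below shows one way to turn those rows into records
# and to derive a question-count-weighted overall average using DATA_NUM;
# whether the app computes its averages exactly this way is an assumption.
def parse_untuned_results_example(raw: str = UNTUNED_MODEL_RESULTS):
    """Parse the LaTeX-style rows into (model info, per-dimension scores) records."""
    total = sum(DATA_NUM)
    records = []
    for line in raw.strip().splitlines():
        cells = [c.strip() for c in line.rstrip("\\ ").split("&")]
        info, scores = cells[:3], [float(x) for x in cells[3:]]
        # Weight each dimension by its number of questions (DATA_NUM) to get "Avg. All".
        avg_all = sum(s * n for s, n in zip(scores, DATA_NUM)) / total
        records.append({"info": info, "scores": scores, "Avg. All": round(avg_all, 1)})
    return records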
LEADERBORAD_INTRODUCTION = """# SEED-Bench Leaderboard
Welcome to the leaderboard of SEED-Bench!
This leaderboard tracks how Multimodal Large Language Models (MLLMs) perform on SEED-Bench, a benchmark of 19K multiple-choice questions spanning 12 evaluation dimensions that cover both image and video comprehension.
Please refer to [our paper](https://arxiv.org/abs/2307.16125) for more details.
"""
TABLE_INTRODUCTION = """The table below summarizes the per-task performance of every model.
We use accuracy (%) as the primary evaluation metric for most tasks.
"""
LEADERBORAD_INFO = """
Based on powerful Large Language Models (LLMs), recent generative Multimodal Large Language Models (MLLMs) have gained prominence as a pivotal research area, exhibiting remarkable capability for both comprehension and generation.
In this work, we address the evaluation of generative comprehension in MLLMs as a preliminary step towards a comprehensive assessment of generative models, by introducing a benchmark named SEED-Bench.
SEED-Bench consists of 19K multiple-choice questions with accurate human annotations (6× larger than existing benchmarks), which span 12 evaluation dimensions, including the comprehension of both the image and video modalities.
We develop an advanced pipeline for generating multiple-choice questions that target specific evaluation dimensions, integrating both automatic filtering and manual verification processes.
Multiple-choice questions with groundtruth options derived from human annotation enable an objective and efficient assessment of model performance, eliminating the need for human or GPT intervention during evaluation.
We further evaluate the performance of 18 models across all 12 dimensions, covering both spatial and temporal understanding.
By revealing the limitations of existing MLLMs through the evaluation results, we aim for SEED-Bench to provide insights that motivate future research.
"""
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
CITATION_BUTTON_TEXT = r"""@article{li2023seed,
  title={SEED-Bench: Benchmarking Multimodal LLMs with Generative Comprehension},
  author={Li, Bohao and Wang, Rui and Wang, Guangzhi and Ge, Yuying and Ge, Yixiao and Shan, Ying},
  journal={arXiv preprint arXiv:2307.16125},
  year={2023}
}""" | |