Upload 14 files
- .gitattributes +1 -0
- __pycache__/constants.cpython-38.pyc +0 -0
- app.py +322 -106
- constants.py +57 -22
- file/SEED-Bench-1.json +0 -0
- file/SEED-Bench-2.json +3 -0
- file/result.csv +39 -38
- file/result_v2.csv +25 -24
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+file/SEED-Bench-2.json filter=lfs diff=lfs merge=lfs -text
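The new annotation file is tracked through Git LFS, which is what the added filter line above declares. As a reading aid (not part of the commit), a minimal sketch of how the checked-out file is consumed, mirroring the `json.load` call in app.py and assuming the repository layout shown in this commit:

```python
import json

# After `git lfs pull`, the LFS pointer is replaced by the real JSON payload,
# and the app can load the SEED-Bench-2 questions directly.
with open("./file/SEED-Bench-2.json", "r") as f:
    questions = json.load(f)["questions"]
print(len(questions))
```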
__pycache__/constants.cpython-38.pyc
CHANGED
Binary files a/__pycache__/constants.cpython-38.pyc and b/__pycache__/constants.cpython-38.pyc differ
app.py
CHANGED
@@ -22,7 +22,7 @@ def prediction_analyse(prediction_content):
     predictions = prediction_content.split("\n")

     # Read the ground_truth JSON file
-    with open("./file/SEED-Bench.json", "r") as file:
+    with open("./file/SEED-Bench-1.json", "r") as file:
         ground_truth_data = json.load(file)["questions"]

     # Convert the ground_truth data into a dict keyed by question_id
@@ -53,97 +53,228 @@ def prediction_analyse(prediction_content):

     return results

+def prediction_analyse_v2(prediction_content):
+    # pdb.set_trace()
+    predictions = prediction_content.split("\n")
+
+    # Read the ground_truth JSON file
+    with open("./file/SEED-Bench-2.json", "r") as file:
+        ground_truth_data = json.load(file)["questions"]
+
+    # Convert the ground_truth data into a dict keyed by question_id
+    ground_truth = {item["question_id"]: item for item in ground_truth_data}
+
+    # Initialize the per-type result statistics
+    results = {i: {"correct": 0, "total": 0} for i in range(1, 28)}
+
+    # Iterate over predictions and count correct/total answers per question_type_id
+    for prediction in predictions:
+        # pdb.set_trace()
+        prediction = prediction.strip()
+        if not prediction:
+            continue
+        try:
+            prediction = json.loads(prediction)
+        except json.JSONDecodeError:
+            print(f"Warning: Skipping invalid JSON data in line: {prediction}")
+            continue
+        question_id = prediction["question_id"]
+        gt_item = ground_truth[question_id]
+        question_type_id = gt_item["question_type_id"]
+
+        if prediction["prediction"] == gt_item["answer"]:
+            results[question_type_id]["correct"] += 1
+
+        results[question_type_id]["total"] += 1
+
+    return results
+
+
 def add_new_eval(
     input_file,
     model_name_textbox: str,
     revision_name_textbox: str,
     model_type: str,
     model_link: str,
+    model_size: str,
+    benchmark_version: str,
     LLM_type: str,
     LLM_name_textbox: str,
     Evaluation_dimension: str,
+    Evaluation_dimension_2: str,
+    Evaluation_method: str
+
 ):
     if input_file is None:
         return "Error! Empty file!"
     else:
+        # v1 evaluation
+        if benchmark_version == 'v1':
+            content = input_file.decode("utf-8")
+            prediction = prediction_analyse(content)
+            csv_data = pd.read_csv(CSV_DIR)
+
+            Start_dimension, End_dimension = 1, 13
+            if Evaluation_dimension == 'Image':
+                End_dimension = 10
+            elif Evaluation_dimension == 'Video':
+                Start_dimension = 10
+            each_task_accuracy = {i: round(prediction[i]["correct"] / prediction[i]["total"] * 100, 1) if i >= Start_dimension and i < End_dimension else 0 for i in range(1, 13)}
+
+            # count for average image\video\all
+            total_correct_image = sum(prediction[i]["correct"] for i in range(1, 10))
+            total_correct_video = sum(prediction[i]["correct"] for i in range(10, 13))
+
+            total_image = sum(prediction[i]["total"] for i in range(1, 10))
+            total_video = sum(prediction[i]["total"] for i in range(10, 13))
+
+            if Evaluation_dimension != 'Video':
+                average_accuracy_image = round(total_correct_image / total_image * 100, 1)
+            else:
+                average_accuracy_image = 0
+
+            if Evaluation_dimension != 'Image':
+                average_accuracy_video = round(total_correct_video / total_video * 100, 1)
+            else:
+                average_accuracy_video = 0
+
+            if Evaluation_dimension == 'All':
+                overall_accuracy = round((total_correct_image + total_correct_video) / (total_image + total_video) * 100, 1)
+            else:
+                overall_accuracy = 0

+            if LLM_type == 'Other':
+                LLM_name = LLM_name_textbox
+            else:
+                LLM_name = LLM_type
+
+            if revision_name_textbox == '':
                 col = csv_data.shape[0]
+                model_name = model_name_textbox
             else:
+                model_name = revision_name_textbox
+                model_name_list = csv_data['Model']
+                name_list = [name.split(']')[0][1:] for name in model_name_list]
+                if revision_name_textbox not in name_list:
+                    col = csv_data.shape[0]
+                else:
+                    col = name_list.index(revision_name_textbox)
+
+            if model_link == '':
+                model_name = model_name  # no url
+            else:
+                model_name = '[' + model_name + '](' + model_link + ')'
+
+            # add new data
+            new_data = [
+                model_type,
+                model_name,
+                LLM_name,
+                model_size,
+                Evaluation_method,
+                overall_accuracy,
+                average_accuracy_image,
+                average_accuracy_video,
+                each_task_accuracy[1],
+                each_task_accuracy[2],
+                each_task_accuracy[3],
+                each_task_accuracy[4],
+                each_task_accuracy[5],
+                each_task_accuracy[6],
+                each_task_accuracy[7],
+                each_task_accuracy[8],
+                each_task_accuracy[9],
+                each_task_accuracy[10],
+                each_task_accuracy[11],
+                each_task_accuracy[12],
+            ]
+            csv_data.loc[col] = new_data
+            csv_data = csv_data.to_csv(CSV_DIR, index=False)
+        # v2 evaluation
         else:
+            content = input_file.decode("utf-8")
+            prediction = prediction_analyse_v2(content)
+            csv_data = pd.read_csv(CSV_V2_DIR)
+
+            Start_dimension, End_dimension = 1, 28
+            if Evaluation_dimension_2 == 'L1':
+                End_dimension = 23
+            elif Evaluation_dimension_2 == 'L2':
+                End_dimension = 25
+            elif Evaluation_dimension_2 == 'L3':
+                End_dimension = 28
+            # pdb.set_trace()
+            each_task_accuracy = {i: round(prediction[i]["correct"] / prediction[i]["total"] * 100, 1) if i >= Start_dimension and i < End_dimension else 0 for i in range(1, 28)}
+            average_p1 = round(sum(each_task_accuracy[key] for key in range(1,23)) / 22, 1)
+
+            if Evaluation_dimension_2 == 'L2':
+                average_p2 = round(sum(each_task_accuracy[key] for key in range(23,25)) / 2, 1)
+                average_p3 = 0
+            else:
+                average_p2 = round(sum(each_task_accuracy[key] for key in range(23,25)) / 2, 1)
+                average_p3 = round(sum(each_task_accuracy[key] for key in range(25,28)) / 3, 1)
+
+            if LLM_type == 'Other':
+                LLM_name = LLM_name_textbox
+            else:
+                LLM_name = LLM_type
+
+            if revision_name_textbox == '':
+                col = csv_data.shape[0]
+                model_name = model_name_textbox
+            else:
+                model_name = revision_name_textbox
+                model_name_list = csv_data['Model']
+                name_list = [name.split(']')[0][1:] for name in model_name_list]
+                if revision_name_textbox not in name_list:
+                    col = csv_data.shape[0]
+                else:
+                    col = name_list.index(revision_name_textbox)
+
+            if model_link == '':
+                model_name = model_name  # no url
+            else:
+                model_name = '[' + model_name + '](' + model_link + ')'
+
+            # add new data
+            new_data = [
+                model_name,
+                LLM_name,
+                model_size,
+                Evaluation_method,
+                average_p1,
+                average_p2,
+                average_p3,
+                each_task_accuracy[1],
+                each_task_accuracy[2],
+                each_task_accuracy[3],
+                each_task_accuracy[4],
+                each_task_accuracy[5],
+                each_task_accuracy[6],
+                each_task_accuracy[7],
+                each_task_accuracy[8],
+                each_task_accuracy[9],
+                each_task_accuracy[10],
+                each_task_accuracy[11],
+                each_task_accuracy[12],
+                each_task_accuracy[13],
+                each_task_accuracy[14],
+                each_task_accuracy[15],
+                each_task_accuracy[16],
+                each_task_accuracy[17],
+                each_task_accuracy[18],
+                each_task_accuracy[19],
+                each_task_accuracy[20],
+                each_task_accuracy[21],
+                each_task_accuracy[22],
+                each_task_accuracy[23],
+                each_task_accuracy[24],
+                each_task_accuracy[25],
+                each_task_accuracy[26],
+                each_task_accuracy[27]
+            ]
+            csv_data.loc[col] = new_data
+            csv_data = csv_data.to_csv(CSV_V2_DIR, index=False)
         return 0

 def get_baseline_df():
@@ -202,23 +333,21 @@ with block:
             interactive=True,
         )

         # selection for model size part:
+        model_size_v2 = gr.CheckboxGroup(
             choices=MODEL_SIZE,
             value=MODEL_SIZE,
             label="Model Size",
             interactive=True,
         )

+        # selection for evaluation method part:
+        evaluation_method_v2 = gr.CheckboxGroup(
+            choices=EVALUATION_METHOD,
+            value=EVALUATION_METHOD,
+            label="Evaluation Method",
+            interactive=True,
+        )

         # Create the dataframe component
         data_component_v2 = gr.components.Dataframe(
@@ -229,15 +358,38 @@ with block:
             interactive=False,
             visible=True,
         )
+
+        def on_filter_model_size_method_v2_change(selected_model_size, selected_evaluation_method, selected_columns):
+
+            updated_data = get_all_v2_df()
+            # model_size & evaluation_method:
+            # custom filter function
+            def custom_filter(row, model_size_filters, evaluation_method_filters):
+                model_size = row['Model Size']
+                evaluation_method = row['Evaluation Method']
+
+                if model_size == '-':
+                    size_filter = '-' in model_size_filters
+                elif 'B' in model_size:
+                    size = float(model_size.replace('B', ''))
+                    size_filter = ('>=10B' in model_size_filters and size >= 10) or ('<10B' in model_size_filters and size < 10)
+                else:
+                    size_filter = False
+
+                method_filter = evaluation_method in evaluation_method_filters
+
+                return size_filter and method_filter
+
+            # filter the data with the custom filter function
+            mask = updated_data.apply(custom_filter, axis=1, model_size_filters=selected_model_size, evaluation_method_filters=selected_evaluation_method)
+            updated_data = updated_data[mask]
+
+            # columns:
             selected_columns = [item for item in TASK_V2_INFO if item in selected_columns]
             present_columns = MODEL_INFO_V2 + selected_columns
+            updated_data = updated_data[present_columns]
+            updated_data = updated_data.sort_values(by="Avg. P1", ascending=False)
             updated_headers = present_columns
-            # pdb.set_trace()
             update_datatype = [DATA_TITILE_V2_TYPE[COLUMN_V2_NAMES.index(x)] for x in updated_headers]

             filter_component = gr.components.Dataframe(
@@ -252,8 +404,9 @@ with block:

             return filter_component.value

+        model_size_v2.change(fn=on_filter_model_size_method_v2_change, inputs=[model_size_v2, evaluation_method_v2, checkbox_group_v2], outputs=data_component_v2)
+        evaluation_method_v2.change(fn=on_filter_model_size_method_v2_change, inputs=[model_size_v2, evaluation_method_v2, checkbox_group_v2], outputs=data_component_v2)
+        checkbox_group_v2.change(fn=on_filter_model_size_method_v2_change, inputs=[model_size_v2, evaluation_method_v2, checkbox_group_v2], outputs=data_component_v2)

     # table seed-bench-v1
     with gr.TabItem("🏅 SEED Benchmark v1", elem_id="seed-benchmark-tab-table", id=1):
@@ -277,15 +430,21 @@ with block:
             interactive=True,
         )

         # selection for model size part:
+        model_size = gr.CheckboxGroup(
             choices=MODEL_SIZE,
             value=MODEL_SIZE,
             label="Model Size",
             interactive=True,
         )
+
+        # selection for evaluation method part:
+        evaluation_method = gr.CheckboxGroup(
+            choices=EVALUATION_METHOD,
+            value=EVALUATION_METHOD,
+            label="Evaluation Method",
+            interactive=True,
+        )

         # Create the dataframe component
         data_component = gr.components.Dataframe(
@@ -297,12 +456,36 @@ with block:
             visible=True,
         )

+        def on_filter_model_size_method_change(selected_model_size, selected_evaluation_method, selected_columns):
+
+            updated_data = get_all_df()
+            # model_size & evaluation_method:
+            # custom filter function
+            def custom_filter(row, model_size_filters, evaluation_method_filters):
+                model_size = row['Model Size']
+                evaluation_method = row['Evaluation Method']
+
+                if model_size == '-':
+                    size_filter = '-' in model_size_filters
+                elif 'B' in model_size:
+                    size = float(model_size.replace('B', ''))
+                    size_filter = ('>=10B' in model_size_filters and size >= 10) or ('<10B' in model_size_filters and size < 10)
+                else:
+                    size_filter = False
+
+                method_filter = evaluation_method in evaluation_method_filters
+
+                return size_filter and method_filter
+
+            # filter the data with the custom filter function
+            mask = updated_data.apply(custom_filter, axis=1, model_size_filters=selected_model_size, evaluation_method_filters=selected_evaluation_method)
+            updated_data = updated_data[mask]
+
+            # columns:
             selected_columns = [item for item in TASK_INFO if item in selected_columns]
             present_columns = MODEL_INFO + selected_columns
+            updated_data = updated_data[present_columns]
+            updated_data = updated_data.sort_values(by="Avg. All", ascending=False)
             updated_headers = present_columns
             update_datatype = [DATA_TITILE_TYPE[COLUMN_NAMES.index(x)] for x in updated_headers]

@@ -318,8 +501,9 @@ with block:

             return filter_component.value

+        model_size.change(fn=on_filter_model_size_method_change, inputs=[model_size, evaluation_method, checkbox_group], outputs=data_component)
+        evaluation_method.change(fn=on_filter_model_size_method_change, inputs=[model_size, evaluation_method, checkbox_group], outputs=data_component)
+        checkbox_group.change(fn=on_filter_model_size_method_change, inputs=[model_size, evaluation_method, checkbox_group], outputs=data_component)

     # table 2
     with gr.TabItem("📝 About", elem_id="seed-benchmark-tab-table", id=2):
@@ -358,9 +542,18 @@ with block:
                 model_link = gr.Textbox(
                     label="Model Link", placeholder="https://huggingface.co/decapoda-research/llama-7b-hf"
                 )
+                model_size = gr.Textbox(
+                    label="Model size", placeholder="7B (Input content format must be 'number+B' or '-')"
+                )
+                benchmark_version = gr.Dropdown(
+                    choices=["v1", "v2"],
+                    label="Benchmark version",
+                    multiselect=False,
+                    value="v1",
+                    interactive=True,
+                )

             with gr.Column():
                 LLM_type = gr.Dropdown(
                     choices=["Vicuna-7B", "Flan-T5-XL", "LLaMA-7B", "Other"],
                     label="LLM type",
@@ -374,11 +567,25 @@ with block:
                 )
                 Evaluation_dimension = gr.Dropdown(
                     choices=["All", "Image", "Video"],
+                    label="Evaluation dimension for SEED-Bench 1 (for evaluating SEED-Bench 1)",
                     multiselect=False,
                     value="All",
                     interactive=True,
                 )
+                Evaluation_dimension_2 = gr.Dropdown(
+                    choices=["L1", "L2", "L3"],
+                    label="Evaluation dimension for SEED-Bench 2 (for evaluating SEED-Bench 2)",
+                    multiselect=False,
+                    value="L2",
+                    interactive=True,
+                )
+                Evaluation_method = gr.Dropdown(
+                    choices=EVALUATION_METHOD,
+                    label="Evaluation method",
+                    multiselect=False,
+                    value=EVALUATION_METHOD[0],
+                    interactive=True,
+                )

             with gr.Column():

@@ -394,18 +601,27 @@ with block:
                     revision_name_textbox,
                     model_type,
                     model_link,
+                    model_size,
+                    benchmark_version,
                     LLM_type,
                     LLM_name_textbox,
                     Evaluation_dimension,
+                    Evaluation_dimension_2,
+                    Evaluation_method
                 ],
-                # outputs = submission_result,
             )


+        def refresh_data():
+            value1 = get_baseline_df()
+            value2 = get_baseline_v2_df()
+
+            return value1, value2
+
         with gr.Row():
             data_run = gr.Button("Refresh")
             data_run.click(
+                refresh_data, outputs=[data_component, data_component_v2]
             )

     # block.load(get_baseline_df, outputs=data_title)
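As a reading aid (not part of the commit), here is a minimal, self-contained sketch of the JSON-lines format that `prediction_analyse` and `prediction_analyse_v2` consume, and of the per-dimension aggregation they perform. The ground-truth entries below are invented placeholders standing in for the real annotation files (file/SEED-Bench-1.json and file/SEED-Bench-2.json).

```python
import json

# Illustrative ground truth: question_id -> annotation with question_type_id and answer.
ground_truth = {
    "q1": {"question_id": "q1", "question_type_id": 1, "answer": "A"},
    "q2": {"question_id": "q2", "question_type_id": 1, "answer": "B"},
    "q3": {"question_id": "q3", "question_type_id": 23, "answer": "C"},
}

# Contents of an uploaded results.json: one JSON object per line.
prediction_content = "\n".join([
    json.dumps({"question_id": "q1", "prediction": "A"}),
    json.dumps({"question_id": "q2", "prediction": "C"}),
    json.dumps({"question_id": "q3", "prediction": "C"}),
])

# Same aggregation idea as prediction_analyse_v2: count correct/total answers per
# question_type_id, then convert each dimension to a percentage.
results = {}
for line in prediction_content.split("\n"):
    pred = json.loads(line)
    gt = ground_truth[pred["question_id"]]
    stats = results.setdefault(gt["question_type_id"], {"correct": 0, "total": 0})
    stats["total"] += 1
    if pred["prediction"] == gt["answer"]:
        stats["correct"] += 1

accuracy = {k: round(v["correct"] / v["total"] * 100, 1) for k, v in results.items()}
print(accuracy)  # {1: 50.0, 23: 100.0}
```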
constants.py
CHANGED
@@ -1,7 +1,8 @@
 # this is .py for store constants
-MODEL_INFO = ["Model Type", "Model", "Language Model"]
-MODEL_INFO_V2 = ["Model", "Language Model"]
-MODEL_SIZE = ["<10B", ">=10B"]
+MODEL_INFO = ["Model Type", "Model", "Language Model", "Evaluation Method"]
+MODEL_INFO_V2 = ["Model", "Language Model", "Evaluation Method"]
+MODEL_SIZE = ["<10B", ">=10B", "-"]
+EVALUATION_METHOD = ["PPL", "PPL for A/B/C/D", "Generate", "NG"]
 DIMENSION_LEVEL = ["L1", "L2", "L3"]
 LEADERBOARD_VERSION = ["Version1", "Version2"]
 TASK_INFO = ["Avg. All", "Avg. Img", "Avg. Video", "Scene Understanding", "Instance Identity", "Instance Attribute", "Instance Location", "Instance Counting", "Spatial Relation", "Instance Interaction", "Visual Reasoning", "Text Recognition", "Action Recognition", "Action Prediction", "Procedure Understanding"]
@@ -10,8 +11,8 @@ TASK_V2_INFO = ["Avg. P1", "Avg. P2", "Avg. P3", "Scene Understanding", "Instanc
 AVG_INFO = ["Avg. All", "Avg. Img", "Avg. Video"]
 AVG_V2_INFO = ["Avg. P1", "Avg. P2", "Avg. P3"]

-DATA_TITILE_TYPE = ["markdown", "markdown", "markdown", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number"]
-DATA_TITILE_V2_TYPE = ["markdown", "markdown", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number"]
+DATA_TITILE_TYPE = ["markdown", "markdown", "markdown", "markdown", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number"]
+DATA_TITILE_V2_TYPE = ["markdown", "markdown", "markdown", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number"]
 CSV_DIR = "./file/result.csv"
 CSV_V2_DIR = "./file/result_v2.csv"

@@ -24,49 +25,83 @@ DATA_NUM_V2 = [3158, 1831, 4649, 978, 2447, 657, 97, 331, 435, 330, 500, 501, 19
 LEADERBORAD_INTRODUCTION = """# SEED-Bench Leaderboard

 Welcome to the leaderboard of the SEED-Bench! 🏆
+
+SEED-Bench-1 consists of 19K multiple-choice questions with accurate human annotations for evaluating Multimodal LLMs, covering 12 evaluation dimensions including both the spatial and temporal understanding.
+Please refer to the [SEED-Bench-1 paper](https://arxiv.org/abs/2307.16125) for more details.
+
+SEED-Bench-2 comprises 24K multiple-choice questions with accurate human annotations, which span 27 dimensions, including the evaluation of both text and image generation.
+Please refer to the [SEED-Bench-2 paper](https://arxiv.org/abs/2311.17092) for more details.
 """

-SUBMIT_INTRODUCTION = """# Submit on SEED Benchmark
-1. Obtain JSON file from our [github repository](https://github.com/AILab-CVC/SEED-Bench#leaderboard-submit) after evaluation. For example, you can obtain InstructBLIP's JSON file as results/results.json after running
+SUBMIT_INTRODUCTION = """# Submit on SEED Benchmark Introduction
+1. Obtain the JSON file from our [github repository](https://github.com/AILab-CVC/SEED-Bench#leaderboard-submit) after evaluation. For example, on SEED-Bench-1 you can obtain InstructBLIP's JSON file as results/results.json after running
 ```shell
 python eval.py --model instruct_blip --anno_path SEED-Bench.json --output-dir results
 ```
+On SEED-Bench-2, you can obtain InternLM_Xcomposer_VL's JSON file as results/results.json after running
+```shell
+python eval.py --model InternLM_Xcomposer_VL --anno_path SEED-Bench_v2_level1_2_3.json --output-dir results --evaluate_level L2 --evaluate_part all --evaluate_version v2
+```
 2. If you want to update model performance by uploading new results, please ensure 'Model Name Revision' is the same as what's shown in the leaderboard. For example, if you want to modify InstructBLIP's performance, you need to fill in 'InstructBLIP' in 'Revision Model Name'.
 3. Please provide the correct link of your model's repository for each submission.
-4. For the evaluation dimension, you can choose "All/Image/Video", and the results of dimensions that are not evaluated will be set to zero.
+4. For the evaluation dimension, you can choose "All/Image/Video" for SEED-Bench-1 and "L1/L2/L3" for SEED-Bench-2, and the results of dimensions that are not evaluated will be set to zero.
 5. After clicking 'Submit Eval', you can click 'Refresh' to obtain the latest result in the leaderboard.

 ## Submit Example
-For example, if you want to upload InstructBLIP's result in the leaderboard, you need to:
+For example, on SEED-Bench-1, if you want to upload InstructBLIP's result to the leaderboard, you need to:
 1. Fill in 'InstructBLIP' in 'Model Name' if it is your first time to submit your result (You can leave 'Revision Model Name' blank).
 2. Fill in 'InstructBLIP' in 'Revision Model Name' if you want to update your result (You can leave 'Model Name' blank).
+3. Select 'ImageLLM' in 'Model Type'.
+4. Fill in 'https://github.com/salesforce/LAVIS' in 'Model Link'.
+5. Fill in '7B' in 'Model size'.
+6. Select 'v1' in 'Benchmark version'.
+7. Select 'Flan-T5-XL' in 'LLM Type'.
+8. Select 'All' in 'Evaluation Dimension for SEED-Bench 1'.
+9. Select 'PPL' in 'Evaluate Method'.
+10. Upload results.json.
+11. Click the 'Submit Eval' button.
+12. Click 'Refresh' to obtain the uploaded leaderboard.
+
+For example, on SEED-Bench-2, if you want to upload InternLM_Xcomposer_VL's result to the leaderboard, you need to:
+1. Fill in 'InternLM_Xcomposer_VL' in 'Model Name' if it is your first time to submit your result (You can leave 'Revision Model Name' blank).
+2. Fill in 'InternLM_Xcomposer_VL' in 'Revision Model Name' if you want to update your result (You can leave 'Model Name' blank).
+3. Select 'ImageLLM' in 'Model Type'.
+4. Fill in 'https://github.com/InternLM/InternLM-XComposer' in 'Model Link'.
+5. Fill in '7B' in 'Model size'.
+6. Select 'v2' in 'Benchmark version'.
+7. Select 'Other' in 'LLM Type'.
+8. Fill in 'InternLM-7B' in 'LLM model (for Other)'.
+9. Select 'L2' in 'Evaluation Dimension for SEED-Bench 2'.
+10. Select 'PPL' in 'Evaluate Method'.
+11. Upload results.json.
+12. Click the 'Submit Eval' button.
+13. Click 'Refresh' to obtain the uploaded leaderboard.
 """

 TABLE_INTRODUCTION = """In the table below, we summarize each task performance of all the models.
 We use accuracy (%) as the primary evaluation metric for each task.
+SEED-Bench-1 calculates the overall accuracy by dividing the total number of correct QA answers by the total number of QA questions.
+SEED-Bench-2 represents the overall accuracy using the average accuracy of each dimension.
 """

 LEADERBORAD_INFO = """
 Based on powerful Large Language Models (LLMs), recent generative Multimodal Large Language Models (MLLMs) have gained prominence as a pivotal research area, exhibiting remarkable capability for both comprehension and generation.
+[SEED-Bench-1](https://arxiv.org/abs/2307.16125) consists of 19K multiple-choice questions with accurate human annotations (x6 larger than existing benchmarks), which span 12 evaluation dimensions including the comprehension of both the image and video modality.
+[SEED-Bench-2](https://arxiv.org/abs/2311.17092) comprises 24K multiple-choice questions with accurate human annotations, which span 27 dimensions, including the evaluation of both text and image generation.
 We develop an advanced pipeline for generating multiple-choice questions that target specific evaluation dimensions, integrating both automatic filtering and manual verification processes.
 Multiple-choice questions with groundtruth options derived from human annotation enable an objective and efficient assessment of model performance, eliminating the need for human or GPT intervention during evaluation.
-We further evaluate the performance of 18 models across all 12 dimensions, covering both the spatial and temporal understanding.
 By revealing the limitations of existing MLLMs through evaluation results, we aim for SEED-Bench to provide insights for motivating future research.
 """

 CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
+CITATION_BUTTON_TEXT = r"""@article{li2023seed2,
+  title={SEED-Bench-2: Benchmarking Multimodal Large Language Models},
+  author={Li, Bohao and Ge, Yuying and Ge, Yixiao and Wang, Guangzhi and Wang, Rui and Zhang, Ruimao and Shan, Ying},
+  journal={arXiv preprint arXiv:2311.17092},
+  year={2023}
+}
+
+@article{li2023seed,
   title={SEED-Bench: Benchmarking Multimodal LLMs with Generative Comprehension},
   author={Li, Bohao and Wang, Rui and Wang, Guangzhi and Ge, Yuying and Ge, Yixiao and Shan, Ying},
   journal={arXiv preprint arXiv:2307.16125},
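The TABLE_INTRODUCTION text above distinguishes how the two benchmark versions report an overall score. A short sketch of that difference, with invented per-dimension counts (not data from the leaderboard):

```python
# SEED-Bench-1 micro-averages over questions; SEED-Bench-2 macro-averages
# the per-dimension accuracies.
per_dim = {
    1: {"correct": 90, "total": 100},  # a large dimension
    2: {"correct": 5, "total": 10},    # a small dimension
}

# SEED-Bench-1 style: total correct answers / total questions.
micro = round(sum(d["correct"] for d in per_dim.values())
              / sum(d["total"] for d in per_dim.values()) * 100, 1)

# SEED-Bench-2 style: average of the per-dimension accuracies.
macro = round(sum(d["correct"] / d["total"] * 100 for d in per_dim.values())
              / len(per_dim), 1)

print(micro, macro)  # 86.4 70.0
```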
file/SEED-Bench-1.json
ADDED
The diff for this file is too large to render.
file/SEED-Bench-2.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:95c29b47709e43246be32cf4343611766e7737ad7062096e197932eff7c8543d
+size 18076409
file/result.csv
CHANGED
@@ -1,38 +1,39 @@
-Model Type,Model,Language Model,Avg. All,Avg. Img,Avg. Video,Scene Understanding,Instance Identity,Instance Attribute,Instance Location,Instance Counting,Spatial Relation,Instance Interaction,Visual Reasoning,Text Recognition,Action Recognition,Action Prediction,Procedure Understanding
-LLM,[Flan-T5](https://huggingface.co/google/flan-t5-xl),Flan-T5-XL,27.7,27.3,28.6,23,29,32.8,31.8,20.5,31.8,33,18.2,19.4,23.2,34.9,25.4
-LLM,[Vicuna](https://huggingface.co/lmsys/vicuna-7b-v1.3),Vicuna-7B,28.5,28.2,29.5,23.4,30.7,29.7,30.9,30.8,28.6,29.8,18.5,13.4,27.3,34.5,23.8
-LLM,[LLaMA](https://research.facebook.com/publications/llama-open-and-efficient-foundation-language-models/),LLaMA-7B,26.8,26.6,27.3,26.3,27.4,26.2,28.3,25.1,28.8,19.2,37,9,33,23.1,26.2
-ImageLLM,[BLIP-2](https://github.com/salesforce/LAVIS),Flan-T5-XL,46.4,49.7,36.7,59.1,53.9,49.2,42.3,43.2,36.7,55.7,45.6,25.9,32.6,47.5,24
-ImageLLM,[InstructBLIP](https://github.com/salesforce/LAVIS),Flan-T5-XL,52.7,57.8,38.3,60.3,58.5,63.4,40.6,58.4,38.7,51.6,45.9,25.9,33.1,49.1,27.1
-ImageLLM,[InstructBLIP-Vicuna](https://github.com/salesforce/LAVIS),Vicuna-7B,53.4,58.8,38.1,60.2,58.9,65.6,43.6,57.2,40.3,52.6,47.7,43.5,34.5,49.6,23.1
-ImageLLM,[LLaVA-1.5](https://github.com/haotian-liu/LLaVA),Vicuna-13B,61.6,68.2,42.7,74.9,71.3,68.9,63.5,61.3,51.4,73.2,77,60.5,48.9,41.1,36.6
-ImageLLM,[MiniGPT-4](https://github.com/Vision-CAIR/MiniGPT-4),Vicuna-7B,42.8,47.4,29.9,56.3,49.2,45.8,37.9,45.3,32.6,47.4,57.1,11.8,38.2,24.5,27.1
-ImageLLM,[VPGTrans](https://github.com/VPGTrans/VPGTrans),LLaMA-7B,39.1,41.8,31.4,51.9,44.1,39.9,36.1,33.7,36.4,32,53.2,30.6,39.5,24.3,31.9
-ImageLLM,[MultiModal-GPT](https://github.com/open-mmlab/Multimodal-GPT),LLaMA-7B,33.2,34.5,29.2,43.6,37.9,31.5,30.8,27.3,30.1,29.9,51.4,18.8,36.9,25.8,24
-ImageLLM,[Otter](https://github.com/Luodian/Otter),LLaMA-7B,33.9,35.2,30.4,44.9,38.6,32.2,30.9,26.3,31.8,32,51.4,31.8,37.9,27.2,24.8
-ImageLLM,[Otter](https://github.com/Luodian/Otter),MPT-7B,39.7,42.9,30.6,51.3,43.5,42.3,34.2,38.4,30.9,40.2,55.3,24.7,36.8,29.2,23.8
-ImageLLM,[OpenFlamingo](https://github.com/mlfoundations/open_flamingo),LLaMA-7B,33.1,34.5,29.3,43.9,38.1,31.3,30.1,27.3,30.6,29.9,50.2,20,37.2,25.4,24.2
-ImageLLM,[OpenFlamingo](https://github.com/mlfoundations/open_flamingo),MPT-7B,40.9,42.7,35.7,53.2,45.3,40,31.2,39.3,32.6,36.1,51.4,25.9,42.9,34.7,26.9
-ImageLLM,[LLaMA-AdapterV2](https://github.com/OpenGVLab/LLaMA-Adapter),LLaMA-7B,32.7,35.2,25.8,45.2,38.5,29.3,33,29.7,35.5,39.2,52,24.7,38.6,18.5,19.6
-ImageLLM,[GVT](https://github.com/TencentARC/GVT),Vicuna-7B,33.5,35.5,27.8,41.7,35.5,31.8,29.5,36.2,32,32,51.1,27.1,33.9,25.4,23
-ImageLLM,[mPLUG-Owl](https://github.com/X-PLUG/mPLUG-Owl),LLaMA-7B,34,37.9,23,49.7,45.3,32.5,36.7,27.3,32.7,44.3,54.7,28.8,26.7,17.9,26.5
-ImageLLM,[Kosmos-2](https://github.com/microsoft/unilm/tree/master/kosmos-2),Decoder Only 1.3B,50,54.4,37.5,63.4,57.1,58.5,44,41.4,37.9,55.7,60.7,25.9,41.3,40.4,27
-ImageLLM,[Qwen-VL-Chat](https://huggingface.co/Qwen/Qwen-VL-Chat),Qwen-7B,58.2,65.4,37.8,73.3,67.3,69.6,57.7,52.9,48.2,59.8,74.6,53.5,43.9,39.2,26.7
-ImageLLM,[Qwen-VL](https://huggingface.co/Qwen/Qwen-VL),Qwen-7B,56.3,62.3,39.1,71.2,66.4,67.7,53.5,44.8,43.8,62.9,74.9,51.2,44.7,38.5,32
-ImageLLM,[IDEFICS-9b-instruct](https://huggingface.co/HuggingFaceM4/idefics-9b-instruct),LLaMA-7B,0,44.5,0,55.8,45.3,42.3,40.2,36.8,34.9,37.1,55.9,38.8,0,0,0
-ImageLLM,[IDEFICS-80b-instruct](https://huggingface.co/HuggingFaceM4/idefics-9b-instruct),LLaMA-65B,0,53.2,0,64,52.6,50.8,48.3,46.1,45.5,62.9,68,51.8,0,0,0
-ImageLLM,[InternLM-XComposer-VL](https://github.com/InternLM/InternLM-XComposer),InternLM-7B,0,66.9,0,75,71.7,67.6,60.8,56.2,55.3,74.4,77,48.5,0,0,0
-ImageLLM,[SEED-LLaMA](https://github.com/AILab-CVC/SEED),LLaMA2-Chat-13B,48.9,53.7,35.4,64.1,54.2,54.1,46.5,45.3,38.2,51.6,60.7,44.7,37.8,45.3,20.0
-ImageLLM,[mPLUG-Owl2](https://github.com/X-PLUG/mPLUG-Owl),LLaMA-7B,57.8,64.1,39.8,72.7,67.6,63.6,53.6,58.5,50.8,70.1,76.4,30.2,46.0,38.7,32.9
-ImageLLM,[LLaMA-VID-7B](https://github.com/dvlab-research/LLaMA-VID),LLaMA-7B,59.9,67.6,37.9,75.4,71.2,68.9,62.9,58.4,50.7,70.1,76.1,54.7,42.8,35.2,35.6
-ImageLLM,[Pink-LLaMA2](https://github.com/SY-Xuan/Pink/stargazers),LLaMA2-7B,0,67.0,0,75.2,70.1,70.1,63.3,53.8,50.2,69.1,74.3,50.0,0,0,0
-ImageLLM,[InfMLLM-13B](https://github.com/mightyzau/InfMLLM),Vicuna-13B,62.3,69.6,41.5,75.5,73,70.4,66.2,63.3,54.2,72.2,77.9,37.2,49.5,39,33.9
-ImageLLM,[ShareGPT4V-7B](https://github.com/InternLM/InternLM-XComposer/tree/main/projects/ShareGPT4V),Vicuna-7B,0,69.7,0,75.3,71.4,72.3,63.1,62,53.9,70.1,79.8,54.7,0,0,0
-ImageLLM,[ShareGPT4V-13B](https://github.com/InternLM/InternLM-XComposer/tree/main/projects/ShareGPT4V),Vicuna-13B,0,70.8,0,75.9,74.1,73.5,66.8,62.4,54.8,75.3,77.3,46.5,0,0,0
+Model Type,Model,Language Model,Model Size,Evaluation Method,Avg. All,Avg. Img,Avg. Video,Scene Understanding,Instance Identity,Instance Attribute,Instance Location,Instance Counting,Spatial Relation,Instance Interaction,Visual Reasoning,Text Recognition,Action Recognition,Action Prediction,Procedure Understanding
+LLM,[Flan-T5](https://huggingface.co/google/flan-t5-xl),Flan-T5-XL,3B,PPL,27.7,27.3,28.6,23.0,29.0,32.8,31.8,20.5,31.8,33.0,18.2,19.4,23.2,34.9,25.4
+LLM,[Vicuna](https://huggingface.co/lmsys/vicuna-7b-v1.3),Vicuna-7B,7B,PPL,28.5,28.2,29.5,23.4,30.7,29.7,30.9,30.8,28.6,29.8,18.5,13.4,27.3,34.5,23.8
+LLM,[LLaMA](https://research.facebook.com/publications/llama-open-and-efficient-foundation-language-models/),LLaMA-7B,7B,PPL,26.8,26.6,27.3,26.3,27.4,26.2,28.3,25.1,28.8,19.2,37.0,9.0,33.0,23.1,26.2
+ImageLLM,[BLIP-2](https://github.com/salesforce/LAVIS),Flan-T5-XL,3B,PPL,46.4,49.7,36.7,59.1,53.9,49.2,42.3,43.2,36.7,55.7,45.6,25.9,32.6,47.5,24.0
+ImageLLM,[InstructBLIP](https://github.com/salesforce/LAVIS),Flan-T5-XL,3B,PPL,52.7,57.8,38.3,60.3,58.5,63.4,40.6,58.4,38.7,51.6,45.9,25.9,33.1,49.1,27.1
+ImageLLM,[InstructBLIP-Vicuna](https://github.com/salesforce/LAVIS),Vicuna-7B,7B,PPL,53.4,58.8,38.1,60.2,58.9,65.6,43.6,57.2,40.3,52.6,47.7,43.5,34.5,49.6,23.1
+ImageLLM,[LLaVA-1.5](https://github.com/haotian-liu/LLaVA),Vicuna-13B,13B,Generate,61.6,68.2,42.7,74.9,71.3,68.9,63.5,61.3,51.4,73.2,77.0,60.5,48.9,41.1,36.6
+ImageLLM,[MiniGPT-4](https://github.com/Vision-CAIR/MiniGPT-4),Vicuna-7B,7B,PPL,42.8,47.4,29.9,56.3,49.2,45.8,37.9,45.3,32.6,47.4,57.1,11.8,38.2,24.5,27.1
+ImageLLM,[VPGTrans](https://github.com/VPGTrans/VPGTrans),LLaMA-7B,7B,PPL,39.1,41.8,31.4,51.9,44.1,39.9,36.1,33.7,36.4,32.0,53.2,30.6,39.5,24.3,31.9
+ImageLLM,[MultiModal-GPT](https://github.com/open-mmlab/Multimodal-GPT),LLaMA-7B,7B,PPL,33.2,34.5,29.2,43.6,37.9,31.5,30.8,27.3,30.1,29.9,51.4,18.8,36.9,25.8,24.0
+ImageLLM,[Otter](https://github.com/Luodian/Otter),LLaMA-7B,7B,PPL,33.9,35.2,30.4,44.9,38.6,32.2,30.9,26.3,31.8,32.0,51.4,31.8,37.9,27.2,24.8
+ImageLLM,[Otter](https://github.com/Luodian/Otter),MPT-7B,7B,PPL,39.7,42.9,30.6,51.3,43.5,42.3,34.2,38.4,30.9,40.2,55.3,24.7,36.8,29.2,23.8
+ImageLLM,[OpenFlamingo](https://github.com/mlfoundations/open_flamingo),LLaMA-7B,7B,PPL,33.1,34.5,29.3,43.9,38.1,31.3,30.1,27.3,30.6,29.9,50.2,20.0,37.2,25.4,24.2
+ImageLLM,[OpenFlamingo](https://github.com/mlfoundations/open_flamingo),MPT-7B,7B,PPL,40.9,42.7,35.7,53.2,45.3,40.0,31.2,39.3,32.6,36.1,51.4,25.9,42.9,34.7,26.9
+ImageLLM,[LLaMA-AdapterV2](https://github.com/OpenGVLab/LLaMA-Adapter),LLaMA-7B,7B,PPL,32.7,35.2,25.8,45.2,38.5,29.3,33.0,29.7,35.5,39.2,52.0,24.7,38.6,18.5,19.6
+ImageLLM,[GVT](https://github.com/TencentARC/GVT),Vicuna-7B,7B,PPL,33.5,35.5,27.8,41.7,35.5,31.8,29.5,36.2,32.0,32.0,51.1,27.1,33.9,25.4,23.0
+ImageLLM,[mPLUG-Owl](https://github.com/X-PLUG/mPLUG-Owl),LLaMA-7B,7B,PPL,34.0,37.9,23.0,49.7,45.3,32.5,36.7,27.3,32.7,44.3,54.7,28.8,26.7,17.9,26.5
+ImageLLM,[Kosmos-2](https://github.com/microsoft/unilm/tree/master/kosmos-2),Decoder Only 1.3B,1.3B,PPL,50.0,54.4,37.5,63.4,57.1,58.5,44.0,41.4,37.9,55.7,60.7,25.9,41.3,40.4,27.0
+ImageLLM,[Qwen-VL-Chat](https://huggingface.co/Qwen/Qwen-VL-Chat),Qwen-7B,7B,PPL for A/B/C/D,58.2,65.4,37.8,73.3,67.3,69.6,57.7,52.9,48.2,59.8,74.6,53.5,43.9,39.2,26.7
+ImageLLM,[Qwen-VL](https://huggingface.co/Qwen/Qwen-VL),Qwen-7B,7B,PPL for A/B/C/D,56.3,62.3,39.1,71.2,66.4,67.7,53.5,44.8,43.8,62.9,74.9,51.2,44.7,38.5,32.0
+ImageLLM,[IDEFICS-9b-instruct](https://huggingface.co/HuggingFaceM4/idefics-9b-instruct),LLaMA-7B,7B,NG,0.0,44.5,0.0,55.8,45.3,42.3,40.2,36.8,34.9,37.1,55.9,38.8,0.0,0.0,0.0
+ImageLLM,[IDEFICS-80b-instruct](https://huggingface.co/HuggingFaceM4/idefics-9b-instruct),LLaMA-65B,65B,NG,0.0,53.2,0.0,64.0,52.6,50.8,48.3,46.1,45.5,62.9,68.0,51.8,0.0,0.0,0.0
+ImageLLM,[InternLM-XComposer-VL](https://github.com/InternLM/InternLM-XComposer),InternLM-7B,7B,PPL,0.0,66.9,0.0,75.0,71.7,67.6,60.8,56.2,55.3,74.4,77.0,48.5,0.0,0.0,0.0
+ImageLLM,[SEED-LLaMA](https://github.com/AILab-CVC/SEED),LLaMA2-Chat-13B,13B,PPL,48.9,53.7,35.4,64.1,54.2,54.1,46.5,45.3,38.2,51.6,60.7,44.7,37.8,45.3,20.0
+ImageLLM,[mPLUG-Owl2](https://github.com/X-PLUG/mPLUG-Owl),LLaMA-7B,7B,NG,57.8,64.1,39.8,72.7,67.6,63.6,53.6,58.5,50.8,70.1,76.4,30.2,46.0,38.7,32.9
+ImageLLM,[LLaMA-VID-7B](https://github.com/dvlab-research/LLaMA-VID),LLaMA-7B,7B,Generate,59.9,67.6,37.9,75.4,71.2,68.9,62.9,58.4,50.7,70.1,76.1,54.7,42.8,35.2,35.6
+ImageLLM,[Pink-LLaMA2](https://github.com/SY-Xuan/Pink/stargazers),LLaMA2-7B,7B,NG,0.0,67.0,0.0,75.2,70.1,70.1,63.3,53.8,50.2,69.1,74.3,50.0,0.0,0.0,0.0
+ImageLLM,[InfMLLM-13B](https://github.com/mightyzau/InfMLLM),Vicuna-13B,13B,NG,62.3,69.6,41.5,75.5,73.0,70.4,66.2,63.3,54.2,72.2,77.9,37.2,49.5,39.0,33.9
+ImageLLM,[ShareGPT4V-7B](https://github.com/InternLM/InternLM-XComposer/tree/main/projects/ShareGPT4V),Vicuna-7B,7B,Generate,0.0,69.7,0.0,75.3,71.4,72.3,63.1,62.0,53.9,70.1,79.8,54.7,0.0,0.0,0.0
+ImageLLM,[ShareGPT4V-13B](https://github.com/InternLM/InternLM-XComposer/tree/main/projects/ShareGPT4V),Vicuna-13B,13B,Generate,0.0,70.8,0.0,75.9,74.1,73.5,66.8,62.4,54.8,75.3,77.3,46.5,0.0,0.0,0.0
+ImageLLM,[GPT-4V](https://openai.com/research/gpt-4v-system-card),\-,-,Generate,67.3,69.1,60.5,77.5,73.9,70.6,61.8,56.8,56.9,74.2,78.5,57.6,65.7,51.7,63.4
+VideoLLM,[VideoChat](https://github.com/OpenGVLab/Ask-Anything),Vicuna-7B,7B,PPL,37.6,39.0,33.7,47.1,43.8,34.9,40.0,32.8,34.6,42.3,50.5,17.7,34.9,36.4,27.3
+VideoLLM,[Video-ChatGPT](https://github.com/mbzuai-oryx/Video-ChatGPT),LLaMA-7B,7B,PPL,31.2,33.9,23.5,37.2,31.4,33.2,28.4,35.5,29.5,23.7,42.3,25.9,27.6,21.3,21.1
+VideoLLM,[Valley](https://github.com/RupertLuo/Valley),LLaMA-13B,13B,PPL,30.3,32.0,25.4,39.3,32.9,31.6,27.9,24.2,30.1,27.8,43.8,11.8,31.3,23.2,20.7
+Other,[Unified-IO-2 7B (2.5M)](),from scratch,7B,NG,60.5,65.6,46.0,70.7,69.0,67.4,55.4,62.6,45.5,60.8,67.1,58.1,57.5,43.2,34.0
+Other,[Unified-IO-2 7B](),from scratch,7B,NG,60.4,65.5,46.0,71.3,68.8,67.5,55.5,61.2,45.4,62.9,66.5,59.3,58.0,42.7,34.0
+Other,[Unified-IO-2 3B](),from scratch,3B,NG,58.7,63.8,44.2,68.8,65.8,67.2,52.9,60.4,43.1,55.7,64.0,41.9,57.5,36.0,39.0
+Other,[Unified-IO-2 1B](),from scratch,1B,NG,49.6,55.1,34.0,63.8,57.7,54.6,41.9,53.7,33.3,51.5,58.3,47.7,39.8,34.5,24.6
|
file/result_v2.csv
CHANGED
@@ -1,24 +1,25 @@
-[BLIP-2](https://github.com/salesforce/LAVIS),Flan-T5-XL,41,35.3,0,58.5,48.6,49,39.1,43.4,36.2,48.5,52.9,60.7,51.8,51.4,19.2,43.2,52.4,29.3,22,17.8,38.6,42.5,37.7,36.2,22.9,40,30.6,0,0,0
-[InstructBLIP](https://github.com/salesforce/LAVIS),Flan-T5-XL,42.2,35.7,0,58.9,49.7,61.7,35.1,58.1,34.9,47.4,55.9,61.4,48.5,45.4,26.4,41.7,47.7,34.5,21.2,22.8,35.2,41.5,36.1,40.5,24.5,36.7,34.7,0,0,0
-[InstructBLIP-Vicuna](https://github.com/salesforce/LAVIS),Vicuna-7B,41.4,29.7,0,53.6,43.9,49,37.8,56.5,35.8,43.3,56.2,57.2,60.3,44.4,27.9,39.2,39.4,23,26.5,36.5,55.4,40.4,38.6,31.2,15.6,26.7,32.7,0,0,0
-[LLaVA](https://github.com/haotian-liu/LLaVA),LLaMA-7B,38.7,30.2,0,53.8,47.5,38.3,34.2,42,34.7,40.2,52.9,46.4,51.8,45.6,30.3,40.2,37.6,34.3,20.5,27,50,44.1,36.2,25.1,18.6,40,20.4,0,0,0
-[MiniGPT-4](https://github.com/Vision-CAIR/MiniGPT-4),Vicuna-7B,39.4,34.1,0,56.3,49.2,45.8,37.9,45.3,32.6,47.4,57.1,41.8,55.2,45.2,20.2,41.2,43.3,24.2,25,19,46.7,39,38.7,27.4,28.6,45.8,22.5,0,0,0
-[VPGTrans](https://github.com/VPGTrans/VPGTrans),LLaMA-7B,36.2,23.9,0,46.9,38.6,33.6,35.6,27.5,34.4,33,50.8,47.6,52.4,38.2,30.1,34.7,36.1,31.5,27.3,24.6,44,37.8,38.2,20.9,33.5,19.2,28.6,0,0,0
-[MultiModal-GPT](https://github.com/open-mmlab/Multimodal-GPT),LLaMA-7B,37.4,34.9,0,46.9,42.5,32,32.3,27.7,29.7,29.9,48.3,35.2,60.9,50.4,24.2,42.2,37.6,32.1,27.3,40.1,56.5,37.6,38.7,25.3,24.4,39.2,30.6,0,0,0
-[Otter](https://github.com/Luodian/Otter),LLaMA-7B,36.4,36.6,0,45.9,39.7,31.9,31.6,26.4,32,33,49.2,39.3,59.7,53,23.6,41.2,36.1,37.3,22,27.4,46.7,36.6,37.9,26,24.8,42.5,30.6,0,0,0
-[OpenFlamingo](https://github.com/mlfoundations/open_flamingo),LLaMA-7B,37.3,35.5,0,46.7,42.3,31.7,33.4,27.4,29.8,29.9,47.7,35.6,60.3,49.8,24.2,42.2,39,32.1,27.3,39.9,54.9,37.6,38.4,25.2,24.1,38.3,32.7,0,0,0
-[LLaMA-AdapterV2](https://github.com/OpenGVLab/LLaMA-Adapter),LLaMA-7B,37.5,0,0,45.2,38.5,29.3,33,29.7,35.5,39.2,52,48.7,58.5,46.4,24.2,41.2,40.1,39.7,23.5,29.1,52.2,41.9,38.2,18.8,20.3,0,0,0,0,0
-[GVT](https://github.com/TencentARC/GVT),Vicuna-7B,34.4,38.6,0,41.7,35.5,31.8,29.5,36.2,32,32,51.1,35.2,39.4,36.4,25,36.2,31.1,20.6,22.7,41.5,59.2,40.4,29.7,26.3,24.1,42.5,34.7,0,0,0
-[mPLUG-Owl](https://github.com/X-PLUG/mPLUG-Owl),LLaMA-7B,39.4,28.9,0,49.7,45.3,32.5,36.7,27.3,32.7,44.3,54.7,49.2,70.9,49.6,23.2,44.2,44,32.5,23.5,33.5,54.9,42,37.8,18.3,19.3,29.2,28.6,0,0,0
-[Kosmos-2](https://github.com/microsoft/unilm/tree/master/kosmos-2),Decoder only 1.3B,46.3,23.3,0,63.4,57.1,58.5,44,41.4,37.9,55.7,60.7,68.1,82.1,51.4,21.2,48.2,43.7,30.7,28,25.2,42.8,48.5,40.8,39.5,30,24.2,22.5,0,0,0
-[Qwen-VL-Chat](https://huggingface.co/Qwen/Qwen-VL-Chat),Qwen-7B,43.1,35.5,0,56.5,47.6,54.8,46.9,54.2,40.3,55.7,55,47.4,62.4,55.6,25.2,43.7,41.2,20.6,28.8,34.3,47.2,39.7,42.8,29.6,19.1,42.5,28.6,0,0,0
-[LLaVA-1.5](https://github.com/haotian-liu/LLaVA),vicuna-7B,47.3,30.8,0,63.7,62.4,66.7,51.3,60.2,38.5,47.4,59.8,69,60.6,49.8,25,45.7,56.7,31.1,24.2,35.7,50.3,46.1,39.4,29.4,28.1,39.2,22.5,0,0,0
-[IDEFICS-9b-instruct](https://huggingface.co/HuggingFaceM4/idefics-9b-instruct),LLaMA-7B,38,40.3,0,48.2,38.2,37.8,32.9,29,32.4,37.1,54.1,45.5,52.4,52.8,22.6,42.7,33.2,26.6,21.2,56.5,48.4,42.7,38.6,23.6,20.5,45.8,34.7,0,0,0
-[InternLM-XComposer-VL](https://github.com/InternLM/InternLM-XComposer),InternLM-7B,59.2,32.1,0,74.8,70.5,67.6,60.5,55.3,53.4,76.3,76.1,61.4,86.1,78,27.2,60.3,84.8,68.9,25.8,47.7,56.6,58.6,49.9,37.6,24.9,27.5,36.7,0,0,0
-[Emu](https://github.com/baaivision/Emu),LLaMA-13B,42.5,41.1,41.4,59,50,43.7,37.1,44.3,33.6,49.5,58.3,61.4,68.8,61.6,19,45.7,41.5,24.2,26.4,29.3,37.1,41.9,42.7,37.9,21.8,51.7,30.6,46.8,43.2,34.2
-[Next-GPT](https://github.com/NExT-GPT/NExT-GPT),vicuna-7B,30.7,35.6,33.9,36.4,35.1,25.6,29.9,36.1,30.9,39.2,41.7,31,30.9,27.4,21.2,34.2,31.8,24.4,17.4,24.2,39,35.5,33.8,25.6,24.5,46.7,24.5,45.1,19.8,36.7
+Model,Language Model,Model Size,Evaluation Method,Avg. P1,Avg. P2,Avg. P3,Scene Understanding,Instance Identity,Instance Attribute,Instance Location,Instance Counting,Spatial Relation,Instance Interaction,Visual Reasoning,Text Recognition,Celebrity Recognition,Landmark Recognition,Chart Understanding,Visual Referring Expression,Science Knowledge,Emotion Recognition,Visual Mathematics,Difference Spotting,Meme Comprehension,Global Video Understanding,Action Recognition,Action Predicion,Procedure Understanding,In-Context Captioning,Interleaved Image-Text Analysis,Text-to-Image Generation,Next Image Prediction,Text-Image Creation
+[BLIP-2](https://github.com/salesforce/LAVIS),Flan-T5-XL,3B,PPL,41.0,35.3,0.0,58.5,48.6,49.0,39.1,43.4,36.2,48.5,52.9,60.7,51.8,51.4,19.2,43.2,52.4,29.3,22.0,17.8,38.6,42.5,37.7,36.2,22.9,40.0,30.6,0.0,0.0,0.0
+[InstructBLIP](https://github.com/salesforce/LAVIS),Flan-T5-XL,3B,PPL,42.2,35.7,0.0,58.9,49.7,61.7,35.1,58.1,34.9,47.4,55.9,61.4,48.5,45.4,26.4,41.7,47.7,34.5,21.2,22.8,35.2,41.5,36.1,40.5,24.5,36.7,34.7,0.0,0.0,0.0
+[InstructBLIP-Vicuna](https://github.com/salesforce/LAVIS),Vicuna-7B,7B,PPL,41.4,29.7,0.0,53.6,43.9,49.0,37.8,56.5,35.8,43.3,56.2,57.2,60.3,44.4,27.9,39.2,39.4,23.0,26.5,36.5,55.4,40.4,38.6,31.2,15.6,26.7,32.7,0.0,0.0,0.0
+[LLaVA](https://github.com/haotian-liu/LLaVA),LLaMA-7B,7B,PPL,38.7,30.2,0.0,53.8,47.5,38.3,34.2,42.0,34.7,40.2,52.9,46.4,51.8,45.6,30.3,40.2,37.6,34.3,20.5,27.0,50.0,44.1,36.2,25.1,18.6,40.0,20.4,0.0,0.0,0.0
+[MiniGPT-4](https://github.com/Vision-CAIR/MiniGPT-4),Vicuna-7B,7B,PPL,39.4,34.1,0.0,56.3,49.2,45.8,37.9,45.3,32.6,47.4,57.1,41.8,55.2,45.2,20.2,41.2,43.3,24.2,25.0,19.0,46.7,39.0,38.7,27.4,28.6,45.8,22.5,0.0,0.0,0.0
+[VPGTrans](https://github.com/VPGTrans/VPGTrans),LLaMA-7B,7B,PPL,36.2,23.9,0.0,46.9,38.6,33.6,35.6,27.5,34.4,33.0,50.8,47.6,52.4,38.2,30.1,34.7,36.1,31.5,27.3,24.6,44.0,37.8,38.2,20.9,33.5,19.2,28.6,0.0,0.0,0.0
+[MultiModal-GPT](https://github.com/open-mmlab/Multimodal-GPT),LLaMA-7B,7B,PPL,37.4,34.9,0.0,46.9,42.5,32.0,32.3,27.7,29.7,29.9,48.3,35.2,60.9,50.4,24.2,42.2,37.6,32.1,27.3,40.1,56.5,37.6,38.7,25.3,24.4,39.2,30.6,0.0,0.0,0.0
+[Otter](https://github.com/Luodian/Otter),LLaMA-7B,7B,PPL,36.4,36.6,0.0,45.9,39.7,31.9,31.6,26.4,32.0,33.0,49.2,39.3,59.7,53.0,23.6,41.2,36.1,37.3,22.0,27.4,46.7,36.6,37.9,26.0,24.8,42.5,30.6,0.0,0.0,0.0
+[OpenFlamingo](https://github.com/mlfoundations/open_flamingo),LLaMA-7B,7B,PPL,37.3,35.5,0.0,46.7,42.3,31.7,33.4,27.4,29.8,29.9,47.7,35.6,60.3,49.8,24.2,42.2,39.0,32.1,27.3,39.9,54.9,37.6,38.4,25.2,24.1,38.3,32.7,0.0,0.0,0.0
+[LLaMA-AdapterV2](https://github.com/OpenGVLab/LLaMA-Adapter),LLaMA-7B,7B,PPL,37.5,0.0,0.0,45.2,38.5,29.3,33.0,29.7,35.5,39.2,52.0,48.7,58.5,46.4,24.2,41.2,40.1,39.7,23.5,29.1,52.2,41.9,38.2,18.8,20.3,0.0,0.0,0.0,0.0,0.0
+[GVT](https://github.com/TencentARC/GVT),Vicuna-7B,7B,PPL,34.4,38.6,0.0,41.7,35.5,31.8,29.5,36.2,32.0,32.0,51.1,35.2,39.4,36.4,25.0,36.2,31.1,20.6,22.7,41.5,59.2,40.4,29.7,26.3,24.1,42.5,34.7,0.0,0.0,0.0
+[mPLUG-Owl](https://github.com/X-PLUG/mPLUG-Owl),LLaMA-7B,7B,PPL,39.4,28.9,0.0,49.7,45.3,32.5,36.7,27.3,32.7,44.3,54.7,49.2,70.9,49.6,23.2,44.2,44.0,32.5,23.5,33.5,54.9,42.0,37.8,18.3,19.3,29.2,28.6,0.0,0.0,0.0
+[Kosmos-2](https://github.com/microsoft/unilm/tree/master/kosmos-2),Decoder only 1.3B,1.3B,PPL,46.3,23.3,0.0,63.4,57.1,58.5,44.0,41.4,37.9,55.7,60.7,68.1,82.1,51.4,21.2,48.2,43.7,30.7,28.0,25.2,42.8,48.5,40.8,39.5,30.0,24.2,22.5,0.0,0.0,0.0
+[Qwen-VL-Chat](https://huggingface.co/Qwen/Qwen-VL-Chat),Qwen-7B,7B,PPL,43.1,35.5,0.0,56.5,47.6,54.8,46.9,54.2,40.3,55.7,55.0,47.4,62.4,55.6,25.2,43.7,41.2,20.6,28.8,34.3,47.2,39.7,42.8,29.6,19.1,42.5,28.6,0.0,0.0,0.0
+[LLaVA-1.5](https://github.com/haotian-liu/LLaVA),vicuna-7B,7B,PPL,47.3,30.8,0.0,63.7,62.4,66.7,51.3,60.2,38.5,47.4,59.8,69.0,60.6,49.8,25.0,45.7,56.7,31.1,24.2,35.7,50.3,46.1,39.4,29.4,28.1,39.2,22.5,0.0,0.0,0.0
+[IDEFICS-9b-instruct](https://huggingface.co/HuggingFaceM4/idefics-9b-instruct),LLaMA-7B,7B,PPL,38.0,40.3,0.0,48.2,38.2,37.8,32.9,29.0,32.4,37.1,54.1,45.5,52.4,52.8,22.6,42.7,33.2,26.6,21.2,56.5,48.4,42.7,38.6,23.6,20.5,45.8,34.7,0.0,0.0,0.0
+[InternLM-XComposer-VL](https://github.com/InternLM/InternLM-XComposer),InternLM-7B,7B,PPL,59.2,32.1,0.0,74.8,70.5,67.6,60.5,55.3,53.4,76.3,76.1,61.4,86.1,78.0,27.2,60.3,84.8,68.9,25.8,47.7,56.6,58.6,49.9,37.6,24.9,27.5,36.7,0.0,0.0,0.0
+[Emu](https://github.com/baaivision/Emu),LLaMA-13B,13B,PPL,42.5,41.1,41.4,59.0,50.0,43.7,37.1,44.3,33.6,49.5,58.3,61.4,68.8,61.6,19.0,45.7,41.5,24.2,26.4,29.3,37.1,41.9,42.7,37.9,21.8,51.7,30.6,46.8,43.2,34.2
+[Next-GPT](https://github.com/NExT-GPT/NExT-GPT),vicuna-7B,7B,PPL,30.7,35.6,33.9,36.4,35.1,25.6,29.9,36.1,30.9,39.2,41.7,31.0,30.9,27.4,21.2,34.2,31.8,24.4,17.4,24.2,39.0,35.5,33.8,25.6,24.5,46.7,24.5,45.1,19.8,36.7
+[SEED-LLaMA](https://github.com/AILab-CVC/SEED),LLaMA2-Chat-13B,13B,PPL,43.9,43.4,52.3,64.0,55.0,51.3,45.4,43.3,37.9,56.7,59.2,57.0,55.5,52.8,18.8,49.3,44.8,28.8,24.4,29.5,41.5,46.7,39.4,43.9,20.3,54.2,32.7,50.2,40.7,65.8
+[GPT-4V](https://openai.com/research/gpt-4v-system-card),\-,-,Generate,68.1,44.2,0.0,77.5,73.9,70.6,61.8,56.8,56.9,74.2,78.5,57.6,91.8,97.4,45.1,71.9,66.1,71.1,43.9,67.9,89.3,64.5,65.7,51.7,63.4,29.2,59.2,0.0,0.0,0.0
+[VideoChat](https://github.com/OpenGVLab/Ask-Anything),Vicuna-7B,7B,PPL,37.0,35.3,0.0,44.3,40.7,32.2,36.9,32.9,32.6,42.3,51.1,45.7,35.2,46.8,20.6,43.2,39.4,34.3,19.7,30.3,51.6,41.5,34.0,30.6,27.4,40.0,30.6,0.0,0.0,0.0
+[Video-ChatGPT](https://github.com/mbzuai-oryx/Video-ChatGPT),LLaMA-7B,7B,PPL,36.4,31.0,0.0,44.1,37.0,35.8,30.7,44.2,31.1,29.9,49.9,39.8,49.7,40.6,22.0,33.2,37.2,22.4,25.0,46.1,61.4,42.6,32.2,27.0,19.0,37.5,24.5,0.0,0.0,0.0
+[Valley](https://github.com/RupertLuo/Valley),LLaMA-13B,13B,PPL,34.5,32.2,0.0,45.3,36.4,33.7,30.6,27.1,31.5,35.1,52.0,35.2,44.9,43.4,23.8,33.2,37.2,26.0,22.7,37.1,52.2,31.5,32.1,21.9,26.5,35.8,28.6,0.0,0.0,0.0
[Valley](https://github.com/RupertLuo/Valley),LLaMA-13B,13B,PPL,34.5,32.2,0.0,45.3,36.4,33.7,30.6,27.1,31.5,35.1,52.0,35.2,44.9,43.4,23.8,33.2,37.2,26.0,22.7,37.1,52.2,31.5,32.1,21.9,26.5,35.8,28.6,0.0,0.0,0.0
|