Upload 15 files
- __pycache__/constants.cpython-38.pyc +0 -0
- app.py +39 -17
- constants.py +2 -0
- file/result.csv +1 -1
- file/result_v2.csv +1 -1
- file/result_v2_task.csv +1 -1
- src/__pycache__/utils_display.cpython-38.pyc +0 -0
- src/auto_leaderboard/__pycache__/model_metadata_type.cpython-38.pyc +0 -0
__pycache__/constants.cpython-38.pyc
CHANGED
Binary files a/__pycache__/constants.cpython-38.pyc and b/__pycache__/constants.cpython-38.pyc differ
app.py
CHANGED
@@ -242,7 +242,9 @@ def add_new_eval(
         csv_task_data = pd.read_csv(CSV_V2_TASK_DIR)
 
         Start_dimension, End_dimension = 1, 28
-        if Evaluation_dimension_2 == 'L1':
+        if Evaluation_dimension_2 == 'Single':
+            End_dimension = 17
+        elif Evaluation_dimension_2 == 'L1':
             End_dimension = 23
         elif Evaluation_dimension_2 == 'L2':
             End_dimension = 25
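For readers skimming the hunk above: the dropdown value selects a half-open scored range [Start_dimension, End_dimension). Below is a minimal standalone sketch of that mapping; the helper name is hypothetical, but the ranges are taken directly from the hunk.

```python
# Hypothetical helper mirroring the branch above; not part of app.py itself.
def scored_dimensions(evaluation_dimension_2: str) -> range:
    """Map the SEED-Bench-2 dropdown value to the scored dimension ids."""
    end_dimension = {
        "Single": 17,  # single-image dimensions 1-16 only
        "L1": 23,      # adds multi-image (17-18) and video (19-22) dimensions
        "L2": 25,      # adds dimensions 23-24
    }.get(evaluation_dimension_2, 28)  # any other value (L3): all dimensions 1-27
    return range(1, end_dimension)

print(max(scored_dimensions("Single")))  # -> 16
```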
@@ -252,25 +254,45 @@ def add_new_eval(
         each_task_accuracy = {i: round(prediction[i]["correct"] / prediction[i]["total"] * 100, 1) if i >= Start_dimension and i < End_dimension else 0 for i in range(1, 28)}
 
         average_single = round(sum(prediction[i]["correct"] for i in range(1, 17)) / sum(prediction[i]["total"] for i in range(1, 17)) * 100, 1)
-        average_multi = round(sum(prediction[i]["correct"] for i in range(17, 19)) / sum(prediction[i]["total"] for i in range(17, 19)) * 100, 1)
-        average_video = round(sum(prediction[i]["correct"] for i in range(19, 23)) / sum(prediction[i]["total"] for i in range(19, 23)) * 100, 1)
-        average_p1 = round(sum(prediction[i]["correct"] for i in range(1, 23)) / sum(prediction[i]["total"] for i in range(1, 23)) * 100, 1)
-
         average_task_single = round(sum(each_task_accuracy[key] for key in range(1,17)) / 16, 1)
-
-
-
-
-
-
-
+
+        # Single
+        if Evaluation_dimension_2 == 'Single':
+            average_multi = 0
+            average_video = 0
+            average_p1 = 0
+            average_p2 = 0
             average_p3 = 0
+            average_task_multi = 0
+            average_task_video = 0
+            average_task_p1 = 0
+            average_task_p2 = 0
             average_task_p3 = 0
         else:
-
-
-
-
+            average_multi = round(sum(prediction[i]["correct"] for i in range(17, 19)) / sum(prediction[i]["total"] for i in range(17, 19)) * 100, 1)
+            average_video = round(sum(prediction[i]["correct"] for i in range(19, 23)) / sum(prediction[i]["total"] for i in range(19, 23)) * 100, 1)
+            average_p1 = round(sum(prediction[i]["correct"] for i in range(1, 23)) / sum(prediction[i]["total"] for i in range(1, 23)) * 100, 1)
+            average_task_multi = round(sum(each_task_accuracy[key] for key in range(17,19)) / 2, 1)
+            average_task_video = round(sum(each_task_accuracy[key] for key in range(19,23)) / 4, 1)
+            average_task_p1 = round(sum(each_task_accuracy[key] for key in range(1,23)) / 22, 1)
+            # L2
+            if Evaluation_dimension_2 == 'L2':
+                average_p2 = round(sum(prediction[i]["correct"] for i in range(23, 25)) / sum(prediction[i]["total"] for i in range(23, 25)) * 100, 1)
+                average_task_p2 = round(sum(each_task_accuracy[key] for key in range(23,25)) / 2, 1)
+                average_p3 = 0
+                average_task_p3 = 0
+            # L3
+            elif Evaluation_dimension_2 == 'L3':
+                average_p2 = round(sum(prediction[i]["correct"] for i in range(23, 25)) / sum(prediction[i]["total"] for i in range(23, 25)) * 100, 1)
+                average_task_p2 = round(sum(each_task_accuracy[key] for key in range(23,25)) / 2, 1)
+                average_p3 = round(sum(prediction[i]["correct"] for i in range(25, 28)) / sum(prediction[i]["total"] for i in range(25, 28)) * 100, 1)
+                average_task_p3 = round(sum(each_task_accuracy[key] for key in range(25,28)) / 3, 1)
+            # L1
+            else:
+                average_p2 = 0
+                average_task_p2 = 0
+                average_p3 = 0
+                average_task_p3 = 0
 
         if LLM_type == 'Other':
             LLM_name = LLM_name_textbox
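This hunk implies a `prediction` accumulator keyed by SEED-Bench-2 dimension id, with per-dimension "correct"/"total" counts. A minimal sketch of the micro-averaging it repeats for each group, using purely illustrative toy counts:

```python
# Minimal sketch of the per-group micro-averaging used above.
# Assumption (implied by the diff, not shown in full): `prediction` maps each
# dimension id (1-27) to counts {"correct": int, "total": int}.
def group_accuracy(prediction: dict, dims: range) -> float:
    """Accuracy in percent over the pooled answers of the given dimensions."""
    correct = sum(prediction[i]["correct"] for i in dims)
    total = sum(prediction[i]["total"] for i in dims)
    return round(correct / total * 100, 1)

# Toy counts for the two multi-image dimensions (ids 17-18), illustrative only.
toy = {17: {"correct": 30, "total": 50}, 18: {"correct": 20, "total": 50}}
print(group_accuracy(toy, range(17, 19)))  # -> 50.0
```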
@@ -707,7 +729,7 @@ with block:
                 interactive=True,
             )
             Evaluation_dimension_2 = gr.Dropdown(
-                choices=["L1", "L2", "L3"],
+                choices=["Single", "L1", "L2", "L3"],
                 label="Evaluation dimension for SEED-Bench 2(for evaluate SEED-Bench 2)",
                 multiselect=False,
                 value="L2",
constants.py
CHANGED
@@ -52,6 +52,8 @@ SUBMIT_INTRODUCTION = """# Submit on SEED Benchmark Introduction
 4. For the evaluation dimension, you can choose "All/Image/Video" for SEED-Bench-1 and "L1/L2/L3" for SEED-Bench-2, and the results of dimensions that are not evaluated will be set to zero.
 5. After clicking 'Submit Eval', you can click 'Refresh' to obtain the latest result in the leaderboard.
 
+Note: The format of the submitted json file is a dict for each line. This dict contains two keys: question_id and prediction. Specific examples are as follows: {"question_id": "5_0", "prediction": "B"}
+
 ## Submit Example
 For example on SEED-Bench-1, if you want to upload InstructBLIP's result in the leaderboard, you need to:
 1. Fill in 'InstructBLIP' in 'Model Name' if it is your first time to submit your result (You can leave 'Revision Model Name' blank).
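The added note describes a JSON-Lines submission file. A minimal sketch of producing one; the file name and the second entry are illustrative, the first entry is the example from the note:

```python
# Minimal sketch of the submission format described in the note above:
# one JSON object per line with "question_id" and "prediction" keys.
import json

answers = [
    {"question_id": "5_0", "prediction": "B"},  # example given in the note
    {"question_id": "5_1", "prediction": "A"},  # hypothetical second entry
]

with open("results.json", "w") as f:  # file name is illustrative
    for row in answers:
        f.write(json.dumps(row) + "\n")
```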
file/result.csv
CHANGED
@@ -43,4 +43,4 @@ Other,[Unified-IO-2 7B (2.5M)](https://unified-io-2.allenai.org),from scratch,7B
 Other,[Unified-IO-2 7B](https://unified-io-2.allenai.org),from scratch,7B,PPL,60.4,65.5,46,71.3,68.8,67.5,55.5,61.2,45.4,62.9,66.5,59.3,58,42.7,34
 Other,[Unified-IO-2 3B (3M)](https://unified-io-2.allenai.org),from scratch,3B,PPL,60.2,64.1,45.6,69,66.6,66.5,54.3,62,42.3,50.5,65.3,44.2,57.5,36.2,39.4
 Other,[Unified-IO-2 3B](https://unified-io-2.allenai.org),from scratch,3B,PPL,58.7,63.8,44.2,68.8,65.8,67.2,52.9,60.4,43.1,55.7,64,41.9,57.5,36,39
-Other,[Unified-IO-2 1B](https://unified-io-2.allenai.org),from scratch,1B,PPL,49.6,55.1,34,63.8,57.7,54.6,41.9,53.7,33.3,51.5,58.3,47.7,39.8,34.5,24.6
+Other,[Unified-IO-2 1B](https://unified-io-2.allenai.org),from scratch,1B,PPL,49.6,55.1,34,63.8,57.7,54.6,41.9,53.7,33.3,51.5,58.3,47.7,39.8,34.5,24.6
file/result_v2.csv
CHANGED
@@ -25,4 +25,4 @@ Model,Language Model,Model Size,Evaluation Method,Avg. Single,Avg. Multi,Avg. Vi
 [GPT-4V](https://openai.com/research/gpt-4v-system-card),\-,-,Generate,69.8,73.1,61.7,68.1,37.9,0.0,77.5,73.9,70.6,61.8,56.8,56.9,74.2,78.5,82.3,91.8,97.4,45.1,71.9,66.1,71.1,43.9,67.9,89.3,64.5,65.7,51.7,63.4,29.2,59.2,0.0,0.0,0.0
 [VideoChat](https://github.com/OpenGVLab/Ask-Anything),Vicuna-7B,7B,PPL,36.7,35.4,34.2,36.2,37.3,0.0,44.3,40.7,32.2,36.9,32.9,32.6,42.3,51.1,45.7,35.2,46.8,20.6,43.2,39.4,34.3,19.7,30.3,51.6,41.5,34.0,30.6,27.4,40.0,30.6,0.0,0.0,0.0
 [Video-ChatGPT](https://github.com/mbzuai-oryx/Video-ChatGPT),LLaMA-7B,7B,PPL,38.3,49.8,31.6,36.9,33.7,0.0,44.1,37.0,35.8,30.7,44.2,31.1,29.9,49.9,39.8,49.7,40.6,22.0,33.2,37.2,22.4,25.0,46.1,61.4,42.6,32.2,27.0,19.0,37.5,24.5,0.0,0.0,0.0
-[Valley](https://github.com/RupertLuo/Valley),LLaMA-13B,13B,PPL,35.3,40.7,28.5,33.9,33.7,0.0,45.3,36.4,33.7,30.6,27.1,31.5,35.1,52.0,35.2,44.9,43.4,23.8,33.2,37.2,26.0,22.7,37.1,52.2,31.5,32.1,21.9,26.5,35.8,28.6,0.0,0.0,0.0
+[Valley](https://github.com/RupertLuo/Valley),LLaMA-13B,13B,PPL,35.3,40.7,28.5,33.9,33.7,0.0,45.3,36.4,33.7,30.6,27.1,31.5,35.1,52.0,35.2,44.9,43.4,23.8,33.2,37.2,26.0,22.7,37.1,52.2,31.5,32.1,21.9,26.5,35.8,28.6,0.0,0.0,0.0
file/result_v2_task.csv
CHANGED
@@ -25,4 +25,4 @@ Model,Language Model,Model Size,Evaluation Method,Avg. Single,Avg. Multi,Avg. Vi
 [GPT-4V](https://openai.com/research/gpt-4v-system-card),\-,-,Generate,70.0,78.6,61.3,69.2,44.2,0.0,77.5,73.9,70.6,61.8,56.8,56.9,74.2,78.5,82.3,91.8,97.4,45.1,71.9,66.1,71.1,43.9,67.9,89.3,64.5,65.7,51.7,63.4,29.2,59.2,0.0,0.0,0.0
 [VideoChat](https://github.com/OpenGVLab/Ask-Anything),Vicuna-7B,7B,PPL,37.4,41.0,33.4,37.0,35.3,0.0,44.3,40.7,32.2,36.9,32.9,32.6,42.3,51.1,45.7,35.2,46.8,20.6,43.2,39.4,34.3,19.7,30.3,51.6,41.5,34.0,30.6,27.4,40.0,30.6,0.0,0.0,0.0
 [Video-ChatGPT](https://github.com/mbzuai-oryx/Video-ChatGPT),LLaMA-7B,7B,PPL,35.8,53.8,30.2,36.4,31.0,0.0,44.1,37.0,35.8,30.7,44.2,31.1,29.9,49.9,39.8,49.7,40.6,22.0,33.2,37.2,22.4,25.0,46.1,61.4,42.6,32.2,27.0,19.0,37.5,24.5,0.0,0.0,0.0
-[Valley](https://github.com/RupertLuo/Valley),LLaMA-13B,13B,PPL,34.9,44.7,28.0,34.5,32.2,0.0,45.3,36.4,33.7,30.6,27.1,31.5,35.1,52.0,35.2,44.9,43.4,23.8,33.2,37.2,26.0,22.7,37.1,52.2,31.5,32.1,21.9,26.5,35.8,28.6,0.0,0.0,0.0
+[Valley](https://github.com/RupertLuo/Valley),LLaMA-13B,13B,PPL,34.9,44.7,28.0,34.5,32.2,0.0,45.3,36.4,33.7,30.6,27.1,31.5,35.1,52.0,35.2,44.9,43.4,23.8,33.2,37.2,26.0,22.7,37.1,52.2,31.5,32.1,21.9,26.5,35.8,28.6,0.0,0.0,0.0
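These CSVs are the leaderboard tables that app.py loads with pandas (e.g. via CSV_V2_TASK_DIR). A minimal sketch of inspecting one locally; the literal path is an assumption, and only columns visible in the hunk header ("Model", "Avg. Single") are referenced:

```python
# Minimal sketch of loading one of the leaderboard tables edited above.
# The literal path is assumed; app.py reads it via constants such as CSV_V2_TASK_DIR.
import pandas as pd

df = pd.read_csv("file/result_v2_task.csv")
# Show models ranked by the "Avg. Single" column from the CSV header.
print(df.sort_values("Avg. Single", ascending=False)[["Model", "Avg. Single"]].head())
```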
src/__pycache__/utils_display.cpython-38.pyc
CHANGED
Binary files a/src/__pycache__/utils_display.cpython-38.pyc and b/src/__pycache__/utils_display.cpython-38.pyc differ
src/auto_leaderboard/__pycache__/model_metadata_type.cpython-38.pyc
CHANGED
Binary files a/src/auto_leaderboard/__pycache__/model_metadata_type.cpython-38.pyc and b/src/auto_leaderboard/__pycache__/model_metadata_type.cpython-38.pyc differ