Upload 15 files
- __pycache__/constants.cpython-38.pyc +0 -0
- app.py +39 -17
- constants.py +2 -0
- file/result.csv +1 -1
- file/result_v2.csv +1 -1
- file/result_v2_task.csv +1 -1
- src/__pycache__/utils_display.cpython-38.pyc +0 -0
- src/auto_leaderboard/__pycache__/model_metadata_type.cpython-38.pyc +0 -0
__pycache__/constants.cpython-38.pyc
CHANGED
Binary files a/__pycache__/constants.cpython-38.pyc and b/__pycache__/constants.cpython-38.pyc differ
app.py
CHANGED
@@ -242,7 +242,9 @@ def add_new_eval(
         csv_task_data = pd.read_csv(CSV_V2_TASK_DIR)
 
         Start_dimension, End_dimension = 1, 28
-        if Evaluation_dimension_2 == 'L1':
+        if Evaluation_dimension_2 == 'Single':
+            End_dimension = 17
+        elif Evaluation_dimension_2 == 'L1':
             End_dimension = 23
         elif Evaluation_dimension_2 == 'L2':
             End_dimension = 25
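For readers skimming the hunk above: the dropdown value selects a half-open scored range [Start_dimension, End_dimension). Below is a minimal standalone sketch of that mapping; the helper name is hypothetical, but the ranges are taken directly from the hunk.

```python
# Hypothetical helper mirroring the branch above; not part of app.py itself.
def scored_dimensions(evaluation_dimension_2: str) -> range:
    """Map the SEED-Bench-2 dropdown value to the scored dimension ids."""
    end_dimension = {
        "Single": 17,  # single-image dimensions 1-16 only
        "L1": 23,      # adds multi-image (17-18) and video (19-22) dimensions
        "L2": 25,      # adds dimensions 23-24
    }.get(evaluation_dimension_2, 28)  # any other value (L3): all dimensions 1-27
    return range(1, end_dimension)

print(max(scored_dimensions("Single")))  # -> 16
```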
@@ -252,25 +254,45 @@ def add_new_eval(
         each_task_accuracy = {i: round(prediction[i]["correct"] / prediction[i]["total"] * 100, 1) if i >= Start_dimension and i < End_dimension else 0 for i in range(1, 28)}
 
         average_single = round(sum(prediction[i]["correct"] for i in range(1, 17)) / sum(prediction[i]["total"] for i in range(1, 17)) * 100, 1)
-        average_multi = round(sum(prediction[i]["correct"] for i in range(17, 19)) / sum(prediction[i]["total"] for i in range(17, 19)) * 100, 1)
-        average_video = round(sum(prediction[i]["correct"] for i in range(19, 23)) / sum(prediction[i]["total"] for i in range(19, 23)) * 100, 1)
-        average_p1 = round(sum(prediction[i]["correct"] for i in range(1, 23)) / sum(prediction[i]["total"] for i in range(1, 23)) * 100, 1)
-
         average_task_single = round(sum(each_task_accuracy[key] for key in range(1,17)) / 16, 1)
-
-
-
-
-
-
-
+
+        # Single
+        if Evaluation_dimension_2 == 'Single':
+            average_multi = 0
+            average_video = 0
+            average_p1 = 0
+            average_p2 = 0
             average_p3 = 0
+            average_task_multi = 0
+            average_task_video = 0
+            average_task_p1 = 0
+            average_task_p2 = 0
             average_task_p3 = 0
         else:
-
-
-
-
+            average_multi = round(sum(prediction[i]["correct"] for i in range(17, 19)) / sum(prediction[i]["total"] for i in range(17, 19)) * 100, 1)
+            average_video = round(sum(prediction[i]["correct"] for i in range(19, 23)) / sum(prediction[i]["total"] for i in range(19, 23)) * 100, 1)
+            average_p1 = round(sum(prediction[i]["correct"] for i in range(1, 23)) / sum(prediction[i]["total"] for i in range(1, 23)) * 100, 1)
+            average_task_multi = round(sum(each_task_accuracy[key] for key in range(17,19)) / 2, 1)
+            average_task_video = round(sum(each_task_accuracy[key] for key in range(19,23)) / 4, 1)
+            average_task_p1 = round(sum(each_task_accuracy[key] for key in range(1,23)) / 22, 1)
+            # L2
+            if Evaluation_dimension_2 == 'L2':
+                average_p2 = round(sum(prediction[i]["correct"] for i in range(23, 25)) / sum(prediction[i]["total"] for i in range(23, 25)) * 100, 1)
+                average_task_p2 = round(sum(each_task_accuracy[key] for key in range(23,25)) / 2, 1)
+                average_p3 = 0
+                average_task_p3 = 0
+            # L3
+            elif Evaluation_dimension_2 == 'L3':
+                average_p2 = round(sum(prediction[i]["correct"] for i in range(23, 25)) / sum(prediction[i]["total"] for i in range(23, 25)) * 100, 1)
+                average_task_p2 = round(sum(each_task_accuracy[key] for key in range(23,25)) / 2, 1)
+                average_p3 = round(sum(prediction[i]["correct"] for i in range(25, 28)) / sum(prediction[i]["total"] for i in range(25, 28)) * 100, 1)
+                average_task_p3 = round(sum(each_task_accuracy[key] for key in range(25,28)) / 3, 1)
+            # L1
+            else:
+                average_p2 = 0
+                average_task_p2 = 0
+                average_p3 = 0
+                average_task_p3 = 0
 
         if LLM_type == 'Other':
             LLM_name = LLM_name_textbox
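This hunk implies a `prediction` accumulator keyed by SEED-Bench-2 dimension id, with per-dimension "correct"/"total" counts. A minimal sketch of the micro-averaging it repeats for each group, using purely illustrative toy counts:

```python
# Minimal sketch of the per-group micro-averaging used above.
# Assumption (implied by the diff, not shown in full): `prediction` maps each
# dimension id (1-27) to counts {"correct": int, "total": int}.
def group_accuracy(prediction: dict, dims: range) -> float:
    """Accuracy in percent over the pooled answers of the given dimensions."""
    correct = sum(prediction[i]["correct"] for i in dims)
    total = sum(prediction[i]["total"] for i in dims)
    return round(correct / total * 100, 1)

# Toy counts for the two multi-image dimensions (ids 17-18), illustrative only.
toy = {17: {"correct": 30, "total": 50}, 18: {"correct": 20, "total": 50}}
print(group_accuracy(toy, range(17, 19)))  # -> 50.0
```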
@@ -707,7 +729,7 @@ with block:
                 interactive=True,
             )
             Evaluation_dimension_2 = gr.Dropdown(
-                choices=["L1", "L2", "L3"],
+                choices=["Single", "L1", "L2", "L3"],
                 label="Evaluation dimension for SEED-Bench 2(for evaluate SEED-Bench 2)",
                 multiselect=False,
                 value="L2",
constants.py
CHANGED
@@ -52,6 +52,8 @@ SUBMIT_INTRODUCTION = """# Submit on SEED Benchmark Introduction
 4. For the evaluation dimension, you can choose "All/Image/Video" for SEED-Bench-1 and "L1/L2/L3" for SEED-Bench-2, and the results of dimensions that are not evaluated will be set to zero.
 5. After clicking 'Submit Eval', you can click 'Refresh' to obtain the latest result in the leaderboard.
 
+Note: The format of the submitted json file is a dict for each line. This dict contains two keys: question_id and prediction. Specific examples are as follows: {"question_id": "5_0", "prediction": "B"}
+
 ## Submit Example
 For example on SEED-Bench-1, if you want to upload InstructBLIP's result in the leaderboard, you need to:
 1. Fill in 'InstructBLIP' in 'Model Name' if it is your first time to submit your result (You can leave 'Revision Model Name' blank).
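The added note describes a JSON-Lines submission file. A minimal sketch of producing one; the file name and the second entry are illustrative, the first entry is the example from the note:

```python
# Minimal sketch of the submission format described in the note above:
# one JSON object per line with "question_id" and "prediction" keys.
import json

answers = [
    {"question_id": "5_0", "prediction": "B"},  # example given in the note
    {"question_id": "5_1", "prediction": "A"},  # hypothetical second entry
]

with open("results.json", "w") as f:  # file name is illustrative
    for row in answers:
        f.write(json.dumps(row) + "\n")
```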
file/result.csv
CHANGED
@@ -43,4 +43,4 @@ Other,[Unified-IO-2 7B (2.5M)](https://unified-io-2.allenai.org),from scratch,7B
 Other,[Unified-IO-2 7B](https://unified-io-2.allenai.org),from scratch,7B,PPL,60.4,65.5,46,71.3,68.8,67.5,55.5,61.2,45.4,62.9,66.5,59.3,58,42.7,34
 Other,[Unified-IO-2 3B (3M)](https://unified-io-2.allenai.org),from scratch,3B,PPL,60.2,64.1,45.6,69,66.6,66.5,54.3,62,42.3,50.5,65.3,44.2,57.5,36.2,39.4
 Other,[Unified-IO-2 3B](https://unified-io-2.allenai.org),from scratch,3B,PPL,58.7,63.8,44.2,68.8,65.8,67.2,52.9,60.4,43.1,55.7,64,41.9,57.5,36,39
-Other,[Unified-IO-2 1B](https://unified-io-2.allenai.org),from scratch,1B,PPL,49.6,55.1,34,63.8,57.7,54.6,41.9,53.7,33.3,51.5,58.3,47.7,39.8,34.5,24.6
+Other,[Unified-IO-2 1B](https://unified-io-2.allenai.org),from scratch,1B,PPL,49.6,55.1,34,63.8,57.7,54.6,41.9,53.7,33.3,51.5,58.3,47.7,39.8,34.5,24.6
file/result_v2.csv
CHANGED
@@ -25,4 +25,4 @@ Model,Language Model,Model Size,Evaluation Method,Avg. Single,Avg. Multi,Avg. Vi
 [GPT-4V](https://openai.com/research/gpt-4v-system-card),\-,-,Generate,69.8,73.1,61.7,68.1,37.9,0.0,77.5,73.9,70.6,61.8,56.8,56.9,74.2,78.5,82.3,91.8,97.4,45.1,71.9,66.1,71.1,43.9,67.9,89.3,64.5,65.7,51.7,63.4,29.2,59.2,0.0,0.0,0.0
 [VideoChat](https://github.com/OpenGVLab/Ask-Anything),Vicuna-7B,7B,PPL,36.7,35.4,34.2,36.2,37.3,0.0,44.3,40.7,32.2,36.9,32.9,32.6,42.3,51.1,45.7,35.2,46.8,20.6,43.2,39.4,34.3,19.7,30.3,51.6,41.5,34.0,30.6,27.4,40.0,30.6,0.0,0.0,0.0
 [Video-ChatGPT](https://github.com/mbzuai-oryx/Video-ChatGPT),LLaMA-7B,7B,PPL,38.3,49.8,31.6,36.9,33.7,0.0,44.1,37.0,35.8,30.7,44.2,31.1,29.9,49.9,39.8,49.7,40.6,22.0,33.2,37.2,22.4,25.0,46.1,61.4,42.6,32.2,27.0,19.0,37.5,24.5,0.0,0.0,0.0
-[Valley](https://github.com/RupertLuo/Valley),LLaMA-13B,13B,PPL,35.3,40.7,28.5,33.9,33.7,0.0,45.3,36.4,33.7,30.6,27.1,31.5,35.1,52.0,35.2,44.9,43.4,23.8,33.2,37.2,26.0,22.7,37.1,52.2,31.5,32.1,21.9,26.5,35.8,28.6,0.0,0.0,0.0
+[Valley](https://github.com/RupertLuo/Valley),LLaMA-13B,13B,PPL,35.3,40.7,28.5,33.9,33.7,0.0,45.3,36.4,33.7,30.6,27.1,31.5,35.1,52.0,35.2,44.9,43.4,23.8,33.2,37.2,26.0,22.7,37.1,52.2,31.5,32.1,21.9,26.5,35.8,28.6,0.0,0.0,0.0
file/result_v2_task.csv
CHANGED
@@ -25,4 +25,4 @@ Model,Language Model,Model Size,Evaluation Method,Avg. Single,Avg. Multi,Avg. Vi
 [GPT-4V](https://openai.com/research/gpt-4v-system-card),\-,-,Generate,70.0,78.6,61.3,69.2,44.2,0.0,77.5,73.9,70.6,61.8,56.8,56.9,74.2,78.5,82.3,91.8,97.4,45.1,71.9,66.1,71.1,43.9,67.9,89.3,64.5,65.7,51.7,63.4,29.2,59.2,0.0,0.0,0.0
 [VideoChat](https://github.com/OpenGVLab/Ask-Anything),Vicuna-7B,7B,PPL,37.4,41.0,33.4,37.0,35.3,0.0,44.3,40.7,32.2,36.9,32.9,32.6,42.3,51.1,45.7,35.2,46.8,20.6,43.2,39.4,34.3,19.7,30.3,51.6,41.5,34.0,30.6,27.4,40.0,30.6,0.0,0.0,0.0
 [Video-ChatGPT](https://github.com/mbzuai-oryx/Video-ChatGPT),LLaMA-7B,7B,PPL,35.8,53.8,30.2,36.4,31.0,0.0,44.1,37.0,35.8,30.7,44.2,31.1,29.9,49.9,39.8,49.7,40.6,22.0,33.2,37.2,22.4,25.0,46.1,61.4,42.6,32.2,27.0,19.0,37.5,24.5,0.0,0.0,0.0
-[Valley](https://github.com/RupertLuo/Valley),LLaMA-13B,13B,PPL,34.9,44.7,28.0,34.5,32.2,0.0,45.3,36.4,33.7,30.6,27.1,31.5,35.1,52.0,35.2,44.9,43.4,23.8,33.2,37.2,26.0,22.7,37.1,52.2,31.5,32.1,21.9,26.5,35.8,28.6,0.0,0.0,0.0
+[Valley](https://github.com/RupertLuo/Valley),LLaMA-13B,13B,PPL,34.9,44.7,28.0,34.5,32.2,0.0,45.3,36.4,33.7,30.6,27.1,31.5,35.1,52.0,35.2,44.9,43.4,23.8,33.2,37.2,26.0,22.7,37.1,52.2,31.5,32.1,21.9,26.5,35.8,28.6,0.0,0.0,0.0
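These CSVs are the leaderboard tables that app.py loads with pandas (e.g. via CSV_V2_TASK_DIR). A minimal sketch of inspecting one locally; the literal path is an assumption, and only columns visible in the hunk header ("Model", "Avg. Single") are referenced:

```python
# Minimal sketch of loading one of the leaderboard tables edited above.
# The literal path is assumed; app.py reads it via constants such as CSV_V2_TASK_DIR.
import pandas as pd

df = pd.read_csv("file/result_v2_task.csv")
# Show models ranked by the "Avg. Single" column from the CSV header.
print(df.sort_values("Avg. Single", ascending=False)[["Model", "Avg. Single"]].head())
```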
src/__pycache__/utils_display.cpython-38.pyc
CHANGED
Binary files a/src/__pycache__/utils_display.cpython-38.pyc and b/src/__pycache__/utils_display.cpython-38.pyc differ
src/auto_leaderboard/__pycache__/model_metadata_type.cpython-38.pyc
CHANGED
Binary files a/src/auto_leaderboard/__pycache__/model_metadata_type.cpython-38.pyc and b/src/auto_leaderboard/__pycache__/model_metadata_type.cpython-38.pyc differ