tttoaster committed
Commit a7b9847
Parent: e08fe51

Upload 15 files

__pycache__/constants.cpython-38.pyc CHANGED
Binary files a/__pycache__/constants.cpython-38.pyc and b/__pycache__/constants.cpython-38.pyc differ
 
app.py CHANGED
@@ -242,7 +242,9 @@ def add_new_eval(
     csv_task_data = pd.read_csv(CSV_V2_TASK_DIR)
 
     Start_dimension, End_dimension = 1, 28
-    if Evaluation_dimension_2 == 'L1':
+    if Evaluation_dimension_2 == 'Single':
+        End_dimension = 17
+    elif Evaluation_dimension_2 == 'L1':
         End_dimension = 23
     elif Evaluation_dimension_2 == 'L2':
         End_dimension = 25
@@ -252,25 +254,45 @@ def add_new_eval(
     each_task_accuracy = {i: round(prediction[i]["correct"] / prediction[i]["total"] * 100, 1) if i >= Start_dimension and i < End_dimension else 0 for i in range(1, 28)}
 
     average_single = round(sum(prediction[i]["correct"] for i in range(1, 17)) / sum(prediction[i]["total"] for i in range(1, 17)) * 100, 1)
-    average_multi = round(sum(prediction[i]["correct"] for i in range(17, 19)) / sum(prediction[i]["total"] for i in range(17, 19)) * 100, 1)
-    average_video = round(sum(prediction[i]["correct"] for i in range(19, 23)) / sum(prediction[i]["total"] for i in range(19, 23)) * 100, 1)
-    average_p1 = round(sum(prediction[i]["correct"] for i in range(1, 23)) / sum(prediction[i]["total"] for i in range(1, 23)) * 100, 1)
-
     average_task_single = round(sum(each_task_accuracy[key] for key in range(1,17)) / 16, 1)
-    average_task_multi = round(sum(each_task_accuracy[key] for key in range(17,19)) / 2, 1)
-    average_task_video = round(sum(each_task_accuracy[key] for key in range(19,23)) / 4, 1)
-    average_task_p1 = round(sum(each_task_accuracy[key] for key in range(1,23)) / 22, 1)
-
-    if Evaluation_dimension_2 == 'L2':
-        average_p2 = round(sum(prediction[i]["correct"] for i in range(23, 25)) / sum(prediction[i]["total"] for i in range(23, 25)) * 100, 1)
-        average_task_p2 = round(sum(each_task_accuracy[key] for key in range(23,25)) / 2, 1)
+
+    # Single
+    if Evaluation_dimension_2 == 'Single':
+        average_multi = 0
+        average_video = 0
+        average_p1 = 0
+        average_p2 = 0
         average_p3 = 0
+        average_task_multi = 0
+        average_task_video = 0
+        average_task_p1 = 0
+        average_task_p2 = 0
         average_task_p3 = 0
     else:
-        average_p2 = round(sum(prediction[i]["correct"] for i in range(23, 25)) / sum(prediction[i]["total"] for i in range(23, 25)) * 100, 1)
-        average_task_p2 = round(sum(each_task_accuracy[key] for key in range(23,25)) / 2, 1)
-        average_p3 = round(sum(prediction[i]["correct"] for i in range(25, 28)) / sum(prediction[i]["total"] for i in range(25, 28)) * 100, 1)
-        average_task_p3 = round(sum(each_task_accuracy[key] for key in range(25,28)) / 3, 1)
+        average_multi = round(sum(prediction[i]["correct"] for i in range(17, 19)) / sum(prediction[i]["total"] for i in range(17, 19)) * 100, 1)
+        average_video = round(sum(prediction[i]["correct"] for i in range(19, 23)) / sum(prediction[i]["total"] for i in range(19, 23)) * 100, 1)
+        average_p1 = round(sum(prediction[i]["correct"] for i in range(1, 23)) / sum(prediction[i]["total"] for i in range(1, 23)) * 100, 1)
+        average_task_multi = round(sum(each_task_accuracy[key] for key in range(17,19)) / 2, 1)
+        average_task_video = round(sum(each_task_accuracy[key] for key in range(19,23)) / 4, 1)
+        average_task_p1 = round(sum(each_task_accuracy[key] for key in range(1,23)) / 22, 1)
+        # L2
+        if Evaluation_dimension_2 == 'L2':
+            average_p2 = round(sum(prediction[i]["correct"] for i in range(23, 25)) / sum(prediction[i]["total"] for i in range(23, 25)) * 100, 1)
+            average_task_p2 = round(sum(each_task_accuracy[key] for key in range(23,25)) / 2, 1)
+            average_p3 = 0
+            average_task_p3 = 0
+        # L3
+        elif Evaluation_dimension_2 == 'L3':
+            average_p2 = round(sum(prediction[i]["correct"] for i in range(23, 25)) / sum(prediction[i]["total"] for i in range(23, 25)) * 100, 1)
+            average_task_p2 = round(sum(each_task_accuracy[key] for key in range(23,25)) / 2, 1)
+            average_p3 = round(sum(prediction[i]["correct"] for i in range(25, 28)) / sum(prediction[i]["total"] for i in range(25, 28)) * 100, 1)
+            average_task_p3 = round(sum(each_task_accuracy[key] for key in range(25,28)) / 3, 1)
+        # L1
+        else:
+            average_p2 = 0
+            average_task_p2 = 0
+            average_p3 = 0
+            average_task_p3 = 0
 
     if LLM_type == 'Other':
         LLM_name = LLM_name_textbox
@@ -707,7 +729,7 @@ with block:
         interactive=True,
     )
     Evaluation_dimension_2 = gr.Dropdown(
-        choices=["L1", "L2", "L3"],
+        choices=["Single", "L1", "L2", "L3"],
         label="Evaluation dimension for SEED-Bench 2(for evaluate SEED-Bench 2)",
         multiselect=False,
         value="L2",
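For orientation, the averaging scheme in the second hunk reduces to a few fixed dimension ranges. A minimal sketch, assuming the diff's prediction dict (keyed by dimension index 1-27, each entry holding "correct" and "total" counts); the instance_average helper and GROUPS table are illustrative names, not part of app.py:

# Dimension groups used by the hunk above: Single covers dims 1-16; L1 adds
# multi-image (17-18) and video (19-22); L2 adds P2 (23-24); L3 adds P3 (25-27).
GROUPS = {
    "single": (1, 17), "multi": (17, 19), "video": (19, 23),
    "p1": (1, 23), "p2": (23, 25), "p3": (25, 28),
}

def instance_average(prediction, start, end):
    # Instance-level accuracy (%) over dimensions [start, end), matching the
    # round(... * 100, 1) pattern in the diff; 0 for unevaluated groups.
    correct = sum(prediction[i]["correct"] for i in range(start, end))
    total = sum(prediction[i]["total"] for i in range(start, end))
    return round(correct / total * 100, 1) if total else 0

# e.g. for an 'L1' submission: instance_average(prediction, *GROUPS["p1"])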
constants.py CHANGED
@@ -52,6 +52,8 @@ SUBMIT_INTRODUCTION = """# Submit on SEED Benchmark Introduction
 4. For the evaluation dimension, you can choose "All/Image/Video" for SEED-Bench-1 and "L1/L2/L3" for SEED-Bench-2, and the results of dimensions that are not evaluated will be set to zero.
 5. After clicking 'Submit Eval', you can click 'Refresh' to obtain the latest result in the leaderboard.
 
+Note: The format of the submitted json file is a dict for each line. This dict contains two keys: question_id and prediction. Specific examples are as follows: {"question_id": "5_0", "prediction": "B"}
+
 ## Submit Example
 For example on SEED-Bench-1, if you want to upload InstructBLIP's result in the leaderboard, you need to:
 1. Fill in 'InstructBLIP' in 'Model Name' if it is your first time to submit your result (You can leave 'Revision Model Name' blank).
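The note added here describes a JSON-Lines layout: one dict per line, with question_id and prediction keys. A minimal sketch of producing such a file; the file name and the example entries are illustrative:

import json

# Illustrative predictions; the question_id format follows the example
# given in the submission note ({"question_id": "5_0", "prediction": "B"}).
predictions = [
    {"question_id": "5_0", "prediction": "B"},
    {"question_id": "5_1", "prediction": "C"},
]

# One JSON dict per line, as the note specifies.
with open("submission.json", "w") as f:
    for item in predictions:
        f.write(json.dumps(item) + "\n")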
file/result.csv CHANGED
@@ -43,4 +43,4 @@ Other,[Unified-IO-2 7B (2.5M)](https://unified-io-2.allenai.org),from scratch,7B
 Other,[Unified-IO-2 7B](https://unified-io-2.allenai.org),from scratch,7B,PPL,60.4,65.5,46,71.3,68.8,67.5,55.5,61.2,45.4,62.9,66.5,59.3,58,42.7,34
 Other,[Unified-IO-2 3B (3M)](https://unified-io-2.allenai.org),from scratch,3B,PPL,60.2,64.1,45.6,69,66.6,66.5,54.3,62,42.3,50.5,65.3,44.2,57.5,36.2,39.4
 Other,[Unified-IO-2 3B](https://unified-io-2.allenai.org),from scratch,3B,PPL,58.7,63.8,44.2,68.8,65.8,67.2,52.9,60.4,43.1,55.7,64,41.9,57.5,36,39
-Other,[Unified-IO-2 1B](https://unified-io-2.allenai.org),from scratch,1B,PPL,49.6,55.1,34,63.8,57.7,54.6,41.9,53.7,33.3,51.5,58.3,47.7,39.8,34.5,24.6
+Other,[Unified-IO-2 1B](https://unified-io-2.allenai.org),from scratch,1B,PPL,49.6,55.1,34,63.8,57.7,54.6,41.9,53.7,33.3,51.5,58.3,47.7,39.8,34.5,24.6
file/result_v2.csv CHANGED
@@ -25,4 +25,4 @@ Model,Language Model,Model Size,Evaluation Method,Avg. Single,Avg. Multi,Avg. Vi
 [GPT-4V](https://openai.com/research/gpt-4v-system-card),\-,-,Generate,69.8,73.1,61.7,68.1,37.9,0.0,77.5,73.9,70.6,61.8,56.8,56.9,74.2,78.5,82.3,91.8,97.4,45.1,71.9,66.1,71.1,43.9,67.9,89.3,64.5,65.7,51.7,63.4,29.2,59.2,0.0,0.0,0.0
 [VideoChat](https://github.com/OpenGVLab/Ask-Anything),Vicuna-7B,7B,PPL,36.7,35.4,34.2,36.2,37.3,0.0,44.3,40.7,32.2,36.9,32.9,32.6,42.3,51.1,45.7,35.2,46.8,20.6,43.2,39.4,34.3,19.7,30.3,51.6,41.5,34.0,30.6,27.4,40.0,30.6,0.0,0.0,0.0
 [Video-ChatGPT](https://github.com/mbzuai-oryx/Video-ChatGPT),LLaMA-7B,7B,PPL,38.3,49.8,31.6,36.9,33.7,0.0,44.1,37.0,35.8,30.7,44.2,31.1,29.9,49.9,39.8,49.7,40.6,22.0,33.2,37.2,22.4,25.0,46.1,61.4,42.6,32.2,27.0,19.0,37.5,24.5,0.0,0.0,0.0
-[Valley](https://github.com/RupertLuo/Valley),LLaMA-13B,13B,PPL,35.3,40.7,28.5,33.9,33.7,0.0,45.3,36.4,33.7,30.6,27.1,31.5,35.1,52.0,35.2,44.9,43.4,23.8,33.2,37.2,26.0,22.7,37.1,52.2,31.5,32.1,21.9,26.5,35.8,28.6,0.0,0.0,0.0
+[Valley](https://github.com/RupertLuo/Valley),LLaMA-13B,13B,PPL,35.3,40.7,28.5,33.9,33.7,0.0,45.3,36.4,33.7,30.6,27.1,31.5,35.1,52.0,35.2,44.9,43.4,23.8,33.2,37.2,26.0,22.7,37.1,52.2,31.5,32.1,21.9,26.5,35.8,28.6,0.0,0.0,0.0
file/result_v2_task.csv CHANGED
@@ -25,4 +25,4 @@ Model,Language Model,Model Size,Evaluation Method,Avg. Single,Avg. Multi,Avg. Vi
 [GPT-4V](https://openai.com/research/gpt-4v-system-card),\-,-,Generate,70.0,78.6,61.3,69.2,44.2,0.0,77.5,73.9,70.6,61.8,56.8,56.9,74.2,78.5,82.3,91.8,97.4,45.1,71.9,66.1,71.1,43.9,67.9,89.3,64.5,65.7,51.7,63.4,29.2,59.2,0.0,0.0,0.0
 [VideoChat](https://github.com/OpenGVLab/Ask-Anything),Vicuna-7B,7B,PPL,37.4,41.0,33.4,37.0,35.3,0.0,44.3,40.7,32.2,36.9,32.9,32.6,42.3,51.1,45.7,35.2,46.8,20.6,43.2,39.4,34.3,19.7,30.3,51.6,41.5,34.0,30.6,27.4,40.0,30.6,0.0,0.0,0.0
 [Video-ChatGPT](https://github.com/mbzuai-oryx/Video-ChatGPT),LLaMA-7B,7B,PPL,35.8,53.8,30.2,36.4,31.0,0.0,44.1,37.0,35.8,30.7,44.2,31.1,29.9,49.9,39.8,49.7,40.6,22.0,33.2,37.2,22.4,25.0,46.1,61.4,42.6,32.2,27.0,19.0,37.5,24.5,0.0,0.0,0.0
-[Valley](https://github.com/RupertLuo/Valley),LLaMA-13B,13B,PPL,34.9,44.7,28.0,34.5,32.2,0.0,45.3,36.4,33.7,30.6,27.1,31.5,35.1,52.0,35.2,44.9,43.4,23.8,33.2,37.2,26.0,22.7,37.1,52.2,31.5,32.1,21.9,26.5,35.8,28.6,0.0,0.0,0.0
+[Valley](https://github.com/RupertLuo/Valley),LLaMA-13B,13B,PPL,34.9,44.7,28.0,34.5,32.2,0.0,45.3,36.4,33.7,30.6,27.1,31.5,35.1,52.0,35.2,44.9,43.4,23.8,33.2,37.2,26.0,22.7,37.1,52.2,31.5,32.1,21.9,26.5,35.8,28.6,0.0,0.0,0.0
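These result tables are what app.py loads back into pandas (pd.read_csv in the first hunk above). A minimal sketch of reading one, assuming the relative path shown in the diff header; in app.py the path comes from a constant such as CSV_V2_TASK_DIR:

import pandas as pd

# Path as shown in the diff header; app.py resolves it via a constant.
csv_task_data = pd.read_csv("file/result_v2_task.csv")

# Each row is one model; per-dimension scores of 0.0 mark dimensions that
# were not evaluated at the submitted level, per the note in constants.py.
print(csv_task_data[["Model", "Model Size"]].head())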
src/__pycache__/utils_display.cpython-38.pyc CHANGED
Binary files a/src/__pycache__/utils_display.cpython-38.pyc and b/src/__pycache__/utils_display.cpython-38.pyc differ
 
src/auto_leaderboard/__pycache__/model_metadata_type.cpython-38.pyc CHANGED
Binary files a/src/auto_leaderboard/__pycache__/model_metadata_type.cpython-38.pyc and b/src/auto_leaderboard/__pycache__/model_metadata_type.cpython-38.pyc differ