kennymckormick committed
Commit: 044f86f (parent: 746f6aa)

update lb_info.py

Files changed (1): lb_info.py (+18 -1)
lb_info.py CHANGED
@@ -32,7 +32,7 @@ This leaderboard was last updated: {}.
 """
 # CONSTANTS-FIELDS
 META_FIELDS = ['Method', 'Parameters (B)', 'Language Model', 'Vision Model', 'OpenSource', 'Verified']
-MAIN_FIELDS = ['MMBench_TEST_EN', 'MMBench_TEST_CN', 'CCBench', 'MME', 'SEEDBench_IMG', 'MMVet', 'MMMU_VAL', 'MathVista', 'HallusionBench', 'LLaVABench']
+MAIN_FIELDS = ['MMBench_TEST_EN', 'MMBench_TEST_CN', 'CCBench', 'MME', 'SEEDBench_IMG', 'MMVet', 'MMMU_VAL', 'MathVista', 'HallusionBench', 'LLaVABench', 'AI2D_TEST']
 MMBENCH_FIELDS = ['MMBench_TEST_EN', 'MMBench_DEV_EN', 'MMBench_TEST_CN', 'MMBench_DEV_CN', 'CCBench']
 MODEL_SIZE = ['<10B', '10B-20B', '20B-40B', '>40B', 'Unknown']
 MODEL_TYPE = ['API', 'OpenSource', 'Proprietary']
@@ -122,6 +122,23 @@ LEADERBOARD_MD['LLaVABench'] = """
 - We also include the official results (obtained by gpt-4-0314) for applicable models.
 """
 
+LEADERBOARD_MD['COCO_VAL'] = """
+## COCO Caption Results
+
+- By default, we evaluate the COCO Caption validation set (5000 samples) and report the following metrics: `BLEU-1, BLEU-4, CIDEr, ROUGE-L`
+- We use the following prompt to evaluate all VLMs: `Please describe this image in general. Directly provide the description, do not include prefix like "This image depicts". `
+- **No model-specific prompt is adopted; every VLM receives the same prompt.**
+"""
+
+LEADERBOARD_MD['ScienceQA_VAL'] = """
+## ScienceQA Evaluation Results
+
+- We benchmark the **image** subset of the ScienceQA validation and test sets, and report the Top-1 accuracy.
+- During evaluation, we use `GPT-3.5-Turbo-0613` as the choice extractor for all VLMs when the choice cannot be extracted via heuristic matching. **Zero-shot** inference is adopted.
+"""
+
+LEADERBOARD_MD['ScienceQA_TEST'] = LEADERBOARD_MD['ScienceQA_VAL']
+
 from urllib.request import urlopen
 
 def load_results():
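
For context on the ScienceQA note added above: answers are first extracted by heuristic matching, with `GPT-3.5-Turbo-0613` only as a fallback. A minimal sketch of what such a heuristic extractor could look like (the function name `extract_choice` and the matching rules are illustrative assumptions, not the repo's actual implementation):

```python
import re
import string
from typing import List, Optional

def extract_choice(response: str, choices: List[str]) -> Optional[str]:
    # Hypothetical heuristic extractor; the real pipeline falls back to
    # GPT-3.5-Turbo-0613 when no choice can be recovered here.
    letters = string.ascii_uppercase[:len(choices)]
    # Case 1: the response is essentially just an option letter, e.g. "B", "(B)", "B.".
    m = re.fullmatch(r'\(?([A-Z])\)?\.?', response.strip())
    if m and m.group(1) in letters:
        return m.group(1)
    # Case 2: phrasings like "The answer is C" or "Answer: C".
    m = re.search(r'[Aa]nswer\s*(?:is|:)\s*\(?([A-Z])\)?', response)
    if m and m.group(1) in letters:
        return m.group(1)
    # Case 3: exactly one option's text appears verbatim in the response.
    hits = [letters[i] for i, c in enumerate(choices) if c.lower() in response.lower()]
    if len(hits) == 1:
        return hits[0]
    return None  # ambiguous -> defer to the LLM-based extractor
```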
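The hunk ends just before `load_results`, which pulls the leaderboard data over HTTP via `urlopen`. A rough sketch of that pattern, assuming the results are served as a single JSON file (the URL below is a hypothetical placeholder, not the Space's actual endpoint):

```python
import json
from urllib.request import urlopen

# Hypothetical URL; the real location is defined elsewhere in lb_info.py.
RESULTS_URL = 'http://example.com/leaderboard-results.json'

def load_results():
    # Fetch the raw bytes and decode them into a Python object.
    with urlopen(RESULTS_URL) as resp:
        return json.loads(resp.read().decode('utf-8'))
```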