kennymckormick committed
Commit 64d336c
Parent: a6e43e6

add OCRBench

Files changed (2)
  1. gen_table.py +5 -0
  2. meta_data.py +8 -1
gen_table.py CHANGED
@@ -78,6 +78,8 @@ def BUILD_L1_DF(results, fields):
             res[d].append(item[d]['Overall'])
             if d == 'MME':
                 scores.append(item[d]['Overall'] / 28)
+            elif d == 'OCRBench':
+                scores.append(item[d]['Final Score'] / 10)
             else:
                 scores.append(item[d]['Overall'])
             ranks.append(nth_large(item[d]['Overall'], [x[d]['Overall'] for x in results.values()]))
@@ -106,6 +108,9 @@ def BUILD_L2_DF(results, dataset):
     if dataset == 'MME':
         non_overall_fields = [x for x in non_overall_fields if not listinstr(['Perception', 'Cognition'], x)]
         overall_fields = overall_fields + ['Perception', 'Cognition']
+    if dataset == 'OCRBench':
+        non_overall_fields = [x for x in non_overall_fields if not listinstr(['Final Score'], x)]
+        overall_fields = ['Final Score']
 
     for m in results:
         item = results[m]
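The two branches added above fold differently-scaled benchmarks onto one 0-100 leaderboard scale. Below is a minimal sketch of that normalization, assuming MME's Overall has a maximum of 2800 and OCRBench's Final Score a maximum of 1000 (so dividing by 28 and 10 yields percentages); normalize_score is an illustrative helper, not a function in this repository:

# Illustrative helper (not part of the repo): map each dataset's headline
# score onto a common 0-100 scale before ranking.
def normalize_score(dataset: str, item: dict) -> float:
    if dataset == 'MME':
        # Assumption: MME 'Overall' sums sub-scores up to 2800.
        return item['Overall'] / 28
    elif dataset == 'OCRBench':
        # Assumption: OCRBench 'Final Score' is out of 1000.
        return item['Final Score'] / 10
    # Other benchmarks already report an accuracy in [0, 100].
    return item['Overall']

print(normalize_score('MME', {'Overall': 1960}))          # 70.0
print(normalize_score('OCRBench', {'Final Score': 750}))  # 75.0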
meta_data.py CHANGED
@@ -124,4 +124,11 @@ LEADERBOARD_MD['ScienceQA_VAL'] = """
 - During evaluation, we use `GPT-3.5-Turbo-0613` as the choice extractor for all VLMs if the choice can not be extracted via heuristic matching. **Zero-shot** inference is adopted.
 """
 
-LEADERBOARD_MD['ScienceQA_TEST'] = LEADERBOARD_MD['ScienceQA_VAL']
+LEADERBOARD_MD['ScienceQA_TEST'] = LEADERBOARD_MD['ScienceQA_VAL']
+
+LEADERBOARD_MD['OCRBench'] = """
+## OCRBench Evaluation Results
+
+- The evaluation of OCRBench is implemented by the official team: https://github.com/Yuliang-Liu/MultimodalOCR.
+- The performance of GPT4V might be underestimated: due to OpenAI's safety policy, GPT4V refuses to answer about 12% of the questions, returning "Your input image may contain content that is not allowed by our safety system."
+"""