[Leaderboard] Support leaderboard dynamic avg score calculation (#193)
Junming Yang committed

* add VQA meta_data
* Support leaderboard dynamic avg score calculation
- app.py +3 -0
- gen_table.py +50 -33
- meta_data.py +2 -2
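In short: 'Avg Score' and 'Avg Rank' used to be computed once over every benchmark when the table was built; with this commit they are recomputed dynamically over whichever benchmark columns the user selects. A minimal sketch of the averaging rule, using toy numbers rather than this repo's data loader: the /28 and /10 rescalings mirror the code (MME's full score is 2800, OCRBench's is 1000), putting every benchmark on a roughly 0-100 scale before averaging.

    import numpy as np

    # Toy per-model results in the shape generate_table expects:
    # {benchmark: {score_key: value}}, with the 'META' entry omitted here.
    item = {
        'MME': {'Overall': 1950.0},        # out of 2800 -> divide by 28
        'OCRBench': {'Final Score': 520},  # out of 1000 -> divide by 10
        'MMStar': {'Overall': 45.3},       # already on a 0-100 scale
    }

    def dynamic_avg(item, fields):
        scores = []
        for d in fields:
            key = 'Overall' if d != 'OCRBench' else 'Final Score'
            v = item[d][key]
            if d == 'MME':
                v /= 28
            elif d == 'OCRBench':
                v /= 10
            scores.append(v)
        return round(float(np.mean(scores)), 1)

    print(dynamic_avg(item, ['MME', 'MMStar']))              # 57.5
    print(dynamic_avg(item, ['MME', 'OCRBench', 'MMStar']))  # 55.6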
app.py CHANGED

@@ -52,7 +52,9 @@ with gr.Blocks() as demo:
         visible=True)
 
     def filter_df(fields, model_size, model_type):
+        filter_list = ['Avg Score', 'Avg Rank', 'OpenSource', 'Verified']
         headers = check_box['essential'] + fields
+        new_fields = [field for field in fields if field not in filter_list]
         df = cp.deepcopy(table)
         df['flag'] = [model_size_flag(x, model_size) for x in df['Parameters (B)']]
         df = df[df['flag']]
@@ -62,6 +64,7 @@ with gr.Blocks() as demo:
         df = df[df['flag']]
         df.pop('flag')
 
+        df = generate_table(results, new_fields, df)
         comp = gr.components.DataFrame(
             value=df[headers],
             type='pandas',
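One detail worth flagging in filter_df above: the checkbox group mixes benchmark names with meta columns and the averages themselves, so the entries in filter_list must be stripped out before calling generate_table, which only understands benchmark fields. A hypothetical illustration with a made-up selection:

    filter_list = ['Avg Score', 'Avg Rank', 'OpenSource', 'Verified']
    fields = ['Avg Score', 'MME', 'OpenSource', 'OCRBench']  # example UI selection
    new_fields = [f for f in fields if f not in filter_list]
    print(new_fields)  # ['MME', 'OCRBench'] -- only real benchmarks remain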
gen_table.py CHANGED

@@ -60,45 +60,22 @@ def model_type_flag(line, FIELDS):
 
 
 def BUILD_L1_DF(results, fields):
-    res = defaultdict(list)
-    for i, m in enumerate(results):
-        item = results[m]
-        meta = item['META']
-        for k in META_FIELDS:
-            if k == 'Parameters (B)':
-                param = meta['Parameters']
-                res[k].append(float(param.replace('B', '')) if param != '' else None)
-            elif k == 'Method':
-                name, url = meta['Method']
-                res[k].append(f'<a href="{url}">{name}</a>')
-            else:
-                res[k].append(meta[k])
-        scores, ranks = [], []
-        for d in fields:
-            key_name = 'Overall' if d != 'OCRBench' else 'Final Score'
-            res[d].append(item[d][key_name])
-            if d == 'MME':
-                scores.append(item[d][key_name] / 28)
-            elif d == 'OCRBench':
-                scores.append(item[d][key_name] / 10)
-            else:
-                scores.append(item[d][key_name])
-            ranks.append(nth_large(item[d][key_name], [x[d][key_name] for x in results.values()]))
-        res['Avg Score'].append(round(np.mean(scores), 1))
-        res['Avg Rank'].append(round(np.mean(ranks), 2))
-
-    df = pd.DataFrame(res)
-    df = df.sort_values('Avg Score')
-    df = df.iloc[::-1]
-
     check_box = {}
     check_box['essential'] = ['Method', 'Parameters (B)', 'Language Model', 'Vision Model']
-
-
+    # revise here to set the default dataset
+    default_dataset = ['MMBench_TEST_EN', 'MMStar', 'MME', 'MMMU_VAL', 'MathVista', 'OCRBench', 'MMVet']
+    check_box['required'] = ['Avg Score', 'Avg Rank'] + default_dataset
+    check_box['avg'] = ['Avg Score', 'Avg Rank']
+    check_box['all'] = check_box['avg'] + fields
     type_map = defaultdict(lambda: 'number')
     type_map['Method'] = 'html'
     type_map['Language Model'] = type_map['Vision Model'] = type_map['OpenSource'] = type_map['Verified'] = 'str'
     check_box['type_map'] = type_map
+
+    res = generate_table(results, fields)
+    df = pd.DataFrame(res)
+    df = df.sort_values('Avg Score')
+    df = df.iloc[::-1]
     return df, check_box
 
 
@@ -153,3 +130,43 @@ def BUILD_L2_DF(results, dataset):
     type_map['Language Model'] = type_map['Vision Model'] = type_map['OpenSource'] = type_map['Verified'] = 'str'
     check_box['type_map'] = type_map
     return df, check_box
+
+
+def generate_table(results, fields, df=None):
+    res = defaultdict(list)
+    for i, m in enumerate(results):
+        item = results[m]
+        meta = item['META']
+        for k in META_FIELDS:
+            if k == 'Parameters (B)':
+                param = meta['Parameters']
+                res[k].append(float(param.replace('B', '')) if param != '' else None)
+            elif k == 'Method':
+                name, url = meta['Method']
+                res[k].append(f'<a href="{url}">{name}</a>')
+                res['name'].append(name)
+            else:
+                res[k].append(meta[k])
+        scores, ranks = [], []
+        for d in fields:
+            key_name = 'Overall' if d != 'OCRBench' else 'Final Score'
+            res[d].append(item[d][key_name])
+            if d == 'MME':
+                scores.append(item[d][key_name] / 28)
+            elif d == 'OCRBench':
+                scores.append(item[d][key_name] / 10)
+            else:
+                scores.append(item[d][key_name])
+            ranks.append(nth_large(item[d][key_name], [x[d][key_name] for x in results.values()]))
+        res['Avg Score'].append(round(np.mean(scores), 1))
+        res['Avg Rank'].append(round(np.mean(ranks), 2))
+    if df is None:
+        return res
+    else:
+        res = pd.DataFrame(res)
+        df.set_index('name', inplace=True)
+        res.set_index('name', inplace=True)
+        df.update(res)
+        df = df.sort_values('Avg Score')
+        df = df.iloc[::-1]
+        return df
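The df=None split in generate_table serves two callers: BUILD_L1_DF builds the leaderboard from scratch and takes the raw dict, while filter_df in app.py passes the existing table in and only wants its average columns refreshed. The refresh relies on a standard pandas idiom; a sketch with made-up rows:

    import pandas as pd

    df = pd.DataFrame({'name': ['ModelA', 'ModelB'], 'Avg Score': [50.0, 60.0]})
    res = pd.DataFrame({'name': ['ModelA', 'ModelB'], 'Avg Score': [71.2, 64.5]})

    # Align both frames on the model name, then overwrite matching cells.
    df.set_index('name', inplace=True)
    res.set_index('name', inplace=True)
    df.update(res)  # in place; only touches rows/columns present in res

    # Descending sort, as in the commit (ascending sort, then reversed).
    df = df.sort_values('Avg Score').iloc[::-1]
    print(df)  # ModelA 71.2, then ModelB 64.5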
meta_data.py CHANGED

@@ -21,8 +21,8 @@ This leaderboard was last updated: {}.
 META_FIELDS = ['Method', 'Parameters (B)', 'Language Model', 'Vision Model', 'OpenSource', 'Verified']
 MAIN_FIELDS = [
     'MMBench_TEST_EN', 'MMBench_TEST_CN', 'MMStar', 'MME',
-    'MMMU_VAL', 'MathVista', '
-    '
+    'MMMU_VAL', 'MathVista', 'OCRBench', 'AI2D',
+    'HallusionBench', 'SEEDBench_IMG', 'MMVet', 'LLaVABench'
 ]
 MMBENCH_FIELDS = ['MMBench_TEST_EN', 'MMBench_DEV_EN', 'MMBench_TEST_CN', 'MMBench_DEV_CN', 'CCBench']
 MODEL_SIZE = ['<10B', '10B-20B', '20B-40B', '>40B', 'Unknown']
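Since check_box['required'] in gen_table.py must reference benchmarks that meta_data.py actually exposes, a quick illustrative sanity check (constants copied from the new versions above):

    # Every default leaderboard column must be a benchmark MAIN_FIELDS exposes.
    MAIN_FIELDS = [
        'MMBench_TEST_EN', 'MMBench_TEST_CN', 'MMStar', 'MME',
        'MMMU_VAL', 'MathVista', 'OCRBench', 'AI2D',
        'HallusionBench', 'SEEDBench_IMG', 'MMVet', 'LLaVABench'
    ]
    default_dataset = ['MMBench_TEST_EN', 'MMStar', 'MME', 'MMMU_VAL', 'MathVista', 'OCRBench', 'MMVet']
    assert all(d in MAIN_FIELDS for d in default_dataset)
    print('default dataset is consistent with MAIN_FIELDS')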