import copy as cp
import json
from collections import defaultdict
from urllib.request import urlopen

import gradio as gr
import numpy as np
import pandas as pd

from meta_data import DEFAULT_BENCH, META_FIELDS, RESULTS


def load_results_local():
    # Load the leaderboard results from the local JSON file pointed at by RESULTS.
    with open(RESULTS, 'r') as infile:
        data = json.load(infile)
    return data
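# For orientation: generate_table() below expects the loaded JSON to look roughly
# like the sketch here. The model name, benchmark names and values are illustrative
# placeholders, not entries from the real results file.
#
#     {
#         "SomeModel": {
#             "META": {"Method": ["SomeModel", "..."], "Parameters": "7B", "...": "..."},
#             "MMBench_TEST_EN_V11": {"Overall": 70.1},
#             "MMBench_TEST_CN_V11": {"Overall": 68.8},
#             "OCRBench": {"Final Score": 505},
#             "...": {}
#         }
#     }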


def nth_large(val, vals):
    # Rank of `val` among `vals`: 1 plus the number of strictly larger values,
    # so the best score gets rank 1 and tied values share the same rank.
    return sum([1 for v in vals if v > val]) + 1
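# Hypothetical example: nth_large(70.0, [72.5, 70.0, 68.1]) == 2 and
# nth_large(72.5, [72.5, 70.0, 68.1]) == 1.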


def model_size_flag(sz, FIELDS):
    # Keep a model if its parameter count `sz` (in billions) matches one of the
    # size labels selected in FIELDS; NaN sizes pass only when 'Unknown' is selected.
    if pd.isna(sz) and 'Unknown' in FIELDS:
        return True
    if pd.isna(sz):
        return False
    if '7B' in FIELDS and sz == 7:
        return True
    if '13B' in FIELDS and sz == 13:
        return True
    if '70B' in FIELDS and sz == 70:
        return True
    return False
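# Hypothetical example: model_size_flag(7.0, ['7B', 'Unknown']) is True,
# model_size_flag(float('nan'), ['7B']) is False, and a size such as 34.0 that
# matches none of the selected labels is filtered out.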


def model_type_flag(line, FIELDS):
    # Keep a row if it matches any of the model-type checkboxes selected in FIELDS.
    if 'OpenSource' in FIELDS and line['OpenSource'] == 'Yes':
        return True
    if 'API' in FIELDS and line['OpenSource'] == 'No' and line['Verified'] == 'Yes':
        return True
    if 'Commercial LLMs' in FIELDS and line['Commercial LLMs'] == 'Yes':
        return True
    if 'General LLMs' in FIELDS and line['General LLMs'] == 'Yes':
        return True
    if 'Medical LLMs' in FIELDS and line['Medical LLMs'] == 'Yes':
        return True
    if 'SOTA' in FIELDS and line['SOTA'] == 'Yes':
        return True
    return False
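# Hypothetical example: with FIELDS = ['OpenSource', 'API'], a row with
# {'OpenSource': 'No', 'Verified': 'Yes'} passes via the 'API' branch, while
# {'OpenSource': 'No', 'Verified': 'No'} is filtered out.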


def BUILD_L1_DF(results, fields):
    # Build the main leaderboard DataFrame plus the column groups used by the
    # Gradio checkbox widgets.
    check_box = {}
    check_box['essential'] = ['Method', 'Param (B)']
    check_box['required'] = ['Avg Score', 'Avg Rank'] + DEFAULT_BENCH
    check_box['avg'] = ['Avg Score', 'Avg Rank']
    check_box['all'] = check_box['avg'] + fields
    type_map = defaultdict(lambda: 'number')
    type_map['Method'] = 'html'
    type_map['Language Model'] = type_map['Vision Model'] = type_map['OpenSource'] = type_map['Verified'] = 'str'
    check_box['type_map'] = type_map

    df = generate_table(results, fields)
    return df, check_box
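# Usage sketch (hypothetical, assuming RESULTS points at a readable leaderboard JSON):
#
#     results = load_results_local()
#     table, check_box = BUILD_L1_DF(results, DEFAULT_BENCH)
#     table[check_box['essential'] + check_box['required']]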


def generate_table(results, fields):

    def get_mmbench_v11(item):
        # MMBench_V11 is reported as the average of the EN and CN test splits.
        assert 'MMBench_TEST_CN_V11' in item and 'MMBench_TEST_EN_V11' in item
        val = (item['MMBench_TEST_CN_V11']['Overall'] + item['MMBench_TEST_EN_V11']['Overall']) / 2
        val = float(f'{val:.1f}')
        return val

    res = defaultdict(list)
    for i, m in enumerate(results):
        item = results[m]
        meta = item['META']
        for k in META_FIELDS:
            if k == 'Param (B)':
                param = meta['Parameters']
                res[k].append(float(param.replace('B', '')) if param != '' else None)
            elif k == 'Method':
                name = meta['Method'][0]
                res[k].append(f'{name}')
                res['name'].append(name)
            else:
                res[k].append(meta[k])
        scores, ranks = [], []
        for d in fields:
            # OCRBench reports 'Final Score' instead of 'Overall'.
            key_name = 'Overall' if d != 'OCRBench' else 'Final Score'

            if d == 'MMBench_V11':
                val = get_mmbench_v11(item)
                res[d].append(val)
                scores.append(val)
                ranks.append(nth_large(val, [get_mmbench_v11(x) for x in results.values()]))
            elif d in item:
                res[d].append(item[d][key_name])
                # Rescale MME and OCRBench totals to a roughly 0-100 range so they
                # are comparable with the percentage-style benchmarks when averaged.
                if d == 'MME':
                    scores.append(item[d][key_name] / 28)
                elif d == 'OCRBench':
                    scores.append(item[d][key_name] / 10)
                else:
                    scores.append(item[d][key_name])
                ranks.append(nth_large(item[d][key_name], [x[d][key_name] for x in results.values() if d in x]))
            else:
                res[d].append(None)
                scores.append(None)
                ranks.append(None)

        # A model gets an average only if it has results on every selected benchmark.
        res['Avg Score'].append(round(np.mean(scores), 1) if None not in scores else None)
        res['Avg Rank'].append(round(np.mean(ranks), 2) if None not in ranks else None)

    df = pd.DataFrame(res)
    # Models with a full set of results come first, sorted by Avg Score (descending);
    # incomplete models follow, sorted by MMBench_V11 (or the first field) score.
    valid, missing = df[~pd.isna(df['Avg Score'])], df[pd.isna(df['Avg Score'])]
    valid = valid.sort_values('Avg Score')
    valid = valid.iloc[::-1]
    if len(fields):
        missing = missing.sort_values('MMBench_V11' if 'MMBench_V11' in fields else fields[0])
        missing = missing.iloc[::-1]
    df = pd.concat([valid, missing])
    return df
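

if __name__ == '__main__':
    # Minimal smoke test, not used by the Gradio app itself. It assumes RESULTS
    # points at a readable results file whose entries cover every benchmark in
    # DEFAULT_BENCH (get_mmbench_v11 asserts if the MMBench_V11 splits are missing).
    results = load_results_local()
    table, check_box = BUILD_L1_DF(results, DEFAULT_BENCH)
    print(table[check_box['essential'] + check_box['required']].head(10))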