In [None]:
import json
import copy as cp
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm

def download_file(url, filename=None):
    """Download ``url`` to a local file and return the destination path.

    Args:
        url: Address of the resource to fetch.
        filename: Destination path. When ``None`` the last segment of the
            URL's *path* component is used (query string and fragment are
            ignored), so the file is written relative to the current
            working directory.

    Returns:
        The destination path handed to ``urlretrieve``.

    Raises:
        ValueError: If ``filename`` is ``None`` and the URL path does not end
            in a file name (for example it ends with ``/``).
    """
    from urllib.parse import urlsplit
    from urllib.request import urlretrieve

    if filename is None:
        # Split on the path only, so 'a/b.json?x=1#frag' yields 'b.json'.
        filename = urlsplit(url).path.rsplit('/', 1)[-1]
        if not filename:
            raise ValueError(
                f'cannot derive a file name from {url!r}; pass `filename` explicitly')
    urlretrieve(url, filename)
    return filename

# Fetch the TTF used for all figure text. download_file() stores it in the
# working directory under the URL's last path segment.
font_URL = 'http://opencompass.openxlab.space/utils/Fonts/segoepr.ttf'
download_file(font_URL)

# Three presets built from the downloaded file: 12pt regular, 15pt bold and
# 18pt bold.
_font_path = 'segoepr.ttf'
font12 = fm.FontProperties(size=12, fname=_font_path)
font15 = fm.FontProperties(size=15, weight='bold', fname=_font_path)
font18 = fm.FontProperties(size=18, weight='bold', fname=_font_path)

# Leaderboard results; saved next to the notebook as 'OpenVLM.json'.
DATA_URL = 'http://opencompass.openxlab.space/utils/OpenVLM.json'
download_file(DATA_URL)

In [None]:
def pre_normalize(raw_data, labels, padding=1.25):
    """Rescale each label's scores onto a shared 0-100 axis.

    Every label gets a window of the same width: the widest observed
    (max - min) spread among ``labels`` multiplied by ``padding``. A label's
    window is centred on the midpoint of its own scores; if the centred
    window would reach 100 or more it is replaced by a window that ends
    exactly at 100. Scores are then mapped linearly so the window's lower
    bound becomes 0 and its upper bound becomes 100.

    Args:
        raw_data: Sequence of dicts (one per row) mapping label -> score.
            It is deep-copied, never modified.
        labels: Keys of the row dicts to normalise.
        padding: Factor applied to the widest spread to leave head-room
            around the extreme scores. Defaults to 1.25.

    Returns:
        ``(data_list, range_map)`` where ``data_list`` is the normalised copy
        of ``raw_data`` and ``range_map`` maps each label to the
        ``(low, high)`` raw-score window that was stretched onto 0-100.

    Raises:
        ValueError: If ``labels`` is non-empty but ``raw_data`` has no rows.
        AssertionError: If a score lies outside its window. Because windows
            are capped at 100, this happens for any score above 100.
    """
    data_list = cp.deepcopy(raw_data)
    if len(labels) > 0 and len(data_list) == 0:
        raise ValueError('pre_normalize() needs at least one data row')

    minimum, maximum, max_range = {}, {}, 0
    for lb in labels:
        column = [row[lb] for row in data_list]
        minimum[lb], maximum[lb] = min(column), max(column)
        max_range = max(max_range, maximum[lb] - minimum[lb])
    max_range *= padding
    if max_range == 0:
        # Every label has identical scores in all rows (e.g. a single row).
        # Use a unit-wide window so the division below is defined; such
        # scores land in the middle of the axis.
        max_range = 1.0

    range_map = {}
    for lb in labels:
        mid = (minimum[lb] + maximum[lb]) / 2
        upper = mid + max_range / 2
        if upper < 100:
            new_range = (mid - max_range / 2, upper)
        else:
            # Keep the window from extending past the 100 ceiling.
            new_range = (100 - max_range, 100)
        range_map[lb] = new_range
        for item in data_list:
            assert new_range[0] <= item[lb] <= new_range[1], (
                f'{lb}: score {item[lb]} is outside the window '
                f'[{new_range[0]}, {new_range[1]}]')
            item[lb] = (item[lb] - new_range[0]) / max_range * 100
    return data_list, range_map

# Alternative to pre_normalize for benchmarks whose scores are too large to
# fit its 0-100 window: the same windowing is applied to log(score).
def log_normalize(raw_data, labels, padding=1.005):
    """Rescale the natural log of each label's scores onto a 0-100 axis.

    Works like ``pre_normalize`` but on ``np.log(score)``: all labels share a
    window whose width is the widest log-spread times ``padding``, centred on
    the midpoint of the label's own log-scores.

    Args:
        raw_data: Sequence of dicts (one per row) mapping label -> score.
            It is deep-copied, never modified.
        labels: Keys of the row dicts to normalise.
        padding: Factor applied to the widest log-spread. Defaults to 1.005.

    Returns:
        ``(data_list, range_map)`` where ``data_list`` is the normalised copy
        of ``raw_data`` and ``range_map`` maps each label to its
        ``(low, high)`` window. The window bounds are in log space.

    Raises:
        ValueError: If ``labels`` is non-empty but ``raw_data`` has no rows,
            or if any score is not strictly positive (its log would be
            ``-inf`` or ``nan``).
        AssertionError: If a log-score lies outside its window.
    """
    data_list = cp.deepcopy(raw_data)
    if len(labels) > 0 and len(data_list) == 0:
        raise ValueError('log_normalize() needs at least one data row')

    log_scores, minimum, maximum, max_range = {}, {}, {}, 0
    for lb in labels:
        column = [row[lb] for row in data_list]
        if not all(value > 0 for value in column):
            raise ValueError(
                f'{lb}: log_normalize() requires strictly positive scores, got {column}')
        # Take each log once and reuse it in the second pass.
        log_scores[lb] = [np.log(value) for value in column]
        minimum[lb], maximum[lb] = min(log_scores[lb]), max(log_scores[lb])
        max_range = max(max_range, maximum[lb] - minimum[lb])
    max_range *= padding
    if max_range == 0:
        # All rows agree on every label; use a unit-wide window so the
        # division below is defined.
        max_range = 1.0

    range_map = {}
    for lb in labels:
        mid = (minimum[lb] + maximum[lb]) / 2
        upper = mid + max_range / 2
        # NOTE(review): this cap compares a log-space value with 100, as in
        # pre_normalize; it presumably never triggers for real scores. Kept
        # for backward compatibility; confirm whether it is intended.
        if upper < 100:
            new_range = (mid - max_range / 2, upper)
        else:
            new_range = (100 - max_range, 100)
        range_map[lb] = new_range
        for item, value in zip(data_list, log_scores[lb]):
            assert new_range[0] <= value <= new_range[1], (
                f'{lb}: log-score {value} is outside the window '
                f'[{new_range[0]}, {new_range[1]}]')
            item[lb] = (value - new_range[0]) / max_range * 100
    return data_list, range_map

In [None]:
# Load the downloaded leaderboard dump and choose which models to draw.
# The file is opened in binary mode inside a `with` block so the handle is
# closed promptly and json detects the text encoding itself.
with open('OpenVLM.json', 'rb') as f:
    data = json.load(f)['results']
models = list(data)
print(models)

# Earlier selection, kept for reference:
# model2vis = [
#     'GPT-4v (detail: low)', 'GeminiProVision', 'Qwen-VL-Plus',
#     'InternLM-XComposer2-VL', 'LLaVA-v1.5-13B', 'CogVLM-17B-Chat',
#     'mPLUG-Owl2', 'Qwen-VL-Chat', 'IDEFICS-80B-Instruct'
# ]

# Models shown in the radar charts; each name must be a key of `data`.
model2vis = [
    # 'GPT-4v (detail: low)', 'GeminiProVision', 'InternLM-XComposer2-VL',
    'GPT-4v (1106, detail-low)', 'Gemini-1.0-Pro', 'Gemini-1.5-Pro',  # 'Gemini-1.5-Flash', 'Qwen-VL-Plus',
    'InternLM-XComposer2', 'LLaVA-v1.5-13B', 'CogVLM-17B-Chat',
    'mPLUG-Owl2', 'Qwen-VL-Chat', 'IDEFICS-80B-Instruct'
]

# One line/fill colour per entry of `model2vis`, matched by position.
colors = [
    '#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b',
    '#e377c2', '#7f7f7f', '#bcbd22'
]

# Fail here with a readable message rather than deep inside a plotting cell.
unknown_models = [m for m in model2vis if m not in data]
assert not unknown_models, f'models missing from OpenVLM.json: {unknown_models}'
assert len(colors) >= len(model2vis), 'add more entries to `colors`: one per model in `model2vis`'

In [None]:
from collections import defaultdict

# Radar chart of the per-category MMBench (test, English) scores for the
# models listed in `model2vis`.
split = 'MMBench_TEST_EN'
# data_sub = {k: v[split] for k, v in data.items()}
# Some models have no entry for this split; defaultdict maps those to 0
# instead of raising KeyError while the whole table is scanned.
data_sub = {k: defaultdict(int, v)[split] for k, v in data.items()}

# A selected model without the split would otherwise surface later as an
# opaque "'int' object is not subscriptable" error, so stop here instead.
no_split = [m for m in model2vis if not isinstance(data_sub[m], dict)]
if no_split:
    raise ValueError(f'no {split} results for: {no_split}')

# One radar axis per category; the aggregate 'Overall' entry is not drawn.
labels = list(data_sub[model2vis[0]])
labels.remove('Overall')
num_vars = len(labels)

raw_data = [data_sub[m] for m in model2vis]
data_list, range_map = pre_normalize(raw_data, labels)

alpha = 0.25
angles = np.linspace(0, 2 * np.pi, num_vars, endpoint=False).tolist()
angles_deg = np.linspace(0, 360, num_vars, endpoint=False).tolist()
fig, ax_base = plt.subplots(nrows=1, ncols=1, figsize=(10, 10), subplot_kw=dict(polar=True))

# Repeat the first point at the end so each polygon is closed.
closed_angles = angles + angles[:1]
for i, (model_name, item) in enumerate(zip(model2vis, data_list)):
    color = colors[i]
    closed_values = [item[lb] for lb in labels]
    closed_values.append(closed_values[0])
    ax_base.plot(closed_angles, closed_values, color=color, linewidth=1, linestyle='solid', label=model_name)
    ax_base.fill(closed_angles, closed_values, color=color, alpha=alpha)

# Grid rings on the base axes carry no labels; the overlay axes created
# below draw the per-category numbers.
ax_base.set_ylim(0, 100)
ax_base.set_yticks([40, 60, 80, 100])
ax_base.set_yticklabels([''] * 4)

ax_base.tick_params(pad=25)
ax_base.set_xticks(angles)
ax_base.set_xticklabels(labels, fontproperties=font18)

leg = ax_base.legend(loc='center right', bbox_to_anchor=(1.6, 0.5), prop=font15, ncol=1, frameon=True, labelspacing=1.2)
for line in leg.get_lines():
    line.set_linewidth(2.5)

# One transparent polar axes per category, stacked on the base axes, each
# used only to print that category's radial tick values along its spoke.
# NOTE(review): add_axes() reads the list as [left, bottom, width, height];
# these values are presumably tuned so the overlays coincide with ax_base.
# Confirm before changing.
cx, cy, sz = 0.44, 0.435, 0.34
axes = [fig.add_axes([cx - sz, cy - sz, cx + sz, cy + sz], projection='polar', label='axes%d' % i) for i in range(num_vars)]

for ax, angle, label in zip(axes, angles_deg, labels):
    ax.patch.set_visible(False)
    ax.grid(False)
    ax.xaxis.set_visible(False)
    # Convert the ring positions 40/60/80/100 back to raw scores using the
    # window pre_normalize chose for this category.
    cur_range = range_map[label]
    label_list = [cur_range[0] + (cur_range[1] - cur_range[0]) / 5 * i for i in range(2, 6)]
    label_list = [f'{x:.1f}' for x in label_list]
    ax.set_rgrids(range(40, 120, 20), angle=angle, labels=label_list, font_properties=font12)
    ax.spines['polar'].set_visible(False)
    ax.set_ylim(0, 100)

title_text = f'{len(model2vis)} Representative VLMs on MMBench Test.'
plt.figtext(.7, .95, title_text, fontproperties=font18, ha='center')
plt.show()

In [None]:
# Radar chart of each selected model's 'Overall' score on the benchmarks
# listed below (one radar axis per benchmark).
labels = ['SEEDBench_IMG', 'CCBench', 'MMBench_TEST_EN', 'MMBench_TEST_CN', 'MME', 'MMVet', 'MMMU_VAL', 'MathVista', 'HallusionBench', 'LLaVABench']
num_vars = len(labels)

# NOTE(review): pre_normalize caps every window at 100 and asserts that all
# scores fit inside it. If any benchmark here reports an 'Overall' above 100
# the assert will fire; log_normalize looks intended for that case. Confirm
# against the data.
raw_data = [{k: data[m][k]['Overall'] for k in labels} for m in model2vis]
data_list, range_map = pre_normalize(raw_data, labels)

alpha = 0.25
angles = np.linspace(0, 2 * np.pi, num_vars, endpoint=False).tolist()
angles_deg = np.linspace(0, 360, num_vars, endpoint=False).tolist()
fig, ax_base = plt.subplots(nrows=1, ncols=1, figsize=(10, 10), subplot_kw=dict(polar=True))

# Repeat the first point at the end so each polygon is closed.
closed_angles = angles + angles[:1]
for i, (model_name, item) in enumerate(zip(model2vis, data_list)):
    color = colors[i]
    closed_values = [item[lb] for lb in labels]
    closed_values.append(closed_values[0])
    ax_base.plot(closed_angles, closed_values, color=color, linewidth=1, linestyle='solid', label=model_name)
    ax_base.fill(closed_angles, closed_values, color=color, alpha=alpha)

# Grid rings on the base axes carry no labels; the overlay axes created
# below draw the per-benchmark numbers.
ax_base.set_ylim(0, 100)
ax_base.set_yticks([40, 60, 80, 100])
ax_base.set_yticklabels([''] * 4)

ax_base.tick_params(pad=15)
ax_base.set_xticks(angles)
ax_base.set_xticklabels(labels, fontproperties=font18)

# Display names for benchmarks whose key differs from the printed name.
dataset_map = {
    'MMBench_TEST_EN': 'MMBench (Test)',
    'MMBench_TEST_CN': 'MMBenchCN (Test)',
    'MathVista': 'MathVista (TestMini)',
    'MMMU_VAL': 'MMMU (Val)'
}
# Replace each tick label with a rotated copy placed at the same position.
# The new Text artists go into their own list: `labels` must stay a list of
# benchmark keys because it is used to index `range_map` further down.
rotated_labels = []
for i, label in enumerate(ax_base.get_xticklabels()):
    x, y = label.get_position()
    text = label.get_text()
    text = dataset_map.get(text, text)
    lab = ax_base.text(x, y, text, transform=label.get_transform(),
                       ha=label.get_ha(), va=label.get_va(), font_properties=font15)
    lab.set_rotation(360 / num_vars * i + 270)
    rotated_labels.append(lab)
# Hide the original, unrotated tick labels.
ax_base.set_xticklabels([])

leg = ax_base.legend(loc='center right', bbox_to_anchor=(1.6, 0.5), prop=font15, ncol=1, frameon=True, labelspacing=1.2)
for line in leg.get_lines():
    line.set_linewidth(2.5)

# One transparent polar axes per benchmark, stacked on the base axes, each
# used only to print that benchmark's radial tick values along its spoke.
# NOTE(review): add_axes() reads the list as [left, bottom, width, height];
# these values are presumably tuned so the overlays coincide with ax_base.
# Confirm before changing.
cx, cy, sz = 0.44, 0.435, 0.34
axes = [fig.add_axes([cx - sz, cy - sz, cx + sz, cy + sz], projection='polar', label='axes%d' % i) for i in range(num_vars)]

for ax, angle, label in zip(axes, angles_deg, labels):
    ax.patch.set_visible(False)
    ax.grid(False)
    ax.xaxis.set_visible(False)
    # Convert the ring positions 40/60/80/100 back to raw scores using the
    # window pre_normalize chose for this benchmark.
    cur_range = range_map[label]
    label_list = [cur_range[0] + (cur_range[1] - cur_range[0]) / 5 * i for i in range(2, 6)]
    label_list = [f'{x:.1f}' for x in label_list]
    ax.set_rgrids(range(40, 120, 20), angle=angle, labels=label_list, font_properties=font12)
    ax.spines['polar'].set_visible(False)
    ax.set_ylim(0, 100)

title_text = f'{len(model2vis)} Representative VLMs on {num_vars} Benchmarks in OpenCompass Multi-Modal Leaderboard.'
plt.figtext(.7, .95, title_text, fontproperties=font18, ha='center')
plt.show()