|
import gradio as gr |
|
import plotly.graph_objects as go |
|
from datasets import load_dataset |
|
import plotly.graph_objects as go |
|
from plotly.subplots import make_subplots |
|
|
|
import os |
|
|
|
import json |
|
import pandas as pd |
|
import plotly.express as px |
|
import plotly.graph_objects as go |
|
|
|
|
|
|
|
# -----------------------------------------------------------------------------
# Configuration & constants.
# NOTE(review): earlier revisions of this section defined CATEGORIES, LANGS,
# HF_TOKEN, DATA_SET_REPO_PATH, PERFORMANCE_FILENAME and MODEL_DFRAME more
# than once; only the last assignment of each was effective. The duplicates
# are removed here and the final (effective) values are kept.
# -----------------------------------------------------------------------------

# Legacy lowercase flag, still read by get_model_df(); defaults to True.
force_download = bool(int(os.environ.get("force_download", "1")))
# Flag used by get_report_df_from_jsonl(); defaults to False.
FORCE_DOWNLOAD = bool(int(os.environ.get("FORCE_DOWNLOAD", "0")))

# Hugging Face access token (required non-empty at download time).
HF_TOKEN = str(os.environ.get("HF_TOKEN", ""))
# Dataset repository holding the benchmark questions and judgments.
DATA_SET_REPO_PATH = str(os.environ.get("DATA_SET_REPO_PATH", "SeaLLMs/Sea-bench"))
# GPT-4 single-answer judgment file (JSON lines) inside the dataset repo.
PERFORMANCE_FILENAME = str(os.environ.get("PERFORMANCE_FILENAME", "model_judgment/gpt-4_single.jsonl"))
# Benchmark questions file (JSON lines) inside the dataset repo.
QUESTION_FILE_NAME = str(os.environ.get("QUESTION_FILE_NAME", "question.jsonl"))

# Lazily-populated cache used by get_model_df().
MODEL_DFRAME = None

# Raw model id -> display name; also selects which models are plotted,
# and its key order drives the plotting order.
rename_map = {
    "seallm-13b-chat": "SeaLLM-13b",
    "polylm-13b": "PolyLM-13b",
    "qwen-14b": "Qwen-14b",
    "gpt-3.5-turbo": "GPT-3.5-turbo",
    "SeaLLM-7B-v2.5": "SeaLLM-7B-v2.5",
}

# Benchmark question categories (raw keys) and their plot labels.
CATEGORIES = [
    "task-solving",
    "math-reasoning",
    "general-instruction",
    "natural-question",
    "safety",
]

CATEGORIES_NAMES = {
    "task-solving": 'Task-solving',
    "math-reasoning": 'Math',
    "general-instruction": 'General-instruction',
    "natural-question": 'NaturalQA',
    "safety": 'Safety',
}

# Language codes in plot order, and their three-letter display labels.
LANGS = ['en', 'vi', 'id', 'ms', 'tl', 'th', 'km', 'lo', 'my']
LANG_NAMES = {
    'en': 'eng',
    'vi': 'vie',
    'th': 'tha',
    'id': 'ind',
    'km': 'khm',
    'lo': 'lao',
    'ms': 'msa',
    'my': 'mya',
    'tl': 'tgl',
}
|
def read_jsonl_report(question_path, file_path):
    """Join per-question metadata onto per-judgment records.

    Args:
        question_path: Path to a JSONL file of questions; each record must
            carry 'question_id', 'category' and 'lang'.
        file_path: Path to a JSONL file of judgment records, each carrying
            a 'question_id' matching one of the questions.

    Returns:
        pd.DataFrame: one row per judgment, augmented with 'category' and
        'lang' columns copied from the matching question.

    Raises:
        KeyError: if a judgment references a question_id not present in
            the question file.
    """
    with open(question_path, 'r') as f:
        questions = {q['question_id']: q for q in map(json.loads, f)}

    q2result = []
    # The original opened this file without ever closing it; use a context
    # manager so the handle is released even if a line fails to parse.
    with open(file_path, "r") as fin:
        for line in fin:
            obj = json.loads(line)
            question = questions[obj['question_id']]
            obj["category"] = question['category']
            obj["lang"] = question['lang']
            q2result.append(obj)
    return pd.DataFrame(q2result)
|
|
|
def get_report_df_from_jsonl():
    """Download the question and judgment JSONL files from the Hugging Face
    Hub and return them joined into a single DataFrame.

    Requires DATA_SET_REPO_PATH and HF_TOKEN to be configured (non-empty).
    Files are cached under ./hf_cache; FORCE_DOWNLOAD forces a re-fetch.
    """
    from huggingface_hub import hf_hub_download

    assert DATA_SET_REPO_PATH != ''
    assert HF_TOKEN != ''

    # Both downloads share everything except the filename.
    download_kwargs = dict(
        repo_id=DATA_SET_REPO_PATH,
        force_download=FORCE_DOWNLOAD,
        local_dir='./hf_cache',
        repo_type="dataset",
        token=HF_TOKEN,
    )
    question_path = hf_hub_download(filename=QUESTION_FILE_NAME, **download_kwargs)
    file_path = hf_hub_download(filename=PERFORMANCE_FILENAME, **download_kwargs)

    print(f'Downloaded file at {question_path}/ {file_path} from {DATA_SET_REPO_PATH} / {PERFORMANCE_FILENAME}')
    return read_jsonl_report(question_path, file_path)
|
|
|
|
|
def get_model_df():
    """Return the performance table, downloading it on first use.

    The parsed DataFrame is memoized in the module-level MODEL_DFRAME, so
    the Hub is only contacted once per process.
    """
    global MODEL_DFRAME
    # Fast path: already downloaded and parsed in this process.
    if isinstance(MODEL_DFRAME, pd.DataFrame):
        print(f'Load cache data frame')
        return MODEL_DFRAME

    from huggingface_hub import hf_hub_download

    assert DATA_SET_REPO_PATH != ''
    assert HF_TOKEN != ''

    file_path = hf_hub_download(
        repo_id=DATA_SET_REPO_PATH,
        filename=PERFORMANCE_FILENAME,
        force_download=force_download,
        local_dir='./hf_cache',
        repo_type="dataset",
        token=HF_TOKEN,
    )
    print(f'Downloaded file at {file_path} from {DATA_SET_REPO_PATH} / {PERFORMANCE_FILENAME}')
    # NOTE(review): the default PERFORMANCE_FILENAME points at a .jsonl file,
    # which read_csv will not parse as line-delimited JSON — confirm this
    # function is only used with a CSV override of PERFORMANCE_FILENAME.
    MODEL_DFRAME = pd.read_csv(file_path)
    return MODEL_DFRAME
|
|
|
|
|
def aggregate_df(df, model_dict, category_name, categories):
    """Compute per-(model, category) mean scores for the target models.

    Args:
        df: Judgment rows with at least 'model', 'score' and the
            ``category_name`` column.
        model_dict: Ordered mapping of raw model id -> display name; only
            these models are kept, and output rows are grouped by model in
            *reversed* key order (so earlier models are drawn last/on top).
        category_name: Column to aggregate over ('category' or 'lang').
        categories: Category values to report, in plot order.

    Returns:
        pd.DataFrame with columns 'model' (display name), ``category_name``
        and 'score'. 'score' is NaN when a model has no non-negative
        scores for a category.
    """
    # The original computed means for every model in df and filtered
    # afterwards; iterate only the requested models instead. reversed()
    # reproduces the original stable sort with reverse=True.
    present_models = set(df["model"].unique())
    rows = []
    for model in reversed(list(model_dict)):
        if model not in present_models:
            continue
        model_rows = df[df["model"] == model]
        for cat in categories:
            # Negative scores are sentinel "no judgment" values; exclude them.
            res = model_rows[(model_rows[category_name] == cat) & (model_rows["score"] >= 0)]
            rows.append({
                "model": model_dict.get(model, model),
                category_name: cat,
                "score": res["score"].mean(),  # NaN for an empty selection
            })
    return pd.DataFrame(rows)
|
|
|
|
|
def polar_subplot(fig, dframe, model_names, category_label, category_names, row, col, showlegend=True):
    """Add one radar (Scatterpolar) trace per model to subplot (row, col).

    Args:
        fig: Figure created by make_subplots with polar specs.
        dframe: Aggregated scores (output of aggregate_df).
        model_names: List of (raw_id, display_name) pairs; models absent
            from dframe are skipped.
        category_label: Column holding the category keys.
        category_names: Mapping from raw category key to axis label.
        row, col: 1-based subplot position.
        showlegend: Whether these traces contribute legend entries.
    """
    palette = px.colors.qualitative.Plotly
    for idx, (raw_id, display_name) in enumerate(model_names):
        subset = dframe[dframe['model'] == display_name]
        categories = subset[category_label].tolist()
        scores = subset['score'].tolist()
        if not categories:
            continue
        # Repeat the first point so the polygon closes, then map raw
        # category keys to their display labels.
        categories.append(categories[0])
        scores.append(scores[0])
        theta = [category_names[c] for c in categories]
        trace = go.Scatterpolar(
            name=display_name,
            r=scores,
            theta=theta,
            legendgroup=f'{idx}',
            marker=dict(color=palette[idx]),
            hovertemplate="""Score: %{r:.2f}""",
            showlegend=showlegend,
        )
        fig.add_trace(trace, row, col)
|
|
|
|
|
def plot_agg_fn():
    """Build the aggregated Sea-Bench figure: one radar chart of scores by
    category (left) and one by language (right), one closed polygon per
    model in rename_map.

    Returns:
        plotly Figure for display in a gr.Plot component.
    """
    df = get_report_df_from_jsonl()
    model_names = list(rename_map.items())

    cat_df = aggregate_df(df, rename_map, "category", CATEGORIES)
    lang_df = aggregate_df(df, rename_map, "lang", LANGS)

    fig = make_subplots(
        rows=1, cols=2,
        specs=[[{'type': 'polar'}]*2],
        subplot_titles=("By Category", "By Language"),
    )
    # Nudge the subplot titles up so they clear the polar charts.
    fig.layout.annotations[0].y = 1.05
    fig.layout.annotations[1].y = 1.05

    # The original duplicated polar_subplot's trace loop inline, twice;
    # call the shared helper instead. Only the left chart shows a legend.
    polar_subplot(fig, cat_df, model_names, 'category', CATEGORIES_NAMES, 1, 1, showlegend=True)
    polar_subplot(fig, lang_df, model_names, 'lang', LANG_NAMES, 1, 2, showlegend=False)

    # Scores are on a fixed 0-10 scale; rotate so the first axis points up.
    polar_config = dict(
        angularaxis = dict(
            rotation=90,
        ),
        radialaxis = dict(
            range=[0, 10],
        ),
    )
    fig.update_layout(
        polar = polar_config,
        polar2 = polar_config,
        title='Sea-Bench (rated by GPT-4)',
    )
    return fig
|
|
|
|
|
def plot_by_lang_fn():
    """Build a 3x3 grid of radar charts, one per language, each showing
    per-category scores for every model in rename_map.

    Returns:
        plotly Figure for display in a gr.Plot component.
    """
    df = get_report_df_from_jsonl()
    model_names = list(rename_map.items())

    fig = make_subplots(
        rows=3, cols=3,
        specs=[[{'type': 'polar'}]*3] * 3,
        subplot_titles=[LANG_NAMES[x] for x in LANGS],
    )
    # Lift every subplot title slightly above its chart.
    for annotation in fig.layout.annotations:
        annotation.y = annotation.y + 0.02

    # Only these languages have 'safety' questions in the benchmark.
    has_safety = ['vi', 'id', 'th']

    for lang_id, lang in enumerate(LANGS):
        if lang in has_safety:
            cat_names = CATEGORIES
        else:
            cat_names = [c for c in CATEGORIES if c != 'safety']
        lang_frame = aggregate_df(df[df['lang'] == lang], rename_map, "category", cat_names)
        # Fill the grid row-major; only the first subplot carries the legend.
        grid_row, grid_col = divmod(lang_id, 3)
        polar_subplot(fig, lang_frame, model_names, 'category', CATEGORIES_NAMES,
                      grid_row + 1, grid_col + 1, showlegend=lang_id == 0)

    # Shared axis styling: fixed 0-10 score scale, first axis pointing up.
    polar_config = dict(
        angularaxis = dict(
            rotation=90,
        ),
        radialaxis = dict(
            range=[0, 10],
        ),
    )
    polar_layouts = {f"polar{i}": polar_config for i in range(1, 10)}
    fig.update_layout(
        title='Sea-Bench - By language (rated by GPT-4)',
        height=1000,
        width=1500,
        **polar_layouts
    )
    return fig
|
|
|
|
|
|
|
|
|
# HTML heading rendered via gr.Markdown at the top of the page.
description = """

<h1>Sea-bench benchmark leaderboard</h1>

"""
|
|
|
|
|
# --- Gradio UI ---------------------------------------------------------------
# Single-column page: heading, then the two plot panels.
with gr.Blocks() as demo:
    with gr.Column():
        gr.Markdown(description)
        gr_plot_agg = gr.Plot(label="Aggregated by category and language")
        gr_plot_bylang = gr.Plot(label='Breakdown by language')

# Render both figures when the page loads (no inputs); each load event
# triggers a fresh download/aggregation via the plot functions.
demo.load(plot_agg_fn, [], gr_plot_agg)
demo.load(plot_by_lang_fn, [], gr_plot_bylang)

demo.launch()
|
|