# app.py — Sea-Bench leaderboard Gradio Space (radar plots of GPT-4 judgments).
import gradio as gr
import plotly.graph_objects as go
from datasets import load_dataset
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import os
# ==
import json
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
# Judged task categories and language codes covered by Sea-Bench.
# NOTE: both are re-declared further down with display labels/ordering for
# the plots; these initial values are kept for import-time availability.
CATEGORIES = ["task-solving", "math-reasoning", "general-instruction", "natural-question", "safety"]
LANGS = ['en', 'vi', 'th', 'id', 'km', 'lo', 'ms', 'my', 'tl']

# Runtime configuration, read from the environment (HF Space secrets/vars):
#   force_download        -- "1"/"0": re-download the results file on startup.
#   HF_TOKEN              -- auth token for the (private) dataset repo.
#   DATA_SET_REPO_PATH    -- dataset repo id on the Hugging Face Hub.
#   PERFORMANCE_FILENAME  -- CSV of per-question GPT-4 judgments inside the repo.
force_download = bool(int(os.environ.get("force_download", "1")))
HF_TOKEN = str(os.environ.get("HF_TOKEN", ""))
DATA_SET_REPO_PATH = str(os.environ.get("DATA_SET_REPO_PATH", ""))
PERFORMANCE_FILENAME = str(os.environ.get("PERFORMANCE_FILENAME", "gpt4_single_json.csv"))

# Module-level cache for the judgment dataframe; populated by get_model_df().
MODEL_DFRAME = None
def get_model_df():
    """Download (once) and return the GPT-4 judgment results as a DataFrame.

    The frame is cached in the module-level ``MODEL_DFRAME`` so repeated
    calls (e.g. on every Gradio page load) skip the Hub download.

    Returns:
        pd.DataFrame: contents of ``PERFORMANCE_FILENAME`` downloaded from
        the ``DATA_SET_REPO_PATH`` dataset repo.

    Raises:
        ValueError: if ``DATA_SET_REPO_PATH`` or ``HF_TOKEN`` is unset.
    """
    global MODEL_DFRAME
    if isinstance(MODEL_DFRAME, pd.DataFrame):
        print('Load cache data frame')
        return MODEL_DFRAME
    from huggingface_hub import hf_hub_download
    # Fail loudly on misconfiguration: `assert` would be silently stripped
    # when Python runs with -O, so raise explicitly instead.
    if not DATA_SET_REPO_PATH:
        raise ValueError("DATA_SET_REPO_PATH environment variable must be set")
    if not HF_TOKEN:
        raise ValueError("HF_TOKEN environment variable must be set")
    file_path = hf_hub_download(
        repo_id=DATA_SET_REPO_PATH,
        filename=PERFORMANCE_FILENAME,
        force_download=force_download,
        local_dir='./hf_cache',
        repo_type="dataset",
        token=HF_TOKEN,
    )
    print(f'Downloaded file at {file_path} from {DATA_SET_REPO_PATH} / {PERFORMANCE_FILENAME}')
    MODEL_DFRAME = pd.read_csv(file_path)
    return MODEL_DFRAME
def aggregate_df(df, model_dict, category_name, categories):
    """Average per-question scores into one row per (model, category) pair.

    Args:
        df: judgment frame with at least ``model``, ``score`` and the
            ``category_name`` columns.
        model_dict: mapping of internal model id -> display name; its key
            order (reversed) fixes the row/trace order, and only listed
            models are kept.
        category_name: column to group by ("category" or "lang").
        categories: the category values, in display order.

    Returns:
        pd.DataFrame with columns ``model`` (display names),
        ``category_name`` and ``score`` (mean; NaN when a model has no
        valid rows for a category).
    """
    rows = []
    for model in df["model"].unique():
        for cat in categories:
            # Filter to this model/category; drop score-format errors (<1%),
            # which are encoded as negative scores.
            subset = df[(df[category_name] == cat) & (df["model"] == model) & (df["score"] >= 0)]
            rows.append({"model": model, category_name: cat, "score": subset["score"].mean()})

    target_models = list(model_dict.keys())
    rank = {m: i for i, m in enumerate(target_models)}  # O(1) order lookup
    rows = [r for r in rows if r["model"] in rank]
    # Stable sort, reversed, so the first model in model_dict is drawn last
    # (on top) in the radar plot.
    rows.sort(key=lambda r: rank[r["model"]], reverse=True)

    df_score = pd.DataFrame(rows)
    # Map display names on the model column only. (The previous frame-wide
    # `replace` could also rewrite identical values in other columns.)
    df_score["model"] = df_score["model"].map(lambda m: model_dict.get(m, m))
    return df_score
# Internal model id (as stored in the results CSV) -> legend display name.
# Insertion order fixes the legend order; aggregate_df reverses it so the
# first entry is drawn on top of the radar chart.
rename_map = {
    "seallm13b10L4k_a_sft4xdpo_5a": "SeaLLM-13b-10L",
    "polylm": "PolyLM-13b",
    "qwen": "Qwen-14b",
    "gpt-3.5-turbo": "GPT-3.5-turbo",
    "gpt-4-1106-preview": "GPT-4-turbo",
}
# NOTE(review): re-declares CATEGORIES from the top of the file with the
# same contents; kept so this section is self-contained.
CATEGORIES = [ "task-solving", "math-reasoning", "general-instruction", "natural-question", "safety", ]
# Category id -> axis label shown on the "By Category" radar chart.
CATEGORIES_NAMES = {
    "task-solving": 'Task-solving',
    "math-reasoning": 'Math',
    "general-instruction": 'General-instruction',
    "natural-question": 'NaturalQA',
    "safety": 'Safety',
}
# LANGS = ['en', 'vi', 'th', 'id', 'km', 'lo', 'ms', 'my', 'tl']
# Overrides the earlier LANGS declaration with the DISPLAY order used on the
# "By Language" radar chart — do not remove: the order differs.
LANGS = ['en', 'vi', 'id', 'ms', 'tl', 'th', 'km', 'lo', 'my']
# Two-letter language code -> three-letter axis label.
LANG_NAMES = {
    'en': 'eng',
    'vi': 'vie',
    'th': 'tha',
    'id': 'ind',
    'km': 'khm',
    'lo': 'lao',
    'ms': 'msa',
    'my': 'mya',
    'tl': 'tgl',
}
def plot_fn():
    """Build the two-panel Sea-Bench radar figure (by category, by language).

    Returns:
        plotly Figure with two polar subplots: per-category scores on the
        left, per-language scores on the right, sharing a single legend.
    """
    df = get_model_df()
    model_names = list(rename_map.items())
    colors = px.colors.qualitative.Plotly

    cat_df = aggregate_df(df, rename_map, "category", CATEGORIES)
    lang_df = aggregate_df(df, rename_map, "lang", LANGS)

    fig = make_subplots(
        rows=1, cols=2,
        specs=[[{'type': 'polar'}] * 2],
        subplot_titles=("By Category", "By Language"),
    )
    # Nudge the subplot titles up so they clear the polar charts.
    fig.layout.annotations[0].y = 1.05
    fig.layout.annotations[1].y = 1.05

    def add_radar_traces(agg_df, column, label_map, col, showlegend):
        # One closed Scatterpolar trace per model in subplot column `col`.
        for i, (model, model_name) in enumerate(model_names):
            sub = agg_df[agg_df['model'] == model_name]
            thetas = sub[column].tolist()
            scores = sub['score'].tolist()
            thetas += [thetas[0]]  # repeat first point to close the polygon
            scores += [scores[0]]
            thetas = [label_map[t] for t in thetas]
            polar = go.Scatterpolar(
                name=model_name,
                r=scores,
                theta=thetas,
                legendgroup=f'{i}',  # keep both subplots toggling together
                marker=dict(color=colors[i]),
                hovertemplate="""Score: %{r:.2f}""",
                showlegend=showlegend,
            )
            fig.add_trace(polar, 1, col)

    add_radar_traces(cat_df, 'category', CATEGORIES_NAMES, 1, True)
    # Legend entries already added by the category subplot.
    add_radar_traces(lang_df, 'lang', LANG_NAMES, 2, False)

    polar_config = dict(
        angularaxis=dict(
            rotation=90,  # start the first axis at the top
        ),
        radialaxis=dict(
            range=[0, 10],  # GPT-4 judge scores are on a 0-10 scale
        ),
    )
    fig.update_layout(
        polar=polar_config,
        polar2=polar_config,
        title='Sea-Bench (rated by GPT-4)',
    )
    return fig
# --- Gradio UI -------------------------------------------------------------
# A single Plot component, rendered once on page load (no interactive
# filters), then the app is served.
with gr.Blocks() as demo:
    with gr.Column():
        gr_plot = gr.Plot()
    demo.load(plot_fn, [], gr_plot)

demo.launch()