OpsEval / app.py
Junetheriver's picture
added qa leaderboards
22cd459
raw
history blame
No virus
4.8 kB
# gradio display leaderboard
import pandas as pd
import numpy as np
import matplotlib
# matplotlib.use('macosx')
import gradio as gr
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from apscheduler.schedulers.background import BackgroundScheduler
from texts import INTRODUCTION_TEXT, TITLE
from leaderboards import eng_leaderboards, chi_leaderboards
from opseval_datasets import *
# df_lang = {
# 'English': pd.read_csv("./leaderboard/wired_network_en.csv"),
# 'Chinese': pd.read_csv("./leaderboard/wired_network_zh.csv"),
# }
def create_lang_tabs(lang, lang_cates):
    """Load every leaderboard CSV belonging to one language.

    Args:
        lang: Language code embedded in the CSV file names (the callers in
            this file pass ``'en'`` and ``'zh'``).
        lang_cates: Iterable of ``(dataset, categories)`` pairs, where
            ``categories`` is an iterable of category names.

    Returns:
        A dict mapping each dataset to ``{category: DataFrame}``, where every
        frame is read from ``./data/{dataset}_{lang}_{category}.csv``.
    """
    return {
        dataset: {
            cat: pd.read_csv(f'./data/{dataset}_{lang}_{cat}.csv')
            for cat in cates
        }
        for dataset, cates in lang_cates
    }
# Leaderboard tables for every displayed language, loaded once at import time.
# Shape: {language label: {dataset: {category: DataFrame}}}.
dict_lang = {
    label: create_lang_tabs(code, boards)
    for label, code, boards in (
        ('English', 'en', eng_leaderboards),
        ('Chinese', 'zh', chi_leaderboards),
    )
}
def process_mc_df(df, shot=None):
    """Turn a raw multiple-choice score table into a ranked leaderboard.

    The input must have a ``name`` column followed by exactly eight score
    columns. They are relabelled *by position* as
    (Zeroshot | Fewshot) x (Naive, SC, CoT, CoT+SC), so the column order of
    the input matters.

    Args:
        df: Raw score table; it is not modified.
        shot: ``"Zeroshot"`` or ``"Fewshot"`` to keep only that group of
            columns; a falsy value keeps both groups (two-level columns).

    Returns:
        A new DataFrame with a ``Model`` column, the numeric score columns
        rounded to two decimals, and a ``BestScore`` column holding the
        row-wise maximum, sorted by ``BestScore`` in descending order.
    """
    shot_groups = ("Zeroshot", "Fewshot")
    settings = ("Naive", "SC", "CoT", "CoT+SC")

    # Use the model name as the index so only score columns remain.
    scores = df.rename(columns={"name": "Model"}).set_index("Model")

    # Positional relabelling into a two-level (shot, setting) header.
    scores.columns = pd.MultiIndex.from_tuples(
        [(group, setting) for group in shot_groups for setting in settings]
    )

    # Optionally narrow down to a single shot group.
    if shot:
        scores = scores[shot]

    # Non-numeric cells become NaN; keep two decimal places.
    scores = scores.apply(pd.to_numeric, errors="coerce").round(2)

    # Rank models by their best score across the remaining columns.
    scores["BestScore"] = scores.max(axis=1)
    ranked = scores.sort_values(by="BestScore", ascending=False)

    # Bring the model name back as a regular column.
    return ranked.reset_index()
def process_qa_df(df):
    """Return a copy of the question-answering table rounded to 4 decimals.

    Args:
        df: Question-answering score table; it is not modified.

    Returns:
        A new DataFrame whose numeric columns are rounded to four decimal
        places.
    """
    return df.round(decimals=4)
def dataframe_to_gradio(df, is_mc=True, shot=None):
    """Process a leaderboard table and wrap it in a Gradio ``Dataframe``.

    Args:
        df: Raw leaderboard table.
        is_mc: When true the table is processed with ``process_mc_df``,
            otherwise with ``process_qa_df``.
        shot: Forwarded to ``process_mc_df``; ignored for QA tables.

    Returns:
        The ``gr.components.Dataframe`` created from the processed table.
    """
    df = process_mc_df(df, shot) if is_mc else process_qa_df(df)

    # With is_mc=True and shot=None the processed table has two-level columns,
    # so each label is a tuple such as ("Zeroshot", "Naive") or ("Model", "").
    # Headers are meant to be plain strings: join the non-empty parts.
    # Single-level labels are passed through untouched.
    headers = [
        " ".join(str(part) for part in label if part != "")
        if isinstance(label, tuple) else label
        for label in df.columns
    ]

    return gr.components.Dataframe(
        value=df.values.tolist(),
        headers=headers,
    )
def plot_radar_chart(df, attributes):
    """Build a radar (polar) chart with one filled trace per row of ``df``.

    Args:
        df: Table with a ``Model`` column plus one column per attribute.
        attributes: Column labels to plot; they are used both to select the
            radial values and as the angular axis labels.

    Returns:
        A plotly ``Figure`` titled "OpsEval" with a visible radial axis fixed
        to the range [0, 0.9] and the legend shown.
    """
    fig = go.Figure()

    # One polygon per model; the row index itself is not needed.
    for _, record in df.iterrows():
        trace = go.Scatterpolar(
            r=record[attributes].tolist(),
            theta=attributes,
            fill='toself',
            name=record['Model'],
        )
        fig.add_trace(trace)

    radial_axis = {"visible": True, "range": [0, 0.9]}
    fig.update_layout(
        title="OpsEval",
        polar={"radialaxis": radial_axis},
        showlegend=True,
    )
    return fig
def create_lang_leader_board(lang_dict):
    """Render the leaderboard tabs for one language.

    Must be called inside an active Gradio layout context, because the tabs
    and tables are created purely for their side effect on that layout.

    For every dataset one tab is created, labelled via
    ``dataset_abbr_en_dict``. Inside it, the ``'mc'`` category produces two
    tabs (Zeroshot and Fewshot multiple-choice tables); every other category
    is shown as a "Question Answering" tab.

    Args:
        lang_dict: Mapping ``dataset -> {category -> DataFrame}``.

    Raises:
        KeyError: If a dataset has no entry in ``dataset_abbr_en_dict``.
    """
    # NOTE: an earlier draft collected each dataset's BestScore column here to
    # feed a radar chart, but the chart was never rendered. That unused
    # processing pass has been removed; use plot_radar_chart if the chart is
    # reinstated.
    for dataset, tables in lang_dict.items():
        with gr.Tab(dataset_abbr_en_dict[dataset]):
            for cat, df in tables.items():
                if cat == 'mc':
                    for shot in ('Zeroshot', 'Fewshot'):
                        with gr.Tab(f'Multiple Choice Question ({shot})'):
                            dataframe_to_gradio(df, is_mc=True, shot=shot)
                else:
                    with gr.Tab('Question Answering'):
                        dataframe_to_gradio(df, is_mc=False)
def launch_gradio():
    """Build the leaderboard UI and start serving it.

    Creates a ``gr.Blocks`` app containing the title, the introduction text
    and one top-level tab per language in ``dict_lang``, then launches it.
    """
    demo = gr.Blocks()
    with demo:
        gr.HTML(TITLE)
        gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
        # Named `lang_dict` rather than `dict` to avoid shadowing the builtin.
        for lang_label, lang_dict in dict_lang.items():
            with gr.Tab(lang_label):
                create_lang_leader_board(lang_dict)
    demo.launch()
# Format floats with two decimal places wherever pandas renders them for display.
pd.set_option('display.float_format', '{:.02f}'.format)
# Schedule launch_gradio to run again every hour via APScheduler's
# BackgroundScheduler.
# NOTE(review): dict_lang is built once at import time, so each scheduled run
# rebuilds the UI from the same in-memory tables; presumably this was meant as
# a periodic refresh — confirm whether the CSV files should be re-read.
scheduler = BackgroundScheduler()
scheduler.add_job(launch_gradio, 'interval', hours=1)
scheduler.start()
# Build and serve the app for the first time.
launch_gradio()