# gradio display leaderboard
import pandas as pd
import numpy as np
import matplotlib
# matplotlib.use('macosx')
import gradio as gr
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from apscheduler.schedulers.background import BackgroundScheduler
from texts import INTRODUCTION_TEXT, TITLE
from leaderboards import eng_leaderboards, chi_leaderboards
# NOTE(review): dataset_abbr_en_dict (used in create_lang_leader_board) is not
# defined in this file; presumably it comes from this wildcard import — confirm.
from opseval_datasets import *


# Two-level header applied to the score columns of a multiple-choice table.
# The first level is the shot setting, the second the prompting strategy.
_MC_COLUMNS = [
    ("Zeroshot", "Naive"),
    ("Zeroshot", "SC"),
    ("Zeroshot", "CoT"),
    ("Zeroshot", "CoT+SC"),
    ("Fewshot", "Naive"),
    ("Fewshot", "SC"),
    ("Fewshot", "CoT"),
    ("Fewshot", "CoT+SC"),
]


def create_lang_tabs(lang, lang_cates):
    """Load every leaderboard CSV for one language.

    Args:
        lang: Language code embedded in the CSV file names (the callers in
            this file pass 'en' and 'zh').
        lang_cates: Iterable of ``(dataset, categories)`` pairs; one CSV is
            read per category of each dataset.

    Returns:
        A nested dict ``{dataset: {category: DataFrame}}`` where each frame is
        read from ``./data_v2/{dataset}_{lang}_{category}_gen.csv``.
    """
    return {
        dataset: {
            cat: pd.read_csv(f'./data_v2/{dataset}_{lang}_{cat}_gen.csv')
            for cat in cates
        }
        for dataset, cates in lang_cates
    }


# Loaded once, when the module is executed.
dict_lang = {
    'English': create_lang_tabs('en', eng_leaderboards),
    'Chinese': create_lang_tabs('zh', chi_leaderboards)
}


def process_mc_df(df, shot=None):
    """Turn a raw multiple-choice result table into a ranked display table.

    The input must have a ``name`` column plus exactly eight score columns.
    Per the original author's comment these are expected in the order
    zero_naive, zero_self_con, zero_cot, zero_cot_self_con, few_naive,
    few_self_con, few_cot, few_cot_self_con; they are relabelled positionally,
    so the order in the CSV matters.

    Args:
        df: Raw leaderboard frame. It is not modified.
        shot: Optional first-level column label ("Zeroshot" or "Fewshot").
            When given (truthy), only that group of columns is kept.

    Returns:
        A new frame with a ``Model`` column, the numeric score columns rounded
        to two decimals, and a ``BestScore`` column (row-wise maximum), sorted
        by ``BestScore`` in descending order. Missing values are shown as '/'.

    Raises:
        ValueError: If the number of score columns is not eight.
    """
    # Rename the "name" column to "Model" and use it as the index so that only
    # score columns remain as regular columns.
    df = df.rename(columns={"name": "Model"}).set_index("Model")

    if len(df.columns) != len(_MC_COLUMNS):
        raise ValueError(
            f"expected {len(_MC_COLUMNS)} score columns, got {len(df.columns)}"
        )
    # Reorganise the flat score columns into a MultiIndex: one level for
    # Zeroshot/Fewshot, one for Naive/SC/CoT/CoT+SC.
    df.columns = pd.MultiIndex.from_tuples(_MC_COLUMNS)

    # Keep only the requested shot's columns, e.g. shot="Zeroshot" keeps the
    # four Zeroshot columns (and drops the first header level).
    if shot:
        df = df[shot]

    # Convert every score to a number; values that fail to parse become NaN.
    df = df.apply(pd.to_numeric, errors="coerce")
    # Keep two decimal places.
    df = df.round(2)
    # Add the per-row best score and rank the models by it.
    df["BestScore"] = df.max(axis=1)
    df = df.sort_values(by="BestScore", ascending=False)
    # Bring "Model" back as a regular column.
    df = df.reset_index()
    # Show every missing value as '/'.
    df = df.fillna('/')
    return df


def process_qa_df(df):
    """Return the question-answering table rounded to four decimal places."""
    return df.round(4)


def dataframe_to_gradio(df, is_mc=True, shot=None):
    """Build a Gradio ``Dataframe`` component from a raw leaderboard frame.

    Args:
        df: Raw leaderboard frame.
        is_mc: When true the frame is processed with ``process_mc_df``,
            otherwise with ``process_qa_df``.
        shot: Forwarded to ``process_mc_df``; ignored when ``is_mc`` is false.

    Returns:
        The created ``gr.components.Dataframe``. When called inside a Gradio
        layout context (as this file does), creating it also places it there.
    """
    if is_mc:
        df = process_mc_df(df, shot)
    else:
        df = process_qa_df(df)
    return gr.components.Dataframe(
        value=df.values.tolist(),
        headers=list(df.columns),
    )


def plot_radar_chart(df, attributes):
    """Draw one filled radar trace per row of ``df``.

    NOTE: this helper is not called anywhere in this file at the moment.

    Args:
        df: Frame with a ``Model`` column and one column per attribute.
        attributes: Column labels used as the angular axes.

    Returns:
        A ``plotly.graph_objects.Figure`` titled "OpsEval".
    """
    fig = go.Figure()
    for _, row in df.iterrows():
        fig.add_trace(go.Scatterpolar(
            r=row[attributes].tolist(),
            theta=attributes,
            fill='toself',
            name=row['Model'],
        ))
    fig.update_layout(
        title="OpsEval",
        polar=dict(
            radialaxis=dict(
                visible=True,
                range=[0, 0.9]
            )),
        showlegend=True
    )
    return fig


def create_lang_leader_board(lang_dict):
    """Render the tabs for one language inside the current Gradio context.

    One tab is created per dataset. Inside it, the 'mc' category produces a
    Zeroshot and a Fewshot tab; every other category produces a
    'Question Answering' tab.

    Args:
        lang_dict: ``{dataset: {category: DataFrame}}`` as returned by
            ``create_lang_tabs``.
    """
    for dataset, tables in lang_dict.items():
        with gr.Tab(dataset_abbr_en_dict[dataset]):
            for cat, df in tables.items():
                if cat == 'mc':
                    for shot in ['Zeroshot', 'Fewshot']:
                        with gr.Tab(f'Multiple Choice Question ({shot})'):
                            dataframe_to_gradio(df, is_mc=True, shot=shot)
                else:
                    with gr.Tab('Question Answering'):
                        dataframe_to_gradio(df, is_mc=False)


def launch_gradio():
    """Build the leaderboard UI from ``dict_lang`` and launch it."""
    demo = gr.Blocks()
    with demo:
        gr.HTML(TITLE)
        gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
        gr.Markdown("""# 🏅 Leaderboard \n Latest update: 2024-05-15\n""", elem_classes="markdown-text")
        # One top-level tab per language.
        for key, lang_dict in dict_lang.items():
            with gr.Tab(key):
                create_lang_leader_board(lang_dict)
    demo.launch()


pd.set_option('display.float_format', '{:.02f}'.format)

# NOTE(review): the job below re-runs launch_gradio every hour, i.e. it builds
# and launches a new Blocks app each time. It does not reload the CSVs, since
# dict_lang is populated once above. Confirm this is the intended behaviour.
scheduler = BackgroundScheduler()
scheduler.add_job(launch_gradio, 'interval', hours=1)
scheduler.start()

launch_gradio()