# gradio display leaderboard
import pandas as pd
import numpy as np
import matplotlib
# matplotlib.use('macosx')
import gradio as gr
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from apscheduler.schedulers.background import BackgroundScheduler
from texts import *
from leaderboards import eng_leaderboards, chi_leaderboards
import toml
import os
from opseval_datasets import *

config = toml.load("config.toml")

# Two-level column labels applied to multiple-choice result tables:
# first level is the shot setting, second level is the prompting method.
_MC_COLUMNS = [
    ("Zeroshot", "Naive"),
    ("Zeroshot", "SC"),
    ("Zeroshot", "CoT"),
    ("Zeroshot", "CoT+SC"),
    ("Fewshot", "Naive"),
    ("Fewshot", "SC"),
    ("Fewshot", "CoT"),
    ("Fewshot", "CoT+SC"),
]


def create_lang_tabs(lang, lang_cates):
    """Load every leaderboard CSV for one language.

    Args:
        lang: Language code used in the CSV file names (called with
            ``'en'`` and ``'zh'`` in this module).
        lang_cates: Iterable of ``(dataset, categories)`` pairs.

    Returns:
        A nested dict ``{dataset: {category: DataFrame}}`` where each frame
        is read from ``./data_v2/{dataset}_{lang}_{category}_gen.csv``.
    """
    # NOTE(review): the directory is hard-coded here, while
    # get_latest_modification_date() reads config['dataset']['dataset_dir'];
    # confirm the two are meant to point at the same place.
    df_dict = {}
    for dataset, cates in lang_cates:
        df_dict[dataset] = {
            cat: pd.read_csv(f'./data_v2/{dataset}_{lang}_{cat}_gen.csv')
            for cat in cates
        }
    return df_dict


dict_lang = {
    'English': create_lang_tabs('en', eng_leaderboards),
    'Chinese': create_lang_tabs('zh', chi_leaderboards)
}


def process_mc_df(df, shot=None):
    """Prepare a multiple-choice result table for display.

    The ``name`` column becomes the ``Model`` index and the remaining columns
    are relabelled with the two-level ``_MC_COLUMNS`` header. The original
    comment lists the expected column order as zero_naive, zero_self_con,
    zero_cot, zero_cot_self_con, few_naive, few_self_con, few_cot,
    few_cot_self_con.

    Args:
        df: Raw multiple-choice results with a ``name`` column.
        shot: Optional first-level label (``"Zeroshot"`` or ``"Fewshot"``);
            when truthy only that group of columns is kept.

    Returns:
        A new DataFrame with numeric scores rounded to two decimals, a
        ``BestScore`` column (row-wise maximum), rows sorted by ``BestScore``
        in descending order, ``Model`` restored as a regular column and
        missing values replaced by ``'/'``.
    """
    df = df.rename(columns={"name": "Model"})
    df = df.set_index("Model")
    df.columns = pd.MultiIndex.from_tuples(_MC_COLUMNS)
    # Keep only the requested shot group, e.g. only the Zeroshot columns.
    if shot:
        df = df[shot]
    # Convert every score column to numeric; unparsable values become NaN.
    df = df.apply(pd.to_numeric, errors="coerce")
    df = df.round(2)
    # Best score per model across the remaining columns.
    df["BestScore"] = df.max(axis=1)
    df = df.sort_values(by="BestScore", ascending=False)
    df = df.reset_index()
    # Show missing scores as '/'.
    df = df.fillna('/')
    return df


def process_qa_df(df):
    """Return the question-answering table rounded to four decimals."""
    return df.round(4)


def dataframe_to_gradio(df, is_mc=True, shot=None):
    """Build a Gradio ``Dataframe`` component from a leaderboard table.

    Args:
        df: Raw leaderboard table.
        is_mc: When true the table is processed with ``process_mc_df``,
            otherwise with ``process_qa_df``.
        shot: Forwarded to ``process_mc_df`` when ``is_mc`` is true.

    Returns:
        A ``gr.components.Dataframe`` whose headers are the processed
        frame's column labels.
    """
    if is_mc:
        df = process_mc_df(df, shot)
    else:
        df = process_qa_df(df)
    return gr.components.Dataframe(
        value=df.values.tolist(),
        headers=list(df.columns),
    )


def plot_radar_chart(df, attributes):
    """Draw one filled radar trace per row of ``df``.

    Args:
        df: Table with a ``Model`` column plus one column per attribute.
        attributes: Column names used as the angular axes.

    Returns:
        A plotly ``Figure`` titled "OpsEval" with the radial axis fixed to
        the range 0 to 0.9.
    """
    fig = go.Figure()
    for _, row in df.iterrows():
        fig.add_trace(go.Scatterpolar(
            r=row[attributes].tolist(),
            theta=attributes,
            fill='toself',
            name=row['Model']
        ))
    fig.update_layout(
        title="OpsEval",
        polar=dict(
            radialaxis=dict(
                visible=True,
                range=[0, 0.9]
            )),
        showlegend=True
    )
    return fig


def create_lang_leader_board(lang_dict, lang='en'):
    """Create one Gradio tab per dataset, with nested tabs per table.

    Must be called where Gradio components may be created. Each ``mc``
    category yields a Zeroshot and a Fewshot sub-tab; any other category
    yields a "Question Answering" sub-tab.

    Args:
        lang_dict: ``{dataset: {category: DataFrame}}`` as produced by
            ``create_lang_tabs``.
        lang: ``"en"`` selects ``dataset_abbr_en_dict`` for tab titles; any
            other value selects ``dataset_abbr_zh_dict``.

    Returns:
        The list of top-level dataset ``gr.Tab`` objects, in ``lang_dict``
        order.
    """
    # The title mapping depends only on the UI language, so pick it once.
    chosen_dict = dataset_abbr_en_dict if lang == "en" else dataset_abbr_zh_dict
    tab_list = []
    for dataset, value in lang_dict.items():
        with gr.Tab(chosen_dict[dataset]) as tab:
            for cat, df in value.items():
                if cat == 'mc':
                    for shot in ['Zeroshot', 'Fewshot']:
                        with gr.Tab(f'Multiple Choice Question ({shot})'):
                            dataframe_to_gradio(df, is_mc=True, shot=shot)
                else:
                    with gr.Tab('Question Answering'):
                        dataframe_to_gradio(df, is_mc=False)
        tab_list.append(tab)
    return tab_list


def get_latest_modification_date():
    """Return the newest CSV modification time in the dataset directory.

    Scans ``config['dataset']['dataset_dir']`` for ``.csv`` files and formats
    the largest modification timestamp as ``YYYY-MM-DD HH:MM:SS``. With no
    CSV files the timestamp stays at 0, i.e. the epoch is reported.
    """
    dataset_dir = config['dataset']['dataset_dir']
    latest = 0
    for file in os.listdir(dataset_dir):
        if file.endswith('.csv'):
            mtime = os.path.getmtime(os.path.join(dataset_dir, file))
            latest = max(latest, mtime)
    latest = pd.to_datetime(latest, unit='s')
    return latest.strftime("%Y-%m-%d %H:%M:%S")


# Per-language page texts. The update time is computed once, when this module
# is loaded.
translation_dict = {
    'zh': {
        'intro': ZH_INTRODUCTION_TEXT,
        'title': ZH_TITLE,
        'lb_sec': f"""# 🏅 排行榜 \n 更新时间: {get_latest_modification_date()}\n""",
    },
    'en': {
        'intro': INTRODUCTION_TEXT,
        'title': TITLE,
        'lb_sec': f"""# 🏅 Leaderboard \n Latest update: {get_latest_modification_date()}\n"""
    }
}


def get_language_lb(language):
    """Rebuild the dataset tabs for ``language``.

    Returns:
        A flat list: the tabs built from the English data followed by the
        tabs built from the Chinese data.
    """
    tab_dict = {'English': None, 'Chinese': None}
    for key, lang_dict in dict_lang.items():
        tab_dict[key] = create_lang_leader_board(lang_dict, language)
    return [*tab_dict['English'], *tab_dict['Chinese']]


def switch_language(language):
    """Button callback returning the page contents for ``language``.

    Returns:
        A tuple of title, introduction, leaderboard heading, every rebuilt
        dataset tab, and finally ``language`` itself, matching the
        ``outputs`` wired up in ``launch_gradio``.
    """
    texts = translation_dict[language]
    return (
        texts['title'],
        texts['intro'],
        texts['lb_sec'],
        *get_language_lb(language),
        language,
    )


def get_lb_body(language='en'):
    """Build the leaderboard body with one top-level tab per data language.

    Returns:
        ``(body, tab_dict)`` where ``body`` is the enclosing ``gr.Blocks``
        and ``tab_dict`` maps ``'English'`` / ``'Chinese'`` to the list of
        dataset tabs created under that tab.
    """
    tab_dict = {'English': None, 'Chinese': None}
    with gr.Blocks() as body:
        for key, lang_dict in dict_lang.items():
            with gr.Tab(key):
                tab_dict[key] = create_lang_leader_board(lang_dict, language)
    return body, tab_dict


def launch_gradio():
    """Assemble the full page, wire the language buttons and launch the app."""
    demo = gr.Blocks()

    with demo:
        lang_state = gr.State("en")

        with gr.Row():
            en_button = gr.Button("English", variant="primary")
            zh_button = gr.Button("中文", variant="primary")

        title = gr.HTML(TITLE)
        intro = gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
        leaderboard_section = gr.Markdown(
            f"""# 🏅 Leaderboard \n Latest update: {get_latest_modification_date()}\n""",
            elem_classes="markdown-text",
        )

        _lb_body, tab_dict = get_lb_body(language=lang_state.value)
        tab_list = [*tab_dict['English'], *tab_dict['Chinese']]

        # Both buttons update the same components; only the language differs.
        outputs = [title, intro, leaderboard_section, *tab_list, lang_state]
        en_button.click(
            switch_language,
            inputs=[gr.State("en")],
            outputs=outputs,
            postprocess=False,
        )
        zh_button.click(
            switch_language,
            inputs=[gr.State("zh")],
            outputs=outputs,
            postprocess=False,
        )

    demo.launch()


pd.set_option('display.float_format', '{:.02f}'.format)

# NOTE(review): this job calls launch_gradio() again every hour, which builds
# and launches another app instance rather than refreshing the running one.
# Confirm that this is the intended refresh mechanism.
scheduler = BackgroundScheduler()
scheduler.add_job(launch_gradio, 'interval', hours=1)
scheduler.start()

launch_gradio()