File size: 4,796 Bytes
0f89c55
 
 
733bd44
 
 
 
 
 
 
 
a6d507f
 
733bd44
a6d507f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
733bd44
 
a6d507f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22cd459
 
 
 
 
a6d507f
 
 
 
22cd459
 
733bd44
a6d507f
733bd44
 
 
a6d507f
 
733bd44
 
 
a6d507f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
733bd44
 
 
 
 
 
 
a6d507f
733bd44
a6d507f
733bd44
 
0f89c55
a6d507f
 
733bd44
 
 
0f89c55
733bd44
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
# gradio display leaderboard

import pandas as pd
import numpy as np
import matplotlib
# matplotlib.use('macosx')
import gradio as gr
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from apscheduler.schedulers.background import BackgroundScheduler
from texts import INTRODUCTION_TEXT, TITLE
from leaderboards import eng_leaderboards, chi_leaderboards
from opseval_datasets import *


# df_lang = {
#     'English': pd.read_csv("./leaderboard/wired_network_en.csv"),
#     'Chinese': pd.read_csv("./leaderboard/wired_network_zh.csv"),
# }



def create_lang_tabs(lang, lang_cates):
    """Load every leaderboard CSV for one language.

    Args:
        lang: Language code used in the file names (the callers in this
            file pass 'en' and 'zh').
        lang_cates: Iterable of ``(dataset, categories)`` pairs, where
            ``categories`` is an iterable of category names.

    Returns:
        A nested dict ``{dataset: {category: DataFrame}}`` where each frame
        is read from ``./data/{dataset}_{lang}_{category}.csv``.
    """
    return {
        dataset: {
            cat: pd.read_csv(f'./data/{dataset}_{lang}_{cat}.csv')
            for cat in cates
        }
        for dataset, cates in lang_cates
    }


# Leaderboard tables keyed by display language, then dataset, then category.
# Loaded once, when this module is executed.
dict_lang = {
    label: create_lang_tabs(code, boards)
    for label, code, boards in (
        ('English', 'en', eng_leaderboards),
        ('Chinese', 'zh', chi_leaderboards),
    )
}

def process_mc_df(df, shot=None):
    """Turn a raw multiple-choice result table into a ranked leaderboard.

    The input needs a ``name`` column plus eight score columns. The score
    column labels are overwritten *positionally* with a two-level header
    (Zeroshot/Fewshot x Naive/SC/CoT/CoT+SC), so the input order must be
    the four zero-shot settings followed by the four few-shot settings.

    Args:
        df: Raw results frame; it is not modified.
        shot: Optional top-level header ("Zeroshot" or "Fewshot"). When
            given, only that group's four columns are kept.

    Returns:
        A new frame with a ``Model`` column, the score columns coerced to
        numbers (unparseable cells become NaN) and rounded to two decimals,
        and a ``BestScore`` column holding each row's maximum; rows are
        sorted by ``BestScore`` in descending order.
    """
    shot_groups = ("Zeroshot", "Fewshot")
    settings = ("Naive", "SC", "CoT", "CoT+SC")

    # Use the model name as the index so that only score columns remain.
    scores = df.rename(columns={"name": "Model"}).set_index("Model")
    scores.columns = pd.MultiIndex.from_tuples(
        [(group, setting) for group in shot_groups for setting in settings]
    )

    # Optionally narrow the table down to a single shot group.
    if shot:
        scores = scores[shot]

    # Non-numeric cells turn into NaN; keep two decimal places.
    scores = scores.apply(pd.to_numeric, errors="coerce").round(2)

    # Rank the models by their best score across the remaining columns.
    scores["BestScore"] = scores.max(axis=1)
    ranked = scores.sort_values(by="BestScore", ascending=False)
    return ranked.reset_index()

def process_qa_df(df):
    """Return a copy of the question-answering table rounded to four decimals."""
    return df.round(decimals=4)

def dataframe_to_gradio(df, is_mc=True, shot=None):
    """Create a Gradio ``Dataframe`` component from a raw leaderboard table.

    Args:
        df: Raw leaderboard frame.
        is_mc: When true the table is prepared with ``process_mc_df``,
            otherwise with ``process_qa_df``.
        shot: Forwarded to ``process_mc_df``; ignored for QA tables.

    Returns:
        The ``gr.components.Dataframe`` showing the prepared rows, with the
        prepared frame's column labels as headers.
    """
    table = process_mc_df(df, shot) if is_mc else process_qa_df(df)
    return gr.components.Dataframe(
        value=table.values.tolist(),
        headers=list(table.columns),
    )

def plot_radar_chart(df, attributes):
    """Build a Plotly radar chart with one filled trace per row of ``df``.

    Args:
        df: Frame with a ``Model`` column (used as the trace name) and one
            column per entry in ``attributes``.
        attributes: Column labels to plot; they also serve as the angular
            axis categories.

    Returns:
        The Plotly figure, titled "OpsEval", with the legend shown and the
        radial axis fixed to the range [0, 0.9].
    """
    chart = go.Figure()

    for _, record in df.iterrows():
        trace_name = record['Model']
        radii = record[attributes].tolist()
        trace = go.Scatterpolar(
            r=radii,
            theta=attributes,
            fill='toself',
            name=trace_name,
        )
        chart.add_trace(trace)

    radial_axis = dict(visible=True, range=[0, 0.9])
    chart.update_layout(
        title="OpsEval",
        polar=dict(radialaxis=radial_axis),
        showlegend=True,
    )

    return chart


def create_lang_leader_board(lang_dict):
    """Render the leaderboard tabs for one language.

    Creates one tab per dataset. Inside it, a multiple-choice ('mc') table
    gets one sub-tab per shot setting (Zeroshot, Fewshot) and every other
    category gets a 'Question Answering' sub-tab. Intended to be called
    while a Gradio layout context is open, as ``launch_gradio`` does.

    Args:
        lang_dict: Mapping ``{dataset: {category: DataFrame}}`` as produced
            by ``create_lang_tabs``.

    Returns:
        None. The components are created for their side effect on the
        enclosing Gradio layout.
    """
    # NOTE: an earlier pre-pass collected every MC table's BestScore column
    # for a radar chart that was never rendered; it was dropped because its
    # result was unused. If the chart is revived, index the scores by Model
    # before combining them: process_mc_df returns rows in rank order, so
    # positions do not line up across datasets.

    # NOTE(review): tab labels always come from dataset_abbr_en_dict, for
    # every language passed in. Confirm that is intended.
    for dataset, tables in lang_dict.items():
        with gr.Tab(dataset_abbr_en_dict[dataset]):
            for cat, df in tables.items():
                if cat == 'mc':
                    for shot in ('Zeroshot', 'Fewshot'):
                        with gr.Tab(f'Multiple Choice Question ({shot})'):
                            dataframe_to_gradio(df, is_mc=True, shot=shot)
                else:
                    with gr.Tab('Question Answering'):
                        dataframe_to_gradio(df, is_mc=False)
    


def launch_gradio():
    """Assemble the leaderboard page and launch the Gradio app.

    The page shows the title and introduction text followed by one
    top-level tab per language in ``dict_lang``.
    """
    with gr.Blocks() as demo:
        gr.HTML(TITLE)
        gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
        for language, lang_tables in dict_lang.items():
            with gr.Tab(language):
                create_lang_leader_board(lang_tables)

    demo.launch()

# Format floats with two decimals wherever pandas renders them for display.
pd.set_option('display.float_format', '{:.02f}'.format)

# Register launch_gradio as an hourly interval job and start the scheduler.
# NOTE(review): every scheduled run builds and launches a brand-new Blocks
# app instead of refreshing the running one, and dict_lang is only populated
# once when the module is executed, so the CSV data is not re-read. Confirm
# this is the intended refresh mechanism.
scheduler = BackgroundScheduler()
scheduler.add_job(launch_gradio, 'interval', hours=1)
scheduler.start()

# Initial launch of the app.
launch_gradio()